173 lines
8 KiB
Bash
Executable file
173 lines
8 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
# =============================================================================
|
|
# skills/ingest/scripts/run-ingest.sh
|
|
# Post-semantic orchestrator. Runs OUTSIDE the model, on vm101, in the genome
|
|
# checkout. Consumes .ingest-manifest.json (written by ingest-semantic.py) and
|
|
# performs every deterministic step — index, log, scoped lint, PR.
|
|
#
|
|
# run-ingest.sh <genome_name> [manifest_path]
|
|
#
|
|
# Emits a single JSON result line on stdout for n8n to parse.
|
|
#
|
|
# Every page listed in the manifest must exist on disk before we trust the run.
|
|
# Everything else is unchanged: the manifest the semantic phase now produces is
|
|
# already in this script's expected schema.
|
|
# =============================================================================
|
|
# Strict mode: abort on a failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Positional arguments: the genome name is mandatory (the expansion aborts with
# the usage text when it is missing); the manifest path is optional.
genome="${1:?usage: run-ingest.sh <genome> [manifest]}"
manifest="${2:-.ingest-manifest.json}"

# Absolute path of the directory that contains this script; the sibling helper
# scripts are invoked relative to it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
SCRIPTS="$(cd "$script_dir" && pwd)"
|
|
|
|
# fail <stage> <reason>
# Print a single-line JSON error object {status:"error", stage, reason} on
# stdout, then terminate the script with exit status 1. The object is built with
# jq so both values are JSON-escaped. Never returns.
fail() {
  local stage="$1" reason="$2"
  jq --null-input --compact-output \
    --arg stage "$stage" \
    --arg reason "$reason" \
    '{status:"error", stage:$stage, reason:$reason}'
  exit 1
}
|
|
|
|
# --- preflight: required tools and the manifest file itself ---
# jq is checked first and by hand because fail() needs jq to build its JSON, so
# this one error line is a literal. It carries the same status/stage/reason keys
# that fail() emits, giving the consumer one schema for every error.
command -v jq >/dev/null 2>&1 \
  || { printf '%s\n' '{"status":"error","stage":"deps","reason":"jq missing"}'; exit 1; }
command -v python3 >/dev/null 2>&1 \
  || fail "deps" "python3 missing (needed by index-append.py)"
[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"
|
|
|
|
# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) ---
# Both checks are positive assertions. `jq -e` exits non-zero when the filter
# yields false/null and also when the filter itself errors, so malformed input
# always ends in fail() and can never be mistaken for "nothing unsafe found".

# 1) well-formed JSON object with a string raw_source and an array of pages
jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \
  "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "invalid manifest: need object with string raw_source and array pages"

# _manifest_pages_safe <manifest-file>
# Succeeds only when every element of .pages is an object whose .path is a string
# that starts with wiki/, contains no '..' (no traversal) and no control
# characters. A newline inside a path would otherwise be split into two separate
# paths by the line-based readers that consume `jq -r` output.
_manifest_pages_safe() {
  jq -e '.pages | all(.[];
           type == "object"
           and (.path | type == "string")
           and (.path | startswith("wiki/"))
           and (.path | contains("..") | not)
           and (.path | test("[[:cntrl:]]") | not))' \
    "$1" >/dev/null 2>&1
}

# 2) every page path must pass the safety predicate above
_manifest_pages_safe "$manifest" \
  || fail "manifest" "unsafe page path (must be a string under wiki/, no '..', no control characters)"
|
|
|
|
# --- read manifest scalars ---
raw_source="$(jq --raw-output '.raw_source' "$manifest")"

# The model name is supplied by the orchestrator/wrapper through INGEST_MODEL:
# the agent cannot know its own tag, so a self-reported manifest field is only
# used as a fallback when the variable is unset or empty.
if [[ -n "${INGEST_MODEL:-}" ]]; then
  model="$INGEST_MODEL"
else
  model="$(jq --raw-output '.model // "unknown"' "$manifest")"
fi

# Free-text fields, each with a default for when the manifest omits it.
reasoning="$(jq --raw-output '.reasoning // "Ingest."' "$manifest")"
pr_summary="$(jq --raw-output '.pr_summary // "Ingest."' "$manifest")"
contradictions="$(jq --raw-output '.contradictions // "None"' "$manifest")"

if [[ -z "$raw_source" || "$raw_source" == "null" ]]; then
  fail "manifest" "raw_source missing"
fi

# The slug is derived from the raw source path by slug.sh; a non-zero exit from
# that helper aborts the run.
if ! slug="$(bash "${SCRIPTS}/slug.sh" --raw "$raw_source")"; then
  fail "slug" "empty or invalid slug for ${raw_source}"
fi
|
|
|
|
# --- collect touched paths ---
# Pages are grouped by manifest status; the combined list holds the created
# pages first, followed by the modified ones.
readarray -t created_paths < <(
  jq -r '.pages[] | select(.status=="created") | .path' "$manifest"
)
readarray -t modified_paths < <(
  jq -r '.pages[] | select(.status=="modified") | .path' "$manifest"
)
all_paths=( "${created_paths[@]}" "${modified_paths[@]}" )
if (( ${#all_paths[@]} == 0 )); then
  fail "manifest" "no pages reported"
fi

# --- the semantic phase (ingest-semantic.py) writes the files; here we confirm
# that each collected page is a regular file on disk before trusting the run.
# This catches drift between what the manifest claims and what was written. ---
for _p in "${all_paths[@]}"; do
  if [[ ! -f "$_p" ]]; then
    fail "pages" "manifest lists a file not present on disk: ${_p}"
  fi
done
|
|
|
|
# Set to "CONFLICT" by the index step when a created page is a conflict query;
# read later when the PR is opened and in the final result line.
conflict_label=""

# NOTE: No rollback. The steps below modify the working tree in order (index → log → commit).
# All steps are idempotent on re-run EXCEPT log-append (append-only). If a step fails midway,
# nothing is committed (open-pr is the only committer) — the operator re-runs, or checks
# wiki/ if log-append has already written a line. The manifest is removed only upon full success.
# log-append is not idempotent: a re-run after a post-log failure produces
# duplicate lines. This is accepted by design (append-only ledger, no rollback). If this
# becomes a nuisance, add a dedup check on run_id in log-append.sh
# (grep for run_id before appending). Manual recovery: grep for run_id in wiki/log.md.
# NOTE(review): whether log-append.sh already has such a run_id guard is not visible
# from this script — confirm and keep this note in sync with it.

# --- 1. index entries (created pages only), inserted in order ---

# _index_section <link>
# Print the index section name for a wiki link (the page path without the wiki/
# prefix and the .md suffix). The section is chosen from the first path segment.
_index_section() {
  local link="$1"
  case "${link%%/*}" in
    sources)  printf '%s' "Sources" ;;
    entities) printf '%s' "Entities" ;;
    concepts) printf '%s' "Concepts" ;;
    queries)
      if [[ "$link" == queries/conflict-* ]]; then
        printf '%s' "Conflicts"
      else
        printf '%s' "Queries"
      fi
      ;;
    # private/ is not routed here — ingest is public-only. Add when private ingest is built.
    *) printf '%s' "Sources" ;;
  esac
}

# _append_index_entries
# For every manifest page with status "created", build its index line and hand
# it to index-append.py. Reads the globals manifest and SCRIPTS; sets the global
# conflict_label to "CONFLICT" when a created page is routed to Conflicts.
# Calls fail() (which exits) if the manifest cannot be read or an append fails.
_append_index_entries() {
  local rows path summary maturity link section entry
  # Fields are joined with the ASCII unit separator rather than a tab. Tab is
  # IFS whitespace, so `read` would collapse two adjacent tabs and an empty
  # summary would shift maturity into the summary slot.
  local -r sep=$'\x1f'

  # Control characters inside a field become spaces so that each page stays on
  # one line and no field can contain the separator. Capturing the output in a
  # variable lets a jq failure be reported instead of silently yielding no rows.
  rows="$(jq -r '.pages[] | select(.status=="created")
      | [.path, (.summary // ""), (.maturity // "draft")]
      | map(tostring | gsub("[[:cntrl:]]"; " "))
      | join("\u001f")' "$manifest")" \
    || fail "index" "could not read created pages from manifest"

  # Rows arrive on fd 3, so python3 inside the loop cannot consume them by
  # reading its inherited stdin.
  while IFS="$sep" read -r -u 3 path summary maturity; do
    [[ -n "$path" ]] || continue
    link="${path#wiki/}"
    link="${link%.md}"                      # e.g. sources/foo
    section="$(_index_section "$link")"

    if [[ "$section" == "Conflicts" ]]; then
      conflict_label="CONFLICT"
      entry="- [[${link}]]"                 # conflicts: slug only
    else
      entry="- [[${link}]] — ${summary} \`maturity: ${maturity}\`"
    fi

    python3 "${SCRIPTS}/index-append.py" --section "$section" --entry "$entry" \
      || fail "index" "index-append failed for ${path}"
  done 3<<< "$rows"
}

_append_index_entries
|
|
|
|
# --- 2. log entry ---
# run_id is derived only from the input: the raw source path plus the SHA-256 of
# its content, hashed again and cut to 16 hex characters. The same input
# therefore yields the same run_id across wrapper re-runs.
# NOTE(review): the intent is for log-append.sh to use run_id to avoid duplicate
# lines; whether that guard exists is not visible from this script — confirm.
if ! src_sha="$(sha256sum "$raw_source" 2>/dev/null | cut -d' ' -f1)"; then
  src_sha="unknown"   # source unreadable: fall back to a fixed marker
fi
run_seed="${raw_source}:${src_sha}"
run_id="$(printf '%s' "$run_seed" | sha256sum | cut -c1-16)"

# Every manifest page rendered as a [[wiki link]] (wiki/ prefix and .md suffix
# removed), comma-separated.
out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"

log_args=(
  --run-id "$run_id"
  --type INGEST
  --subject "$slug"
  --model "$model"
  --context "[[${raw_source}]]"
  --output "${out:-*(none)*}"
  --reasoning "$reasoning"
)
bash "${SCRIPTS}/log-append.sh" "${log_args[@]}" || fail "log" "log-append failed"
|
|
|
|
# --- 3. scoped linter (capture findings for the PR; never aborts the run) ---
# scoped-lint is told which manifest to read through INGEST_MANIFEST, so its
# duplicate advisory looks at the same file even when a non-default path was
# passed as $2. (The dedup check lives inside lib/lint.sh and is invoked by
# scoped-lint — there is no separate check-duplicates.sh script.)
export INGEST_MANIFEST="$manifest"

# stdout and stderr are captured together; the exit status is recorded rather
# than acted upon, so a lint failure does not stop the script.
lint_rc=0
lint_out="$(bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1)" || lint_rc=$?
|
|
|
|
# --- 4. assemble the PR body (manifest tables + lint results) ---

# _render_pr_body
# Write the PR body as markdown to stdout: summary, a table of manifest pages,
# contradictions, and the captured lint output inside a code fence. Reads the
# globals pr_summary, contradictions, lint_out and manifest. Returns 1 if the
# page table cannot be produced from the manifest.
# Free text is printed with printf '%s\n' rather than echo: these values come
# from the manifest or from tool output, and echo would treat a value such as
# "-n" or "-e ..." as an option instead of printing it.
_render_pr_body() {
  printf '%s\n' "## Summary" "$pr_summary" ""
  printf '%s\n' "## Pages" "| Path | Status | Maturity |" "|------|--------|----------|"
  jq -r '.pages[] | "| `\(.path)` | \(.status) | \(.maturity // "draft") |"' "$manifest" \
    || return 1
  printf '%s\n' "" "## Contradictions" "$contradictions" ""
  printf '%s\n' "## Scoped Lint (post-ingest)" '```' "$lint_out" '```'
}

# Failures here go through fail() so the caller still receives a JSON result line.
body="$(mktemp)" || fail "pr" "mktemp failed"
trap 'rm -f "$body"' EXIT   # auto-clean on any exit (success, fail(), or crash)

_render_pr_body > "$body" || fail "pr" "could not render PR body"
|
|
|
|
# --- 5. open the PR ---
# Arguments for open-pr.sh; the base branch comes from INGEST_BASE and falls
# back to main. A label is added only when the index step flagged a conflict.
pr_args=(
  --slug "$slug"
  --title "feat: ingest ${slug}"
  --body-file "$body"
  --base "${INGEST_BASE:-main}"
)
if [[ -n "$conflict_label" ]]; then
  pr_args+=( --label "$conflict_label" )
fi

# Capture combined stdout/stderr and remember the exit status instead of
# aborting, so the final result line can report a PR failure.
pr_rc=0
pr_out="$(bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1)" || pr_rc=$?

# The URL is whatever follows the first "PR opened: " line prefix in that output.
pr_url="$(sed -n 's/^PR opened: //p' <<< "$pr_out" | head -n1)"
|
|
|
|
# --- final result line for n8n ---
# Derive the three computed fields first, then emit one compact JSON object.
if [[ $pr_rc -eq 0 ]]; then
  result_status="ok"
else
  result_status="pr_failed"
fi

if [[ $lint_rc -eq 0 ]]; then
  lint_clean="true"
else
  lint_clean="false"
fi

if [[ -n "$conflict_label" ]]; then
  has_conflict="true"
else
  has_conflict="false"
fi

jq --null-input --compact-output \
  --arg status "$result_status" \
  --arg slug "$slug" \
  --arg pr_url "$pr_url" \
  --argjson lint_clean "$lint_clean" \
  --argjson conflict "$has_conflict" \
  --arg detail "$pr_out" \
  '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}'

# A failed PR step exits 1 and keeps the manifest in place. On full success the
# manifest is deleted so that a stale one cannot be reprocessed by mistake (the
# file is otherwise simply overwritten by each run).
if [[ $pr_rc -ne 0 ]]; then
  exit 1
fi
rm -f "$manifest"
|