#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/run-ingest.sh
#
# Post-semantic orchestrator. Runs OUTSIDE the model, on vm101, in the genome
# checkout. Consumes .ingest-manifest.json (written by ingest-semantic.py) and
# performs every deterministic step — index, log, scoped lint, PR.
#
# Usage:
#   run-ingest.sh <genome> [manifest_path]
#
# Arguments:
#   $1  genome         required; handed unchanged to scoped-lint.sh as its
#                      first argument (presumably the genome checkout root —
#                      TODO confirm against scoped-lint.sh).
#   $2  manifest_path  optional; defaults to .ingest-manifest.json
#
# Environment:
#   INGEST_MODEL     model tag recorded in the log (preferred over the
#                    manifest's self-reported .model field)
#   INGEST_BASE      base branch for the PR (default: main)
#   INGEST_MANIFEST  exported by this script for scoped-lint.sh
#
# Output:
#   Emits a single JSON result line on stdout for n8n to parse. Anything the
#   helper scripts print is captured or sent to stderr so stdout stays clean.
#
# Exit status:
#   0 when the PR was opened, 1 on any failure.
#
# Every page listed in the manifest must exist on disk before we trust the run.
# The manifest produced by the semantic phase is already in this script's
# expected schema.
# =============================================================================
set -euo pipefail

SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Emit a JSON error line on stdout and abort.
#   $1 - stage name (e.g. "manifest", "index", "log")
#   $2 - human-readable reason
fail() {
  jq -nc --arg stage "$1" --arg reason "$2" \
    '{status:"error", stage:$stage, reason:$reason}'
  exit 1
}

# jq is needed by fail() itself, so its absence is reported with a literal.
command -v jq >/dev/null 2>&1 \
  || { echo '{"status":"error","reason":"jq missing"}'; exit 1; }
command -v python3 >/dev/null 2>&1 \
  || fail "deps" "python3 missing (needed by index-append.py)"

# --- arguments ---
# A missing/empty <genome> is reported both on stderr (for humans) and as a
# JSON error line on stdout (for n8n), instead of dying with stderr only.
if [[ -z "${1:-}" ]]; then
  printf 'usage: run-ingest.sh <genome> [manifest]\n' >&2
  fail "usage" "missing required <genome> argument"
fi
genome="$1"
manifest="${2:-.ingest-manifest.json}"

[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"

# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) ---
# 1) well-formed JSON object with a string raw_source and an array of pages
jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \
  "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "invalid manifest: need object with string raw_source and array pages"

# 2) every page must be an object whose .path is a string, lives under wiki/,
#    contains no '..' (no traversal) and no newline/CR/tab (paths are later read
#    line-by-line). The check is fail-closed: a jq runtime error (e.g. a
#    non-object element in .pages) also rejects the manifest.
jq -e '
  all(.pages[];
      type == "object"
      and (.path | type == "string")
      and (.path | startswith("wiki/"))
      and (.path | contains("..") | not)
      and (.path | contains("\n") | not)
      and (.path | contains("\r") | not)
      and (.path | contains("\t") | not))
' "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "unsafe page path (must be a string under wiki/, no '..' or control characters)"

# --- read manifest scalars ---
raw_source="$(jq -r '.raw_source' "$manifest")"
# The model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent
# cannot know its own tag, so we do not trust a self-reported manifest field.
# Fall back to the manifest only if INGEST_MODEL is unset or empty.
model="${INGEST_MODEL:-$(jq -r '.model // "unknown"' "$manifest")}"
reasoning="$(jq -r '.reasoning // "Ingest."' "$manifest")"
pr_summary="$(jq -r '.pr_summary // "Ingest."' "$manifest")"
contradictions="$(jq -r '.contradictions // "None"' "$manifest")"

[[ -n "$raw_source" && "$raw_source" != "null" ]] \
  || fail "manifest" "raw_source missing"
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" \
  || fail "slug" "empty or invalid slug for ${raw_source}"

# --- collect touched paths ---
mapfile -t created_paths < <(
  jq -r '.pages[] | select(.status=="created") | .path' "$manifest"
)
mapfile -t modified_paths < <(
  jq -r '.pages[] | select(.status=="modified") | .path' "$manifest"
)
# The ${arr[@]+...} form keeps `set -u` from aborting on an empty array under
# bash < 4.4.
all_paths=(
  ${created_paths[@]+"${created_paths[@]}"}
  ${modified_paths[@]+"${modified_paths[@]}"}
)
[[ ${#all_paths[@]} -gt 0 ]] || fail "manifest" "no pages reported"

# --- the semantic phase (ingest-semantic.py) writes the files; verify every
# manifest page actually exists on disk before trusting the run. Catches any
# drift between what the manifest claims and what was really written. ---
for _p in "${all_paths[@]}"; do
  [[ -f "$_p" ]] || fail "pages" "manifest lists a file not present on disk: ${_p}"
done

conflict_label=""

# NOTE: No rollback. The steps below modify the working tree in order
# (index → log → commit). All steps are idempotent on re-run EXCEPT log-append
# (append-only). If a step fails midway, nothing is committed (open-pr is the
# only committer) — the operator re-runs, or checks wiki/ if log-append has
# already written a line. The manifest is removed only upon full success.
#
# log-append is not idempotent: a re-run after a post-log failure produces
# duplicate lines. This is accepted by design (append-only ledger, no
# rollback). If this becomes a nuisance later, add a dedup check on run_id in
# log-append.sh (grep for run_id before appending).
# Manual recovery: grep for run_id in wiki/log.md.

# --- 1. index entries (created pages only), inserted in order ---
# Fields are joined with U+001F (unit separator) rather than TAB: TAB is an IFS
# *whitespace* character, so `read` would collapse the empty field of a page
# with no summary and shift maturity into the summary slot. Newlines, tabs and
# stray separators inside a field are flattened to a single space so every
# page stays on exactly one line.
while IFS=$'\x1f' read -r path summary maturity; do
  [[ -z "$path" ]] && continue
  link="${path#wiki/}"
  link="${link%.md}"            # e.g. sources/foo
  folder="${link%%/*}"
  case "$folder" in
    sources)  section="Sources" ;;
    entities) section="Entities" ;;
    concepts) section="Concepts" ;;
    queries)
      if [[ "$link" == queries/conflict-* ]]; then
        section="Conflicts"
        conflict_label="CONFLICT"
      else
        section="Queries"
      fi
      ;;
    # private/ is not routed here — ingest is public-only. Add when private
    # ingest is built.
    *) section="Sources" ;;
  esac

  if [[ "$section" == "Conflicts" ]]; then
    entry="- [[${link}]]"       # conflicts: slug only
  else
    entry="- [[${link}]] — ${summary} \`maturity: ${maturity}\`"
  fi

  # Helper stdout goes to stderr so stdout carries only the final JSON line.
  python3 "${SCRIPTS}/index-append.py" --section "$section" --entry "$entry" >&2 \
    || fail "index" "index-append failed for ${path}"
done < <(
  jq -r '
    .pages[]
    | select(.status=="created")
    | [.path, (.summary // ""), (.maturity // "draft")]
    | map(tostring | gsub("[\n\r\t\u001f]+"; " "))
    | join("\u001f")
  ' "$manifest"
)

# --- 2. log entry ---
# Comma-separated wiki links for every page in the manifest.
out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"
bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
  --context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" >&2 \
  || fail "log" "log-append failed"

# --- 3. scoped linter (capture findings for the PR; never aborts the run) ---
# Point scoped-lint at the same manifest we were handed so its duplicate
# advisory reads the right file even when a non-default path arrives as $2.
# (The dedup check lives inside lib/lint.sh and is invoked by scoped-lint —
# there is no separate check-duplicates.sh script.)
export INGEST_MANIFEST="$manifest"
lint_rc=0
lint_out="$(bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1)" \
  || lint_rc=$?

# --- 4. assemble the PR body (manifest tables + lint results) ---
body="$(mktemp)" || fail "pr" "mktemp failed"
trap 'rm -f -- "$body"' EXIT    # auto-clean on any exit (success, fail(), or crash)
{
  printf '%s\n' "## Summary"
  printf '%s\n' "$pr_summary"
  printf '\n'
  printf '%s\n' "## Pages"
  printf '%s\n' "| Path | Status | Maturity |"
  printf '%s\n' "|------|--------|----------|"
  jq -r '.pages[] | "| `\(.path)` | \(.status) | \(.maturity // "draft") |"' "$manifest"
  printf '\n'
  printf '%s\n' "## Contradictions"
  printf '%s\n' "$contradictions"
  printf '\n'
  printf '%s\n' "## Scoped Lint (post-ingest)"
  printf '%s\n' '```'
  printf '%s\n' "$lint_out"
  printf '%s\n' '```'
} > "$body"

# --- 5. open the PR ---
pr_args=(
  --slug "$slug"
  --title "feat: ingest ${slug}"
  --body-file "$body"
  --base "${INGEST_BASE:-main}"
)
if [[ -n "$conflict_label" ]]; then
  pr_args+=( --label "$conflict_label" )
fi
pr_rc=0
pr_out="$(bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1)" || pr_rc=$?

# First "PR opened: <url>" line, if any. Scanned in pure bash: a `sed | head`
# pipeline can die from SIGPIPE under `pipefail` and abort before the result
# line is printed.
pr_url=""
while IFS= read -r _line; do
  if [[ "$_line" == "PR opened: "* ]]; then
    pr_url="${_line#PR opened: }"
    break
  fi
done <<< "$pr_out"

# --- final result line for n8n ---
if [[ $pr_rc -eq 0 ]]; then status="ok"; else status="pr_failed"; fi
if [[ $lint_rc -eq 0 ]]; then lint_clean=true; else lint_clean=false; fi
if [[ -n "$conflict_label" ]]; then conflict=true; else conflict=false; fi

jq -nc \
  --arg status "$status" \
  --arg slug "$slug" \
  --arg pr_url "$pr_url" \
  --argjson lint_clean "$lint_clean" \
  --argjson conflict "$conflict" \
  --arg detail "$pr_out" \
  '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}'

# The manifest is a single file that is overwritten with each run, but once the
# run has fully succeeded we remove it so a stale manifest cannot be
# reprocessed by mistake.
if [[ $pr_rc -eq 0 ]]; then
  rm -f -- "$manifest"
else
  exit 1
fi