knowledge-genome-orchestrator/skills/ingest/scripts/run-ingest.sh

150 lines
6.7 KiB
Bash

#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/run-ingest.sh
# Post-pi orchestrator. Runs OUTSIDE pi's loop, on vm101, in the genome checkout.
# Consumes .ingest-manifest.json (written by the ingest skill) and performs every
# deterministic step — index, log, scoped lint, PR — so pi's context stays clean.
#
# run-ingest.sh <genome_name> [manifest_path]
#
# Emits a single JSON result line on stdout for n8n to parse.
# =============================================================================
# Strict mode: abort on unhandled errors, unset variables, and failed pipeline stages.
set -euo pipefail
# Positional arguments: genome name (required), manifest path (optional).
genome="${1:?usage: run-ingest.sh <genome> [manifest]}"
manifest="${2:-.ingest-manifest.json}"
# Absolute directory containing this script; sibling helper scripts are resolved from it.
# CDPATH is cleared for the cd so it can neither resolve the directory through CDPATH nor
# echo the destination into the captured output; `--` protects paths starting with '-'.
SCRIPTS="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# fail <stage> <reason>
# Writes a one-line JSON error object ({status, stage, reason}) to stdout,
# then terminates the script with exit status 1.
fail() {
  local where="$1" why="$2"
  jq --null-input --compact-output \
    --arg stage "$where" \
    --arg reason "$why" \
    '{status:"error", stage:$stage, reason:$reason}'
  exit 1
}
# --- preflight: required tools and the input file ---
# jq is checked first and reported with a hand-written JSON literal, because fail()
# itself needs jq to build its output. The literal carries the same keys
# (status, stage, reason) as every other error object this script emits.
command -v jq >/dev/null 2>&1 \
  || { printf '%s\n' '{"status":"error","stage":"deps","reason":"jq missing"}'; exit 1; }
command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)"
[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"
# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) ---
# Both predicates are fail-closed: they return 0 only when jq positively confirms the
# property. A jq error (unreadable file, malformed JSON, unexpected types) counts as
# a failed check rather than being mistaken for "nothing wrong found".

# _manifest_shape_ok <file>
# 0 when the file is a JSON object with a string raw_source and an array pages.
_manifest_shape_ok() {
  jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \
    "$1" >/dev/null 2>&1
}

# _manifest_paths_safe <file>
# 0 when every entry of .pages is an object whose .path is a string that starts with
# "wiki/", contains no "..", and contains no control characters. Control characters
# are rejected because later steps read paths line by line; an embedded newline would
# otherwise turn one validated path into several unvalidated ones.
_manifest_paths_safe() {
  jq -e '
    def safe_path:
      type == "string"
      and startswith("wiki/")
      and (contains("..") | not)
      and ([explode[] | select(. < 32 or . == 127)] | length == 0);
    all(.pages[]; type == "object" and (.path | safe_path))
  ' "$1" >/dev/null 2>&1
}

# 1) well-formed JSON object with a string raw_source and an array of pages
_manifest_shape_ok "$manifest" \
  || fail "manifest" "invalid manifest: need object with string raw_source and array pages"
# 2) every page.path must be a string, live under wiki/, and contain no '..' (no traversal)
_manifest_paths_safe "$manifest" \
  || fail "manifest" "unsafe page path (must be a string under wiki/, no '..')"
# --- read manifest scalars ---
# Each jq read is guarded explicitly: a bare failing assignment would trip `set -e`
# and end the run without the JSON result line the caller parses.
raw_source="$(jq -r '.raw_source' "$manifest")" \
  || fail "manifest" "cannot read raw_source from ${manifest}"
# model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent cannot know its
# own tag, so we do not trust a self-reported manifest field. Fall back only if unset
# (or empty).
if [[ -n "${INGEST_MODEL:-}" ]]; then
  model="$INGEST_MODEL"
else
  model="$(jq -r '.model // "unknown"' "$manifest")" \
    || fail "manifest" "cannot read model from ${manifest}"
fi
reasoning="$(jq -r '.reasoning // "Ingest."' "$manifest")" \
  || fail "manifest" "cannot read reasoning from ${manifest}"
pr_summary="$(jq -r '.pr_summary // "Ingest."' "$manifest")" \
  || fail "manifest" "cannot read pr_summary from ${manifest}"
contradictions="$(jq -r '.contradictions // "None"' "$manifest")" \
  || fail "manifest" "cannot read contradictions from ${manifest}"
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"
# The slug feeds the log subject and the PR title, so an empty one is rejected even when
# slug.sh exits 0.
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
[[ -n "$slug" ]] || fail "slug" "empty or invalid slug for ${raw_source}"
# --- collect touched paths ---
# One path per line from jq; created pages first, then modified ones.
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")
mapfile -t modified_paths < <(jq -r '.pages[] | select(.status=="modified") | .path' "$manifest")
# The ${arr[@]+...} guards matter under `set -u`: bash releases before 4.4 treat
# "${arr[@]}" on an empty array as an unbound variable, which would abort any run
# that has only created or only modified pages.
all_paths=(
  ${created_paths[@]+"${created_paths[@]}"}
  ${modified_paths[@]+"${modified_paths[@]}"}
)
[[ ${#all_paths[@]} -gt 0 ]] || fail "manifest" "no pages reported"
# Set to "CONFLICT" by the index step when a created page is a queries/conflict-* page.
conflict_label=""
# NOTE: No rollback. The steps below modify the working tree in order (index → log → commit).
# Every step is idempotent on re-run EXCEPT log-append, which is append-only: a re-run after
# a post-log failure produces duplicate lines. This is accepted by design (append-only
# ledger, no rollback). If a step fails midway, nothing is committed (open-pr is the only
# committer) — the operator re-runs, or checks wiki/ if log-append has already written a
# line. The manifest is removed only upon full success.
# If the duplicates become a nuisance later, add a dedup check on run_id in log-append.sh
# (grep for run_id before appending). Manual recovery: grep for run_id in wiki/log.md.
# --- 1. index entries (created pages only), inserted in order ---

# _created_page_records <manifest>
# Prints one line per page whose status is "created": path, summary and maturity joined
# by the ASCII unit separator (0x1f). Missing summary becomes "", missing maturity becomes
# "draft". Values are stringified and any control character in them is replaced by a
# space, so a value can neither break the line framing nor contain the separator.
# A tab separator is deliberately avoided: tab is IFS *whitespace*, so `read` collapses
# consecutive tabs and an empty summary would shift maturity into the summary slot.
_created_page_records() {
  jq -r '
    def flat:
      tostring | explode | map(if . < 32 or . == 127 then 32 else . end) | implode;
    .pages[]
    | select(type == "object" and .status == "created")
    | [.path, (.summary // ""), (.maturity // "draft")]
    | map(flat)
    | join("\u001f")
  ' "$1"
}

# Records are read from fd 3 so that python3 keeps the script's own stdin and cannot
# consume the remaining records.
while IFS=$'\x1f' read -r -u 3 path summary maturity; do
  [[ -n "$path" ]] || continue
  link="${path#wiki/}"
  link="${link%.md}" # e.g. sources/foo
  folder="${link%%/*}"
  case "$folder" in
    sources) section="Sources" ;;
    entities) section="Entities" ;;
    concepts) section="Concepts" ;;
    queries)
      if [[ "$link" == queries/conflict-* ]]; then
        section="Conflicts"
        conflict_label="CONFLICT" # later adds a label to the PR
      else
        section="Queries"
      fi
      ;;
    *) section="Sources" ;; # unknown top-level folders are filed under Sources
  esac
  if [[ "$section" == "Conflicts" ]]; then
    entry="- [[${link}]]" # conflicts: slug only
  else
    entry="- [[${link}]] — ${summary} \`maturity: ${maturity}\`"
  fi
  python3 "${SCRIPTS}/index-append.py" --section "$section" --entry "$entry" \
    || fail "index" "index-append failed for ${path}"
done 3< <(_created_page_records "$manifest")
# --- 2. log entry ---
# Comma-separated wiki links for every page in the manifest, with the leading "wiki/"
# and the trailing ".md" stripped from each path.
out="$(jq -r '.pages | map("[[" + (.path | sub("^wiki/";"") | sub("\\.md$";"")) + "]]") | join(", ")' "$manifest")"
log_args=(
  --type INGEST
  --subject "$slug"
  --model "$model"
  --context "[[${raw_source}]]"
  --output "${out:-*(none)*}"
  --reasoning "$reasoning"
)
if ! bash "${SCRIPTS}/log-append.sh" "${log_args[@]}"; then
  fail "log" "log-append failed"
fi
# --- 3. scoped linter (capture findings for the PR; never aborts the run) ---
# stdout and stderr are captured together; the exit status is recorded in lint_rc
# instead of being allowed to trip `set -e`.
lint_rc=0
lint_out="$(bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1)" || lint_rc=$?
# --- 4. assemble the PR body (manifest tables + lint results) ---

# _render_pr_body
# Writes the Markdown PR body to stdout. Reads the globals pr_summary, manifest,
# contradictions and lint_out. Free-form values are printed with printf '%s\n' rather
# than echo, so a value such as "-n" or "-e" is emitted verbatim instead of being
# parsed as an echo option. Returns 1 if the pages table cannot be read from the manifest.
_render_pr_body() {
  printf '%s\n' \
    "## Summary" \
    "$pr_summary" \
    "" \
    "## Pages" \
    "| Path | Status | Maturity |" \
    "|------|--------|----------|"
  jq -r '.pages[] | "| `\(.path)` | \(.status) | \(.maturity // "draft") |"' "$manifest" \
    || return 1
  printf '%s\n' \
    "" \
    "## Contradictions" \
    "$contradictions" \
    "" \
    "## Scoped Lint (post-ingest)" \
    '```' \
    "$lint_out" \
    '```'
}

body="$(mktemp)" || fail "pr_body" "mktemp failed"
trap 'rm -f "$body"' EXIT # auto-clean on any exit (success, fail(), or crash)
_render_pr_body > "$body" || fail "pr_body" "could not assemble PR body"
# --- 5. open the PR ---

# _first_pr_url <text>
# Prints what follows "PR opened: " on the first line of <text> that starts with that
# prefix; prints nothing when no line matches. Always returns 0. Done in pure bash: a
# `sed | head -n1` pipeline can fail with SIGPIPE under `pipefail`, which would abort
# the script before the final result line is emitted.
_first_pr_url() {
  local line
  while IFS= read -r line; do
    if [[ "$line" == "PR opened: "* ]]; then
      printf '%s\n' "${line#PR opened: }"
      return 0
    fi
  done <<< "$1"
  return 0
}

pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" --base "${INGEST_BASE:-main}" )
if [[ -n "$conflict_label" ]]; then
  pr_args+=( --label "$conflict_label" )
fi
# Capture stdout+stderr and the exit status; a failing open-pr must not trip `set -e`
# because the result line below still has to be emitted.
pr_rc=0
pr_out="$(bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1)" || pr_rc=$?
pr_url="$(_first_pr_url "$pr_out")"
# --- final result line for n8n ---
# Derive the three computed fields first, then emit one compact JSON object on stdout.
if [[ $pr_rc -eq 0 ]]; then
  result_status="ok"
else
  result_status="pr_failed"
fi
if [[ $lint_rc -eq 0 ]]; then
  lint_clean_json="true"
else
  lint_clean_json="false"
fi
if [[ -n "$conflict_label" ]]; then
  conflict_json="true"
else
  conflict_json="false"
fi
jq --null-input --compact-output \
  --arg status "$result_status" \
  --arg slug "$slug" \
  --arg pr_url "$pr_url" \
  --argjson lint_clean "$lint_clean_json" \
  --argjson conflict "$conflict_json" \
  --arg detail "$pr_out" \
  '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}'
# The manifest is one file that each run overwrites. It is deleted only after a fully
# successful run, so that a stale manifest cannot be reprocessed by mistake; when the
# PR step failed it is left in place and the script exits with status 1.
if [[ $pr_rc -ne 0 ]]; then
  exit 1
fi
rm -f "$manifest"