From c0659d5ce9d9690c826204d855c13172b0eb9d33 Mon Sep 17 00:00:00 2001
From: Matteo Cherubini
Date: Wed, 1 Jul 2026 19:37:10 +0200
Subject: [PATCH] feat(ingest): introduce run-prune.sh for orphaned source removal

---
 skills/ingest/scripts/run-prune.sh | 131 +++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100755 skills/ingest/scripts/run-prune.sh

diff --git a/skills/ingest/scripts/run-prune.sh b/skills/ingest/scripts/run-prune.sh
new file mode 100755
index 0000000..6cf22f2
--- /dev/null
+++ b/skills/ingest/scripts/run-prune.sh
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# =============================================================================
+# skills/ingest/scripts/run-prune.sh
+# Symmetric companion to run-ingest: prune source pages whose raw source no
+# longer exists. RE-DERIVES the orphan set itself (mirrors orphan-wiki.sh) — it
+# never trusts a list handed in by n8n, so there is no "detected-vs-pruned"
+# race. Removes ONLY the pages it derived plus their index entries, commits
+# ONLY wiki/ on chore/prune-orphans-<date>, and opens a GATED removal PR (the
+# operator approves the deletion; principle 2). Never deletes of its own accord.
+#
+# Runs OUTSIDE the model, on vm101, cwd = genome checkout. The wrapper (`pi
+# prune`) has already taken the per-genome lock and done clean_start, exactly
+# like `pi ingest` — so this script does neither.
+#
+#   run-prune.sh <genome>
+#
+# Environment: INGEST_BASE — base branch passed to open-pr.sh (default: main).
+#
+# Emits a single JSON result line on stdout for n8n to parse:
+#   nothing to prune -> {status:"ok", count:0, pruned:[], detail}
+#   PR attempted     -> {status:"ok"|"pr_failed", count, pr_url, pruned, detail}
+#   early failure    -> {status:"error", stage, reason}  (no stage if jq is missing)
+# Exits 0 when status is "ok", non-zero otherwise.
+# =============================================================================
+set -euo pipefail
+
+# Only checked for presence: every path below is relative to the current
+# directory, which the wrapper has set to the genome checkout.
+# shellcheck disable=SC2034
+genome="${1:?usage: run-prune.sh <genome>}"
+SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# fail <stage> <reason>: print the error result line on stdout and exit 1.
+fail() {
+  jq -nc --arg stage "$1" --arg reason "$2" \
+    '{status:"error", stage:$stage, reason:$reason}'
+  exit 1
+}
+
+# jq is probed first and reported by hand: fail() needs jq to build its JSON.
+command -v jq >/dev/null 2>&1 \
+  || { echo '{"status":"error","reason":"jq missing"}'; exit 1; }
+command -v python3 >/dev/null 2>&1 \
+  || fail "deps" "python3 missing (needed by index-append.py)"
+
+# --- re-derive orphans (same rule as orphan-wiki.sh; computed fresh, here, now) ---
+# A wiki/sources/*.md page is orphaned when its frontmatter source_path points at
+# a raw file that no longer exists. Legacy pages without source_path are ignored.
+# NOTE(review): the value is used verbatim (no YAML unquoting, no trailing-space
+# trim); confirm orphan-wiki.sh reads it the same way.
+declare -a ORPH=()
+for page in wiki/sources/*.md; do
+  # Skip the unexpanded pattern (no match) and anything that is not a regular
+  # file; sed would fail on it and end the run without a result line.
+  [[ -f "$page" ]] || continue
+  # First source_path: line only. sed quits at the match instead of feeding
+  # head, so no stage can die of SIGPIPE under pipefail; tr drops CRs (CRLF).
+  sp="$(sed -n '/^source_path:/{s/^source_path:[[:space:]]*//p;q;}' "$page" \
+    | tr -d '\r')" || fail "scan" "cannot read ${page}"
+  [[ -n "$sp" ]] || continue
+  [[ -f "$sp" ]] || ORPH+=("$page")
+done
+
+if [[ ${#ORPH[@]} -eq 0 ]]; then
+  jq -nc '{status:"ok", count:0, pruned:[], detail:"no orphans"}'
+  exit 0
+fi
+
+# --- remove each orphan page + its index entry (anti-traversal, wiki/-only) ---
+declare -a PRUNED=()
+for page in "${ORPH[@]}"; do
+  case "$page" in
+    wiki/*) : ;;
+    *) fail "prune" "refusing to remove outside wiki/: ${page}" ;;
+  esac
+  case "$page" in
+    *..*) fail "prune" "path traversal in page: ${page}" ;;
+  esac
+  [[ -f "$page" ]] || continue
+  rm -f -- "$page" || fail "prune" "cannot remove ${page}"
+  link="${page#wiki/}"
+  link="${link%.md}" # e.g. sources/foo
+  python3 "${SCRIPTS}/index-append.py" --remove "$link" \
+    || fail "index" "index-append --remove failed for ${link}"
+  PRUNED+=("$link")
+done
+
+# Every candidate was skipped above: do not open an empty PR, and never expand
+# "${PRUNED[@]}" while it is empty (an error under set -u before bash 4.4).
+if [[ ${#PRUNED[@]} -eq 0 ]]; then
+  jq -nc '{status:"ok", count:0, pruned:[], detail:"nothing removed"}'
+  exit 0
+fi
+
+# --- assemble the PR body ---
+date_tag="$(date +%F)"
+body="$(mktemp)" || fail "body" "mktemp failed"
+trap 'rm -f -- "$body"' EXIT
+{
+  echo "## Prune orphaned sources"
+  echo ""
+  echo "These source pages reference a \`source_path\` whose raw file no longer exists"
+  echo "in \`raw/\`. Removing them keeps the wiki in sync with git (the source of truth)."
+  echo ""
+  echo "| Removed page |"
+  echo "|--------------|"
+  printf '| `wiki/%s.md` |\n' "${PRUNED[@]}"
+} > "$body"
+
+# --- open the GATED removal PR on a chore/ branch (open-pr --branch override) ---
+branch="chore/prune-orphans-${date_tag}"
+pr_rc=0
+pr_out="$(bash "${SCRIPTS}/open-pr.sh" \
+  --branch "$branch" \
+  --title "chore: prune ${#PRUNED[@]} orphaned source(s)" \
+  --body-file "$body" --base "${INGEST_BASE:-main}" 2>&1)" || pr_rc=$?
+# Pick the first "PR opened: ..." line out of open-pr.sh's output, if any.
+pr_url="$(sed -n '/^PR opened: /{s/^PR opened: //p;q;}' <<<"$pr_out")"
+
+# --- result line for n8n ---
+if [[ $pr_rc -eq 0 ]]; then status="ok"; else status="pr_failed"; fi
+pruned_json="$(printf '%s\n' "${PRUNED[@]}" | jq -R . | jq -s .)"
+jq -nc \
+  --arg status "$status" \
+  --argjson count "${#PRUNED[@]}" \
+  --arg pr_url "$pr_url" \
+  --arg detail "$pr_out" \
+  --argjson pruned "$pruned_json" \
+  '{status:$status, count:$count, pr_url:$pr_url, pruned:$pruned, detail:$detail}'
+
+[[ $pr_rc -eq 0 ]] || exit 1