Compare commits

...

6 commits

6 changed files with 361 additions and 91 deletions

View file

@ -79,6 +79,29 @@ case "$cmd" in
# MECHANICAL step: validate manifest -> index/log/scoped-lint/commit/PR -> 1 JSON line
exec "${HOME}/.pi/agent/skills/ingest/scripts/run-ingest.sh" "${genome}"
;;
"pi prune "*)
# Pota le source orfane. Stesso lock dell'ingest (serializza le scritture per genoma),
# clean_start, poi run-prune.sh (che ri-deriva gli orfani e apre una PR gated).
genome="${cmd#pi prune }"
case "$genome" in ""|*[!a-z0-9-]*) echo '{"status":"error","reason":"invalid genome name"}'; exit 1;; esac
logger -t n8n-pi-wrap "ok: pi prune ${genome}"
exec 9>"/run/lock/kg-ingest-${genome}.lock" 2>/dev/null || exec 9>"/tmp/kg-ingest-${genome}.lock"
if ! flock -n 9; then
echo '{"status":"busy","reason":"another ingest/prune is running for this genome","genome":"'"$genome"'"}'
exit 0
fi
set -a; . "${HOME}/.config/knowledge-genome.env"; set +a
cd "${GENOMES_ROOT}/${genome}" || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
: "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
|| { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
exec "${HOME}/.pi/agent/skills/ingest/scripts/run-prune.sh" "${genome}"
;;
"pi ingest-rework "*)
# args: <genome> <raw_path> <feedback_base64> (3 token).
# Feedback in base64 nell'argv: il nodo SSH di n8n non passa stdin, e cosi' i metacaratteri

View file

@ -1,11 +1,12 @@
#!/usr/bin/env python3
# =============================================================================
# skills/ingest/scripts/index-append.py
# Insert an entry line into the correct section of wiki/index.md and keep that
# section's entries alphabetically ordered. Bumps frontmatter last_updated.
# Insert OR remove an entry line in wiki/index.md, keeping the target section
# alphabetically ordered. Bumps frontmatter last_updated.
#
# index-append.py --section Sources \
# --entry '- [[sources/foo]] — One-line summary. `maturity: draft`'
# index-append.py --remove 'sources/foo' # delete the entry by wikilink
# =============================================================================
import argparse
import datetime
@ -17,14 +18,116 @@ LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]")
HEADER_RE = re.compile(r"^## ")
def bump_last_updated(lines, today):
"""Bump (or self-heal) last_updated inside the first frontmatter block."""
fm_open = False
fm_close_idx = None
bumped = False
for i, ln in enumerate(lines):
if ln.strip() == "---":
if not fm_open:
fm_open = True
continue
fm_close_idx = i
break
if fm_open and ln.startswith("last_updated:"):
lines[i] = f"last_updated: {today}"
bumped = True
if not fm_open:
print("index-append: warning: no frontmatter found, last_updated not bumped",
file=sys.stderr)
elif not bumped and fm_close_idx is not None:
lines.insert(fm_close_idx, f"last_updated: {today}")
print("index-append: last_updated key was missing — inserted", file=sys.stderr)
def do_remove(lines, link, today):
"""Remove every entry line whose wikilink == link. Idempotent."""
bump_last_updated(lines, today)
kept = []
removed = 0
for ln in lines:
m = LINK_RE.match(ln)
if m and m.group(1) == link:
removed += 1
continue
kept.append(ln)
if removed:
print(f"index-append: removed [[{link}]] ({removed} line(s))")
else:
# Idempotent: the goal state (entry absent) already holds.
print(f"index-append: [[{link}]] not present, nothing to remove")
return kept
def do_append(lines, section, entry, today):
bump_last_updated(lines, today)
# Locate the target section [start, end)
start = None
for i, ln in enumerate(lines):
if HEADER_RE.match(ln) and ln[3:].startswith(section):
start = i
break
if start is None:
print(f"index-append: section '{section}' not found", file=sys.stderr)
return None
end = len(lines)
for i in range(start + 1, len(lines)):
if HEADER_RE.match(lines[i]):
end = i
break
body = lines[start + 1:end]
intro = [ln for ln in body if not ENTRY_RE.match(ln)]
entries = [ln for ln in body if ENTRY_RE.match(ln)]
new_m = LINK_RE.match(entry)
new_link = new_m.group(1) if new_m else None
if new_link is not None:
replaced = False
for idx, ln in enumerate(entries):
m = LINK_RE.match(ln)
if m and m.group(1) == new_link:
if ln == entry:
print("index-append: entry already present, skipping")
return lines
entries[idx] = entry
replaced = True
break
if not replaced:
entries.append(entry)
else:
if entry in entries:
print("index-append: entry already present, skipping")
return lines
entries.append(entry)
entries.sort(key=str.casefold)
while intro and intro[-1].strip() == "":
intro.pop()
new_section = intro + [""] + entries + [""]
print(f"index-append: added to {section}")
return lines[:start + 1] + new_section + lines[end:]
def main() -> int:
ap = argparse.ArgumentParser()
ap.add_argument("--section", required=True,
help="Section name, e.g. Sources / Entities / Concepts / Queries / Conflicts")
ap.add_argument("--entry", required=True, help="Full index line to insert")
ap.add_argument("--section", help="Section name (required with --entry)")
ap.add_argument("--entry", help="Full index line to insert")
ap.add_argument("--remove", metavar="WIKILINK",
help="Remove the entry with this wikilink, e.g. sources/foo")
ap.add_argument("--file", default="wiki/index.md")
args = ap.parse_args()
if bool(args.remove) == bool(args.entry):
print("index-append: provide exactly one of --entry or --remove", file=sys.stderr)
return 2
if args.entry and not args.section:
print("index-append: --entry requires --section", file=sys.stderr)
return 2
try:
with open(args.file, encoding="utf-8") as fh:
lines = fh.read().splitlines()
@ -33,90 +136,15 @@ def main() -> int:
return 1
today = datetime.date.today().isoformat()
# 1. Bump last_updated inside the first frontmatter block
fm_open = False
fm_close_idx = None
bumped = False
for i, ln in enumerate(lines):
if ln.strip() == "---":
if not fm_open:
fm_open = True
continue
fm_close_idx = i # the closing ---
break
if fm_open and ln.startswith("last_updated:"):
lines[i] = f"last_updated: {today}"
bumped = True
if not fm_open:
print("index-append: warning: no frontmatter found, last_updated not bumped",
file=sys.stderr)
elif not bumped and fm_close_idx is not None:
# self-heal: frontmatter present but missing the key — insert it before the close
lines.insert(fm_close_idx, f"last_updated: {today}")
print("index-append: last_updated key was missing — inserted", file=sys.stderr)
# 2. Locate the target section [start, end)
start = None
for i, ln in enumerate(lines):
if HEADER_RE.match(ln) and ln[3:].startswith(args.section):
start = i
break
if start is None:
print(f"index-append: section '{args.section}' not found in {args.file}",
file=sys.stderr)
return 1
end = len(lines)
for i in range(start + 1, len(lines)):
if HEADER_RE.match(lines[i]):
end = i
break
# 3. Split the section body into intro (non-entry) and entries
body = lines[start + 1:end]
intro = [ln for ln in body if not ENTRY_RE.match(ln)]
entries = [ln for ln in body if ENTRY_RE.match(ln)]
# Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed
# summary/maturity should UPDATE the existing entry, not add a duplicate line.
new_m = LINK_RE.match(args.entry)
new_link = new_m.group(1) if new_m else None
if new_link is not None:
replaced = False
for idx, ln in enumerate(entries):
m = LINK_RE.match(ln)
if m and m.group(1) == new_link:
if ln == args.entry:
print("index-append: entry already present, skipping")
return 0
entries[idx] = args.entry # same page, refreshed text
replaced = True
break
if not replaced:
entries.append(args.entry)
if args.remove:
out = do_remove(lines, args.remove, today)
else:
# No parseable wikilink — fall back to exact-line dedup.
if args.entry in entries:
print("index-append: entry already present, skipping")
return 0
entries.append(args.entry)
entries.sort(key=str.casefold)
# Normalise intro: drop trailing blanks, keep header + comment(s)
while intro and intro[-1].strip() == "":
intro.pop()
new_section = intro + [""] + entries + [""]
lines = lines[:start + 1] + new_section + lines[end:]
out = do_append(lines, args.section, args.entry, today)
if out is None:
return 1
with open(args.file, "w", encoding="utf-8") as fh:
fh.write("\n".join(lines) + "\n")
print(f"index-append: added to {args.section}")
fh.write("\n".join(out) + "\n")
return 0

View file

@ -16,10 +16,11 @@ set -euo pipefail
: "${FORGEJO_USER:?missing FORGEJO_USER}"
: "${FORGEJO_TOKEN:?missing FORGEJO_TOKEN}"
slug="" title="" body_file="" base="main" label=""
slug="" title="" body_file="" base="main" label="" branch=""
while [[ $# -gt 0 ]]; do
case "$1" in
--slug) slug="$2"; shift 2 ;;
--branch) branch="$2"; shift 2 ;;
--title) title="$2"; shift 2 ;;
--body-file) body_file="$2"; shift 2 ;;
--base) base="$2"; shift 2 ;;
@ -28,16 +29,23 @@ while [[ $# -gt 0 ]]; do
esac
done
: "${slug:?--slug required}"
: "${title:?--title required}"
: "${body_file:?--body-file required}"
[[ -f "$body_file" ]] || { echo "open-pr: body file not found: $body_file" >&2; exit 1; }
branch="feat/ai-ingest-${slug}"
# --branch overrides the default; otherwise derive the ingest branch from --slug.
# (run-prune passes its own chore/prune-orphans-* branch; run-ingest passes --slug.)
if [[ -z "$branch" ]]; then
: "${slug:?--slug or --branch required}"
branch="feat/ai-ingest-${slug}"
fi
repo="$(basename -s .git "$(git config --get remote.origin.url)")"
# 1. Branch + commit + push (AGENTS.md rule 5: never commit to main)
git switch -c "$branch" 2>/dev/null || git switch "$branch"
# Rolling PR: -C force-resets the branch label to the current base (we are on it after
# clean_start) and CARRIES the freshly-written wiki/ changes, so a re-ingest of the same
# source rebuilds the branch cleanly instead of hitting a dirty-switch refusal.
git switch -C "$branch"
git add wiki/
# Scope BOTH the emptiness check and the commit to wiki/ — never commit anything that
# happened to be staged outside wiki/ (a stray hook, an aborted prior run, etc.).
@ -46,7 +54,10 @@ if git diff --cached --quiet -- wiki/; then
exit 1
fi
git commit -m "$title" -- wiki/
git push -u origin "$branch"
# Try a normal push (new branch / fast-forward). If the branch was rebuilt from base and
# diverged, force-with-lease updates the open PR in place — the lease refuses to clobber if
# origin moved unexpectedly since our fetch, so concurrent work is never lost.
git push -u origin "$branch" 2>/dev/null || git push -u --force-with-lease origin "$branch"
# DRY_RUN: local git work done; skip the Forgejo API (offline tests).
if [[ -n "${DRY_RUN:-}" ]]; then

View file

@ -0,0 +1,96 @@
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/run-prune.sh
# Symmetric companion to run-ingest: prune source pages whose raw source no
# longer exists. RE-DERIVES the orphan set itself (mirrors orphan-wiki.sh) — it
# never trusts a list handed in by n8n, so there is no "detected-vs-pruned"
# race. Removes ONLY the pages it derived plus their index entries, commits
# ONLY wiki/ on chore/prune-orphans-<date>, and opens a GATED removal PR (the
# operator approves the deletion; principle 2). Never deletes of its own accord.
#
# Runs OUTSIDE the model, on vm101, cwd = genome checkout. The wrapper (`pi
# prune`) has already taken the per-genome lock and done clean_start, exactly
# like `pi ingest` — so this script does neither.
#
# run-prune.sh <genome>
#
# Emits a single JSON result line on stdout for n8n to parse.
# =============================================================================
set -euo pipefail
genome="${1:?usage: run-prune.sh <genome>}"
SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
fail() {
jq -nc --arg stage "$1" --arg reason "$2" '{status:"error", stage:$stage, reason:$reason}'
exit 1
}
command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq missing"}'; exit 1; }
command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)"
# --- re-derive orphans (same rule as orphan-wiki.sh; computed fresh, here, now) ---
# A wiki/sources/*.md page is orphaned when its frontmatter source_path points at
# a raw file that no longer exists. Legacy pages without source_path are ignored.
declare -a ORPH=()
for page in wiki/sources/*.md; do
[[ -e "$page" ]] || continue
sp="$(sed -n 's/^source_path:[[:space:]]*//p' "$page" | tr -d '\r' | head -n1)"
[[ -n "$sp" ]] || continue
[[ -f "$sp" ]] || ORPH+=("$page")
done
if [[ ${#ORPH[@]} -eq 0 ]]; then
jq -nc '{status:"ok", count:0, pruned:[], detail:"no orphans"}'
exit 0
fi
# --- remove each orphan page + its index entry (anti-traversal, wiki/-only) ---
declare -a PRUNED=()
for page in "${ORPH[@]}"; do
case "$page" in
wiki/*) : ;;
*) fail "prune" "refusing to remove outside wiki/: ${page}" ;;
esac
case "$page" in *..*) fail "prune" "path traversal in page: ${page}" ;; esac
[[ -f "$page" ]] || continue
rm -f "$page"
link="${page#wiki/}"; link="${link%.md}" # e.g. sources/foo
python3 "${SCRIPTS}/index-append.py" --remove "$link" \
|| fail "index" "index-append --remove failed for ${link}"
PRUNED+=("$link")
done
# --- assemble the PR body ---
date_tag="$(date +%F)"
body="$(mktemp)"
trap 'rm -f "$body"' EXIT
{
echo "## Prune orphaned sources"
echo ""
echo "These source pages reference a \`source_path\` whose raw file no longer exists"
echo "in \`raw/\`. Removing them keeps the wiki in sync with git (the source of truth)."
echo ""
echo "| Removed page |"
echo "|--------------|"
for l in "${PRUNED[@]}"; do echo "| \`wiki/${l}.md\` |"; done
} > "$body"
# --- open the GATED removal PR on a chore/ branch (open-pr --branch override) ---
branch="chore/prune-orphans-${date_tag}"
pr_out="$( bash "${SCRIPTS}/open-pr.sh" \
--branch "$branch" \
--title "chore: prune ${#PRUNED[@]} orphaned source(s)" \
--body-file "$body" --base "${INGEST_BASE:-main}" 2>&1 )" && pr_rc=0 || pr_rc=$?
pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)"
# --- result line for n8n ---
jq -nc \
--arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \
--argjson count "${#PRUNED[@]}" \
--arg pr_url "$pr_url" \
--arg detail "$pr_out" \
--argjson pruned "$(printf '%s\n' "${PRUNED[@]}" | jq -R . | jq -s .)" \
'{status:$status, count:$count, pr_url:$pr_url, pruned:$pruned, detail:$detail}'
[[ $pr_rc -eq 0 ]] || exit 1

44
tests/index-remove.bats Normal file
View file

@ -0,0 +1,44 @@
#!/usr/bin/env bats
# tests/index-remove.bats — index-append.py --remove mode.
setup() {
load 'helpers'
export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
g_src="$(make_fixture_genome)"; export g="$g_src"
}
@test "index --remove: deletes the matching entry, keeps the others" {
cd "$g"
python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/a]] — A. `maturity: draft`'
python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/b]] — B. `maturity: draft`'
grep -q 'sources/a' wiki/index.md
grep -q 'sources/b' wiki/index.md
run python3 "$SKILL_SCRIPTS/index-append.py" --remove 'sources/a'
[ "$status" -eq 0 ]
! grep -q '\[\[sources/a\]\]' wiki/index.md
grep -q 'sources/b' wiki/index.md
}
@test "index --remove: idempotent when the entry is absent" {
cd "$g"
run python3 "$SKILL_SCRIPTS/index-append.py" --remove 'sources/does-not-exist'
[ "$status" -eq 0 ]
[[ "$output" == *'nothing to remove'* ]]
}
@test "index --remove: bumps last_updated" {
cd "$g"
python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/a]] — A. `maturity: draft`'
# set last_updated to an old date, then remove and check it moved
sed -i 's/^last_updated:.*/last_updated: 2000-01-01/' wiki/index.md
run python3 "$SKILL_SCRIPTS/index-append.py" --remove 'sources/a'
[ "$status" -eq 0 ]
! grep -q '2000-01-01' wiki/index.md
grep -q "last_updated: $(date +%F)" wiki/index.md
}
@test "index --remove: rejects passing both --entry and --remove" {
cd "$g"
run python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/a]] — x' --remove 'sources/a'
[ "$status" -eq 2 ]
}

68
tests/run-prune.bats Normal file
View file

@ -0,0 +1,68 @@
#!/usr/bin/env bats
# tests/run-prune.bats — prune orphaned sources (no LLM, no network; DRY_RUN).
setup() {
load 'helpers'
export PRUNE="${SKILL_SCRIPTS}/run-prune.sh"
export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
export INGEST_BASE="main"
export KG_LIB_DIR="${LIB_DIR}"
export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t"
export DRY_RUN=1
g_src="$(make_fixture_genome)"; export g_name="fixture-genome"
mv "$g_src" "${GENOMES_ROOT}/${g_name}"; export g="${GENOMES_ROOT}/${g_name}"
( cd "$g" && rm -f raw/articles/test.md && git add -A && git commit -q -m clear && git push -q )
}
@test "run-prune: removes only the orphaned source + its index entry, opens a dry PR" {
command -v jq >/dev/null 2>&1 || skip "jq not installed"
cd "$g"
# kept: raw exists. orphan: raw missing.
echo content > raw/articles/kept.md
h="$(sha256sum raw/articles/kept.md | cut -d' ' -f1)"
printf -- '---\nsource_path: raw/articles/kept.md\nsource_sha256: %s\n---\nbody\n' "$h" > wiki/sources/kept.md
printf -- '---\nsource_path: raw/articles/gone.md\nsource_sha256: abc\n---\nbody\n' > wiki/sources/orphan.md
python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/kept]] — kept. `maturity: draft`'
python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/orphan]] — orphan. `maturity: draft`'
git add -A && git commit -q -m setup && git push -q
run bash "$PRUNE" "$g_name"
[ "$status" -eq 0 ]
[[ "$output" == *'"status":"ok"'* ]]
[[ "$output" == *'"count":1'* ]]
# only the orphan page is gone
[ ! -f wiki/sources/orphan.md ]
[ -f wiki/sources/kept.md ]
# index reflects the removal
! grep -q 'sources/orphan' wiki/index.md
grep -q 'sources/kept' wiki/index.md
# committed on a chore/ branch (NOT feat/ai-ingest-*)
git rev-parse --verify "chore/prune-orphans-$(date +%F)"
}
@test "run-prune: no orphans -> count 0 and no PR/branch" {
command -v jq >/dev/null 2>&1 || skip "jq not installed"
cd "$g"
echo content > raw/articles/kept.md
h="$(sha256sum raw/articles/kept.md | cut -d' ' -f1)"
printf -- '---\nsource_path: raw/articles/kept.md\nsource_sha256: %s\n---\nbody\n' "$h" > wiki/sources/kept.md
git add -A && git commit -q -m setup && git push -q
run bash "$PRUNE" "$g_name"
[ "$status" -eq 0 ]
[[ "$output" == *'"count":0'* ]]
run git rev-parse --verify "chore/prune-orphans-$(date +%F)"
[ "$status" -ne 0 ]
}
@test "run-prune: refuses when an orphan path would escape wiki/ (defense in depth)" {
command -v jq >/dev/null 2>&1 || skip "jq not installed"
cd "$g"
# legacy page without source_path is ignored; a page with a missing raw is the orphan.
printf -- '---\nsource_path: raw/articles/gone.md\nsource_sha256: abc\n---\nbody\n' > wiki/sources/orphan.md
git add -A && git commit -q -m setup && git push -q
run bash "$PRUNE" "$g_name"
[ "$status" -eq 0 ]
[[ "$output" == *'"count":1'* ]]
[ ! -f wiki/sources/orphan.md ]
}