From e62ad0c831e35a5ed3c30afcb59c0de4b628cc26 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Sat, 27 Jun 2026 12:15:58 +0200 Subject: [PATCH 1/5] feat: Add `slug.sh --raw` for deterministic raw file slugging --- skills/ingest/scripts/slug.sh | 12 ++++++++++++ tests/slug.bats | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 tests/slug.bats diff --git a/skills/ingest/scripts/slug.sh b/skills/ingest/scripts/slug.sh index 2f7fdc5..b026b6b 100755 --- a/skills/ingest/scripts/slug.sh +++ b/skills/ingest/scripts/slug.sh @@ -7,6 +7,18 @@ # ============================================================================= set -euo pipefail +if [[ "${1:-}" == "--raw" ]]; then + raw="${2:?usage: slug.sh --raw }" + rel="${raw#raw/}"; rel="${rel#*/}" # strip "raw/" and the bucket name + rel="${rel%.*}" # strip extension + slug="$(printf '%s\n' "$rel" | tr '/' '\n' \ + | sed -E 's/[^a-zA-Z0-9]+/-/g; s/-{2,}/-/g; s/^-+//; s/-+$//' \ + | tr '[:upper:]' '[:lower:]' | paste -sd- -)" + [[ -n "$slug" ]] || { echo "slug: empty result for input '${raw}'" >&2; exit 1; } + printf '%s\n' "$slug" + exit 0 +fi + input="${1:?usage: slug.sh }" # Strip directory and extension when given a path diff --git a/tests/slug.bats b/tests/slug.bats new file mode 100644 index 0000000..75410b4 --- /dev/null +++ b/tests/slug.bats @@ -0,0 +1,30 @@ +#!/usr/bin/env bats + +setup() { + load 'helpers' + SLUG="${SKILL_SCRIPTS}/slug.sh" +} + +@test "slug --raw: flat file remains unchanged" { + run bash "$SLUG" --raw "raw/articles/il-pane.md" + [ "$status" -eq 0 ] + [ "$output" = "il-pane" ] +} + +@test "slug --raw: nested file gets folder prefix" { + run bash "$SLUG" --raw "raw/articles/cibo/il-pane.md" + [ "$status" -eq 0 ] + [ "$output" = "cibo-il-pane" ] +} + +@test "slug --raw: distinct subdirs avoid collision" { + s1="$(bash "$SLUG" --raw "raw/articles/cibo/pane.md")" + s2="$(bash "$SLUG" --raw "raw/articles/storia/pane.md")" + [ "$s1" != "$s2" ] +} + +@test "slug 
--raw: Bash and Python-calling-bash agree (single implementation)" { + b="$(bash "$SLUG" --raw "raw/articles/cibo/il-pane.md")" + p="$(python3 -c "import subprocess;print(subprocess.check_output(['bash','$SLUG','--raw','raw/articles/cibo/il-pane.md'],text=True).strip())")" + [ "$b" = "$p" ] +} From 0ff98e1ebd5616ecb9b6527374f0d69bec2f6102 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Sat, 27 Jun 2026 12:15:58 +0200 Subject: [PATCH 2/5] feat: Enhance ingest to track raw source path and SHA256 hash --- skills/ingest/scripts/ingest-semantic.py | 18 +++++++++-- skills/ingest/scripts/run-ingest.sh | 2 +- tests/ingest-semantic.bats | 29 ++++++++++++++++++ tests/run-ingest.bats | 38 ++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 tests/ingest-semantic.bats diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py index 09991e0..fec6582 100755 --- a/skills/ingest/scripts/ingest-semantic.py +++ b/skills/ingest/scripts/ingest-semantic.py @@ -20,7 +20,7 @@ # # Emits a single JSON status line on stdout (for n8n / logs). 
# ============================================================================= -import json, os, re, sys, datetime, urllib.request, urllib.error +import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error # --- config (override via env; these live in ~/.config/knowledge-genome.env) --- OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat") @@ -257,7 +257,16 @@ def call_model(): # --- run the semantic pass --- sem = call_model() -source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0]) + +# Source of truth: slug from slug.sh --raw (deterministic, path-aware, collision-proof) +source_slug = subprocess.check_output( + ["bash", os.path.join(os.path.dirname(__file__), "slug.sh"), "--raw", raw_rel], + text=True +).strip() + +with open(raw_rel, "rb") as f: + src_sha = hashlib.sha256(f.read()).hexdigest() + pages = [] # 1. source page — canonical summary of THIS source (re)written @@ -273,7 +282,10 @@ src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])] + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8] os.makedirs("wiki/sources", exist_ok=True) with open(src_path, "w", encoding="utf-8") as f: - f.write(frontmatter("source", src_title, src_tags)) + fm = frontmatter("source", src_title, src_tags) + # Inject tracking fields before the closing '---' (first newline-dash-dash-dash-newline) + fm = fm.replace("\n---\n", f"\nsource_path: {raw_rel}\nsource_sha256: {src_sha}\n---\n", 1) + f.write(fm) f.write(f"\n# {src_title}\n\n{src_body}\n") pages.append({"path": src_path, "summary": twords(src_title), diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index cb998cd..07197b2 100755 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -53,7 +53,7 @@ contradictions="$(jq -r '.contradictions // "None"' "$manifest")" [[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing" 
-slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}" +slug="$(bash "${SCRIPTS}/slug.sh" --raw "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}" # --- collect touched paths --- mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest") diff --git a/tests/ingest-semantic.bats b/tests/ingest-semantic.bats new file mode 100644 index 0000000..11e48b6 --- /dev/null +++ b/tests/ingest-semantic.bats @@ -0,0 +1,29 @@ +#!/usr/bin/env bats + +setup() { + load 'helpers' + source "$LIB_DIR/output.sh" + source "$LIB_DIR/lint.sh" +} + +@test "lint tolerates source_path/source_sha256 in source frontmatter" { + G="$(make_fixture_genome)" + mkdir -p "$G/wiki/sources" + cat > "$G/wiki/sources/test-source.md" <<'EOFMD' +--- +title: "Test Source" +type: source +domain: genome-test +maturity: draft +last_updated: 2026-06-25 +private: false +tags: [test] +source_path: raw/articles/test.md +source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +--- +# Test Source +body +EOFMD + run lint_markdown_file "$G/wiki/sources/test-source.md" genome-test + [ "$status" -eq 0 ] +} diff --git a/tests/run-ingest.bats b/tests/run-ingest.bats index 6b7c30b..b478743 100644 --- a/tests/run-ingest.bats +++ b/tests/run-ingest.bats @@ -171,3 +171,41 @@ EOF [ "$status" -eq 0 ] [[ "$output" == *"develop"* ]] } + +@test "run-ingest: branch name matches slug.sh --raw for nested raw paths" { + command -v jq >/dev/null 2>&1 || skip "jq not installed" + G="$(make_fixture_genome)"; cd "$G" + mkdir -p wiki/sources + cat > wiki/sources/cibo-il-pane.md <<'EOFMD' +--- +title: "Il Pane" +type: source +domain: genome-test +tags: [cibo] +maturity: draft +last_updated: 2026-06-25 +private: false +source_path: raw/articles/cibo/il-pane.md +source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +--- +# Il Pane +body +EOFMD + cat > .ingest-manifest.json <<'EOFJSON' 
+{ + "raw_source": "raw/articles/cibo/il-pane.md", + "model": "qwen3.5-9b", + "reasoning": "Ingest.", + "pr_summary": "Ingest summary.", + "contradictions": "None", + "pages": [ + {"path": "wiki/sources/cibo-il-pane.md", "summary": "Summary.", "maturity": "draft", "status": "created"} + ] +} +EOFJSON + export KG_LIB_DIR="$LIB_DIR" + export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" DRY_RUN=1 + run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test + [ "$status" -eq 0 ] + [[ "$output" == *"cibo-il-pane"* ]] +} From 918d632b41512e91c371005cf60aa560aafae70f Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Sat, 27 Jun 2026 12:15:59 +0200 Subject: [PATCH 3/5] feat: Implement `pending-raw.sh` to identify changed sources --- skills/ingest/scripts/pending-raw.sh | 64 ++++++++++++++++++++ tests/pending-raw.bats | 90 ++++++++++++++++++++++++++++ tests/permissions.bats | 1 + 3 files changed, 155 insertions(+) create mode 100755 skills/ingest/scripts/pending-raw.sh create mode 100644 tests/pending-raw.bats diff --git a/skills/ingest/scripts/pending-raw.sh b/skills/ingest/scripts/pending-raw.sh new file mode 100755 index 0000000..0bd1a21 --- /dev/null +++ b/skills/ingest/scripts/pending-raw.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# ============================================================================= +# pending-raw.sh — deterministic "what needs ingesting" calculator. +# Reads the clean base checkout and classifies each *.md found recursively under raw/articles/ as: +# new -> no wiki/sources/<slug>.md +# modified -> page exists but its source_sha256 != current file hash +# Emits the same JSON envelope as changed-raw (drop-in), plus detail[] for ntfy. 
+# ============================================================================= +set -euo pipefail + +genome="${1:?usage: pending-raw.sh <genome>}" +base_dir="${GENOMES_ROOT:-${HOME}/genomes}" +cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; } + +# Clean start on the configured base (Step 3 will extract this to lib/clean-start.sh). Abort with an error envelope if any step fails: errexit does not fire for a non-final command in an && list, so without the explicit || a failed fetch/switch/reset would be ignored and a stale checkout classified. +git fetch -q origin \ + && git switch -q "${INGEST_BASE:-main}" 2>/dev/null \ + && git reset -q --hard "origin/${INGEST_BASE:-main}" \ + && git clean -q -fd || { echo '{"status":"error","reason":"clean start failed"}'; exit 1; } + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SLUG="${SCRIPT_DIR}/slug.sh" + +declare -a NEW=() +declare -a MOD=() +declare -A SEEN_SLUG=() + +if [[ -d raw/articles ]]; then + while IFS= read -r -d '' f; do + rel="${f#./}" + case "$rel" in + */.stfolder/*|*/.stignore|*/.gitkeep) continue ;; + esac + slug="$("$SLUG" --raw "$rel")" || continue + + # Residual collision (two distinct raws -> same slug): warn, do not silence. + if [[ -n "${SEEN_SLUG[$slug]:-}" && "${SEEN_SLUG[$slug]}" != "$rel" ]]; then + logger -t pending-raw "warn: slug collision ${slug}: ${SEEN_SLUG[$slug]} <-> ${rel}" + fi + SEEN_SLUG[$slug]="$rel" + + page="wiki/sources/${slug}.md" + if [[ ! 
-f "$page" ]]; then + NEW+=("$rel") + else + cur="$(sha256sum "$rel" | cut -d' ' -f1)" + rec="$(sed -n 's/^source_sha256:[[:space:]]*//p' "$page" | tr -d '\r' | head -n1)" + if [[ "$cur" != "$rec" ]]; then + MOD+=("$rel") + fi + fi + done < <(find raw/articles -type f -name '*.md' -print0 2>/dev/null) +fi + +if [[ ${#NEW[@]} -eq 0 && ${#MOD[@]} -eq 0 ]]; then + echo '{"status":"ok","genome":"'"$genome"'","count":0,"files":[],"detail":[]}' +else + { + for x in "${NEW[@]}"; do printf '%s\tnew\n' "$x"; done + for x in "${MOD[@]}"; do printf '%s\tmodified\n' "$x"; done + } | jq -R 'split("\t") | {path: .[0], reason: .[1]}' \ + | jq -s --arg g "$genome" \ + '{status: "ok", genome: $g, count: length, files: [.[].path], detail: .}' +fi diff --git a/tests/pending-raw.bats b/tests/pending-raw.bats new file mode 100644 index 0000000..f43237f --- /dev/null +++ b/tests/pending-raw.bats @@ -0,0 +1,90 @@ +#!/usr/bin/env bats + +setup() { + load 'helpers' + export PENDING="${SKILL_SCRIPTS}/pending-raw.sh" + export GENOMES_ROOT="${BATS_TEST_TMPDIR}" + export INGEST_BASE="main" + + g_src="$(make_fixture_genome)" + export g_name="fixture-genome" + mv "$g_src" "${GENOMES_ROOT}/${g_name}" + export g="${GENOMES_ROOT}/${g_name}" + + # FIX: make_fixture_genome ships raw/articles/test.md with no source page, which would + # otherwise count as a permanent 'new' and break every count assertion. Clear it so each + # test controls exactly what is pending (verified: count base becomes 0). + ( cd "$g" && rm -f raw/articles/test.md && git add -A \ + && git commit -q -m "test: clear default raw" && git push -q ) +} + +@test "pending-raw: detects a brand new raw file" { + echo "new content" > "${g}/raw/articles/new-file.md" + ( cd "$g" && git add . 
&& git commit -q -m "add raw" && git push -q ) + run bash "$PENDING" "$g_name" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.count == 1' + echo "$output" | jq -e '.detail[0].path == "raw/articles/new-file.md"' + echo "$output" | jq -e '.detail[0].reason == "new"' +} + +@test "pending-raw: skips up-to-date files" { + echo "ok content" > "${g}/raw/articles/ok-file.md" + hash_ok="$(sha256sum "${g}/raw/articles/ok-file.md" | cut -d' ' -f1)" + cat > "${g}/wiki/sources/ok-file.md" < "${g}/raw/articles/mod-file.md" + hash_v1="$(sha256sum "${g}/raw/articles/mod-file.md" | cut -d' ' -f1)" + cat > "${g}/wiki/sources/mod-file.md" < "${g}/raw/articles/mod-file.md" + ( cd "$g" && git add . && git commit -q -m "v2" && git push -q ) + run bash "$PENDING" "$g_name" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.count == 1' + echo "$output" | jq -e '.detail[0].reason == "modified"' +} + +@test "pending-raw: nested subdirectory yields prefixed slug" { + mkdir -p "${g}/raw/articles/sub-b" + echo "subdir content" > "${g}/raw/articles/sub-b/file.md" + ( cd "$g" && git add . && git commit -q -m "subdir" && git push -q ) + run bash "$PENDING" "$g_name" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.count == 1' + echo "$output" | jq -e '.files[0] == "raw/articles/sub-b/file.md"' +} + +@test "pending-raw: excludes noise (.stfolder, .gitkeep)" { + touch "${g}/raw/articles/.gitkeep" + mkdir -p "${g}/raw/articles/.stfolder" + touch "${g}/raw/articles/.stfolder/sync.log" + ( cd "$g" && git add . && git commit -q -m "noise" && git push -q ) + run bash "$PENDING" "$g_name" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.count == 0' +} + +@test "pending-raw: reports both files on a slug collision" { + mkdir -p "${g}/raw/articles/cibo" + echo "c1" > "${g}/raw/articles/cibo-pane.md" + echo "c2" > "${g}/raw/articles/cibo/pane.md" + ( cd "$g" && git add . 
&& git commit -q -m "collision" && git push -q ) + run bash "$PENDING" "$g_name" + [ "$status" -eq 0 ] + echo "$output" | jq -e '.count == 2' +} diff --git a/tests/permissions.bats b/tests/permissions.bats index c71d32f..ebe9888 100644 --- a/tests/permissions.bats +++ b/tests/permissions.bats @@ -17,6 +17,7 @@ EXECUTABLES=( skills/ingest/scripts/open-pr.sh skills/ingest/scripts/log-append.sh skills/ingest/scripts/slug.sh + skills/ingest/scripts/pending-raw.sh skills/ingest/scripts/index-append.py scripts/add-genome.sh scripts/setup.sh From 32c722a6ae29cb3c847eb38ea3dbd4b5cd45491d Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Sat, 27 Jun 2026 12:15:59 +0200 Subject: [PATCH 4/5] chore: Integrate `pi pending-raw` command into `n8n-pi-wrap` --- deploy/vm101/n8n-pi-wrap | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/deploy/vm101/n8n-pi-wrap b/deploy/vm101/n8n-pi-wrap index 3baa288..2f79cf1 100755 --- a/deploy/vm101/n8n-pi-wrap +++ b/deploy/vm101/n8n-pi-wrap @@ -2,6 +2,15 @@ set -eu cmd="${SSH_ORIGINAL_COMMAND:-}" case "$cmd" in + "pi pending-raw "*) + genome="${cmd#pi pending-raw }" + case "$genome" in ""|*[!a-z0-9-]*) echo '{"status":"error","reason":"invalid genome name"}'; exit 1;; esac + logger -t n8n-pi-wrap "ok: pi pending-raw ${genome}" + set -a; . "${HOME}/.config/knowledge-genome.env" 2>/dev/null || true; set +a + # Run from the DEPLOYED skill dir (same place as ingest-semantic.py / run-ingest.sh on + # lines 54/59), so pending-raw.sh resolves its sibling slug.sh via BASH_SOURCE. 
+ exec "${HOME}/.pi/agent/skills/ingest/scripts/pending-raw.sh" "$genome" + ;; "pi run") logger -t n8n-pi-wrap "ok: pi run (prompt via stdin)" prompt=$(cat) From 1b19a0397194f74624f90de65cf9ed36e085397c Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Sat, 27 Jun 2026 12:17:03 +0200 Subject: [PATCH 5/5] Update version --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cfd7314..a45ae40 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # ============================================================================= -# Knowledge Genome - Makefile v. 1.8.1 +# Knowledge Genome - Makefile v. 1.9.0 # Orchestrates the setup and management of the knowledge base. # =============================================================================