diff --git a/Makefile b/Makefile
index 777d213..14bf331 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
# =============================================================================
-# Knowledge Genome - Makefile v. 1.9.1
+# Knowledge Genome - Makefile v. 1.10.0
# Orchestrates the setup and management of the knowledge base.
# =============================================================================
diff --git a/deploy/vm101/n8n-pi-wrap b/deploy/vm101/n8n-pi-wrap
index 2f79cf1..fc5321f 100755
--- a/deploy/vm101/n8n-pi-wrap
+++ b/deploy/vm101/n8n-pi-wrap
@@ -11,6 +11,13 @@ case "$cmd" in
# lines 54/59), so pending-raw.sh resolves its sibling slug.sh via BASH_SOURCE.
exec "${HOME}/.pi/agent/skills/ingest/scripts/pending-raw.sh" "$genome"
;;
+ "pi orphan-wiki "*)  # orphan report for one genome: "pi orphan-wiki <genome>"
+ genome="${cmd#pi orphan-wiki }"  # strip the command prefix; the remainder is the genome name
+ case "$genome" in ""|*[!a-z0-9-]*) echo '{"status":"error","reason":"invalid genome name"}'; exit 1;; esac  # reject empty names and anything outside [a-z0-9-]
+ logger -t n8n-pi-wrap "ok: pi orphan-wiki ${genome}"
+ set -a; . "${HOME}/.config/knowledge-genome.env" 2>/dev/null || true; set +a  # auto-export whatever the env file defines; a missing file is tolerated
+ exec "${HOME}/.pi/agent/skills/ingest/scripts/orphan-wiki.sh" "$genome"  # replace this wrapper process with the report script
+ ;;  # NOTE(review): this arm takes no per-genome lock, yet orphan-wiki.sh runs clean_start on the checkout — confirm it cannot overlap a running ingest
"pi run")
logger -t n8n-pi-wrap "ok: pi run (prompt via stdin)"
prompt=$(cat)
@@ -42,20 +49,25 @@ case "$cmd" in
esac
logger -t n8n-pi-wrap "ok: pi ingest ${genome} ${raw_path}"
+
+ # Per-genome lock: serialize writes; never two concurrent ingests on the same genome. The lock is held for as long as fd 9 stays open.
+ { exec 9>"/run/lock/kg-ingest-${genome}.lock"; } 2>/dev/null || exec 9>"/tmp/kg-ingest-${genome}.lock"  # braces scope 2>/dev/null to this one attempt; a bare `exec 9>… 2>/dev/null` would leave stderr on /dev/null for everything that follows
+ if ! flock -n 9; then  # non-blocking: report busy instead of queueing behind the running ingest
+ echo '{"status":"busy","reason":"another ingest is running for this genome","genome":"'"$genome"'"}'
+ exit 0
+ fi
+
set -a; . "${HOME}/.config/knowledge-genome.env"; set +a
cd "${GENOMES_ROOT}/${genome}" || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
# The raw file must actually exist under the genome's raw/ dir.
[ -f "$raw_path" ] || { echo '{"status":"error","reason":"raw file not found"}'; exit 1; }
- # Clean start on the configured base (develop), pinned to the remote. Destroys only
- # vm101's scratch checkout (never a shared branch, never a force-push) — this is by design.
- # `clean -fd` also removes leftover UNTRACKED files (e.g. wiki/sources/* or a stale
- # .ingest-manifest.json from a half-finished previous run) that `reset --hard` won't touch.
- git fetch -q origin \
- && git switch -q "${INGEST_BASE:-main}" 2>/dev/null \
- && git reset -q --hard "origin/${INGEST_BASE:-main}" \
- && git clean -q -fd
+ # Clean start on the configured base (single source of truth in lib/clean-start.sh).
+ : "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
+ source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
+ || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
+ clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
# SEMANTIC step: dedicated script drives pi to WRITE wiki pages + manifest.
# (NOT `pi -p "/skill:ingest ..."`, which makes the model reply in chat and write nothing.)
diff --git a/lib/clean-start.sh b/lib/clean-start.sh
new file mode 100644
index 0000000..545035b
--- /dev/null
+++ b/lib/clean-start.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/clean-start.sh — single source of truth for the pre-session reset.
+# Caller must already be INSIDE the genome checkout.
+# Aligns the working tree to origin/<base>. Never force-pushes a shared branch.
+# Tolerates a missing remote branch (first-setup scenario).
+# NOTE: sourced library — no `set -euo pipefail` (would leak into the caller).
+# =============================================================================
+
+#######################################
+# Reset the current checkout to a pristine copy of the base branch.
+# Globals:   INGEST_BASE (read) — base branch name; defaults to "main".
+# Arguments: none.
+# Returns:   0 on success; 1 when a git step or the remote query fails.
+#######################################
+clean_start() {
+  local base="${INGEST_BASE:-main}"
+  local probe_rc=0
+
+  git fetch -q origin || return 1
+
+  # --discard-changes: tracked edits left behind by a half-finished run must
+  # not block the switch. When the branch exists neither locally nor on the
+  # remote, fall back to creating it from the current HEAD.
+  git switch -q --discard-changes "$base" 2>/dev/null \
+    || git checkout -q -b "$base" \
+    || return 1
+
+  # ls-remote --exit-code: 0 = branch exists, 2 = no matching ref, anything
+  # else = the query itself failed. The full ref name keeps look-alikes such
+  # as refs/heads/foo/<base> from matching.
+  git ls-remote --exit-code --heads origin "refs/heads/${base}" >/dev/null 2>&1 || probe_rc=$?
+  case "$probe_rc" in
+    0) git reset -q --hard "origin/${base}" || return 1 ;;
+    2) ;; # first setup: nothing on the remote to align to
+    *) return 1 ;;
+  esac
+
+  # Drop untracked leftovers that `reset --hard` does not touch.
+  git clean -q -fd || return 1
+}
diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py
index fec6582..f73612e 100755
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@@ -20,7 +20,7 @@
#
# Emits a single JSON status line on stdout (for n8n / logs).
# =============================================================================
-import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error
+import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error, time
# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
@@ -209,15 +209,13 @@ SCHEMA = {
}
-def call_model():
- # format existing names as a human-readable list
+def call_model(max_retries=2, base_delay=2.0):
+ """Call Ollama with retry on transient errors (connection, timeout, malformed JSON).
+ Retries up to max_retries times with exponential backoff. Model output that is empty or
+ very short (< 10 chars) is treated as truncation and retried; longer malformed output is a model issue and is NOT retried."""
existing_ents = ", ".join(sorted(existing_entities)) or "(none yet)"
existing_conc = ", ".join(sorted(existing_concepts)) or "(none yet)"
-
- prompt = SYSTEM_PROMPT.format(
- existing_entities=existing_ents,
- existing_concepts=existing_conc,
- )
+ prompt = SYSTEM_PROMPT.format(existing_entities=existing_ents, existing_concepts=existing_conc)
payload = {
"model": MODEL,
@@ -227,33 +225,45 @@ def call_model():
"Source path: " + raw_rel + "\n\n--- SOURCE START ---\n"
+ source_text + "\n--- SOURCE END ---\n\nReturn the JSON now."},
],
- "format": SCHEMA, # schema-constrained generation
+ "format": SCHEMA,
"stream": False,
- # deterministic extraction; repetition penalties OFF for structured output
"options": {"temperature": 0.2, "repeat_penalty": 1.0, "num_ctx": NUM_CTX},
}
if THINK is not None:
payload["think"] = THINK.strip().lower() in ("1", "true", "yes", "on")
data = json.dumps(payload).encode("utf-8")
- req = urllib.request.Request(
- OLLAMA_URL, data=data, headers={"Content-Type": "application/json"})
- try:
- with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
- resp = json.loads(r.read().decode("utf-8"))
- except urllib.error.URLError as e:
- die("model", "ollama request failed: " + str(e))
- content = ((resp.get("message") or {}).get("content") or "").strip()
- # schema-constrained, but stay defensive if a model wraps it in a fence
- if content.startswith("```"):
- content = content.strip("`")
- brace = content.find("{")
- if brace >= 0:
- content = content[brace:]
- try:
- return json.loads(content)
- except json.JSONDecodeError as e:
- die("model", "model did not return valid JSON: " + str(e))
+ last_error = None
+ for attempt in range(max_retries + 1):
+ if attempt > 0:
+ delay = base_delay * (2 ** (attempt - 1))
+ print(f"call_model: retry {attempt}/{max_retries} after {delay}s: {last_error}", file=sys.stderr)
+ time.sleep(delay)
+
+ req = urllib.request.Request(OLLAMA_URL, data=data, headers={"Content-Type": "application/json"})
+ try:
+ with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
+ resp = json.loads(r.read().decode("utf-8"))
+ except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError) as e:
+ last_error = f"connection/transport error: {e}"; continue
+ except json.JSONDecodeError as e:
+ last_error = f"invalid JSON from Ollama API: {e}"; continue
+
+ content = ((resp.get("message") or {}).get("content") or "").strip()
+ if content.startswith("```"):
+ content = content.strip("`")
+ brace = content.find("{")
+ if brace >= 0:
+ content = content[brace:]
+ try:
+ return json.loads(content)
+ except json.JSONDecodeError as e:
+ last_error = f"model did not return valid JSON: {e}"
+ if len(content) < 10:
+ continue # likely truncated -> retry
+ break # long but malformed -> model issue, stop
+
+ die("model", last_error or "model call failed after retries")
# --- run the semantic pass ---
sem = call_model()
diff --git a/skills/ingest/scripts/log-append.sh b/skills/ingest/scripts/log-append.sh
index b3108a2..8c6e40a 100755
--- a/skills/ingest/scripts/log-append.sh
+++ b/skills/ingest/scripts/log-append.sh
@@ -21,6 +21,7 @@ while [[ $# -gt 0 ]]; do
--context) context="$2"; shift 2 ;;
--output) output="$2"; shift 2 ;;
--reasoning) reasoning="$2"; shift 2 ;;
+ --run-id) run_id_arg="$2"; shift 2 ;;
*) echo "log-append: unknown arg: $1" >&2; exit 1 ;;
esac
done
@@ -35,9 +36,15 @@ esac
[[ -f "$LOG_FILE" ]] || { echo "log-append: not found: $LOG_FILE" >&2; exit 1; }
-run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || python3 -c 'import uuid; print(uuid.uuid4())')"
+run_id="${run_id_arg:-$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || python3 -c 'import uuid; print(uuid.uuid4())')}"
today="$(date +%Y-%m-%d)"
+if grep -qF "run_id: \`${run_id}\`" "$LOG_FILE" 2>/dev/null; then
+ echo "log-append: run_id ${run_id} already present — skipping (idempotent)" >&2
+ echo "run_id=${run_id}"
+ exit 0
+fi
+
{
printf '\n## [%s] %s | %s\n\n' "$today" "$type" "$subject"
printf -- '- run_id: `%s`\n' "$run_id"
diff --git a/skills/ingest/scripts/orphan-wiki.sh b/skills/ingest/scripts/orphan-wiki.sh
new file mode 100755
index 0000000..686d167
--- /dev/null
+++ b/skills/ingest/scripts/orphan-wiki.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# =============================================================================
+# orphan-wiki.sh — find source pages whose raw source no longer exists.
+# Reads the first source_path: line of each wiki/sources/*.md. If the raw is gone,
+# the page is orphaned. Emits JSON envelope: {status, genome, count, files[], detail[]}.
+# Takes no lock (same policy as pending-raw).
+# NOTE(review): clean_start below resets and cleans the checkout, so this is not
+# strictly read-only — confirm callers never overlap it with a running ingest.
+# =============================================================================
+set -euo pipefail
+
+genome="${1:?usage: orphan-wiki.sh <genome>}"
+base_dir="${GENOMES_ROOT:-${HOME}/genomes}"
+cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
+
+# Clean start on the configured base (single source of truth in lib/clean-start.sh).
+: "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
+source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
+  || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
+clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
+
+# Collect every source page whose recorded raw file is missing.
+declare -a ORPH=()
+for page in wiki/sources/*.md; do
+  [[ -e "$page" ]] || continue
+  # First source_path line only, trailing whitespace/CR trimmed. A single sed that
+  # quits on the match replaces `sed | tr | head`, which could SIGPIPE under pipefail.
+  sp="$(sed -n '/^source_path:/{s/^source_path:[[:space:]]*//;s/[[:space:]]*$//;p;q;}' "$page")"
+  # Tolerate YAML-quoted values: drop one pair of matching surrounding quotes.
+  case "$sp" in
+    \"*\") sp=${sp#\"}; sp=${sp%\"} ;;
+    \'*\') sp=${sp#\'}; sp=${sp%\'} ;;
+  esac
+  # Pages without source_path are pre-Step-2 legacy: ignore, don't false-positive.
+  [[ -n "$sp" ]] || continue
+  [[ -f "$sp" ]] || ORPH+=("$page")
+done
+
+# jq builds the envelope so the genome name and page paths are always escaped.
+if [[ ${#ORPH[@]} -eq 0 ]]; then
+  jq -cn --arg g "$genome" '{status:"ok", genome:$g, count:0, files:[], detail:[]}'
+else
+  printf '%s\n' "${ORPH[@]}" \
+    | jq -Rn --arg g "$genome" \
+        '[inputs | {path: ., reason: "orphan"}] | {status:"ok", genome:$g, count:length, files:map(.path), detail:.}'
+fi
diff --git a/skills/ingest/scripts/pending-raw.sh b/skills/ingest/scripts/pending-raw.sh
index 0bd1a21..f82ce3d 100755
--- a/skills/ingest/scripts/pending-raw.sh
+++ b/skills/ingest/scripts/pending-raw.sh
@@ -12,11 +12,11 @@ genome="${1:?usage: pending-raw.sh }"
base_dir="${GENOMES_ROOT:-${HOME}/genomes}"
cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
-# Clean start on the configured base (Step 3 will extract this to lib/clean-start.sh).
-git fetch -q origin \
- && git switch -q "${INGEST_BASE:-main}" 2>/dev/null \
- && git reset -q --hard "origin/${INGEST_BASE:-main}" \
- && git clean -q -fd
+# Clean start on the configured base (single source of truth in lib/clean-start.sh).
+: "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
+source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
+ || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
+clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SLUG="${SCRIPT_DIR}/slug.sh"
diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh
index 07197b2..1b90882 100755
--- a/skills/ingest/scripts/run-ingest.sh
+++ b/skills/ingest/scripts/run-ingest.sh
@@ -107,8 +107,12 @@ done < <(jq -r '.pages[] | select(.status=="created")
| [.path, (.summary // ""), (.maturity // "draft")] | @tsv' "$manifest")
# --- 2. log entry ---
+# Stable run_id: deterministic from the input (raw path + content hash). Survives wrapper
+# re-runs and makes the append-only log idempotent (paired with the guard in log-append.sh).
+src_sha="$(sha256sum "$raw_source" 2>/dev/null | cut -d' ' -f1)" || true; src_sha="${src_sha:-unknown}"  # the pipeline's status is cut's unless pipefail is on, so test the value rather than the exit code
+run_id="$(printf '%s' "${raw_source}:${src_sha}" | sha256sum | cut -c1-16)"
out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"
-bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
+bash "${SCRIPTS}/log-append.sh" --run-id "$run_id" --type INGEST --subject "$slug" --model "$model" \
--context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" \
|| fail "log" "log-append failed"
diff --git a/tests/clean-start.bats b/tests/clean-start.bats
new file mode 100644
index 0000000..2be2326
--- /dev/null
+++ b/tests/clean-start.bats
@@ -0,0 +1,18 @@
+#!/usr/bin/env bats
+setup() {
+  load 'helpers'
+  source "${LIB_DIR}/clean-start.sh" 2>/dev/null || source "${REPO_ROOT}/lib/clean-start.sh" # fall back to the in-repo copy of the library
+}
+@test "clean_start: aligns to origin/base, reverts tracked edits, removes untracked" {
+  G="$(make_fixture_genome)"; cd "$G"
+  echo "from origin" >> wiki/index.md
+  git add -A && git commit -q -m "origin ahead" && git push -q
+  git reset --hard HEAD~1 # local BEHIND origin/main
+  echo "local junk" >> wiki/log.md # tracked edit, uncommitted
+  echo "scratch" > scratch.txt # genuinely untracked
+  INGEST_BASE="main" clean_start
+  git diff --quiet origin/main # aligned to origin
+  grep -q "from origin" wiki/index.md # forwarded to origin state
+  run grep -q "local junk" wiki/log.md; [ "$status" -eq 1 ] # tracked edit reverted (a bare `! cmd` mid-test never fails in Bats)
+  [ ! -f scratch.txt ] # untracked removed
+}
diff --git a/tests/orphan-wiki.bats b/tests/orphan-wiki.bats
new file mode 100644
index 0000000..c8fb8b8
--- /dev/null
+++ b/tests/orphan-wiki.bats
@@ -0,0 +1,38 @@
+#!/usr/bin/env bats
+setup() {
+ load 'helpers' # shared fixtures; presumably provides make_fixture_genome, SKILL_SCRIPTS and LIB_DIR — confirm in tests/helpers
+ export ORPHAN="${SKILL_SCRIPTS}/orphan-wiki.sh"
+ export GENOMES_ROOT="${BATS_TEST_TMPDIR}" # the script resolves genomes under this directory
+ export INGEST_BASE="main"
+ export KG_LIB_DIR="${LIB_DIR}" # orphan-wiki.sh sources clean-start.sh via KG_LIB_DIR
+ g_src="$(make_fixture_genome)"
+ export g_name="fixture-genome"
+ mv "$g_src" "${GENOMES_ROOT}/${g_name}" # place the fixture where the script looks: ${GENOMES_ROOT}/<genome>
+ export g="${GENOMES_ROOT}/${g_name}"
+ ( cd "$g" && rm -f raw/articles/test.md && git add -A && git commit -q -m "clear" && git push -q ) # remove the fixture's sample raw file and push that baseline to origin
+}
+@test "orphan-wiki: no orphans when raw and source page match" {
+ mkdir -p "${g}/raw/articles"; echo "content" > "${g}/raw/articles/existing.md"
+ hash="$(sha256sum "${g}/raw/articles/existing.md" | cut -d' ' -f1)"
+ mkdir -p "${g}/wiki/sources"
+ printf -- '---\nsource_path: raw/articles/existing.md\nsource_sha256: %s\n---\n' "$hash" > "${g}/wiki/sources/existing.md" # page whose source_path exists on disk
+ ( cd "$g" && git add . && git commit -q -m "setup" && git push -q ) # pushed because the script's clean_start resets the checkout to origin
+ run bash "$ORPHAN" "$g_name"
+ [ "$status" -eq 0 ]; echo "$output" | jq -e '.count == 0' # exit 0 and an empty report
+}
+@test "orphan-wiki: detects orphaned source page" {
+ mkdir -p "${g}/wiki/sources"
+ printf -- '---\nsource_path: raw/articles/deleted.md\nsource_sha256: abc123\n---\n' > "${g}/wiki/sources/orphaned.md" # source_path names a raw file this test never creates
+ ( cd "$g" && git add . && git commit -q -m "orphan" && git push -q )
+ run bash "$ORPHAN" "$g_name"
+ [ "$status" -eq 0 ]
+ echo "$output" | jq -e '.count == 1' # exactly one page reported
+ echo "$output" | jq -e '.detail[0].reason == "orphan"' # detail entries carry the reason string
+}
+@test "orphan-wiki: ignores legacy pages without source_path" {
+ mkdir -p "${g}/wiki/sources"
+ printf -- '---\ntitle: "Legacy"\ntype: source\n---\n' > "${g}/wiki/sources/legacy.md" # frontmatter with no source_path key
+ ( cd "$g" && git add . && git commit -q -m "legacy" && git push -q )
+ run bash "$ORPHAN" "$g_name"
+ [ "$status" -eq 0 ]; echo "$output" | jq -e '.count == 0' # legacy page is not counted as an orphan
+}
diff --git a/tests/pending-raw.bats b/tests/pending-raw.bats
index f43237f..f0f8407 100644
--- a/tests/pending-raw.bats
+++ b/tests/pending-raw.bats
@@ -5,6 +5,7 @@ setup() {
export PENDING="${SKILL_SCRIPTS}/pending-raw.sh"
export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
export INGEST_BASE="main"
+ export KG_LIB_DIR="${LIB_DIR}"
g_src="$(make_fixture_genome)"
export g_name="fixture-genome"
diff --git a/tests/permissions.bats b/tests/permissions.bats
index ebe9888..5850791 100644
--- a/tests/permissions.bats
+++ b/tests/permissions.bats
@@ -18,6 +18,7 @@ EXECUTABLES=(
skills/ingest/scripts/log-append.sh
skills/ingest/scripts/slug.sh
skills/ingest/scripts/pending-raw.sh
+ skills/ingest/scripts/orphan-wiki.sh
skills/ingest/scripts/index-append.py
scripts/add-genome.sh
scripts/setup.sh
@@ -29,7 +30,7 @@ EXECUTABLES=(
# Librerie sourced: NON devono essere eseguibili.
LIBRARIES=(
- lib/lint.sh lib/output.sh lib/deps.sh lib/git-crypt.sh lib/scaffold.sh lib/structure.sh
+ lib/lint.sh lib/output.sh lib/deps.sh lib/git-crypt.sh lib/scaffold.sh lib/structure.sh lib/clean-start.sh
providers/forgejo.sh providers/github.sh
registry.sh globals.env
)
diff --git a/tests/scripts.bats b/tests/scripts.bats
index 19f758e..cef17b7 100644
--- a/tests/scripts.bats
+++ b/tests/scripts.bats
@@ -86,3 +86,17 @@ EOF
python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — s. `maturity: draft`'
grep -q "^last_updated: $(date +%F)$" wiki/index.md
}
+
+@test "log-append: dedup on stable run_id prevents duplicate entries" {
+ G="$(make_fixture_genome)"; cd "$G"
+ stable_id="test-stable-run-id-001" # fixed id passed to both calls
+ run bash "$SKILL_SCRIPTS/log-append.sh" --run-id "$stable_id" --type INGEST --subject "test" --model "m" \
+ --context "[[raw/x]]" --output "[[sources/x]]" --reasoning "r"
+ [ "$status" -eq 0 ] # first call succeeds
+ run bash "$SKILL_SCRIPTS/log-append.sh" --run-id "$stable_id" --type INGEST --subject "test" --model "m" \
+ --context "[[raw/x]]" --output "[[sources/x]]" --reasoning "r"
+ [ "$status" -eq 0 ] # a duplicate is skipped, not treated as an error
+ [[ "$output" == *"already present"* ]] # skip notice emitted by log-append.sh on the second call
+ count="$(grep -cF "run_id: \`${stable_id}\`" wiki/log.md || true)" # grep -c exits 1 on zero matches; || true keeps the count
+ [ "$count" -eq 1 ] # the log holds exactly one entry for this run_id
+}