Knowledge Genome v1.10.0: shared clean-start library, `pi orphan-wiki` command,
per-genome ingest lock, model-call retries, idempotent log entries.

Reviewer revisions folded into this patch:
  * n8n-pi-wrap: the lock-file `exec` no longer redirects stderr permanently.
  * lib/clean-start.sh: only "branch absent on remote" (ls-remote status 2) skips
    the reset; the probe uses the full ref name; the switch discards leftovers.
  * ingest-semantic.py: client errors (HTTP 4xx) are not retried; transport and
    decoding errors raised while reading the body are handled; docstring matches
    the retry rules actually implemented.
  * run-ingest.sh: the "unknown" hash fallback no longer depends on pipefail.
  * orphan-wiki.sh: header comment corrected (the script does mutate the checkout).
  * tests/clean-start.bats: negated assertion made effective; regression test added.

diff --git a/Makefile b/Makefile
index 777d213..14bf331 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # =============================================================================
-# Knowledge Genome - Makefile v. 1.9.1
+# Knowledge Genome - Makefile v. 1.10.0
 # Orchestrates the setup and management of the knowledge base.
 # =============================================================================
 
diff --git a/deploy/vm101/n8n-pi-wrap b/deploy/vm101/n8n-pi-wrap
index 2f79cf1..fc5321f 100755
--- a/deploy/vm101/n8n-pi-wrap
+++ b/deploy/vm101/n8n-pi-wrap
@@ -11,6 +11,13 @@ case "$cmd" in
     # lines 54/59), so pending-raw.sh resolves its sibling slug.sh via BASH_SOURCE.
     exec "${HOME}/.pi/agent/skills/ingest/scripts/pending-raw.sh" "$genome"
     ;;
+  "pi orphan-wiki "*)
+    genome="${cmd#pi orphan-wiki }"
+    case "$genome" in ""|*[!a-z0-9-]*) echo '{"status":"error","reason":"invalid genome name"}'; exit 1;; esac
+    logger -t n8n-pi-wrap "ok: pi orphan-wiki ${genome}"
+    set -a; . "${HOME}/.config/knowledge-genome.env" 2>/dev/null || true; set +a
+    exec "${HOME}/.pi/agent/skills/ingest/scripts/orphan-wiki.sh" "$genome"
+    ;;
   "pi run")
     logger -t n8n-pi-wrap "ok: pi run (prompt via stdin)"
     prompt=$(cat)
@@ -42,20 +49,27 @@ case "$cmd" in
     esac
 
     logger -t n8n-pi-wrap "ok: pi ingest ${genome} ${raw_path}"
+
+    # Per-genome lock: serialize writes; never two concurrent ingests on the same genome.
+    # The braces scope 2>/dev/null to this one attempt: on a bare `exec`, redirections
+    # are permanent and would silence stderr for the rest of the ingest.
+    { exec 9>"/run/lock/kg-ingest-${genome}.lock"; } 2>/dev/null || exec 9>"/tmp/kg-ingest-${genome}.lock"
+    if ! flock -n 9; then
+      echo '{"status":"busy","reason":"another ingest is running for this genome","genome":"'"$genome"'"}'
+      exit 0
+    fi
+
     set -a; . "${HOME}/.config/knowledge-genome.env"; set +a
     cd "${GENOMES_ROOT}/${genome}" || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
 
     # The raw file must actually exist under the genome's raw/ dir.
     [ -f "$raw_path" ] || { echo '{"status":"error","reason":"raw file not found"}'; exit 1; }
 
-    # Clean start on the configured base (develop), pinned to the remote. Destroys only
-    # vm101's scratch checkout (never a shared branch, never a force-push) — this is by design.
-    # `clean -fd` also removes leftover UNTRACKED files (e.g. wiki/sources/* or a stale
-    # .ingest-manifest.json from a half-finished previous run) that `reset --hard` won't touch.
-    git fetch -q origin \
-      && git switch -q "${INGEST_BASE:-main}" 2>/dev/null \
-      && git reset -q --hard "origin/${INGEST_BASE:-main}" \
-      && git clean -q -fd
+    # Clean start on the configured base (single source of truth in lib/clean-start.sh).
+    : "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
+    source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
+      || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
+    clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
 
     # SEMANTIC step: dedicated script drives pi to WRITE wiki pages + manifest.
     # (NOT `pi -p "/skill:ingest ..."`, which makes the model reply in chat and write nothing.)
diff --git a/lib/clean-start.sh b/lib/clean-start.sh
new file mode 100644
index 0000000..545035b
--- /dev/null
+++ b/lib/clean-start.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# =============================================================================
+# lib/clean-start.sh — single source of truth for the pre-session reset.
+# Caller must already be INSIDE the genome checkout.
+# Aligns the working tree to origin/<base>. Never force-pushes a shared branch.
+# Tolerates a missing remote branch (first-setup scenario).
+# NOTE: sourced library — no `set -euo pipefail` (would leak into the caller).
+# =============================================================================
+
+# clean_start — reset the current checkout to a pristine copy of the base branch.
+# Globals:  INGEST_BASE (read) — base branch name, defaults to "main".
+# Returns:  0 when the tree is clean on the base branch (and aligned to
+#           origin/<base> if that branch exists on the remote); 1 otherwise.
+clean_start() {
+  local base="${INGEST_BASE:-main}"
+  local probe_rc=0
+  git fetch -q origin || return 1
+  # --discard-changes: tracked edits left behind by a crashed run on another
+  # branch must not block the switch (a clean start throws them away by design).
+  git switch -q --discard-changes "$base" 2>/dev/null \
+    || git checkout -q -b "$base" || return 1
+  # Probe the full ref name: a bare "$base" pattern also matches refs/heads/*/$base.
+  git ls-remote --exit-code --heads origin "refs/heads/${base}" >/dev/null 2>&1 \
+    || probe_rc=$?
+  case "$probe_rc" in
+    0) git reset -q --hard "origin/${base}" || return 1 ;;
+    2) ;;            # branch not on the remote yet (first setup): keep local state
+    *) return 1 ;;   # remote unreachable: never report an unaligned tree as clean
+  esac
+  git clean -q -fd || return 1
+}
diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py
index fec6582..f73612e 100755
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@@ -20,7 +20,7 @@
 #
 # Emits a single JSON status line on stdout (for n8n / logs).
 # =============================================================================
-import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error
+import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error, time, http.client
 
 # --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
 OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
@@ -209,15 +209,19 @@ SCHEMA = {
 }
 
 
-def call_model():
-    # format existing names as a human-readable list
+def call_model(max_retries=2, base_delay=2.0):
+    """Call Ollama and return the parsed JSON object produced by the model.
+
+    Transient failures are retried up to ``max_retries`` times with exponential
+    backoff (``base_delay``, doubled per attempt): connection/transport errors,
+    timeouts, HTTP 408/429/5xx, an unparseable API envelope, and a near-empty
+    (< 10 chars, likely truncated) model reply. Other HTTP 4xx responses and
+    long-but-malformed model output are not retried: they will not fix
+    themselves. On final failure this calls ``die("model", ...)``.
+    """
     existing_ents = ", ".join(sorted(existing_entities)) or "(none yet)"
     existing_conc = ", ".join(sorted(existing_concepts)) or "(none yet)"
-
-    prompt = SYSTEM_PROMPT.format(
-        existing_entities=existing_ents,
-        existing_concepts=existing_conc,
-    )
+    prompt = SYSTEM_PROMPT.format(existing_entities=existing_ents, existing_concepts=existing_conc)
 
     payload = {
         "model": MODEL,
@@ -227,33 +231,57 @@ def call_model():
             "Source path: " + raw_rel + "\n\n--- SOURCE START ---\n" + source_text
             + "\n--- SOURCE END ---\n\nReturn the JSON now."},
         ],
         "format": SCHEMA,  # schema-constrained generation
         "stream": False,
         # deterministic extraction; repetition penalties OFF for structured output
         "options": {"temperature": 0.2, "repeat_penalty": 1.0, "num_ctx": NUM_CTX},
     }
 
     if THINK is not None:
         payload["think"] = THINK.strip().lower() in ("1", "true", "yes", "on")
     data = json.dumps(payload).encode("utf-8")
-    req = urllib.request.Request(
-        OLLAMA_URL, data=data, headers={"Content-Type": "application/json"})
-    try:
-        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
-            resp = json.loads(r.read().decode("utf-8"))
-    except urllib.error.URLError as e:
-        die("model", "ollama request failed: " + str(e))
-    content = ((resp.get("message") or {}).get("content") or "").strip()
-    # schema-constrained, but stay defensive if a model wraps it in a fence
-    if content.startswith("```"):
-        content = content.strip("`")
-    brace = content.find("{")
-    if brace >= 0:
-        content = content[brace:]
-    try:
-        return json.loads(content)
-    except json.JSONDecodeError as e:
-        die("model", "model did not return valid JSON: " + str(e))
+    last_error = None
+    for attempt in range(max_retries + 1):
+        if attempt > 0:
+            delay = base_delay * (2 ** (attempt - 1))
+            print(f"call_model: retry {attempt}/{max_retries} after {delay}s: {last_error}", file=sys.stderr)
+            time.sleep(delay)
+
+        req = urllib.request.Request(OLLAMA_URL, data=data, headers={"Content-Type": "application/json"})
+        try:
+            with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
+                resp = json.loads(r.read().decode("utf-8"))
+        except urllib.error.HTTPError as e:
+            last_error = f"ollama returned HTTP {e.code}: {e.reason}"
+            if 400 <= e.code < 500 and e.code not in (408, 429):
+                break  # client error (bad request, unknown model): a retry cannot help
+            continue
+        except (OSError, http.client.HTTPException) as e:
+            # OSError covers URLError, TimeoutError/socket.timeout and connection resets;
+            # HTTPException covers IncompleteRead raised while reading the body.
+            last_error = f"connection/transport error: {e}"
+            continue
+        except ValueError as e:
+            # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
+            last_error = f"invalid JSON from Ollama API: {e}"
+            continue
+
+        content = ((resp.get("message") or {}).get("content") or "").strip()
+        # schema-constrained, but stay defensive if a model wraps it in a fence
+        if content.startswith("```"):
+            content = content.strip("`")
+        brace = content.find("{")
+        if brace >= 0:
+            content = content[brace:]
+        try:
+            return json.loads(content)
+        except json.JSONDecodeError as e:
+            last_error = f"model did not return valid JSON: {e}"
+            if len(content) < 10:
+                continue  # likely truncated -> retry
+            break  # long but malformed -> model issue, stop
+
+    die("model", last_error or "model call failed after retries")
 
 # --- run the semantic pass ---
 sem = call_model()
diff --git a/skills/ingest/scripts/log-append.sh b/skills/ingest/scripts/log-append.sh
index b3108a2..8c6e40a 100755
--- a/skills/ingest/scripts/log-append.sh
+++ b/skills/ingest/scripts/log-append.sh
@@ -21,6 +21,7 @@ while [[ $# -gt 0 ]]; do
     --context) context="$2"; shift 2 ;;
     --output) output="$2"; shift 2 ;;
     --reasoning) reasoning="$2"; shift 2 ;;
+    --run-id) run_id_arg="$2"; shift 2 ;;
     *) echo "log-append: unknown arg: $1" >&2; exit 1 ;;
   esac
 done
@@ -35,9 +36,15 @@ esac
 
 [[ -f "$LOG_FILE" ]] || { echo "log-append: not found: $LOG_FILE" >&2; exit 1; }
 
-run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || python3 -c 'import uuid; print(uuid.uuid4())')"
+run_id="${run_id_arg:-$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || python3 -c 'import uuid; print(uuid.uuid4())')}"
 today="$(date +%Y-%m-%d)"
 
+if grep -qF "run_id: \`${run_id}\`" "$LOG_FILE" 2>/dev/null; then
+  echo "log-append: run_id ${run_id} already present — skipping (idempotent)" >&2
+  echo "run_id=${run_id}"
+  exit 0
+fi
+
 {
   printf '\n## [%s] %s | %s\n\n' "$today" "$type" "$subject"
   printf -- '- run_id: `%s`\n' "$run_id"
diff --git a/skills/ingest/scripts/orphan-wiki.sh b/skills/ingest/scripts/orphan-wiki.sh
new file mode 100755
index 0000000..686d167
--- /dev/null
+++ b/skills/ingest/scripts/orphan-wiki.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# =============================================================================
+# orphan-wiki.sh — find source pages whose raw source no longer exists.
+# Reads source_path from each wiki/sources/*.md frontmatter. If the raw is gone,
+# the page is orphaned. Emits JSON envelope: {status, genome, count, files[], detail[]}.
+# Read-only with respect to wiki content, but NOT to the checkout: clean_start
+# resets and cleans the working tree. It takes no lock (same policy as pending-raw).
+# NOTE(review): run concurrently with `pi ingest` on the same genome, the reset/clean
+# can wipe that ingest's in-progress files — confirm n8n never overlaps the two,
+# or take the kg-ingest lock here as well.
+# =============================================================================
+set -euo pipefail
+
+genome="${1:?usage: orphan-wiki.sh <genome>}"
+base_dir="${GENOMES_ROOT:-${HOME}/genomes}"
+cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
+
+# Clean start on the configured base (single source of truth in lib/clean-start.sh).
+: "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
+source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
+  || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
+clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
+
+declare -a ORPH=()
+for page in wiki/sources/*.md; do
+  [[ -e "$page" ]] || continue
+  sp="$(sed -n 's/^source_path:[[:space:]]*//p' "$page" | tr -d '\r' | head -n1)"
+  # Pages without source_path are pre-Step-2 legacy: ignore, don't false-positive.
+  [[ -n "$sp" ]] || continue
+  [[ -f "$sp" ]] || ORPH+=("$page")
+done
+
+if [[ ${#ORPH[@]} -eq 0 ]]; then
+  echo '{"status":"ok","genome":"'"$genome"'","count":0,"files":[],"detail":[]}'
+else
+  for x in "${ORPH[@]}"; do printf '%s\torphan\n' "$x"; done \
+    | jq -R 'split("\t") | {path: .[0], reason: .[1]}' \
+    | jq -s --arg g "$genome" '{status:"ok", genome:$g, count:length, files:[.[].path], detail:.}'
+fi
diff --git a/skills/ingest/scripts/pending-raw.sh b/skills/ingest/scripts/pending-raw.sh
index 0bd1a21..f82ce3d 100755
--- a/skills/ingest/scripts/pending-raw.sh
+++ b/skills/ingest/scripts/pending-raw.sh
@@ -12,11 +12,11 @@ genome="${1:?usage: pending-raw.sh <genome>}"
 base_dir="${GENOMES_ROOT:-${HOME}/genomes}"
 cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
 
-# Clean start on the configured base (Step 3 will extract this to lib/clean-start.sh).
-git fetch -q origin \
-  && git switch -q "${INGEST_BASE:-main}" 2>/dev/null \
-  && git reset -q --hard "origin/${INGEST_BASE:-main}" \
-  && git clean -q -fd
+# Clean start on the configured base (single source of truth in lib/clean-start.sh).
+: "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
+source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
+  || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
+clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SLUG="${SCRIPT_DIR}/slug.sh"
diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh
index 07197b2..1b90882 100755
--- a/skills/ingest/scripts/run-ingest.sh
+++ b/skills/ingest/scripts/run-ingest.sh
@@ -107,8 +107,13 @@
 done < <(jq -r '.pages[] | select(.status=="created") | [.path, (.summary // ""), (.maturity // "draft")] | @tsv' "$manifest")
 
 # --- 2. log entry ---
+# Stable run_id: deterministic from the input (raw path + content hash). Survives wrapper
+# re-runs and makes the append-only log idempotent (paired with the guard in log-append.sh).
+src_sha="$(sha256sum "$raw_source" 2>/dev/null | cut -d' ' -f1)" || true
+src_sha="${src_sha:-unknown}"  # without pipefail the pipeline's status is cut's, so test the value
+run_id="$(printf '%s' "${raw_source}:${src_sha}" | sha256sum | cut -c1-16)"
 out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"
-bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
+bash "${SCRIPTS}/log-append.sh" --run-id "$run_id" --type INGEST --subject "$slug" --model "$model" \
   --context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" \
   || fail "log" "log-append failed"
 
diff --git a/tests/clean-start.bats b/tests/clean-start.bats
new file mode 100644
index 0000000..2be2326
--- /dev/null
+++ b/tests/clean-start.bats
@@ -0,0 +1,28 @@
+#!/usr/bin/env bats
+setup() {
+  load 'helpers'
+  source "${LIB_DIR}/clean-start.sh" 2>/dev/null || source "${REPO_ROOT}/lib/clean-start.sh"
+}
+@test "clean_start: aligns to origin/base, reverts tracked edits, removes untracked" {
+  G="$(make_fixture_genome)"; cd "$G"
+  echo "from origin" >> wiki/index.md
+  git add -A && git commit -q -m "origin ahead" && git push -q
+  git reset --hard HEAD~1                    # local BEHIND origin/main
+  echo "local junk" >> wiki/log.md           # tracked edit, uncommitted
+  echo "scratch" > scratch.txt               # genuinely untracked
+  INGEST_BASE="main" clean_start
+  git diff --quiet origin/main               # aligned to origin
+  grep -q "from origin" wiki/index.md        # forwarded to origin state
+  run grep -q "local junk" wiki/log.md       # tracked edit reverted
+  [ "$status" -ne 0 ]                        # (bats ignores a failing `! cmd` unless it is last)
+  [ ! -f scratch.txt ]                       # untracked removed
+}
+@test "clean_start: recovers when a crashed run left a dirty tree on another branch" {
+  G="$(make_fixture_genome)"; cd "$G"
+  git switch -q -c ingest/stale
+  echo "stale commit" >> wiki/log.md && git commit -q -am "stale"
+  echo "stale edit" >> wiki/log.md           # uncommitted edit that would block a plain switch
+  INGEST_BASE="main" clean_start
+  [ "$(git rev-parse --abbrev-ref HEAD)" = "main" ]
+  git diff --quiet origin/main
+}
diff --git a/tests/orphan-wiki.bats b/tests/orphan-wiki.bats
new file mode 100644
index 0000000..c8fb8b8
--- /dev/null
+++ b/tests/orphan-wiki.bats
@@ -0,0 +1,38 @@
+#!/usr/bin/env bats
+setup() {
+  load 'helpers'
+  export ORPHAN="${SKILL_SCRIPTS}/orphan-wiki.sh"
+  export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
+  export INGEST_BASE="main"
+  export KG_LIB_DIR="${LIB_DIR}"   # orphan-wiki.sh sources clean-start.sh via KG_LIB_DIR
+  g_src="$(make_fixture_genome)"
+  export g_name="fixture-genome"
+  mv "$g_src" "${GENOMES_ROOT}/${g_name}"
+  export g="${GENOMES_ROOT}/${g_name}"
+  ( cd "$g" && rm -f raw/articles/test.md && git add -A && git commit -q -m "clear" && git push -q )
+}
+@test "orphan-wiki: no orphans when raw and source page match" {
+  mkdir -p "${g}/raw/articles"; echo "content" > "${g}/raw/articles/existing.md"
+  hash="$(sha256sum "${g}/raw/articles/existing.md" | cut -d' ' -f1)"
+  mkdir -p "${g}/wiki/sources"
+  printf -- '---\nsource_path: raw/articles/existing.md\nsource_sha256: %s\n---\n' "$hash" > "${g}/wiki/sources/existing.md"
+  ( cd "$g" && git add . && git commit -q -m "setup" && git push -q )
+  run bash "$ORPHAN" "$g_name"
+  [ "$status" -eq 0 ]; echo "$output" | jq -e '.count == 0'
+}
+@test "orphan-wiki: detects orphaned source page" {
+  mkdir -p "${g}/wiki/sources"
+  printf -- '---\nsource_path: raw/articles/deleted.md\nsource_sha256: abc123\n---\n' > "${g}/wiki/sources/orphaned.md"
+  ( cd "$g" && git add . && git commit -q -m "orphan" && git push -q )
+  run bash "$ORPHAN" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 1'
+  echo "$output" | jq -e '.detail[0].reason == "orphan"'
+}
+@test "orphan-wiki: ignores legacy pages without source_path" {
+  mkdir -p "${g}/wiki/sources"
+  printf -- '---\ntitle: "Legacy"\ntype: source\n---\n' > "${g}/wiki/sources/legacy.md"
+  ( cd "$g" && git add . && git commit -q -m "legacy" && git push -q )
+  run bash "$ORPHAN" "$g_name"
+  [ "$status" -eq 0 ]; echo "$output" | jq -e '.count == 0'
+}
diff --git a/tests/pending-raw.bats b/tests/pending-raw.bats
index f43237f..f0f8407 100644
--- a/tests/pending-raw.bats
+++ b/tests/pending-raw.bats
@@ -5,6 +5,7 @@ setup() {
   export PENDING="${SKILL_SCRIPTS}/pending-raw.sh"
   export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
   export INGEST_BASE="main"
+  export KG_LIB_DIR="${LIB_DIR}"
 
   g_src="$(make_fixture_genome)"
   export g_name="fixture-genome"
diff --git a/tests/permissions.bats b/tests/permissions.bats
index ebe9888..5850791 100644
--- a/tests/permissions.bats
+++ b/tests/permissions.bats
@@ -18,6 +18,7 @@ EXECUTABLES=(
   skills/ingest/scripts/log-append.sh
   skills/ingest/scripts/slug.sh
   skills/ingest/scripts/pending-raw.sh
+  skills/ingest/scripts/orphan-wiki.sh
   skills/ingest/scripts/index-append.py
   scripts/add-genome.sh
   scripts/setup.sh
@@ -29,7 +30,7 @@ EXECUTABLES=(
 
 # Librerie sourced: NON devono essere eseguibili.
 LIBRARIES=(
-  lib/lint.sh lib/output.sh lib/deps.sh lib/git-crypt.sh lib/scaffold.sh lib/structure.sh
+  lib/lint.sh lib/output.sh lib/deps.sh lib/git-crypt.sh lib/scaffold.sh lib/structure.sh lib/clean-start.sh
   providers/forgejo.sh providers/github.sh registry.sh globals.env
 )
 
diff --git a/tests/scripts.bats b/tests/scripts.bats
index 19f758e..cef17b7 100644
--- a/tests/scripts.bats
+++ b/tests/scripts.bats
@@ -86,3 +86,17 @@ EOF
   python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — s. `maturity: draft`'
   grep -q "^last_updated: $(date +%F)$" wiki/index.md
 }
+
+@test "log-append: dedup on stable run_id prevents duplicate entries" {
+  G="$(make_fixture_genome)"; cd "$G"
+  stable_id="test-stable-run-id-001"
+  run bash "$SKILL_SCRIPTS/log-append.sh" --run-id "$stable_id" --type INGEST --subject "test" --model "m" \
+    --context "[[raw/x]]" --output "[[sources/x]]" --reasoning "r"
+  [ "$status" -eq 0 ]
+  run bash "$SKILL_SCRIPTS/log-append.sh" --run-id "$stable_id" --type INGEST --subject "test" --model "m" \
+    --context "[[raw/x]]" --output "[[sources/x]]" --reasoning "r"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"already present"* ]]
+  count="$(grep -cF "run_id: \`${stable_id}\`" wiki/log.md || true)"
+  [ "$count" -eq 1 ]
+}