feat: Enhance ingest to track raw source path and SHA256 hash

This commit is contained in:
Matteo Cherubini 2026-06-27 12:15:58 +02:00
parent e62ad0c831
commit 0ff98e1ebd
4 changed files with 83 additions and 4 deletions

View file

@ -20,7 +20,7 @@
# #
# Emits a single JSON status line on stdout (for n8n / logs). # Emits a single JSON status line on stdout (for n8n / logs).
# ============================================================================= # =============================================================================
import json, os, re, sys, datetime, urllib.request, urllib.error import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error
# --- config (override via env; these live in ~/.config/knowledge-genome.env) --- # --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat") OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
@ -257,7 +257,16 @@ def call_model():
# --- run the semantic pass --- # --- run the semantic pass ---
sem = call_model() sem = call_model()
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
# Source of truth: slug from slug.sh --raw (deterministic, path-aware, collision-proof)
source_slug = subprocess.check_output(
["bash", os.path.join(os.path.dirname(__file__), "slug.sh"), "--raw", raw_rel],
text=True
).strip()
with open(raw_rel, "rb") as f:
src_sha = hashlib.sha256(f.read()).hexdigest()
pages = [] pages = []
# 1. source page — canonical summary of THIS source (re)written # 1. source page — canonical summary of THIS source (re)written
@ -273,7 +282,10 @@ src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8] + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
os.makedirs("wiki/sources", exist_ok=True) os.makedirs("wiki/sources", exist_ok=True)
with open(src_path, "w", encoding="utf-8") as f: with open(src_path, "w", encoding="utf-8") as f:
f.write(frontmatter("source", src_title, src_tags)) fm = frontmatter("source", src_title, src_tags)
# Inject tracking fields before the closing '---' (first newline-dash-dash-dash-newline)
fm = fm.replace("\n---\n", f"\nsource_path: {raw_rel}\nsource_sha256: {src_sha}\n---\n", 1)
f.write(fm)
f.write(f"\n# {src_title}\n\n{src_body}\n") f.write(f"\n# {src_title}\n\n{src_body}\n")
pages.append({"path": src_path, pages.append({"path": src_path,
"summary": twords(src_title), "summary": twords(src_title),

View file

@ -53,7 +53,7 @@ contradictions="$(jq -r '.contradictions // "None"' "$manifest")"
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing" [[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}" slug="$(bash "${SCRIPTS}/slug.sh" --raw "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
# --- collect touched paths --- # --- collect touched paths ---
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest") mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")

View file

@ -0,0 +1,29 @@
#!/usr/bin/env bats
# Bats per-test hook: loads the shared 'helpers' file, then sources the
# output and lint libraries so their functions are callable from the tests.
setup() {
load 'helpers'
# NOTE(review): LIB_DIR is not defined in this file — presumably set by
# 'helpers'; confirm. output.sh is sourced before lint.sh.
source "$LIB_DIR/output.sh"
source "$LIB_DIR/lint.sh"
}
# Regression test: a source page whose frontmatter carries the extra
# source_path / source_sha256 keys must still pass lint_markdown_file.
@test "lint tolerates source_path/source_sha256 in source frontmatter" {
# make_fixture_genome comes from outside this file; its output is used as the
# genome root directory below (presumably a fresh temp dir — confirm).
G="$(make_fixture_genome)"
mkdir -p "$G/wiki/sources"
# Fixture page. The heredoc delimiter is quoted, so the body is written
# verbatim with no variable expansion.
cat > "$G/wiki/sources/test-source.md" <<'EOFMD'
---
title: "Test Source"
type: source
domain: genome-test
maturity: draft
last_updated: 2026-06-25
private: false
tags: [test]
source_path: raw/articles/test.md
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
---
# Test Source
body
EOFMD
# Lint the fixture; the second argument matches the fixture's 'domain' value.
run lint_markdown_file "$G/wiki/sources/test-source.md" genome-test
# Only the exit status is checked; lint output is not inspected.
[ "$status" -eq 0 ]
}

View file

@ -171,3 +171,41 @@ EOF
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
[[ "$output" == *"develop"* ]] [[ "$output" == *"develop"* ]]
} }
# Runs run-ingest.sh against a manifest whose raw_source is a nested path
# (raw/articles/cibo/il-pane.md) and expects the slug "cibo-il-pane" to
# appear in the script's output.
@test "run-ingest: branch name matches slug.sh --raw for nested raw paths" {
# The test depends on jq being installed; skip rather than fail when it is absent.
command -v jq >/dev/null 2>&1 || skip "jq not installed"
# make_fixture_genome comes from outside this file; its output is used as the
# working directory for the rest of the test (presumably a temp genome — confirm).
G="$(make_fixture_genome)"; cd "$G"
mkdir -p wiki/sources
# Source page fixture, including the source_path / source_sha256 tracking keys.
# Quoted heredoc delimiter: body is written verbatim.
cat > wiki/sources/cibo-il-pane.md <<'EOFMD'
---
title: "Il Pane"
type: source
domain: genome-test
tags: [cibo]
maturity: draft
last_updated: 2026-06-25
private: false
source_path: raw/articles/cibo/il-pane.md
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
---
# Il Pane
body
EOFMD
# Ingest manifest listing the page above as "created", with the nested raw_source.
cat > .ingest-manifest.json <<'EOFJSON'
{
"raw_source": "raw/articles/cibo/il-pane.md",
"model": "qwen3.5-9b",
"reasoning": "Ingest.",
"pr_summary": "Ingest summary.",
"contradictions": "None",
"pages": [
{"path": "wiki/sources/cibo-il-pane.md", "summary": "Summary.", "maturity": "draft", "status": "created"}
]
}
EOFJSON
# Environment for the script under test. The Forgejo values are placeholders;
# DRY_RUN=1 presumably keeps the script from contacting Forgejo — confirm.
export KG_LIB_DIR="$LIB_DIR"
export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" DRY_RUN=1
run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
[ "$status" -eq 0 ]
# NOTE(review): this is a substring match on the whole output, and the manifest
# page path above also contains "cibo-il-pane". If run-ingest.sh echoes touched
# paths, this would pass even with a wrong branch name — consider matching the
# branch line specifically.
[[ "$output" == *"cibo-il-pane"* ]]
}