feat: Enhance ingest to track raw source path and SHA256 hash
This commit is contained in:
parent
e62ad0c831
commit
0ff98e1ebd
4 changed files with 83 additions and 4 deletions
|
|
@@ -20,7 +20,7 @@
|
||||||
#
|
#
|
||||||
# Emits a single JSON status line on stdout (for n8n / logs).
|
# Emits a single JSON status line on stdout (for n8n / logs).
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
import json, os, re, sys, datetime, urllib.request, urllib.error
|
import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error
|
||||||
|
|
||||||
# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
|
# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
|
||||||
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
|
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
|
||||||
|
|
@@ -257,7 +257,16 @@ def call_model():
|
||||||
|
|
||||||
# --- run the semantic pass ---
|
# --- run the semantic pass ---
|
||||||
sem = call_model()
|
sem = call_model()
|
||||||
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
|
|
||||||
|
# Source of truth: slug from slug.sh --raw (deterministic, path-aware, collision-proof)
|
||||||
|
source_slug = subprocess.check_output(
|
||||||
|
["bash", os.path.join(os.path.dirname(__file__), "slug.sh"), "--raw", raw_rel],
|
||||||
|
text=True
|
||||||
|
).strip()
|
||||||
|
|
||||||
|
with open(raw_rel, "rb") as f:
|
||||||
|
src_sha = hashlib.sha256(f.read()).hexdigest()
|
||||||
|
|
||||||
pages = []
|
pages = []
|
||||||
|
|
||||||
# 1. source page — canonical summary of THIS source (re)written
|
# 1. source page — canonical summary of THIS source (re)written
|
||||||
|
|
@@ -273,7 +282,10 @@ src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
|
||||||
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
|
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
|
||||||
os.makedirs("wiki/sources", exist_ok=True)
|
os.makedirs("wiki/sources", exist_ok=True)
|
||||||
with open(src_path, "w", encoding="utf-8") as f:
|
with open(src_path, "w", encoding="utf-8") as f:
|
||||||
f.write(frontmatter("source", src_title, src_tags))
|
fm = frontmatter("source", src_title, src_tags)
|
||||||
|
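# Inject tracking fields just before the closing '---': replace only the first "\n---\n" (count=1). This assumes frontmatter() emits the opening '---' on its very first line, so only the closing delimiter is preceded by a newline.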
|
||||||
|
fm = fm.replace("\n---\n", f"\nsource_path: {raw_rel}\nsource_sha256: {src_sha}\n---\n", 1)
|
||||||
|
f.write(fm)
|
||||||
f.write(f"\n# {src_title}\n\n{src_body}\n")
|
f.write(f"\n# {src_title}\n\n{src_body}\n")
|
||||||
pages.append({"path": src_path,
|
pages.append({"path": src_path,
|
||||||
"summary": twords(src_title),
|
"summary": twords(src_title),
|
||||||
|
|
|
||||||
|
|
@@ -53,7 +53,7 @@ contradictions="$(jq -r '.contradictions // "None"' "$manifest")"
|
||||||
|
|
||||||
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"
|
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"
|
||||||
|
|
||||||
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
|
slug="$(bash "${SCRIPTS}/slug.sh" --raw "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
|
||||||
|
|
||||||
# --- collect touched paths ---
|
# --- collect touched paths ---
|
||||||
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")
|
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")
|
||||||
|
|
|
||||||
29
tests/ingest-semantic.bats
Normal file
29
tests/ingest-semantic.bats
Normal file
|
|
@@ -0,0 +1,29 @@
|
||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
setup() {
|
||||||
|
load 'helpers'
|
||||||
|
source "$LIB_DIR/output.sh"
|
||||||
|
source "$LIB_DIR/lint.sh"
|
||||||
|
}
|
||||||
|
|
||||||
|
@test "lint tolerates source_path/source_sha256 in source frontmatter" {
|
||||||
|
G="$(make_fixture_genome)"
|
||||||
|
mkdir -p "$G/wiki/sources"
|
||||||
|
cat > "$G/wiki/sources/test-source.md" <<'EOFMD'
|
||||||
|
---
|
||||||
|
title: "Test Source"
|
||||||
|
type: source
|
||||||
|
domain: genome-test
|
||||||
|
maturity: draft
|
||||||
|
last_updated: 2026-06-25
|
||||||
|
private: false
|
||||||
|
tags: [test]
|
||||||
|
source_path: raw/articles/test.md
|
||||||
|
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||||
|
---
|
||||||
|
# Test Source
|
||||||
|
body
|
||||||
|
EOFMD
|
||||||
|
run lint_markdown_file "$G/wiki/sources/test-source.md" genome-test
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
}
|
||||||
|
|
@@ -171,3 +171,41 @@ EOF
|
||||||
[ "$status" -eq 0 ]
|
[ "$status" -eq 0 ]
|
||||||
[[ "$output" == *"develop"* ]]
|
[[ "$output" == *"develop"* ]]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@test "run-ingest: branch name matches slug.sh --raw for nested raw paths" {
|
||||||
|
command -v jq >/dev/null 2>&1 || skip "jq not installed"
|
||||||
|
G="$(make_fixture_genome)"; cd "$G"
|
||||||
|
mkdir -p wiki/sources
|
||||||
|
cat > wiki/sources/cibo-il-pane.md <<'EOFMD'
|
||||||
|
---
|
||||||
|
title: "Il Pane"
|
||||||
|
type: source
|
||||||
|
domain: genome-test
|
||||||
|
tags: [cibo]
|
||||||
|
maturity: draft
|
||||||
|
last_updated: 2026-06-25
|
||||||
|
private: false
|
||||||
|
source_path: raw/articles/cibo/il-pane.md
|
||||||
|
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||||
|
---
|
||||||
|
# Il Pane
|
||||||
|
body
|
||||||
|
EOFMD
|
||||||
|
cat > .ingest-manifest.json <<'EOFJSON'
|
||||||
|
{
|
||||||
|
"raw_source": "raw/articles/cibo/il-pane.md",
|
||||||
|
"model": "qwen3.5-9b",
|
||||||
|
"reasoning": "Ingest.",
|
||||||
|
"pr_summary": "Ingest summary.",
|
||||||
|
"contradictions": "None",
|
||||||
|
"pages": [
|
||||||
|
{"path": "wiki/sources/cibo-il-pane.md", "summary": "Summary.", "maturity": "draft", "status": "created"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
EOFJSON
|
||||||
|
export KG_LIB_DIR="$LIB_DIR"
|
||||||
|
export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" DRY_RUN=1
|
||||||
|
run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
[[ "$output" == *"cibo-il-pane"* ]]
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue