feat: Enhance ingest to track raw source path and SHA256 hash
This commit is contained in:
parent
e62ad0c831
commit
0ff98e1ebd
4 changed files with 83 additions and 4 deletions
|
|
@ -20,7 +20,7 @@
|
|||
#
|
||||
# Emits a single JSON status line on stdout (for n8n / logs).
|
||||
# =============================================================================
|
||||
import json, os, re, sys, datetime, urllib.request, urllib.error
|
||||
import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error
|
||||
|
||||
# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
|
||||
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
|
||||
|
|
@ -257,7 +257,16 @@ def call_model():
|
|||
|
||||
# --- run the semantic pass ---
|
||||
sem = call_model()
|
||||
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
|
||||
|
||||
# Source of truth: slug from slug.sh --raw (deterministic, path-aware, collision-proof)
|
||||
source_slug = subprocess.check_output(
|
||||
["bash", os.path.join(os.path.dirname(__file__), "slug.sh"), "--raw", raw_rel],
|
||||
text=True
|
||||
).strip()
|
||||
|
||||
with open(raw_rel, "rb") as f:
|
||||
src_sha = hashlib.sha256(f.read()).hexdigest()
|
||||
|
||||
pages = []
|
||||
|
||||
# 1. source page — canonical summary of THIS source (re)written
|
||||
|
|
@ -273,7 +282,10 @@ src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
|
|||
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
|
||||
os.makedirs("wiki/sources", exist_ok=True)
|
||||
with open(src_path, "w", encoding="utf-8") as f:
|
||||
f.write(frontmatter("source", src_title, src_tags))
|
||||
fm = frontmatter("source", src_title, src_tags)
|
||||
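# Inject the tracking fields just before the closing '---': count=1 limits the replacement to the first "\n---\n" match.
# NOTE(review): assumes frontmatter() emits its opening '---' with no preceding newline, so only the closing delimiter can match - confirm.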
|
||||
fm = fm.replace("\n---\n", f"\nsource_path: {raw_rel}\nsource_sha256: {src_sha}\n---\n", 1)
|
||||
f.write(fm)
|
||||
f.write(f"\n# {src_title}\n\n{src_body}\n")
|
||||
pages.append({"path": src_path,
|
||||
"summary": twords(src_title),
|
||||
|
|
|
|||
|
|
@ -53,7 +53,7 @@ contradictions="$(jq -r '.contradictions // "None"' "$manifest")"
|
|||
|
||||
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"
|
||||
|
||||
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
|
||||
slug="$(bash "${SCRIPTS}/slug.sh" --raw "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
|
||||
|
||||
# --- collect touched paths ---
|
||||
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")
|
||||
|
|
|
|||
29
tests/ingest-semantic.bats
Normal file
29
tests/ingest-semantic.bats
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
#!/usr/bin/env bats
|
||||
|
||||
# Per-test setup: load the shared bats helpers, then source the output and
# lint libraries from $LIB_DIR so their functions can be called by the tests.
# NOTE(review): LIB_DIR is presumably defined by 'helpers' - confirm.
setup() {
|
||||
load 'helpers'
|
||||
source "$LIB_DIR/output.sh"
|
||||
source "$LIB_DIR/lint.sh"
|
||||
}
|
||||
|
||||
# Writes a source page whose frontmatter carries the source_path and
# source_sha256 tracking fields, then asserts that lint_markdown_file exits 0
# for it, i.e. the extra fields are not reported as lint failures.
@test "lint tolerates source_path/source_sha256 in source frontmatter" {
|
||||
G="$(make_fixture_genome)"
|
||||
mkdir -p "$G/wiki/sources"
|
||||
# Fixture page. The source_sha256 value is the SHA-256 digest of empty input,
# used here as a placeholder; no raw file is created by this test.
cat > "$G/wiki/sources/test-source.md" <<'EOFMD'
|
||||
---
|
||||
title: "Test Source"
|
||||
type: source
|
||||
domain: genome-test
|
||||
maturity: draft
|
||||
last_updated: 2026-06-25
|
||||
private: false
|
||||
tags: [test]
|
||||
source_path: raw/articles/test.md
|
||||
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||
---
|
||||
# Test Source
|
||||
body
|
||||
EOFMD
|
||||
# Lint the page for the genome-test domain; only the exit status is checked.
run lint_markdown_file "$G/wiki/sources/test-source.md" genome-test
|
||||
[ "$status" -eq 0 ]
|
||||
}
|
||||
|
|
@ -171,3 +171,41 @@ EOF
|
|||
[ "$status" -eq 0 ]
|
||||
[[ "$output" == *"develop"* ]]
|
||||
}
|
||||
|
||||
# Runs run-ingest.sh against a manifest whose raw_source is a nested path
# (raw/articles/cibo/il-pane.md) and asserts that the script exits 0 and that
# its output contains the path-aware slug "cibo-il-pane".
@test "run-ingest: branch name matches slug.sh --raw for nested raw paths" {
|
||||
# Skip rather than fail when jq is unavailable.
# NOTE(review): presumably run-ingest.sh parses the manifest with jq - confirm.
command -v jq >/dev/null 2>&1 || skip "jq not installed"
|
||||
G="$(make_fixture_genome)"; cd "$G"
|
||||
mkdir -p wiki/sources
|
||||
# Source page referenced by the manifest below; its source_path matches the
# manifest's raw_source. The digest is SHA-256 of empty input (placeholder).
cat > wiki/sources/cibo-il-pane.md <<'EOFMD'
|
||||
---
|
||||
title: "Il Pane"
|
||||
type: source
|
||||
domain: genome-test
|
||||
tags: [cibo]
|
||||
maturity: draft
|
||||
last_updated: 2026-06-25
|
||||
private: false
|
||||
source_path: raw/articles/cibo/il-pane.md
|
||||
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
||||
---
|
||||
# Il Pane
|
||||
body
|
||||
EOFMD
|
||||
# Ingest manifest listing the page above as the single "created" page.
cat > .ingest-manifest.json <<'EOFJSON'
|
||||
{
|
||||
"raw_source": "raw/articles/cibo/il-pane.md",
|
||||
"model": "qwen3.5-9b",
|
||||
"reasoning": "Ingest.",
|
||||
"pr_summary": "Ingest summary.",
|
||||
"contradictions": "None",
|
||||
"pages": [
|
||||
{"path": "wiki/sources/cibo-il-pane.md", "summary": "Summary.", "maturity": "draft", "status": "created"}
|
||||
]
|
||||
}
|
||||
EOFJSON
|
||||
export KG_LIB_DIR="$LIB_DIR"
|
||||
# Dummy Forgejo settings plus DRY_RUN=1.
# NOTE(review): presumably DRY_RUN keeps the script from contacting a real server - confirm.
export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" DRY_RUN=1
|
||||
run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
|
||||
[ "$status" -eq 0 ]
|
||||
# Substring match only: this shows the slug appears somewhere in the output,
# not that it is the exact branch name.
[[ "$output" == *"cibo-il-pane"* ]]
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue