Compare commits

...

7 commits

11 changed files with 290 additions and 5 deletions

View file

@ -1,5 +1,5 @@
# =============================================================================
# Knowledge Genome - Makefile v. 1.8.1
# Knowledge Genome - Makefile v. 1.9.0
# Orchestrates the setup and management of the knowledge base.
# =============================================================================

View file

@ -2,6 +2,15 @@
set -eu
cmd="${SSH_ORIGINAL_COMMAND:-}"
case "$cmd" in
"pi pending-raw "*)
# Forced-command arm for "pi pending-raw <genome>": everything after the fixed
# prefix is taken as the genome name.
genome="${cmd#pi pending-raw }"
# Reject an empty name and any character outside [a-z0-9-] before the value is
# logged or handed to the script; the caller receives a JSON error and exit 1.
case "$genome" in ""|*[!a-z0-9-]*) echo '{"status":"error","reason":"invalid genome name"}'; exit 1;; esac
logger -t n8n-pi-wrap "ok: pi pending-raw ${genome}"
# set -a exports every variable the env file assigns. stderr is silenced and the
# result is ignored via `|| true`, so a missing file is meant to be tolerated.
# NOTE(review): under a strict POSIX sh a failing `.` may still abort — confirm the interpreter.
set -a; . "${HOME}/.config/knowledge-genome.env" 2>/dev/null || true; set +a
# Run from the DEPLOYED skill dir (same place as ingest-semantic.py / run-ingest.sh on
# lines 54/59), so pending-raw.sh resolves its sibling slug.sh via BASH_SOURCE.
# exec replaces the wrapper process, so nothing after this line runs.
exec "${HOME}/.pi/agent/skills/ingest/scripts/pending-raw.sh" "$genome"
;;
"pi run")
logger -t n8n-pi-wrap "ok: pi run (prompt via stdin)"
prompt=$(cat)

View file

@ -20,7 +20,7 @@
#
# Emits a single JSON status line on stdout (for n8n / logs).
# =============================================================================
import json, os, re, sys, datetime, urllib.request, urllib.error
import json, os, hashlib, subprocess, re, sys, datetime, urllib.request, urllib.error
# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
@ -257,7 +257,16 @@ def call_model():
# --- run the semantic pass ---
sem = call_model()
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
# Source of truth: slug from slug.sh --raw (deterministic and path-aware; a residual
# collision such as cibo-pane.md vs cibo/pane.md is still possible)
source_slug = subprocess.check_output(
["bash", os.path.join(os.path.dirname(__file__), "slug.sh"), "--raw", raw_rel],
text=True
).strip()
with open(raw_rel, "rb") as f:
src_sha = hashlib.sha256(f.read()).hexdigest()
pages = []
# 1. source page — canonical summary of THIS source (re)written
@ -273,7 +282,10 @@ src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
os.makedirs("wiki/sources", exist_ok=True)
with open(src_path, "w", encoding="utf-8") as f:
f.write(frontmatter("source", src_title, src_tags))
fm = frontmatter("source", src_title, src_tags)
# Inject tracking fields before the closing '---' (first newline-dash-dash-dash-newline)
fm = fm.replace("\n---\n", f"\nsource_path: {raw_rel}\nsource_sha256: {src_sha}\n---\n", 1)
f.write(fm)
f.write(f"\n# {src_title}\n\n{src_body}\n")
pages.append({"path": src_path,
"summary": twords(src_title),

View file

@ -0,0 +1,64 @@
#!/usr/bin/env bash
# =============================================================================
# pending-raw.sh — deterministic "what needs ingesting" calculator.
# Reads the clean base checkout and classifies each raw/articles/*.md as:
#   new      -> no wiki/sources/<slug>.md
#   modified -> page exists but its source_sha256 != current file hash
# Emits the same JSON envelope as changed-raw (drop-in), plus detail[] for ntfy.
#
# Usage:  pending-raw.sh <genome>
# Env:    GENOMES_ROOT  directory holding the genome checkouts (default: ~/genomes)
#         INGEST_BASE   branch to classify against (default: main)
# Output: one JSON document on stdout. Exits 1 with {"status":"error",...} when
#         the genome directory cannot be entered.
# Needs:  git, jq, sha256sum, logger and the sibling slug.sh.
# =============================================================================
set -euo pipefail

genome="${1:?usage: pending-raw.sh <genome>}"
base_dir="${GENOMES_ROOT:-${HOME}/genomes}"
base_branch="${INGEST_BASE:-main}"

# Resolve the sibling slug.sh BEFORE changing directory: when the script is
# started through a relative path, BASH_SOURCE is relative to the caller's cwd
# and can no longer be resolved once we are inside the genome checkout.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SLUG="${SCRIPT_DIR}/slug.sh"

cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }

# Clean start on the configured base (Step 3 will extract this to lib/clean-start.sh).
# `set -e` does not fire for a failing non-final member of an && list, so a
# broken fetch/switch/reset used to pass unnoticed. The steps stay best-effort
# (the script still classifies whatever is checked out), but the failure is
# now recorded in the system log.
if ! { git fetch -q origin \
    && git switch -q "$base_branch" 2>/dev/null \
    && git reset -q --hard "origin/${base_branch}" \
    && git clean -q -fd; }; then
  logger -t pending-raw "warn: clean start on ${base_branch} failed; classifying the checkout as-is"
fi

declare -a NEW=()
declare -a MOD=()
declare -A SEEN_SLUG=()

if [[ -d raw/articles ]]; then
  while IFS= read -r -d '' f; do
    rel="${f#./}"
    case "$rel" in
      */.stfolder/*|*/.stignore|*/.gitkeep) continue ;;
    esac
    # slug.sh reports its own error on stderr; a raw without a usable slug is skipped.
    slug="$("$SLUG" --raw "$rel")" || continue
    # Residual collision (two distinct raws -> same slug): warn, do not silence.
    if [[ -n "${SEEN_SLUG[$slug]:-}" && "${SEEN_SLUG[$slug]}" != "$rel" ]]; then
      logger -t pending-raw "warn: slug collision ${slug}: ${SEEN_SLUG[$slug]} <-> ${rel}"
    fi
    SEEN_SLUG[$slug]="$rel"
    page="wiki/sources/${slug}.md"
    if [[ ! -f "$page" ]]; then
      NEW+=("$rel")
    else
      # Hash from stdin: given a file NAME containing a backslash or newline,
      # sha256sum prefixes the digest with '\', which would never match.
      cur="$(sha256sum < "$rel" | cut -d' ' -f1)"
      # First source_sha256 line of the page; CRs dropped for CRLF-edited pages.
      rec="$(sed -n 's/^source_sha256:[[:space:]]*//p' "$page" | tr -d '\r' | head -n1)"
      if [[ "$cur" != "$rec" ]]; then
        MOD+=("$rel")
      fi
    fi
  # Byte-wise sort so the output order does not depend on directory order.
  done < <(find raw/articles -type f -name '*.md' -print0 2>/dev/null | LC_ALL=C sort -z)
fi

if [[ ${#NEW[@]} -eq 0 && ${#MOD[@]} -eq 0 ]]; then
  # jq does the quoting of the genome name; -c keeps the single-line form.
  jq -c -n --arg g "$genome" '{status: "ok", genome: $g, count: 0, files: [], detail: []}'
else
  {
    # One JSON object per pending file, built with --arg so tabs, quotes or
    # newlines in a path cannot corrupt the envelope. The ${arr[@]+...} form
    # keeps an empty array from tripping `set -u` on bash older than 4.4.
    for x in ${NEW[@]+"${NEW[@]}"}; do jq -n --arg p "$x" '{path: $p, reason: "new"}'; done
    for x in ${MOD[@]+"${MOD[@]}"}; do jq -n --arg p "$x" '{path: $p, reason: "modified"}'; done
  } | jq -s --arg g "$genome" \
    '{status: "ok", genome: $g, count: length, files: [.[].path], detail: .}'
fi

View file

@ -53,7 +53,7 @@ contradictions="$(jq -r '.contradictions // "None"' "$manifest")"
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
slug="$(bash "${SCRIPTS}/slug.sh" --raw "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"
# --- collect touched paths ---
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")

View file

@ -7,6 +7,18 @@
# =============================================================================
set -euo pipefail
# --raw mode: derive a path-aware slug from a raw source path.
#   $2 - path of the form raw/<bucket>/<rel/path>[.ext]
# Prints the slug (lowercase [a-z0-9-], path components joined by '-') on
# stdout and exits 0; prints a diagnostic on stderr and exits 1 when nothing
# usable remains.
if [[ "${1:-}" == "--raw" ]]; then
  raw="${2:?usage: slug.sh --raw <raw/bucket/rel/path>}"
  rel="${raw#raw/}"; rel="${rel#*/}" # strip "raw/" and the bucket name
  # Strip the extension from the final component only. Applying %.* to the whole
  # path would cut at a dot inside a directory name when the file itself has no
  # extension (v1.2/readme -> v1).
  leaf="${rel##*/}"
  rel="${rel%"$leaf"}${leaf%.*}"
  # Sanitise each component separately, drop components that end up empty
  # (otherwise the join yields a leading or doubled '-'), then join with '-'.
  slug="$(printf '%s\n' "$rel" | tr '/' '\n' \
    | sed -E 's/[^a-zA-Z0-9]+/-/g; s/-{2,}/-/g; s/^-+//; s/-+$//; /^$/d' \
    | tr '[:upper:]' '[:lower:]' | paste -sd- -)"
  [[ -n "$slug" ]] || { echo "slug: empty result for input '${raw}'" >&2; exit 1; }
  printf '%s\n' "$slug"
  exit 0
fi
input="${1:?usage: slug.sh <path-or-title>}"
# Strip directory and extension when given a path

View file

@ -0,0 +1,29 @@
#!/usr/bin/env bats
# Per-test setup: load the shared bats helpers, then source the output and lint
# libraries from $LIB_DIR so lint_markdown_file is callable in the tests.
# NOTE(review): LIB_DIR is presumably exported by helpers — confirm.
setup() {
load 'helpers'
source "$LIB_DIR/output.sh"
source "$LIB_DIR/lint.sh"
}
# Regression guard: a source page whose frontmatter carries the tracking fields
# source_path and source_sha256 must still pass lint_markdown_file (status 0).
@test "lint tolerates source_path/source_sha256 in source frontmatter" {
G="$(make_fixture_genome)"
mkdir -p "$G/wiki/sources"
cat > "$G/wiki/sources/test-source.md" <<'EOFMD'
---
title: "Test Source"
type: source
domain: genome-test
maturity: draft
last_updated: 2026-06-25
private: false
tags: [test]
source_path: raw/articles/test.md
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
---
# Test Source
body
EOFMD
run lint_markdown_file "$G/wiki/sources/test-source.md" genome-test
[ "$status" -eq 0 ]
}

90
tests/pending-raw.bats Normal file
View file

@ -0,0 +1,90 @@
#!/usr/bin/env bats
# Per-test setup: point pending-raw.sh at a throwaway genome.
# GENOMES_ROOT is the bats temp dir and the fixture genome is moved into it under
# the name "fixture-genome", so the script finds it at $GENOMES_ROOT/<name>.
setup() {
load 'helpers'
export PENDING="${SKILL_SCRIPTS}/pending-raw.sh"
export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
export INGEST_BASE="main"
g_src="$(make_fixture_genome)"
export g_name="fixture-genome"
mv "$g_src" "${GENOMES_ROOT}/${g_name}"
export g="${GENOMES_ROOT}/${g_name}"
# FIX: make_fixture_genome ships raw/articles/test.md with no source page, which would
# otherwise count as a permanent 'new' and break every count assertion. Clear it so each
# test controls exactly what is pending (verified: count base becomes 0).
( cd "$g" && rm -f raw/articles/test.md && git add -A \
&& git commit -q -m "test: clear default raw" && git push -q )
}
# A raw file with no wiki/sources page is committed and pushed; the script must
# report exactly one entry with that path and reason "new".
@test "pending-raw: detects a brand new raw file" {
echo "new content" > "${g}/raw/articles/new-file.md"
( cd "$g" && git add . && git commit -q -m "add raw" && git push -q )
run bash "$PENDING" "$g_name"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.count == 1'
echo "$output" | jq -e '.detail[0].path == "raw/articles/new-file.md"'
echo "$output" | jq -e '.detail[0].reason == "new"'
}
# The source page records the sha256 of the raw file as it currently is, so
# nothing is pending and count must be 0.
@test "pending-raw: skips up-to-date files" {
echo "ok content" > "${g}/raw/articles/ok-file.md"
hash_ok="$(sha256sum "${g}/raw/articles/ok-file.md" | cut -d' ' -f1)"
# Unquoted heredoc delimiter: $hash_ok is expanded into the page.
cat > "${g}/wiki/sources/ok-file.md" <<FM
---
source_sha256: $hash_ok
---
FM
( cd "$g" && git add . && git commit -q -m "add ok" && git push -q )
run bash "$PENDING" "$g_name"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.count == 0'
}
# The page records the hash of "content v1"; the raw file is then rewritten to
# "content v2" and pushed, so the recorded and current hashes differ and the
# file must be reported once with reason "modified".
@test "pending-raw: flags modified files" {
echo "content v1" > "${g}/raw/articles/mod-file.md"
hash_v1="$(sha256sum "${g}/raw/articles/mod-file.md" | cut -d' ' -f1)"
cat > "${g}/wiki/sources/mod-file.md" <<FM
---
source_sha256: $hash_v1
---
FM
( cd "$g" && git add . && git commit -q -m "v1" && git push -q )
echo "content v2" > "${g}/raw/articles/mod-file.md"
( cd "$g" && git add . && git commit -q -m "v2" && git push -q )
run bash "$PENDING" "$g_name"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.count == 1'
echo "$output" | jq -e '.detail[0].reason == "modified"'
}
# A raw file one directory below raw/articles must be discovered and reported
# with its full relative path.
# NOTE(review): only the reported path is asserted; the prefixed slug named in
# the title is not checked here.
@test "pending-raw: nested subdirectory yields prefixed slug" {
mkdir -p "${g}/raw/articles/sub-b"
echo "subdir content" > "${g}/raw/articles/sub-b/file.md"
( cd "$g" && git add . && git commit -q -m "subdir" && git push -q )
run bash "$PENDING" "$g_name"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.count == 1'
echo "$output" | jq -e '.files[0] == "raw/articles/sub-b/file.md"'
}
# Sync/placeholder files under raw/articles must not be counted as pending.
# NOTE(review): neither fixture file has an .md name, so an .md file inside
# .stfolder is not exercised by this test.
@test "pending-raw: excludes noise (.stfolder, .gitkeep)" {
touch "${g}/raw/articles/.gitkeep"
mkdir -p "${g}/raw/articles/.stfolder"
touch "${g}/raw/articles/.stfolder/sync.log"
( cd "$g" && git add . && git commit -q -m "noise" && git push -q )
run bash "$PENDING" "$g_name"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.count == 0'
}
# cibo-pane.md and cibo/pane.md are two distinct raw files set up as a slug
# collision; neither has a source page, and both must appear in the result.
@test "pending-raw: reports both files on a slug collision" {
mkdir -p "${g}/raw/articles/cibo"
echo "c1" > "${g}/raw/articles/cibo-pane.md"
echo "c2" > "${g}/raw/articles/cibo/pane.md"
( cd "$g" && git add . && git commit -q -m "collision" && git push -q )
run bash "$PENDING" "$g_name"
[ "$status" -eq 0 ]
echo "$output" | jq -e '.count == 2'
}

View file

@ -17,6 +17,7 @@ EXECUTABLES=(
skills/ingest/scripts/open-pr.sh
skills/ingest/scripts/log-append.sh
skills/ingest/scripts/slug.sh
skills/ingest/scripts/pending-raw.sh
skills/ingest/scripts/index-append.py
scripts/add-genome.sh
scripts/setup.sh

View file

@ -171,3 +171,41 @@ EOF
[ "$status" -eq 0 ]
[[ "$output" == *"develop"* ]]
}
# Builds a source page and an ingest manifest for the nested raw path
# raw/articles/cibo/il-pane.md, runs run-ingest.sh with DRY_RUN=1 and dummy
# Forgejo settings, and expects success with "cibo-il-pane" in the output.
@test "run-ingest: branch name matches slug.sh --raw for nested raw paths" {
command -v jq >/dev/null 2>&1 || skip "jq not installed"
G="$(make_fixture_genome)"; cd "$G"
mkdir -p wiki/sources
cat > wiki/sources/cibo-il-pane.md <<'EOFMD'
---
title: "Il Pane"
type: source
domain: genome-test
tags: [cibo]
maturity: draft
last_updated: 2026-06-25
private: false
source_path: raw/articles/cibo/il-pane.md
source_sha256: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
---
# Il Pane
body
EOFMD
cat > .ingest-manifest.json <<'EOFJSON'
{
"raw_source": "raw/articles/cibo/il-pane.md",
"model": "qwen3.5-9b",
"reasoning": "Ingest.",
"pr_summary": "Ingest summary.",
"contradictions": "None",
"pages": [
{"path": "wiki/sources/cibo-il-pane.md", "summary": "Summary.", "maturity": "draft", "status": "created"}
]
}
EOFJSON
export KG_LIB_DIR="$LIB_DIR"
export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" DRY_RUN=1
run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
[ "$status" -eq 0 ]
[[ "$output" == *"cibo-il-pane"* ]]
}

30
tests/slug.bats Normal file
View file

@ -0,0 +1,30 @@
#!/usr/bin/env bats
# Per-test setup: load the shared bats helpers and record the path of the
# slug.sh under test in SLUG.
setup() {
load 'helpers'
SLUG="${SKILL_SCRIPTS}/slug.sh"
}
# A file directly inside the bucket keeps just its base name as the slug.
@test "slug --raw: flat file remains unchanged" {
run bash "$SLUG" --raw "raw/articles/il-pane.md"
[ "$status" -eq 0 ]
[ "$output" = "il-pane" ]
}
# A file in a subdirectory gets the directory name prepended, joined with '-'.
@test "slug --raw: nested file gets folder prefix" {
run bash "$SLUG" --raw "raw/articles/cibo/il-pane.md"
[ "$status" -eq 0 ]
[ "$output" = "cibo-il-pane" ]
}
# The same base name under two different subdirectories must give two
# different slugs.
@test "slug --raw: distinct subdirs avoid collision" {
s1="$(bash "$SLUG" --raw "raw/articles/cibo/pane.md")"
s2="$(bash "$SLUG" --raw "raw/articles/storia/pane.md")"
[ "$s1" != "$s2" ]
}
# slug.sh called directly and slug.sh called through Python's subprocess (the
# way the semantic ingest obtains its slug) must print the same slug.
@test "slug --raw: Bash and Python-calling-bash agree (single implementation)" {
  b="$(bash "$SLUG" --raw "raw/articles/cibo/il-pane.md")"
  # Pass the script path and the raw path through argv instead of splicing
  # them into the Python program text, so a quote or backslash in $SLUG cannot
  # break the string literal.
  p="$(python3 -c 'import subprocess, sys; print(subprocess.check_output(["bash", sys.argv[1], "--raw", sys.argv[2]], text=True).strip())' "$SLUG" "raw/articles/cibo/il-pane.md")"
  [ "$b" = "$p" ]
}