Merge branch 'release/1.3.1' into main

This commit is contained in:
Matteo Cherubini 2026-06-19 05:53:48 +02:00
commit 4ae1a3de5f
8 changed files with 342 additions and 71 deletions

View file

@ -1,5 +1,5 @@
# =============================================================================
# Knowledge Genome - Makefile v. 1.3.0
# Knowledge Genome - Makefile v. 1.3.1
# Orchestrates the setup and management of the knowledge base.
# =============================================================================

View file

@ -77,6 +77,11 @@ master-knowledge-genome/ ← Root orchestrator (submodule registry)
└── AGENTS.md ← Global coordination schema (cross-genome rules)
```
> The genome names above (`genome-dev`, `genome-finance`, `genome-homelab`) are
> **illustrative** — they show the kind of multi-domain layout this orchestrator targets.
> The shipped `registry.sh` defines a single disposable sandbox, **`genome-test`**; you
> create real genomes yourself with `make add-genome` (see the registry examples below).
Each genome is an independent git repository:
```text
@ -175,6 +180,11 @@ knowledge-genome-orchestrator/ ← This repository (setup tooling)
> The `skills/ingest/` directory is version-controlled here but **deployed** to the AI
> node (vm101) under `~/.pi/agent/skills/ingest`. The agent (`pi`) does only semantic work
> and writes a manifest; `run-ingest.sh` does the mechanical steps. See [Workflows → Ingest](#ingest).
>
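> `ingest-semantic.py` (phase 1) makes one schema-constrained call to the local model, which returns a single JSON object; the script then conforms that JSON deterministically into wiki pages and writes the manifest.
> `run-ingest.sh` (phase 2) does the mechanical steps: index, log, scoped lint and PR.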
>
> Deploy the skill by running `cp skills/ingest/* ~/.pi/agent/skills/ingest/` after `make setup`. Updates arrive via `git pull` on the laptop and are pushed to vm101 over SSH in the n8n flow.
---
@ -807,7 +817,10 @@ model must not waste context on:
8. Appends the `INGEST | <slug>` entry to `wiki/log.md` (the model name comes from the
orchestrator via `INGEST_MODEL` — the agent cannot reliably know its own tag)
9. Runs scoped lint on exactly the pages touched this run (`scoped-lint.sh`, reusing
`lib/lint.sh`)
`lib/lint.sh`), including a **duplicate-slug advisory**: a slug created this run that is
highly similar to an entity/concept already in `wiki/index.md` is flagged in the PR so a
human can merge them. It is advisory only — it never fails the lint or blocks the PR
(threshold tunable via `KG_DUP_THRESHOLD`, default 70)
10. Commits **only `wiki/`** on `feat/ai-ingest-<slug>` and opens a PR against the integration
base (`INGEST_BASE`, default `main`); the body matches the `templates/pr-description.md`
structure (Summary / Pages / Contradictions / Scoped Lint)
@ -960,6 +973,10 @@ The agent proposes re-validation but does not change `maturity` without new sour
### Cross-genome references
> **Status: planned.** The cross-genome collector and **navigation skill** described in this
> section are specified but **not yet implemented** in this release — only the `ingest` skill
> ships today. What follows documents the intended design and the boundary contract it will honour.
Cross-domain knowledge moves by **pull, never push**: the genome you are working in draws
material _in_; nothing is ever written into another genome. There are **no cross-genome
wikilinks** — submodule pointers make relative paths brittle.
@ -1062,7 +1079,7 @@ grep "^## \[" wiki/log.md | grep "CONFLICT" # All conflicts
grep "^## \[2026-05" wiki/log.md # Entries from a specific month
```
The orchestrator always injects only `tail -n 20 wiki/log.md` into agent context.
`ingest-semantic.py` receives the source text plus the existing entity/concept names (read from `wiki/index.md`) as prompt context.
The LLM never loads the full log.
---
@ -1122,6 +1139,8 @@ Note: `.obsidian/` is in `.gitignore`. Workspace and plugin settings are local
### n8n automation
Call chain: n8n → SSH → `ingest-semantic.py <genome> <raw>` → `run-ingest.sh <genome>`.
n8n (running on the storage node) can automate the ingest pipeline:
1. Forgejo webhook fires on push to a genome's `raw/` directory

View file

@ -208,3 +208,111 @@ check_broken_links() {
fi
done <<< "$links"
}
# ---------------------------------------------------------------------------
# levenshtein <s1> <s2>
# Classic edit distance via a two-row rolling buffer, so every array subscript
# is a single integer. The previous implementation used comma subscripts
# (d[i,j]); in bash arithmetic the comma operator collapses to one dimension,
# so the table aliased onto itself and returned wrong distances — it could not
# even score two identical strings as 0. This form is portable to bash 3.2
# (no associative arrays). Echoes the integer distance.
# ---------------------------------------------------------------------------
levenshtein() {
  local a="$1" b="$2"
  local n=${#a} m=${#b}

  # An empty operand costs exactly the other operand's length.
  if (( n == 0 )); then echo "$m"; return; fi
  if (( m == 0 )); then echo "$n"; return; fi

  # Only two rows of the DP table are ever alive: the finished row above
  # ("above") and the row being filled ("row"). All subscripts are plain
  # integers, so this runs on bash 3.2 as well.
  local -a above=() row=()
  local r c ch best cand

  # Row 0: building b[0..c) from the empty prefix of a takes c insertions.
  for (( c = 0; c <= m; c++ )); do above[c]=$c; done

  for (( r = 1; r <= n; r++ )); do
    ch="${a:r-1:1}"
    row[0]=$r
    for (( c = 1; c <= m; c++ )); do
      # Substitution: free when the two characters already match.
      if [[ "$ch" == "${b:c-1:1}" ]]; then
        best=${above[c-1]}
      else
        best=$(( above[c-1] + 1 ))
      fi
      # Deletion.
      cand=$(( above[c] + 1 ))
      (( cand < best )) && best=$cand
      # Insertion.
      cand=$(( row[c-1] + 1 ))
      (( cand < best )) && best=$cand
      row[c]=$best
    done
    above=( "${row[@]}" )
  done

  echo "${above[m]}"
}
# ---------------------------------------------------------------------------
# similarity <s1> <s2>
# Percentage similarity from the edit distance: 100 = identical, 0 = entirely
# different. Two empty strings are treated as identical (100), so the divide
# is always guarded.
# ---------------------------------------------------------------------------
similarity() {
  local a="$1" b="$2"

  # Normalise by the longer operand so the score stays within 0..100.
  local longest=${#b}
  if (( ${#a} >= longest )); then
    longest=${#a}
  fi

  # Both strings empty: identical by definition, and this avoids dividing by 0.
  if (( longest == 0 )); then
    echo "100"
    return
  fi

  local distance
  distance=$(levenshtein "$a" "$b")
  # Integer arithmetic: the distance share is floored before subtracting.
  echo $(( 100 - (distance * 100 / longest) ))
}
# ---------------------------------------------------------------------------
# check_duplicates <manifest>
# Advisory only: warns when a page created this run has a slug suspiciously
# close to an entity/concept already listed in wiki/index.md, so a human can
# merge them in the PR rather than grow two near-identical pages. Never fails
# the lint (always returns 0), exactly like check_broken_links.
#
# The threshold is tunable via KG_DUP_THRESHOLD (default 70). Exact self-matches
# are skipped: step 1 of run-ingest.sh appends this run's new slugs to the index
# BEFORE the lint runs, so without the skip every new slug would match itself at
# 100%. A page that genuinely collides with a pre-existing file is reported by
# the manifest as 'modified', not 'created', so skipping created==existing pairs
# can never mask a real collision.
# ---------------------------------------------------------------------------
check_duplicates() {
  # Arguments: $1 - path to the ingest manifest (JSON).
  # Reads:     wiki/index.md (relative to the current directory),
  #            KG_DUP_THRESHOLD (optional, integer percentage, default 70).
  # Outputs:   one warn line per (created slug, indexed slug) pair whose
  #            similarity is strictly above the threshold.
  # Returns:   always 0 — the check is advisory and must never fail the lint.
  local manifest="$1"
  [[ -f "$manifest" ]] || return 0
  command -v jq >/dev/null 2>&1 || return 0

  # New leaf slugs from pages created this run. Entries without a string
  # .path are skipped inside the filter: otherwise split() raises on them and
  # jq stops, silently dropping every created page listed after the bad one.
  local -a new_slugs=()
  local slug
  while IFS= read -r slug; do
    [[ -n "$slug" ]] && new_slugs+=("$slug")
  done < <(jq -r '.pages[]?
      | select(.status == "created" and (.path | type) == "string")
      | .path | split("/")[-1] | sub("\\.md$"; "")' "$manifest" 2>/dev/null)

  # Existing entity/concept slugs already catalogued in the index. The
  # "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
  local -a existing_slugs=()
  local link_re='\[\[(entities|concepts)/([a-z0-9-]+)\]\]'
  if [[ -f "wiki/index.md" ]]; then
    local line
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ $line =~ $link_re ]]; then
        existing_slugs+=("${BASH_REMATCH[2]}")
      fi
    done < "wiki/index.md"
  fi

  (( ${#new_slugs[@]} && ${#existing_slugs[@]} )) || return 0

  # The threshold comes from the environment and ends up inside (( )), where
  # a bare word is evaluated as a variable name (unset -> 0, so everything
  # would be flagged) and a value like 08 is an invalid octal constant.
  # Accept plain digits only and force base 10; otherwise use the default.
  local threshold="${KG_DUP_THRESHOLD:-70}"
  if [[ "$threshold" =~ ^[0-9]+$ ]]; then
    threshold=$(( 10#$threshold ))
  else
    warn "KG_DUP_THRESHOLD='${threshold}' is not a non-negative integer — using default 70"
    threshold=70
  fi

  local new exist sim
  for new in "${new_slugs[@]}"; do
    for exist in "${existing_slugs[@]}"; do
      # Skip the exact self-match: this run's slugs are already in the index.
      [[ "$new" == "$exist" ]] && continue
      sim=$(similarity "$new" "$exist")
      if (( sim > threshold )); then
        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
      fi
    done
  done
  return 0
}

View file

@ -1,24 +1,23 @@
#!/usr/bin/env python3
# =============================================================================
# skills/ingest/scripts/ingest-semantic.py
# Phase 1 (semantic) of the Knowledge Genome ingest — the LIGHT version.
# Phase 1 (semantic) of the Knowledge Genome ingest — light agent + deterministic conform.
#
# - FIXED: Add 'title:' field to frontmatter (lint was complaining about missing title)
# - NEW: Inject existing index (entity/concept names) into prompt to prevent duplicates
# - NEW: Richer prompt asking for 2-4 sentences per description (not 1-2), with concrete details
# - Enhanced schema to handle longer descriptions naturally
#
# The model does ONLY semantic extraction and returns ONE schema-constrained JSON
# object (no tools, no file writing, no git, no frontmatter, no slugs). This script
# then CONFORMS that output deterministically into wiki pages with enforced
# frontmatter + kebab-case paths, and writes a .ingest-manifest.json in EXACTLY the
# schema run-ingest.sh expects. run-ingest.sh (phase 2) then does index / log /
# scoped-lint / PR, unchanged.
# schema run-ingest.sh expects.
#
# cd <genome checkout>
# ingest-semantic.py <genome> raw/articles/<file>.md # phase 1 (this)
# run-ingest.sh <genome> # phase 2 (deterministic)
#
# Why this shape: local tool-calling via pi/ollama proved fragile, and a small
# model does not reliably honour folders / naming / frontmatter / manifest schema
# when it writes files itself. Here the model cannot break the contract because it
# never touches the filesystem — the script owns all structure. Stdlib only.
#
# Emits a single JSON status line on stdout (for n8n / logs).
# =============================================================================
import json, os, re, sys, datetime, urllib.request, urllib.error
@ -57,7 +56,84 @@ if not source_text.strip():
die("preflight", "source is empty: " + raw_rel)
# --- the semantic contract (authoritative copy; SKILL.md documents it) ---
# --- read existing index to avoid duplicate slugs ---
# Slugs already catalogued in wiki/index.md, split by page family. They are
# collected from [[entities/<slug>]] and [[concepts/<slug>]] wikilinks.
existing_entities = set()
existing_concepts = set()
if os.path.isfile("wiki/index.md"):
    try:
        with open("wiki/index.md", "r", encoding="utf-8") as f:
            idx_text = f.read()
        # findall with a single group yields the captured slugs directly
        existing_entities.update(re.findall(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text))
        existing_concepts.update(re.findall(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text))
    except Exception:
        # best-effort: an unreadable index simply leaves both sets empty
        pass
def slugify(s):
    """Normalise a free-form name to a kebab-case slug.

    The input is stripped and lower-cased, every run of characters outside
    ``[a-z0-9]`` becomes a single hyphen, and hyphens are trimmed from both
    ends. Falsy input, or input with no ASCII letter or digit, gives
    "untitled".
    """
    text = (s or "").strip().lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", text)
    collapsed = re.sub(r"-+", "-", hyphenated)
    slug = collapsed.strip("-")
    return slug if slug else "untitled"
def twords(s, n=20):
    """Truncate at n words; used for index entry summaries.

    Whitespace runs (including newlines) are collapsed to single spaces.
    Text of at most ``n`` words is returned whole; longer text is cut after
    ``n`` words and marked with a trailing ellipsis. The truncation branch
    previously appended an empty string, which made a cut summary look
    complete. ``None`` is treated as the empty string.
    """
    words = (s or "").split()
    if len(words) <= n:
        return " ".join(words)
    # \u2026 is written as an escape so the marker survives re-encoding
    return " ".join(words[:n]) + "\u2026"
def yaml_dq(s):
    """Render a value as a single-line YAML double-quoted scalar.

    Titles can contain characters that break a bare scalar: most commonly a
    colon followed by a space ('Conflict: X' would parse as a mapping), but
    also '#', a leading '-' or '?', and quotes. Wrapping the value in double
    quotes and escaping backslash and double-quote keeps it valid YAML.
    Whitespace runs, newlines included, collapse to one space so the scalar
    stays on one line. ``None`` renders as an empty quoted string.
    """
    flat = " ".join((s or "").split())
    # one pass: backslash -> two backslashes, double-quote -> backslash + quote
    escaped = flat.translate(str.maketrans({"\\": "\\\\", '"': '\\"'}))
    return '"' + escaped + '"'
def frontmatter(ptype, title, tags):
    """Build the YAML frontmatter block for a new wiki page.

    ``title`` goes through yaml_dq; ``tags`` are de-duplicated, stripped of
    falsy entries, sorted and written as a flow sequence. ``domain`` and
    ``last_updated`` come from the module-level ``genome`` and ``TODAY``.
    The block starts and ends with a '---' line and has a trailing newline.
    """
    unique_tags = sorted({t for t in tags if t})
    lines = [
        "---",
        f"title: {yaml_dq(title)}",
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        "tags: [" + ", ".join(unique_tags) + "]",
        "---",
    ]
    return "\n".join(lines) + "\n"
def write_new(path, ptype, title, body, tags):
    """Create (or overwrite) a wiki page: frontmatter, an H1 title, then body.

    Missing parent directories of ``path`` are created first.
    """
    parent = os.path.dirname(path)
    os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        fh.writelines([
            frontmatter(ptype, title, tags),
            f"\n# {title}\n\n{body}\n",
        ])
def append_section(path, source_slug, body):
    """Append a source-attributed section to an existing page.

    Existing content is never overwritten: the new material is added under a
    '## From [[sources/<slug>]]' heading. Afterwards the first
    'last_updated:' line in the file is bumped to TODAY on a best-effort
    basis; any failure in that second step is deliberately ignored.
    """
    with open(path, "a", encoding="utf-8") as fh:
        fh.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
    try:
        with open(path, "r", encoding="utf-8") as fh:
            current = fh.read()
        stamped = re.sub(r"(?m)^last_updated:.*$",
                         "last_updated: " + TODAY,
                         current,
                         count=1)
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(stamped)
    except Exception:
        # the date bump is optional; the appended section is already on disk
        pass
# --- the semantic contract ---
SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
Read the source and return ONLY structured data describing what it contains.
You do not write files, you do not produce frontmatter, and you do not invent
@ -65,17 +141,29 @@ paths, slugs, branches, commits or PRs — a deterministic script does all of th
Rules:
- source_summary: a faithful, self-contained summary of the source, in the
source's own language. Plain prose, no markdown headings.
- key_points: the handful of concrete facts/claims worth indexing.
source's own language. Plain prose, NO markdown headings. 2-4 sentences,
with concrete details. Preserve the essence and nuance of the source.
- key_points: 3-5 concrete facts or claims worth indexing; no padding.
- entities: every person, tool, organisation or product the source names.
kind is one of person|tool|org|product. description is one or two factual
sentences. No markdown headings inside the description.
kind is one of person|tool|org|product. description is 2-3 factual sentences
with specifics. No markdown headings inside the description.
- concepts: every pattern, theory, decision or named idea the source explains.
description is one or two factual sentences.
description is 2-3 factual sentences with concrete examples or context.
- contradictions: ONLY when the source makes a claim that directly contradicts a
widely-known fact or contradicts itself. Otherwise return an empty list.
- Names must be the natural name of the thing; the script will normalise them.
Do not pad. Be faithful to the source."""
If the source references an entity or concept already in the wiki (see the list below),
use the EXACT name already present; do not invent a variant. This prevents duplicates.
Existing entities in this genome:
{existing_entities}
Existing concepts in this genome:
{existing_concepts}
Be faithful to the source. Be specific. Do not pad or improvise."""
# --- JSON schema -> constrained decoding (Ollama structured outputs) ---
SCHEMA = {
@ -118,10 +206,19 @@ SCHEMA = {
def call_model():
# format existing names as a human-readable list
existing_ents = ", ".join(sorted(existing_entities)) or "(none yet)"
existing_conc = ", ".join(sorted(existing_concepts)) or "(none yet)"
prompt = SYSTEM_PROMPT.format(
existing_entities=existing_ents,
existing_concepts=existing_conc,
)
payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "system", "content": prompt},
{"role": "user", "content":
"Source path: " + raw_rel + "\n\n--- SOURCE START ---\n"
+ source_text + "\n--- SOURCE END ---\n\nReturn the JSON now."},
@ -152,51 +249,6 @@ def call_model():
die("model", "model did not return valid JSON: " + str(e))
# --- conform helpers (the script OWNS all structure) ---
def slugify(s):
    """Kebab-case slug of ``s``; "untitled" when nothing usable remains.

    Joining the runs of ``[a-z0-9]`` with single hyphens gives the same
    result as replacing every other run with '-' and trimming the ends.
    """
    lowered = (s or "").strip().lower()
    parts = re.findall(r"[a-z0-9]+", lowered)
    return "-".join(parts) or "untitled"
def twords(s, n=12):
    """Collapse whitespace and keep at most ``n`` words of ``s``.

    Text of at most ``n`` words is returned whole. Longer text is cut after
    ``n`` words and marked with a trailing ellipsis; the original appended an
    empty string there, so a cut summary looked complete. ``None`` is treated
    as the empty string.
    """
    words = (s or "").split()
    if len(words) <= n:
        return " ".join(words)
    # \u2026 is written as an escape so the marker survives re-encoding
    return " ".join(words[:n]) + "\u2026"
def frontmatter(ptype, tags):
    """Build the YAML frontmatter block (no title field in this version).

    ``tags`` are de-duplicated, stripped of falsy entries, sorted and written
    as a flow sequence; ``domain`` and ``last_updated`` come from the
    module-level ``genome`` and ``TODAY``.
    """
    unique_tags = sorted({t for t in tags if t})
    fields = (
        f"type: {ptype}\n"
        f"domain: {genome}\n"
        "maturity: draft\n"
        f"last_updated: {TODAY}\n"
        "private: false\n"
        "tags: [" + ", ".join(unique_tags) + "]\n"
    )
    return "---\n" + fields + "---\n"
def write_new(path, ptype, title, body, tags):
    """Create (or overwrite) a wiki page: frontmatter, an H1 title, then body.

    Missing parent directories of ``path`` are created first.
    """
    parent = os.path.dirname(path)
    os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        fh.writelines([
            frontmatter(ptype, tags),
            f"\n# {title}\n\n{body}\n",
        ])
def append_section(path, source_slug, body):
    """Append a '## From [[sources/<slug>]]' section to an existing page.

    The page is only ever extended, never replaced. After the append, the
    first 'last_updated:' line is rewritten to TODAY; that bump is
    best-effort and any error it raises is swallowed on purpose.
    """
    with open(path, "a", encoding="utf-8") as out:
        out.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
    try:
        with open(path, "r", encoding="utf-8") as src:
            page = src.read()
        page = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, page, count=1)
        with open(path, "w", encoding="utf-8") as out:
            out.write(page)
    except Exception:
        # optional step: the appended section is already written
        pass
# --- run the semantic pass ---
sem = call_model()
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
@ -210,14 +262,15 @@ src_body = (sem.get("source_summary") or "").strip()
if kp_lines:
src_body += "\n\n## Key points\n\n" + kp_lines
src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
src_title = sem.get('source_title') or source_slug
src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
os.makedirs("wiki/sources", exist_ok=True)
with open(src_path, "w", encoding="utf-8") as f:
f.write(frontmatter("source", src_tags))
f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n")
f.write(frontmatter("source", src_title, src_tags))
f.write(f"\n# {src_title}\n\n{src_body}\n")
pages.append({"path": src_path,
"summary": twords(sem.get("source_title") or source_slug),
"summary": twords(src_title),
"maturity": "draft", "status": src_status})

View file

@ -113,7 +113,14 @@ bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model"
|| fail "log" "log-append failed"
# --- 3. scoped linter (capture findings for the PR; never aborts the run) ---
lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$?
# Point scoped-lint at the same manifest we were handed so its duplicate
# advisory reads the right file even when a non-default path arrives as $2.
# (The dedup check lives inside lib/lint.sh and is invoked by scoped-lint —
# there is no separate check-duplicates.sh script.)
export INGEST_MANIFEST="$manifest"
lint_out="$(
bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1
)" && lint_rc=0 || lint_rc=$?
# --- 4. assemble the PR body (manifest tables + lint results) ---
body="$(mktemp)"

View file

@ -49,6 +49,13 @@ for f in "$@"; do
check_broken_links "$f" || true # warnings only
done
# Cross-page duplicate advisory: runs ONCE over the whole manifest (not per
# file) — it compares this run's created slugs against the index, so repeating
# it for every file would only print the same warnings N times. Warn-only;
# never affects the exit status. INGEST_MANIFEST lets run-ingest.sh point us at
# a non-default manifest path; falls back to the conventional name otherwise.
check_duplicates "${INGEST_MANIFEST:-.ingest-manifest.json}"
echo ""
echo "scoped-lint: ${errors} error(s), ${stale} stale across ${count} file(s)"

View file

@ -9,7 +9,7 @@ they run anywhere (laptop, CI, a git hook). They do **not** belong on vm101 or i
| File | Covers |
|------|--------|
| `scripts.bats` | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) |
| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` reuse |
| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` reuse + duplicate-slug advisory (edit-distance math, self-match skip, once-per-run) |
| `structure.bats` | `lib/structure.sh` report/sync |
| `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` |

View file

@ -69,3 +69,80 @@ EOF
run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test wiki/sources/good.md
[ "$status" -eq 0 ]
}
# --- duplicate-slug advisory (check_duplicates + its distance helpers) --------
# These guard the dedup feature: correct edit-distance math, the warn-only
# contract, the exact-self-match skip (run-ingest appends new slugs to the
# index before lint runs), and that the advisory fires once per run, not once
# per file.
@test "levenshtein: identical strings have distance 0" {
# Baseline case: equal strings need no edits, so the echoed distance must be 0
# and the call itself must succeed.
run levenshtein cat cat
[ "$status" -eq 0 ]
[ "$output" -eq 0 ]
}
@test "levenshtein: kitten→sitting is 3 (textbook case)" {
  # Two substitutions (k→s, e→i) plus one insertion (g) give distance 3.
  run levenshtein kitten sitting
  # Assert the exit status as well, like the identical-strings test does, so
  # a failing call cannot pass on its output alone.
  [ "$status" -eq 0 ]
  [ "$output" -eq 3 ]
}
@test "similarity: identical strings score 100" {
  # Distance 0 over any non-zero length must map to the maximum score.
  run similarity gpu-pricing gpu-pricing
  # Assert the exit status as well, so a failing call cannot pass on its
  # output alone.
  [ "$status" -eq 0 ]
  [ "$output" -eq 100 ]
}
@test "check_duplicates: warns on a near-duplicate of an indexed concept" {
# make_fixture_genome is a helper defined outside this view; presumably it
# creates a scratch genome containing wiki/index.md — confirm in the helpers.
G="$(make_fixture_genome)"; cd "$G"
# The index lists concepts/llm-routing; the manifest creates llm-routings,
# which differs by a single trailing character.
printf -- '- [[concepts/llm-routing]] — x\n' >> wiki/index.md
cat > .ingest-manifest.json <<'JSON'
{"raw_source":"src","pages":[{"path":"wiki/concepts/llm-routings.md","status":"created"}]}
JSON
run check_duplicates .ingest-manifest.json
# Warn-only contract: status stays 0 while the advisory (marked with ≈)
# names the newly created slug.
[ "$status" -eq 0 ]
[[ "$output" == *"≈"* ]]
[[ "$output" == *"llm-routings"* ]]
}
@test "check_duplicates: silent when the new slug is unlike anything indexed" {
G="$(make_fixture_genome)"; cd "$G"
# budget-hardware shares almost nothing with the indexed llm-routing, so its
# similarity is expected to stay at or below the default threshold.
printf -- '- [[concepts/llm-routing]] — x\n' >> wiki/index.md
cat > .ingest-manifest.json <<'JSON'
{"raw_source":"src","pages":[{"path":"wiki/concepts/budget-hardware.md","status":"created"}]}
JSON
run check_duplicates .ingest-manifest.json
# Success with no ≈ marker means no advisory was emitted.
[ "$status" -eq 0 ]
[[ "$output" != *"≈"* ]]
}
@test "check_duplicates: an exact self-match is not flagged (index already has the slug)" {
G="$(make_fixture_genome)"; cd "$G"
# run-ingest step 1 inserts this run's slug into the index BEFORE lint runs;
# the slug must not be reported as a duplicate of itself.
# Here the index entry and the created page carry the very same slug.
printf -- '- [[concepts/llm-routing]] — x\n' >> wiki/index.md
cat > .ingest-manifest.json <<'JSON'
{"raw_source":"src","pages":[{"path":"wiki/concepts/llm-routing.md","status":"created"}]}
JSON
run check_duplicates .ingest-manifest.json
# Expect success and no ≈ advisory in the output.
[ "$status" -eq 0 ]
[[ "$output" != *"≈"* ]]
}
@test "scoped-lint: duplicate advisory fires once across multiple files, not per file" {
G="$(make_fixture_genome)"
# write_page is a helper defined outside this view; presumably it writes a
# valid page of the given type/domain — confirm in the helpers.
write_page "$G/wiki/concepts/data-pipelines.md" concept genome-test
write_page "$G/wiki/concepts/other-topic.md" concept genome-test
# Only data-pipelines is one character away from the indexed data-pipeline.
printf -- '- [[concepts/data-pipeline]] — x\n' >> "$G/wiki/index.md"
cat > "$G/.ingest-manifest.json" <<'JSON'
{"raw_source":"src","pages":[
{"path":"wiki/concepts/data-pipelines.md","status":"created"},
{"path":"wiki/concepts/other-topic.md","status":"created"}
]}
JSON
cd "$G"
# NOTE(review): KG_LIB_DIR presumably tells scoped-lint.sh where lib/lint.sh
# lives — confirm against the script.
export KG_LIB_DIR="$LIB_DIR"
# Both pages are linted in a single invocation.
run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test \
wiki/concepts/data-pipelines.md wiki/concepts/other-topic.md
[ "$status" -eq 0 ]
# Exactly one ≈ line: the advisory ran once for the run, not once per file
# argument (a per-file run would repeat the same warning).
[ "$(grep -c "≈" <<< "$output")" -eq 1 ]
}