From ea9283637b68ba1bd09946f79a10692680df04c9 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Wed, 3 Jun 2026 12:28:18 +0200 Subject: [PATCH 01/28] feat: centralize genome directory structure definition --- lib/scaffold.sh | 9 +++-- lib/structure.sh | 70 +++++++++++++++++++++++++++++++++++++++ scripts/verify-genomes.sh | 50 ++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 lib/structure.sh create mode 100644 scripts/verify-genomes.sh diff --git a/lib/scaffold.sh b/lib/scaffold.sh index fbefb6c..5add2ed 100644 --- a/lib/scaffold.sh +++ b/lib/scaffold.sh @@ -4,6 +4,9 @@ # Directory structure creation and template rendering engine. # ============================================================================= +# Canonical directory layout lives in one place (lib/structure.sh). +source "$(dirname "${BASH_SOURCE[0]}")/structure.sh" + render_template() { local template_file="$1" local output_file="$2" @@ -32,13 +35,9 @@ render_template() { scaffold_genome() { local base="$1" - local dirs=( - "raw/articles" "raw/transcripts" "raw/code-packs" "raw/assets" "raw/private" - "wiki/sources" "wiki/entities" "wiki/concepts" "wiki/queries" "wiki/private" - ) info "Building directory structure in ${base}..." - for dir in "${dirs[@]}"; do + for dir in "${GENOME_DIRS[@]}"; do mkdir -p "${base}/${dir}" touch "${base}/${dir}/.gitkeep" done diff --git a/lib/structure.sh b/lib/structure.sh new file mode 100644 index 0000000..f94bba1 --- /dev/null +++ b/lib/structure.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# ============================================================================= +# lib/structure.sh +# Single source of truth for the canonical genome directory layout, plus the +# verify/sync helpers used by scripts/verify-genomes.sh. +# +# IMPORTANT: this is the ONE place the structure is defined. 
scaffold.sh sources +# this file and builds new genomes from GENOME_DIRS, so scaffolding and the +# structure check can never drift apart. +# ============================================================================= + +# Canonical directories every genome must have. +# raw/* are input buckets (collaborator-writable); wiki/* is the agent-owned, +# contract-bound layout the lint, the index sections and the ingest skill depend on. +GENOME_DIRS=( + "raw/articles" "raw/transcripts" "raw/code-packs" "raw/assets" "raw/private" + "wiki/sources" "wiki/entities" "wiki/concepts" "wiki/queries" "wiki/private" +) + +# --------------------------------------------------------------------------- +# structure_report +# Reports drift of against GENOME_DIRS. +# - missing canonical dir → counted as drift (returns non-zero) +# - extra dir under raw/ or wiki/ → warning only (does not fail) +# Returns the number of MISSING canonical directories. +# --------------------------------------------------------------------------- +structure_report() { + local base="$1" + local missing=0 + + for d in "${GENOME_DIRS[@]}"; do + if [[ ! -d "${base}/${d}" ]]; then + warn "missing: ${d}" + missing=$((missing + 1)) + fi + done + + # Extra directories (drift the other way) — informational only. + local canon=" ${GENOME_DIRS[*]} " + while IFS= read -r d; do + d="${d#"${base}/"}" + [[ "$canon" == *" ${d} "* ]] && continue + info "extra (not in canon): ${d}" + done < <(find "${base}/raw" "${base}/wiki" -mindepth 1 -type d 2>/dev/null) + + return $missing +} + +# --------------------------------------------------------------------------- +# structure_sync +# Creates any MISSING canonical directories (idempotent). Never deletes — +# retiring a bucket is a deliberate, contract-aware change to GENOME_DIRS + +# the templates, not an automatic prune. 
+# --------------------------------------------------------------------------- +structure_sync() { + local base="$1" + local added=0 + + for d in "${GENOME_DIRS[@]}"; do + if [[ ! -d "${base}/${d}" ]]; then + mkdir -p "${base}/${d}" + touch "${base}/${d}/.gitkeep" + success "created: ${d}" + added=$((added + 1)) + fi + done + + [[ $added -eq 0 ]] && info "already in sync: ${base}" + return 0 +} diff --git a/scripts/verify-genomes.sh b/scripts/verify-genomes.sh new file mode 100644 index 0000000..85a4a62 --- /dev/null +++ b/scripts/verify-genomes.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# ============================================================================= +# scripts/verify-genomes.sh +# Check (default) or --sync the directory structure of every registered genome +# against the canonical layout in lib/structure.sh. +# +# bash scripts/verify-genomes.sh # report drift, non-zero exit on drift +# bash scripts/verify-genomes.sh --sync # create missing dirs everywhere (safe) +# +# No hardware/LLM involved — pure structure check. Run anywhere. +# ============================================================================= +set -euo pipefail +source "lib/output.sh" +source "globals.env" +source "registry.sh" +source "lib/structure.sh" + +MODE="verify" +[[ "${1:-}" == "--sync" ]] && MODE="sync" + +step "Genome structure: ${MODE}" + +TOTAL_MISSING=0 +for entry in "${GENOMES[@]}"; do + IFS='|' read -r GENOME_NAME _ _ <<< "$entry" # 3-field registry; ignore desc + linked + genome_dir="${WORK_DIR}/${MASTER_REPO}/${GENOME_NAME}" + + if [[ ! -d "$genome_dir" ]]; then + warn "not found locally, skipping: ${GENOME_NAME}" + continue + fi + + info "Genome: ${GENOME_NAME}" + if [[ "$MODE" == "sync" ]]; then + structure_sync "$genome_dir" + else + structure_report "$genome_dir" && m=0 || m=$? + TOTAL_MISSING=$((TOTAL_MISSING + m)) + fi +done + +echo "" +if [[ "$MODE" == "sync" ]]; then + success "Structure sync complete." 
+elif [[ $TOTAL_MISSING -eq 0 ]]; then + success "Structure verified: all genomes match the canonical layout." +else + error "Structure drift: ${TOTAL_MISSING} missing directory(ies). Fix with: make sync-structure" + exit 1 +fi From ee4f5beacfdb1c44706faeec55c7730b6ca4cac1 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Wed, 3 Jun 2026 12:28:18 +0200 Subject: [PATCH 02/28] feat: add linked project field to genome registry --- lib/scaffold.sh | 18 +++++++++++------- registry.sh | 12 ++++++++---- scripts/add-genome.sh | 6 ++++-- scripts/setup-genomes.sh | 5 +++-- templates/agents-genome.md | 22 ++++++++++++++++++---- 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/lib/scaffold.sh b/lib/scaffold.sh index 5add2ed..b9dc437 100644 --- a/lib/scaffold.sh +++ b/lib/scaffold.sh @@ -16,17 +16,21 @@ render_template() { local content content=$(<"$template_file") + # Defaults (:-) so master-repo templates render even when GENOME_* are unset + # (scaffold_master runs before any genome; set -u would otherwise abort here). 
local genome_name_upper - genome_name_upper=$(tr '[:lower:]' '[:upper:]' <<< "${GENOME_NAME}") + genome_name_upper=$(tr '[:lower:]' '[:upper:]' <<< "${GENOME_NAME:-}") # Placeholder replacement - content="${content//\{\{GENOME_NAME\}\}/${GENOME_NAME}}" + content="${content//\{\{GENOME_NAME\}\}/${GENOME_NAME:-}}" content="${content//\{\{GENOME_NAME_UPPER\}\}/${genome_name_upper}}" - content="${content//\{\{GENOME_DESC\}\}/${GENOME_DESC}}" - content="${content//\{\{FORGEJO_URL\}\}/${FORGEJO_URL}}" - content="${content//\{\{FORGEJO_USER\}\}/${FORGEJO_USER}}" - content="${content//\{\{VAULTWARDEN_URL\}\}/${VAULTWARDEN_URL}}" - content="${content//\{\{MASTER_REPO\}\}/${MASTER_REPO}}" + content="${content//\{\{GENOME_DESC\}\}/${GENOME_DESC:-}}" + content="${content//\{\{FORGEJO_URL\}\}/${FORGEJO_URL:-}}" + content="${content//\{\{FORGEJO_USER\}\}/${FORGEJO_USER:-}}" + content="${content//\{\{VAULTWARDEN_URL\}\}/${VAULTWARDEN_URL:-}}" + content="${content//\{\{MASTER_REPO\}\}/${MASTER_REPO:-}}" + # linked project reference (optional) — empty registry field renders as 'none' + content="${content//\{\{LINKED_PROJECT\}\}/${GENOME_LINKED:-none}}" content="${content//\{\{DATE\}\}/$(date +%Y-%m-%d)}" mkdir -p "$(dirname "$output_file")" diff --git a/registry.sh b/registry.sh index 88513f5..1d5c994 100644 --- a/registry.sh +++ b/registry.sh @@ -19,9 +19,13 @@ LIB_DIR="${PROJECT_ROOT}/lib" PROVIDERS_DIR="${PROJECT_ROOT}/providers" # --- GENOME REGISTRY --- -# Format: "name|description" +# Format: "name|description|linked_repo" +# - linked_repo is OPTIONAL. Leave empty (trailing pipe) for knowledge-only genomes. +# - It is an opaque reference rendered verbatim into the genome's AGENTS.md +# (phase-2 project work is parked, so the framework does not act on it yet). 
+# - Example with a project: "genome-homelab|Keru infrastructure...|keru/homelab-infra" GENOMES=( - "genome-dev|Web development, TUI, Angular, software architecture" - "genome-finance|Personal finance, investments, market analysis" - "genome-homelab|Keru infrastructure, network configs, architecture logs" + "genome-dev|Web development, TUI, Angular, software architecture|" + "genome-finance|Personal finance, investments, market analysis|" + "genome-homelab|Keru infrastructure, network configs, architecture logs|" ) diff --git a/scripts/add-genome.sh b/scripts/add-genome.sh index 37dab87..53cab5d 100644 --- a/scripts/add-genome.sh +++ b/scripts/add-genome.sh @@ -11,16 +11,18 @@ source "registry.sh" GENOME_NAME="${1:-}" GENOME_DESC="${2:-}" +GENOME_LINKED="${3:-}" # optional: linked project repo reference if [[ -z "$GENOME_NAME" || -z "$GENOME_DESC" ]]; then error "Missing arguments." - echo "Usage: $0 " + echo "Usage: $0 [linked-repo]" exit 1 fi step "Adding New Genome: ${GENOME_NAME}" -GENOMES=("${GENOME_NAME}|${GENOME_DESC}") +# Build a 3-field registry entry (linked_repo may be empty) +GENOMES=("${GENOME_NAME}|${GENOME_DESC}|${GENOME_LINKED}") source "scripts/setup-genomes.sh" diff --git a/scripts/setup-genomes.sh b/scripts/setup-genomes.sh index c5c3999..c6c7975 100644 --- a/scripts/setup-genomes.sh +++ b/scripts/setup-genomes.sh @@ -19,8 +19,9 @@ source "providers/${PROVIDER}.sh" step "Processing Genome Registry" for entry in "${GENOMES[@]}"; do - IFS='|' read -r GENOME_NAME GENOME_DESC <<< "$entry" - export GENOME_NAME GENOME_DESC + # 3-field format: name|description|linked_repo (linked_repo optional → may be empty) + IFS='|' read -r GENOME_NAME GENOME_DESC GENOME_LINKED <<< "$entry" + export GENOME_NAME GENOME_DESC GENOME_LINKED info "Processing: ${GENOME_NAME}..." 
diff --git a/templates/agents-genome.md b/templates/agents-genome.md index a5f7f86..4ecbda4 100644 --- a/templates/agents-genome.md +++ b/templates/agents-genome.md @@ -14,14 +14,28 @@ --- +## Linked Project + +| Field | Value | +| --------------- | --------------------- | +| Project repo | `{{LINKED_PROJECT}}` | +| Branch | `main` | +| Allowed tasks | `readme, tests, code` | +| Preferred model | `auto` | + +If `Project repo` is `none`, this genome is knowledge-only — phase-2 project work +does not apply. When set, after a wiki PR is **merged**, the orchestrator may trigger +work on this repo within _Allowed tasks_. The agent never touches the project repo +during ingest. + ## PRIVATE_CONTEXT **Default: `disabled`** — never infer; require explicit operator declaration per session. -| State | Behavior | -|-------|----------| -| `disabled` | `raw/private/` and `wiki/private/` do not exist. No read, list, grep, or summary on private paths. All outputs safe for collaborators. | -| `enabled` | Operator has confirmed `git-crypt unlock` ran on host. Read/write `private/` authorized. All outputs from private data go exclusively to `wiki/private/`. Prefix every response drawing on private data: `[PRIVATE DATA INCLUDED]`. Never leak private synthesis into public wiki paths. | +| State | Behavior | +| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `disabled` | `raw/private/` and `wiki/private/` do not exist. No read, list, grep, or summary on private paths. All outputs safe for collaborators. | +| `enabled` | Operator has confirmed `git-crypt unlock` ran on host. Read/write `private/` authorized. All outputs from private data go exclusively to `wiki/private/`. 
Prefix every response drawing on private data: `[PRIVATE DATA INCLUDED]`. Never leak private synthesis into public wiki paths. | Pre-commit `PLAINTEXT LEAK DETECTED`: stop immediately. Do not use `--no-verify`. Ask operator to verify `.gitattributes` and encryption state. From 3005366cfd0e968099de1b4f1f1470916cca3ea9 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Wed, 3 Jun 2026 12:28:18 +0200 Subject: [PATCH 03/28] feat: implement ingest skill workflow with post-processing --- skills/ingest/SKILL.md | 83 ++++++++++++++++ skills/ingest/references/frontmatter.md | 0 skills/ingest/scripts/index-append.py | 96 +++++++++++++++++++ skills/ingest/scripts/log-append.sh | 50 ++++++++++ skills/ingest/scripts/open-pr.sh | 98 +++++++++++++++++++ skills/ingest/scripts/run-ingest.sh | 120 ++++++++++++++++++++++++ skills/ingest/scripts/scoped-lint.sh | 50 ++++++++++ skills/ingest/scripts/slug.sh | 18 ++++ 8 files changed, 515 insertions(+) create mode 100644 skills/ingest/SKILL.md create mode 100644 skills/ingest/references/frontmatter.md create mode 100644 skills/ingest/scripts/index-append.py create mode 100644 skills/ingest/scripts/log-append.sh create mode 100644 skills/ingest/scripts/open-pr.sh create mode 100644 skills/ingest/scripts/run-ingest.sh create mode 100644 skills/ingest/scripts/scoped-lint.sh create mode 100644 skills/ingest/scripts/slug.sh diff --git a/skills/ingest/SKILL.md b/skills/ingest/SKILL.md new file mode 100644 index 0000000..bd75214 --- /dev/null +++ b/skills/ingest/SKILL.md @@ -0,0 +1,83 @@ +--- +name: ingest +description: Semantic pass of a single raw source into the current genome's wiki — read the source, write sources/entities/concepts, handle contradictions, then emit a manifest and STOP. Use when a new file lands in raw/. Does NOT do git, log, index, lint, or PRs (a post-processor handles those), and does NOT handle private sources or project repos. 
+license: see repository +compatibility: Runs inside one genome checkout (cwd = genome root). Tools needed — read, edit only. NO bash, NO git. The deterministic steps (index, log, scoped lint, PR) run AFTER you exit, via run-ingest.sh. PRIVATE_CONTEXT must be disabled. +allowed-tools: read edit +metadata: + framework: knowledge-genome + phase: "1-ingest-semantic" +--- + +# Ingest — semantic pass + +You run inside ONE genome checkout. `AGENTS.md` (already in your context) is the +authoritative contract. Your job is the **semantic pass only**: read the source, write +the wiki pages, handle contradictions. You do **not** touch git, the log, the index, the +linter, or PRs — a post-processor (`run-ingest.sh`) does all of that _after you stop_, +from the manifest you leave behind. This keeps your context clean and your turns few, +which matters on a small local model. + +**Argument:** the relative path of the single raw source to ingest +(e.g. `raw/articles/foo.md`). Process only this one. + +## Pre-flight — stop the session if any check fails + +1. Refuse if the argument path is under any `private/` directory. +2. Refuse if `PRIVATE_CONTEXT` is not `disabled`. +3. Confirm the file exists under `raw/`. + +## Semantic work (your only job) + +1. Read the source once. +2. Write `wiki/sources/<slug>.md` — faithful summary + key points, with the required + frontmatter (`type: source`, `domain: <genome-name>`, `maturity: draft`, + `last_updated: <YYYY-MM-DD>`, `private: false`, sensible `tags`). +3. For each entity (person, tool, org) → create or update `wiki/entities/<slug>.md`. +4. For each concept (pattern, theory, decision) → create or update + `wiki/concepts/<slug>.md`. +5. On a real contradiction with an existing claim, follow `AGENTS.md` §Conflict: create + `wiki/queries/conflict-<topic>-<YYYY-MM-DD>.md`. Never overwrite the existing page. + +Name files in kebab-case and pick stable names. Read `wiki/index.md` (and the specific +pages it points to) to decide create-vs-update and to spot contradictions.
Do not scan +whole directories. + +## Finish: write the manifest, then STOP + +As your **final action**, write `.ingest-manifest.json` at the genome root +(NOT under `wiki/`) describing exactly what you did. Then stop — do not commit, lint, +append to the log/index, or open anything. + +```json +{ + "raw_source": "raw/articles/foo.md", + "model": "", + "reasoning": "One sentence for the log: what changed and why.", + "pr_summary": "One or two sentences describing this ingest for the PR.", + "contradictions": "None (or: 1 conflict file created — )", + "pages": [ + { + "path": "wiki/sources/foo.md", + "summary": "One-line index summary.", + "maturity": "draft", + "status": "created" + }, + { + "path": "wiki/entities/acme.md", + "summary": "Acme — vendor.", + "maturity": "draft", + "status": "modified" + } + ] +} +``` + +Manifest rules: + +- List every page you created or modified, with `status` `created` or `modified`. +- `summary` is the one-line index description (≈12 words max). For conflict pages the + summary is ignored — the index lists conflicts by slug only. +- Do not invent a `run_id`, branch, commit, or PR — those belong to the post-processor. + +One source per session. After writing the manifest, stop. diff --git a/skills/ingest/references/frontmatter.md b/skills/ingest/references/frontmatter.md new file mode 100644 index 0000000..e69de29 diff --git a/skills/ingest/scripts/index-append.py b/skills/ingest/scripts/index-append.py new file mode 100644 index 0000000..e70009a --- /dev/null +++ b/skills/ingest/scripts/index-append.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# ============================================================================= +# skills/ingest/scripts/index-append.py +# Insert an entry line into the correct section of wiki/index.md and keep that +# section's entries alphabetically ordered. Bumps frontmatter last_updated. +# +# NOTE: agents-genome.md and wiki-index.md claim the pre-commit hook sorts the +# index. 
The actual pre-commit.sh only runs the plaintext-leak check — it does +# NOT sort. This script owns the ordering instead. (If you later move sorting +# into the hook, reduce this to a plain append.) +# +# index-append.py --section Sources \ +# --entry '- [[sources/foo]] — One-line summary. `maturity: draft`' +# ============================================================================= +import argparse +import datetime +import re +import sys + +ENTRY_RE = re.compile(r"^- \[\[") +HEADER_RE = re.compile(r"^## ") + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--section", required=True, + help="Section name, e.g. Sources / Entities / Concepts / Queries / Conflicts") + ap.add_argument("--entry", required=True, help="Full index line to insert") + ap.add_argument("--file", default="wiki/index.md") + args = ap.parse_args() + + try: + with open(args.file, encoding="utf-8") as fh: + lines = fh.read().splitlines() + except FileNotFoundError: + print(f"index-append: not found: {args.file}", file=sys.stderr) + return 1 + + today = datetime.date.today().isoformat() + + # 1. Bump last_updated inside the first frontmatter block + fm_open = False + for i, ln in enumerate(lines): + if ln.strip() == "---": + if not fm_open: + fm_open = True + continue + break # end of frontmatter + if fm_open and ln.startswith("last_updated:"): + lines[i] = f"last_updated: {today}" + + # 2. Locate the target section [start, end) + start = None + for i, ln in enumerate(lines): + if HEADER_RE.match(ln) and ln[3:].startswith(args.section): + start = i + break + if start is None: + print(f"index-append: section '{args.section}' not found in {args.file}", + file=sys.stderr) + return 1 + + end = len(lines) + for i in range(start + 1, len(lines)): + if HEADER_RE.match(lines[i]): + end = i + break + + # 3. 
Split the section body into intro (non-entry) and entries + body = lines[start + 1:end] + intro = [ln for ln in body if not ENTRY_RE.match(ln)] + entries = [ln for ln in body if ENTRY_RE.match(ln)] + + if args.entry in entries: + print(f"index-append: entry already present, skipping") + return 0 + + entries.append(args.entry) + entries.sort(key=str.casefold) + + # Normalise intro: drop trailing blanks, keep header + comment(s) + while intro and intro[-1].strip() == "": + intro.pop() + + new_section = intro + [""] + entries + [""] + lines = lines[:start + 1] + new_section + lines[end:] + + with open(args.file, "w", encoding="utf-8") as fh: + fh.write("\n".join(lines) + "\n") + + print(f"index-append: added to {args.section}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/ingest/scripts/log-append.sh b/skills/ingest/scripts/log-append.sh new file mode 100644 index 0000000..32e6ca0 --- /dev/null +++ b/skills/ingest/scripts/log-append.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# ============================================================================= +# skills/ingest/scripts/log-append.sh +# Append one entry to the append-only ledger wiki/log.md, in the exact format +# defined by AGENTS.md / wiki-log.md. Generates run_id. Never edits prior entries. +# +# log-append.sh --type INGEST --subject "" --model "" \ +# --context "[[raw/x]]" --output "[[sources/x]]" \ +# --reasoning "One sentence." 
+# ============================================================================= +set -euo pipefail + +LOG_FILE="${LOG_FILE:-wiki/log.md}" + +type="" subject="" model="" context="" output="" reasoning="" +while [[ $# -gt 0 ]]; do + case "$1" in + --type) type="$2"; shift 2 ;; + --subject) subject="$2"; shift 2 ;; + --model) model="$2"; shift 2 ;; + --context) context="$2"; shift 2 ;; + --output) output="$2"; shift 2 ;; + --reasoning) reasoning="$2"; shift 2 ;; + *) echo "log-append: unknown arg: $1" >&2; exit 1 ;; + esac +done + +: "${type:?--type required}" +: "${subject:?--subject required}" + +case "$type" in + INGEST|LINT|QUERY|CONFLICT|CONFIG|SECURITY) ;; + *) echo "log-append: invalid TYPE '${type}'" >&2; exit 1 ;; +esac + +[[ -f "$LOG_FILE" ]] || { echo "log-append: not found: $LOG_FILE" >&2; exit 1; } + +run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid)" +today="$(date +%Y-%m-%d)" + +{ + printf '\n## [%s] %s | %s\n\n' "$today" "$type" "$subject" + printf -- '- run_id: `%s`\n' "$run_id" + printf -- '- model: `%s`\n' "${model:-unknown}" + printf -- '- context_read: %s\n' "${context:-*(none)*}" + printf -- '- output_written: %s\n' "${output:-*(none)*}" + printf -- '- reasoning: %s\n' "${reasoning:-No reasoning provided.}" +} >> "$LOG_FILE" + +echo "run_id=${run_id}" diff --git a/skills/ingest/scripts/open-pr.sh b/skills/ingest/scripts/open-pr.sh new file mode 100644 index 0000000..cac5fc0 --- /dev/null +++ b/skills/ingest/scripts/open-pr.sh @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# ============================================================================= +# skills/ingest/scripts/open-pr.sh +# Branch, commit (conventional), push, and open a Forgejo PR for the wiki/ changes. +# Mirrors the API conventions of providers/forgejo.sh (token auth + http_code). +# Runs inside the genome checkout (cwd = genome root). Never touches main. 
+# +# open-pr.sh --slug --title "feat: ingest " --body-file \ +# [--base main] [--label CONFLICT] +# +# Requires env: FORGEJO_URL, FORGEJO_USER, FORGEJO_TOKEN. +# ============================================================================= +set -euo pipefail + +: "${FORGEJO_URL:?missing FORGEJO_URL}" +: "${FORGEJO_USER:?missing FORGEJO_USER}" +: "${FORGEJO_TOKEN:?missing FORGEJO_TOKEN}" + +slug="" title="" body_file="" base="main" label="" +while [[ $# -gt 0 ]]; do + case "$1" in + --slug) slug="$2"; shift 2 ;; + --title) title="$2"; shift 2 ;; + --body-file) body_file="$2"; shift 2 ;; + --base) base="$2"; shift 2 ;; + --label) label="$2"; shift 2 ;; + *) echo "open-pr: unknown arg: $1" >&2; exit 1 ;; + esac +done + +: "${slug:?--slug required}" +: "${title:?--title required}" +: "${body_file:?--body-file required}" +[[ -f "$body_file" ]] || { echo "open-pr: body file not found: $body_file" >&2; exit 1; } + +branch="feat/ai-ingest-${slug}" +repo="$(basename -s .git "$(git config --get remote.origin.url)")" + +# 1. Branch + commit + push (AGENTS.md rule 5: never commit to main) +git switch -c "$branch" 2>/dev/null || git switch "$branch" +git add wiki/ +if git diff --cached --quiet; then + echo "open-pr: nothing staged under wiki/ — aborting" >&2 + exit 1 +fi +git commit -m "$title" +git push -u origin "$branch" + +# 2. 
Open the PR via Forgejo API (jq builds the JSON safely) +body="$(cat "$body_file")" +payload="$(jq -n --arg head "$branch" --arg base "$base" \ + --arg title "$title" --arg body "$body" \ + '{head:$head, base:$base, title:$title, body:$body}')" + +resp="$(curl -s -w '\n%{http_code}' \ + -H "Authorization: token ${FORGEJO_TOKEN}" \ + -H "Content-Type: application/json" \ + -X POST "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls" \ + -d "$payload")" + +code="$(printf '%s' "$resp" | tail -n1)" +json="$(printf '%s' "$resp" | sed '$d')" + +case "$code" in + 201) + url="$(printf '%s' "$json" | jq -r '.html_url')" + number="$(printf '%s' "$json" | jq -r '.number')" + echo "PR opened: ${url}" + ;; + 409) + echo "open-pr: a PR for '${branch}' already exists — push updated the branch." >&2 + exit 0 + ;; + 401) + echo "open-pr: unauthorized — check FORGEJO_TOKEN (n8n-bot)." >&2 + exit 1 + ;; + *) + echo "open-pr: Forgejo API HTTP ${code}: ${json}" >&2 + exit 1 + ;; +esac + +# 3. Optional label (e.g. CONFLICT). Best-effort; non-fatal. +if [[ -n "$label" && -n "${number:-}" ]]; then + label_id="$(curl -s -H "Authorization: token ${FORGEJO_TOKEN}" \ + "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/labels" \ + | jq -r --arg n "$label" '.[] | select(.name==$n) | .id' | head -n1)" + if [[ -n "$label_id" && "$label_id" != "null" ]]; then + curl -s -o /dev/null \ + -H "Authorization: token ${FORGEJO_TOKEN}" -H "Content-Type: application/json" \ + -X POST "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/issues/${number}/labels" \ + -d "{\"labels\":[${label_id}]}" \ + && echo "label '${label}' applied" >&2 + else + echo "open-pr: label '${label}' not found in repo — skipped." 
>&2 + fi +fi diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh new file mode 100644 index 0000000..b9fab70 --- /dev/null +++ b/skills/ingest/scripts/run-ingest.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +# ============================================================================= +# skills/ingest/scripts/run-ingest.sh +# Post-pi orchestrator. Runs OUTSIDE pi's loop, on vm101, in the genome checkout. +# Consumes .ingest-manifest.json (written by the ingest skill) and performs every +# deterministic step — index, log, scoped lint, PR — so pi's context stays clean. +# +# run-ingest.sh [manifest_path] +# +# Emits a single JSON result line on stdout for n8n to parse. +# ============================================================================= +set -euo pipefail + +genome="${1:?usage: run-ingest.sh [manifest]}" +manifest="${2:-.ingest-manifest.json}" +SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +fail() { + jq -n --arg stage "$1" --arg reason "$2" \ + '{status:"error", stage:$stage, reason:$reason}' + exit 1 +} + +command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq missing"}'; exit 1; } +command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)" +[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}" + +# --- read manifest scalars --- +raw_source="$(jq -r '.raw_source' "$manifest")" +model="$(jq -r '.model // "unknown"' "$manifest")" +reasoning="$(jq -r '.reasoning // "Ingest."' "$manifest")" +pr_summary="$(jq -r '.pr_summary // "Ingest."' "$manifest")" +contradictions="$(jq -r '.contradictions // "None"' "$manifest")" + +[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing" + +slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" + +# --- collect touched paths --- +mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest") +mapfile -t modified_paths < <(jq -r '.pages[] | 
select(.status=="modified") | .path' "$manifest") +all_paths=( "${created_paths[@]}" "${modified_paths[@]}" ) +[[ ${#all_paths[@]} -gt 0 ]] || fail "manifest" "no pages reported" + +conflict_label="" + +# --- 1. index entries (created pages only), inserted in order --- +while IFS=$'\t' read -r path summary maturity; do + [[ -z "$path" ]] && continue + link="${path#wiki/}"; link="${link%.md}" # e.g. sources/foo + folder="${link%%/*}" + case "$folder" in + sources) section="Sources" ;; + entities) section="Entities" ;; + concepts) section="Concepts" ;; + queries) + if [[ "$link" == queries/conflict-* ]]; then section="Conflicts"; conflict_label="CONFLICT" + else section="Queries"; fi ;; + *) section="Sources" ;; + esac + + if [[ "$section" == "Conflicts" ]]; then + entry="- [[${link}]]" # conflicts: slug only + else + entry="- [[${link}]] — ${summary} \`maturity: ${maturity}\`" + fi + + python3 "${SCRIPTS}/index-append.py" --section "$section" --entry "$entry" \ + || fail "index" "index-append failed for ${path}" +done < <(jq -r '.pages[] | select(.status=="created") + | [.path, (.summary // ""), (.maturity // "draft")] | @tsv' "$manifest") + +# --- 2. log entry --- +out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")" +"${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \ + --context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" \ + || fail "log" "log-append failed" + +# --- 3. scoped lint (capture findings for the PR; never aborts the run) --- +lint_out="$( "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$? + +# --- 4. 
assemble the PR body (manifest tables + lint results) --- +body="$(mktemp)" +{ + echo "## Summary" + echo "$pr_summary" + echo "" + echo "## Pages" + echo "| Path | Status | Maturity |" + echo "|------|--------|----------|" + jq -r '.pages[] | "| `\(.path)` | \(.status) | \(.maturity // "draft") |"' "$manifest" + echo "" + echo "## Contradictions" + echo "$contradictions" + echo "" + echo "## Scoped Lint (post-ingest)" + echo '```' + echo "$lint_out" + echo '```' +} > "$body" + +# --- 5. open the PR --- +pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" ) +[[ -n "$conflict_label" ]] && pr_args+=( --label "$conflict_label" ) +pr_out="$( "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$? +pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)" + +rm -f "$body" + +# --- final result line for n8n --- +jq -n \ + --arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \ + --arg slug "$slug" \ + --arg pr_url "$pr_url" \ + --argjson lint_clean "$([[ $lint_rc -eq 0 ]] && echo true || echo false)" \ + --argjson conflict "$([[ -n "$conflict_label" ]] && echo true || echo false)" \ + --arg detail "$pr_out" \ + '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}' + +[[ $pr_rc -eq 0 ]] diff --git a/skills/ingest/scripts/scoped-lint.sh b/skills/ingest/scripts/scoped-lint.sh new file mode 100644 index 0000000..a064fd9 --- /dev/null +++ b/skills/ingest/scripts/scoped-lint.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# ============================================================================= +# skills/ingest/scripts/scoped-lint.sh +# Run the framework's validation on ONLY the files touched this session. +# Reuses lib/lint.sh + lib/output.sh — same checks as `make lint`, scoped. +# +# KG_LIB_DIR=/opt/knowledge-genome-setup/lib \ +# scoped-lint.sh wiki/sources/x.md wiki/entities/y.md +# +# Exits non-zero if any hard error is found, so the agent notices. 
+# Findings are printed (stderr from the lint functions + a summary on stdout). +# ============================================================================= +set -euo pipefail + +: "${KG_LIB_DIR:?set KG_LIB_DIR to the framework lib/ dir (e.g. /opt/knowledge-genome-setup/lib)}" + +# shellcheck source=/dev/null +source "${KG_LIB_DIR}/output.sh" +# shellcheck source=/dev/null +source "${KG_LIB_DIR}/lint.sh" + +genome="${1:?usage: scoped-lint.sh }" +shift +[[ $# -gt 0 ]] || { echo "scoped-lint: no files given" >&2; exit 1; } + +errors=0 +stale=0 +count=$# + +for f in "$@"; do + if [[ ! -f "$f" ]]; then + warn "scoped-lint: missing file (skipped): $f" + continue + fi + + lint_markdown_file "$f" "$genome" && fe=0 || fe=$? + check_privacy_consistency "$f" && pce=0 || pce=$? + check_page_size "$f" && pse=0 || pse=$? + errors=$(( errors + fe + pce + pse )) + + check_knowledge_decay "$f" && st=0 || st=$? + stale=$(( stale + st )) + + check_broken_links "$f" || true # warnings only +done + +echo "" +echo "scoped-lint: ${errors} error(s), ${stale} stale across ${count} file(s)" + +[[ $errors -eq 0 ]] diff --git a/skills/ingest/scripts/slug.sh b/skills/ingest/scripts/slug.sh new file mode 100644 index 0000000..a5711ac --- /dev/null +++ b/skills/ingest/scripts/slug.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# ============================================================================= +# skills/ingest/scripts/slug.sh +# Derive a wiki slug from a path, filename, or title string. 
+# slug.sh "raw/articles/My Source.md" -> my-source +# slug.sh "Some Concept Name" -> some-concept-name +# ============================================================================= +set -euo pipefail + +input="${1:?usage: slug.sh }" + +# Strip directory and extension when given a path +base="${input##*/}" +base="${base%.*}" + +printf '%s\n' "$base" \ + | tr '[:upper:]' '[:lower:]' \ + | sed -E 's/[^a-z0-9]+/-/g; s/-{2,}/-/g; s/^-+//; s/-+$//' From 2e06d8f4e8f19b5becddbf1346f8c8ec6361642a Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Wed, 3 Jun 2026 12:28:18 +0200 Subject: [PATCH 04/28] docs: update AGENTS.md template for ingest skill and clarity --- templates/agents-genome.md | 56 ++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/templates/agents-genome.md b/templates/agents-genome.md index 4ecbda4..98c6075 100644 --- a/templates/agents-genome.md +++ b/templates/agents-genome.md @@ -2,11 +2,11 @@ ## Identity -| Field | Value | -|--------|-------| -| Genome | `{{GENOME_NAME}}` | -| Domain | `{{GENOME_DESC}}` | -| Owner | `{{FORGEJO_USER}}` | +| Field | Value | +| ------ | -------------------------------------------------- | +| Genome | `{{GENOME_NAME}}` | +| Domain | `{{GENOME_DESC}}` | +| Owner | `{{FORGEJO_USER}}` | | Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{GENOME_NAME}}` | **Role:** Wiki maintainer for `{{GENOME_NAME}}`. @@ -55,6 +55,7 @@ Session end or return to `disabled`: remind operator to run `git-crypt lock` on 8. Every PR must use `templates/pr-description.md`. Do not omit the tabular summary. ### NEVER + - Load `wiki/log.md` in full — read only the tail injected by the orchestrator. - Rewrite `wiki/index.md` to reorder entries — append only; sorting is automated. - Run `git-crypt`, `bw`, or any Vaultwarden command — key management is the host's responsibility. 
@@ -62,6 +63,7 @@ Session end or return to `disabled`: remind operator to run `git-crypt lock` on - Merge PRs — human approval required. ### ASK FIRST + - Deleting any wiki page. - Changing `maturity` from `stable` to `deprecated`. - Writing to `wiki/private/` when PRIVATE_CONTEXT state is ambiguous. @@ -84,7 +86,8 @@ Execute in this order before any file operation: ## Workflows ### Ingest -*Triggered by new file in `raw/`.* + +_Triggered by new file in `raw/`._ 1. Read source once. 2. Create `wiki/sources/.md` — summary + key points. @@ -96,12 +99,14 @@ Execute in this order before any file operation: 8. Run scoped lint on pages created or modified in this session. Report issues in PR description. Do not auto-fix. 9. Commit on `feat/ai-ingest-`. Open PR using `templates/pr-description.md`. -*Private source* (`PRIVATE_CONTEXT: enabled` required): +_Private source_ (`PRIVATE_CONTEXT: enabled` required): + - All output → `wiki/private/.md` only. - PR title: `[PRIVATE] ingest: `. ### Query -*Triggered by operator question.* + +_Triggered by operator question._ 1. `qmd search ""` → identify candidate pages. 2. Read candidate pages directly. @@ -110,10 +115,11 @@ Execute in this order before any file operation: 5. Append entry to `wiki/index.md` under Queries. 6. Append log entry: `QUERY | `. -*For general orientation without a specific query: read `wiki/index.md` directly.* +_For general orientation without a specific query: read `wiki/index.md` directly._ ### Lint -*Triggered by operator with bash pre-scan output.* + +_Triggered by operator with bash pre-scan output._ Pre-requisite: operator runs `bash scripts/lint-genomes.sh` and provides output to this session. The script handles deterministically: broken links, knowledge decay, page size, frontmatter validation. @@ -133,13 +139,14 @@ Append log entry: `LINT | `. 
## File Conventions ### Frontmatter + Required on every wiki page: ```yaml --- title: "Strict String Title" type: source | entity | concept | query | conflict | private -domain: {{GENOME_NAME}} +domain: {{GENOME_NAME}} tags: [lowercase, hyphen-separated] maturity: draft | stable | deprecated last_updated: YYYY-MM-DD @@ -152,19 +159,25 @@ private: true | false - `deprecated` — superseded. Add `> **DEPRECATED:** ` callout at top of body. ### Links + - Internal: `[[folder/file]]` — Obsidian wikilinks only. Never `[text](url)` for internal refs. - Cross-genome: `[[../genome-target/wiki/folder/file]]`. - External: `[text](https://...)`. ### Index entries + Append at bottom of relevant section in `wiki/index.md`: + ``` - [[folder/slug]] — One-line summary. `maturity: draft` ``` + Never reorder. Alphabetical sort is handled by the pre-commit hook. ### Log entries + Append one entry per operation to `wiki/log.md`: + ```markdown ## [YYYY-MM-DD] TYPE | Subject @@ -174,6 +187,7 @@ Append one entry per operation to `wiki/log.md`: - output_written: `[[path/C]]` - reasoning: One sentence — what changed and why. ``` + Valid TYPEs: `INGEST` `LINT` `QUERY` `CONFLICT` `CONFIG` `SECURITY` Parse: `grep "^## \[" wiki/log.md | tail -5` @@ -191,22 +205,26 @@ When new evidence contradicts an existing wiki claim: --- title: "Conflict: " type: conflict -domain: {{GENOME_NAME}} +domain: {{GENOME_NAME}} maturity: draft last_updated: YYYY-MM-DD private: false --- ``` + ```markdown ## Conflict: **Claim A (existing):** [[path/to/existing-page]] + > Summary of current wiki position. **Claim B (new):** [[path/to/new-source]] + > Summary of contradicting evidence. **Assessment:** + - Confidence A: high | medium | low — - Confidence B: high | medium | low — - Recommendation: `accept_b` | `keep_a` | `requires_human_review` @@ -226,20 +244,22 @@ private: false - `maturity: draft` not updated in **90 days** → flag during lint.
Flagged pages: prepend to body: + ```markdown > **⚠️ STALE:** Last validated {{last_updated}}. Re-validation required. ``` + Propose re-validation task. Do not change `maturity` without new source evidence. --- ## Collaboration -| Role | Access | Permitted | -|------|--------|-----------| -| Owner | Full — key holder | Read/write everywhere | -| Collaborator | No key | Push to `raw/articles`, `raw/transcripts`, `raw/code-packs`, `raw/assets` | -| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` | -| Cloud AI model | Public only | `PRIVATE_CONTEXT` must be `disabled`; never send private files outside local network | +| Role | Access | Permitted | +| -------------- | ----------------- | ------------------------------------------------------------------------------------ | +| Owner | Full — key holder | Read/write everywhere | +| Collaborator | No key | Push to `raw/articles`, `raw/transcripts`, `raw/code-packs`, `raw/assets` | +| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` | +| Cloud AI model | Public only | `PRIVATE_CONTEXT` must be `disabled`; never send private files outside local network | Grant collaborator: add as Forgejo contributor with Write role. Never share the git-crypt key. From 33697e9b82578b48a441d879a404e4e781f05204 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Wed, 3 Jun 2026 12:28:19 +0200 Subject: [PATCH 05/28] docs: improve readability and consistency across other documentation --- README.md | 201 ++++++++++++++++++++---------------- templates/agents-master.md | 16 ++- templates/pr-description.md | 16 ++- templates/wiki-index.md | 23 +++-- templates/wiki-log.md | 9 +- 5 files changed, 154 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index 59e38d9..5c75647 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ evolving synthesis. Knowledge is compiled once and kept current. Contradictions have been flagged. 
The synthesis already reflects everything ingested. This means: + - No vector database. - No embedding pipeline. - No external retrieval infrastructure. @@ -103,11 +104,11 @@ genome-{name}/ ### Three layers -| Layer | Path | Owner | Rule | -|-------|------|-------|------| -| Raw sources | `raw/` | Human | Immutable. LLM reads only. Never modified. | -| Wiki | `wiki/` | LLM | Agent creates, updates, cross-links, maintains. | -| Schema | `AGENTS.md` | Human + LLM | Co-evolved contract defining structure and workflows. | +| Layer | Path | Owner | Rule | +| ----------- | ----------- | ----------- | ----------------------------------------------------- | +| Raw sources | `raw/` | Human | Immutable. LLM reads only. Never modified. | +| Wiki | `wiki/` | LLM | Agent creates, updates, cross-links, maintains. | +| Schema | `AGENTS.md` | Human + LLM | Co-evolved contract defining structure and workflows. | ### Framework structure @@ -154,6 +155,7 @@ All tools (git-crypt, bw, qmd) have native Linux binaries. ### macOS — full support All scripts are compatible with macOS. Requirements: + - bash 3.2+ (macOS default) — fully supported. All `bash 4+` constructs removed. - GNU coreutils not required — BSD variants of `date`, `grep`, `sed` all handled. - `git-crypt`: install via Homebrew — `brew install git-crypt` @@ -166,6 +168,7 @@ If you use Homebrew bash (`brew install bash`), the scripts work identically to **Git Bash and native Windows are not supported.** Reasons: + - `git-crypt` has no native Windows binary. - Process substitution `<(...)` used for runtime key injection is not available in Git Bash or PowerShell. @@ -179,13 +182,13 @@ All setup and runtime operations work identically to native Linux inside WSL2. 
The system is designed for a homelab architecture: -| Component | Recommended | Role | -|-----------|-------------|------| -| Storage node | Any Linux server with NFS | Hosts Forgejo, stores genome repos | -| AI compute node | GPU server (16GB+ VRAM) | Runs local LLM agent sessions | -| VRAM | 16GB minimum | 14B model at Q5_K_M ≈ 10GB weights; ~6GB for KV cache | -| Local LLM | 14B–32B quantised | Active wiki maintenance sessions | -| Large LLM | 70B (async) | Deep reflection, complex synthesis (scheduled, not interactive) | +| Component | Recommended | Role | +| --------------- | ------------------------- | --------------------------------------------------------------- | +| Storage node | Any Linux server with NFS | Hosts Forgejo, stores genome repos | +| AI compute node | GPU server (16GB+ VRAM) | Runs local LLM agent sessions | +| VRAM | 16GB minimum | 14B model at Q5_K_M ≈ 10GB weights; ~6GB for KV cache | +| Local LLM | 14B–32B quantised | Active wiki maintenance sessions | +| Large LLM | 70B (async) | Deep reflection, complex synthesis (scheduled, not interactive) | > **On VRAM constraints:** with a 16GB card and a 14B model, the KV cache budget > is ~6GB — approximately 32k tokens of effective context. 
Every token in `AGENTS.md`, @@ -198,18 +201,18 @@ The system is designed for a homelab architecture: ### Required -| Tool | Purpose | -|------|---------| -| `git` | Version control | -| `git-crypt` | Transparent file encryption | -| `curl` | REST API calls to Forgejo/GitHub | -| `jq` | JSON parsing | +| Tool | Purpose | +| ----------- | -------------------------------- | +| `git` | Version control | +| `git-crypt` | Transparent file encryption | +| `curl` | REST API calls to Forgejo/GitHub | +| `jq` | JSON parsing | ### Optional -| Tool | Purpose | -|------|---------| -| `bw` | Bitwarden CLI — runtime key injection from Vaultwarden (no key on disk) | +| Tool | Purpose | +| ----- | ----------------------------------------------------------------------- | +| `bw` | Bitwarden CLI — runtime key injection from Vaultwarden (no key on disk) | | `qmd` | Local BM25 + vector search for Markdown files with MCP server interface | > **`bw` vs `bws`:** Use `bw` (standard Bitwarden CLI). `bws` is the Bitwarden @@ -347,6 +350,7 @@ make setup - Commits submodule pointer in master repo After setup completes: + - Upload all files in `keys/` to Vaultwarden (see Key Management) - Delete key files from disk: `rm keys/*.key` @@ -354,16 +358,16 @@ After setup completes: ## Makefile Reference -| Target | Description | -|--------|-------------| -| `make setup` | Full system initialisation — master repo + all genomes in `registry.sh` | -| `make add-genome NAME=x DESC="y"` | Scaffold and register a single new genome | -| `make lint` | Run quality checks across all genomes (schema, privacy, decay, page size) | -| `make status` | Show submodule status and first 10 git-crypt encryption states | -| `make lock` | Lock all encrypted repos (master + all genome submodules) | -| `make doctor` | Verify required tools: git, git-crypt, curl, jq; warn if bw missing | -| `make sync` | `git submodule update --init --recursive` + report unpushed commits per genome | -| `make help` | Print all available 
targets | +| Target | Description | +| --------------------------------- | ------------------------------------------------------------------------------ | +| `make setup` | Full system initialisation — master repo + all genomes in `registry.sh` | +| `make add-genome NAME=x DESC="y"` | Scaffold and register a single new genome | +| `make lint` | Run quality checks across all genomes (schema, privacy, decay, page size) | +| `make status` | Show submodule status and first 10 git-crypt encryption states | +| `make lock` | Lock all encrypted repos (master + all genome submodules) | +| `make doctor` | Verify required tools: git, git-crypt, curl, jq; warn if bw missing | +| `make sync` | `git submodule update --init --recursive` + report unpushed commits per genome | +| `make help` | Print all available targets | ### Examples @@ -407,6 +411,7 @@ After adding: upload the new key to Vaultwarden and delete the key file. ### Removing a genome Manual process: + ```bash # In master repo git submodule deinit genome-name @@ -421,16 +426,16 @@ git push When a genome is scaffolded, `render_template` replaces these placeholders in all template files: -| Placeholder | Source | Example | -|-------------|--------|---------| -| `{{GENOME_NAME}}` | registry.sh | `genome-dev` | -| `{{GENOME_NAME_UPPER}}` | derived | `GENOME-DEV` | -| `{{GENOME_DESC}}` | registry.sh | `Web development...` | -| `{{FORGEJO_URL}}` | globals.env | `https://git.yourserver.com` | -| `{{FORGEJO_USER}}` | globals.env | `yourusername` | -| `{{VAULTWARDEN_URL}}` | globals.env | `https://vault.yourserver.com` | -| `{{MASTER_REPO}}` | globals.env | `master-knowledge-genome` | -| `{{DATE}}` | runtime | `2026-05-11` | +| Placeholder | Source | Example | +| ----------------------- | ----------- | ------------------------------ | +| `{{GENOME_NAME}}` | registry.sh | `genome-dev` | +| `{{GENOME_NAME_UPPER}}` | derived | `GENOME-DEV` | +| `{{GENOME_DESC}}` | registry.sh | `Web development...` | +| `{{FORGEJO_URL}}` | 
globals.env | `https://git.yourserver.com` | +| `{{FORGEJO_USER}}` | globals.env | `yourusername` | +| `{{VAULTWARDEN_URL}}` | globals.env | `https://vault.yourserver.com` | +| `{{MASTER_REPO}}` | globals.env | `master-knowledge-genome` | +| `{{DATE}}` | runtime | `2026-05-11` | --- @@ -441,9 +446,9 @@ template files: Each genome uses a unique symmetric AES-256-CTR key managed by git-crypt. Two directories in every genome are always encrypted: -| Directory | Contents | On remote | -|-----------|----------|-----------| -| `raw/private/` | Sensitive source material | Opaque binary blob | +| Directory | Contents | On remote | +| --------------- | --------------------------- | ------------------ | +| `raw/private/` | Sensitive source material | Opaque binary blob | | `wiki/private/` | Private synthesis and notes | Opaque binary blob | All other directories (`raw/articles/`, `wiki/sources/`, etc.) are plaintext. @@ -502,6 +507,7 @@ PRIVATE_CONTEXT: enabled ← Agent may read/write private/. Requires git-cryp ``` Rules: + - Never inferred. Never carried over from a previous session. - `enabled` requires the operator to confirm that `git-crypt unlock` has run on the host. - Per-genome, per-session: enabling for `genome-finance` does NOT enable for `genome-dev`. @@ -530,6 +536,7 @@ The key flows: Vaultwarden → `bw get notes` → `base64 -d` → kernel pipe At no point is the key written to any file on disk. 
Lock a genome when the session ends: + ```bash git-crypt lock ``` @@ -544,11 +551,11 @@ git-crypt lock Each genome key is stored as a base64-encoded Secure Note in Vaultwarden: -| Genome | Vaultwarden Note Name | -|--------|----------------------| -| `genome-dev` | `genome-dev key` | -| `genome-finance` | `genome-finance key` | -| `genome-homelab` | `genome-homelab key` | +| Genome | Vaultwarden Note Name | +| ---------------- | --------------------- | +| `genome-dev` | `genome-dev key` | +| `genome-finance` | `genome-finance key` | +| `genome-homelab` | `genome-homelab key` | After `make setup` or `make add-genome`, key files are exported to `keys/`. Upload procedure: @@ -593,6 +600,7 @@ gcrypt_rotate_key "genome-dev" ``` `gcrypt_rotate_key` performs: + 1. Unlocks repo with existing key 2. Removes old key material 3. Generates new symmetric key via `git-crypt init` @@ -603,13 +611,16 @@ gcrypt_rotate_key "genome-dev" > **Limitation:** git history still contains blobs encrypted with the old key. > Anyone with the old key and git history access can decrypt them. To purge old > encrypted blobs from history: +> > ```bash > git filter-repo --invert-paths --path raw/private --path wiki/private > git push --force origin main > ``` +> > This rewrites all commit hashes — coordinate with any collaborators first. After rotation: + - Upload new key to Vaultwarden (replace existing note) - Delete both `keys/genome-dev.key` and `keys/genome-dev-rotated-*.key` from disk - Revoke access from previous key holders @@ -621,6 +632,7 @@ After rotation: ### Prerequisites for every session Before starting an LLM agent session on a genome: + 1. The host (AI server) runs `git-crypt unlock` for the required genomes 2. The orchestrator prepares context: `tail -n 20 wiki/log.md` 3. Declare `PRIVATE_CONTEXT` state explicitly in the opening prompt @@ -651,6 +663,7 @@ sequentially — not one session with 5 files. ### n8n automation For Forgejo webhook → automated ingest: + 1. 
Forgejo sends webhook on push to `raw/` 2. n8n receives webhook, identifies new files 3. n8n starts one agent session per new file (sequential, not parallel) @@ -677,6 +690,7 @@ Triggered by a new file in `raw/` (manual or via webhook). 9. Commit on `feat/ai-ingest-`; open PR using `templates/pr-description.md` For private sources (`PRIVATE_CONTEXT: enabled` required): + - All output goes to `wiki/private/.md` only - PR title: `[PRIVATE] ingest: ` @@ -697,11 +711,13 @@ For general orientation without a specific query: read `wiki/index.md` directly. The lint workflow is split between deterministic bash checks and semantic LLM judgment. **Step 1 — operator runs bash linter:** + ```bash make lint ``` The bash linter checks automatically: + - YAML frontmatter validity (all mandatory fields present) - Domain consistency (domain field matches genome name) - Type validity (value from allowed list) @@ -713,6 +729,7 @@ The bash linter checks automatically: **Step 2 — operator provides bash output to LLM agent:** The agent applies semantic judgment to findings the bash linter cannot make: + - **Orphan pages** (from bash list): for each orphan, identify 1-3 existing pages that should link to it; propose specific additions - **Implicit concepts** (from bash term frequency list): determine if a candidate @@ -735,22 +752,28 @@ The PR description uses `templates/pr-description.md`: ```markdown ## Summary + One sentence: goal of this session and source processed. 
## Pages Created + | Path | Type | Maturity | ## Pages Modified + | Path | Change | ## Contradictions Found -[ ] None / [ ] n conflict file(s) created + +[ ] None / [ ] n conflict file(s) created ## Private Data Accessed -[ ] No (PRIVATE_CONTEXT: disabled) / [ ] Yes + +[ ] No (PRIVATE_CONTEXT: disabled) / [ ] Yes ## Scoped Lint (post-ingest) -[ ] Frontmatter valid [ ] No broken links [ ] No issues found + +[ ] Frontmatter valid [ ] No broken links [ ] No issues found ``` This makes human review fast and structured: read the table, scan the diff, @@ -776,10 +799,10 @@ The operator resolves the conflict, updates relevant pages, closes the PR. Pages have a `last_updated` field in frontmatter. During lint passes: -| Maturity | Threshold | Action | -|----------|-----------|--------| -| `stable` | 180 days | Flag as stale — add `⚠️ STALE` callout | -| `draft` | 90 days | Flag as stale — add `⚠️ STALE` callout | +| Maturity | Threshold | Action | +| -------- | --------- | -------------------------------------- | +| `stable` | 180 days | Flag as stale — add `⚠️ STALE` callout | +| `draft` | 90 days | Flag as stale — add `⚠️ STALE` callout | The agent proposes re-validation but does not change `maturity` without new source evidence. 
@@ -816,47 +839,47 @@ private: true | false --- ``` -| Field | Rules | -|-------|-------| -| `type` | Must be one of: `source entity concept query conflict private index log` | -| `maturity: draft` | Single source or unvalidated | -| `maturity: stable` | Confirmed by 2+ independent sources | -| `maturity: deprecated` | Superseded — add `> **DEPRECATED:** ` callout at top | -| `private: true` | Required on all pages in `wiki/private/` and `raw/private/` | +| Field | Rules | +| ---------------------- | ------------------------------------------------------------------------ | +| `type` | Must be one of: `source entity concept query conflict private index log` | +| `maturity: draft` | Single source or unvalidated | +| `maturity: stable` | Confirmed by 2+ independent sources | +| `maturity: deprecated` | Superseded — add `> **DEPRECATED:** ` callout at top | +| `private: true` | Required on all pages in `wiki/private/` and `raw/private/` | Do not use semantic versioning for content. Git history tracks every change. `maturity` captures epistemic state; `last_updated` tracks recency. 
### Page types and directories -| Type | Directory | Description | -|------|-----------|-------------| -| `source` | `wiki/sources/` | One page per processed raw source | -| `entity` | `wiki/entities/` | People, tools, organisations, projects | -| `concept` | `wiki/concepts/` | Patterns, theories, architectural decisions | -| `query` | `wiki/queries/` | Preserved answers and analyses | -| `conflict` | `wiki/queries/conflict-*.md` | Unresolved contradictions | -| `private` | `wiki/private/` | Private synthesis (PRIVATE_CONTEXT: enabled) | -| `index` | `wiki/index.md` | Primary navigation catalog (singleton) | -| `log` | `wiki/log.md` | Operations ledger (singleton) | +| Type | Directory | Description | +| ---------- | ---------------------------- | -------------------------------------------- | +| `source` | `wiki/sources/` | One page per processed raw source | +| `entity` | `wiki/entities/` | People, tools, organisations, projects | +| `concept` | `wiki/concepts/` | Patterns, theories, architectural decisions | +| `query` | `wiki/queries/` | Preserved answers and analyses | +| `conflict` | `wiki/queries/conflict-*.md` | Unresolved contradictions | +| `private` | `wiki/private/` | Private synthesis (PRIVATE_CONTEXT: enabled) | +| `index` | `wiki/index.md` | Primary navigation catalog (singleton) | +| `log` | `wiki/log.md` | Operations ledger (singleton) | ### Page size limits -| Limit | Lines | Action | -|-------|-------|--------| -| Soft cap | 400 | Bash linter warns | -| Hard cap | 800 | Bash linter errors — split the page | +| Limit | Lines | Action | +| -------- | ----- | ----------------------------------- | +| Soft cap | 400 | Bash linter warns | +| Hard cap | 800 | Bash linter errors — split the page | These limits ensure pages fit within the LLM context window without attention degradation and keep the wiki atomically navigable. 
### Linking conventions -| Type | Format | -|------|--------| +| Type | Format | +| ---------------------- | ------------------------------------------- | | Internal (same genome) | `[[folder/slug]]` — Obsidian wikilinks only | -| Cross-genome | `[[../genome-target/wiki/folder/slug]]` | -| External | `[text](https://url)` — standard Markdown | +| Cross-genome | `[[../genome-target/wiki/folder/slug]]` | +| External | `[text](https://url)` — standard Markdown | Never use `[text](relative/path)` for internal references. Obsidian wikilinks are bidirectional and appear in the graph view. @@ -878,6 +901,7 @@ Every operation appends one entry to `wiki/log.md`: Valid TYPEs: `INGEST` `LINT` `QUERY` `CONFLICT` `CONFIG` `SECURITY` Parse examples: + ```bash grep "^## \[" wiki/log.md | tail -5 # Last 5 entries grep "^## \[" wiki/log.md | grep "CONFLICT" # All conflicts @@ -891,12 +915,12 @@ The LLM never loads the full log. ## Collaboration Model -| Role | Key access | Permitted operations | -|------|-----------|----------------------| -| Owner | Full — key holder | Read/write everywhere | -| Collaborator | None | Push to `raw/articles/`, `raw/transcripts/`, `raw/code-packs/`, `raw/assets/` | -| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` | -| Cloud AI model | Never | `PRIVATE_CONTEXT` must be `disabled`; private data stays on local network | +| Role | Key access | Permitted operations | +| -------------- | ----------------- | ----------------------------------------------------------------------------- | +| Owner | Full — key holder | Read/write everywhere | +| Collaborator | None | Push to `raw/articles/`, `raw/transcripts/`, `raw/code-packs/`, `raw/assets/` | +| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` | +| Cloud AI model | Never | `PRIVATE_CONTEXT` must be `disabled`; private data stays on local network | Grant collaborator access: add as Forgejo contributor with Write role. 
Never share the git-crypt key — collaborators operate exclusively in public directories. @@ -930,6 +954,7 @@ qmd serve --port 3333 Obsidian is the recommended wiki browser. Open any genome directory as an Obsidian vault. Recommended setup: + - **Graph view** — visualise page connections; spot orphans and hubs instantly - **Obsidian Web Clipper** — browser extension to clip articles directly to `raw/articles/` as Markdown @@ -991,6 +1016,7 @@ sudo apt install git git-crypt curl jq The staged file is in a path matching `**/private/**` but is not encrypted. Fix options: + 1. Verify `.gitattributes` contains `**/private/** filter=git-crypt diff=git-crypt -text` 2. Run `git-crypt init` if git-crypt is not initialised in this repo 3. Run `git-crypt status` to check the encryption state of all files @@ -1011,6 +1037,7 @@ git commit -m "fix: re-stage private files for encryption" ### Agent returns stale or missing cross-references Likely causes: + 1. Session was too long — KV cache degraded. Use one source per session. 2. `wiki/index.md` was not read at session start — agent lacked the page catalog. 3. qmd index is stale — re-index: `qmd index /wiki/` diff --git a/templates/agents-master.md b/templates/agents-master.md index 8b6015d..fbc057f 100644 --- a/templates/agents-master.md +++ b/templates/agents-master.md @@ -2,10 +2,10 @@ ## Identity -| Field | Value | -|--------|-------| -| Repo | `{{MASTER_REPO}}` | -| Owner | `{{FORGEJO_USER}}` | +| Field | Value | +| ------ | -------------------------------------------------- | +| Repo | `{{MASTER_REPO}}` | +| Owner | `{{FORGEJO_USER}}` | | Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}` | **Role:** Cross-genome coordinator for the Knowledge Genome network. @@ -32,14 +32,17 @@ Genome-level operations are governed by the genome's `AGENTS.md`, not this file. ## Global Security Rules ### PRIVATE_CONTEXT scope + - Toggle is **per-genome and per-session**. Enabling for `genome-finance` does NOT enable for `genome-dev`. 
- Cloud LLM models: `PRIVATE_CONTEXT` must be `disabled` for all genomes. Private data never leaves the local network. ### Log sanitization + - Never print decrypted secrets, session tokens, or key contents to stdout or log files. - Document only `run_id` and genome name — never the key value. ### Key management + - Key injection is the host's responsibility — executed before this session starts. - Never write, suggest, or generate scripts that save `.key` files to disk. @@ -54,12 +57,14 @@ Genome-level operations are governed by the genome's `AGENTS.md`, not this file. 5. Per-genome `AGENTS.md` governs all wiki operations within that genome. This file governs boundaries only. ### NEVER + - Load multiple `wiki/index.md` files simultaneously for cross-genome comparison — use qmd. - Run `git-crypt`, `bw`, or Vaultwarden commands — host responsibility. - Modify files in more than one genome in the same operation. - Modify `core-karpathy` in any way. ### ASK FIRST + - Any operation that touches two or more genomes. - Updating submodule pointers in master. - Any key rotation procedure. @@ -77,7 +82,8 @@ Genome-level operations are governed by the genome's `AGENTS.md`, not this file. --- ## Cross-Genome Lint -*Manual, monthly — requires operator initiation. Not automated.* + +_Manual, monthly — requires operator initiation. Not automated._ 1. Use `qmd search ""` to find pages covering the same concept across genomes. 2. 
Identify: diff --git a/templates/pr-description.md b/templates/pr-description.md index 59cb25f..828517c 100644 --- a/templates/pr-description.md +++ b/templates/pr-description.md @@ -1,25 +1,31 @@ ## Summary + ## Pages Created -| Path | Type | Maturity | -|------|------|----------| -| `[[folder/slug]]` | entity / concept / source / query | draft | + +| Path | Type | Maturity | +| ----------------- | --------------------------------- | -------- | +| `[[folder/slug]]` | entity / concept / source / query | draft | ## Pages Modified -| Path | Change | -|------|--------| + +| Path | Change | +| ----------------- | ----------------------------------------- | | `[[folder/slug]]` | Added cross-reference to `[[other/page]]` | ## Contradictions Found + - [ ] None - [ ] `n` conflict file(s) created — listed below ## Private Data Accessed + - [ ] No — `PRIVATE_CONTEXT: disabled` - [ ] Yes — `PRIVATE_CONTEXT: enabled` · outputs in `wiki/private/` only ## Scoped Lint (post-ingest) + - [ ] Frontmatter valid on all touched pages - [ ] No broken wikilinks on touched pages - [ ] No issues found diff --git a/templates/wiki-index.md b/templates/wiki-index.md index d948e11..4bae5d9 100644 --- a/templates/wiki-index.md +++ b/templates/wiki-index.md @@ -1,9 +1,9 @@ --- title: "Index — {{GENOME_NAME}}" type: index -domain: {{GENOME_NAME}} +domain: {{GENOME_NAME}} maturity: stable -last_updated: {{DATE}} +last_updated: {{DATE}} private: false --- @@ -19,27 +19,28 @@ Entry format: `- [[folder/slug]] — One-line summary. \`maturity: \`` --- ## Sources (`wiki/sources/`) -*Ingested raw materials. One entry per processed source.* +_Ingested raw materials.
One entry per processed source._ ## Entities (`wiki/entities/`) -*People, organisations, tools, projects.* +_People, organisations, tools, projects._ ## Concepts (`wiki/concepts/`) -*Theories, methodologies, patterns, architectural decisions.* +_Theories, methodologies, patterns, architectural decisions._ ## Queries (`wiki/queries/`) -*Synthesised answers worth preserving. Archived explorations and analyses.* +_Synthesised answers worth preserving. Archived explorations and analyses._ ## Conflicts Pending Review (`wiki/queries/conflict-*.md`) -*Created automatically when the agent detects contradictions between sources.* -*Do not summarise entries here — list slugs only to avoid surfacing unresolved claims.* -*Remove entry once the operator has resolved and closed the corresponding PR.* +_Created automatically when the agent detects contradictions between sources._ +_Do not summarise entries here — list slugs only to avoid surfacing unresolved claims._ +_Remove entry once the operator has resolved and closed the corresponding PR._ ## Private Synthesis (`wiki/private/`) -*Restricted access. Requires `PRIVATE_CONTEXT: enabled` and unlocked repo.* -*List slug names ONLY. Do not append summaries — prevents metadata leakage.* + +_Restricted access. Requires `PRIVATE_CONTEXT: enabled` and unlocked repo._ +_List slug names ONLY. Do not append summaries — prevents metadata leakage._ diff --git a/templates/wiki-log.md b/templates/wiki-log.md index facf25a..25d799c 100644 --- a/templates/wiki-log.md +++ b/templates/wiki-log.md @@ -1,9 +1,9 @@ --- title: "Operations Log — {{GENOME_NAME}}" type: log -domain: {{GENOME_NAME}} +domain: {{GENOME_NAME}} maturity: stable -last_updated: {{DATE}} +last_updated: {{DATE}} private: false --- @@ -22,11 +22,13 @@ Append new entries at the bottom using the format defined below.
## Entry Format ### Required header (enables shell parsing): + ```text ## [YYYY-MM-DD] TYPE | Subject or title ``` ### Required metadata block for all agent-generated entries: + ```markdown - run_id: `` - model: `` @@ -38,6 +40,7 @@ Append new entries at the bottom using the format defined below. **Valid TYPEs:** `INGEST` | `LINT` | `QUERY` | `CONFLICT` | `CONFIG` | `SECURITY` **Parse examples:** + ```bash # Last 5 entries grep "^## \[" wiki/log.md | tail -5 @@ -55,6 +58,6 @@ grep "^## \[2026-05" wiki/log.md - run_id: `system-init` - model: `setup-knowledge-genome.sh` -- context_read: *(none — initial scaffold)* +- context_read: _(none — initial scaffold)_ - output_written: `[[wiki/index.md]]`, `[[wiki/log.md]]`, `[[AGENTS.md]]` - reasoning: Initial directory structure and encryption layer initialized by setup script. From e531135bf3e1eea9d5a87eac8019047165e2eb30 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:08 +0200 Subject: [PATCH 06/28] feat: Introduce Bats testing framework and helpers --- tests/README.md | 56 ++++++++++++++++++++++++++ tests/helpers.bash | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 tests/README.md create mode 100644 tests/helpers.bash diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..669ff41 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,56 @@ +# Tests + +Deterministic tests for the mechanical layer of the framework — **no LLM, no GPU, no +network**. They simulate pi's output with fixtures and exercise the scripts directly, so +they run anywhere (laptop, CI, a git hook). They do **not** belong on vm101 or in n8n. 
+ +## What's covered + +| File | Covers | +|------|--------| +| `scripts.bats` | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) | +| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` reuse | +| `structure.bats` | `lib/structure.sh` report/sync | +| `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` | + +`run-ingest.bats` auto-`skip`s if `jq` is missing; everything else needs only bash + git +(+ `python3` for the index tests). + +## Install bats + +```bash +# Debian/Ubuntu +sudo apt install bats +# or pinned, as a vendored submodule +git submodule add https://github.com/bats-core/bats-core.git test/bats +``` + +## Run + +```bash +bats tests/ # whole suite +bats tests/lint.bats # one file +bats -f "sorted" tests/scripts.bats # filter by name +``` + +Each test builds its own throwaway genome under `BATS_TEST_TMPDIR` (auto-cleaned) with a +local bare git remote, so `open-pr.sh` (run with `DRY_RUN=1` in the environment) can branch/commit/push without touching +Forgejo. + +## Makefile targets + +```make +test: + @bats tests/ + +verify-structure: + @bash scripts/verify-genomes.sh + +sync-structure: + @bash scripts/verify-genomes.sh --sync +``` + +## Note on `helpers.bash` + +`FIXTURE_DIRS` in `helpers.bash` must match `GENOME_DIRS` in `lib/structure.sh`. If you +change the canonical layout, update both (the structure tests assume a clean baseline). diff --git a/tests/helpers.bash b/tests/helpers.bash new file mode 100644 index 0000000..1b31397 --- /dev/null +++ b/tests/helpers.bash @@ -0,0 +1,98 @@ +#!/usr/bin/env bash +# tests/helpers.bash — shared helpers for the bats suite. + +REPO_ROOT="$(cd "${BATS_TEST_DIRNAME}/.." && pwd)" +LIB_DIR="${REPO_ROOT}/lib" +SKILL_SCRIPTS="${REPO_ROOT}/skills/ingest/scripts" + +# Canonical dirs a fresh genome must contain (kept in sync with lib/structure.sh). 
+FIXTURE_DIRS=( + raw/articles raw/transcripts raw/code-packs raw/assets raw/private + wiki/sources wiki/entities wiki/concepts wiki/queries wiki/private +) + +# make_fixture_genome → echoes the path to a throwaway genome checkout with a +# local bare remote, the full canonical structure, and rendered index/log. +# Uses BATS_TEST_TMPDIR so bats cleans it up automatically. +make_fixture_genome() { + local base; base="$(mktemp -d "${BATS_TEST_TMPDIR:-/tmp}/genome.XXXXXX")" + git init --bare -q "${base}/origin.git" + + local g="${base}/genome" + local d + for d in "${FIXTURE_DIRS[@]}"; do mkdir -p "${g}/${d}"; touch "${g}/${d}/.gitkeep"; done + + cat > "${g}/wiki/index.md" <<'EOF' +--- +title: "Index — genome-test" +type: index +domain: genome-test +maturity: stable +last_updated: 2026-01-01 +private: false +--- + +# Master Index: genome-test + +--- + +## Sources (`wiki/sources/`) +*Ingested raw materials.* + + +## Entities (`wiki/entities/`) +*People, tools.* + + +## Concepts (`wiki/concepts/`) +*Patterns.* + + +## Queries (`wiki/queries/`) +*Answers.* + + +## Conflicts Pending Review (`wiki/queries/conflict-*.md`) +*slugs only.* +EOF + + cat > "${g}/wiki/log.md" <<'EOF' +--- +title: "Operations Log — genome-test" +type: log +domain: genome-test +maturity: stable +last_updated: 2026-01-01 +private: false +--- + +# Operations Log + +--- + +## [2026-01-01] CONFIG | scaffolded +- run_id: `init` +EOF + + echo "raw test" > "${g}/raw/articles/test.md" + + mkdir -p "${base}/nohooks" + + ( + cd "${g}" + git init -q + # Hermetic: ignore the user's global git config (signing, global hooks); + # otherwise commit.gpgsign or a global core.hooksPath makes git commit fail here. + git config commit.gpgsign false + git config core.hooksPath "${base}/nohooks" + git config user.email t@t + git config user.name tester + git add . 
+ git commit -qm init + git branch -M main + git remote add origin "${base}/origin.git" + git push -q -u origin main + ) >/dev/null + + echo "${g}" +} From ff0828f5a74cccef3bd5f1311b9c732d4e54fefa Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:09 +0200 Subject: [PATCH 07/28] refactor(ingest): Enhance run-ingest.sh for testability and robustness --- skills/ingest/scripts/open-pr.sh | 6 ++++++ skills/ingest/scripts/run-ingest.sh | 10 +++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/skills/ingest/scripts/open-pr.sh b/skills/ingest/scripts/open-pr.sh index cac5fc0..9ea4a9b 100644 --- a/skills/ingest/scripts/open-pr.sh +++ b/skills/ingest/scripts/open-pr.sh @@ -46,6 +46,12 @@ fi git commit -m "$title" git push -u origin "$branch" +# DRY_RUN: local git work done; skip the Forgejo API (offline tests). +if [[ -n "${DRY_RUN:-}" ]]; then + echo "PR opened: DRY-RUN ${branch} -> ${base}" + exit 0 +fi + # 2. Open the PR via Forgejo API (jq builds the JSON safely) body="$(cat "$body_file")" payload="$(jq -n --arg head "$branch" --arg base "$base" \ diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index b9fab70..5bb6964 100644 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -16,7 +16,7 @@ manifest="${2:-.ingest-manifest.json}" SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" fail() { - jq -n --arg stage "$1" --arg reason "$2" \ + jq -nc --arg stage "$1" --arg reason "$2" \ '{status:"error", stage:$stage, reason:$reason}' exit 1 } @@ -72,12 +72,12 @@ done < <(jq -r '.pages[] | select(.status=="created") # --- 2. 
log entry --- out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")" -"${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \ +bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \ --context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" \ || fail "log" "log-append failed" # --- 3. scoped lint (capture findings for the PR; never aborts the run) --- -lint_out="$( "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$? +lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$? # --- 4. assemble the PR body (manifest tables + lint results) --- body="$(mktemp)" @@ -102,13 +102,13 @@ body="$(mktemp)" # --- 5. open the PR --- pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" ) [[ -n "$conflict_label" ]] && pr_args+=( --label "$conflict_label" ) -pr_out="$( "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$? +pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$? 
pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)" rm -f "$body" # --- final result line for n8n --- -jq -n \ +jq -nc \ --arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \ --arg slug "$slug" \ --arg pr_url "$pr_url" \ From 9b61e748215bfeee4e4dbc77fc95d654ec087aa6 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:09 +0200 Subject: [PATCH 08/28] test(ingest): Add Bats tests for core skill scripts --- tests/scripts.bats | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/scripts.bats diff --git a/tests/scripts.bats b/tests/scripts.bats new file mode 100644 index 0000000..5b7fe1b --- /dev/null +++ b/tests/scripts.bats @@ -0,0 +1,53 @@ +#!/usr/bin/env bats +# tests/scripts.bats — unit tests for the deterministic skill scripts. +load helpers + +@test "slug: path with extension and spaces" { + run bash "$SKILL_SCRIPTS/slug.sh" "raw/articles/My Test Source.md" + [ "$status" -eq 0 ] + [ "$output" = "my-test-source" ] +} + +@test "slug: punctuation and repeats collapse to single hyphens" { + run bash "$SKILL_SCRIPTS/slug.sh" "Qualche Concetto!! 
Strano" + [ "$output" = "qualche-concetto-strano" ] +} + +@test "log-append: appends a well-formed INGEST entry with a run_id" { + G="$(make_fixture_genome)"; cd "$G" + run bash "$SKILL_SCRIPTS/log-append.sh" --type INGEST --subject foo --model m \ + --context "[[raw/x]]" --output "[[sources/foo]]" --reasoning "why" + [ "$status" -eq 0 ] + grep -q "INGEST | foo" wiki/log.md + grep -q '^- run_id: `' wiki/log.md + grep -q '^- model: `m`' wiki/log.md +} + +@test "log-append: rejects an invalid TYPE" { + G="$(make_fixture_genome)"; cd "$G" + run bash "$SKILL_SCRIPTS/log-append.sh" --type BOGUS --subject foo + [ "$status" -ne 0 ] +} + +@test "index-append: inserts under the right section and keeps it sorted" { + G="$(make_fixture_genome)"; cd "$G" + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/zzz]] — z. `maturity: draft`' + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/aaa]] — a. `maturity: draft`' + a=$(grep -n 'sources/aaa' wiki/index.md | cut -d: -f1) + z=$(grep -n 'sources/zzz' wiki/index.md | cut -d: -f1) + [ -n "$a" ] && [ -n "$z" ] + [ "$a" -lt "$z" ] +} + +@test "index-append: bumps frontmatter last_updated to today" { + G="$(make_fixture_genome)"; cd "$G" + python3 "$SKILL_SCRIPTS/index-append.py" --section Concepts --entry '- [[concepts/x]] — x. `maturity: draft`' + grep -q "^last_updated: $(date +%F)$" wiki/index.md +} + +@test "index-append: is idempotent for the same entry" { + G="$(make_fixture_genome)"; cd "$G" + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/dup]] — d. `maturity: draft`' + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/dup]] — d. 
`maturity: draft`' + [ "$(grep -c 'sources/dup' wiki/index.md)" -eq 1 ] +} From b88468cc0653f53453426d3a1ce6c1c97a344be5 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:09 +0200 Subject: [PATCH 09/28] test(lint,structure): Add Bats tests for linting and structure validation --- tests/lint.bats | 71 ++++++++++++++++++++++++++++++++++++++++++++ tests/structure.bats | 40 +++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 tests/lint.bats create mode 100644 tests/structure.bats diff --git a/tests/lint.bats b/tests/lint.bats new file mode 100644 index 0000000..f0b0306 --- /dev/null +++ b/tests/lint.bats @@ -0,0 +1,71 @@ +#!/usr/bin/env bats +# tests/lint.bats — lib/lint.sh validators and the scoped-lint wrapper. +load helpers + +setup() { + source "$LIB_DIR/output.sh" + source "$LIB_DIR/lint.sh" +} + +write_page() { # write_page + cat > "$1" <800-line page errors" { + G="$(make_fixture_genome)" + { write_page "$G/wiki/sources/big.md" source genome-test; yes "x" | head -n 850 >> "$G/wiki/sources/big.md"; } + run check_page_size "$G/wiki/sources/big.md" + [ "$status" -ne 0 ] +} + +@test "scoped-lint: aggregates findings and exits non-zero on errors" { + G="$(make_fixture_genome)" + write_page "$G/wiki/sources/bad.md" banana wrong-genome + cd "$G" + export KG_LIB_DIR="$LIB_DIR" + run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test wiki/sources/bad.md + [ "$status" -ne 0 ] + [[ "$output" == *"error(s)"* ]] +} + +@test "scoped-lint: a clean page passes (exit 0)" { + G="$(make_fixture_genome)" + write_page "$G/wiki/sources/good.md" source genome-test + cd "$G" + export KG_LIB_DIR="$LIB_DIR" + run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test wiki/sources/good.md + [ "$status" -eq 0 ] +} diff --git a/tests/structure.bats b/tests/structure.bats new file mode 100644 index 0000000..237b4e1 --- /dev/null +++ b/tests/structure.bats @@ -0,0 +1,40 @@ +#!/usr/bin/env bats +# tests/structure.bats — canonical-structure 
verify/sync. +load helpers + +setup() { + source "$LIB_DIR/output.sh" + source "$LIB_DIR/structure.sh" +} + +@test "structure_report: a full fixture has no drift" { + G="$(make_fixture_genome)" + run structure_report "$G" + [ "$status" -eq 0 ] +} + +@test "structure_report: flags a missing canonical dir" { + G="$(make_fixture_genome)" + rm -rf "$G/wiki/private" + run structure_report "$G" + [ "$status" -ne 0 ] + [[ "$output" == *"wiki/private"* ]] +} + +@test "structure_report: notes an extra dir but does not fail on it" { + G="$(make_fixture_genome)" + mkdir -p "$G/wiki/experiments" + run structure_report "$G" + [ "$status" -eq 0 ] + [[ "$output" == *"experiments"* ]] +} + +@test "structure_sync: creates missing dirs and is idempotent" { + G="$(make_fixture_genome)" + rm -rf "$G/wiki/private" "$G/raw/transcripts" + structure_sync "$G" + [ -d "$G/wiki/private" ] && [ -d "$G/raw/transcripts" ] + run structure_report "$G" + [ "$status" -eq 0 ] + structure_sync "$G" # second run: nothing to do +} From e0465b6f25e61eb4d87e572f13a993728a85f6c0 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:09 +0200 Subject: [PATCH 10/28] test(ingest): Add end-to-end Bats test for run-ingest.sh --- tests/run-ingest.bats | 93 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 tests/run-ingest.bats diff --git a/tests/run-ingest.bats b/tests/run-ingest.bats new file mode 100644 index 0000000..20864f9 --- /dev/null +++ b/tests/run-ingest.bats @@ -0,0 +1,93 @@ +#!/usr/bin/env bats +# tests/run-ingest.bats — end-to-end orchestrator test (no LLM, no network). +# Simulates pi's output (a source page + manifest) and runs the mechanical pass. 
+load helpers + +@test "run-ingest: DRY_RUN end-to-end updates index + log and opens a dry PR" { + command -v jq >/dev/null 2>&1 || skip "jq not installed" + + G="$(make_fixture_genome)"; cd "$G" + + # --- simulate the semantic pass that pi would have done --- + cat > wiki/sources/test-source.md <<'EOF' +--- +title: "Test Source" +type: source +domain: genome-test +tags: [t] +maturity: draft +last_updated: 2026-06-03 +private: false +--- +body +EOF + + cat > .ingest-manifest.json <<'EOF' +{ + "raw_source": "raw/articles/test.md", + "model": "qwen3.5-9b", + "reasoning": "Ingested the test source.", + "pr_summary": "Ingest of test: 1 source page.", + "contradictions": "None", + "pages": [ + {"path": "wiki/sources/test-source.md", "summary": "A smoke-test source.", "maturity": "draft", "status": "created"} + ] +} +EOF + + export KG_LIB_DIR="$LIB_DIR" + export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" + export DRY_RUN=1 + + run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test + [ "$status" -eq 0 ] + [[ "$output" == *'"status":"ok"'* ]] + [[ "$output" == *'"lint_clean":true'* ]] + [[ "$output" == *'"conflict":false'* ]] + + # side effects on the working tree + grep -q 'sources/test-source' wiki/index.md + grep -q 'INGEST | test' wiki/log.md + git rev-parse --verify feat/ai-ingest-test +} + +@test "run-ingest: a conflict page is labelled and lands in the Conflicts section" { + command -v jq >/dev/null 2>&1 || skip "jq not installed" + + G="$(make_fixture_genome)"; cd "$G" + + cat > wiki/queries/conflict-pricing-2026-06-03.md <<'EOF' +--- +title: "Conflict: pricing" +type: conflict +domain: genome-test +maturity: draft +last_updated: 2026-06-03 +private: false +--- +conflict body +EOF + + cat > .ingest-manifest.json <<'EOF' +{ + "raw_source": "raw/articles/test.md", + "model": "m", + "reasoning": "Flagged a contradiction.", + "pr_summary": "Conflict on pricing.", + "contradictions": "1 conflict file created — pricing", + "pages": [ + {"path": 
"wiki/queries/conflict-pricing-2026-06-03.md", "summary": "ignored", "maturity": "draft", "status": "created"} + ] +} +EOF + + export KG_LIB_DIR="$LIB_DIR" + export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" + export DRY_RUN=1 + + run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test + [ "$status" -eq 0 ] + [[ "$output" == *'"conflict":true'* ]] + # listed by slug under the Conflicts section + grep -q 'queries/conflict-pricing-2026-06-03' wiki/index.md +} From 35f476c2c7178caed293013d4d1f3e10caeeb936 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:09 +0200 Subject: [PATCH 11/28] feat(ingest): Add diagnose-run-ingest.sh diagnostic tool --- diagnose-run-ingest.sh | 130 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 diagnose-run-ingest.sh diff --git a/diagnose-run-ingest.sh b/diagnose-run-ingest.sh new file mode 100644 index 0000000..0cfa30d --- /dev/null +++ b/diagnose-run-ingest.sh @@ -0,0 +1,130 @@ +#!/usr/bin/env bash +# diagnose-run-ingest.sh +# Run from the repo root: bash diagnose-run-ingest.sh +# Builds the same fixture the bats test uses and runs run-ingest under `bash -x` +# so we can see exactly which command makes it exit non-zero. + +set -uo pipefail + +REPO="$(pwd)" +RI="${REPO}/skills/ingest/scripts/run-ingest.sh" + +echo "==================== ENV ====================" +echo "bash: $(bash --version | head -1)" +echo "git : $(git --version)" +echo "jq : $(jq --version 2>/dev/null || echo MISSING)" +echo "py : $(python3 --version 2>/dev/null || echo MISSING)" +echo + +echo "============ run-ingest.sh on disk ============" +if [[ ! 
-f "$RI" ]]; then echo "NOT FOUND: $RI (run me from the repo root)"; exit 1; fi +echo "-- helper invocations (want 'bash ...'): --" +grep -nE 'log-append\.sh|scoped-lint\.sh|open-pr\.sh' "$RI" +echo "-- result emitter (want 'jq -nc'): --" +grep -nE 'jq -nc?|jq -n ' "$RI" +echo + +echo "============ build hermetic fixture ============" +T="$(mktemp -d)" +mkdir -p "$T/nohooks" +git init --bare -q "$T/origin.git" +g="$T/g" +mkdir -p "$g"/{raw/articles,wiki/sources,wiki/entities,wiki/concepts,wiki/queries,wiki/private} + +cat > "$g/wiki/index.md" <<'EOF' +--- +title: "Index" +type: index +domain: genome-test +maturity: stable +last_updated: 2026-01-01 +private: false +--- + +# Index + +--- + +## Sources (`wiki/sources/`) +*x* + + +## Entities (`wiki/entities/`) +*x* + + +## Concepts (`wiki/concepts/`) +*x* + + +## Queries (`wiki/queries/`) +*x* + + +## Conflicts Pending Review (`wiki/queries/conflict-*.md`) +*x* +EOF + +cat > "$g/wiki/log.md" <<'EOF' +--- +title: "Log" +type: log +domain: genome-test +maturity: stable +last_updated: 2026-01-01 +private: false +--- + +# Log + +--- + +## [2026-01-01] CONFIG | init +- run_id: `init` +EOF + +echo raw > "$g/raw/articles/test.md" + +( + cd "$g" + git init -q + git config commit.gpgsign false + git config core.hooksPath "$T/nohooks" + git config user.email t@t + git config user.name t + git add . 
+ git commit -qm init + git branch -M main + git remote add origin "$T/origin.git" + git push -q -u origin main +) && echo "fixture commit+push OK" || echo "FIXTURE SETUP FAILED (look above)" + +cat > "$g/wiki/sources/test-source.md" <<'EOF' +--- +title: "Test Source" +type: source +domain: genome-test +tags: [t] +maturity: draft +last_updated: 2026-06-04 +private: false +--- +body +EOF + +cat > "$g/.ingest-manifest.json" <<'EOF' +{ "raw_source":"raw/articles/test.md","model":"m","reasoning":"r","pr_summary":"s","contradictions":"None", + "pages":[{"path":"wiki/sources/test-source.md","summary":"a source","maturity":"draft","status":"created"}] } +EOF + +echo +echo "============ run-ingest (bash -x) ============" +cd "$g" +export KG_LIB_DIR="${REPO}/lib" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1 +bash -x "$RI" genome-test >"$T/out.txt" 2>"$T/trace.txt" +rc=$? +echo "EXIT=$rc" +echo "-- run-ingest stdout (final JSON should be here): --" +cat "$T/out.txt" +echo "-- last 25 lines of the trace (the failing command is near the end): --" +tail -n 25 "$T/trace.txt" From 624bd5f8d584fc1ee14140d0c1295af116d1c8b9 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Thu, 4 Jun 2026 10:59:09 +0200 Subject: [PATCH 12/28] chore: Remove deprecated frontmatter reference; add master README template --- templates/readme-master.md | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 templates/readme-master.md diff --git a/templates/readme-master.md b/templates/readme-master.md new file mode 100644 index 0000000..f8ad347 --- /dev/null +++ b/templates/readme-master.md @@ -0,0 +1,45 @@ +# {{MASTER_REPO}} + +Master (umbrella) repository for the Knowledge Genome network. 
+ +| Field | Value | +| ---------- | -------------------------------------------------- | +| Owner | `{{FORGEJO_USER}}` | +| Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}` | +| Scaffolded | `{{DATE}}` | + +## What this repo is + +This repository does **not** hold knowledge itself. It is the orchestrator: each genome +is a Git submodule, plus `core-karpathy` as a read-only reference pattern. Cross-genome +coordination rules live in `AGENTS.md`. + +```text +{{MASTER_REPO}}/ +├── core-karpathy/ ← reference pattern — read-only, never modify +├── genome-*/ ← one submodule per genome (own AGENTS.md, own git-crypt) +└── AGENTS.md ← cross-genome coordinator (boundaries only) +``` + +## Working with submodules + +```bash +# Clone with all genomes +git clone --recurse-submodules {{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}.git + +# Pull the latest pointers for every genome +git submodule update --remote --merge + +# Operate inside a single genome (one genome at a time — see AGENTS.md) +cd genome- +``` + +## Rules of the road + +- Operate within **one genome at a time**; no commits spanning multiple genomes. +- `core-karpathy` is read-only. +- Never commit to `main` in a genome — PRs only, no self-merge. +- Private data (`**/private/**`) is git-crypt encrypted and never leaves the local network. + +Genome-level operations are governed by each genome's own `AGENTS.md`. This README and the +master `AGENTS.md` govern boundaries only. 
From 42c1302035a9a91b46bdbed5dbbe07c30c1eff35 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:24:58 +0200 Subject: [PATCH 13/28] feat(build): Introduce testing, structure management, and linked genome support --- Makefile | 40 +++++++++++++++++-------- skills/ingest/references/frontmatter.md | 0 2 files changed, 27 insertions(+), 13 deletions(-) delete mode 100644 skills/ingest/references/frontmatter.md diff --git a/Makefile b/Makefile index 09243a7..8e9441c 100644 --- a/Makefile +++ b/Makefile @@ -1,22 +1,25 @@ # ============================================================================= -# Knowledge Genome - Makefile v. 1.0.0 +# Knowledge Genome - Makefile v. 1.1.0 # Orchestrates the setup and management of the knowledge base. # ============================================================================= include globals.env export $(shell grep -v '^[#[:space:]]' globals.env | sed 's/=.*//') -.PHONY: setup add-genome status lint lock doctor sync help +.PHONY: setup add-genome status lint lock doctor sync test verify-structure sync-structure help help: @echo "Available commands:" - @echo " make setup - Full system initialization" - @echo " make add-genome - Register and scaffold a new genome" - @echo " make status - Check submodule and encryption status" - @echo " make lint - Verify schema, privacy flags, and metadata" - @echo " make lock - Lock all encrypted files across all genomes" - @echo " make doctor - Verify all required tools are installed" - @echo " make sync - Sync submodules and report unpushed commits" + @echo " make setup - Full system initialization" + @echo " make add-genome - Register and scaffold a new genome [LINKED=owner/repo]" + @echo " make status - Check submodule and encryption status" + @echo " make lint - Verify schema, privacy flags, and metadata" + @echo " make verify-structure - Report directory drift across all genomes" + @echo " make sync-structure - Create any missing canonical dirs (safe)" + @echo " make 
test - Run the bats test suite (no LLM/GPU needed)" + @echo " make lock - Lock all encrypted files across all genomes" + @echo " make doctor - Verify all required tools are installed" + @echo " make sync - Sync submodules and report unpushed commits" lint: @bash scripts/lint-genomes.sh @@ -27,16 +30,26 @@ setup: add-genome: @if [ -z "$(NAME)" ] || [ -z "$(DESC)" ]; then \ echo "Error: NAME and DESC are required."; \ - echo "Usage: make add-genome NAME=my-genome DESC='My description'"; \ + echo "Usage: make add-genome NAME=my-genome DESC='My description' [LINKED=owner/project-repo]"; \ exit 1; \ fi - @bash scripts/add-genome.sh "$(NAME)" "$(DESC)" + @bash scripts/add-genome.sh "$(NAME)" "$(DESC)" "$(LINKED)" status: @echo "--- Master Status ---" @git submodule status - @echo "--- Encryption Status (First 10 files) ---" - @git-crypt status | head -n 10 + @echo "--- Encryption Status (per genome) ---" + @git submodule foreach 'git-crypt status 2>/dev/null | head -n 10 || true' + +verify-structure: + @bash scripts/verify-genomes.sh + +sync-structure: + @bash scripts/verify-genomes.sh --sync + +test: + @command -v bats >/dev/null 2>&1 || { echo " MISSING: bats (sudo apt install bats)"; exit 1; } + @bats tests/ doctor: @echo "Checking required tools..." @@ -45,6 +58,7 @@ doctor: @command -v curl >/dev/null 2>&1 || { echo " MISSING: curl"; exit 1; } @command -v jq >/dev/null 2>&1 || { echo " MISSING: jq"; exit 1; } @command -v bw >/dev/null 2>&1 || echo " OPTIONAL: bw (Bitwarden CLI) not found — key injection will be manual." + @command -v python3 >/dev/null 2>&1 || echo " OPTIONAL: python3 not found — needed for 'make test' and the ingest skill (index-append.py), not for setup." @echo "System ready." 
sync: diff --git a/skills/ingest/references/frontmatter.md b/skills/ingest/references/frontmatter.md deleted file mode 100644 index e69de29..0000000 From 13d34b49067b6c95851a0f0c42612d0ad4901a50 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:27:14 +0200 Subject: [PATCH 14/28] docs: Update README --- README.md | 228 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 170 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 5c75647..96fe9c2 100644 --- a/README.md +++ b/README.md @@ -19,16 +19,17 @@ and a human-in-the-loop Git Flow for quality control. 5. [Configuration](#configuration) 6. [Quick Start](#quick-start) 7. [Makefile Reference](#makefile-reference) -8. [Genome Lifecycle](#genome-lifecycle) -9. [Security Model](#security-model) -10. [Key Management](#key-management) -11. [Agent Sessions](#agent-sessions) -12. [Workflows](#workflows) -13. [Knowledge Quality](#knowledge-quality) -14. [Knowledge Schema](#knowledge-schema) -15. [Collaboration Model](#collaboration-model) -16. [Optional Extensions](#optional-extensions) -17. [Troubleshooting](#troubleshooting) +8. [Testing](#testing) +9. [Genome Lifecycle](#genome-lifecycle) +10. [Security Model](#security-model) +11. [Key Management](#key-management) +12. [Agent Sessions](#agent-sessions) +13. [Workflows](#workflows) +14. [Knowledge Quality](#knowledge-quality) +15. [Knowledge Schema](#knowledge-schema) +16. [Collaboration Model](#collaboration-model) +17. [Optional Extensions](#optional-extensions) +18. [Troubleshooting](#troubleshooting) --- @@ -110,10 +111,18 @@ genome-{name}/ | Wiki | `wiki/` | LLM | Agent creates, updates, cross-links, maintains. | | Schema | `AGENTS.md` | Human + LLM | Co-evolved contract defining structure and workflows. | +### Linked projects (optional) + +A genome can optionally declare a **linked project repository** — a separate repo where +the knowledge in that genome is meant to be applied (e.g. 
`genome-dev` linked to an app +repo). The link is recorded as a third field in the registry and rendered into the +genome's `AGENTS.md` (`## Linked Project`). A genome with no link is _knowledge-only_ and +behaves exactly as before. See [Configuration](#configuration). + ### Framework structure ```text -knowledge-genome-setup/ ← This repository (setup tooling) +knowledge-genome-orchestrator/ ← This repository (setup tooling) ├── globals.env ← Static KEY=VALUE config (Make-includable) ├── registry.sh ← Bash-only: GENOMES array + dynamic paths ├── Makefile ← Entry point for all operations @@ -121,6 +130,7 @@ knowledge-genome-setup/ ← This repository (setup tooling) │ ├── output.sh ← Terminal helpers (colors, log levels) │ ├── deps.sh ← Dependency validation │ ├── scaffold.sh ← Template rendering engine +│ ├── structure.sh ← Canonical genome layout (single source of truth) │ ├── lint.sh ← Per-file validation functions │ └── git-crypt.sh ← git-crypt lifecycle (init, export, verify, rotate) ├── providers/ @@ -131,18 +141,41 @@ knowledge-genome-setup/ ← This repository (setup tooling) │ ├── setup-master.sh ← Master repo initialisation │ ├── setup-genomes.sh ← Genome provisioning loop │ ├── add-genome.sh ← Add a single new genome -│ └── lint-genomes.sh ← Quality control across all genomes -└── templates/ - ├── agents-genome.md ← Per-genome agent contract template - ├── agents-master.md ← Master coordination schema template - ├── wiki-index.md ← Index template (rendered per genome) - ├── wiki-log.md ← Log template (rendered per genome) - ├── pr-description.md ← PR review checklist template - ├── pre-commit.sh ← Security hook template - ├── gitattributes ← Git encryption rules template - └── gitignore ← Git ignore template +│ ├── lint-genomes.sh ← Quality control across all genomes +│ └── verify-genomes.sh ← Structure verify / --sync across all genomes +├── templates/ +│ ├── agents-genome.md ← Per-genome agent contract template +│ ├── agents-master.md ← Master coordination 
schema template +│ ├── readme-master.md ← Master repo README template +│ ├── wiki-index.md ← Index template (rendered per genome) +│ ├── wiki-log.md ← Log template (rendered per genome) +│ ├── pr-description.md ← PR review checklist template +│ ├── pre-commit.sh ← Security hook template +│ ├── gitattributes ← Git encryption rules template +│ └── gitignore ← Git ignore template +├── skills/ +│ └── ingest/ ← pi skill: deployed to the AI node (vm101) +│ ├── SKILL.md ← Semantic-only contract (read/edit, emits manifest) +│ ├── references/ ← On-demand reference docs for the agent +│ └── scripts/ ← Deterministic post-processor (runs outside the agent) +│ ├── run-ingest.sh ← Orchestrator: consumes the manifest, emits one JSON line +│ ├── slug.sh ← Slug normalisation +│ ├── index-append.py ← Sorted insert into wiki/index.md + last_updated bump +│ ├── log-append.sh ← Append a wiki/log.md entry +│ ├── scoped-lint.sh ← Lint only the pages touched this run (reuses lib/lint.sh) +│ └── open-pr.sh ← Branch / commit / push / open PR (DRY_RUN seam for tests) +└── tests/ ← bats suite — deterministic, no LLM/GPU (see Testing) + ├── helpers.bash + ├── scripts.bats + ├── lint.bats + ├── structure.bats + └── run-ingest.bats ``` +> The `skills/ingest/` directory is version-controlled here but **deployed** to the AI +> node (vm101) under `~/.pi/agent/skills/ingest`. The agent (`pi`) does only semantic work +> and writes a manifest; `run-ingest.sh` does the mechanical steps. See [Workflows → Ingest](#ingest). + --- ## System Requirements @@ -156,7 +189,9 @@ All tools (git-crypt, bw, qmd) have native Linux binaries. All scripts are compatible with macOS. Requirements: -- bash 3.2+ (macOS default) — fully supported. All `bash 4+` constructs removed. +- bash 3.2+ (macOS default) — supported for the **setup scripts** (`make` targets, scaffolding). 
+ The `ingest` skill uses bash 4+ constructs (`mapfile`), but it is deployed and run on the + Linux AI node, not on the macOS setup machine — so this is not a constraint in practice. - GNU coreutils not required — BSD variants of `date`, `grep`, `sed` all handled. - `git-crypt`: install via Homebrew — `brew install git-crypt` - `jq`, `curl`: pre-installed or via Homebrew @@ -195,6 +230,11 @@ The system is designed for a homelab architecture: > the index, and the log tail is a cost. This is why all agent files are token-optimised > and sessions are kept to one source at a time. +> **Reference deployment:** the table above is a target profile, not a hard requirement. +> The current setup runs a single 16GB GPU (RTX 5060 Ti) with a ~9B model for interactive +> ingest, and offloads heavy/async synthesis to a cloud model. Smaller models work — they +> just make the "one source per session" discipline and the token budget matter more. + --- ## Prerequisites @@ -285,14 +325,17 @@ resolution. Never included by Make. ```bash # Dynamic paths (resolved at source time) -WORK_DIR="${HOME}/knowledge-genome-setup" +WORK_DIR="${HOME}/knowledge-genome-orchestrator" KEYS_DIR="${WORK_DIR}/keys" -# Genome registry — format: "name|description" +# Genome registry — format: "name|description|linked_repo" +# The third field is OPTIONAL: +# - leave it empty → knowledge-only genome (no linked project) +# - owner/repo → genome is linked to that project repository (rendered into AGENTS.md) GENOMES=( - "genome-dev|Web development, TUI, Angular, software architecture" - "genome-finance|Personal finance, investments, market analysis" - "genome-homelab|Infrastructure, network configs, architecture logs" + "genome-dev|Web development, TUI, Angular, software architecture|myorg/my-app" + "genome-finance|Personal finance, investments, market analysis|" + "genome-homelab|Infrastructure, network configs, architecture logs|" ) ``` @@ -315,8 +358,8 @@ export GITHUB_TOKEN="your_github_token" ```bash # 1. 
Clone the setup framework -git clone knowledge-genome-setup -cd knowledge-genome-setup +git clone knowledge-genome-orchestrator +cd knowledge-genome-orchestrator # 2. Configure your environment cp globals.env.example globals.env # edit with your values @@ -358,16 +401,19 @@ After setup completes: ## Makefile Reference -| Target | Description | -| --------------------------------- | ------------------------------------------------------------------------------ | -| `make setup` | Full system initialisation — master repo + all genomes in `registry.sh` | -| `make add-genome NAME=x DESC="y"` | Scaffold and register a single new genome | -| `make lint` | Run quality checks across all genomes (schema, privacy, decay, page size) | -| `make status` | Show submodule status and first 10 git-crypt encryption states | -| `make lock` | Lock all encrypted repos (master + all genome submodules) | -| `make doctor` | Verify required tools: git, git-crypt, curl, jq; warn if bw missing | -| `make sync` | `git submodule update --init --recursive` + report unpushed commits per genome | -| `make help` | Print all available targets | +| Target | Description | +| ----------------------------------------------------- | ------------------------------------------------------------------------------------- | +| `make setup` | Full system initialisation — master repo + all genomes in `registry.sh` | +| `make add-genome NAME=x DESC="y" [LINKED=owner/repo]` | Scaffold and register a single new genome (optional linked project) | +| `make lint` | Run quality checks across all genomes (schema, privacy, decay, page size) | +| `make verify-structure` | Report directory drift of each genome vs the canonical layout (`lib/structure.sh`) | +| `make sync-structure` | Create any missing canonical directories across all genomes (safe, idempotent) | +| `make test` | Run the bats test suite (deterministic; no LLM/GPU/network) — see [Testing](#testing) | +| `make status` | Show submodule status and per-genome 
git-crypt encryption state | +| `make lock` | Lock all encrypted repos (master + all genome submodules) | +| `make doctor` | Verify required tools: git, git-crypt, curl, jq; warn if bw missing | +| `make sync` | `git submodule update --init --recursive` + report unpushed commits per genome | +| `make help` | Print all available targets | ### Examples @@ -378,6 +424,12 @@ make doctor # Add a new genome after initial setup make add-genome NAME=genome-research DESC="Academic papers and deep research" +# Add a genome linked to a project repository +make add-genome NAME=genome-dev DESC="Web development" LINKED=myorg/my-app + +# Check every genome against the canonical directory layout +make verify-structure + # Run full lint pass (bash deterministic checks) make lint @@ -390,6 +442,38 @@ make lock --- +## Testing + +The mechanical layer (slug, index, log, lint, structure, the ingest orchestrator) is +covered by a [bats](https://github.com/bats-core/bats-core) suite. The tests are +**deterministic and have zero dependency on the LLM, the GPU, or the network** — they +simulate the agent's output with fixtures and exercise the scripts directly, so they run +anywhere git + bash live (laptop, CI, a git hook). They are **not** meant to run on the AI +node or via n8n. + +```bash +sudo apt install bats # once +make test # or: bats tests/ +``` + +| File | Covers | +| ----------------- | ------------------------------------------------------------------------------ | +| `scripts.bats` | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) | +| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` | +| `structure.bats` | `lib/structure.sh` report / sync | +| `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` | + +Each test builds its own throwaway genome with a local bare remote, configured to ignore +the operator's global git settings (signing, global hooks) so the suite is hermetic. 
The +`run-ingest` tests auto-`skip` if `jq` is absent. If you change the canonical layout in +`lib/structure.sh`, update `FIXTURE_DIRS` in `tests/helpers.bash` to match. + +> Why this matters: the only non-deterministic part of the system is the model. Pinning +> the mechanical layer with tests means that when an ingest misbehaves, you know it's the +> model or the prompt — not the plumbing. + +--- + ## Genome Lifecycle ### Initial setup @@ -431,6 +515,7 @@ template files: | `{{GENOME_NAME}}` | registry.sh | `genome-dev` | | `{{GENOME_NAME_UPPER}}` | derived | `GENOME-DEV` | | `{{GENOME_DESC}}` | registry.sh | `Web development...` | +| `{{LINKED_PROJECT}}` | registry.sh | `myorg/my-app` (or `none`) | | `{{FORGEJO_URL}}` | globals.env | `https://git.yourserver.com` | | `{{FORGEJO_USER}}` | globals.env | `yourusername` | | `{{VAULTWARDEN_URL}}` | globals.env | `https://vault.yourserver.com` | @@ -593,9 +678,9 @@ git clone https://git.yourserver.com/yourusername/genome-dev.git If a key is lost or compromised: ```bash -# From the knowledge-genome-setup/ directory +# From the knowledge-genome-orchestrator/ directory source lib/git-crypt.sh -cd ~/knowledge-genome-setup/genome-dev +cd ~/knowledge-genome-orchestrator/genome-dev gcrypt_rotate_key "genome-dev" ``` @@ -643,7 +728,8 @@ The agent executes in this order at the start of every session: 1. Read `wiki/index.md` — primary catalog of all pages and maturity 2. Read last 20 log entries (injected by orchestrator — does NOT open `wiki/log.md` directly) -3. For tasks involving related pages: `qmd search ""` before opening any files +3. For tasks involving related pages: if the optional `qmd` extension is installed, + `qmd search ""` before opening files; otherwise navigate from `wiki/index.md` 4. Operate on individual files — never scan entire directories ### One source per session @@ -668,7 +754,7 @@ For Forgejo webhook → automated ingest: 2. n8n receives webhook, identifies new files 3. 
n8n starts one agent session per new file (sequential, not parallel) 4. Each session: inject `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path -5. Agent ingest workflow runs, opens PR +5. Phase 1 agent (`/skill:ingest`) writes the manifest; Phase 2 `run-ingest.sh` opens the PR 6. Human reviews and merges PR --- @@ -677,17 +763,39 @@ For Forgejo webhook → automated ingest: ### Ingest -Triggered by a new file in `raw/` (manual or via webhook). +Triggered by a new file in `raw/` (manual or via webhook). Ingest is split into two +phases so that the small local model spends its limited context only on judgement, and +all the deterministic bookkeeping happens outside the model's loop. -1. Read source once -2. Create `wiki/sources/.md` — summary and key points -3. Per entity (person, tool, organisation): create or update `wiki/entities/.md` -4. Per concept (pattern, theory, decision): create or update `wiki/concepts/.md` -5. Check each touched page for contradictions → apply Conflict Resolution if found -6. Append entry to `wiki/index.md` (bottom of relevant section — do not reorder) -7. Append log entry: `INGEST | ` -8. Run scoped lint on pages created or modified in this session; report in PR -9. Commit on `feat/ai-ingest-`; open PR using `templates/pr-description.md` +**Phase 1 — agent (semantic only).** The `ingest` skill gives the agent read/edit tools +only (no shell). It: + +1. Reads the source once +2. Creates `wiki/sources/.md` — summary and key points +3. Per entity (person, tool, organisation): creates or updates `wiki/entities/.md` +4. Per concept (pattern, theory, decision): creates or updates `wiki/concepts/.md` +5. Checks each touched page for contradictions → applies Conflict Resolution if found +6. 
Writes `.ingest-manifest.json` (the list of pages it created/modified, a one-line reasoning, + the PR summary, and any contradictions; the model name is recorded by the orchestrator via `INGEST_MODEL`, not by the agent) — then **stops** + +**Phase 2 — `run-ingest.sh` (deterministic, outside the agent).** The post-processor +consumes the manifest and does the mechanical work the model must not waste context on: + +7. Inserts each page into the correct `wiki/index.md` section **in alphabetical order** + (`index-append.py`) and bumps the index `last_updated` +8. Appends the `INGEST | ` entry to `wiki/log.md` +9. Runs scoped lint on exactly the pages touched this run (`scoped-lint.sh`, reusing + `lib/lint.sh`) +10. Commits on `feat/ai-ingest-` and opens the PR using `templates/pr-description.md` +11. Emits a single compact JSON line (status, slug, PR url, lint_clean, conflict) for n8n + +The agent never runs git, never edits the index/log mechanically, and never lints — those +are deterministic and tested (see [Testing](#testing)). Invocation on the AI node: + +```bash +pi --mode json -p "/skill:ingest raw/articles/.md" # phase 1 → writes manifest +run-ingest.sh # phase 2 → index/log/lint/PR +``` For private sources (`PRIVATE_CONTEXT: enabled` required): @@ -698,7 +806,8 @@ For private sources (`PRIVATE_CONTEXT: enabled` required): Triggered by an operator question. -1. `qmd search ""` → identify candidate pages +1. `qmd search ""` (if the optional qmd extension is installed) → identify + candidate pages; otherwise start from `wiki/index.md` 2. Read candidate pages directly (qmd already returns file paths — no intermediate index lookup) 3. Synthesise answer with `[[wikilink]]` citations 4. If answer is non-trivial: save as `wiki/queries/.md` and append to index @@ -974,7 +1083,8 @@ n8n (running on the storage node) can automate the ingest pipeline: 2. n8n flow identifies new files 3. For each new file: starts one agent session (sequential — never parallel) 4.
Each session receives: `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path -5. Agent runs ingest workflow and opens PR +5. Phase 1 — agent runs `/skill:ingest` (semantic → writes manifest); Phase 2 — + `run-ingest.sh` does index/log/lint and opens the PR, returning one JSON line to n8n 6. Human reviews the PR Key constraint: one source per session, sessions sequential. @@ -984,11 +1094,13 @@ Never batch multiple sources into one agent session. If the AI compute node has an Intel NPU (e.g. Core Ultra series): -- Background tasks (embedding updates, index refresh) → Intel NPU via OpenVINO +- Background/auxiliary tasks (OCR of `raw/assets/`, async summarisation, or qmd + re-indexing **if** the optional qmd extension is in use) → Intel NPU via OpenVINO - Active reasoning sessions (ingest, query, synthesis) → GPU -This keeps the GPU's KV cache free for interactive work and reduces power consumption -for background operations. +Note: the core system has no embedding pipeline (see [Core Philosophy](#core-philosophy)), +so there is nothing to embed here — the NPU is only for auxiliary work. This keeps the +GPU's KV cache free for interactive sessions and lowers power draw for background jobs. --- From 76700cd2a68aea67115296478135e8d3b55a087b Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:33:45 +0200 Subject: [PATCH 15/28] docs(ingest): Define manifest rules for model and maturity fields --- skills/ingest/SKILL.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/skills/ingest/SKILL.md b/skills/ingest/SKILL.md index bd75214..449bb2e 100644 --- a/skills/ingest/SKILL.md +++ b/skills/ingest/SKILL.md @@ -52,7 +52,6 @@ append to the log/index, or open anything. 
```json { "raw_source": "raw/articles/foo.md", - "model": "", "reasoning": "One sentence for the log: what changed and why.", "pr_summary": "One or two sentences describing this ingest for the PR.", "contradictions": "None (or: 1 conflict file created — )", @@ -66,7 +65,6 @@ append to the log/index, or open anything. { "path": "wiki/entities/acme.md", "summary": "Acme — vendor.", - "maturity": "draft", "status": "modified" } ] @@ -78,6 +76,10 @@ Manifest rules: - List every page you created or modified, with `status` `created` or `modified`. - `summary` is the one-line index description (≈12 words max). For conflict pages the summary is ignored — the index lists conflicts by slug only. +- `maturity` is required only on `created` pages (it seeds the new index entry). It is + ignored for `modified` pages, so omit it there. +- Do NOT add a `model` field — the orchestrator records which model produced this run; you + cannot know your own model name reliably, so do not guess one. - Do not invent a `run_id`, branch, commit, or PR — those belong to the post-processor. One source per session. After writing the manifest, stop. 
From 39775398f7212c8636ea6a9c4c8513166f98dc01 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:33:45 +0200 Subject: [PATCH 16/28] feat(ingest): Use orchestrator-provided INGEST_MODEL for log attribution --- skills/ingest/scripts/run-ingest.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index 5bb6964..4ae9c97 100644 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -27,7 +27,9 @@ command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by in # --- read manifest scalars --- raw_source="$(jq -r '.raw_source' "$manifest")" -model="$(jq -r '.model // "unknown"' "$manifest")" +# model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent cannot know its +# own tag, so we do not trust a self-reported manifest field. Fall back only if unset. +model="${INGEST_MODEL:-$(jq -r '.model // "unknown"' "$manifest")}" reasoning="$(jq -r '.reasoning // "Ingest."' "$manifest")" pr_summary="$(jq -r '.pr_summary // "Ingest."' "$manifest")" contradictions="$(jq -r '.contradictions // "None"' "$manifest")" From 50d3f39f5186f47a6c133f027c6e4a5590d5776e Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:33:45 +0200 Subject: [PATCH 17/28] test(ingest): Add integration test for INGEST_MODEL logging --- tests/run-ingest.bats | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/run-ingest.bats b/tests/run-ingest.bats index 20864f9..65fdb43 100644 --- a/tests/run-ingest.bats +++ b/tests/run-ingest.bats @@ -91,3 +91,44 @@ EOF # listed by slug under the Conflicts section grep -q 'queries/conflict-pricing-2026-06-03' wiki/index.md } + +@test "run-ingest: records INGEST_MODEL in the log (manifest carries no model field)" { + command -v jq >/dev/null 2>&1 || skip "jq not installed" + + G="$(make_fixture_genome)"; cd "$G" + + cat > 
wiki/sources/test-source.md <<'EOF' +--- +title: "Test Source" +type: source +domain: genome-test +tags: [t] +maturity: draft +last_updated: 2026-06-04 +private: false +--- +body +EOF + + # New contract: NO "model" field — the orchestrator supplies it via INGEST_MODEL. + cat > .ingest-manifest.json <<'EOF' +{ + "raw_source": "raw/articles/test.md", + "reasoning": "Ingested the test source.", + "pr_summary": "Ingest of test: 1 source page.", + "contradictions": "None", + "pages": [ + {"path": "wiki/sources/test-source.md", "summary": "A smoke-test source.", "maturity": "draft", "status": "created"} + ] +} +EOF + + export KG_LIB_DIR="$LIB_DIR" + export FORGEJO_URL="http://forgejo.local" FORGEJO_USER="u" FORGEJO_TOKEN="t" DRY_RUN=1 + export INGEST_MODEL="qwen-test-tag" + + run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test + [ "$status" -eq 0 ] + [[ "$output" == *'"status":"ok"'* ]] + grep -q 'qwen-test-tag' wiki/log.md +} From 6d1151fa5a55937a4afc39e808fed98b20f232a5 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:33:45 +0200 Subject: [PATCH 18/28] docs(ingest): Enhance guidance for naming and context awareness --- skills/ingest/SKILL.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/skills/ingest/SKILL.md b/skills/ingest/SKILL.md index 449bb2e..1f143d7 100644 --- a/skills/ingest/SKILL.md +++ b/skills/ingest/SKILL.md @@ -39,9 +39,17 @@ which matters on a small local model. 5. On a real contradiction with an existing claim, follow `AGENTS.md` §Conflict: create `wiki/queries/conflict--.md`. Never overwrite the existing page. -Name files in kebab-case and pick stable names. Read `wiki/index.md` (and the specific -pages it points to) to decide create-vs-update and to spot contradictions. Do not scan -whole directories. +**Naming — you are the sole author of these names; nothing renames your files.** Use +minimal kebab-case: lowercase letters, digits and hyphens only — no spaces, no underscores, +no capitals. 
Pick stable names so the same entity is never created twice (always `acme`, +never also `acme-corp`). The path you write a file to MUST be byte-for-byte the path you +list in the manifest. + +**Deciding create-vs-update and spotting contradictions — mind the context budget.** Use +`wiki/index.md` to locate existing pages, then read **only** the handful that _this source +actually names_ — the entities and concepts in the source's title and opening paragraphs — +not everything the index lists. When in doubt, read fewer: a missed cross-link is far +cheaper than a saturated context. Never scan whole directories. ## Finish: write the manifest, then STOP From 203fbadd6306263d4eefdc9b491cde025e324a59 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH 19/28] feat(ingest): Refine slug generation with validation --- skills/ingest/scripts/run-ingest.sh | 2 +- skills/ingest/scripts/slug.sh | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index 4ae9c97..032761c 100644 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -36,7 +36,7 @@ contradictions="$(jq -r '.contradictions // "None"' "$manifest")" [[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing" -slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" +slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}" # --- collect touched paths --- mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest") diff --git a/skills/ingest/scripts/slug.sh b/skills/ingest/scripts/slug.sh index a5711ac..2f7fdc5 100644 --- a/skills/ingest/scripts/slug.sh +++ b/skills/ingest/scripts/slug.sh @@ -13,6 +13,11 @@ input="${1:?usage: slug.sh }" base="${input##*/}" base="${base%.*}" -printf '%s\n' "$base" \ +slug="$(printf '%s\n' "$base" \ | tr 
'[:upper:]' '[:lower:]' \ - | sed -E 's/[^a-z0-9]+/-/g; s/-{2,}/-/g; s/^-+//; s/-+$//' + | sed -E 's/[^a-z0-9]+/-/g; s/-{2,}/-/g; s/^-+//; s/-+$//')" + +# An all-symbols input (e.g. "!!!.md") collapses to "" — refuse rather than emit a +# broken/empty slug that would produce an invalid branch name downstream. +[[ -n "$slug" ]] || { echo "slug: empty result for input '${input}'" >&2; exit 1; } +printf '%s\n' "$slug" From e8980b55260c90b9d5d0f35208094f2b892d1e93 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH 20/28] feat(ingest): Implement wikilink-based deduplication for index entries --- skills/ingest/scripts/index-append.py | 29 +++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/skills/ingest/scripts/index-append.py b/skills/ingest/scripts/index-append.py index e70009a..a4fc718 100644 --- a/skills/ingest/scripts/index-append.py +++ b/skills/ingest/scripts/index-append.py @@ -18,6 +18,7 @@ import re import sys ENTRY_RE = re.compile(r"^- \[\[") +LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]") HEADER_RE = re.compile(r"^## ") @@ -71,11 +72,31 @@ def main() -> int: intro = [ln for ln in body if not ENTRY_RE.match(ln)] entries = [ln for ln in body if ENTRY_RE.match(ln)] - if args.entry in entries: - print(f"index-append: entry already present, skipping") - return 0 + # Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed + # summary/maturity should UPDATE the existing entry, not add a duplicate line. 
+ new_m = LINK_RE.match(args.entry) + new_link = new_m.group(1) if new_m else None + + if new_link is not None: + replaced = False + for idx, ln in enumerate(entries): + m = LINK_RE.match(ln) + if m and m.group(1) == new_link: + if ln == args.entry: + print("index-append: entry already present, skipping") + return 0 + entries[idx] = args.entry # same page, refreshed text + replaced = True + break + if not replaced: + entries.append(args.entry) + else: + # No parseable wikilink — fall back to exact-line dedup. + if args.entry in entries: + print("index-append: entry already present, skipping") + return 0 + entries.append(args.entry) - entries.append(args.entry) entries.sort(key=str.casefold) # Normalise intro: drop trailing blanks, keep header + comment(s) From 99806b8b3dac920eddc4be99be69705bcf98bae7 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH 21/28] fix(ingest): Add frontmatter missing warning to index-append.py --- skills/ingest/scripts/index-append.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skills/ingest/scripts/index-append.py b/skills/ingest/scripts/index-append.py index a4fc718..bb91238 100644 --- a/skills/ingest/scripts/index-append.py +++ b/skills/ingest/scripts/index-append.py @@ -50,6 +50,10 @@ def main() -> int: if fm_open and ln.startswith("last_updated:"): lines[i] = f"last_updated: {today}" + if not fm_open: + print("index-append: warning: no frontmatter found, last_updated not bumped", + file=sys.stderr) + # 2. 
Locate the target section [start, end) start = None for i, ln in enumerate(lines): From 00fb74c76a9bc1ab000e68500ed06c8f6efdaf64 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH 22/28] feat(ingest): Improve general script robustness and cleanup --- skills/ingest/scripts/log-append.sh | 2 +- skills/ingest/scripts/run-ingest.sh | 13 ++++++++++++- skills/ingest/scripts/scoped-lint.sh | 7 ++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/skills/ingest/scripts/log-append.sh b/skills/ingest/scripts/log-append.sh index 32e6ca0..b3108a2 100644 --- a/skills/ingest/scripts/log-append.sh +++ b/skills/ingest/scripts/log-append.sh @@ -35,7 +35,7 @@ esac [[ -f "$LOG_FILE" ]] || { echo "log-append: not found: $LOG_FILE" >&2; exit 1; } -run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid)" +run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || python3 -c 'import uuid; print(uuid.uuid4())')" today="$(date +%Y-%m-%d)" { diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index 032761c..4ffbd1f 100644 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -46,6 +46,11 @@ all_paths=( "${created_paths[@]}" "${modified_paths[@]}" ) conflict_label="" +# NOTE: no rollback. Steps below mutate the working tree in order (index → log → commit). +# All are idempotent on re-run EXCEPT log-append (append-only). If a step fails midway, +# nothing is committed (open-pr is the only committer) — the operator re-runs, or inspects +# wiki/ if log-append already wrote a line. The manifest is removed only on full success. + # --- 1. 
index entries (created pages only), inserted in order --- while IFS=$'\t' read -r path summary maturity; do [[ -z "$path" ]] && continue @@ -119,4 +124,10 @@ jq -nc \ --arg detail "$pr_out" \ '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}' -[[ $pr_rc -eq 0 ]] +# The manifest is a single file overwritten by each pi run (not accumulating), but on full +# success we remove it so a stale manifest can never be re-processed by mistake. +if [[ $pr_rc -eq 0 ]]; then + rm -f "$manifest" +else + exit 1 +fi diff --git a/skills/ingest/scripts/scoped-lint.sh b/skills/ingest/scripts/scoped-lint.sh index a064fd9..ded50a1 100644 --- a/skills/ingest/scripts/scoped-lint.sh +++ b/skills/ingest/scripts/scoped-lint.sh @@ -12,7 +12,12 @@ # ============================================================================= set -euo pipefail -: "${KG_LIB_DIR:?set KG_LIB_DIR to the framework lib/ dir (e.g. /opt/knowledge-genome-setup/lib)}" +: "${KG_LIB_DIR:?set KG_LIB_DIR to the framework lib/ dir (e.g. /opt/knowledge-genome-orchestrator/lib)}" + +# Fail clearly if the lib files are missing, rather than a raw `source: No such file`. +for _f in output.sh lint.sh; do + [[ -f "${KG_LIB_DIR}/${_f}" ]] || { echo "scoped-lint: missing ${KG_LIB_DIR}/${_f}" >&2; exit 1; } +done # shellcheck source=/dev/null source "${KG_LIB_DIR}/output.sh" From 3272450ec557b5f61ecb01347ca46fb3a5c87f45 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH 23/28] fix(ingest): Scope git operations and add curl timeouts in open-pr.sh --- skills/ingest/scripts/open-pr.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/skills/ingest/scripts/open-pr.sh b/skills/ingest/scripts/open-pr.sh index 9ea4a9b..8f8f23b 100644 --- a/skills/ingest/scripts/open-pr.sh +++ b/skills/ingest/scripts/open-pr.sh @@ -39,11 +39,13 @@ repo="$(basename -s .git "$(git config --get remote.origin.url)")" # 1. 
Branch + commit + push (AGENTS.md rule 5: never commit to main) git switch -c "$branch" 2>/dev/null || git switch "$branch" git add wiki/ -if git diff --cached --quiet; then +# Scope BOTH the emptiness check and the commit to wiki/ — never commit anything that +# happened to be staged outside wiki/ (a stray hook, an aborted prior run, etc.). +if git diff --cached --quiet -- wiki/; then echo "open-pr: nothing staged under wiki/ — aborting" >&2 exit 1 fi -git commit -m "$title" +git commit -m "$title" -- wiki/ git push -u origin "$branch" # DRY_RUN: local git work done; skip the Forgejo API (offline tests). @@ -53,19 +55,23 @@ if [[ -n "${DRY_RUN:-}" ]]; then fi # 2. Open the PR via Forgejo API (jq builds the JSON safely) +# TODO: Forgejo-only. When registry.sh/globals.env sets PROVIDER=github, branch on +# $PROVIDER here and delegate to providers/github.sh (same token + http_code contract). body="$(cat "$body_file")" payload="$(jq -n --arg head "$branch" --arg base "$base" \ --arg title "$title" --arg body "$body" \ '{head:$head, base:$base, title:$title, body:$body}')" -resp="$(curl -s -w '\n%{http_code}' \ +resp="$(curl --max-time 30 -s -w '\n%{http_code}' \ -H "Authorization: token ${FORGEJO_TOKEN}" \ -H "Content-Type: application/json" \ -X POST "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls" \ -d "$payload")" -code="$(printf '%s' "$resp" | tail -n1)" -json="$(printf '%s' "$resp" | sed '$d')" +# curl -w appends '\n' AFTER the body, so the code is always the final line and the +# body is everything before it. Parameter expansion (no subshells), robust to multi-line JSON. +code="${resp##*$'\n'}" +json="${resp%$'\n'*}" case "$code" in 201) @@ -89,11 +95,11 @@ esac # 3. Optional label (e.g. CONFLICT). Best-effort; non-fatal. 
if [[ -n "$label" && -n "${number:-}" ]]; then - label_id="$(curl -s -H "Authorization: token ${FORGEJO_TOKEN}" \ + label_id="$(curl --max-time 15 -s -H "Authorization: token ${FORGEJO_TOKEN}" \ "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/labels" \ | jq -r --arg n "$label" '.[] | select(.name==$n) | .id' | head -n1)" if [[ -n "$label_id" && "$label_id" != "null" ]]; then - curl -s -o /dev/null \ + curl --max-time 15 -s -o /dev/null \ -H "Authorization: token ${FORGEJO_TOKEN}" -H "Content-Type: application/json" \ -X POST "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/issues/${number}/labels" \ -d "{\"labels\":[${label_id}]}" \ From 9a81bb2d6fb4450cfa059fcebc39adfed7d0fc43 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH 24/28] test(ingest): Add tests for slug validation and index-append updates --- tests/scripts.bats | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/scripts.bats b/tests/scripts.bats index 5b7fe1b..86d18cd 100644 --- a/tests/scripts.bats +++ b/tests/scripts.bats @@ -51,3 +51,18 @@ load helpers python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/dup]] — d. `maturity: draft`' [ "$(grep -c 'sources/dup' wiki/index.md)" -eq 1 ] } + +@test "index-append: updates an existing entry by wikilink path (no duplicate)" { + G="$(make_fixture_genome)"; cd "$G" + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — old summary. `maturity: draft`' + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — new summary. `maturity: stable`' + [ "$(grep -c 'sources/foo' wiki/index.md)" -eq 1 ] + grep -q 'new summary' wiki/index.md + ! 
grep -q 'old summary' wiki/index.md +} + +@test "slug: refuses an all-symbols input (no empty slug)" { + run bash "$SKILL_SCRIPTS/slug.sh" "!!!.md" + [ "$status" -ne 0 ] + [ -z "$output" ] || [[ "$output" != *"feat/ai-ingest-"* ]] +} From ddf34944e04d0b9d3f6d4c15e85754863a42bfbc Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 10:37:09 +0200 Subject: [PATCH 25/28] feat: improve index-append.py frontmatter self-healing --- skills/ingest/scripts/index-append.py | 10 +++++++++- tests/scripts.bats | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/skills/ingest/scripts/index-append.py b/skills/ingest/scripts/index-append.py index bb91238..89070e4 100644 --- a/skills/ingest/scripts/index-append.py +++ b/skills/ingest/scripts/index-append.py @@ -41,18 +41,26 @@ def main() -> int: # 1. Bump last_updated inside the first frontmatter block fm_open = False + fm_close_idx = None + bumped = False for i, ln in enumerate(lines): if ln.strip() == "---": if not fm_open: fm_open = True continue - break # end of frontmatter + fm_close_idx = i # the closing --- + break if fm_open and ln.startswith("last_updated:"): lines[i] = f"last_updated: {today}" + bumped = True if not fm_open: print("index-append: warning: no frontmatter found, last_updated not bumped", file=sys.stderr) + elif not bumped and fm_close_idx is not None: + # self-heal: frontmatter present but missing the key — insert it before the close + lines.insert(fm_close_idx, f"last_updated: {today}") + print("index-append: last_updated key was missing — inserted", file=sys.stderr) # 2. 
Locate the target section [start, end) start = None diff --git a/tests/scripts.bats b/tests/scripts.bats index 86d18cd..19f758e 100644 --- a/tests/scripts.bats +++ b/tests/scripts.bats @@ -66,3 +66,23 @@ load helpers [ "$status" -ne 0 ] [ -z "$output" ] || [[ "$output" != *"feat/ai-ingest-"* ]] } + +@test "index-append: self-heals a frontmatter missing last_updated" { + G="$(make_fixture_genome)"; cd "$G" + cat > wiki/index.md <<'EOF' +--- +title: "Index" +type: index +domain: genome-test +maturity: stable +private: false +--- + +# Index + +## Sources (`wiki/sources/`) +*x* +EOF + python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — s. `maturity: draft`' + grep -q "^last_updated: $(date +%F)$" wiki/index.md +} From 93bc5bb0075f7f6121f9c796530fe72127bf1c76 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 10:37:09 +0200 Subject: [PATCH 26/28] feat: enhance ingest workflow with validation, PR control, and robustness --- skills/ingest/scripts/open-pr.sh | 10 +++++++- skills/ingest/scripts/run-ingest.sh | 17 ++++++++++--- tests/run-ingest.bats | 39 +++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 4 deletions(-) diff --git a/skills/ingest/scripts/open-pr.sh b/skills/ingest/scripts/open-pr.sh index 8f8f23b..3e979ff 100644 --- a/skills/ingest/scripts/open-pr.sh +++ b/skills/ingest/scripts/open-pr.sh @@ -80,7 +80,15 @@ case "$code" in echo "PR opened: ${url}" ;; 409) - echo "open-pr: a PR for '${branch}' already exists — push updated the branch." >&2 + # PR already exists — fetch it so the orchestrator still gets the URL. 
+ existing="$(curl --max-time 15 -s -H "Authorization: token ${FORGEJO_TOKEN}" \ + "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls?state=open" \ + | jq -r --arg b "$branch" '.[] | select(.head.ref==$b) | .html_url' | head -n1)" + if [[ -n "$existing" && "$existing" != "null" ]]; then + echo "PR opened: ${existing}" + else + echo "open-pr: a PR for '${branch}' already exists (push updated the branch)." >&2 + fi exit 0 ;; 401) diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index 4ffbd1f..ae5b3af 100644 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -25,6 +25,18 @@ command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq mis command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)" [[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}" +# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) --- +# 1) well-formed JSON object with a string raw_source and an array of pages +jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \ + "$manifest" >/dev/null 2>&1 \ + || fail "manifest" "invalid manifest: need object with string raw_source and array pages" +# 2) every page.path must be a string, live under wiki/, and contain no '..' (no traversal) +if jq -e '[.pages[].path + | select((type!="string") or (startswith("wiki/")|not) or test("\\.\\."))] + | length > 0' "$manifest" >/dev/null 2>&1; then + fail "manifest" "unsafe page path (must be a string under wiki/, no '..')" +fi + # --- read manifest scalars --- raw_source="$(jq -r '.raw_source' "$manifest")" # model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent cannot know its @@ -88,6 +100,7 @@ lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 ) # --- 4. 
assemble the PR body (manifest tables + lint results) --- body="$(mktemp)" +trap 'rm -f "$body"' EXIT # auto-clean on any exit (success, fail(), or crash) { echo "## Summary" echo "$pr_summary" @@ -107,13 +120,11 @@ body="$(mktemp)" } > "$body" # --- 5. open the PR --- -pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" ) +pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" --base "${INGEST_BASE:-main}" ) [[ -n "$conflict_label" ]] && pr_args+=( --label "$conflict_label" ) pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$? pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)" -rm -f "$body" - # --- final result line for n8n --- jq -nc \ --arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \ diff --git a/tests/run-ingest.bats b/tests/run-ingest.bats index 65fdb43..6b7c30b 100644 --- a/tests/run-ingest.bats +++ b/tests/run-ingest.bats @@ -132,3 +132,42 @@ EOF [[ "$output" == *'"status":"ok"'* ]] grep -q 'qwen-test-tag' wiki/log.md } + +@test "run-ingest: rejects a manifest path that escapes wiki/ (traversal)" { + command -v jq >/dev/null 2>&1 || skip "jq not installed" + G="$(make_fixture_genome)"; cd "$G" + cat > .ingest-manifest.json <<'EOF' +{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None", + "pages":[{"path":"wiki/../etc/passwd","summary":"x","maturity":"draft","status":"created"}] } +EOF + export KG_LIB_DIR="$LIB_DIR" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1 + run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test + [ "$status" -ne 0 ] + [[ "$output" == *'"status":"error"'* ]] +} + +@test "run-ingest: honours INGEST_BASE for the PR base" { + command -v jq >/dev/null 2>&1 || skip "jq not installed" + G="$(make_fixture_genome)"; cd "$G" + cat > wiki/sources/test-source.md <<'EOF' +--- +title: "Test Source" +type: source +domain: genome-test +tags: [t] +maturity: draft +last_updated: 
2026-06-04 +private: false +--- +body +EOF + cat > .ingest-manifest.json <<'EOF' +{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None", + "pages":[{"path":"wiki/sources/test-source.md","summary":"s","maturity":"draft","status":"created"}] } +EOF + export KG_LIB_DIR="$LIB_DIR" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1 + export INGEST_BASE="develop" + run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test + [ "$status" -eq 0 ] + [[ "$output" == *"develop"* ]] +} From 2426b09b50a39799f627593c36a652602c04fa78 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 10:47:35 +0200 Subject: [PATCH 27/28] refactor(ingest): Refine path validation and improve script clarity --- skills/ingest/scripts/run-ingest.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh index ae5b3af..bc68bcf 100644 --- a/skills/ingest/scripts/run-ingest.sh +++ b/skills/ingest/scripts/run-ingest.sh @@ -30,10 +30,11 @@ command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by in jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \ "$manifest" >/dev/null 2>&1 \ || fail "manifest" "invalid manifest: need object with string raw_source and array pages" + # 2) every page.path must be a string, live under wiki/, and contain no '..' 
(no traversal) if jq -e '[.pages[].path - | select((type!="string") or (startswith("wiki/")|not) or test("\\.\\."))] - | length > 0' "$manifest" >/dev/null 2>&1; then + | select((type!="string") or (startswith("wiki/")|not) or contains(".."))] + | length > 0' "$manifest" >/dev/null 2>&1; then fail "manifest" "unsafe page path (must be a string under wiki/, no '..')" fi @@ -95,12 +96,13 @@ bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" --context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" \ || fail "log" "log-append failed" -# --- 3. scoped lint (capture findings for the PR; never aborts the run) --- +# --- 3. scoped linter (capture findings for the PR; never aborts the run) --- lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$? # --- 4. assemble the PR body (manifest tables + lint results) --- body="$(mktemp)" -trap 'rm -f "$body"' EXIT # auto-clean on any exit (success, fail(), or crash) +trap 'rm -f "$body"' EXIT # auto-clean on any exit (success, fail(), or crash) + { echo "## Summary" echo "$pr_summary" @@ -135,8 +137,8 @@ jq -nc \ --arg detail "$pr_out" \ '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}' -# The manifest is a single file overwritten by each pi run (not accumulating), but on full -# success we remove it so a stale manifest can never be re-processed by mistake. +# The manifest is a single file that is overwritten with each run, but if the process is +# completely successful, we remove it to prevent an outdated manifest from being reprocessed by mistake. 
if [[ $pr_rc -eq 0 ]]; then rm -f "$manifest" else From ab1141e1328501b51c6bb47495d900f16947cbe6 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 12:02:18 +0200 Subject: [PATCH 28/28] feat: Document ingest security model and human-gated workflow --- README.md | 54 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 96fe9c2..c0521a3 100644 --- a/README.md +++ b/README.md @@ -580,6 +580,17 @@ This means: any file matching `**/private/**` in `.gitattributes` is protected, including future `private/` directories created anywhere in the repo. The hook never needs updating when the encryption rules change. +### Untrusted agent output — manifest validation + +The ingest agent's output is stochastic: a hallucinated manifest could carry a missing field, +a wrong type, or a malicious path such as `wiki/../../etc/passwd`. `run-ingest.sh` therefore +**validates the manifest before trusting any field** — it must be well-formed JSON with a +string `raw_source` and an array `pages`, and **every `path` must be a string under `wiki/` +with no `..`**. Anything else fails fast with a structured `{"status":"error"}` and no +filesystem access outside the wiki, so a bad path can't drive a read or a lint outside the +knowledge tree. This is the trust boundary between the (stochastic) model and the +(deterministic, tested) post-processor. + ### PRIVATE_CONTEXT toggle The `PRIVATE_CONTEXT` toggle in `AGENTS.md` controls whether the LLM agent @@ -753,9 +764,9 @@ For Forgejo webhook → automated ingest: 1. Forgejo sends webhook on push to `raw/` 2. n8n receives webhook, identifies new files 3. n8n starts one agent session per new file (sequential, not parallel) -4. Each session: inject `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path -5. Phase 1 agent (`/skill:ingest`) writes the manifest; Phase 2 `run-ingest.sh` opens the PR -6. Human reviews and merges PR +4. 
Each session: realign the checkout to the base (`git switch <base> && git reset --hard origin/<base>`), then inject `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path +5. Phase 1 agent (`/skill:ingest`) writes the manifest; Phase 2 `run-ingest.sh` opens the PR, then **stops** +6. Human reviews — **merge to accept**, or close the PR + delete the `feat` branch to reject --- @@ -778,15 +789,21 @@ only (no shell). It: 6. Writes `.ingest-manifest.json` (the list of pages it created/modified, the model name, a one-line reasoning, the PR summary, and any contradictions) — then **stops** -**Phase 2 — `run-ingest.sh` (deterministic, outside the agent).** The post-processor -consumes the manifest and does the mechanical work the model must not waste context on: +**Phase 2 — `run-ingest.sh` (deterministic, outside the agent).** The post-processor first +**validates the manifest** — well-formed JSON, expected shape, and every page path confined to +`wiki/` with no `..` (see [Security Model](#security-model)) — then does the mechanical work the +model must not waste context on: -7. Inserts each page into the correct `wiki/index.md` section **in alphabetical order** - (`index-append.py`) and bumps the index `last_updated` -8. Appends the `INGEST | <slug>` entry to `wiki/log.md` +7. Inserts each page into the correct `wiki/index.md` section **in alphabetical order**, + deduplicated by wikilink (a re-ingest updates the entry, never duplicates it), and bumps the + index `last_updated` (`index-append.py`) +8. Appends the `INGEST | <slug>` entry to `wiki/log.md` (the model name comes from the + orchestrator via `INGEST_MODEL` — the agent cannot reliably know its own tag) 9. Runs scoped lint on exactly the pages touched this run (`scoped-lint.sh`, reusing `lib/lint.sh`) -10. Commits on `feat/ai-ingest-<slug>` and opens the PR using `templates/pr-description.md` +10.
Commits **only `wiki/`** on `feat/ai-ingest-<slug>` and opens a PR against the integration + base (`INGEST_BASE`, default `main`); the body matches the `templates/pr-description.md` + structure (Summary / Pages / Contradictions / Scoped Lint) 11. Emits a single compact JSON line (status, slug, PR url, lint_clean, conflict) for n8n The agent never runs git, never edits the index/log mechanically, and never lints — those @@ -802,6 +819,25 @@ For private sources (`PRIVATE_CONTEXT: enabled` required): - All output goes to `wiki/private/<slug>.md` only - PR title: `[PRIVATE] ingest: <slug>` +**Branch lifecycle & the manual gate.** `run-ingest.sh` / `open-pr.sh` are deliberately +"dumb": they create the `feat/ai-ingest-<slug>` branch, commit only `wiki/`, open the PR, and +stop. They never reset, revert, or touch the integration branch — that lifecycle belongs to +the orchestrator, around the human gate: + +- **Before each session** the orchestrator realigns the checkout to the base + (`git fetch && git switch <base> && git reset --hard origin/<base>`) — a reset of the _local_ + checkout to match the remote, never a force-push to the shared branch. +- **After the PR opens, everything stops** until a human approves: one source per session, + sequential, no new ingest until the pending PR is closed. +- **Approve = merge. Reject = close the PR and delete the remote `feat` branch.** To undo an + already-merged ingest, open a _revert PR_ against the base — never rewrite history on a + shared branch. + +The PR base is configurable via `INGEST_BASE` (default `main`). Per-page `maturity` already +encodes stability and tags/releases mark versioned snapshots, so `main` is the integration +branch today. If a linked project later _consumes_ a genome, set `INGEST_BASE=develop` to +buffer ingests on `develop` and cut manual `develop → main` releases — no code change. ### Query Triggered by an operator question.