Compare commits

..

7 commits

10 changed files with 597 additions and 5 deletions

130
diagnose-run-ingest.sh Normal file
View file

@ -0,0 +1,130 @@
#!/usr/bin/env bash
# diagnose-run-ingest.sh
#
# Usage (from the repo root):  bash diagnose-run-ingest.sh
#
# Rebuilds the fixture the bats test uses and executes run-ingest under `bash -x`,
# so the command that makes it exit non-zero is visible in the trace.
# Only nounset + pipefail are enabled (no errexit), so a grep that matches nothing
# below does not stop the report.
set -uo pipefail

REPO="$(pwd)"
RI="${REPO}/skills/ingest/scripts/run-ingest.sh"

# Prints the output of "<tool> --version", or MISSING when the tool cannot be run.
tool_version() {
  "$1" --version 2>/dev/null || echo MISSING
}

printf '%s\n' "==================== ENV ===================="
printf 'bash: %s\n' "$(bash --version | head -1)"
printf 'git : %s\n' "$(git --version)"
printf 'jq : %s\n' "$(tool_version jq)"
printf 'py : %s\n' "$(tool_version python3)"
printf '\n'

printf '%s\n' "============ run-ingest.sh on disk ============"
if [[ ! -f "$RI" ]]; then
  printf '%s\n' "NOT FOUND: $RI (run me from the repo root)"
  exit 1
fi

# Show how the helper scripts are invoked and how the result JSON is emitted.
printf '%s\n' "-- helper invocations (want 'bash ...'): --"
grep -nE 'log-append\.sh|scoped-lint\.sh|open-pr\.sh' "$RI"
printf '%s\n' "-- result emitter (want 'jq -nc'): --"
grep -nE 'jq -nc?|jq -n ' "$RI"
printf '\n'
echo "============ build hermetic fixture ============"
# Everything lives under one scratch dir: a bare "origin" remote, an empty hooks
# dir and the genome working tree. Stop if the scratch dir cannot be created,
# otherwise "$T/..." would expand to paths at the filesystem root.
T="$(mktemp -d)" || { echo "mktemp -d failed; cannot build the fixture" >&2; exit 1; }
mkdir -p "$T/nohooks"
git init --bare -q "$T/origin.git"
g="$T/g"
mkdir -p "$g"/{raw/articles,wiki/sources,wiki/entities,wiki/concepts,wiki/queries,wiki/private}

# Minimal index page with the section headings present in the bats fixture.
cat > "$g/wiki/index.md" <<'EOF'
---
title: "Index"
type: index
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Index
---
## Sources (`wiki/sources/`)
*x*
## Entities (`wiki/entities/`)
*x*
## Concepts (`wiki/concepts/`)
*x*
## Queries (`wiki/queries/`)
*x*
## Conflicts Pending Review (`wiki/queries/conflict-*.md`)
*x*
EOF

# Minimal operations log with a single scaffold entry.
cat > "$g/wiki/log.md" <<'EOF'
---
title: "Log"
type: log
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Log
---
## [2026-01-01] CONFIG | init
- run_id: `init`
EOF
echo raw > "$g/raw/articles/test.md"

# Initial commit + push to the local bare remote.
# Every step is chained with && on purpose: the script runs without `set -e`, so an
# unchained subshell would report only its LAST command's status, and a failed
# `cd` would let the git config/commit calls below run in the caller's real repo.
if (
  cd "$g" &&
    git init -q &&
    git config commit.gpgsign false &&
    git config core.hooksPath "$T/nohooks" &&
    git config user.email t@t &&
    git config user.name t &&
    git add . &&
    git commit -qm init &&
    git branch -M main &&
    git remote add origin "$T/origin.git" &&
    git push -q -u origin main
); then
  echo "fixture commit+push OK"
else
  echo "FIXTURE SETUP FAILED (look above)"
fi

# Simulated semantic-pass output: one new source page ...
cat > "$g/wiki/sources/test-source.md" <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-04
private: false
---
body
EOF

# ... and the manifest that describes it to run-ingest.
cat > "$g/.ingest-manifest.json" <<'EOF'
{ "raw_source":"raw/articles/test.md","model":"m","reasoning":"r","pr_summary":"s","contradictions":"None",
"pages":[{"path":"wiki/sources/test-source.md","summary":"a source","maturity":"draft","status":"created"}] }
EOF
echo
echo "============ run-ingest (bash -x) ============"
# The ingest pipeline runs `git commit` / `git push` in the current directory even
# when DRY_RUN is set (only the Forgejo API call is skipped), so refuse to continue
# unless we are really inside the throwaway fixture.
if [[ -z "${g:-}" ]] || ! cd "$g"; then
  echo "cannot enter fixture dir '${g:-}'; refusing to run run-ingest here" >&2
  exit 1
fi
# Dummy Forgejo settings; DRY_RUN=1 keeps the run offline.
export KG_LIB_DIR="${REPO}/lib" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1
# stdout (the final JSON) and the xtrace output go to separate files.
bash -x "$RI" genome-test >"$T/out.txt" 2>"$T/trace.txt"
rc=$?
echo "EXIT=$rc"
echo "-- run-ingest stdout (final JSON should be here): --"
cat "$T/out.txt"
echo "-- last 25 lines of the trace (the failing command is near the end): --"
tail -n 25 "$T/trace.txt"
# The scratch dir is left in place so the complete trace can be inspected.
echo "-- full trace kept at: $T/trace.txt --"

View file

@ -46,6 +46,12 @@ fi
git commit -m "$title"
git push -u origin "$branch"
# DRY_RUN: local git work done; skip the Forgejo API (offline tests).
if [[ -n "${DRY_RUN:-}" ]]; then
echo "PR opened: DRY-RUN ${branch} -> ${base}"
exit 0
fi
# 2. Open the PR via Forgejo API (jq builds the JSON safely)
body="$(cat "$body_file")"
payload="$(jq -n --arg head "$branch" --arg base "$base" \

View file

@ -16,7 +16,7 @@ manifest="${2:-.ingest-manifest.json}"
SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
fail() {
jq -n --arg stage "$1" --arg reason "$2" \
jq -nc --arg stage "$1" --arg reason "$2" \
'{status:"error", stage:$stage, reason:$reason}'
exit 1
}
@ -72,12 +72,12 @@ done < <(jq -r '.pages[] | select(.status=="created")
# --- 2. log entry ---
out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"
"${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
--context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning" \
|| fail "log" "log-append failed"
# --- 3. scoped lint (capture findings for the PR; never aborts the run) ---
lint_out="$( "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$?
lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$?
# --- 4. assemble the PR body (manifest tables + lint results) ---
body="$(mktemp)"
@ -102,13 +102,13 @@ body="$(mktemp)"
# --- 5. open the PR ---
pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" )
[[ -n "$conflict_label" ]] && pr_args+=( --label "$conflict_label" )
pr_out="$( "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$?
pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$?
pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)"
rm -f "$body"
# --- final result line for n8n ---
jq -n \
jq -nc \
--arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \
--arg slug "$slug" \
--arg pr_url "$pr_url" \

View file

@ -0,0 +1,45 @@
# {{MASTER_REPO}}
Master (umbrella) repository for the Knowledge Genome network.
| Field | Value |
| ---------- | -------------------------------------------------- |
| Owner | `{{FORGEJO_USER}}` |
| Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}` |
| Scaffolded | `{{DATE}}` |
## What this repo is
This repository does **not** hold knowledge itself. It is the orchestrator: each genome
is a Git submodule, plus `core-karpathy` as a read-only reference pattern. Cross-genome
coordination rules live in `AGENTS.md`.
```text
{{MASTER_REPO}}/
├── core-karpathy/ ← reference pattern — read-only, never modify
├── genome-*/ ← one submodule per genome (own AGENTS.md, own git-crypt)
└── AGENTS.md ← cross-genome coordinator (boundaries only)
```
## Working with submodules
```bash
# Clone with all genomes
git clone --recurse-submodules {{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}.git
# Pull the latest pointers for every genome
git submodule update --remote --merge
# Operate inside a single genome (one genome at a time — see AGENTS.md)
cd genome-<name>
```
## Rules of the road
- Operate within **one genome at a time**; no commits spanning multiple genomes.
- `core-karpathy` is read-only.
- Never commit to `main` in a genome — PRs only, no self-merge.
- Private data (`**/private/**`) is git-crypt encrypted and never leaves the local network.
Genome-level operations are governed by each genome's own `AGENTS.md`. This README and the
master `AGENTS.md` govern boundaries only.

56
tests/README.md Normal file
View file

@ -0,0 +1,56 @@
# Tests
Deterministic tests for the mechanical layer of the framework — **no LLM, no GPU, no
network**. They simulate pi's output with fixtures and exercise the scripts directly, so
they run anywhere (laptop, CI, a git hook). They do **not** belong on vm101 or in n8n.
## What's covered
| File | Covers |
|------|--------|
| `scripts.bats` | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) |
| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` reuse |
| `structure.bats` | `lib/structure.sh` report/sync |
| `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` |
`run-ingest.bats` auto-`skip`s if `jq` is missing; everything else needs only bash + git
(+ `python3` for the index tests).
## Install bats
```bash
# Debian/Ubuntu
sudo apt install bats
# or pinned, as a vendored submodule
git submodule add https://github.com/bats-core/bats-core.git tests/bats
```
## Run
```bash
bats tests/ # whole suite
bats tests/lint.bats # one file
bats -f "sorted" tests/scripts.bats # filter by name
```
Each test builds its own throwaway genome under `BATS_TEST_TMPDIR` (auto-cleaned) with a
local bare git remote, so `open-pr.sh` — run with the `DRY_RUN=1` environment variable, not
a flag — can branch/commit/push without touching Forgejo.
## Makefile targets
```make
test:
@bats tests/
verify-structure:
@bash scripts/verify-genomes.sh
sync-structure:
@bash scripts/verify-genomes.sh --sync
```
## Note on `helpers.bash`
`FIXTURE_DIRS` in `helpers.bash` must match `GENOME_DIRS` in `lib/structure.sh`. If you
change the canonical layout, update both (the structure tests assume a clean baseline).

98
tests/helpers.bash Normal file
View file

@ -0,0 +1,98 @@
#!/usr/bin/env bash
# tests/helpers.bash — paths and fixtures shared by every file in the bats suite.

# Repo root is the parent of the directory given by BATS_TEST_DIRNAME.
REPO_ROOT="$(cd "${BATS_TEST_DIRNAME}/.." && pwd)"
LIB_DIR="$REPO_ROOT/lib"
SKILL_SCRIPTS="$REPO_ROOT/skills/ingest/scripts"

# Directories every fresh genome must contain.
# Keep this list in sync with lib/structure.sh.
FIXTURE_DIRS=()
FIXTURE_DIRS+=( raw/{articles,transcripts,code-packs,assets,private} )
FIXTURE_DIRS+=( wiki/{sources,entities,concepts,queries,private} )
#######################################
# make_fixture_genome — build a throwaway genome checkout for one test.
# Creates, under ${BATS_TEST_TMPDIR:-/tmp}: a local bare remote (origin.git), an
# empty hooks dir (nohooks) and a "genome" working tree holding every directory in
# FIXTURE_DIRS, a rendered wiki/index.md + wiki/log.md and one raw article, then
# commits and pushes it to the bare remote on branch main.
# Globals:  FIXTURE_DIRS (read), BATS_TEST_TMPDIR (read, optional)
# Outputs:  the genome path on stdout; diagnostics on stderr
# Returns:  0 on success; 1 if any setup step fails (nothing is echoed then).
#           Callers use G="$(make_fixture_genome)", and command substitution does
#           not inherit `set -e`, so failures must be reported via the status.
#######################################
make_fixture_genome() {
  local base g d
  base="$(mktemp -d "${BATS_TEST_TMPDIR:-/tmp}/genome.XXXXXX")" || {
    echo "make_fixture_genome: mktemp failed" >&2
    return 1
  }
  g="${base}/genome"
  git init --bare -q "${base}/origin.git" || return 1

  # The files written below need these two dirs even if FIXTURE_DIRS changes.
  mkdir -p "${g}/wiki" "${g}/raw/articles" || return 1
  for d in "${FIXTURE_DIRS[@]}"; do
    mkdir -p "${g}/${d}" || return 1
    # .gitkeep makes the otherwise-empty directory part of the initial commit.
    touch "${g}/${d}/.gitkeep" || return 1
  done

  cat > "${g}/wiki/index.md" <<'EOF' || return 1
---
title: "Index — genome-test"
type: index
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Master Index: genome-test
---
## Sources (`wiki/sources/`)
*Ingested raw materials.*
## Entities (`wiki/entities/`)
*People, tools.*
## Concepts (`wiki/concepts/`)
*Patterns.*
## Queries (`wiki/queries/`)
*Answers.*
## Conflicts Pending Review (`wiki/queries/conflict-*.md`)
*slugs only.*
EOF

  cat > "${g}/wiki/log.md" <<'EOF' || return 1
---
title: "Operations Log — genome-test"
type: log
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Operations Log
---
## [2026-01-01] CONFIG | scaffolded
- run_id: `init`
EOF

  echo "raw test" > "${g}/raw/articles/test.md" || return 1
  mkdir -p "${base}/nohooks" || return 1

  # Hermetic: ignore the user's global git config (signing, global hooks);
  # otherwise commit.gpgsign or a global core.hooksPath makes git commit fail here.
  # Steps are &&-chained so the first failure stops the sequence and decides the
  # subshell's status (and a failed cd never lets git run somewhere else).
  (
    cd "${g}" &&
      git init -q &&
      git config commit.gpgsign false &&
      git config core.hooksPath "${base}/nohooks" &&
      git config user.email t@t &&
      git config user.name tester &&
      git add . &&
      git commit -qm init &&
      git branch -M main &&
      git remote add origin "${base}/origin.git" &&
      git push -q -u origin main
  ) >/dev/null || {
    echo "make_fixture_genome: git setup failed in ${g}" >&2
    return 1
  }

  echo "${g}"
}

71
tests/lint.bats Normal file
View file

@ -0,0 +1,71 @@
#!/usr/bin/env bats
# tests/lint.bats — lib/lint.sh validators and the scoped-lint wrapper.
load helpers
# Runs before every test: load the output helpers first, then the lint validators.
setup() {
  . "${LIB_DIR}/output.sh"
  . "${LIB_DIR}/lint.sh"
}
# write_page <path> <type> <domain>
# Writes a minimal wiki page to <path>: frontmatter with the given type and domain,
# today's date as last_updated, `private: false`, followed by a one-line body.
write_page() {
  local target=$1 page_type=$2 page_domain=$3
  printf '%s\n' \
    '---' \
    'title: "T"' \
    "type: ${page_type}" \
    "domain: ${page_domain}" \
    'tags: [x]' \
    'maturity: draft' \
    "last_updated: $(date +%F)" \
    'private: false' \
    '---' \
    'body' \
    > "$target"
}
@test "lint_markdown_file: a clean page passes (0 errors)" {
  # Arrange: a well-formed source page whose domain matches the genome.
  local genome page
  genome="$(make_fixture_genome)"
  page="${genome}/wiki/sources/good.md"
  write_page "$page" source genome-test

  # Act + assert: the validator reports success.
  run lint_markdown_file "$page" genome-test
  [ "$status" -eq 0 ]
}
@test "lint_markdown_file: invalid type + wrong domain are caught" {
  # Arrange: type "banana" and a domain that differs from the genome being linted.
  local genome page
  genome="$(make_fixture_genome)"
  page="${genome}/wiki/sources/bad.md"
  write_page "$page" banana wrong-genome

  # Act + assert: the validator must report a failure.
  run lint_markdown_file "$page" genome-test
  [ "$status" -ne 0 ]
}
@test "check_privacy_consistency: a private/ file without 'private: true' fails" {
  local genome leaked
  genome="$(make_fixture_genome)"
  leaked="${genome}/wiki/private/p.md"
  # write_page always emits `private: false`, so a page placed under wiki/private/
  # is flagged inconsistently → leak.
  write_page "$leaked" private genome-test

  run check_privacy_consistency "$leaked"
  [ "$status" -ne 0 ]
}
@test "check_page_size: a >800-line page errors" {
  local genome big
  genome="$(make_fixture_genome)"
  big="${genome}/wiki/sources/big.md"
  # A valid page, then 850 extra one-character lines appended to it.
  write_page "$big" source genome-test
  printf 'x\n%.0s' {1..850} >> "$big"

  run check_page_size "$big"
  [ "$status" -ne 0 ]
}
@test "scoped-lint: aggregates findings and exits non-zero on errors" {
  local genome
  genome="$(make_fixture_genome)"
  write_page "${genome}/wiki/sources/bad.md" banana wrong-genome

  # The wrapper is given a path relative to the genome root, so run it from there.
  cd "$genome"
  export KG_LIB_DIR="$LIB_DIR"
  run bash "${SKILL_SCRIPTS}/scoped-lint.sh" genome-test wiki/sources/bad.md

  [ "$status" -ne 0 ]
  [[ "$output" == *"error(s)"* ]]
}
@test "scoped-lint: a clean page passes (exit 0)" {
  local genome
  genome="$(make_fixture_genome)"
  write_page "${genome}/wiki/sources/good.md" source genome-test

  # Same invocation as the failing case, but on a well-formed page.
  cd "$genome"
  export KG_LIB_DIR="$LIB_DIR"
  run bash "${SKILL_SCRIPTS}/scoped-lint.sh" genome-test wiki/sources/good.md

  [ "$status" -eq 0 ]
}

93
tests/run-ingest.bats Normal file
View file

@ -0,0 +1,93 @@
#!/usr/bin/env bats
# tests/run-ingest.bats — end-to-end orchestrator test (no LLM, no network).
# Simulates pi's output (a source page + manifest) and runs the mechanical pass.
load helpers
@test "run-ingest: DRY_RUN end-to-end updates index + log and opens a dry PR" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local genome
  genome="$(make_fixture_genome)"
  cd "$genome"

  # --- stand-in for the semantic pass pi would have done: one page + its manifest ---
  cat > wiki/sources/test-source.md <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-03
private: false
---
body
EOF
  cat > .ingest-manifest.json <<'EOF'
{
"raw_source": "raw/articles/test.md",
"model": "qwen3.5-9b",
"reasoning": "Ingested the test source.",
"pr_summary": "Ingest of test: 1 source page.",
"contradictions": "None",
"pages": [
{"path": "wiki/sources/test-source.md", "summary": "A smoke-test source.", "maturity": "draft", "status": "created"}
]
}
EOF

  # --- mechanical pass, offline: dummy Forgejo settings plus DRY_RUN ---
  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL="http://forgejo.local"
  export FORGEJO_USER="u"
  export FORGEJO_TOKEN="t"
  export DRY_RUN=1
  run bash "${SKILL_SCRIPTS}/run-ingest.sh" genome-test

  # result line emitted on stdout
  [ "$status" -eq 0 ]
  [[ "$output" == *'"status":"ok"'* ]]
  [[ "$output" == *'"lint_clean":true'* ]]
  [[ "$output" == *'"conflict":false'* ]]

  # side effects on the working tree: index entry, log entry, PR branch
  grep -q 'sources/test-source' wiki/index.md
  grep -q 'INGEST | test' wiki/log.md
  git rev-parse --verify feat/ai-ingest-test
}
@test "run-ingest: a conflict page is labelled and lands in the Conflicts section" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local genome
  genome="$(make_fixture_genome)"
  cd "$genome"

  # --- simulated semantic pass: a single conflict page and its manifest ---
  cat > wiki/queries/conflict-pricing-2026-06-03.md <<'EOF'
---
title: "Conflict: pricing"
type: conflict
domain: genome-test
maturity: draft
last_updated: 2026-06-03
private: false
---
conflict body
EOF
  cat > .ingest-manifest.json <<'EOF'
{
"raw_source": "raw/articles/test.md",
"model": "m",
"reasoning": "Flagged a contradiction.",
"pr_summary": "Conflict on pricing.",
"contradictions": "1 conflict file created — pricing",
"pages": [
{"path": "wiki/queries/conflict-pricing-2026-06-03.md", "summary": "ignored", "maturity": "draft", "status": "created"}
]
}
EOF

  # --- mechanical pass, offline ---
  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL="http://forgejo.local"
  export FORGEJO_USER="u"
  export FORGEJO_TOKEN="t"
  export DRY_RUN=1
  run bash "${SKILL_SCRIPTS}/run-ingest.sh" genome-test

  [ "$status" -eq 0 ]
  [[ "$output" == *'"conflict":true'* ]]
  # the page is referenced by slug in the index
  grep -q 'queries/conflict-pricing-2026-06-03' wiki/index.md
}

53
tests/scripts.bats Normal file
View file

@ -0,0 +1,53 @@
#!/usr/bin/env bats
# tests/scripts.bats — unit tests for the deterministic skill scripts.
load helpers
@test "slug: path with extension and spaces" {
  # Directory and extension are dropped; spaces become hyphens; result is lowercase.
  local input="raw/articles/My Test Source.md"
  run bash "${SKILL_SCRIPTS}/slug.sh" "$input"
  [ "$status" -eq 0 ]
  [ "$output" = "my-test-source" ]
}
@test "slug: punctuation and repeats collapse to single hyphens" {
  run bash "$SKILL_SCRIPTS/slug.sh" "Qualche Concetto!! Strano"
  # Check the exit status as well as the text (as the other slug test does), so a
  # script that prints the right slug but exits non-zero is still caught.
  [ "$status" -eq 0 ]
  [ "$output" = "qualche-concetto-strano" ]
}
@test "log-append: appends a well-formed INGEST entry with a run_id" {
  G="$(make_fixture_genome)"; cd "$G"
  run bash "$SKILL_SCRIPTS/log-append.sh" --type INGEST --subject foo --model m \
    --context "[[raw/x]]" --output "[[sources/foo]]" --reasoning "why"
  [ "$status" -eq 0 ]
  grep -q "INGEST | foo" wiki/log.md
  # The fixture log already contains "- run_id: `init`", so grepping the whole file
  # for a run_id line would pass even if log-append wrote none. Inspect only the
  # text from the new INGEST heading onward.
  local entry
  entry="$(sed -n '/INGEST | foo/,$p' wiki/log.md)"
  grep -q '^- run_id: `' <<<"$entry"
  grep -q '^- model: `m`' <<<"$entry"
}
@test "log-append: rejects an invalid TYPE" {
  local genome
  genome="$(make_fixture_genome)"
  cd "$genome"
  # BOGUS is passed as the entry type; the script must exit non-zero.
  run bash "${SKILL_SCRIPTS}/log-append.sh" --type BOGUS --subject foo
  [ "$status" -ne 0 ]
}
@test "index-append: inserts under the right section and keeps it sorted" {
  G="$(make_fixture_genome)"; cd "$G"
  # Insert out of order (zzz first, then aaa) to prove the section gets sorted.
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/zzz]] — z. `maturity: draft`'
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/aaa]] — a. `maturity: draft`'
  # Line numbers of both entries in the index.
  a=$(grep -n 'sources/aaa' wiki/index.md | cut -d: -f1)
  z=$(grep -n 'sources/zzz' wiki/index.md | cut -d: -f1)
  # One assertion per line: in an `A && B` list a failing A does not fail the
  # test, so a missing entry would only surface later as an obscure integer error.
  [ -n "$a" ]
  [ -n "$z" ]
  [ "$a" -lt "$z" ]
}
@test "index-append: bumps frontmatter last_updated to today" {
  local genome
  genome="$(make_fixture_genome)"
  cd "$genome"
  # The fixture index starts at 2026-01-01; any insert should refresh the date.
  python3 "${SKILL_SCRIPTS}/index-append.py" \
    --section Concepts \
    --entry '- [[concepts/x]] — x. `maturity: draft`'
  grep -q "^last_updated: $(date +%F)$" wiki/index.md
}
@test "index-append: is idempotent for the same entry" {
  local genome attempt hits
  genome="$(make_fixture_genome)"
  cd "$genome"
  # Submit the identical entry twice ...
  for attempt in 1 2; do
    python3 "${SKILL_SCRIPTS}/index-append.py" \
      --section Sources \
      --entry '- [[sources/dup]] — d. `maturity: draft`'
  done
  # ... and expect exactly one matching line in the index.
  hits="$(grep -c 'sources/dup' wiki/index.md)"
  [ "$hits" -eq 1 ]
}

40
tests/structure.bats Normal file
View file

@ -0,0 +1,40 @@
#!/usr/bin/env bats
# tests/structure.bats — canonical-structure verify/sync.
load helpers
# Runs before every test: load the output helpers first, then the structure library.
setup() {
  . "${LIB_DIR}/output.sh"
  . "${LIB_DIR}/structure.sh"
}
@test "structure_report: a full fixture has no drift" {
  # An untouched fixture contains every directory listed in FIXTURE_DIRS.
  local genome
  genome="$(make_fixture_genome)"
  run structure_report "$genome"
  [ "$status" -eq 0 ]
}
@test "structure_report: flags a missing canonical dir" {
  local genome
  genome="$(make_fixture_genome)"
  # Remove one canonical directory to create drift.
  rm -rf "${genome}/wiki/private"

  run structure_report "$genome"

  # The report fails and names the directory that is gone.
  [ "$status" -ne 0 ]
  [[ "$output" == *"wiki/private"* ]]
}
@test "structure_report: notes an extra dir but does not fail on it" {
  local genome
  genome="$(make_fixture_genome)"
  # Add a directory that is not part of the canonical layout.
  mkdir -p "${genome}/wiki/experiments"

  run structure_report "$genome"

  # Still a success, but the extra directory shows up in the output.
  [ "$status" -eq 0 ]
  [[ "$output" == *"experiments"* ]]
}
@test "structure_sync: creates missing dirs and is idempotent" {
  G="$(make_fixture_genome)"
  rm -rf "$G/wiki/private" "$G/raw/transcripts"
  structure_sync "$G"
  # One assertion per line: in an `A && B` list a failing A does not fail the
  # test, so the first directory would otherwise go unchecked.
  [ -d "$G/wiki/private" ]
  [ -d "$G/raw/transcripts" ]
  run structure_report "$G"
  [ "$status" -eq 0 ]
  structure_sync "$G" # second run: nothing to do
  # Idempotence: the layout is still clean after the second sync.
  run structure_report "$G"
  [ "$status" -eq 0 ]
}