Merge branch 'feature/mvp0-linked-projects-and-ingest' into develop

This commit is contained in:
Matteo Cherubini 2026-06-05 12:17:03 +02:00
commit f3af62897f
28 changed files with 1900 additions and 216 deletions

View file

@ -1,22 +1,25 @@
# =============================================================================
# Knowledge Genome - Makefile v. 1.0.0
# Knowledge Genome - Makefile v. 1.1.0
# Orchestrates the setup and management of the knowledge base.
# =============================================================================
# Pull in the static KEY=VALUE settings from globals.env as Make variables.
include globals.env
# Export the name of every variable assigned in globals.env to recipe shells:
# grep drops lines that begin with '#' or whitespace, sed keeps only the text
# before the first '=' on each remaining line.
export $(shell grep -v '^[#[:space:]]' globals.env | sed 's/=.*//')
# NOTE(review): two .PHONY declarations appear below; the second adds test,
# verify-structure and sync-structure. This looks like the pre- and post-merge
# lines shown together — confirm which one is live.
.PHONY: setup add-genome status lint lock doctor sync help
.PHONY: setup add-genome status lint lock doctor sync test verify-structure sync-structure help
# Print the list of available targets, one line of description per target.
# NOTE(review): the listing below contains two runs of echo lines (a shorter one
# and a longer one that adds verify-structure, sync-structure and test) — this
# looks like the pre- and post-merge versions shown together; confirm which is live.
help:
@echo "Available commands:"
@echo " make setup - Full system initialization"
@echo " make add-genome - Register and scaffold a new genome"
@echo " make status - Check submodule and encryption status"
@echo " make lint - Verify schema, privacy flags, and metadata"
@echo " make lock - Lock all encrypted files across all genomes"
@echo " make doctor - Verify all required tools are installed"
@echo " make sync - Sync submodules and report unpushed commits"
@echo " make setup - Full system initialization"
@echo " make add-genome - Register and scaffold a new genome [LINKED=owner/repo]"
@echo " make status - Check submodule and encryption status"
@echo " make lint - Verify schema, privacy flags, and metadata"
@echo " make verify-structure - Report directory drift across all genomes"
@echo " make sync-structure - Create any missing canonical dirs (safe)"
@echo " make test - Run the bats test suite (no LLM/GPU needed)"
@echo " make lock - Lock all encrypted files across all genomes"
@echo " make doctor - Verify all required tools are installed"
@echo " make sync - Sync submodules and report unpushed commits"
lint:
@bash scripts/lint-genomes.sh
@ -27,16 +30,26 @@ setup:
# Register and scaffold a new genome by delegating to scripts/add-genome.sh.
# NAME and DESC must both be non-empty; otherwise a usage message is printed and
# the recipe exits with status 1 before the script is invoked.
# LINKED is optional and is forwarded as a third argument (an empty string when unset).
# NOTE(review): two usage lines and two script invocations (two-argument and
# three-argument) appear below — this looks like the pre- and post-merge lines
# shown together; confirm which is live.
add-genome:
@if [ -z "$(NAME)" ] || [ -z "$(DESC)" ]; then \
echo "Error: NAME and DESC are required."; \
echo "Usage: make add-genome NAME=my-genome DESC='My description'"; \
echo "Usage: make add-genome NAME=my-genome DESC='My description' [LINKED=owner/project-repo]"; \
exit 1; \
fi
@bash scripts/add-genome.sh "$(NAME)" "$(DESC)"
@bash scripts/add-genome.sh "$(NAME)" "$(DESC)" "$(LINKED)"
# Show the master repo's submodule status followed by git-crypt encryption status.
# NOTE(review): both a repo-level "First 10 files" section and a per-genome
# section appear below — this looks like the pre- and post-merge lines shown
# together; confirm which is live.
status:
@echo "--- Master Status ---"
@git submodule status
@echo "--- Encryption Status (First 10 files) ---"
@git-crypt status | head -n 10
@echo "--- Encryption Status (per genome) ---"
# Runs git-crypt status inside each submodule, discarding stderr and keeping the
# first 10 lines; the trailing "|| true" turns a failing pipeline into success.
@git submodule foreach 'git-crypt status 2>/dev/null | head -n 10 || true'
# Run scripts/verify-genomes.sh with no options (described in `make help` as
# reporting directory drift across all genomes).
verify-structure:
@bash scripts/verify-genomes.sh
# Run scripts/verify-genomes.sh with --sync (described in `make help` as creating
# any missing canonical directories).
sync-structure:
@bash scripts/verify-genomes.sh --sync
# Run the bats suite found under tests/.
# When bats is not on PATH, print an install hint and stop with exit status 1
# instead of attempting to run the suite.
test:
	@if ! command -v bats >/dev/null 2>&1; then \
		echo " MISSING: bats (sudo apt install bats)"; \
		exit 1; \
	fi
	@bats tests/
doctor:
@echo "Checking required tools..."
@ -45,6 +58,7 @@ doctor:
@command -v curl >/dev/null 2>&1 || { echo " MISSING: curl"; exit 1; }
@command -v jq >/dev/null 2>&1 || { echo " MISSING: jq"; exit 1; }
@command -v bw >/dev/null 2>&1 || echo " OPTIONAL: bw (Bitwarden CLI) not found — key injection will be manual."
@command -v python3 >/dev/null 2>&1 || echo " OPTIONAL: python3 not found — needed for 'make test' and the ingest skill (index-append.py), not for setup."
@echo "System ready."
sync:

449
README.md
View file

@ -19,16 +19,17 @@ and a human-in-the-loop Git Flow for quality control.
5. [Configuration](#configuration)
6. [Quick Start](#quick-start)
7. [Makefile Reference](#makefile-reference)
8. [Genome Lifecycle](#genome-lifecycle)
9. [Security Model](#security-model)
10. [Key Management](#key-management)
11. [Agent Sessions](#agent-sessions)
12. [Workflows](#workflows)
13. [Knowledge Quality](#knowledge-quality)
14. [Knowledge Schema](#knowledge-schema)
15. [Collaboration Model](#collaboration-model)
16. [Optional Extensions](#optional-extensions)
17. [Troubleshooting](#troubleshooting)
8. [Testing](#testing)
9. [Genome Lifecycle](#genome-lifecycle)
10. [Security Model](#security-model)
11. [Key Management](#key-management)
12. [Agent Sessions](#agent-sessions)
13. [Workflows](#workflows)
14. [Knowledge Quality](#knowledge-quality)
15. [Knowledge Schema](#knowledge-schema)
16. [Collaboration Model](#collaboration-model)
17. [Optional Extensions](#optional-extensions)
18. [Troubleshooting](#troubleshooting)
---
@ -49,6 +50,7 @@ evolving synthesis. Knowledge is compiled once and kept current.
Contradictions have been flagged. The synthesis already reflects everything ingested.
This means:
- No vector database.
- No embedding pipeline.
- No external retrieval infrastructure.
@ -103,16 +105,24 @@ genome-{name}/
### Three layers
| Layer | Path | Owner | Rule |
|-------|------|-------|------|
| Raw sources | `raw/` | Human | Immutable. LLM reads only. Never modified. |
| Wiki | `wiki/` | LLM | Agent creates, updates, cross-links, maintains. |
| Schema | `AGENTS.md` | Human + LLM | Co-evolved contract defining structure and workflows. |
| Layer | Path | Owner | Rule |
| ----------- | ----------- | ----------- | ----------------------------------------------------- |
| Raw sources | `raw/` | Human | Immutable. LLM reads only. Never modified. |
| Wiki | `wiki/` | LLM | Agent creates, updates, cross-links, maintains. |
| Schema | `AGENTS.md` | Human + LLM | Co-evolved contract defining structure and workflows. |
### Linked projects (optional)
A genome can optionally declare a **linked project repository** — a separate repo where
the knowledge in that genome is meant to be applied (e.g. `genome-dev` linked to an app
repo). The link is recorded as a third field in the registry and rendered into the
genome's `AGENTS.md` (`## Linked Project`). A genome with no link is _knowledge-only_ and
behaves exactly as before. See [Configuration](#configuration).
### Framework structure
```text
knowledge-genome-setup/ ← This repository (setup tooling)
knowledge-genome-orchestrator/ ← This repository (setup tooling)
├── globals.env ← Static KEY=VALUE config (Make-includable)
├── registry.sh ← Bash-only: GENOMES array + dynamic paths
├── Makefile ← Entry point for all operations
@ -120,6 +130,7 @@ knowledge-genome-setup/ ← This repository (setup tooling)
│ ├── output.sh ← Terminal helpers (colors, log levels)
│ ├── deps.sh ← Dependency validation
│ ├── scaffold.sh ← Template rendering engine
│ ├── structure.sh ← Canonical genome layout (single source of truth)
│ ├── lint.sh ← Per-file validation functions
│ └── git-crypt.sh ← git-crypt lifecycle (init, export, verify, rotate)
├── providers/
@ -130,18 +141,41 @@ knowledge-genome-setup/ ← This repository (setup tooling)
│ ├── setup-master.sh ← Master repo initialisation
│ ├── setup-genomes.sh ← Genome provisioning loop
│ ├── add-genome.sh ← Add a single new genome
│ └── lint-genomes.sh ← Quality control across all genomes
└── templates/
├── agents-genome.md ← Per-genome agent contract template
├── agents-master.md ← Master coordination schema template
├── wiki-index.md ← Index template (rendered per genome)
├── wiki-log.md ← Log template (rendered per genome)
├── pr-description.md ← PR review checklist template
├── pre-commit.sh ← Security hook template
├── gitattributes ← Git encryption rules template
└── gitignore ← Git ignore template
│ ├── lint-genomes.sh ← Quality control across all genomes
│ └── verify-genomes.sh ← Structure verify / --sync across all genomes
├── templates/
│ ├── agents-genome.md ← Per-genome agent contract template
│ ├── agents-master.md ← Master coordination schema template
│ ├── readme-master.md ← Master repo README template
│ ├── wiki-index.md ← Index template (rendered per genome)
│ ├── wiki-log.md ← Log template (rendered per genome)
│ ├── pr-description.md ← PR review checklist template
│ ├── pre-commit.sh ← Security hook template
│ ├── gitattributes ← Git encryption rules template
│ └── gitignore ← Git ignore template
├── skills/
│ └── ingest/ ← pi skill: deployed to the AI node (vm101)
│ ├── SKILL.md ← Semantic-only contract (read/edit, emits manifest)
│ ├── references/ ← On-demand reference docs for the agent
│ └── scripts/ ← Deterministic post-processor (runs outside the agent)
│ ├── run-ingest.sh ← Orchestrator: consumes the manifest, emits one JSON line
│ ├── slug.sh ← Slug normalisation
│ ├── index-append.py ← Sorted insert into wiki/index.md + last_updated bump
│ ├── log-append.sh ← Append a wiki/log.md entry
│ ├── scoped-lint.sh ← Lint only the pages touched this run (reuses lib/lint.sh)
│ └── open-pr.sh ← Branch / commit / push / open PR (DRY_RUN seam for tests)
└── tests/ ← bats suite — deterministic, no LLM/GPU (see Testing)
├── helpers.bash
├── scripts.bats
├── lint.bats
├── structure.bats
└── run-ingest.bats
```
> The `skills/ingest/` directory is version-controlled here but **deployed** to the AI
> node (vm101) under `~/.pi/agent/skills/ingest`. The agent (`pi`) does only semantic work
> and writes a manifest; `run-ingest.sh` does the mechanical steps. See [Workflows → Ingest](#ingest).
---
## System Requirements
@ -154,7 +188,10 @@ All tools (git-crypt, bw, qmd) have native Linux binaries.
### macOS — full support
All scripts are compatible with macOS. Requirements:
- bash 3.2+ (macOS default) — fully supported. All `bash 4+` constructs removed.
- bash 3.2+ (macOS default) — supported for the **setup scripts** (`make` targets, scaffolding).
The `ingest` skill uses bash 4+ constructs (`mapfile`), but it is deployed and run on the
Linux AI node, not on the macOS setup machine — so this is not a constraint in practice.
- GNU coreutils not required — BSD variants of `date`, `grep`, `sed` all handled.
- `git-crypt`: install via Homebrew — `brew install git-crypt`
- `jq`, `curl`: pre-installed or via Homebrew
@ -166,6 +203,7 @@ If you use Homebrew bash (`brew install bash`), the scripts work identically to
**Git Bash and native Windows are not supported.**
Reasons:
- `git-crypt` has no native Windows binary.
- Process substitution `<(...)` used for runtime key injection is not available
in Git Bash or PowerShell.
@ -179,37 +217,42 @@ All setup and runtime operations work identically to native Linux inside WSL2.
The system is designed for a homelab architecture:
| Component | Recommended | Role |
|-----------|-------------|------|
| Storage node | Any Linux server with NFS | Hosts Forgejo, stores genome repos |
| AI compute node | GPU server (16GB+ VRAM) | Runs local LLM agent sessions |
| VRAM | 16GB minimum | 14B model at Q5_K_M ≈ 10GB weights; ~6GB for KV cache |
| Local LLM | 14B–32B quantised | Active wiki maintenance sessions |
| Large LLM | 70B (async) | Deep reflection, complex synthesis (scheduled, not interactive) |
| Component | Recommended | Role |
| --------------- | ------------------------- | --------------------------------------------------------------- |
| Storage node | Any Linux server with NFS | Hosts Forgejo, stores genome repos |
| AI compute node | GPU server (16GB+ VRAM) | Runs local LLM agent sessions |
| VRAM | 16GB minimum | 14B model at Q5_K_M ≈ 10GB weights; ~6GB for KV cache |
| Local LLM | 14B–32B quantised | Active wiki maintenance sessions |
| Large LLM | 70B (async) | Deep reflection, complex synthesis (scheduled, not interactive) |
> **On VRAM constraints:** with a 16GB card and a 14B model, the KV cache budget
> is ~6GB — approximately 32k tokens of effective context. Every token in `AGENTS.md`,
> the index, and the log tail is a cost. This is why all agent files are token-optimised
> and sessions are kept to one source at a time.
> **Reference deployment:** the table above is a target profile, not a hard requirement.
> The current setup runs a single 16GB GPU (RTX 5060 Ti) with a ~9B model for interactive
> ingest, and offloads heavy/async synthesis to a cloud model. Smaller models work — they
> just make the "one source per session" discipline and the token budget matter more.
---
## Prerequisites
### Required
| Tool | Purpose |
|------|---------|
| `git` | Version control |
| `git-crypt` | Transparent file encryption |
| `curl` | REST API calls to Forgejo/GitHub |
| `jq` | JSON parsing |
| Tool | Purpose |
| ----------- | -------------------------------- |
| `git` | Version control |
| `git-crypt` | Transparent file encryption |
| `curl` | REST API calls to Forgejo/GitHub |
| `jq` | JSON parsing |
### Optional
| Tool | Purpose |
|------|---------|
| `bw` | Bitwarden CLI — runtime key injection from Vaultwarden (no key on disk) |
| Tool | Purpose |
| ----- | ----------------------------------------------------------------------- |
| `bw` | Bitwarden CLI — runtime key injection from Vaultwarden (no key on disk) |
| `qmd` | Local BM25 + vector search for Markdown files with MCP server interface |
> **`bw` vs `bws`:** Use `bw` (standard Bitwarden CLI). `bws` is the Bitwarden
@ -282,14 +325,17 @@ resolution. Never included by Make.
```bash
# Dynamic paths (resolved at source time)
WORK_DIR="${HOME}/knowledge-genome-setup"
WORK_DIR="${HOME}/knowledge-genome-orchestrator"
KEYS_DIR="${WORK_DIR}/keys"
# Genome registry — format: "name|description"
# Genome registry — format: "name|description|linked_repo"
# The third field is OPTIONAL:
# - leave it empty → knowledge-only genome (no linked project)
# - owner/repo → genome is linked to that project repository (rendered into AGENTS.md)
GENOMES=(
"genome-dev|Web development, TUI, Angular, software architecture"
"genome-finance|Personal finance, investments, market analysis"
"genome-homelab|Infrastructure, network configs, architecture logs"
"genome-dev|Web development, TUI, Angular, software architecture|myorg/my-app"
"genome-finance|Personal finance, investments, market analysis|"
"genome-homelab|Infrastructure, network configs, architecture logs|"
)
```
@ -312,8 +358,8 @@ export GITHUB_TOKEN="your_github_token"
```bash
# 1. Clone the setup framework
git clone <setup-repo-url> knowledge-genome-setup
cd knowledge-genome-setup
git clone <setup-repo-url> knowledge-genome-orchestrator
cd knowledge-genome-orchestrator
# 2. Configure your environment
cp globals.env.example globals.env # edit with your values
@ -347,6 +393,7 @@ make setup
- Commits submodule pointer in master repo
After setup completes:
- Upload all files in `keys/` to Vaultwarden (see Key Management)
- Delete key files from disk: `rm keys/*.key`
@ -354,16 +401,19 @@ After setup completes:
## Makefile Reference
| Target | Description |
|--------|-------------|
| `make setup` | Full system initialisation — master repo + all genomes in `registry.sh` |
| `make add-genome NAME=x DESC="y"` | Scaffold and register a single new genome |
| `make lint` | Run quality checks across all genomes (schema, privacy, decay, page size) |
| `make status` | Show submodule status and first 10 git-crypt encryption states |
| `make lock` | Lock all encrypted repos (master + all genome submodules) |
| `make doctor` | Verify required tools: git, git-crypt, curl, jq; warn if bw missing |
| `make sync` | `git submodule update --init --recursive` + report unpushed commits per genome |
| `make help` | Print all available targets |
| Target | Description |
| ----------------------------------------------------- | ------------------------------------------------------------------------------------- |
| `make setup` | Full system initialisation — master repo + all genomes in `registry.sh` |
| `make add-genome NAME=x DESC="y" [LINKED=owner/repo]` | Scaffold and register a single new genome (optional linked project) |
| `make lint` | Run quality checks across all genomes (schema, privacy, decay, page size) |
| `make verify-structure` | Report directory drift of each genome vs the canonical layout (`lib/structure.sh`) |
| `make sync-structure` | Create any missing canonical directories across all genomes (safe, idempotent) |
| `make test` | Run the bats test suite (deterministic; no LLM/GPU/network) — see [Testing](#testing) |
| `make status` | Show submodule status and per-genome git-crypt encryption state |
| `make lock` | Lock all encrypted repos (master + all genome submodules) |
| `make doctor` | Verify required tools: git, git-crypt, curl, jq; warn if bw or python3 is missing |
| `make sync` | `git submodule update --init --recursive` + report unpushed commits per genome |
| `make help` | Print all available targets |
### Examples
@ -374,6 +424,12 @@ make doctor
# Add a new genome after initial setup
make add-genome NAME=genome-research DESC="Academic papers and deep research"
# Add a genome linked to a project repository
make add-genome NAME=genome-dev DESC="Web development" LINKED=myorg/my-app
# Check every genome against the canonical directory layout
make verify-structure
# Run full lint pass (bash deterministic checks)
make lint
@ -386,6 +442,38 @@ make lock
---
## Testing
The mechanical layer (slug, index, log, lint, structure, the ingest orchestrator) is
covered by a [bats](https://github.com/bats-core/bats-core) suite. The tests are
**deterministic and have zero dependency on the LLM, the GPU, or the network** — they
simulate the agent's output with fixtures and exercise the scripts directly, so they run
anywhere git + bash live (laptop, CI, a git hook). They are **not** meant to run on the AI
node or via n8n.
```bash
sudo apt install bats # once
make test # or: bats tests/
```
| File | Covers |
| ----------------- | ------------------------------------------------------------------------------ |
| `scripts.bats` | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) |
| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` |
| `structure.bats` | `lib/structure.sh` report / sync |
| `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` |
Each test builds its own throwaway genome with a local bare remote, configured to ignore
the operator's global git settings (signing, global hooks) so the suite is hermetic. The
`run-ingest` tests auto-`skip` if `jq` is absent. If you change the canonical layout in
`lib/structure.sh`, update `FIXTURE_DIRS` in `tests/helpers.bash` to match.
> Why this matters: the only non-deterministic part of the system is the model. Pinning
> the mechanical layer with tests means that when an ingest misbehaves, you know it's the
> model or the prompt — not the plumbing.
---
## Genome Lifecycle
### Initial setup
@ -407,6 +495,7 @@ After adding: upload the new key to Vaultwarden and delete the key file.
### Removing a genome
Manual process:
```bash
# In master repo
git submodule deinit genome-name
@ -421,16 +510,17 @@ git push
When a genome is scaffolded, `render_template` replaces these placeholders in all
template files:
| Placeholder | Source | Example |
|-------------|--------|---------|
| `{{GENOME_NAME}}` | registry.sh | `genome-dev` |
| `{{GENOME_NAME_UPPER}}` | derived | `GENOME-DEV` |
| `{{GENOME_DESC}}` | registry.sh | `Web development...` |
| `{{FORGEJO_URL}}` | globals.env | `https://git.yourserver.com` |
| `{{FORGEJO_USER}}` | globals.env | `yourusername` |
| `{{VAULTWARDEN_URL}}` | globals.env | `https://vault.yourserver.com` |
| `{{MASTER_REPO}}` | globals.env | `master-knowledge-genome` |
| `{{DATE}}` | runtime | `2026-05-11` |
| Placeholder | Source | Example |
| ----------------------- | ----------- | ------------------------------ |
| `{{GENOME_NAME}}` | registry.sh | `genome-dev` |
| `{{GENOME_NAME_UPPER}}` | derived | `GENOME-DEV` |
| `{{GENOME_DESC}}` | registry.sh | `Web development...` |
| `{{LINKED_PROJECT}}` | registry.sh | `myorg/my-app` (or `none`) |
| `{{FORGEJO_URL}}` | globals.env | `https://git.yourserver.com` |
| `{{FORGEJO_USER}}` | globals.env | `yourusername` |
| `{{VAULTWARDEN_URL}}` | globals.env | `https://vault.yourserver.com` |
| `{{MASTER_REPO}}` | globals.env | `master-knowledge-genome` |
| `{{DATE}}` | runtime | `2026-05-11` |
---
@ -441,9 +531,9 @@ template files:
Each genome uses a unique symmetric AES-256-CTR key managed by git-crypt.
Two directories in every genome are always encrypted:
| Directory | Contents | On remote |
|-----------|----------|-----------|
| `raw/private/` | Sensitive source material | Opaque binary blob |
| Directory | Contents | On remote |
| --------------- | --------------------------- | ------------------ |
| `raw/private/` | Sensitive source material | Opaque binary blob |
| `wiki/private/` | Private synthesis and notes | Opaque binary blob |
All other directories (`raw/articles/`, `wiki/sources/`, etc.) are plaintext.
@ -490,6 +580,17 @@ This means: any file matching `**/private/**` in `.gitattributes` is protected,
including future `private/` directories created anywhere in the repo.
The hook never needs updating when the encryption rules change.
### Untrusted agent output — manifest validation
The ingest agent's output is stochastic: a hallucinated manifest could carry a missing field,
a wrong type, or a malicious path such as `wiki/../../etc/passwd`. `run-ingest.sh` therefore
**validates the manifest before trusting any field** — it must be well-formed JSON with a
string `raw_source` and an array `pages`, and **every `path` must be a string under `wiki/`
with no `..`**. Anything else fails fast with a structured `{"status":"error"}` and no
filesystem access outside the wiki, so a bad path can't drive a read or a lint outside the
knowledge tree. This is the trust boundary between the (stochastic) model and the
(deterministic, tested) post-processor.
### PRIVATE_CONTEXT toggle
The `PRIVATE_CONTEXT` toggle in `AGENTS.md` controls whether the LLM agent
@ -502,6 +603,7 @@ PRIVATE_CONTEXT: enabled ← Agent may read/write private/. Requires git-cryp
```
Rules:
- Never inferred. Never carried over from a previous session.
- `enabled` requires the operator to confirm that `git-crypt unlock` has run on the host.
- Per-genome, per-session: enabling for `genome-finance` does NOT enable for `genome-dev`.
@ -530,6 +632,7 @@ The key flows: Vaultwarden → `bw get notes` → `base64 -d` → kernel pipe
At no point is the key written to any file on disk.
Lock a genome when the session ends:
```bash
git-crypt lock
```
@ -544,11 +647,11 @@ git-crypt lock
Each genome key is stored as a base64-encoded Secure Note in Vaultwarden:
| Genome | Vaultwarden Note Name |
|--------|----------------------|
| `genome-dev` | `genome-dev key` |
| `genome-finance` | `genome-finance key` |
| `genome-homelab` | `genome-homelab key` |
| Genome | Vaultwarden Note Name |
| ---------------- | --------------------- |
| `genome-dev` | `genome-dev key` |
| `genome-finance` | `genome-finance key` |
| `genome-homelab` | `genome-homelab key` |
After `make setup` or `make add-genome`, key files are exported to `keys/`.
Upload procedure:
@ -586,13 +689,14 @@ git clone https://git.yourserver.com/yourusername/genome-dev.git
If a key is lost or compromised:
```bash
# From the knowledge-genome-setup/ directory
# From the knowledge-genome-orchestrator/ directory
source lib/git-crypt.sh
cd ~/knowledge-genome-setup/genome-dev
cd ~/knowledge-genome-orchestrator/genome-dev
gcrypt_rotate_key "genome-dev"
```
`gcrypt_rotate_key` performs:
1. Unlocks repo with existing key
2. Removes old key material
3. Generates new symmetric key via `git-crypt init`
@ -603,13 +707,16 @@ gcrypt_rotate_key "genome-dev"
> **Limitation:** git history still contains blobs encrypted with the old key.
> Anyone with the old key and git history access can decrypt them. To purge old
> encrypted blobs from history:
>
> ```bash
> git filter-repo --invert-paths --path raw/private --path wiki/private
> git push --force origin main
> ```
>
> This rewrites all commit hashes — coordinate with any collaborators first.
After rotation:
- Upload new key to Vaultwarden (replace existing note)
- Delete both `keys/genome-dev.key` and `keys/genome-dev-rotated-*.key` from disk
- Revoke access from previous key holders
@ -621,6 +728,7 @@ After rotation:
### Prerequisites for every session
Before starting an LLM agent session on a genome:
1. The host (AI server) runs `git-crypt unlock` for the required genomes
2. The orchestrator prepares context: `tail -n 20 wiki/log.md`
3. Declare `PRIVATE_CONTEXT` state explicitly in the opening prompt
@ -631,7 +739,8 @@ The agent executes in this order at the start of every session:
1. Read `wiki/index.md` — primary catalog of all pages and maturity
2. Read last 20 log entries (injected by orchestrator — does NOT open `wiki/log.md` directly)
3. For tasks involving related pages: `qmd search "<query>"` before opening any files
3. For tasks involving related pages: if the optional `qmd` extension is installed,
`qmd search "<query>"` before opening files; otherwise navigate from `wiki/index.md`
4. Operate on individual files — never scan entire directories
### One source per session
@ -651,12 +760,13 @@ sequentially — not one session with 5 files.
### n8n automation
For Forgejo webhook → automated ingest:
1. Forgejo sends webhook on push to `raw/`
2. n8n receives webhook, identifies new files
3. n8n starts one agent session per new file (sequential, not parallel)
4. Each session: inject `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path
5. Agent ingest workflow runs, opens PR
6. Human reviews and merges PR
4. Each session: realign the checkout to the base (`git switch <base> && git reset --hard origin/<base>`), then inject `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path
5. Phase 1 agent (`/skill:ingest`) writes the manifest; Phase 2 `run-ingest.sh` opens the PR, then **stops**
6. Human reviews — **merge to accept**, or close the PR + delete the `feat` branch to reject
---
@ -664,27 +774,76 @@ For Forgejo webhook → automated ingest:
### Ingest
Triggered by a new file in `raw/` (manual or via webhook).
Triggered by a new file in `raw/` (manual or via webhook). Ingest is split into two
phases so that the small local model spends its limited context only on judgement, and
all the deterministic bookkeeping happens outside the model's loop.
1. Read source once
2. Create `wiki/sources/<slug>.md` — summary and key points
3. Per entity (person, tool, organisation): create or update `wiki/entities/<name>.md`
4. Per concept (pattern, theory, decision): create or update `wiki/concepts/<name>.md`
5. Check each touched page for contradictions → apply Conflict Resolution if found
6. Append entry to `wiki/index.md` (bottom of relevant section — do not reorder)
7. Append log entry: `INGEST | <slug>`
8. Run scoped lint on pages created or modified in this session; report in PR
9. Commit on `feat/ai-ingest-<slug>`; open PR using `templates/pr-description.md`
**Phase 1 — agent (semantic only).** The `ingest` skill gives the agent read/edit tools
only (no shell). It:
1. Reads the source once
2. Creates `wiki/sources/<slug>.md` — summary and key points
3. Per entity (person, tool, organisation): creates or updates `wiki/entities/<name>.md`
4. Per concept (pattern, theory, decision): creates or updates `wiki/concepts/<name>.md`
5. Checks each touched page for contradictions → applies Conflict Resolution if found
6. Writes `.ingest-manifest.json` (the list of pages it created/modified, the model name,
a one-line reasoning, the PR summary, and any contradictions) — then **stops**
**Phase 2 — `run-ingest.sh` (deterministic, outside the agent).** The post-processor first
**validates the manifest** — well-formed JSON, expected shape, and every page path confined to
`wiki/` with no `..` (see [Security Model](#security-model)) — then does the mechanical work the
model must not waste context on:
7. Inserts each page into the correct `wiki/index.md` section **in alphabetical order**,
deduplicated by wikilink (a re-ingest updates the entry, never duplicates it), and bumps the
index `last_updated` (`index-append.py`)
8. Appends the `INGEST | <slug>` entry to `wiki/log.md` (the model name comes from the
orchestrator via `INGEST_MODEL` — the agent cannot reliably know its own tag)
9. Runs scoped lint on exactly the pages touched this run (`scoped-lint.sh`, reusing
`lib/lint.sh`)
10. Commits **only `wiki/`** on `feat/ai-ingest-<slug>` and opens a PR against the integration
base (`INGEST_BASE`, default `main`); the body matches the `templates/pr-description.md`
structure (Summary / Pages / Contradictions / Scoped Lint)
11. Emits a single compact JSON line (status, slug, PR url, lint_clean, conflict) for n8n
The agent never runs git, never edits the index/log mechanically, and never lints — those
are deterministic and tested (see [Testing](#testing)). Invocation on the AI node:
```bash
pi --mode json -p "/skill:ingest raw/articles/<file>.md" # phase 1 → writes manifest
run-ingest.sh <genome> # phase 2 → index/log/lint/PR
```
For private sources (`PRIVATE_CONTEXT: enabled` required):
- All output goes to `wiki/private/<slug>.md` only
- PR title: `[PRIVATE] ingest: <slug>`
**Branch lifecycle & the manual gate.** `run-ingest.sh` / `open-pr.sh` are deliberately
"dumb": they create the `feat/ai-ingest-<slug>` branch, commit only `wiki/`, open the PR, and
stop. They never reset, revert, or touch the integration branch — that lifecycle belongs to
the orchestrator, around the human gate:
- **Before each session** the orchestrator realigns the checkout to the base
(`git fetch && git switch <base> && git reset --hard origin/<base>`) — a reset of the _local_
checkout to match the remote, never a force-push to the shared branch.
- **After the PR opens, everything stops** until a human approves: one source per session,
sequential, no new ingest until the pending PR is closed.
- **Approve = merge. Reject = close the PR and delete the remote `feat` branch.** To undo an
already-merged ingest, open a _revert PR_ against the base — never rewrite history on a
shared branch.
The PR base is configurable via `INGEST_BASE` (default `main`). Per-page `maturity` already
encodes stability and tags/releases mark versioned snapshots, so `main` is the integration
branch today. If a linked project later _consumes_ a genome, set `INGEST_BASE=develop` to
buffer ingests on `develop` and cut manual `develop → main` releases — no code change.
### Query
Triggered by an operator question.
1. `qmd search "<query>"` → identify candidate pages
1. `qmd search "<query>"` (if the optional qmd extension is installed) → identify
candidate pages; otherwise start from `wiki/index.md`
2. Read candidate pages directly (qmd already returns file paths — no intermediate index lookup)
3. Synthesise answer with `[[wikilink]]` citations
4. If answer is non-trivial: save as `wiki/queries/<slug>.md` and append to index
@ -697,11 +856,13 @@ For general orientation without a specific query: read `wiki/index.md` directly.
The lint workflow is split between deterministic bash checks and semantic LLM judgment.
**Step 1 — operator runs bash linter:**
```bash
make lint
```
The bash linter checks automatically:
- YAML frontmatter validity (all mandatory fields present)
- Domain consistency (domain field matches genome name)
- Type validity (value from allowed list)
@ -713,6 +874,7 @@ The bash linter checks automatically:
**Step 2 — operator provides bash output to LLM agent:**
The agent applies semantic judgment to findings the bash linter cannot make:
- **Orphan pages** (from bash list): for each orphan, identify 1-3 existing pages
that should link to it; propose specific additions
- **Implicit concepts** (from bash term frequency list): determine if a candidate
@ -735,22 +897,28 @@ The PR description uses `templates/pr-description.md`:
```markdown
## Summary
One sentence: goal of this session and source processed.
## Pages Created
| Path | Type | Maturity |
## Pages Modified
| Path | Change |
## Contradictions Found
[ ] None / [ ] n conflict file(s) created
[ ] None / [ ] n conflict file(s) created
## Private Data Accessed
[ ] No (PRIVATE_CONTEXT: disabled) / [ ] Yes
[ ] No (PRIVATE_CONTEXT: disabled) / [ ] Yes
## Scoped Lint (post-ingest)
[ ] Frontmatter valid [ ] No broken links [ ] No issues found
[ ] Frontmatter valid [ ] No broken links [ ] No issues found
```
This makes human review fast and structured: read the table, scan the diff,
@ -776,10 +944,10 @@ The operator resolves the conflict, updates relevant pages, closes the PR.
Pages have a `last_updated` field in frontmatter. During lint passes:
| Maturity | Threshold | Action |
|----------|-----------|--------|
| `stable` | 180 days | Flag as stale — add `⚠️ STALE` callout |
| `draft` | 90 days | Flag as stale — add `⚠️ STALE` callout |
| Maturity | Threshold | Action |
| -------- | --------- | -------------------------------------- |
| `stable` | 180 days | Flag as stale — add `⚠️ STALE` callout |
| `draft` | 90 days | Flag as stale — add `⚠️ STALE` callout |
The agent proposes re-validation but does not change `maturity` without new source evidence.
@ -816,47 +984,47 @@ private: true | false
---
```
| Field | Rules |
|-------|-------|
| `type` | Must be one of: `source entity concept query conflict private index log` |
| `maturity: draft` | Single source or unvalidated |
| `maturity: stable` | Confirmed by 2+ independent sources |
| `maturity: deprecated` | Superseded — add `> **DEPRECATED:** <reason>` callout at top |
| `private: true` | Required on all pages in `wiki/private/` and `raw/private/` |
| Field | Rules |
| ---------------------- | ------------------------------------------------------------------------ |
| `type` | Must be one of: `source entity concept query conflict private index log` |
| `maturity: draft` | Single source or unvalidated |
| `maturity: stable` | Confirmed by 2+ independent sources |
| `maturity: deprecated` | Superseded — add `> **DEPRECATED:** <reason>` callout at top |
| `private: true` | Required on all pages in `wiki/private/` and `raw/private/` |
Do not use semantic versioning for content. Git history tracks every change.
`maturity` captures epistemic state; `last_updated` tracks recency.
### Page types and directories
| Type | Directory | Description |
|------|-----------|-------------|
| `source` | `wiki/sources/` | One page per processed raw source |
| `entity` | `wiki/entities/` | People, tools, organisations, projects |
| `concept` | `wiki/concepts/` | Patterns, theories, architectural decisions |
| `query` | `wiki/queries/` | Preserved answers and analyses |
| `conflict` | `wiki/queries/conflict-*.md` | Unresolved contradictions |
| `private` | `wiki/private/` | Private synthesis (PRIVATE_CONTEXT: enabled) |
| `index` | `wiki/index.md` | Primary navigation catalog (singleton) |
| `log` | `wiki/log.md` | Operations ledger (singleton) |
| Type | Directory | Description |
| ---------- | ---------------------------- | -------------------------------------------- |
| `source` | `wiki/sources/` | One page per processed raw source |
| `entity` | `wiki/entities/` | People, tools, organisations, projects |
| `concept` | `wiki/concepts/` | Patterns, theories, architectural decisions |
| `query` | `wiki/queries/` | Preserved answers and analyses |
| `conflict` | `wiki/queries/conflict-*.md` | Unresolved contradictions |
| `private` | `wiki/private/` | Private synthesis (PRIVATE_CONTEXT: enabled) |
| `index` | `wiki/index.md` | Primary navigation catalog (singleton) |
| `log` | `wiki/log.md` | Operations ledger (singleton) |
### Page size limits
| Limit | Lines | Action |
|-------|-------|--------|
| Soft cap | 400 | Bash linter warns |
| Hard cap | 800 | Bash linter errors — split the page |
| Limit | Lines | Action |
| -------- | ----- | ----------------------------------- |
| Soft cap | 400 | Bash linter warns |
| Hard cap | 800 | Bash linter errors — split the page |
These limits ensure pages fit within the LLM context window without attention degradation
and keep the wiki atomically navigable.
### Linking conventions
| Type | Format |
|------|--------|
| Type | Format |
| ---------------------- | ------------------------------------------- |
| Internal (same genome) | `[[folder/slug]]` — Obsidian wikilinks only |
| Cross-genome | `[[../genome-target/wiki/folder/slug]]` |
| External | `[text](https://url)` — standard Markdown |
| Cross-genome | `[[../genome-target/wiki/folder/slug]]` |
| External | `[text](https://url)` — standard Markdown |
Never use `[text](relative/path)` for internal references. Obsidian wikilinks are
bidirectional and appear in the graph view.
@ -878,6 +1046,7 @@ Every operation appends one entry to `wiki/log.md`:
Valid TYPEs: `INGEST` `LINT` `QUERY` `CONFLICT` `CONFIG` `SECURITY`
Parse examples:
```bash
grep "^## \[" wiki/log.md | tail -5 # Last 5 entries
grep "^## \[" wiki/log.md | grep "CONFLICT" # All conflicts
@ -891,12 +1060,12 @@ The LLM never loads the full log.
## Collaboration Model
| Role | Key access | Permitted operations |
|------|-----------|----------------------|
| Owner | Full — key holder | Read/write everywhere |
| Collaborator | None | Push to `raw/articles/`, `raw/transcripts/`, `raw/code-packs/`, `raw/assets/` |
| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` |
| Cloud AI model | Never | `PRIVATE_CONTEXT` must be `disabled`; private data stays on local network |
| Role | Key access | Permitted operations |
| -------------- | ----------------- | ----------------------------------------------------------------------------- |
| Owner | Full — key holder | Read/write everywhere |
| Collaborator | None | Push to `raw/articles/`, `raw/transcripts/`, `raw/code-packs/`, `raw/assets/` |
| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` |
| Cloud AI model | Never | `PRIVATE_CONTEXT` must be `disabled`; private data stays on local network |
Grant collaborator access: add as Forgejo contributor with Write role.
Never share the git-crypt key — collaborators operate exclusively in public directories.
@ -930,6 +1099,7 @@ qmd serve --port 3333
Obsidian is the recommended wiki browser. Open any genome directory as an Obsidian vault.
Recommended setup:
- **Graph view** — visualise page connections; spot orphans and hubs instantly
- **Obsidian Web Clipper** — browser extension to clip articles directly to `raw/articles/`
as Markdown
@ -949,7 +1119,8 @@ n8n (running on the storage node) can automate the ingest pipeline:
2. n8n flow identifies new files
3. For each new file: starts one agent session (sequential — never parallel)
4. Each session receives: `tail -n 20 wiki/log.md` + `PRIVATE_CONTEXT` state + source path
5. Agent runs ingest workflow and opens PR
5. Phase 1 — agent runs `/skill:ingest` (semantic → writes manifest); Phase 2 —
`run-ingest.sh` does index/log/lint and opens the PR, returning one JSON line to n8n
6. Human reviews the PR
Key constraint: one source per session, sessions sequential.
@ -959,11 +1130,13 @@ Never batch multiple sources into one agent session.
If the AI compute node has an Intel NPU (e.g. Core Ultra series):
- Background tasks (embedding updates, index refresh) → Intel NPU via OpenVINO
- Background/auxiliary tasks (OCR of `raw/assets/`, async summarisation, or qmd
re-indexing **if** the optional qmd extension is in use) → Intel NPU via OpenVINO
- Active reasoning sessions (ingest, query, synthesis) → GPU
This keeps the GPU's KV cache free for interactive work and reduces power consumption
for background operations.
Note: the core system has no embedding pipeline (see [Core Philosophy](#core-philosophy)),
so there is nothing to embed here — the NPU is only for auxiliary work. This keeps the
GPU's KV cache free for interactive sessions and lowers power draw for background jobs.
---
@ -991,6 +1164,7 @@ sudo apt install git git-crypt curl jq
The staged file is in a path matching `**/private/**` but is not encrypted.
Fix options:
1. Verify `.gitattributes` contains `**/private/** filter=git-crypt diff=git-crypt -text`
2. Run `git-crypt init` if git-crypt is not initialised in this repo
3. Run `git-crypt status` to check the encryption state of all files
@ -1011,6 +1185,7 @@ git commit -m "fix: re-stage private files for encryption"
### Agent returns stale or missing cross-references
Likely causes:
1. Session was too long — KV cache degraded. Use one source per session.
2. `wiki/index.md` was not read at session start — agent lacked the page catalog.
3. qmd index is stale — re-index: `qmd index <genome>/wiki/`

130
diagnose-run-ingest.sh Normal file
View file

@ -0,0 +1,130 @@
#!/usr/bin/env bash
# diagnose-run-ingest.sh
# Run from the repo root: bash diagnose-run-ingest.sh
# Builds the same fixture the bats test uses and runs run-ingest under `bash -x`
# so we can see exactly which command makes it exit non-zero.
# No `-e`: a non-zero exit from run-ingest (or a grep with no match) must not
# abort this script, because the exit code and trace are printed afterwards.
set -uo pipefail

REPO="$(pwd)"
RI="${REPO}/skills/ingest/scripts/run-ingest.sh"

echo "==================== ENV ===================="
echo "bash: $(bash --version | head -1)"
echo "git : $(git --version)"
echo "jq  : $(jq --version 2>/dev/null || echo MISSING)"
echo "py  : $(python3 --version 2>/dev/null || echo MISSING)"
echo

echo "============ run-ingest.sh on disk ============"
if [[ ! -f "$RI" ]]; then echo "NOT FOUND: $RI  (run me from the repo root)"; exit 1; fi
echo "-- helper invocations (want 'bash ...'): --"
grep -nE 'log-append\.sh|scoped-lint\.sh|open-pr\.sh' "$RI"
echo "-- result emitter (want 'jq -nc'): --"
grep -nE 'jq -nc?|jq -n ' "$RI"
echo

echo "============ build hermetic fixture ============"
# Everything lives under one temp dir: a bare "origin", an empty hooks dir and
# the genome checkout itself. The dir is kept after the run for inspection.
T="$(mktemp -d)"
mkdir -p "$T/nohooks"
git init --bare -q "$T/origin.git"
g="$T/g"
mkdir -p "$g"/{raw/articles,wiki/sources,wiki/entities,wiki/concepts,wiki/queries,wiki/private}

cat > "$g/wiki/index.md" <<'EOF'
---
title: "Index"
type: index
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Index
---
## Sources (`wiki/sources/`)
*x*
## Entities (`wiki/entities/`)
*x*
## Concepts (`wiki/concepts/`)
*x*
## Queries (`wiki/queries/`)
*x*
## Conflicts Pending Review (`wiki/queries/conflict-*.md`)
*x*
EOF

cat > "$g/wiki/log.md" <<'EOF'
---
title: "Log"
type: log
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Log
---
## [2026-01-01] CONFIG | init
- run_id: `init`
EOF

echo raw > "$g/raw/articles/test.md"

# Initial commit + push. The steps are chained with && so that ANY failing git
# command marks the fixture as broken (not just the final push), and the script
# stops instead of diagnosing run-ingest against a half-built repository.
if (
    cd "$g" &&
    git init -q &&
    git config commit.gpgsign false &&
    git config core.hooksPath "$T/nohooks" &&
    git config user.email t@t &&
    git config user.name t &&
    git add . &&
    git commit -qm init &&
    git branch -M main &&
    git remote add origin "$T/origin.git" &&
    git push -q -u origin main
); then
    echo "fixture commit+push OK"
else
    echo "FIXTURE SETUP FAILED (look above)"
    echo "-- fixture left in: $T --"
    exit 1
fi

# Uncommitted work for run-ingest to process: one new source page + its manifest.
cat > "$g/wiki/sources/test-source.md" <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-04
private: false
---
body
EOF

cat > "$g/.ingest-manifest.json" <<'EOF'
{ "raw_source":"raw/articles/test.md","model":"m","reasoning":"r","pr_summary":"s","contradictions":"None",
"pages":[{"path":"wiki/sources/test-source.md","summary":"a source","maturity":"draft","status":"created"}] }
EOF

echo
echo "============ run-ingest (bash -x) ============"
cd "$g" || { echo "cannot cd into fixture: $g"; exit 1; }
export KG_LIB_DIR="${REPO}/lib" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1
# stdout and the xtrace (stderr) are captured separately so the JSON result
# is not interleaved with trace lines.
bash -x "$RI" genome-test >"$T/out.txt" 2>"$T/trace.txt"
rc=$?
echo "EXIT=$rc"
echo "-- run-ingest stdout (final JSON should be here): --"
cat "$T/out.txt"
echo "-- last 25 lines of the trace (the failing command is near the end): --"
tail -n 25 "$T/trace.txt"
echo "-- full trace kept at: $T/trace.txt (fixture: $g) --"

View file

@ -4,6 +4,9 @@
# Directory structure creation and template rendering engine.
# =============================================================================
# Canonical directory layout lives in one place (lib/structure.sh).
source "$(dirname "${BASH_SOURCE[0]}")/structure.sh"
render_template() {
local template_file="$1"
local output_file="$2"
@ -13,17 +16,21 @@ render_template() {
local content
content=$(<"$template_file")
# Defaults (:-) so master-repo templates render even when GENOME_* are unset
# (scaffold_master runs before any genome; set -u would otherwise abort here).
local genome_name_upper
genome_name_upper=$(tr '[:lower:]' '[:upper:]' <<< "${GENOME_NAME}")
genome_name_upper=$(tr '[:lower:]' '[:upper:]' <<< "${GENOME_NAME:-}")
# Placeholder replacement
content="${content//\{\{GENOME_NAME\}\}/${GENOME_NAME}}"
content="${content//\{\{GENOME_NAME\}\}/${GENOME_NAME:-}}"
content="${content//\{\{GENOME_NAME_UPPER\}\}/${genome_name_upper}}"
content="${content//\{\{GENOME_DESC\}\}/${GENOME_DESC}}"
content="${content//\{\{FORGEJO_URL\}\}/${FORGEJO_URL}}"
content="${content//\{\{FORGEJO_USER\}\}/${FORGEJO_USER}}"
content="${content//\{\{VAULTWARDEN_URL\}\}/${VAULTWARDEN_URL}}"
content="${content//\{\{MASTER_REPO\}\}/${MASTER_REPO}}"
content="${content//\{\{GENOME_DESC\}\}/${GENOME_DESC:-}}"
content="${content//\{\{FORGEJO_URL\}\}/${FORGEJO_URL:-}}"
content="${content//\{\{FORGEJO_USER\}\}/${FORGEJO_USER:-}}"
content="${content//\{\{VAULTWARDEN_URL\}\}/${VAULTWARDEN_URL:-}}"
content="${content//\{\{MASTER_REPO\}\}/${MASTER_REPO:-}}"
# linked project reference (optional) — empty registry field renders as 'none'
content="${content//\{\{LINKED_PROJECT\}\}/${GENOME_LINKED:-none}}"
content="${content//\{\{DATE\}\}/$(date +%Y-%m-%d)}"
mkdir -p "$(dirname "$output_file")"
@ -32,13 +39,9 @@ render_template() {
scaffold_genome() {
local base="$1"
local dirs=(
"raw/articles" "raw/transcripts" "raw/code-packs" "raw/assets" "raw/private"
"wiki/sources" "wiki/entities" "wiki/concepts" "wiki/queries" "wiki/private"
)
info "Building directory structure in ${base}..."
for dir in "${dirs[@]}"; do
for dir in "${GENOME_DIRS[@]}"; do
mkdir -p "${base}/${dir}"
touch "${base}/${dir}/.gitkeep"
done

70
lib/structure.sh Normal file
View file

@ -0,0 +1,70 @@
#!/usr/bin/env bash
# =============================================================================
# lib/structure.sh
# Single source of truth for the canonical genome directory layout, plus the
# verify/sync helpers used by scripts/verify-genomes.sh.
#
# IMPORTANT: this is the ONE place the structure is defined. scaffold.sh sources
# this file and builds new genomes from GENOME_DIRS, so scaffolding and the
# structure check can never drift apart.
# =============================================================================
# The canonical layout: every genome must contain each of these directories.
#   raw/*  — input buckets (collaborator-writable)
#   wiki/* — the agent-owned, contract-bound layout that the lint, the index
#            sections and the ingest skill depend on
GENOME_DIRS=(
    # input buckets
    "raw/articles"
    "raw/transcripts"
    "raw/code-packs"
    "raw/assets"
    "raw/private"
    # agent-owned wiki
    "wiki/sources"
    "wiki/entities"
    "wiki/concepts"
    "wiki/queries"
    "wiki/private"
)
# ---------------------------------------------------------------------------
# structure_report <base>
#   Reports drift of <base> against GENOME_DIRS.
#     - missing canonical dir          → warn, counted as drift
#     - extra dir under raw/ or wiki/  → info only (does not fail); every
#       directory at any depth below raw/ and wiki/ is compared
#   Returns the number of MISSING canonical directories (0 = no drift).
#   Uses the warn/info output helpers, which are not defined in this file
#   and must already be in scope when this function is called.
# ---------------------------------------------------------------------------
structure_report() {
    local base="$1"
    local missing=0
    # Loop variable is local so that sourcing scripts' own $d is never clobbered.
    local d

    for d in "${GENOME_DIRS[@]}"; do
        if [[ ! -d "${base}/${d}" ]]; then
            warn "missing: ${d}"
            missing=$((missing + 1))
        fi
    done

    # Extra directories (drift the other way) — informational only.
    # The canon list is space-joined and space-padded so the substring test
    # below only matches whole entries.
    local canon=" ${GENOME_DIRS[*]} "
    while IFS= read -r d; do
        d="${d#"${base}/"}"                        # make the path relative to <base>
        [[ "$canon" == *" ${d} "* ]] && continue
        info "extra (not in canon): ${d}"
    done < <(find "${base}/raw" "${base}/wiki" -mindepth 1 -type d 2>/dev/null)

    return $missing
}
# ---------------------------------------------------------------------------
# structure_sync <base>
#   Creates any MISSING canonical directories (idempotent) and drops a
#   .gitkeep into each directory it creates. Never deletes — retiring a
#   bucket is a deliberate, contract-aware change to GENOME_DIRS + the
#   templates, not an automatic prune.
#   Always returns 0 when it reaches the end. Uses the success/info output
#   helpers, which are not defined in this file and must already be in scope.
# ---------------------------------------------------------------------------
structure_sync() {
    local base="$1"
    local added=0
    # Loop variable is local so that sourcing scripts' own $d is never clobbered.
    local d

    for d in "${GENOME_DIRS[@]}"; do
        if [[ ! -d "${base}/${d}" ]]; then
            mkdir -p "${base}/${d}"
            touch "${base}/${d}/.gitkeep"
            success "created: ${d}"
            added=$((added + 1))
        fi
    done

    if [[ $added -eq 0 ]]; then
        info "already in sync: ${base}"
    fi
    return 0
}

View file

@ -19,9 +19,13 @@ LIB_DIR="${PROJECT_ROOT}/lib"
PROVIDERS_DIR="${PROJECT_ROOT}/providers"
# --- GENOME REGISTRY ---
# Format: "name|description"
# Format: "name|description|linked_repo"
# - linked_repo is OPTIONAL. Leave empty (trailing pipe) for knowledge-only genomes.
# - It is an opaque reference rendered verbatim into the genome's AGENTS.md
# (phase-2 project work is parked, so the framework does not act on it yet).
# - Example with a project: "genome-homelab|Keru infrastructure...|keru/homelab-infra"
GENOMES=(
"genome-dev|Web development, TUI, Angular, software architecture"
"genome-finance|Personal finance, investments, market analysis"
"genome-homelab|Keru infrastructure, network configs, architecture logs"
"genome-dev|Web development, TUI, Angular, software architecture|"
"genome-finance|Personal finance, investments, market analysis|"
"genome-homelab|Keru infrastructure, network configs, architecture logs|"
)

View file

@ -11,16 +11,18 @@ source "registry.sh"
GENOME_NAME="${1:-}"
GENOME_DESC="${2:-}"
GENOME_LINKED="${3:-}" # optional: linked project repo reference
if [[ -z "$GENOME_NAME" || -z "$GENOME_DESC" ]]; then
error "Missing arguments."
echo "Usage: $0 <genome-name> <description>"
echo "Usage: $0 <genome-name> <description> [linked-repo]"
exit 1
fi
step "Adding New Genome: ${GENOME_NAME}"
GENOMES=("${GENOME_NAME}|${GENOME_DESC}")
# Build a 3-field registry entry (linked_repo may be empty)
GENOMES=("${GENOME_NAME}|${GENOME_DESC}|${GENOME_LINKED}")
source "scripts/setup-genomes.sh"

View file

@ -19,8 +19,9 @@ source "providers/${PROVIDER}.sh"
step "Processing Genome Registry"
for entry in "${GENOMES[@]}"; do
IFS='|' read -r GENOME_NAME GENOME_DESC <<< "$entry"
export GENOME_NAME GENOME_DESC
# 3-field format: name|description|linked_repo (linked_repo optional → may be empty)
IFS='|' read -r GENOME_NAME GENOME_DESC GENOME_LINKED <<< "$entry"
export GENOME_NAME GENOME_DESC GENOME_LINKED
info "Processing: ${GENOME_NAME}..."

50
scripts/verify-genomes.sh Normal file
View file

@ -0,0 +1,50 @@
#!/usr/bin/env bash
# =============================================================================
# scripts/verify-genomes.sh
# Check (default) or --sync the directory structure of every registered genome
# against the canonical layout in lib/structure.sh.
#
# bash scripts/verify-genomes.sh # report drift, non-zero exit on drift
# bash scripts/verify-genomes.sh --sync # create missing dirs everywhere (safe)
#
# No hardware/LLM involved — pure structure check. Run anywhere.
# =============================================================================
set -euo pipefail

# Paths are relative: the libraries and registry are resolved against the
# current working directory.
source "lib/output.sh"
source "globals.env"
source "registry.sh"
source "lib/structure.sh"

# Only the literal first argument --sync selects sync mode; anything else verifies.
if [[ "${1:-}" == "--sync" ]]; then
    MODE="sync"
else
    MODE="verify"
fi

step "Genome structure: ${MODE}"

TOTAL_MISSING=0
for entry in "${GENOMES[@]}"; do
    # Registry entries are "name|description|linked_repo"; only the name is needed.
    IFS='|' read -r GENOME_NAME _ _ <<< "$entry"
    genome_dir="${WORK_DIR}/${MASTER_REPO}/${GENOME_NAME}"

    if [[ ! -d "$genome_dir" ]]; then
        warn "not found locally, skipping: ${GENOME_NAME}"
        continue
    fi

    info "Genome: ${GENOME_NAME}"
    case "$MODE" in
        sync)
            structure_sync "$genome_dir"
            ;;
        *)
            # structure_report returns the missing-dir count as its exit status;
            # the `||` keeps `set -e` from aborting on a non-zero count.
            missing_here=0
            structure_report "$genome_dir" || missing_here=$?
            TOTAL_MISSING=$((TOTAL_MISSING + missing_here))
            ;;
    esac
done

echo ""
if [[ "$MODE" == "sync" ]]; then
    success "Structure sync complete."
elif [[ $TOTAL_MISSING -eq 0 ]]; then
    success "Structure verified: all genomes match the canonical layout."
else
    error "Structure drift: ${TOTAL_MISSING} missing directory(ies). Fix with: make sync-structure"
    exit 1
fi

93
skills/ingest/SKILL.md Normal file
View file

@ -0,0 +1,93 @@
---
name: ingest
description: Semantic pass of a single raw source into the current genome's wiki — read the source, write sources/entities/concepts, handle contradictions, then emit a manifest and STOP. Use when a new file lands in raw/. Does NOT do git, log, index, lint, or PRs (a post-processor handles those), and does NOT handle private sources or project repos.
license: see repository
compatibility: Runs inside one genome checkout (cwd = genome root). Tools needed — read, edit only. NO bash, NO git. The deterministic steps (index, log, scoped lint, PR) run AFTER you exit, via run-ingest.sh. PRIVATE_CONTEXT must be disabled.
allowed-tools: read edit
metadata:
framework: knowledge-genome
phase: "1-ingest-semantic"
---
# Ingest — semantic pass
You run inside ONE genome checkout. `AGENTS.md` (already in your context) is the
authoritative contract. Your job is the **semantic pass only**: read the source, write
the wiki pages, handle contradictions. You do **not** touch git, the log, the index, the
linter, or PRs — a post-processor (`run-ingest.sh`) does all of that _after you stop_,
from the manifest you leave behind. This keeps your context clean and your turns few,
which matters on a small local model.
**Argument:** the relative path of the single raw source to ingest
(e.g. `raw/articles/foo.md`). Process only this one.
## Pre-flight — stop the session if any check fails
1. Refuse if the argument path is under any `private/` directory.
2. Refuse if `PRIVATE_CONTEXT` is not `disabled`.
3. Confirm the file exists under `raw/`.
## Semantic work (your only job)
1. Read the source once.
2. Write `wiki/sources/<kebab-slug>.md` — faithful summary + key points, with the required
frontmatter (`type: source`, `domain: <genome>`, `maturity: draft`,
`last_updated: <today>`, `private: false`, sensible `tags`).
3. For each entity (person, tool, org) → create or update `wiki/entities/<kebab-name>.md`.
4. For each concept (pattern, theory, decision) → create or update
`wiki/concepts/<kebab-name>.md`.
5. On a real contradiction with an existing claim, follow `AGENTS.md` §Conflict: create
`wiki/queries/conflict-<concept>-<YYYY-MM-DD>.md`. Never overwrite the existing page.
**Naming — you are the sole author of these names; nothing renames your files.** Use
minimal kebab-case: lowercase letters, digits and hyphens only — no spaces, no underscores,
no capitals. Pick stable names so the same entity is never created twice (always `acme`,
never also `acme-corp`). The path you write a file to MUST be byte-for-byte the path you
list in the manifest.
**Deciding create-vs-update and spotting contradictions — mind the context budget.** Use
`wiki/index.md` to locate existing pages, then read **only** the handful that _this source
actually names_ — the entities and concepts in the source's title and opening paragraphs —
not everything the index lists. When in doubt, read fewer: a missed cross-link is far
cheaper than a saturated context. Never scan whole directories.
## Finish: write the manifest, then STOP
As your **final action**, write `.ingest-manifest.json` at the genome root
(NOT under `wiki/`) describing exactly what you did. Then stop — do not commit, lint,
append to the log/index, or open anything.
```json
{
"raw_source": "raw/articles/foo.md",
"reasoning": "One sentence for the log: what changed and why.",
"pr_summary": "One or two sentences describing this ingest for the PR.",
"contradictions": "None (or: 1 conflict file created — <concept>)",
"pages": [
{
"path": "wiki/sources/foo.md",
"summary": "One-line index summary.",
"maturity": "draft",
"status": "created"
},
{
"path": "wiki/entities/acme.md",
"summary": "Acme — vendor.",
"status": "modified"
}
]
}
```
Manifest rules:
- List every page you created or modified, with `status` `created` or `modified`.
- `summary` is the one-line index description (≈12 words max). For conflict pages the
summary is ignored — the index lists conflicts by slug only.
- `maturity` is required only on `created` pages (it seeds the new index entry). It is
ignored for `modified` pages, so omit it there.
- Do NOT add a `model` field — the orchestrator records which model produced this run; you
cannot know your own model name reliably, so do not guess one.
- Do not invent a `run_id`, branch, commit, or PR — those belong to the post-processor.
One source per session. After writing the manifest, stop.

View file

@ -0,0 +1,129 @@
#!/usr/bin/env python3
# =============================================================================
# skills/ingest/scripts/index-append.py
# Insert an entry line into the correct section of wiki/index.md and keep that
# section's entries alphabetically ordered. Bumps frontmatter last_updated.
#
# NOTE: agents-genome.md and wiki-index.md claim the pre-commit hook sorts the
# index. The actual pre-commit.sh only runs the plaintext-leak check — it does
# NOT sort. This script owns the ordering instead. (If you later move sorting
# into the hook, reduce this to a plain append.)
#
# index-append.py --section Sources \
# --entry '- [[sources/foo]] — One-line summary. `maturity: draft`'
# =============================================================================
import argparse
import datetime
import re
import sys
# An index entry line: a list item that starts with a wikilink.
ENTRY_RE = re.compile(r"^- \[\[")
# Captures the wikilink target (text between [[ and ]]) of an entry line.
LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]")
# A level-2 Markdown header, which delimits index sections.
HEADER_RE = re.compile(r"^## ")


def main() -> int:
    """Insert (or refresh) one entry in a section of the index file.

    Command-line arguments:
        --section  Prefix of the target ``## `` header text (e.g. ``Conflicts``
                   matches ``## Conflicts Pending Review ...``).
        --entry    The full index line to insert.
        --file     Index file to edit (default ``wiki/index.md``).

    Behaviour:
        * Bumps ``last_updated`` in the frontmatter to today's date, inserting
          the key if the frontmatter lacks it. Frontmatter is only recognised
          when ``---`` is the first non-blank line of the file, so a horizontal
          rule further down is never mistaken for it.
        * Entries are de-duplicated by wikilink target: an existing entry for
          the same page is replaced, an identical line is left alone.
        * The section's entries are rewritten in case-insensitive sorted order,
          after the section's non-entry lines.

    Returns:
        0 on success (including "entry already present"), 1 when the file or
        the section cannot be found.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--section", required=True,
                    help="Section name, e.g. Sources / Entities / Concepts / Queries / Conflicts")
    ap.add_argument("--entry", required=True, help="Full index line to insert")
    ap.add_argument("--file", default="wiki/index.md")
    args = ap.parse_args()

    try:
        with open(args.file, encoding="utf-8") as fh:
            lines = fh.read().splitlines()
    except FileNotFoundError:
        print(f"index-append: not found: {args.file}", file=sys.stderr)
        return 1

    today = datetime.date.today().isoformat()

    # 1. Bump last_updated inside the frontmatter block.
    #    Frontmatter must open on the first non-blank line; any later `---`
    #    is body content (a horizontal rule) and must be left untouched.
    first_idx = next((i for i, ln in enumerate(lines) if ln.strip()), None)
    fm_open = first_idx is not None and lines[first_idx].strip() == "---"
    fm_close_idx = None
    bumped = False
    if fm_open:
        for i in range(first_idx + 1, len(lines)):
            ln = lines[i]
            if ln.strip() == "---":
                fm_close_idx = i  # the closing ---
                break
            if ln.startswith("last_updated:"):
                lines[i] = f"last_updated: {today}"
                bumped = True
    if not fm_open:
        print("index-append: warning: no frontmatter found, last_updated not bumped",
              file=sys.stderr)
    elif not bumped and fm_close_idx is not None:
        # self-heal: frontmatter present but missing the key — insert it before the close
        lines.insert(fm_close_idx, f"last_updated: {today}")
        print("index-append: last_updated key was missing — inserted", file=sys.stderr)

    # 2. Locate the target section [start, end)
    start = None
    for i, ln in enumerate(lines):
        if HEADER_RE.match(ln) and ln[3:].startswith(args.section):
            start = i
            break
    if start is None:
        print(f"index-append: section '{args.section}' not found in {args.file}",
              file=sys.stderr)
        return 1

    # The section ends at the next `## ` header, or at end of file.
    end = len(lines)
    for i in range(start + 1, len(lines)):
        if HEADER_RE.match(lines[i]):
            end = i
            break

    # 3. Split the section body into intro (non-entry) and entries
    body = lines[start + 1:end]
    intro = [ln for ln in body if not ENTRY_RE.match(ln)]
    entries = [ln for ln in body if ENTRY_RE.match(ln)]

    # Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed
    # summary/maturity should UPDATE the existing entry, not add a duplicate line.
    new_m = LINK_RE.match(args.entry)
    new_link = new_m.group(1) if new_m else None

    if new_link is not None:
        replaced = False
        for idx, ln in enumerate(entries):
            m = LINK_RE.match(ln)
            if m and m.group(1) == new_link:
                if ln == args.entry:
                    # Nothing to change: return before writing, so the file
                    # (including last_updated) stays exactly as it was.
                    print("index-append: entry already present, skipping")
                    return 0
                entries[idx] = args.entry  # same page, refreshed text
                replaced = True
                break
        if not replaced:
            entries.append(args.entry)
    else:
        # No parseable wikilink — fall back to exact-line dedup.
        if args.entry in entries:
            print("index-append: entry already present, skipping")
            return 0
        entries.append(args.entry)

    entries.sort(key=str.casefold)

    # Normalise intro: drop trailing blanks, keep header + comment(s)
    while intro and intro[-1].strip() == "":
        intro.pop()

    # Rebuilt section: intro, one blank line, sorted entries, one blank line.
    new_section = intro + [""] + entries + [""]
    lines = lines[:start + 1] + new_section + lines[end:]

    with open(args.file, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")

    print(f"index-append: added to {args.section}")
    return 0
if __name__ == "__main__":
sys.exit(main())

View file

@ -0,0 +1,50 @@
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/log-append.sh
# Append one entry to the append-only ledger wiki/log.md, in the exact format
# defined by AGENTS.md / wiki-log.md. Generates run_id. Never edits prior entries.
#
# log-append.sh --type INGEST --subject "<slug>" --model "<model>" \
# --context "[[raw/x]]" --output "[[sources/x]]" \
# --reasoning "One sentence."
# =============================================================================
set -euo pipefail

# Target ledger; overridable through the environment.
LOG_FILE="${LOG_FILE:-wiki/log.md}"

# need_value <remaining-arg-count> <option>
#   Fails with a readable message when an option was given without its value
#   (otherwise `set -u` would abort with a bare "unbound variable" error).
need_value() {
    [[ "$1" -ge 2 ]] || { echo "log-append: missing value for $2" >&2; exit 1; }
}

# flatten <text>
#   Prints <text> on a single line: carriage returns are dropped and newlines
#   become spaces. Every field occupies exactly one line of the ledger, so a
#   multi-line value must not be able to break the entry format or introduce
#   a line that looks like a new "## [date]" entry header.
flatten() {
    local text="$1" cr=$'\r' lf=$'\n'
    text="${text//$cr/}"
    text="${text//$lf/ }"
    printf '%s' "$text"
}

type="" subject="" model="" context="" output="" reasoning=""
while [[ $# -gt 0 ]]; do
    case "$1" in
        --type)      need_value $# "$1"; type="$2";      shift 2 ;;
        --subject)   need_value $# "$1"; subject="$2";   shift 2 ;;
        --model)     need_value $# "$1"; model="$2";     shift 2 ;;
        --context)   need_value $# "$1"; context="$2";   shift 2 ;;
        --output)    need_value $# "$1"; output="$2";    shift 2 ;;
        --reasoning) need_value $# "$1"; reasoning="$2"; shift 2 ;;
        *) echo "log-append: unknown arg: $1" >&2; exit 1 ;;
    esac
done

: "${type:?--type required}"
: "${subject:?--subject required}"
case "$type" in
    INGEST|LINT|QUERY|CONFLICT|CONFIG|SECURITY) ;;
    *) echo "log-append: invalid TYPE '${type}'" >&2; exit 1 ;;
esac

[[ -f "$LOG_FILE" ]] || { echo "log-append: not found: $LOG_FILE" >&2; exit 1; }

# Free-text fields are forced onto one line each (type is already validated above).
subject="$(flatten "$subject")"
model="$(flatten "$model")"
context="$(flatten "$context")"
output="$(flatten "$output")"
reasoning="$(flatten "$reasoning")"

# run_id: first UUID source that works — uuidgen, the Linux kernel's random
# UUID file, then Python's uuid module.
run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null || python3 -c 'import uuid; print(uuid.uuid4())')"
today="$(date +%Y-%m-%d)"

# Append-only: the whole entry is written with a single >> redirection;
# nothing already in the file is rewritten. Empty optional fields fall back
# to the placeholders below.
{
    printf '\n## [%s] %s | %s\n\n' "$today" "$type" "$subject"
    printf -- '- run_id: `%s`\n' "$run_id"
    printf -- '- model: `%s`\n' "${model:-unknown}"
    printf -- '- context_read: %s\n' "${context:-*(none)*}"
    printf -- '- output_written: %s\n' "${output:-*(none)*}"
    printf -- '- reasoning: %s\n' "${reasoning:-No reasoning provided.}"
} >> "$LOG_FILE"

# Emit the generated id on stdout as run_id=<uuid>.
echo "run_id=${run_id}"

View file

@ -0,0 +1,118 @@
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/open-pr.sh
# Publishes the wiki/ changes of an ingest run: creates (or re-uses) the branch
# feat/ai-ingest-<slug>, commits only wiki/, pushes, then opens a Forgejo PR.
# Mirrors the API conventions of providers/forgejo.sh (token auth + http_code).
# Must run from the genome root (cwd = genome checkout). Never touches main.
#
#   open-pr.sh --slug <slug> --title "feat: ingest <slug>" --body-file <path> \
#              [--base main] [--label CONFLICT]
#
# Required env: FORGEJO_URL, FORGEJO_USER, FORGEJO_TOKEN.
# Optional env: DRY_RUN — when non-empty, stop after the push (no API call).
# =============================================================================
set -euo pipefail

: "${FORGEJO_URL:?missing FORGEJO_URL}"
: "${FORGEJO_USER:?missing FORGEJO_USER}"
: "${FORGEJO_TOKEN:?missing FORGEJO_TOKEN}"

# --- command line ---------------------------------------------------------
slug="" title="" body_file="" base="main" label=""
while (( $# > 0 )); do
  flag="$1"
  case "$flag" in
    --slug)      slug="$2" ;;
    --title)     title="$2" ;;
    --body-file) body_file="$2" ;;
    --base)      base="$2" ;;
    --label)     label="$2" ;;
    *)
      echo "open-pr: unknown arg: $1" >&2
      exit 1
      ;;
  esac
  shift 2
done

: "${slug:?--slug required}"
: "${title:?--title required}"
: "${body_file:?--body-file required}"

if [[ ! -f "$body_file" ]]; then
  echo "open-pr: body file not found: $body_file" >&2
  exit 1
fi

branch="feat/ai-ingest-${slug}"
# Repository name = last path component of the origin URL, minus a ".git" suffix.
repo="$(basename -s .git "$(git config --get remote.origin.url)")"

# --- 1. branch + commit + push (AGENTS.md rule 5: never commit to main) ----
# Create the branch; if creation fails (e.g. it already exists), switch to it.
if ! git switch -c "$branch" 2>/dev/null; then
  git switch "$branch"
fi

git add wiki/

# Both the "anything to commit?" check and the commit itself are limited to wiki/,
# so content staged elsewhere (stray hook, aborted earlier run) is never included.
if git diff --cached --quiet -- wiki/; then
  echo "open-pr: nothing staged under wiki/ — aborting" >&2
  exit 1
fi

git commit -m "$title" -- wiki/
git push -u origin "$branch"

# DRY_RUN: the local git work is complete; skip the Forgejo API (offline tests).
[[ -z "${DRY_RUN:-}" ]] || {
  echo "PR opened: DRY-RUN ${branch} -> ${base}"
  exit 0
}
# 2. Open the PR via Forgejo API (jq builds the JSON safely)
# TODO: Forgejo-only. When registry.sh/globals.env sets PROVIDER=github, branch on
# $PROVIDER here and delegate to providers/github.sh (same token + http_code contract).
body="$(cat "$body_file")"
payload="$(jq -n --arg head "$branch" --arg base "$base" \
  --arg title "$title" --arg body "$body" \
  '{head:$head, base:$base, title:$title, body:$body}')"

# A transport failure (DNS, timeout, refused connection) makes curl itself exit
# non-zero. Report it explicitly: with -s curl prints nothing, and `set -e` would
# otherwise end the script without any diagnostic for the caller.
resp="$(curl --max-time 30 -s -w '\n%{http_code}' \
  -H "Authorization: token ${FORGEJO_TOKEN}" \
  -H "Content-Type: application/json" \
  -X POST "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls" \
  -d "$payload")" \
  || { echo "open-pr: request to Forgejo API failed (curl exit $?)" >&2; exit 1; }

# curl -w appends '\n<code>' AFTER the body, so the code is always the final line and the
# body is everything before it. Parameter expansion (no subshells), robust to multi-line JSON.
code="${resp##*$'\n'}"
json="${resp%$'\n'*}"

case "$code" in
  201)
    url="$(printf '%s' "$json" | jq -r '.html_url')"
    # `number` is consumed by the optional label step that follows.
    number="$(printf '%s' "$json" | jq -r '.number')"
    echo "PR opened: ${url}"
    ;;
  409)
    # PR already exists — fetch it so the orchestrator still gets the URL.
    # The lookup is best-effort: under `pipefail` a failing curl/jq would abort the
    # script here, although this branch is meant to end with exit 0 either way.
    # `|| true` inside the substitution keeps whatever output was produced.
    existing="$(
      { curl --max-time 15 -s -H "Authorization: token ${FORGEJO_TOKEN}" \
          "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls?state=open" \
          | jq -r --arg b "$branch" '.[] | select(.head.ref==$b) | .html_url' \
          | head -n1; } || true
    )"
    if [[ -n "$existing" && "$existing" != "null" ]]; then
      echo "PR opened: ${existing}"
    else
      echo "open-pr: a PR for '${branch}' already exists (push updated the branch)." >&2
    fi
    exit 0
    ;;
  401)
    echo "open-pr: unauthorized — check FORGEJO_TOKEN (n8n-bot)." >&2
    exit 1
    ;;
  *)
    echo "open-pr: Forgejo API HTTP ${code}: ${json}" >&2
    exit 1
    ;;
esac
# 3. Optional label (e.g. CONFLICT). Best-effort; non-fatal.
# Nothing in this step may change the script's exit status: the PR already exists.
if [[ -n "$label" && -n "${number:-}" ]]; then
  # Resolve the label name to its id. `|| true` inside the substitution keeps a
  # failing curl/jq (pipefail) from aborting the script under `set -e`.
  label_id="$(
    { curl --max-time 15 -s -H "Authorization: token ${FORGEJO_TOKEN}" \
        "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/labels" \
        | jq -r --arg n "$label" '.[] | select(.name==$n) | .id' \
        | head -n1; } || true
  )"
  # Only a numeric id is spliced into the JSON payload below ("null"/empty = not found).
  if [[ "$label_id" =~ ^[0-9]+$ ]]; then
    # -f makes curl fail on HTTP >= 400, so "applied" is only reported when true.
    if curl --max-time 15 -s -f -o /dev/null \
         -H "Authorization: token ${FORGEJO_TOKEN}" -H "Content-Type: application/json" \
         -X POST "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/issues/${number}/labels" \
         -d "{\"labels\":[${label_id}]}"; then
      echo "label '${label}' applied" >&2
    else
      echo "open-pr: could not apply label '${label}' to PR #${number} — skipped." >&2
    fi
  else
    echo "open-pr: label '${label}' not found in repo — skipped." >&2
  fi
fi

View file

@ -0,0 +1,146 @@
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/run-ingest.sh
# Post-pi orchestrator. Runs OUTSIDE pi's loop, on vm101, in the genome checkout.
# Consumes .ingest-manifest.json (written by the ingest skill) and performs every
# deterministic step — index, log, scoped lint, PR — so pi's context stays clean.
#
#   run-ingest.sh <genome_name> [manifest_path]
#
# Emits a single JSON result line on stdout for n8n to parse.
# =============================================================================
set -euo pipefail

genome="${1:?usage: run-ingest.sh <genome> [manifest]}"
manifest="${2:-.ingest-manifest.json}"
# Directory holding this script and its sibling helper scripts.
SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# fail <stage> <reason> — print a JSON error object on stdout and exit 1.
fail() {
  jq -nc --arg stage "$1" --arg reason "$2" \
    '{status:"error", stage:$stage, reason:$reason}'
  exit 1
}

# jq is checked first with a hand-written JSON message, because fail() itself needs jq.
command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq missing"}'; exit 1; }
command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)"
[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"

# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) ---
# 1) well-formed JSON object with a string raw_source and an array of pages
jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \
  "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "invalid manifest: need object with string raw_source and array pages"

# 2) every page must be an object whose .path is a string, lives under wiki/, and
#    contains no '..' (no traversal).
#    The check is FAIL-CLOSED: jq must positively report "no offending page". Any jq
#    error (for instance a page that is not an object) also rejects the manifest —
#    a check of the form `if jq -e <has-bad-page>; then fail` would let such a
#    manifest through, together with any unsafe path it also contains.
jq -e '[.pages[]
        | (type=="object"
           and ((.path|type)=="string")
           and (.path|startswith("wiki/"))
           and (.path|contains("..")|not))
        | select(not)]
       | length == 0' "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "unsafe page path (must be a string under wiki/, no '..')"
# --- read manifest scalars ---
raw_source="$(jq -r '.raw_source' "$manifest")"
# model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent cannot know its
# own tag, so we do not trust a self-reported manifest field. Fall back only if unset.
model="${INGEST_MODEL:-$(jq -r '.model // "unknown"' "$manifest")}"
reasoning="$(jq -r '.reasoning // "Ingest."' "$manifest")"
pr_summary="$(jq -r '.pr_summary // "Ingest."' "$manifest")"
contradictions="$(jq -r '.contradictions // "None"' "$manifest")"
[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"

# The slug names both the log subject and the PR branch; slug.sh rejects empty results.
slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "empty or invalid slug for ${raw_source}"

# --- collect touched paths ---
mapfile -t created_paths < <(jq -r '.pages[] | select(.status=="created") | .path' "$manifest")
mapfile -t modified_paths < <(jq -r '.pages[] | select(.status=="modified") | .path' "$manifest")
# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A plain "${arr[@]}"
# aborts with "unbound variable" under `set -u` on bash 4.0–4.3 whenever one of the
# two lists is empty (e.g. an ingest that only created pages).
all_paths=( ${created_paths[@]+"${created_paths[@]}"} ${modified_paths[@]+"${modified_paths[@]}"} )
[[ ${#all_paths[@]} -gt 0 ]] || fail "manifest" "no pages reported"

# Set to "CONFLICT" by the index step when a queries/conflict-* page was created.
conflict_label=""

# NOTE: no rollback. Steps below mutate the working tree in order (index → log → commit).
# All are idempotent on re-run EXCEPT log-append (append-only). If a step fails midway,
# nothing is committed (open-pr is the only committer) — the operator re-runs, or inspects
# wiki/ if log-append already wrote a line. The manifest is removed only on full success.
# --- 1. index entries (created pages only), inserted in order ---
# Fields are separated by the ASCII unit separator (0x1f), not by a tab: tab is an
# IFS *whitespace* character, so `read` collapses consecutive tabs and an empty
# summary would shift the maturity value into the summary field. jq also folds
# CR/LF/TAB (and any stray 0x1f) inside a field into one space, so every page stays
# on exactly one input line.
while IFS=$'\x1f' read -r path summary maturity; do
  [[ -z "$path" ]] && continue
  link="${path#wiki/}"; link="${link%.md}"          # e.g. sources/foo
  folder="${link%%/*}"
  # Map the wiki folder to its section heading in wiki/index.md.
  case "$folder" in
    sources)  section="Sources" ;;
    entities) section="Entities" ;;
    concepts) section="Concepts" ;;
    queries)
      if [[ "$link" == queries/conflict-* ]]; then section="Conflicts"; conflict_label="CONFLICT"
      else section="Queries"; fi ;;
    *)        section="Sources" ;;                    # unknown folder: filed under Sources
  esac
  if [[ "$section" == "Conflicts" ]]; then
    entry="- [[${link}]]"                              # conflicts: slug only
  else
    entry="- [[${link}]] — ${summary} \`maturity: ${maturity}\`"
  fi
  python3 "${SCRIPTS}/index-append.py" --section "$section" --entry "$entry" \
    || fail "index" "index-append failed for ${path}"
done < <(jq -r '.pages[] | select(.status=="created")
           | [.path, (.summary // ""), (.maturity // "draft")]
           | map(tostring | gsub("[\r\n\t\u001f]+"; " "))
           | join("\u001f")' "$manifest")
# --- 2. log entry ---
# One wikilink per manifest page ("wiki/" prefix and ".md" suffix removed), comma-joined.
written_links="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"
bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
  --context "[[${raw_source}]]" --output "${written_links:-*(none)*}" --reasoning "$reasoning" \
  || fail "log" "log-append failed"

# --- 3. scoped linter (capture findings for the PR; never aborts the run) ---
lint_rc=0
lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" || lint_rc=$?

# --- 4. assemble the PR body (manifest tables + lint results) ---
body="$(mktemp)"
trap 'rm -f "$body"' EXIT          # temp file removed on every exit path
{
  printf '%s\n' "## Summary"
  echo "$pr_summary"
  printf '%s\n' "" "## Pages" "| Path | Status | Maturity |" "|------|--------|----------|"
  jq -r '.pages[] | "| `\(.path)` | \(.status) | \(.maturity // "draft") |"' "$manifest"
  printf '%s\n' "" "## Contradictions"
  echo "$contradictions"
  printf '%s\n' "" "## Scoped Lint (post-ingest)" '```'
  echo "$lint_out"
  printf '%s\n' '```'
} > "$body"

# --- 5. open the PR ---
pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" --base "${INGEST_BASE:-main}" )
if [[ -n "$conflict_label" ]]; then
  pr_args+=( --label "$conflict_label" )
fi
pr_rc=0
pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" || pr_rc=$?
# open-pr.sh announces the PR as "PR opened: <url>"; take the first such line.
pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)"

# --- final result line for n8n ---
if [[ $pr_rc -eq 0 ]]; then result_status=ok; else result_status=pr_failed; fi
if [[ $lint_rc -eq 0 ]]; then lint_clean_json=true; else lint_clean_json=false; fi
if [[ -n "$conflict_label" ]]; then conflict_json=true; else conflict_json=false; fi
jq -nc \
  --arg status "$result_status" \
  --arg slug "$slug" \
  --arg pr_url "$pr_url" \
  --argjson lint_clean "$lint_clean_json" \
  --argjson conflict "$conflict_json" \
  --arg detail "$pr_out" \
  '{status:$status, slug:$slug, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}'

# The manifest is overwritten by every run; it is deleted only after a fully
# successful run so that a stale manifest cannot be reprocessed by mistake.
# A failed PR step keeps the manifest and exits non-zero.
[[ $pr_rc -eq 0 ]] || exit 1
rm -f "$manifest"

View file

@ -0,0 +1,55 @@
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/scoped-lint.sh
# Validate ONLY the files passed on the command line (the pages touched in this
# session), using the functions sourced from lib/output.sh and lib/lint.sh.
#
#   KG_LIB_DIR=<framework lib dir> \
#     scoped-lint.sh <genome_name> wiki/sources/x.md wiki/entities/y.md
#
# Exit status is non-zero when the accumulated error count is not zero.
# A one-line summary goes to stdout after the per-file findings.
# =============================================================================
set -euo pipefail

: "${KG_LIB_DIR:?set KG_LIB_DIR to the framework lib/ dir (e.g. /opt/knowledge-genome-orchestrator/lib)}"

# Check the library files up front: a clear message beats `source: No such file`.
for _f in output.sh lint.sh; do
  if [[ ! -f "${KG_LIB_DIR}/${_f}" ]]; then
    echo "scoped-lint: missing ${KG_LIB_DIR}/${_f}" >&2
    exit 1
  fi
done

# shellcheck source=/dev/null
source "${KG_LIB_DIR}/output.sh"
# shellcheck source=/dev/null
source "${KG_LIB_DIR}/lint.sh"

genome="${1:?usage: scoped-lint.sh <genome> <file...>}"
shift
if [[ $# -eq 0 ]]; then
  echo "scoped-lint: no files given" >&2
  exit 1
fi

errors=0
stale=0
count=$#        # number of files requested (skipped ones included)

for f in "$@"; do
  if [[ ! -f "$f" ]]; then
    warn "scoped-lint: missing file (skipped): $f"
    continue
  fi

  # Each check's exit status is added to the running totals.
  # NOTE(review): this presumes the lint functions return a finding count — confirm in lib/lint.sh.
  fe=0;  lint_markdown_file "$f" "$genome" || fe=$?
  pce=0; check_privacy_consistency "$f"    || pce=$?
  pse=0; check_page_size "$f"              || pse=$?
  errors=$(( errors + fe + pce + pse ))

  st=0;  check_knowledge_decay "$f"        || st=$?
  stale=$(( stale + st ))

  check_broken_links "$f" || true           # warnings only
done

echo ""
echo "scoped-lint: ${errors} error(s), ${stale} stale across ${count} file(s)"

if [[ $errors -ne 0 ]]; then
  exit 1
fi

View file

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/slug.sh
# Derive a wiki slug from a path, filename, or title string.
# slug.sh "raw/articles/My Source.md" -> my-source
# slug.sh "Some Concept Name" -> some-concept-name
# =============================================================================
set -euo pipefail
input="${1:?usage: slug.sh <path-or-title>}"
# Strip directory and extension when given a path
base="${input##*/}"
base="${base%.*}"
slug="$(printf '%s\n' "$base" \
| tr '[:upper:]' '[:lower:]' \
| sed -E 's/[^a-z0-9]+/-/g; s/-{2,}/-/g; s/^-+//; s/-+$//')"
# An all-symbols input (e.g. "!!!.md") collapses to "" — refuse rather than emit a
# broken/empty slug that would produce an invalid branch name downstream.
[[ -n "$slug" ]] || { echo "slug: empty result for input '${input}'" >&2; exit 1; }
printf '%s\n' "$slug"

View file

@ -2,11 +2,11 @@
## Identity
| Field | Value |
|--------|-------|
| Genome | `{{GENOME_NAME}}` |
| Domain | `{{GENOME_DESC}}` |
| Owner | `{{FORGEJO_USER}}` |
| Field | Value |
| ------ | -------------------------------------------------- |
| Genome | `{{GENOME_NAME}}` |
| Domain | `{{GENOME_DESC}}` |
| Owner | `{{FORGEJO_USER}}` |
| Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{GENOME_NAME}}` |
**Role:** Wiki maintainer for `{{GENOME_NAME}}`.
@ -14,14 +14,28 @@
---
## Linked Project
| Field | Value |
| --------------- | --------------------- |
| Project repo | `{{LINKED_PROJECT}}` |
| Branch | `main` |
| Allowed tasks | `readme, tests, code` |
| Preferred model | `auto` |
If `Project repo` is `none`, this genome is knowledge-only — phase-2 project work
does not apply. When set, after a wiki PR is **merged**, the orchestrator may trigger
work on this repo within _Allowed tasks_. The agent never touches the project repo
during ingest.
## PRIVATE_CONTEXT
**Default: `disabled`** — never infer; require explicit operator declaration per session.
| State | Behavior |
|-------|----------|
| `disabled` | `raw/private/` and `wiki/private/` do not exist. No read, list, grep, or summary on private paths. All outputs safe for collaborators. |
| `enabled` | Operator has confirmed `git-crypt unlock` ran on host. Read/write `private/` authorized. All outputs from private data go exclusively to `wiki/private/`. Prefix every response drawing on private data: `[PRIVATE DATA INCLUDED]`. Never leak private synthesis into public wiki paths. |
| State | Behavior |
| ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `disabled` | `raw/private/` and `wiki/private/` do not exist. No read, list, grep, or summary on private paths. All outputs safe for collaborators. |
| `enabled` | Operator has confirmed `git-crypt unlock` ran on host. Read/write `private/` authorized. All outputs from private data go exclusively to `wiki/private/`. Prefix every response drawing on private data: `[PRIVATE DATA INCLUDED]`. Never leak private synthesis into public wiki paths. |
Pre-commit `PLAINTEXT LEAK DETECTED`: stop immediately. Do not use `--no-verify`. Ask operator to verify `.gitattributes` and encryption state.
@ -41,6 +55,7 @@ Session end or return to `disabled`: remind operator to run `git-crypt lock` on
8. Every PR must use `templates/pr-description.md`. Do not omit the tabular summary.
### NEVER
- Load `wiki/log.md` in full — read only the tail injected by the orchestrator.
- Rewrite `wiki/index.md` to reorder entries — append only; sorting is automated.
- Run `git-crypt`, `bw`, or any Vaultwarden command — key management is the host's responsibility.
@ -48,6 +63,7 @@ Session end or return to `disabled`: remind operator to run `git-crypt lock` on
- Merge PRs — human approval required.
### ASK FIRST
- Deleting any wiki page.
- Changing `maturity` from `stable` to `deprecated`.
- Writing to `wiki/private/` when PRIVATE_CONTEXT state is ambiguous.
@ -70,7 +86,8 @@ Execute in this order before any file operation:
## Workflows
### Ingest
*Triggered by new file in `raw/`.*
_Triggered by new file in `raw/`._
1. Read source once.
2. Create `wiki/sources/<slug>.md` — summary + key points.
@ -82,12 +99,14 @@ Execute in this order before any file operation:
8. Run scoped lint on pages created or modified in this session. Report issues in PR description. Do not auto-fix.
9. Commit on `feat/ai-ingest-<slug>`. Open PR using `templates/pr-description.md`.
*Private source* (`PRIVATE_CONTEXT: enabled` required):
_Private source_ (`PRIVATE_CONTEXT: enabled` required):
- All output → `wiki/private/<slug>.md` only.
- PR title: `[PRIVATE] ingest: <slug>`.
### Query
*Triggered by operator question.*
_Triggered by operator question._
1. `qmd search "<query>"` → identify candidate pages.
2. Read candidate pages directly.
@ -96,10 +115,11 @@ Execute in this order before any file operation:
5. Append entry to `wiki/index.md` under Queries.
6. Append log entry: `QUERY | <subject>`.
*For general orientation without a specific query: read `wiki/index.md` directly.*
_For general orientation without a specific query: read `wiki/index.md` directly._
### Lint
*Triggered by operator with bash pre-scan output.*
_Triggered by operator with bash pre-scan output._
Pre-requisite: operator runs `bash scripts/lint-genomes.sh` and provides output to this session.
The script handles deterministically: broken links, knowledge decay, page size, frontmatter validation.
@ -119,13 +139,14 @@ Append log entry: `LINT | <summary of findings>`.
## File Conventions
### Frontmatter
Required on every wiki page:
```yaml
---
title: "Strict String Title"
type: source | entity | concept | query | conflict | private
domain: {{GENOME_NAME}}
domain: {{GENOME_NAME}}
tags: [lowercase, hyphen-separated]
maturity: draft | stable | deprecated
last_updated: YYYY-MM-DD
@ -138,19 +159,25 @@ private: true | false
- `deprecated` — superseded. Add `> **DEPRECATED:** <reason>` callout at top of body.
### Links
- Internal: `[[folder/file]]` — Obsidian wikilinks only. Never `[text](url)` for internal refs.
- Cross-genome: `[[../genome-target/wiki/folder/file]]`.
- External: `[text](https://...)`.
### Index entries
Append at bottom of relevant section in `wiki/index.md`:
```
- [[folder/slug]] — One-line summary. `maturity: draft`
```
Never reorder. Alphabetical sort is handled by the pre-commit hook.
### Log entries
Append one entry per operation to `wiki/log.md`:
```markdown
## [YYYY-MM-DD] TYPE | Subject
@ -160,6 +187,7 @@ Append one entry per operation to `wiki/log.md`:
- output_written: `[[path/C]]`
- reasoning: One sentence — what changed and why.
```
Valid TYPEs: `INGEST` `LINT` `QUERY` `CONFLICT` `CONFIG` `SECURITY`
Parse: `grep "^## \[" wiki/log.md | tail -5`
@ -177,22 +205,26 @@ When new evidence contradicts an existing wiki claim:
---
title: "Conflict: <concept>"
type: conflict
domain: {{GENOME_NAME}}
domain: {{GENOME_NAME}}
maturity: draft
last_updated: YYYY-MM-DD
private: false
---
```
```markdown
## Conflict: <concept>
**Claim A (existing):** [[path/to/existing-page]]
> Summary of current wiki position.
**Claim B (new):** [[path/to/new-source]]
> Summary of contradicting evidence.
**Assessment:**
- Confidence A: high | medium | low — <reason>
- Confidence B: high | medium | low — <reason>
- Recommendation: `accept_b` | `keep_a` | `requires_human_review`
@ -212,20 +244,22 @@ private: false
- `maturity: draft` not updated in **90 days** → flag during lint.
Flagged pages: prepend to body:
```markdown
> **⚠️ STALE:** Last validated {{last_updated}}. Re-validation required.
```
Propose re-validation task. Do not change `maturity` without new source evidence.
---
## Collaboration
| Role | Access | Permitted |
|------|--------|-----------|
| Owner | Full — key holder | Read/write everywhere |
| Collaborator | No key | Push to `raw/articles`, `raw/transcripts`, `raw/code-packs`, `raw/assets` |
| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` |
| Cloud AI model | Public only | `PRIVATE_CONTEXT` must be `disabled`; never send private files outside local network |
| Role | Access | Permitted |
| -------------- | ----------------- | ------------------------------------------------------------------------------------ |
| Owner | Full — key holder | Read/write everywhere |
| Collaborator | No key | Push to `raw/articles`, `raw/transcripts`, `raw/code-packs`, `raw/assets` |
| Local AI agent | Conditional | `private/` only when `PRIVATE_CONTEXT: enabled` |
| Cloud AI model | Public only | `PRIVATE_CONTEXT` must be `disabled`; never send private files outside local network |
Grant collaborator: add as Forgejo contributor with Write role. Never share the git-crypt key.

View file

@ -2,10 +2,10 @@
## Identity
| Field | Value |
|--------|-------|
| Repo | `{{MASTER_REPO}}` |
| Owner | `{{FORGEJO_USER}}` |
| Field | Value |
| ------ | -------------------------------------------------- |
| Repo | `{{MASTER_REPO}}` |
| Owner | `{{FORGEJO_USER}}` |
| Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}` |
**Role:** Cross-genome coordinator for the Knowledge Genome network.
@ -32,14 +32,17 @@ Genome-level operations are governed by the genome's `AGENTS.md`, not this file.
## Global Security Rules
### PRIVATE_CONTEXT scope
- Toggle is **per-genome and per-session**. Enabling for `genome-finance` does NOT enable for `genome-dev`.
- Cloud LLM models: `PRIVATE_CONTEXT` must be `disabled` for all genomes. Private data never leaves the local network.
### Log sanitization
- Never print decrypted secrets, session tokens, or key contents to stdout or log files.
- Document only `run_id` and genome name — never the key value.
### Key management
- Key injection is the host's responsibility — executed before this session starts.
- Never write, suggest, or generate scripts that save `.key` files to disk.
@ -54,12 +57,14 @@ Genome-level operations are governed by the genome's `AGENTS.md`, not this file.
5. Per-genome `AGENTS.md` governs all wiki operations within that genome. This file governs boundaries only.
### NEVER
- Load multiple `wiki/index.md` files simultaneously for cross-genome comparison — use qmd.
- Run `git-crypt`, `bw`, or Vaultwarden commands — host responsibility.
- Modify files in more than one genome in the same operation.
- Modify `core-karpathy` in any way.
### ASK FIRST
- Any operation that touches two or more genomes.
- Updating submodule pointers in master.
- Any key rotation procedure.
@ -77,7 +82,8 @@ Genome-level operations are governed by the genome's `AGENTS.md`, not this file.
---
## Cross-Genome Lint
*Manual, monthly — requires operator initiation. Not automated.*
_Manual, monthly — requires operator initiation. Not automated._
1. Use `qmd search "<concept>"` to find pages covering the same concept across genomes.
2. Identify:

View file

@ -1,25 +1,31 @@
## Summary
<!-- One sentence: goal of this session and source processed. -->
## Pages Created
| Path | Type | Maturity |
|------|------|----------|
| `[[folder/slug]]` | entity / concept / source / query | draft |
| Path | Type | Maturity |
| ----------------- | --------------------------------- | -------- |
| `[[folder/slug]]` | entity / concept / source / query | draft |
## Pages Modified
| Path | Change |
|------|--------|
| Path | Change |
| ----------------- | ----------------------------------------- |
| `[[folder/slug]]` | Added cross-reference to `[[other/page]]` |
## Contradictions Found
- [ ] None
- [ ] `n` conflict file(s) created — listed below
## Private Data Accessed
- [ ] No — `PRIVATE_CONTEXT: disabled`
- [ ] Yes — `PRIVATE_CONTEXT: enabled` · outputs in `wiki/private/` only
## Scoped Lint (post-ingest)
- [ ] Frontmatter valid on all touched pages
- [ ] No broken wikilinks on touched pages
- [ ] No issues found

View file

@ -0,0 +1,45 @@
# {{MASTER_REPO}}
Master (umbrella) repository for the Knowledge Genome network.
| Field | Value |
| ---------- | -------------------------------------------------- |
| Owner | `{{FORGEJO_USER}}` |
| Remote | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}` |
| Scaffolded | `{{DATE}}` |
## What this repo is
This repository does **not** hold knowledge itself. It is the orchestrator: each genome
is a Git submodule, plus `core-karpathy` as a read-only reference pattern. Cross-genome
coordination rules live in `AGENTS.md`.
```text
{{MASTER_REPO}}/
├── core-karpathy/ ← reference pattern — read-only, never modify
├── genome-*/ ← one submodule per genome (own AGENTS.md, own git-crypt)
└── AGENTS.md ← cross-genome coordinator (boundaries only)
```
## Working with submodules
```bash
# Clone with all genomes
git clone --recurse-submodules {{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}.git
# Pull the latest pointers for every genome
git submodule update --remote --merge
# Operate inside a single genome (one genome at a time — see AGENTS.md)
cd genome-<name>
```
## Rules of the road
- Operate within **one genome at a time**; no commits spanning multiple genomes.
- `core-karpathy` is read-only.
- Never commit to `main` in a genome — PRs only, no self-merge.
- Private data (`**/private/**`) is git-crypt encrypted and never leaves the local network.
Genome-level operations are governed by each genome's own `AGENTS.md`. This README and the
master `AGENTS.md` govern boundaries only.

View file

@ -1,9 +1,9 @@
---
title: "Index — {{GENOME_NAME}}"
type: index
domain: {{GENOME_NAME}}
domain: {{GENOME_NAME}}
maturity: stable
last_updated: {{DATE}}
last_updated: {{DATE}}
private: false
---
@ -19,27 +19,28 @@ Entry format: `- [[folder/slug]] — One-line summary. \`maturity: <value>\``
---
## Sources (`wiki/sources/`)
*Ingested raw materials. One entry per processed source.*
_Ingested raw materials. One entry per processed source._
## Entities (`wiki/entities/`)
*People, organisations, tools, projects.*
_People, organisations, tools, projects._
## Concepts (`wiki/concepts/`)
*Theories, methodologies, patterns, architectural decisions.*
_Theories, methodologies, patterns, architectural decisions._
## Queries (`wiki/queries/`)
*Synthesised answers worth preserving. Archived explorations and analyses.*
_Synthesised answers worth preserving. Archived explorations and analyses._
## Conflicts Pending Review (`wiki/queries/conflict-*.md`)
*Created automatically when the agent detects contradictions between sources.*
*Do not summarise entries here — list slugs only to avoid surfacing unresolved claims.*
*Remove entry once the operator has resolved and closed the corresponding PR.*
_Created automatically when the agent detects contradictions between sources._
_Do not summarise entries here — list slugs only to avoid surfacing unresolved claims._
_Remove entry once the operator has resolved and closed the corresponding PR._
## Private Synthesis (`wiki/private/`)
*Restricted access. Requires `PRIVATE_CONTEXT: enabled` and unlocked repo.*
*List slug names ONLY. Do not append summaries — prevents metadata leakage.*
_Restricted access. Requires `PRIVATE_CONTEXT: enabled` and unlocked repo._
_List slug names ONLY. Do not append summaries — prevents metadata leakage._

View file

@ -1,9 +1,9 @@
---
title: "Operations Log — {{GENOME_NAME}}"
type: log
domain: {{GENOME_NAME}}
domain: {{GENOME_NAME}}
maturity: stable
last_updated: {{DATE}}
last_updated: {{DATE}}
private: false
---
@ -22,11 +22,13 @@ Append new entries at the bottom using the format defined below.
## Entry Format
### Required header (enables shell parsing):
```text
## [YYYY-MM-DD] TYPE | Subject or title
```
### Required metadata block for all agent-generated entries:
```markdown
- run_id: `<short-uuid or session-identifier>`
- model: `<model-name-and-version>`
@ -38,6 +40,7 @@ Append new entries at the bottom using the format defined below.
**Valid TYPEs:** `INGEST` | `LINT` | `QUERY` | `CONFLICT` | `CONFIG` | `SECURITY`
**Parse examples:**
```bash
# Last 5 entries
grep "^## \[" wiki/log.md | tail -5
@ -55,6 +58,6 @@ grep "^## \[2026-05" wiki/log.md
- run_id: `system-init`
- model: `setup-knowledge-genome.sh`
- context_read: *(none — initial scaffold)*
- context_read: _(none — initial scaffold)_
- output_written: `[[wiki/index.md]]`, `[[wiki/log.md]]`, `[[AGENTS.md]]`
- reasoning: Initial directory structure and encryption layer initialized by setup script.

56
tests/README.md Normal file
View file

@ -0,0 +1,56 @@
# Tests
Deterministic tests for the mechanical layer of the framework — **no LLM, no GPU, no
network**. They simulate pi's output with fixtures and exercise the scripts directly, so
they run anywhere (laptop, CI, a git hook). They do **not** belong on vm101 or in n8n.
## What's covered
| File | Covers |
|------|--------|
| `scripts.bats` | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) |
| `lint.bats` | `lib/lint.sh` validators + `scoped-lint.sh` reuse |
| `structure.bats` | `lib/structure.sh` report/sync |
| `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` |
`run-ingest.bats` auto-`skip`s if `jq` is missing; everything else needs only bash + git
(+ `python3` for the index tests).
## Install bats
```bash
# Debian/Ubuntu
sudo apt install bats
# or pinned, as a vendored submodule
git submodule add https://github.com/bats-core/bats-core.git test/bats
```
## Run
```bash
bats tests/ # whole suite
bats tests/lint.bats # one file
bats -f "sorted" tests/scripts.bats # filter by name
```
Each test builds its own throwaway genome under `BATS_TEST_TMPDIR` (auto-cleaned) with a
local bare git remote, so `open-pr.sh` run with the `DRY_RUN` environment variable set
(e.g. `DRY_RUN=1 open-pr.sh …`) can branch/commit/push without touching Forgejo.
## Makefile targets
```make
test:
@bats tests/
verify-structure:
@bash scripts/verify-genomes.sh
sync-structure:
@bash scripts/verify-genomes.sh --sync
```
## Note on `helpers.bash`
`FIXTURE_DIRS` in `helpers.bash` must match `GENOME_DIRS` in `lib/structure.sh`. If you
change the canonical layout, update both (the structure tests assume a clean baseline).

98
tests/helpers.bash Normal file
View file

@ -0,0 +1,98 @@
#!/usr/bin/env bash
# tests/helpers.bash — shared helpers for the bats suite.
# Absolute path of the directory one level above the running test file.
REPO_ROOT="$(cd "$BATS_TEST_DIRNAME/.." && pwd)"
# Libraries sourced by the suites.
LIB_DIR="$REPO_ROOT/lib"
# Scripts of the ingest skill that the suites execute.
SKILL_SCRIPTS="$REPO_ROOT/skills/ingest/scripts"

# Directories every fresh genome must contain.
# Keep this list aligned with lib/structure.sh.
FIXTURE_DIRS=(
  raw/articles
  raw/transcripts
  raw/code-packs
  raw/assets
  raw/private
  wiki/sources
  wiki/entities
  wiki/concepts
  wiki/queries
  wiki/private
)
# make_fixture_genome → prints the path of a throwaway genome checkout.
#
# Layout created under a fresh temp dir (inside BATS_TEST_TMPDIR when set, so
# bats cleans it up automatically; /tmp otherwise):
#   origin.git/  bare repository used as the `origin` remote
#   nohooks/     empty directory used as core.hooksPath
#   genome/      working tree: every FIXTURE_DIRS entry with a .gitkeep,
#                wiki/index.md, wiki/log.md and raw/articles/test.md,
#                committed on `main` and pushed to origin
#
# Outputs: the genome/ path on stdout (git's own stdout is discarded).
# Returns: 0 on success; 1 when the temp dir, the directory tree or any git
#          step cannot be set up. The explicit checks matter because callers
#          use G="$(make_fixture_genome)" and bash turns errexit off inside a
#          command substitution, so a half-built fixture would otherwise be
#          handed to the test without any error.
make_fixture_genome() {
  local base genome dir
  base="$(mktemp -d "${BATS_TEST_TMPDIR:-/tmp}/genome.XXXXXX")" || return 1
  git init --bare -q "${base}/origin.git" || return 1
  genome="${base}/genome"
  for dir in "${FIXTURE_DIRS[@]}"; do
    mkdir -p "${genome}/${dir}" || return 1
    touch "${genome}/${dir}/.gitkeep" || return 1
  done
  cat > "${genome}/wiki/index.md" <<'EOF'
---
title: "Index — genome-test"
type: index
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Master Index: genome-test
---
## Sources (`wiki/sources/`)
*Ingested raw materials.*
## Entities (`wiki/entities/`)
*People, tools.*
## Concepts (`wiki/concepts/`)
*Patterns.*
## Queries (`wiki/queries/`)
*Answers.*
## Conflicts Pending Review (`wiki/queries/conflict-*.md`)
*slugs only.*
EOF
  cat > "${genome}/wiki/log.md" <<'EOF'
---
title: "Operations Log — genome-test"
type: log
domain: genome-test
maturity: stable
last_updated: 2026-01-01
private: false
---
# Operations Log
---
## [2026-01-01] CONFIG | scaffolded
- run_id: `init`
EOF
  echo "raw test" > "${genome}/raw/articles/test.md"
  mkdir -p "${base}/nohooks" || return 1
  # Hermetic: ignore the user's global git config (signing, global hooks);
  # otherwise commit.gpgsign or a global core.hooksPath makes git commit fail here.
  # The steps are chained with && so the first failing one aborts the subshell
  # with a non-zero status instead of being skipped over.
  (
    cd "${genome}" &&
    git init -q &&
    git config commit.gpgsign false &&
    git config core.hooksPath "${base}/nohooks" &&
    git config user.email t@t &&
    git config user.name tester &&
    git add . &&
    git commit -qm init &&
    git branch -M main &&
    git remote add origin "${base}/origin.git" &&
    git push -q -u origin main
  ) >/dev/null || return 1
  echo "${genome}"
}

71
tests/lint.bats Normal file
View file

@ -0,0 +1,71 @@
#!/usr/bin/env bats
# tests/lint.bats — lib/lint.sh validators and the scoped-lint wrapper.
load helpers
# Per-test setup: load the libraries under test into the test shell.
# output.sh is sourced before lint.sh, the same order as before.
setup() {
  . "${LIB_DIR}/output.sh"
  . "${LIB_DIR}/lint.sh"
}
# write_page <path> <type> <domain>
# Writes a minimal wiki page to <path>: fixed frontmatter (title "T",
# tags [x], maturity draft, private: false), the given type and domain,
# today's date as last_updated, and a one-line body.
write_page() {
  local dest="$1" page_type="$2" page_domain="$3"
  printf '%s\n' \
    '---' \
    'title: "T"' \
    "type: ${page_type}" \
    "domain: ${page_domain}" \
    'tags: [x]' \
    'maturity: draft' \
    "last_updated: $(date +%F)" \
    'private: false' \
    '---' \
    'body' > "$dest"
}
@test "lint_markdown_file: a clean page passes (0 errors)" {
  local fixture target
  fixture="$(make_fixture_genome)"
  target="${fixture}/wiki/sources/good.md"
  # Type `source`, and a domain equal to the genome name given to the validator.
  write_page "$target" source genome-test
  run lint_markdown_file "$target" genome-test
  [ "$status" -eq 0 ]
}
@test "lint_markdown_file: invalid type + wrong domain are caught" {
  local fixture target
  fixture="$(make_fixture_genome)"
  target="${fixture}/wiki/sources/bad.md"
  # Type `banana`, and a domain that differs from the genome name being linted.
  write_page "$target" banana wrong-genome
  run lint_markdown_file "$target" genome-test
  [ "$status" -ne 0 ]
}
@test "check_privacy_consistency: a private/ file without 'private: true' fails" {
  local fixture leaked
  fixture="$(make_fixture_genome)"
  leaked="${fixture}/wiki/private/p.md"
  # The page lives under wiki/private/ but its frontmatter says
  # `private: false` → this is the leak the check must reject.
  write_page "$leaked" private genome-test
  run check_privacy_consistency "$leaked"
  [ "$status" -ne 0 ]
}
@test "check_page_size: a >800-line page errors" {
  local fixture oversized i
  fixture="$(make_fixture_genome)"
  oversized="${fixture}/wiki/sources/big.md"
  write_page "$oversized" source genome-test
  # Pad the page with 850 one-character lines.
  for ((i = 0; i < 850; i++)); do
    echo "x"
  done >> "$oversized"
  run check_page_size "$oversized"
  [ "$status" -ne 0 ]
}
@test "scoped-lint: aggregates findings and exits non-zero on errors" {
  local fixture
  fixture="$(make_fixture_genome)"
  write_page "${fixture}/wiki/sources/bad.md" banana wrong-genome
  # The wrapper runs from the genome root with a genome-relative page path.
  export KG_LIB_DIR="$LIB_DIR"
  cd "$fixture"
  run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test wiki/sources/bad.md
  [ "$status" -ne 0 ]
  [[ "$output" == *"error(s)"* ]]
}
@test "scoped-lint: a clean page passes (exit 0)" {
  local fixture
  fixture="$(make_fixture_genome)"
  write_page "${fixture}/wiki/sources/good.md" source genome-test
  # The wrapper runs from the genome root with a genome-relative page path.
  export KG_LIB_DIR="$LIB_DIR"
  cd "$fixture"
  run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test wiki/sources/good.md
  [ "$status" -eq 0 ]
}

173
tests/run-ingest.bats Normal file
View file

@ -0,0 +1,173 @@
#!/usr/bin/env bats
# tests/run-ingest.bats — end-to-end orchestrator test (no LLM, no network).
# Simulates pi's output (a source page + manifest) and runs the mechanical pass.
load helpers
@test "run-ingest: DRY_RUN end-to-end updates index + log and opens a dry PR" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"

  # Stand-in for the semantic pass pi would have done: one source page ...
  cat > wiki/sources/test-source.md <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-03
private: false
---
body
EOF
  # ... plus the manifest that describes it.
  cat > .ingest-manifest.json <<'EOF'
{
"raw_source": "raw/articles/test.md",
"model": "qwen3.5-9b",
"reasoning": "Ingested the test source.",
"pr_summary": "Ingest of test: 1 source page.",
"contradictions": "None",
"pages": [
{"path": "wiki/sources/test-source.md", "summary": "A smoke-test source.", "maturity": "draft", "status": "created"}
]
}
EOF

  # Placeholder Forgejo settings; the run is made with DRY_RUN=1.
  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL="http://forgejo.local"
  export FORGEJO_USER="u"
  export FORGEJO_TOKEN="t"
  export DRY_RUN=1
  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test

  # Reported result
  [ "$status" -eq 0 ]
  [[ "$output" == *'"status":"ok"'* ]]
  [[ "$output" == *'"lint_clean":true'* ]]
  [[ "$output" == *'"conflict":false'* ]]

  # Side effects on the working tree: index entry, log entry, ingest branch.
  grep -q 'sources/test-source' wiki/index.md
  grep -q 'INGEST | test' wiki/log.md
  git rev-parse --verify feat/ai-ingest-test
}
@test "run-ingest: a conflict page is labelled and lands in the Conflicts section" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"

  # A conflict page under wiki/queries/ and a manifest whose only page is it.
  cat > wiki/queries/conflict-pricing-2026-06-03.md <<'EOF'
---
title: "Conflict: pricing"
type: conflict
domain: genome-test
maturity: draft
last_updated: 2026-06-03
private: false
---
conflict body
EOF
  cat > .ingest-manifest.json <<'EOF'
{
"raw_source": "raw/articles/test.md",
"model": "m",
"reasoning": "Flagged a contradiction.",
"pr_summary": "Conflict on pricing.",
"contradictions": "1 conflict file created — pricing",
"pages": [
{"path": "wiki/queries/conflict-pricing-2026-06-03.md", "summary": "ignored", "maturity": "draft", "status": "created"}
]
}
EOF

  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL="http://forgejo.local"
  export FORGEJO_USER="u"
  export FORGEJO_TOKEN="t"
  export DRY_RUN=1
  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test

  [ "$status" -eq 0 ]
  [[ "$output" == *'"conflict":true'* ]]
  # The index must reference the conflict page by its slug.
  grep -q 'queries/conflict-pricing-2026-06-03' wiki/index.md
}
@test "run-ingest: records INGEST_MODEL in the log (manifest carries no model field)" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"

  cat > wiki/sources/test-source.md <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-04
private: false
---
body
EOF
  # New contract: the manifest has NO "model" key; the model name reaches the
  # orchestrator through the INGEST_MODEL environment variable instead.
  cat > .ingest-manifest.json <<'EOF'
{
"raw_source": "raw/articles/test.md",
"reasoning": "Ingested the test source.",
"pr_summary": "Ingest of test: 1 source page.",
"contradictions": "None",
"pages": [
{"path": "wiki/sources/test-source.md", "summary": "A smoke-test source.", "maturity": "draft", "status": "created"}
]
}
EOF

  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL="http://forgejo.local"
  export FORGEJO_USER="u"
  export FORGEJO_TOKEN="t"
  export DRY_RUN=1
  export INGEST_MODEL="qwen-test-tag"
  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test

  [ "$status" -eq 0 ]
  [[ "$output" == *'"status":"ok"'* ]]
  # The tag supplied via INGEST_MODEL must show up in the operations log.
  grep -q 'qwen-test-tag' wiki/log.md
}
@test "run-ingest: rejects a manifest path that escapes wiki/ (traversal)" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"

  # The single page path climbs out of wiki/ via "..".
  cat > .ingest-manifest.json <<'EOF'
{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None",
"pages":[{"path":"wiki/../etc/passwd","summary":"x","maturity":"draft","status":"created"}] }
EOF

  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL=http://x
  export FORGEJO_USER=u
  export FORGEJO_TOKEN=t
  export DRY_RUN=1
  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test

  [ "$status" -ne 0 ]
  [[ "$output" == *'"status":"error"'* ]]
}
@test "run-ingest: honours INGEST_BASE for the PR base" {
  if ! command -v jq >/dev/null 2>&1; then
    skip "jq not installed"
  fi
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"

  cat > wiki/sources/test-source.md <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-04
private: false
---
body
EOF
  cat > .ingest-manifest.json <<'EOF'
{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None",
"pages":[{"path":"wiki/sources/test-source.md","summary":"s","maturity":"draft","status":"created"}] }
EOF

  export KG_LIB_DIR="$LIB_DIR"
  export FORGEJO_URL=http://x
  export FORGEJO_USER=u
  export FORGEJO_TOKEN=t
  export DRY_RUN=1
  export INGEST_BASE="develop"
  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test

  [ "$status" -eq 0 ]
  # The requested base branch name must appear somewhere in the output.
  [[ "$output" == *"develop"* ]]
}

88
tests/scripts.bats Normal file
View file

@ -0,0 +1,88 @@
#!/usr/bin/env bats
# tests/scripts.bats — unit tests for the deterministic skill scripts.
load helpers
@test "slug: path with extension and spaces" {
  # Input carries a directory prefix, mixed case, spaces and a .md extension.
  local raw_path="raw/articles/My Test Source.md"
  run bash "$SKILL_SCRIPTS/slug.sh" "$raw_path"
  [ "$status" -eq 0 ]
  [ "$output" = "my-test-source" ]
}
@test "slug: punctuation and repeats collapse to single hyphens" {
  # "!!" followed by a space must produce one hyphen, not several.
  run bash "$SKILL_SCRIPTS/slug.sh" "Qualche Concetto!! Strano"
  # Check the exit status too, like the other slug tests: a correct slug
  # printed by a failing script must not count as a pass.
  [ "$status" -eq 0 ]
  [ "$output" = "qualche-concetto-strano" ]
}
@test "log-append: appends a well-formed INGEST entry with a run_id" {
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"
  local -a entry_args=(
    --type INGEST --subject foo --model m
    --context "[[raw/x]]" --output "[[sources/foo]]" --reasoning "why"
  )
  run bash "$SKILL_SCRIPTS/log-append.sh" "${entry_args[@]}"
  [ "$status" -eq 0 ]
  # The entry header, a run_id line and the model line must all be in the log.
  grep -q "INGEST | foo" wiki/log.md
  grep -q '^- run_id: `' wiki/log.md
  grep -q '^- model: `m`' wiki/log.md
}
@test "log-append: rejects an invalid TYPE" {
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"
  # BOGUS must be refused as an entry type.
  run bash "$SKILL_SCRIPTS/log-append.sh" --type BOGUS --subject foo
  [ "$status" -ne 0 ]
}
@test "index-append: inserts under the right section and keeps it sorted" {
  G="$(make_fixture_genome)"; cd "$G"
  # Insert out of order (zzz first) so that sorting is actually exercised.
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/zzz]] — z. `maturity: draft`'
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/aaa]] — a. `maturity: draft`'
  # Line numbers of the two entries in the index.
  a=$(grep -n 'sources/aaa' wiki/index.md | cut -d: -f1)
  z=$(grep -n 'sources/zzz' wiki/index.md | cut -d: -f1)
  # One assertion per line: in `A && B` a failing A is exempt from errexit,
  # so a combined check would not fail the test at this point.
  [ -n "$a" ]
  [ -n "$z" ]
  [ "$a" -lt "$z" ]
}
@test "index-append: bumps frontmatter last_updated to today" {
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"
  local new_entry='- [[concepts/x]] — x. `maturity: draft`'
  python3 "$SKILL_SCRIPTS/index-append.py" --section Concepts --entry "$new_entry"
  # The fixture index starts at 2026-01-01; it must now carry today's date.
  grep -q "^last_updated: $(date +%F)$" wiki/index.md
}
@test "index-append: is idempotent for the same entry" {
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"
  local same_entry='- [[sources/dup]] — d. `maturity: draft`'
  # Append the identical entry twice ...
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry "$same_entry"
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry "$same_entry"
  # ... and expect exactly one matching line.
  local hits
  hits="$(grep -c 'sources/dup' wiki/index.md)"
  [ "$hits" -eq 1 ]
}
@test "index-append: updates an existing entry by wikilink path (no duplicate)" {
  G="$(make_fixture_genome)"; cd "$G"
  # Same wikilink path twice, with a different summary and maturity.
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — old summary. `maturity: draft`'
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry '- [[sources/foo]] — new summary. `maturity: stable`'
  [ "$(grep -c 'sources/foo' wiki/index.md)" -eq 1 ]
  grep -q 'new summary' wiki/index.md
  # A bare `! grep …` never trips errexit, so it only fails the test while it
  # is the very last command. Going through `run` keeps the check effective
  # wherever it sits; grep exits 1 exactly when nothing matched.
  run grep -q 'old summary' wiki/index.md
  [ "$status" -eq 1 ]
}
@test "slug: refuses an all-symbols input (no empty slug)" {
  # Nothing slug-worthy is left once the symbols and the extension are dropped.
  run bash "$SKILL_SCRIPTS/slug.sh" "!!!.md"
  [ "$status" -ne 0 ]
  # Any output that is printed must not contain an ingest branch name.
  if [ -n "$output" ]; then
    [[ "$output" != *"feat/ai-ingest-"* ]]
  fi
}
@test "index-append: self-heals a frontmatter missing last_updated" {
  local fixture
  fixture="$(make_fixture_genome)"
  cd "$fixture"
  # Replace the fixture index with one whose frontmatter has no last_updated key.
  cat > wiki/index.md <<'EOF'
---
title: "Index"
type: index
domain: genome-test
maturity: stable
private: false
---
# Index
## Sources (`wiki/sources/`)
*x*
EOF
  local new_entry='- [[sources/foo]] — s. `maturity: draft`'
  python3 "$SKILL_SCRIPTS/index-append.py" --section Sources --entry "$new_entry"
  # The key must have been added, set to today's date.
  grep -q "^last_updated: $(date +%F)$" wiki/index.md
}

40
tests/structure.bats Normal file
View file

@ -0,0 +1,40 @@
#!/usr/bin/env bats
# tests/structure.bats — canonical-structure verify/sync.
load helpers
# Per-test setup: load the libraries under test into the test shell.
# output.sh is sourced before structure.sh, the same order as before.
setup() {
  . "${LIB_DIR}/output.sh"
  . "${LIB_DIR}/structure.sh"
}
@test "structure_report: a full fixture has no drift" {
  local fixture
  # An untouched fixture contains every directory listed in FIXTURE_DIRS.
  fixture="$(make_fixture_genome)"
  run structure_report "$fixture"
  [ "$status" -eq 0 ]
}
@test "structure_report: flags a missing canonical dir" {
  local fixture
  fixture="$(make_fixture_genome)"
  # Remove one canonical directory; the report must fail and name it.
  rm -rf "${fixture}/wiki/private"
  run structure_report "$fixture"
  [ "$status" -ne 0 ]
  [[ "$output" == *"wiki/private"* ]]
}
@test "structure_report: notes an extra dir but does not fail on it" {
  local fixture
  fixture="$(make_fixture_genome)"
  # Add a directory that is not in the canonical list: it should be
  # mentioned in the output while the report still succeeds.
  mkdir -p "${fixture}/wiki/experiments"
  run structure_report "$fixture"
  [ "$status" -eq 0 ]
  [[ "$output" == *"experiments"* ]]
}
@test "structure_sync: creates missing dirs and is idempotent" {
  G="$(make_fixture_genome)"
  rm -rf "$G/wiki/private" "$G/raw/transcripts"
  structure_sync "$G"
  # One assertion per line: in `A && B` a failing A is exempt from errexit,
  # so a combined check would let a missing wiki/private slip through here.
  [ -d "$G/wiki/private" ]
  [ -d "$G/raw/transcripts" ]
  run structure_report "$G"
  [ "$status" -eq 0 ]
  structure_sync "$G" # second run: nothing to do, must still succeed
}