diff --git a/Makefile b/Makefile index 3366cb9..feae5b5 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ # ============================================================================= -# Knowledge Genome - Makefile v. 0.1.0 +# Knowledge Genome - Makefile v. 0.2.0 # Orchestrates the setup and management of the knowledge base. # ============================================================================= -include config.env -export $(shell sed 's/=.*//' config.env) +include globals.env +export $(shell grep -v '^[#[:space:]]' globals.env | sed 's/=.*//') -.PHONY: setup add-genome status lint clean help +.PHONY: setup add-genome status lint lock doctor sync help help: @echo "Available commands:" @@ -14,6 +14,9 @@ help: @echo " make add-genome - Register and scaffold a new genome" @echo " make status - Check submodule and encryption status" @echo " make lint - Verify schema, privacy flags, and metadata" + @echo " make lock - Lock all encrypted files across all genomes" + @echo " make doctor - Verify all required tools are installed" + @echo " make sync - Sync submodules and report unpushed commits" lint: @bash scripts/lint-genomes.sh @@ -34,3 +37,25 @@ status: @git submodule status @echo "--- Encryption Status (First 10 files) ---" @git-crypt status | head -n 10 + +doctor: + @echo "Checking required tools..." + @command -v git >/dev/null 2>&1 || { echo " MISSING: git"; exit 1; } + @command -v git-crypt >/dev/null 2>&1 || { echo " MISSING: git-crypt"; exit 1; } + @command -v curl >/dev/null 2>&1 || { echo " MISSING: curl"; exit 1; } + @command -v jq >/dev/null 2>&1 || { echo " MISSING: jq"; exit 1; } + @command -v bw >/dev/null 2>&1 || echo " OPTIONAL: bw (Bitwarden CLI) not found — key injection will be manual." + @echo "System ready." + +sync: + @echo "Syncing submodules..." + @git submodule update --init --recursive + @echo "--- Unpushed commits per genome ---" + @git submodule foreach 'git log --oneline @{u}.. 
2>/dev/null | head -5 || true' + +lock: + @echo "Locking master repository..." + @git-crypt lock 2>/dev/null || true + @echo "Locking all submodules..." + @git submodule foreach 'git-crypt lock 2>/dev/null || true' + @echo "All genomes securely locked." diff --git a/README.md b/README.md index eedcf3f..c304199 100644 --- a/README.md +++ b/README.md @@ -1,201 +1,200 @@ # Knowledge Genome System -> A distributed, modular, and secure personal knowledge base architecture. +> A distributed, modular, and secure personal knowledge base — no vector database required. -The **Knowledge Genome System** is a framework designed to manage personal knowledge using a "Master-Genome" architecture. It follows the LLM-Wiki patterns (Karpathy-style) while adding a robust security layer for sensitive data and automated quality control. +The **Knowledge Genome System** implements the [LLM Wiki pattern](https://gist.github.com/karpathy/442a6bf555914893e9891c11519de94f) +by Andrej Karpathy, extended with a multi-domain submodule architecture, git-crypt +encryption for sensitive data, and a human-in-the-loop Git Flow for quality control. --- -# Architecture +## Core Philosophy -This project is structured as a **Master Orchestrator** that manages multiple independent **Genomes** via Git Submodules. +Most RAG systems make the LLM rediscover knowledge from scratch on every query. +This system is different: the LLM **incrementally builds and maintains a persistent wiki** +that sits between you and the raw sources. Knowledge is compiled once and kept current — +not re-derived on every question. -## Core Components +**This means: no vector database, no embedding pipeline, no external retrieval server.** +The `wiki/index.md` of each genome is the retrieval layer. At moderate scale +(~100 sources, hundreds of pages) this works better than RAG because cross-references, +contradictions, and syntheses are already resolved — the LLM doesn't have to piece +them together at query time. 
-### Master Repository - -Contains: - -* Orchestration scripts -* Global configuration (`config.env`) -* Security templates - -### Genomes - -Individual specialized repositories (e.g. `genome-dev`, `genome-finance`) that act as standalone units of knowledge. - -### Security Layers - -#### Physical Security - -`git-crypt` encrypts `private/` directories at rest. - -#### Logical Security - -YAML frontmatter (`private: true`) prevents AI agents from leaking sensitive data during public sessions. - -#### Validation Layer - -A custom linting engine ensures metadata consistency. +If the wiki grows beyond what the index can navigate efficiently, the only recommended +search extension is [`qmd`](https://github.com/tobi/qmd) — a local, on-device +BM25 + vector search engine for markdown files with an MCP server interface. +No external infrastructure required. --- -# Quick Start +## Architecture + +```text +master-knowledge-genome/ ← Root orchestrator +├── core-karpathy/ ← LLM Wiki reference pattern (read-only submodule) +├── genome-dev/ ← Submodule: web dev, Angular, TUI +├── genome-finance/ ← Submodule: personal finance +├── genome-homelab/ ← Submodule: Keru infrastructure +└── AGENTS.md ← Global coordination schema +``` + +Each genome is an independent repository with this structure: +```text +genome-{name}/ +├── raw/ +│ ├── articles/ transcripts/ code-packs/ assets/ ← Plaintext, open to collaborators +│ └── private/ ← AES-256-CTR encrypted (git-crypt) +├── wiki/ +│ ├── index.md log.md ← Navigation and audit trail +│ ├── sources/ entities/ concepts/ queries/ ← Agent-maintained knowledge +│ └── private/ ← AES-256-CTR encrypted (git-crypt) +└── AGENTS.md ← Per-genome agent contract +``` + +--- ## Prerequisites -Required dependencies: +**Required:** +- `git` +- `git-crypt` +- `curl` +- `jq` -* `git` -* `git-crypt` -* `curl` -* `jq` +**Optional:** +- `bw` (Bitwarden CLI) — for runtime key injection from Vaultwarden without writing keys to disk -Optional: - -* `bw` (Bitwarden 
CLI) — used for runtime key injection +Install on Ubuntu/Debian: +```bash +sudo apt update && sudo apt install -y git git-crypt curl jq +``` --- -## Initialization +## Quick Start ```bash -# 1. Clone the master repository -git clone && cd master-knowledge-genome +# 1. Clone this setup repository +git clone <repo-url> knowledge-genome-setup +cd knowledge-genome-setup -# 2. Run the full setup -# (checks dependencies, creates master scaffold, -# initializes genomes) +# 2. Export your Forgejo token +export FORGEJO_TOKEN="your_token_here" + +# 3. Run full setup make setup ``` -# Management Commands +`make setup` will: +- Check all dependencies +- Create the master and genome repositories on Forgejo +- Scaffold the local directory structure with git-crypt active on `private/` +- Install the pre-commit security hook in each genome +- Export the symmetric git-crypt keys to `keys/` -The system is controlled through a centralized Makefile. +--- -| Command | Description | -| ----------------- | -------------------------------------------------------------- | -| `make setup` | Full system initialization (Master + Registry Genomes). | -| `make add-genome` | Scaffolds and registers a new genome (requires NAME and DESC). | -| `make lint` | Runs the validation suite across all genomes. | -| `make status` | Checks Git status and encryption state for all submodules. | +## Management Commands -# Validation & Linting (`make lint`) +| Command | Description | +|---------|-------------| +| `make setup` | Full system initialisation (master + all genomes defined in `registry.sh`) | +| `make add-genome NAME=x DESC="y"` | Scaffold and register a new genome | +| `make lint` | Validate schema, privacy flags, and metadata across all genomes | +| `make status` | Show git submodule status and first 10 git-crypt encryption states | +| `make help` | Show all available targets | -The built-in linter ensures that the knowledge base remains machine-readable and secure. 
- -It automatically validates: - -## Frontmatter Integrity - -Every `.md` file must contain valid YAML headers. - -## Domain Consistency - -Ensures that a file's domain metadata matches its parent genome. - -## Privacy Leak Detection - -Critical validation step. - -Verifies that any file located in a `/private/` directory contains the flag: - -```yaml -private: true +**Adding a new genome example:** +```bash +make add-genome NAME=genome-research DESC="Academic papers, deep-dives, open research" ``` -This prevents accidental exposure during AI sessions. +--- -## Broken Wiki-Links +## Security Model -Detects dead `[[internal-links]]`. +### Hybrid Privacy Architecture -# Security Model +Each genome has two layers: -## Hybrid Privacy Architecture +| Layer | Directories | Access | +|-------|-------------|--------| +| Public | `raw/articles/`, `raw/transcripts/`, `wiki/sources/`, `wiki/concepts/` | Plaintext — safe for collaborators | +| Private | `raw/private/`, `wiki/private/` | AES-256-CTR via git-crypt — owner only | -Each genome is divided into two layers. +On the remote (Forgejo), private files are opaque binary blobs. +Collaborators without the key can contribute normally to public directories +— git handles the encrypted files transparently with no errors. -### Public Layer +### Runtime Key Injection -Directories: - -```text -raw/public/ -wiki/public/ -``` - -Characteristics: - -* Plaintext -* Shareable with collaborators - -### Private Layer - -Directories: - -```text -raw/private/ -wiki/private/ -``` - -Characteristics: - -* Encrypted using AES-256 via `git-crypt` - -## Runtime Key Injection - -To keep the AI environment secure, encryption keys are never stored on the VM disk. - -Instead, the system uses Bitwarden (`bw`) / Vaultwarden for runtime injection. - -### Example +Encryption keys are never stored as persistent files on the AI server. 
+They are injected at session start via the Bitwarden CLI (`bw`) against +your self-hosted Vaultwarden instance, using process substitution: ```bash -# Unlock a genome using a key stored in Vaultwarden +# Key lives only in a kernel file descriptor — never touches disk git-crypt unlock <( - bw get notes "genome-dev key" \ - --session "$BW_SESSION" | base64 -d + bw get notes "genome-dev key" --session "$BW_SESSION" | base64 -d ) ``` -# Genome Schema +**Use `bw` (standard Bitwarden CLI), not `bws`.** +`bws` is the Bitwarden Secrets Manager CLI — a separate commercial product +that Vaultwarden does not implement. -All wiki documents follow a strict schema to support AI ingestion. +### Pre-commit Hook -## YAML Frontmatter Schema +A security hook is installed in every genome's `.git/hooks/pre-commit`. +It inspects every staged file: if any file in `raw/private/` or `wiki/private/` +is not encrypted by git-crypt, the commit is blocked with a clear error message +explaining how to fix the issue. -```yaml ---- -title: "Document Title" -type: entity | concept | source | log -domain: genome-name -private: true/false -last_updated: YYYY-MM-DD ---- +### Key Rotation + +If a key is lost or compromised: +```bash +source lib/git-crypt.sh +cd ~/knowledge-genome-setup/genome-dev +gcrypt_rotate_key "genome-dev" ``` +The function decrypts all private files, generates a new key, re-encrypts, +and prints instructions for updating Vaultwarden. -# Agent Interaction +--- -When starting a session with an AI agent, always declare the privacy context. +## Agent Interaction -## Public Context +At the start of every AI session, declare the privacy context explicitly: ```text PRIVATE_CONTEXT: disabled ``` - -Behavior: - -* The agent ignores all private folders. - -## Private Context +The agent ignores all `private/` directories. Outputs are safe to share. ```text PRIVATE_CONTEXT: enabled ``` +The agent processes encrypted data. Requires the genome to be unlocked. 
+All outputs referencing private data are prefixed with `[PRIVATE DATA INCLUDED]`. -Behavior: +--- -* The agent processes encrypted data. -* Requires the repository to be unlocked. +## Knowledge Quality + +The system includes three quality mechanisms drawn directly from the LLM Wiki pattern: + +**Conflict Resolution** — when new evidence contradicts existing wiki content, +the agent creates a `wiki/queries/conflict-*.md` node instead of silently overwriting. +Human review required before merging. + +**Knowledge Decay** — pages with `maturity: stable` not updated in 6 months, +and `maturity: draft` pages not updated in 3 months, are flagged during lint passes +with a `⚠️ STALE` callout. The agent proposes re-validation but does not change +maturity without new source evidence. + +**Cross-Genome Lint** — once a month, a manual session passes the aggregated index +of all genomes to the agent to detect concept duplication and missing cross-references. +No automated LLM controller in CI/CD — the cost in tokens and complexity is not +justified at this scale. diff --git a/config.env b/config.env deleted file mode 100644 index 56e176e..0000000 --- a/config.env +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash -# ============================================================================= -# config.env -# Single Source of Truth for the Knowledge Genome Framework. -# ============================================================================= - -# --- PROVIDER SELECTION --- -PROVIDER="forgejo" # Options: "forgejo", "github" - -# --- FORGEJO CONFIGURATION --- -FORGEJO_URL="https://git.keruhomelab.com" -FORGEJO_USER="keru" -# Note: FORGEJO_TOKEN must be exported in your shell for security. 
- -# --- VAULTWARDEN CONFIGURATION --- -# Used for rendering template instructions -VAULTWARDEN_URL="https://vault.keruhomelab.com" - -# --- MASTER REPOSITORY --- -MASTER_REPO="master-knowledge-genome" -GIST_URL="https://gist.github.com/442a6bf555914893e9891c11519de94f.git" - -# --- GENOME REGISTRY --- -# Format: "name|description" -GENOMES=( - "genome-dev|Web development, TUI, Angular, software architecture" - "genome-finance|Personal finance, investments, market analysis" - "genome-homelab|Keru infrastructure, network configs, architecture logs" -) - -# --- SYSTEM PATHS --- -WORK_DIR="${HOME}/knowledge-genome-setup" -KEYS_DIR="${WORK_DIR}/keys" - -# Core directory resolution (DO NOT CHANGE) -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -TEMPLATES_DIR="${SCRIPT_DIR}/templates" -LIB_DIR="${SCRIPT_DIR}/lib" -PROVIDERS_DIR="${SCRIPT_DIR}/providers" diff --git a/globals.env b/globals.env new file mode 100644 index 0000000..d3a6996 --- /dev/null +++ b/globals.env @@ -0,0 +1,25 @@ +# ============================================================================= +# globals.env +# Static configuration — pure KEY=VALUE. +# Safe to: make include, docker-compose, env parsers, shell source. +# ============================================================================= + +# --- PROVIDER SELECTION --- +PROVIDER=forgejo + +# --- FORGEJO --- +FORGEJO_URL=https://git.keruhomelab.com +FORGEJO_USER=keru +FORGEJO_SSH_PORT=222 + +# --- GITHUB (used when PROVIDER=github) --- +# GITHUB_USER=your-username +# GITHUB_ORG=your-org # Optional: set only for org repos; overrides GITHUB_USER +# Note: GITHUB_TOKEN must be exported in your shell for security. 
+ +# --- VAULTWARDEN --- +VAULTWARDEN_URL=https://vault.keruhomelab.com + +# --- MASTER REPOSITORY --- +MASTER_REPO=master-knowledge-genome +GIST_URL=https://gist.github.com/442a6bf555914893e9891c11519de94f.git diff --git a/lib/deps.sh b/lib/deps.sh index 078f1af..5e46a86 100644 --- a/lib/deps.sh +++ b/lib/deps.sh @@ -16,10 +16,10 @@ check_deps() { if [[ ${#missing[@]} -gt 0 ]]; then error "Missing required tools: ${missing[*]}" - echo -e "\nInstall them using your package manager:" - echo " Debian/Ubuntu: sudo apt install ${missing[*]}" - echo " MacOS: brew install ${missing[*]}" - exit 1 + printf "\nInstall them using your package manager:\n" + printf " Debian/Ubuntu: sudo apt install %s\n" "${missing[*]}" + printf " MacOS: brew install %s\n" "${missing[*]}" + return 1 fi success "Environment check passed: all required tools found." diff --git a/lib/git-crypt.sh b/lib/git-crypt.sh index 3877c0c..f35a342 100644 --- a/lib/git-crypt.sh +++ b/lib/git-crypt.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # ============================================================================= # lib/git-crypt.sh -# git-crypt lifecycle management (init, export, verify). +# git-crypt lifecycle management (init, export, verify, rotate). # ============================================================================= gcrypt_init() { @@ -16,7 +16,7 @@ gcrypt_export_key() { mkdir -p "${KEYS_DIR}" git-crypt export-key "$key_path" success "Symmetric key exported to: $key_path" - warn "Action required: Store this key in Vaultwarden and remove it from local disk." + warn "SECURITY ALERT: Move this key to Vaultwarden and delete it from disk immediately." } gcrypt_verify() { @@ -26,21 +26,135 @@ gcrypt_verify() { info "Verifying git-crypt status for ${genome_name}..." git-crypt lock - # Checking if the private marker is still encrypted (binary check) if file "raw/private/.gitkeep" 2>/dev/null | grep -q "data"; then success "Encryption verified: private/ directory is protected." 
else - warn "Encryption check inconclusive. Please run 'git-crypt status' manually." + warn "Encryption check inconclusive. Run 'git-crypt status' manually." fi [[ -f "$key_path" ]] && git-crypt unlock "$key_path" } +# --------------------------------------------------------------------------- +# gcrypt_rotate_key +# Rotates the git-crypt symmetric key for the current genome directory. +# +# WHAT THIS DOES: +# 1. Unlocks the repo with the existing key (working tree is decrypted). +# 2. Removes the old key material from .git/git-crypt/keys/. +# 3. Runs git-crypt init to generate a new symmetric key. +# 4. Stages and commits private files — they are re-encrypted with the new key. +# 5. Exports the new key to KEYS_DIR for Vaultwarden upload. +# +# WHAT THIS DOES NOT DO (limitation): +# Git history still contains blobs encrypted with the OLD key. Anyone who +# has the old key and access to the git history can still decrypt those blobs. +# To purge old encrypted blobs from history entirely, run git-filter-repo +# separately after this function completes (manual step — not automated here +# because it rewrites all commit hashes and requires force-pushing). +# +# USAGE: +# source lib/git-crypt.sh +# cd ~/knowledge-genome-setup/genome-dev +# gcrypt_rotate_key "genome-dev" +# +# REQUIRES: +# - The old key file at KEYS_DIR/.key OR the repo is already unlocked. +# - Clean working tree (no uncommitted changes outside private/). +# --------------------------------------------------------------------------- +gcrypt_rotate_key() { + local genome_name="$1" + local old_key_path="${KEYS_DIR}/${genome_name}.key" + local new_key_name="${genome_name}-rotated-$(date +%Y%m%d)" + + step "Key rotation: ${genome_name}" + + warn "SCOPE: this rotates the key for future commits only." + warn " Old git history retains blobs encrypted with the previous key." + warn " See function header in git-crypt.sh for full purge instructions." + echo "" + + # 1. 
Unlock with old key (if not already unlocked) + if git-crypt status 2>/dev/null | grep -q "encrypted"; then + info "Repository appears to be locked. Attempting unlock..." + if [[ -f "$old_key_path" ]]; then + git-crypt unlock "$old_key_path" + success "Unlocked with existing key." + else + error "Old key not found at: ${old_key_path}" + error "Unlock manually before rotating: git-crypt unlock /path/to/${genome_name}.key" + return 1 + fi + else + info "Repository is already unlocked — proceeding." + fi + + # 2. Ensure working tree is clean (private files excluded — they will be re-staged) + if ! git diff --quiet -- ':!raw/private' ':!wiki/private' 2>/dev/null; then + error "Working tree has uncommitted changes outside private/. Commit or stash them first." + return 1 + fi + + # 3. Remove old key material only (preserves .git/git-crypt/ structure) + info "Removing old key material..." + rm -rf .git/git-crypt/keys + success "Old key material removed." + + # 4. Re-initialize git-crypt (generates a new symmetric key) + info "Initializing new symmetric key..." + git-crypt init + success "New key generated." + + # 5. Re-stage private files so they are committed encrypted with the new key + local staged=0 + if compgen -G "raw/private/*" > /dev/null 2>&1; then + git add raw/private/ + staged=1 + fi + if compgen -G "wiki/private/*" > /dev/null 2>&1; then + git add wiki/private/ + staged=1 + fi + + if [[ $staged -eq 1 ]]; then + # Exclude .gitkeep-only commits — only commit if real content exists + if ! git diff --cached --quiet; then + git commit -m "security: rotate git-crypt key for ${genome_name}" + success "Private files re-committed with new key." + else + info "Only .gitkeep files in private/ — no content commit needed." + fi + else + info "No private files found to re-encrypt." + fi + + # 6. Export new key + gcrypt_export_key "$new_key_name" + + echo "" + success "Key rotation complete for: ${genome_name}" + echo "" + warn "NEXT STEPS:" + echo " 1. 
Push the new commit: git push origin main" + echo " 2. Upload the new key to Vaultwarden:" + echo " base64 < ${KEYS_DIR}/${new_key_name}.key" + echo " → Secure Note name: \"${genome_name} key\" (replace existing)" + echo " 3. Delete both key files from disk:" + echo " rm ${KEYS_DIR}/${genome_name}.key" + echo " rm ${KEYS_DIR}/${new_key_name}.key" + echo " 4. Revoke access from any previous key holders." + echo " 5. For full history purge (removes old encrypted blobs from git history):" + echo " git filter-repo --invert-paths --path raw/private --path wiki/private" + echo " git push --force origin main" + echo " (⚠ rewrites all commit hashes — coordinate with any collaborators)" + echo "" +} + gcrypt_print_key_instructions() { local genome_name="$1" local v_url="${VAULTWARDEN_URL:-https://your-vaultwarden.com}" - echo -e "\n ── ${BOLD}Key Management: ${genome_name}${NC} ──\n" + printf "\n ── %b ──\n\n" "${BOLD}Key Management: ${genome_name}${NC}" echo " 1. Encode the key to base64:" echo " base64 < ${KEYS_DIR}/${genome_name}.key" echo "" @@ -48,9 +162,16 @@ gcrypt_print_key_instructions() { echo " Name: \"${genome_name} key\"" echo " Note: " echo "" - echo " 3. For AI Server / Runtime Injection:" - echo " export BW_SESSION=\$(bw unlock --raw)" + echo " 3. Delete from disk:" + echo " rm ${KEYS_DIR}/${genome_name}.key" + echo "" + echo " 4. Runtime injection on AI server (no key on disk):" + echo " bw config server ${v_url}" + echo " export BW_SESSION=\$(bw unlock --passwordenv BW_MASTER_PASSWORD --raw)" echo " git-crypt unlock <(bw get notes \"${genome_name} key\" --session \"\$BW_SESSION\" | base64 -d)" + echo "" + echo " NOTE: use 'bw' (standard Bitwarden CLI), NOT 'bws'." + echo " 'bws' is the Secrets Manager CLI and does not work with Vaultwarden." } gcrypt_print_runtime_model() { @@ -67,13 +188,13 @@ gcrypt_print_runtime_model() { echo " smudge filter. Obsidian reads them as normal Markdown." echo "" echo " On the AI VM:" - echo " Same as laptop when unlocked. 
Use runtime injection (step 5" - echo " above) so the key is never written to disk." + echo " Same as laptop when unlocked. Use runtime injection so the" + echo " key is never written to disk." echo "" echo " Limitation:" echo " Encryption does NOT protect against a full server compromise" - echo " where an attacker has root access to a machine where the repo" - echo " is already unlocked. Runtime injection mitigates this." + echo " where an attacker has root access to an already-unlocked repo." + echo " Runtime injection mitigates this risk." echo " ─────────────────────────────────────────────────────────────" echo "" } diff --git a/lib/lint.sh b/lib/lint.sh index 367dda0..189105a 100644 --- a/lib/lint.sh +++ b/lib/lint.sh @@ -4,20 +4,31 @@ # Validation logic for Knowledge Genome files. # ============================================================================= -# Validates YAML frontmatter and mandatory fields +# Valid values for the 'type' frontmatter field. +# Must stay in sync with the type list in templates/agents-genome.md. +# Note: 'index' and 'log' are wiki-level singleton files (wiki/index.md, wiki/log.md). +# 'conflict' has no dedicated scaffold directory — it is a cross-cutting type +# that can live under any wiki/ subdirectory. +VALID_TYPES=("source" "entity" "concept" "query" "conflict" "private" "index" "log") + +# --------------------------------------------------------------------------- +# lint_markdown_file +# Validates YAML frontmatter: delimiters, mandatory fields, domain, type value. +# Returns the number of errors found. +# --------------------------------------------------------------------------- lint_markdown_file() { local file="$1" local genome_name="$2" local errors=0 - # 1. Check Frontmatter delimiters + # 1. Check frontmatter delimiters if [[ $(head -n 1 "$file") != "---" ]]; then warn "Missing frontmatter start (---) in: $file" errors=$((errors + 1)) fi # 2. 
Check mandatory fields - local mandatory_fields=("title:" "type:" "domain:") + local mandatory_fields=("title:" "type:" "domain:" "maturity:" "last_updated:") for field in "${mandatory_fields[@]}"; do if ! grep -q "^${field}" "$file"; then warn "Missing mandatory field '${field}' in: $file" @@ -25,52 +36,165 @@ lint_markdown_file() { fi done - # 3. Check if domain matches the genome name + # 3. Check domain matches genome name if grep -q "^domain:" "$file" && ! grep -q "^domain: ${genome_name}" "$file"; then - warn "Domain mismatch in $file (expected ${genome_name})" + warn "Domain mismatch in $file (expected '${genome_name}')" errors=$((errors + 1)) fi + # 4. Validate type value against allowed list + local type_errors + check_valid_type "$file" + type_errors=$? + errors=$((errors + type_errors)) + return $errors } -# Ensures files in private/ directories have the 'private: true' flag +# --------------------------------------------------------------------------- +# check_valid_type +# Verifies that the 'type' field contains a value from VALID_TYPES. +# Returns 1 if invalid, 0 if valid or field absent (absence caught by lint_markdown_file). +# --------------------------------------------------------------------------- +check_valid_type() { + local file="$1" + + local type_value + type_value=$(grep "^type:" "$file" | head -1 | sed 's/^type:[[:space:]]*//' | tr -d '"') + + [[ -z "$type_value" ]] && return 0 # absence is caught upstream + + local valid=0 + for t in "${VALID_TYPES[@]}"; do + [[ "$type_value" == "$t" ]] && valid=1 && break + done + + if [[ $valid -eq 0 ]]; then + warn "Invalid type value '${type_value}' in: $file" + warn " Valid types: ${VALID_TYPES[*]}" + return 1 + fi + + return 0 +} + +# --------------------------------------------------------------------------- +# check_privacy_consistency +# Ensures files in private/ directories carry 'private: true'. +# Warns if a public file is incorrectly marked private. 
+# --------------------------------------------------------------------------- check_privacy_consistency() { local file="$1" local errors=0 if [[ "$file" == *"/private/"* ]]; then if ! grep -q "^private: true" "$file"; then - error "Privacy Leak: $file is in a private folder but lacks 'private: true' metadata." + error "Privacy leak: $file is in a private/ directory but lacks 'private: true'." errors=$((errors + 1)) fi else if grep -q "^private: true" "$file"; then - warn "Metadata Mismatch: $file is marked private but located in a public directory." - # We count this as a warning unless you want to force strict isolation + warn "Metadata mismatch: $file is marked 'private: true' but is in a public directory." fi fi return $errors } -# Basic check for internal wiki-links [[target]] +# --------------------------------------------------------------------------- +# check_knowledge_decay +# Reads 'maturity' and 'last_updated' from frontmatter and compares against +# the staleness thresholds defined in agents-genome.md: +# maturity: stable → flag if last_updated > 180 days ago +# maturity: draft → flag if last_updated > 90 days ago +# +# Returns 1 if the file is stale, 0 otherwise. +# Silently skips files with missing or unparseable date fields. 
+# --------------------------------------------------------------------------- +check_knowledge_decay() { + local file="$1" + + local last_updated maturity + last_updated=$(grep "^last_updated:" "$file" | head -1 | sed 's/^last_updated:[[:space:]]*//' | tr -d '"') + maturity=$(grep "^maturity:" "$file" | head -1 | sed 's/^maturity:[[:space:]]*//' | tr -d '"') + + # Skip if either field is absent or maturity is not decay-trackable + [[ -z "$last_updated" || -z "$maturity" ]] && return 0 + [[ "$maturity" != "stable" && "$maturity" != "draft" ]] && return 0 + + # Parse date — handle both GNU date (Linux) and BSD date (macOS) + local updated_ts + if date --version >/dev/null 2>&1; then + # GNU date + updated_ts=$(date -d "$last_updated" +%s 2>/dev/null) + else + # BSD date (macOS) + updated_ts=$(date -j -f "%Y-%m-%d" "$last_updated" +%s 2>/dev/null) + fi + + [[ -z "$updated_ts" ]] && return 0 # unparseable date — skip silently + + local now days_old threshold + now=$(date +%s) + days_old=$(( (now - updated_ts) / 86400 )) + + case "$maturity" in + stable) threshold=180 ;; + draft) threshold=90 ;; + esac + + if [[ $days_old -gt $threshold ]]; then + warn "STALE: $file" + warn " maturity: ${maturity} | last_updated: ${last_updated} | ${days_old} days ago (threshold: ${threshold})" + return 1 + fi + + return 0 +} + +# --------------------------------------------------------------------------- +# check_page_size +# Enforces the page length limits defined in agents-genome.md: +# soft cap: 400 lines → warn +# hard cap: 800 lines → error +# These limits ensure pages fit within the LLM context window without +# attention degradation and keep the wiki atomically navigable. 
+# --------------------------------------------------------------------------- +check_page_size() { + local file="$1" + local lines + lines=$(wc -l < "$file") + + if [[ $lines -gt 800 ]]; then + error "Page too long (${lines} lines, hard cap 800): $file" + error " Split this page into focused sub-pages and link them." + return 1 + elif [[ $lines -gt 400 ]]; then + warn "Page approaching limit (${lines} lines, soft cap 400): $file" + fi + + return 0 +} + +# --------------------------------------------------------------------------- +# check_broken_links +# Basic check for internal [[wikilinks]] that cannot be resolved locally. +# Only emits warnings — cross-genome links may legitimately not resolve here. +# --------------------------------------------------------------------------- check_broken_links() { local file="$1" local base_dir base_dir=$(dirname "$file") - # Extract links, stripping aliases: [[Link|Alias]] -> Link + # Extract link targets, stripping aliases: [[Link|Alias]] -> Link local links - links=$(grep -oP '\[\[\K[^\]]+' "$file" | cut -d'|' -f1) + links=$(grep -oE '\[\[[^\]]+' "$file" 2>/dev/null | sed 's/^\[\[//' | cut -d'|' -f1) for link in $links; do local target="$link" [[ "$target" != *.md ]] && target="${target}.md" - # Simple relative check if [[ ! -f "${base_dir}/${target}" && ! 
-f "${base_dir}/../${target}" ]]; then - # Only a warning as links might point to other genomes or deep structures warn "Potential broken link: [[$link]] in $file" fi done diff --git a/lib/output.sh b/lib/output.sh index 1a65c90..0626fef 100644 --- a/lib/output.sh +++ b/lib/output.sh @@ -15,11 +15,12 @@ else GREEN='' YELLOW='' CYAN='' RED='' BOLD='' NC='' fi -info() { echo -e "${CYAN}[INFO]${NC} $*"; } -success() { echo -e "${GREEN}[OK]${NC} $*"; } -warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } -error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } -step() { echo -e "\n${BOLD}${YELLOW}━━━ $* ━━━${NC}"; } +info() { printf "%b\n" "${CYAN}[INFO]${NC} $*"; } +success() { printf "%b\n" "${GREEN}[OK]${NC} $*"; } +warn() { printf "%b\n" "${YELLOW}[WARN]${NC} $*"; } +error() { printf "%b\n" "${RED}[ERROR]${NC} $*" >&2; } +die() { error "$*"; exit 1; } +step() { printf "\n%b\n" "${BOLD}${YELLOW}━━━ $* ━━━${NC}"; } box() { local max_len=0 @@ -28,9 +29,9 @@ box() { done local border border=$(printf '─%.0s' $(seq 1 $((max_len + 2)))) - echo -e "${CYAN}┌${border}┐${NC}" + printf "%b\n" "${CYAN}┌${border}┐${NC}" for line in "$@"; do printf "${CYAN}│${NC} %-${max_len}s ${CYAN}│${NC}\n" "$line" done - echo -e "${CYAN}└${border}┘${NC}" + printf "%b\n" "${CYAN}└${border}┘${NC}" } diff --git a/lib/scaffold.sh b/lib/scaffold.sh index 0bea0bd..fbefb6c 100644 --- a/lib/scaffold.sh +++ b/lib/scaffold.sh @@ -8,14 +8,17 @@ render_template() { local template_file="$1" local output_file="$2" - [[ ! -f "$template_file" ]] && { error "Template not found: ${template_file}"; exit 1; } + [[ ! 
-f "$template_file" ]] && { error "Template not found: ${template_file}"; return 1; } local content - content=$(cat "$template_file") + content=$(<"$template_file") + + local genome_name_upper + genome_name_upper=$(tr '[:lower:]' '[:upper:]' <<< "${GENOME_NAME}") # Placeholder replacement content="${content//\{\{GENOME_NAME\}\}/${GENOME_NAME}}" - content="${content//\{\{GENOME_NAME_UPPER\}\}/${GENOME_NAME^^}}" + content="${content//\{\{GENOME_NAME_UPPER\}\}/${genome_name_upper}}" content="${content//\{\{GENOME_DESC\}\}/${GENOME_DESC}}" content="${content//\{\{FORGEJO_URL\}\}/${FORGEJO_URL}}" content="${content//\{\{FORGEJO_USER\}\}/${FORGEJO_USER}}" diff --git a/providers/forgejo.sh b/providers/forgejo.sh index 3ed45be..bc2bcd2 100644 --- a/providers/forgejo.sh +++ b/providers/forgejo.sh @@ -31,8 +31,8 @@ provider_create_repo() { case "$http_code" in 201) success "Repository '${name}' created successfully." ;; 409) info "Repository '${name}' already exists - skipping." ;; - 401) error "Unauthorized. Check your FORGEJO_TOKEN."; exit 1 ;; - *) error "Forgejo API returned HTTP ${http_code}. Check connectivity."; exit 1 ;; + 401) error "Unauthorized. Check your FORGEJO_TOKEN."; return 1 ;; + *) error "Forgejo API returned HTTP ${http_code}. Check connectivity."; return 1 ;; esac } @@ -44,8 +44,7 @@ provider_ssh_url() { local host # Extract hostname by removing protocol and trailing slashes host=$(echo "${FORGEJO_URL}" | sed -e 's|^[^/]*//||' -e 's|/*$||') - # Using port 222 as default for many homelab Forgejo/Gitea setups - echo "ssh://git@${host}:222/${FORGEJO_USER}/${1}.git" + echo "ssh://git@${host}:${FORGEJO_SSH_PORT:-222}/${FORGEJO_USER}/${1}.git" } provider_web_url() { diff --git a/providers/github.sh b/providers/github.sh index 14193ae..f7f1f45 100644 --- a/providers/github.sh +++ b/providers/github.sh @@ -37,7 +37,7 @@ provider_create_repo() { case "$http_code" in 201) success "Repository '${name}' created on GitHub." 
;; 422) info "Repository '${name}' already exists - skipping." ;; - *) error "GitHub API returned HTTP ${http_code}. check token/permissions."; exit 1 ;; + *) error "GitHub API returned HTTP ${http_code}. Check token/permissions."; return 1 ;; esac } diff --git a/registry.sh b/registry.sh new file mode 100644 index 0000000..88513f5 --- /dev/null +++ b/registry.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# ============================================================================= +# registry.sh - Knowledge Genome Registry +# Dynamic paths and genome definitions. +# ============================================================================= + +# Guard against double sourcing +[[ -n "${_REGISTRY_LOADED:-}" ]] && return +_REGISTRY_LOADED=1 + +# Resolve project root relative to this file +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +# Dynamic paths +WORK_DIR="${HOME}/knowledge-genome-setup" +KEYS_DIR="${WORK_DIR}/keys" +TEMPLATES_DIR="${PROJECT_ROOT}/templates" +LIB_DIR="${PROJECT_ROOT}/lib" +PROVIDERS_DIR="${PROJECT_ROOT}/providers" + +# --- GENOME REGISTRY --- +# Format: "name|description" +GENOMES=( + "genome-dev|Web development, TUI, Angular, software architecture" + "genome-finance|Personal finance, investments, market analysis" + "genome-homelab|Keru infrastructure, network configs, architecture logs" +) diff --git a/scripts/add-genome.sh b/scripts/add-genome.sh index 09e0aa1..37dab87 100644 --- a/scripts/add-genome.sh +++ b/scripts/add-genome.sh @@ -2,12 +2,12 @@ # ============================================================================= # scripts/add-genome.sh # Helper to add a single new genome to the existing infrastructure. 
-# Usage: make add-genome NAME=my-new-genome DESC="Description here" # ============================================================================= set -euo pipefail source "lib/output.sh" -source "config.env" +source "globals.env" +source "registry.sh" GENOME_NAME="${1:-}" GENOME_DESC="${2:-}" @@ -20,10 +20,8 @@ fi step "Adding New Genome: ${GENOME_NAME}" -# Overwrite the GENOMES array for this session to process only the new one -export GENOMES=("${GENOME_NAME}|${GENOME_DESC}") +GENOMES=("${GENOME_NAME}|${GENOME_DESC}") -# Trigger the standard genome setup logic -bash "scripts/setup-genomes.sh" +source "scripts/setup-genomes.sh" success "Genome '${GENOME_NAME}' added and linked successfully!" diff --git a/scripts/lint-genomes.sh b/scripts/lint-genomes.sh index 1aad8f4..7ec6e23 100644 --- a/scripts/lint-genomes.sh +++ b/scripts/lint-genomes.sh @@ -2,37 +2,56 @@ # ============================================================================= # scripts/lint-genomes.sh # Executes quality control across all registered genomes. +# Iterates from the GENOMES registry in registry.sh — not from filesystem patterns — +# so all genomes are covered regardless of their naming convention. # ============================================================================= set -euo pipefail source "lib/output.sh" -source "config.env" +source "globals.env" +source "registry.sh" source "lib/lint.sh" step "Starting Knowledge Genome Linting" TOTAL_ERRORS=0 +TOTAL_STALE=0 -# Iterate through genome submodules inside the Master repo -for genome_dir in "${WORK_DIR}/${MASTER_REPO}"/genome-*/; do - [[ -d "$genome_dir" ]] || continue +for entry in "${GENOMES[@]}"; do + IFS='|' read -r GENOME_NAME _ <<< "$entry" + genome_dir="${WORK_DIR}/${MASTER_REPO}/${GENOME_NAME}" + + if [[ ! -d "$genome_dir" ]]; then + warn "Genome directory not found locally, skipping: ${genome_dir}" + continue + fi - GENOME_NAME=$(basename "$genome_dir") info "Auditing genome: ${GENOME_NAME}..." 
- # Find all .md files, excluding AGENTS.md and external references + # Lint all .md files except AGENTS.md and core-karpathy reference while IFS= read -r md_file; do - # Run validations - lint_markdown_file "$md_file" "$GENOME_NAME" || TOTAL_ERRORS=$((TOTAL_ERRORS + $?)) - check_privacy_consistency "$md_file" || TOTAL_ERRORS=$((TOTAL_ERRORS + $?)) - check_broken_links "$md_file" - done < <(find "$genome_dir" -name "*.md" ! -name "AGENTS.md" ! -path "*/core-karpathy/*") + lint_markdown_file "$md_file" "$GENOME_NAME" && fe=0 || fe=$? + check_privacy_consistency "$md_file" && pce=0 || pce=$? + check_page_size "$md_file" && pse=0 || pse=$? + TOTAL_ERRORS=$((TOTAL_ERRORS + fe + pce + pse)) + + check_knowledge_decay "$md_file" && stale=0 || stale=$? + TOTAL_STALE=$((TOTAL_STALE + stale)) + + check_broken_links "$md_file" || true # warnings only, never contributes to errors + + done < <(find "$genome_dir" -name "*.md" \ + ! -name "AGENTS.md" \ + ! -path "*/core-karpathy/*") done -if [[ $TOTAL_ERRORS -eq 0 ]]; then - success "Linting passed: All files are consistent and secure." +echo "" +if [[ $TOTAL_ERRORS -eq 0 && $TOTAL_STALE -eq 0 ]]; then + success "Linting passed: all files are consistent, secure, and current." +elif [[ $TOTAL_ERRORS -eq 0 && $TOTAL_STALE -gt 0 ]]; then + warn "Linting passed with ${TOTAL_STALE} stale file(s). Review and re-validate flagged pages." else - error "Linting failed: Found ${TOTAL_ERRORS} critical issues." + error "Linting failed: ${TOTAL_ERRORS} critical issue(s), ${TOTAL_STALE} stale file(s)." exit 1 fi diff --git a/scripts/setup-genomes.sh b/scripts/setup-genomes.sh index 4de8ed8..c5c3999 100644 --- a/scripts/setup-genomes.sh +++ b/scripts/setup-genomes.sh @@ -2,12 +2,16 @@ # ============================================================================= # scripts/setup-genomes.sh # Iterates through the GENOMES registry to provision remote and local repos. -# Handles git-crypt initialization and submodule linking. 
# ============================================================================= set -euo pipefail source "lib/output.sh" -source "config.env" +source "globals.env" + +if [[ -z "${WORK_DIR:-}" ]]; then + source "registry.sh" +fi + source "lib/scaffold.sh" source "lib/git-crypt.sh" source "providers/${PROVIDER}.sh" @@ -15,48 +19,45 @@ source "providers/${PROVIDER}.sh" step "Processing Genome Registry" for entry in "${GENOMES[@]}"; do - # Parse name and description from the array IFS='|' read -r GENOME_NAME GENOME_DESC <<< "$entry" export GENOME_NAME GENOME_DESC info "Processing: ${GENOME_NAME}..." - # 1. Remote Provisioning (Idempotent: skips if exists) + # 1. Remote Creation (Idempotent) provider_create_repo "${GENOME_NAME}" "${GENOME_DESC}" "true" + SSH_URL=$(provider_ssh_url "${GENOME_NAME}") GENOME_PATH="${WORK_DIR}/${MASTER_REPO}/${GENOME_NAME}" - if [ ! -d "${GENOME_PATH}" ]; then - info "Creating local directory and initializing scaffold..." - mkdir -p "${GENOME_PATH}" - cd "${GENOME_PATH}" - git init + if [[ ! -d "${GENOME_PATH}" ]]; then + cd "${WORK_DIR}/${MASTER_REPO}" + info "Linking ${GENOME_NAME} as a submodule..." - # IMPORTANT: Initialize git-crypt BEFORE creating sensitive files + git submodule add "${SSH_URL}" "${GENOME_NAME}" + + cd "${GENOME_NAME}" + + # IMPORTANT: Initialize git-crypt BEFORE creating any files gcrypt_init - # Create directory structure and apply templates scaffold_genome "." install_precommit_hook "." - # Initial commit and push to remote + # Initial genome push git add . - git commit -m "feat: initial scaffold for ${GENOME_NAME}" - - SSH_URL=$(provider_ssh_url "${GENOME_NAME}") - git remote add origin "${SSH_URL}" + git commit -m "feat: initial scaffold and git-crypt init for ${GENOME_NAME}" git push -u origin main - # Export the AES key for the user to back up + # Key export and instructions gcrypt_export_key "${GENOME_NAME}" + gcrypt_print_key_instructions "${GENOME_NAME}" - # 2. 
Link as submodule in the Master repository + # Commit the submodule reference in the master repo cd "${WORK_DIR}/${MASTER_REPO}" - info "Linking ${GENOME_NAME} as a submodule..." - git submodule add "${SSH_URL}" "${GENOME_NAME}" - git add .gitmodules "${GENOME_NAME}" - git commit -m "feat: link submodule ${GENOME_NAME}" - else - warn "Genome directory '${GENOME_NAME}' already exists. Skipping local setup." + git commit -m "feat: add ${GENOME_NAME} as submodule" + git push origin main fi done + +success "Genome provisioning completed." diff --git a/scripts/setup-master.sh b/scripts/setup-master.sh index 7d9bf6f..181c01e 100644 --- a/scripts/setup-master.sh +++ b/scripts/setup-master.sh @@ -6,29 +6,36 @@ set -euo pipefail source "lib/output.sh" -source "config.env" +source "globals.env" +source "registry.sh" source "lib/scaffold.sh" +source "providers/${PROVIDER}.sh" # Required for remote creation step "Configuring Master Repository: ${MASTER_REPO}" -# Ensure workspace exists +# 1. Remote Creation +provider_create_repo "${MASTER_REPO}" "Knowledge Genome Master Repository" "true" + mkdir -p "${WORK_DIR}/${MASTER_REPO}" cd "${WORK_DIR}/${MASTER_REPO}" -if [ ! -d ".git" ]; then +if [[ ! -d ".git" ]]; then info "Initializing Git in Master repository..." git init - # Optional: Add Karpathy's core reference as a read-only submodule - if [ -n "${GIST_URL:-}" ]; then + # 2. Origin Configuration + SSH_URL=$(provider_ssh_url "${MASTER_REPO}") + git remote add origin "${SSH_URL}" + + if [[ -n "${GIST_URL:-}" ]]; then info "Adding core-karpathy as an external reference..." git submodule add "${GIST_URL}" core-karpathy || warn "Could not add core-karpathy submodule." fi fi -# Apply master-level templates (README, AGENTS) scaffold_master "." - -# Initial commit for the master structure git add . git commit -m "chore: initialize master scaffold" || info "No changes to commit in master." + +# 3. 
Initial Push +git push -u origin main diff --git a/scripts/setup.sh b/scripts/setup.sh index eb2e5b7..86fd013 100644 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -9,8 +9,10 @@ set -euo pipefail # Resolve script directory and source core components SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "${SCRIPT_DIR}" # All child scripts use relative paths from project root source "${SCRIPT_DIR}/lib/output.sh" -source "${SCRIPT_DIR}/config.env" +source "${SCRIPT_DIR}/globals.env" +source "${SCRIPT_DIR}/registry.sh" source "${SCRIPT_DIR}/lib/deps.sh" step "Starting Knowledge Genome Setup" @@ -24,7 +26,7 @@ check_git_identity info "Initializing Master Repository..." bash "${SCRIPT_DIR}/scripts/setup-master.sh" -# 3. Genomes Provisioning (as defined in config.env) +# 3. Genomes Provisioning (as defined in registry.sh) info "Provisioning registered Genomes..." bash "${SCRIPT_DIR}/scripts/setup-genomes.sh" diff --git a/templates/agents-genome.md b/templates/agents-genome.md index 757dbd5..95df73b 100644 --- a/templates/agents-genome.md +++ b/templates/agents-genome.md @@ -2,57 +2,247 @@ **[ROLE]** -You are the specialized AI maintainer for the `{{GENOME_NAME}}` genome. Read this schema before executing any file operations. +You are the specialized AI maintainer for the `{{GENOME_NAME}}` genome. +Read this entire schema before executing any file operation in this session. + +--- ## 1. Genome Identity -- **Name:** `{{GENOME_NAME}}` -- **Domain Scope:** `{{GENOME_DESC}}` -- **Owner:** `{{FORGEJO_USER}}` +| Field | Value | +|--------------|-------| +| Name | `{{GENOME_NAME}}` | +| Domain Scope | `{{GENOME_DESC}}` | +| Owner | `{{FORGEJO_USER}}` | +| Repository | `{{FORGEJO_URL}}/{{FORGEJO_USER}}/{{GENOME_NAME}}` | + +--- ## 2. Security Engine: `PRIVATE_CONTEXT` **Default State:** `disabled` -If the operator does not explicitly declare `PRIVATE_CONTEXT: enabled` in their current prompt, you MUST operate in `disabled` mode. 
+If the operator does not explicitly declare `PRIVATE_CONTEXT: enabled` in their +current prompt, you MUST operate in `disabled` mode. Never infer or assume the value. ### Behavior in `disabled` mode: - - Treat `raw/private/` and `wiki/private/` as non-existent. -- Do not execute `cat`, `ls`, or `grep` on private paths. +- Do not execute `cat`, `ls`, `grep`, or any read operation on private paths. - Refuse operator requests to summarize personal data. +- All outputs are safe to share with collaborators. ### Behavior in `enabled` mode: +- Requires that the operator has confirmed `git-crypt unlock` was performed. +- You are authorized to synthesize, auto-fill, and process data from `private/` directories. +- Outputs derived from private data go exclusively to `wiki/private/`. +- **Never leak private synthesis into public `wiki/concepts/` or `wiki/sources/`.** +- Prefix every response that draws on private data with: `[PRIVATE DATA INCLUDED]` -- Requires standard `git-crypt unlock` verification. -- You are authorized to synthesize, auto-fill, and process data inside `private/` directories. -- Outputs must be confined to `wiki/private/`. DO NOT leak private synthesis into public `wiki/concepts/`. +### Pre-commit failures: +If a commit is rejected by the pre-commit hook with a **"PLAINTEXT LEAK DETECTED"** warning, **DO NOT** attempt to bypass it with `--no-verify`. Stop the session and ask the operator to verify the encryption state and `.gitattributes`. -## 3. Operations & Linting Protocol +### On the AI server — runtime key injection: +The git-crypt key must never be stored as a persistent file on the AI VM. +```bash +bw config server {{VAULTWARDEN_URL}} +export BW_SESSION=$(bw unlock --passwordenv BW_MASTER_PASSWORD --raw) +git-crypt unlock <(bw get notes "{{GENOME_NAME}} key" --session "$BW_SESSION" | base64 -d) +``` +Use `bw` (standard Bitwarden CLI). `bws` (Secrets Manager CLI) does NOT work with +self-hosted Vaultwarden. 
-Every document generation or modification MUST pass this internal linting checklist: +When the session ends or PRIVATE_CONTEXT returns to disabled: +```bash +git-crypt lock +``` -1. **Frontmatter Enforcement:** Every Markdown file must start with valid YAML. +--- - ```yaml - --- - title: "Strict String Title" - type: source | entity | concept | private - domain: {{GENOME_NAME}} - tags: [lowercase, hyphen-separated] - last_updated: YYYY-MM-DD - private: true | false - --- - ``` +## 3. Core Rules -2. **Atomic Linking:** If you create `wiki/concepts/new-idea.md`, you MUST instantly add: +1. **`raw/` is sacred and immutable.** Read from `raw/`; never create, modify, or delete files in it. +2. **`wiki/` is owned by the agent.** Create, update, cross-link, and maintain all pages in `wiki/`. +3. **Every operation must be logged** in `wiki/log.md` using the format defined in Section 6. +4. **`wiki/index.md` must be updated** immediately after any ingest or lint pass. +5. **No direct commits to `main`.** Always work on a feature branch and open a Pull Request. +6. **Contradict, don't overwrite.** See Section 5 — Conflict Resolution. +7. **Never commit unencrypted data** outside `raw/private/` or `wiki/private/`. - ```text - * [[concepts/new-idea]] - - ``` +--- - to `wiki/index.md` under the appropriate heading, sorted alphabetically. +## 4. Operations & Linting Protocol -3. **Bi-directional Integrity:** Use Obsidian-style links `[[folder/file]]`. Do not use standard Markdown links `[text](url)` for internal references. +Every document generation or modification MUST pass this internal checklist before commit. -4. **Log the Action:** Append exactly ONE line to `wiki/log.md` detailing the operation. 
+### 4.1 Frontmatter Enforcement + +Every Markdown file must start with valid YAML frontmatter: + +```yaml +--- +title: "Strict String Title" +type: source | entity | concept | query | conflict | private +domain: {{GENOME_NAME}} +tags: [lowercase, hyphen-separated] +maturity: draft | stable | deprecated +last_updated: YYYY-MM-DD +private: true | false +--- +``` + +**Field rules:** +- `maturity: draft` — newly created or based on a single source; not yet cross-validated. +- `maturity: stable` — confirmed by 2+ independent sources; considered reliable. +- `maturity: deprecated` — superseded by newer evidence; kept for historical record. + When marking a page deprecated, add a `> **DEPRECATED:** ` callout at the top. + +**Do not use semantic versioning (1.x.x) for content.** Git history tracks every change. +`maturity` captures the epistemic state; `last_updated` tracks recency. + +### 4.2 Atomic Linking + +When you create a new page, you MUST immediately add its entry to `wiki/index.md`: +```text +- [[folder/slug]] — Brief one-line summary. `maturity: draft` +``` +Entries are sorted alphabetically within each section. + +### 4.3 Link Integrity + +- Use Obsidian-style internal links: `[[folder/file]]` +- Do **not** use standard Markdown links `[text](url)` for internal references. +- Cross-genome links use relative paths: `[[../genome-target/wiki/folder/file]]` + +### 4.4 Lint Checks (Periodic) + +When running a lint pass: +1. Find orphan pages — wiki pages with no inbound `[[wikilink]]`. +2. Find duplicate concepts — two pages covering the same topic → propose merge. +3. Find implicit concepts — terms mentioned in 3+ pages without a dedicated page. +4. Check `maturity` consistency — pages with 2+ sources still marked `draft`. +5. Check broken internal links. +6. Apply Knowledge Decay check (see Section 7). +7. Report findings as a structured list. Do not auto-fix without operator approval. + +--- + +## 5. 
Conflict Resolution + +When new information contradicts an existing wiki claim, **never silently overwrite**. + +### Procedure: +1. Keep the existing page unchanged. +2. Create `wiki/queries/conflict--.md` with this structure: + +```yaml +--- +title: "Conflict: " +type: conflict +domain: {{GENOME_NAME}} +maturity: draft +last_updated: YYYY-MM-DD +private: false +--- +``` +```markdown +## Conflict: + +**Source A (existing claim):** [[path/to/existing-page]] +> Summary of the claim held by the current wiki. + +**Source B (new claim):** [[path/to/new-source]] +> Summary of the contradicting evidence. + +**Agent Assessment:** +- Confidence in A: high | medium | low — +- Confidence in B: high | medium | low — +- Recommended action: `accept_b` | `keep_a` | `requires_human_review` + +**Status:** ⏳ Awaiting human decision +``` + +3. Add `[[queries/conflict--]]` to `wiki/index.md` under a + `## Conflicts Pending Review` section (create it if absent). +4. Log the conflict in `wiki/log.md` with type `CONFLICT`. +5. Open a Pull Request titled `[CONFLICT] — human review required`. + +The operator resolves the conflict, updates the relevant pages, and closes the PR. + +--- + +## 6. Log Format + +Every operation must append exactly ONE entry to `wiki/log.md`. +The header line is required and must be grep-parseable. +The metadata block is required for all agent-generated entries. + +```markdown +## [YYYY-MM-DD] TYPE | Title or subject + +- run_id: `` +- model: `` +- context_read: `[[path/A]]`, `[[path/B]]` +- output_written: `[[path/C]]`, `[[path/D]]` +- reasoning: One sentence explaining what changed and why. +``` + +**Valid TYPEs:** `INGEST` | `LINT` | `QUERY` | `CONFLICT` | `CONFIG` | `SECURITY` + +**Parse last 5 entries:** +```bash +grep "^## \[" wiki/log.md | tail -5 +``` + +**Parse by type:** +```bash +grep "^## \[" wiki/log.md | grep "CONFLICT" +``` + +--- + +## 7. Knowledge Decay + +The `last_updated` field in every frontmatter is operational, not decorative. 
+ +**Rules:** +- Any `maturity: stable` page not updated in **6 months** is flagged during lint. +- Any `maturity: draft` page not updated in **3 months** is flagged during lint. +- Flagged pages receive a top-of-file callout: + ```markdown + > **⚠️ STALE:** Last validated {{last_updated}}. Re-validation required. + ``` +- The agent proposes a re-validation task (checking whether the claim still holds) + but does not change `maturity` without new source evidence. + +--- + +## 8. Ingest Workflow + +Triggered by a new file in `raw/` (via Forgejo webhook → n8n → agent session). + +1. Read the source document fully. +2. Create `wiki/sources/.md` with summary and key points. +3. For each entity (person, tool, organisation): update or create `wiki/entities/.md`. +4. For each concept (pattern, theory, decision): update or create `wiki/concepts/.md`. +5. Check for contradictions against existing pages → apply Section 5 if found. +6. Update `wiki/index.md`. +7. Append a log entry (Section 6 format). +8. Commit on branch `feat/ai-ingest-`. +9. Open Pull Request on Forgejo — no merge without human approval. + +**For private sources** (`raw/private/`, requires `PRIVATE_CONTEXT: enabled`): +- Output goes exclusively to `wiki/private/.md`. +- PR title must start with `[PRIVATE]`. + +--- + +## 9. Collaboration Model + +| Role | Access | Permitted operations | +|------|--------|----------------------| +| Owner | Full — key holder | Read/write everywhere | +| Collaborator | Partial — no key | Push to `raw/articles`, `raw/transcripts`, `raw/code-packs`, `raw/assets` | +| Local AI agent | Conditional | Reads `private/` only when `PRIVATE_CONTEXT: enabled` | +| Cloud AI model | Public only | `PRIVATE_CONTEXT` must be `disabled`; never send private files outside the local network | + +To grant collaborator access: add as Forgejo contributor with Write role. Do not share the git-crypt key. 
diff --git a/templates/agents-master.md b/templates/agents-master.md index 7bf35f3..ce2ad11 100644 --- a/templates/agents-master.md +++ b/templates/agents-master.md @@ -1,40 +1,176 @@ # SYSTEM DIRECTIVE: Global Schema `{{MASTER_REPO}}` -**[ROLE]** You are the Orchestrator AI for the Knowledge Genome network. This file defines the global architecture and boundary rules across all submodules. +**[ROLE]** You are the Orchestrator AI for the Knowledge Genome network. +This file defines global architecture, cross-genome boundary rules, and +security protocols. Read it before any cross-genome session. + +--- ## 1. Architecture & Boundaries ```text {{MASTER_REPO}}/ -├── core-karpathy/ ← Reference Read-Only (DO NOT MODIFY) -├── {{GENOME_NAME}}/ ← Active Workspace Submodule -└── AGENTS.md ← This File +├── core-karpathy/ ← Reference pattern — read-only, never modify +├── genome-dev/ ← Submodule: web development, Angular, TUI +├── genome-finance/ ← Submodule: personal finance (git-crypt on private/) +├── genome-homelab/ ← Submodule: Keru infrastructure and network +└── AGENTS.md ← This file ``` -### CRITICAL RULES: +Each genome submodule has its own `AGENTS.md` with domain-specific rules. -- Single-Domain Focus: Operate within ONLY ONE genome submodule at a time. Do not attempt atomic commits across multiple genomes. +### Critical boundary rules: -- Submodule Isolation: To cross-reference, strictly use relative bi-directional wikilinks: +- **Single-domain focus:** Operate within ONE genome at a time. + Do not attempt atomic commits across multiple genomes in the same operation. +- **Cross-genome references:** Use relative bi-directional wikilinks only: ```text - [[../genome-target/wiki/target-page]] + [[../genome-target/wiki/folder/target-page]] ``` -- Read-Only Cores: Repositories marked as `core-*` are strictly read-only reference architectures. +- **Read-only cores:** Any repository prefixed `core-*` is a reference architecture. + Never commit to it. 
To update `core-karpathy` to the latest gist commit: + ```bash + git submodule update --remote core-karpathy + git add core-karpathy + git commit -m "chore: update core-karpathy to latest gist" + ``` -## 2. Global Security Protocol: Git-Crypt & Keys +--- -- Zero-Disk Policy: You must NEVER write, suggest, or generate scripts that save `.key` files to the disk. +## 2. Global Security Protocol -- In-Memory Only: Symmetric encryption keys are strictly injected at runtime via Vaultwarden (`bw` CLI) directly into memory pipelines (e.g., `<(bw get notes ...)`). +### Zero-Disk Key Policy +- Never write, suggest, or generate scripts that save `.key` files to disk. +- Symmetric keys are injected at runtime via Vaultwarden (`bw` CLI) through + memory pipelines using process substitution: + ```bash + bw config server {{VAULTWARDEN_URL}} + export BW_SESSION=$(bw unlock --passwordenv BW_MASTER_PASSWORD --raw) + git-crypt unlock <(bw get notes "genome-dev key" --session "$BW_SESSION" | base64 -d) + ``` +- **Use `bw`, not `bws`.** `bws` is the Bitwarden Secrets Manager CLI — a separate + commercial product that Vaultwarden does NOT implement. -- Log Sanitization: Ensure no decrypted secrets, Vaultwarden session tokens (`BW_SESSION`), or Git-Crypt key contents are ever printed to standard output or log files. +### Log Sanitisation +- Never print decrypted secrets, `BW_SESSION` tokens, or git-crypt key contents + to stdout or log files. +- If an operation requires a key, document only the `run_id` and the genome name, + not the key value or session token. -## 3. Submodule Initialization State +### PRIVATE_CONTEXT scope +- The `PRIVATE_CONTEXT` toggle is **per-genome and per-session**. + Enabling it for `genome-finance` does NOT enable it for `genome-dev`. +- Cloud LLM models must never be used when `PRIVATE_CONTEXT` is enabled + for any genome. Private data must not leave the local network. -To synchronize the workspace, the operational command is strictly: +--- + +## 3. 
Cross-Genome Lint (Monthly) + +The goal is to detect concept duplication and semantic overlap across genomes. +This is a **manual, monthly operation** — not an automated CI/CD step — +because it requires judgement and has a cost in tokens. + +**Procedure:** +1. Collect the `wiki/index.md` from every active genome. +2. Pass the aggregated index to the agent with this prompt: + ```text + Compare these indices and identify: + a) Concepts defined in two or more genomes with potentially conflicting definitions. + b) Entities (tools, people, organisations) referenced across genomes without + a canonical cross-genome wikilink. + c) Concepts in genome-X that should link to genome-Y but don't. + Report findings. Do not modify any files. + ``` +3. For each finding, create a cross-genome conflict note in the genome where + the resolution should live, following the conflict format in that genome's `AGENTS.md`. +4. Log the lint pass in the master `AGENTS.md` update history (below). + +--- + +## 4. Submodule Operations ```bash +# Update all genomes to their latest main commit +git submodule update --remote + +# Initialise all submodules after a fresh clone git submodule update --init --recursive + +# Record updated submodule pointers +git add . +git commit -m "chore: update submodule pointers" +git push ``` + +--- + +## 5. Adding a New Genome + +```bash +# 1. Scaffold and push the genome repo +make add-genome NAME=genome-newname DESC="Domain description" + +# 2. Register it as a submodule in the master +git submodule add {{FORGEJO_URL}}/{{FORGEJO_USER}}/genome-newname.git genome-newname +git add .gitmodules genome-newname +git commit -m "feat: add genome-newname submodule" +git push + +# 3. Update this file's architecture diagram in Section 1 +``` + +--- + +## 6. 
Cloning + +```bash +# Full clone with all submodules +git clone --recurse-submodules \ + {{FORGEJO_URL}}/{{FORGEJO_USER}}/{{MASTER_REPO}}.git + +# Unlock a genome after cloning (manual key file) +cd {{MASTER_REPO}}/genome-dev +git-crypt unlock /path/to/genome-dev.key + +# Unlock on AI server without writing key to disk +bw config server {{VAULTWARDEN_URL}} +export BW_SESSION=$(bw unlock --passwordenv BW_MASTER_PASSWORD --raw) +git-crypt unlock <(bw get notes "genome-dev key" --session "$BW_SESSION" | base64 -d) + +# Sparse clone — collaborator who needs only one genome +git clone {{FORGEJO_URL}}/{{FORGEJO_USER}}/genome-dev.git +``` + +--- + +## 7. Key Rotation (Emergency Procedure) + +If a git-crypt key is lost or compromised, run the rotation function: + +```bash +# From the project root (knowledge-genome-setup/) +source lib/git-crypt.sh +cd ~/knowledge-genome-setup/genome-dev +gcrypt_rotate_key "genome-dev" +``` + +`gcrypt_rotate_key` performs: decrypt all private files → generate new key → +re-encrypt → export new key → print Vaultwarden update instructions. + +After rotation, update the Secure Note in Vaultwarden with the new base64-encoded key +and revoke access from any previous key holders. + +--- + +## 8. Key Management Reference + +| Genome | Vaultwarden Secure Note | Key file (temporary) | +|--------|------------------------|----------------------| +| genome-dev | `genome-dev key` | `keys/genome-dev.key` | +| genome-finance | `genome-finance key` | `keys/genome-finance.key` | +| genome-homelab | `genome-homelab key` | `keys/genome-homelab.key` | + +Key files in `keys/` are temporary exports only. Delete them after uploading to Vaultwarden. diff --git a/templates/pre-commit.sh b/templates/pre-commit.sh index 1985fc6..649b850 100644 --- a/templates/pre-commit.sh +++ b/templates/pre-commit.sh @@ -10,6 +10,13 @@ set -euo pipefail PRIVATE_PATTERNS=("raw/private/" "wiki/private/") FAILED=0 +# Check on git-crypt +if [[ ! 
-d ".git-crypt" ]]; then + echo -e "\n\033[0;31m[CRITICAL] git-crypt is not initialized in this repository.\033[0m" + echo "Run 'git-crypt init' and 'make setup' before committing." + exit 1 +fi + # Get staged files (excluding deletions) STAGED_FILES=$(git diff --cached --name-only --diff-filter=ACM 2>/dev/null || true) diff --git a/templates/wiki-index.md b/templates/wiki-index.md index 39c43f6..9fe4719 100644 --- a/templates/wiki-index.md +++ b/templates/wiki-index.md @@ -2,6 +2,7 @@ title: "Index — {{GENOME_NAME}}" type: index domain: {{GENOME_NAME}} +maturity: stable last_updated: {{DATE}} private: false --- @@ -9,21 +10,35 @@ private: false # Master Index: {{GENOME_NAME}} **[AGENT INSTRUCTION]** -Maintain strict alphabetical sorting within each section. Ensure the `last_updated` YAML field is modified upon every edit. Use only `[[slug]] - Summary` format. +This is the primary navigation file. Read it first on every session before accessing individual pages. +Maintain strict alphabetical sorting within each section. +Update `last_updated` in the YAML frontmatter on every edit. +Entry format: `- [[folder/slug]] — One-line summary. \`maturity: \`` --- ## Sources (`wiki/sources/`) -*Ingested raw materials.* +*Ingested raw materials. One entry per processed source.* ## Entities (`wiki/entities/`) -*People, organizations, tools.* +*People, organisations, tools, projects.* ## Concepts (`wiki/concepts/`) -*Theories, methodologies, architecture.* +*Theories, methodologies, patterns, architectural decisions.* + + +## Queries (`wiki/queries/`) +*Synthesised answers worth preserving. 
Archived explorations and analyses.* + + +## Conflicts Pending Review (`wiki/queries/conflict-*.md`) +*Created automatically when the agent detects contradictions between sources.* +*Do not summarise entries here — list slugs only to avoid surfacing unresolved claims.* +*Remove the entry once the operator has resolved and closed the corresponding PR.* ## Private Synthesis (`wiki/private/`) -*Restricted access. Use slug names ONLY. Do not append summaries to prevent metadata leakage.* +*Restricted access. Requires `PRIVATE_CONTEXT: enabled` and an unlocked repo.* +*List slug names ONLY. Do not append summaries — prevents metadata leakage.* diff --git a/templates/wiki-log.md b/templates/wiki-log.md index 96e6008..3987fb1 100644 --- a/templates/wiki-log.md +++ b/templates/wiki-log.md @@ -2,23 +2,55 @@ title: "Operations Log — {{GENOME_NAME}}" type: log domain: {{GENOME_NAME}} +maturity: stable +last_updated: {{DATE}} private: false --- # Operations Log: {{GENOME_NAME}} **[AGENT INSTRUCTION]** - -This is an append-only system ledger. Do not edit previous lines. - -Append new entries strictly using the following format: - -```text -YYYY-MM-DD | [TYPE] | [AUTHOR] | Message -``` - -*(Valid TYPEs: INGEST, LINT, QUERY, CONFIG, SECURITY)* +This is an append-only system ledger. Never edit or delete previous entries. +Append new entries at the bottom using the format defined below. --- -2024-05-08 | [CONFIG] | SYSTEM | Genome Scaffolded. Directory structure and encryption layer initialized. +## Entry Format + +### Required header (enables shell parsing): +```text +## [YYYY-MM-DD] TYPE | Subject or title +``` + +### Required metadata block for all agent-generated entries: +```markdown +- run_id: `<run-id>` +- model: `<model-name>` +- context_read: `[[path/A]]`, `[[path/B]]` +- output_written: `[[path/C]]`, `[[path/D]]` +- reasoning: One sentence explaining what changed and why. 
+``` + +**Valid TYPEs:** `INGEST` | `LINT` | `QUERY` | `CONFLICT` | `CONFIG` | `SECURITY` + +**Parse examples:** +```bash +# Last 5 entries +grep "^## \[" wiki/log.md | tail -5 + +# All CONFLICT entries +grep "^## \[" wiki/log.md | grep "CONFLICT" + +# All entries from a specific month (YYYY-MM prefix) +grep "^## \[2026-05" wiki/log.md +``` + +--- + +## [{{DATE}}] CONFIG | Genome scaffolded + +- run_id: `system-init` +- model: `setup-knowledge-genome.sh` +- context_read: *(none — initial scaffold)* +- output_written: `[[wiki/index.md]]`, `[[wiki/log.md]]`, `[[AGENTS.md]]` +- reasoning: Initial directory structure and encryption layer initialized by setup script.