diff --git a/skills/ingest/SKILL.md b/skills/ingest/SKILL.md new file mode 100644 index 0000000..bd75214 --- /dev/null +++ b/skills/ingest/SKILL.md @@ -0,0 +1,83 @@ +--- +name: ingest +description: Semantic pass of a single raw source into the current genome's wiki — read the source, write sources/entities/concepts, handle contradictions, then emit a manifest and STOP. Use when a new file lands in raw/. Does NOT do git, log, index, lint, or PRs (a post-processor handles those), and does NOT handle private sources or project repos. +license: see repository +compatibility: Runs inside one genome checkout (cwd = genome root). Tools needed — read, edit only. NO bash, NO git. The deterministic steps (index, log, scoped lint, PR) run AFTER you exit, via run-ingest.sh. PRIVATE_CONTEXT must be disabled. +allowed-tools: read edit +metadata: + framework: knowledge-genome + phase: "1-ingest-semantic" +--- + +# Ingest — semantic pass + +You run inside ONE genome checkout. `AGENTS.md` (already in your context) is the +authoritative contract. Your job is the **semantic pass only**: read the source, write +the wiki pages, handle contradictions. You do **not** touch git, the log, the index, the +linter, or PRs — a post-processor (`run-ingest.sh`) does all of that _after you stop_, +from the manifest you leave behind. This keeps your context clean and your turns few, +which matters on a small local model. + +**Argument:** the relative path of the single raw source to ingest +(e.g. `raw/articles/foo.md`). Process only this one. + +## Pre-flight — stop the session if any check fails + +1. Refuse if the argument path is under any `private/` directory. +2. Refuse if `PRIVATE_CONTEXT` is not `disabled`. +3. Confirm the file exists under `raw/`. + +## Semantic work (your only job) + +1. Read the source once. +2. 
Write `wiki/sources/<slug>.md` — faithful summary + key points, with the required + frontmatter (`type: source`, `domain: <domain>`, `maturity: draft`, + `last_updated: <YYYY-MM-DD>`, `private: false`, sensible `tags`). +3. For each entity (person, tool, org) → create or update `wiki/entities/<name>.md`. +4. For each concept (pattern, theory, decision) → create or update + `wiki/concepts/<name>.md`. +5. On a real contradiction with an existing claim, follow `AGENTS.md` §Conflict: create + `wiki/queries/conflict-<topic>-<date>.md`. Never overwrite the existing page. + +Name files in kebab-case and pick stable names. Read `wiki/index.md` (and the specific +pages it points to) to decide create-vs-update and to spot contradictions. Do not scan +whole directories. + +## Finish: write the manifest, then STOP + +As your **final action**, write `.ingest-manifest.json` at the genome root +(NOT under `wiki/`) describing exactly what you did. Then stop — do not commit, lint, +append to the log/index, or open anything. + +```json +{ + "raw_source": "raw/articles/foo.md", + "model": "<model-id>", + "reasoning": "One sentence for the log: what changed and why.", + "pr_summary": "One or two sentences describing this ingest for the PR.", + "contradictions": "None (or: 1 conflict file created — <path>)", + "pages": [ + { + "path": "wiki/sources/foo.md", + "summary": "One-line index summary.", + "maturity": "draft", + "status": "created" + }, + { + "path": "wiki/entities/acme.md", + "summary": "Acme — vendor.", + "maturity": "draft", + "status": "modified" + } + ] +} +``` + +Manifest rules: + +- List every page you created or modified, with `status` `created` or `modified`. +- `summary` is the one-line index description (≈12 words max). For conflict pages the + summary is ignored — the index lists conflicts by slug only. +- Do not invent a `run_id`, branch, commit, or PR — those belong to the post-processor. + +One source per session. After writing the manifest, stop. 
#!/usr/bin/env python3
# =============================================================================
# skills/ingest/scripts/index-append.py
# Insert an entry line into the correct section of wiki/index.md and keep that
# section's entries alphabetically ordered. Bumps frontmatter last_updated.
#
# NOTE: agents-genome.md and wiki-index.md claim the pre-commit hook sorts the
# index. The actual pre-commit.sh only runs the plaintext-leak check — it does
# NOT sort. This script owns the ordering instead. (If you later move sorting
# into the hook, reduce this to a plain append.)
#
#   index-append.py --section Sources \
#       --entry '- [[sources/foo]] — One-line summary. `maturity: draft`'
#
# Exit status: 0 when the entry was added or was already present, 1 when the
# index file or the section is missing or the entry is malformed (argparse
# itself exits with 2 on a bad command line).
# =============================================================================
import argparse
import datetime
import re
import sys

ENTRY_RE = re.compile(r"^- \[\[")
HEADER_RE = re.compile(r"^## ")


def _bump_last_updated(lines, today):
    """Rewrite ``last_updated:`` inside the leading frontmatter block, in place.

    Frontmatter only counts when the first non-blank line of the file is
    ``---``. A ``---`` further down is a horizontal rule, so a body line that
    happens to start with ``last_updated:`` is left alone.

    Args:
        lines: The file content as a list of lines without newlines; mutated.
        today: ISO date string to store.
    """
    first = next((i for i, ln in enumerate(lines) if ln.strip()), None)
    if first is None or lines[first].strip() != "---":
        return
    for i in range(first + 1, len(lines)):
        if lines[i].strip() == "---":
            return  # closing fence: end of frontmatter
        if lines[i].startswith("last_updated:"):
            lines[i] = f"last_updated: {today}"


def _section_bounds(lines, section):
    """Return ``(start, end)`` for the first ``## <section>…`` block, or None.

    ``start`` is the index of the header line and ``end`` the index of the
    next ``## `` header (or ``len(lines)``). The header is matched by prefix,
    so ``Sources`` also matches a header such as ``## Sources (external)``.
    """
    start = next(
        (i for i, ln in enumerate(lines)
         if HEADER_RE.match(ln) and ln[3:].startswith(section)),
        None,
    )
    if start is None:
        return None
    end = next(
        (i for i in range(start + 1, len(lines)) if HEADER_RE.match(lines[i])),
        len(lines),
    )
    return start, end


def _insert_entry(lines, start, end, entry):
    """Insert *entry* into the section ``lines[start:end]`` and re-sort it.

    Non-entry lines of the section (comments, blank lines) are kept as an
    intro block above the entries. Entries are sorted case-insensitively on
    the whole line.

    Returns:
        False when an identical line is already present (nothing is changed),
        True when the section was rebuilt with the new entry.
    """
    body = lines[start + 1:end]
    intro = [ln for ln in body if not ENTRY_RE.match(ln)]
    entries = [ln for ln in body if ENTRY_RE.match(ln)]
    if entry in entries:
        return False

    entries.append(entry)
    entries.sort(key=str.casefold)

    # Normalise intro: drop trailing blanks, keep header comment(s).
    while intro and not intro[-1].strip():
        intro.pop()

    rebuilt = intro + [""] + entries
    if end < len(lines):
        # Blank separator before the next header only; at end of file it
        # would leave a stray empty line after the final newline.
        rebuilt.append("")
    lines[start + 1:end] = rebuilt
    return True


def main() -> int:
    """Command-line entry point; see the file header for usage.

    Reads ``--file`` (default ``wiki/index.md``), bumps ``last_updated`` in
    its frontmatter, inserts ``--entry`` into ``--section`` and writes the
    file back. The file is not rewritten when the entry is already present.

    Returns:
        Process exit status: 0 on success or duplicate, 1 on error.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--section", required=True,
                    help="Section name, e.g. Sources / Entities / Concepts / Queries / Conflicts")
    ap.add_argument("--entry", required=True, help="Full index line to insert")
    ap.add_argument("--file", default="wiki/index.md")
    args = ap.parse_args()

    # An entry that is not a single "- [[" line would be written now and then
    # treated as intro text (hoisted above the entries) on the next run.
    if "\n" in args.entry or "\r" in args.entry or not ENTRY_RE.match(args.entry):
        print("index-append: --entry must be a single line starting with '- [['",
              file=sys.stderr)
        return 1

    try:
        with open(args.file, encoding="utf-8") as fh:
            lines = fh.read().splitlines()
    except FileNotFoundError:
        print(f"index-append: not found: {args.file}", file=sys.stderr)
        return 1

    # 1. Bump last_updated inside the leading frontmatter block
    _bump_last_updated(lines, datetime.date.today().isoformat())

    # 2. Locate the target section [start, end)
    bounds = _section_bounds(lines, args.section)
    if bounds is None:
        print(f"index-append: section '{args.section}' not found in {args.file}",
              file=sys.stderr)
        return 1

    # 3. Insert + sort; skip the write entirely on an exact duplicate
    if not _insert_entry(lines, bounds[0], bounds[1], args.entry):
        print("index-append: entry already present, skipping")
        return 0

    with open(args.file, "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")

    print(f"index-append: added to {args.section}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/log-append.sh
# Append one entry to the append-only ledger wiki/log.md, in the exact format
# defined by AGENTS.md / wiki-log.md. Generates run_id. Never edits prior entries.
#
#   log-append.sh --type INGEST --subject "<slug>" --model "<model-id>" \
#       --context "[[raw/x]]" --output "[[sources/x]]" \
#       --reasoning "One sentence."
#
# Env:    LOG_FILE overrides the ledger path (default: wiki/log.md).
# Stdout: one line, `run_id=<uuid>`, so the caller can correlate the entry.
# Exit:   1 on an unknown/incomplete option, a missing --type/--subject, an
#         invalid --type, a missing ledger file, or when no UUID can be made.
# =============================================================================
set -euo pipefail

LOG_FILE="${LOG_FILE:-wiki/log.md}"
CRLF=$'\r\n'

type="" subject="" model="" context="" output="" reasoning=""
while [[ $# -gt 0 ]]; do
  flag="$1"
  case "$flag" in
    --type|--subject|--model|--context|--output|--reasoning) ;;
    *) echo "log-append: unknown arg: $flag" >&2; exit 1 ;;
  esac

  # Without this guard a trailing flag dies on "$2: unbound variable" (set -u),
  # which does not say which option was incomplete.
  [[ $# -ge 2 ]] || { echo "log-append: missing value for ${flag}" >&2; exit 1; }

  # Each ledger field occupies exactly one line. Fold CR/LF to spaces so a
  # value cannot spill onto a new line and look like a heading or extra field.
  value="${2//[$CRLF]/ }"

  case "$flag" in
    --type)      type="$value" ;;
    --subject)   subject="$value" ;;
    --model)     model="$value" ;;
    --context)   context="$value" ;;
    --output)    output="$value" ;;
    --reasoning) reasoning="$value" ;;
  esac
  shift 2
done

: "${type:?--type required}"
: "${subject:?--subject required}"

case "$type" in
  INGEST|LINT|QUERY|CONFLICT|CONFIG|SECURITY) ;;
  *) echo "log-append: invalid TYPE '${type}'" >&2; exit 1 ;;
esac

[[ -f "$LOG_FILE" ]] || { echo "log-append: not found: $LOG_FILE" >&2; exit 1; }

# uuidgen first; fall back to the Linux kernel's random UUID file.
run_id="$(uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid 2>/dev/null)" \
  || { echo "log-append: cannot generate run_id (no uuidgen, no /proc uuid)" >&2; exit 1; }
today="$(date +%Y-%m-%d)"

# One grouped redirect => the whole entry is appended in a single open of the file.
{
  printf '\n## [%s] %s | %s\n\n' "$today" "$type" "$subject"
  printf -- '- run_id: `%s`\n' "$run_id"
  printf -- '- model: `%s`\n' "${model:-unknown}"
  printf -- '- context_read: %s\n' "${context:-*(none)*}"
  printf -- '- output_written: %s\n' "${output:-*(none)*}"
  printf -- '- reasoning: %s\n' "${reasoning:-No reasoning provided.}"
} >> "$LOG_FILE"

echo "run_id=${run_id}"
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/open-pr.sh
# Branch, commit (conventional), push, and open a Forgejo PR for the wiki/ changes.
# Mirrors the API conventions of providers/forgejo.sh (token auth + http_code).
# Runs inside the genome checkout (cwd = genome root). Never touches main.
#
#   open-pr.sh --slug <slug> --title "feat: ingest <slug>" --body-file <path> \
#              [--base main] [--label CONFLICT]
#
# Requires env: FORGEJO_URL, FORGEJO_USER, FORGEJO_TOKEN.
#
# Stdout: "PR opened: <url>" when the PR was created (HTTP 201).
# Exit:   0 when the PR was created or already existed (HTTP 409);
#         1 on bad arguments, nothing staged under wiki/, or an API failure.
#         Labelling is best-effort and never changes the exit status.
# =============================================================================
set -euo pipefail

: "${FORGEJO_URL:?missing FORGEJO_URL}"
: "${FORGEJO_USER:?missing FORGEJO_USER}"
: "${FORGEJO_TOKEN:?missing FORGEJO_TOKEN}"

slug="" title="" body_file="" base="main" label=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --slug|--title|--body-file|--base|--label)
      # Clear message instead of "$2: unbound variable" from set -u.
      [[ $# -ge 2 ]] || { echo "open-pr: missing value for $1" >&2; exit 1; }
      ;;
    *) echo "open-pr: unknown arg: $1" >&2; exit 1 ;;
  esac
  case "$1" in
    --slug)      slug="$2" ;;
    --title)     title="$2" ;;
    --body-file) body_file="$2" ;;
    --base)      base="$2" ;;
    --label)     label="$2" ;;
  esac
  shift 2
done

: "${slug:?--slug required}"
: "${title:?--title required}"
: "${body_file:?--body-file required}"
[[ -f "$body_file" ]] || { echo "open-pr: body file not found: $body_file" >&2; exit 1; }

branch="feat/ai-ingest-${slug}"
origin_url="$(git config --get remote.origin.url)" \
  || { echo "open-pr: no remote 'origin' configured" >&2; exit 1; }
repo="$(basename -s .git "$origin_url")"
api="${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}"
auth="Authorization: token ${FORGEJO_TOKEN}"

# 1. Branch + commit + push (AGENTS.md rule 5: never commit to main)
git switch -c "$branch" 2>/dev/null || git switch "$branch"
git add wiki/
if git diff --cached --quiet; then
  echo "open-pr: nothing staged under wiki/ — aborting" >&2
  exit 1
fi
git commit -m "$title"
git push -u origin "$branch"

# 2. Open the PR via Forgejo API (jq builds the JSON safely)
body="$(cat "$body_file")"
payload="$(jq -n --arg head "$branch" --arg base "$base" \
  --arg title "$title" --arg body "$body" \
  '{head:$head, base:$base, title:$title, body:$body}')"

# The HTTP status is appended as the last line of the captured response.
resp="$(curl -s -w '\n%{http_code}' \
  -H "$auth" \
  -H "Content-Type: application/json" \
  -X POST "${api}/pulls" \
  -d "$payload")" \
  || { echo "open-pr: request to Forgejo failed (curl exit $?)" >&2; exit 1; }

code="$(printf '%s' "$resp" | tail -n1)"
json="$(printf '%s' "$resp" | sed '$d')"

number=""
case "$code" in
  201)
    url="$(printf '%s' "$json" | jq -r '.html_url')"
    number="$(printf '%s' "$json" | jq -r '.number')"
    echo "PR opened: ${url}"
    ;;
  409)
    echo "open-pr: a PR for '${branch}' already exists — push updated the branch." >&2
    exit 0
    ;;
  401)
    echo "open-pr: unauthorized — check FORGEJO_TOKEN (n8n-bot)." >&2
    exit 1
    ;;
  *)
    echo "open-pr: Forgejo API HTTP ${code}: ${json}" >&2
    exit 1
    ;;
esac

# 3. Optional label (e.g. CONFLICT). Best-effort; non-fatal.
#    Every command below is guarded: under `set -e` + `pipefail` an unguarded
#    failure here would abort with a non-zero status even though the PR exists.
if [[ -n "$label" && "$number" =~ ^[0-9]+$ ]]; then
  # Tolerate a non-array reply (e.g. an error object) instead of a jq error.
  label_id="$(curl -s -H "$auth" "${api}/labels" \
    | jq -r --arg n "$label" \
        'if type == "array" then (map(select(.name == $n)) | first | .id // empty) else empty end' \
        2>/dev/null)" || label_id=""

  if [[ "$label_id" =~ ^[0-9]+$ ]]; then
    # curl -s exits 0 on HTTP errors, so judge success by the status code.
    label_code="$(curl -s -o /dev/null -w '%{http_code}' \
      -H "$auth" -H "Content-Type: application/json" \
      -X POST "${api}/issues/${number}/labels" \
      -d "{\"labels\":[${label_id}]}")" || label_code="000"
    if [[ "$label_code" == 2?? ]]; then
      echo "label '${label}' applied" >&2
    else
      echo "open-pr: applying label '${label}' failed (HTTP ${label_code}) — skipped." >&2
    fi
  else
    echo "open-pr: label '${label}' not found in repo — skipped." >&2
  fi
fi

exit 0
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/run-ingest.sh
# Post-pi orchestrator. Runs OUTSIDE pi's loop, on vm101, in the genome checkout.
# Consumes .ingest-manifest.json (written by the ingest skill) and performs every
# deterministic step — index, log, scoped lint, PR — so pi's context stays clean.
#
#   run-ingest.sh <genome> [manifest_path]
#
# Emits a single JSON result line on stdout for n8n to parse. Helper output is
# sent to stderr or captured so that line stays the only thing on stdout.
# Exit: 0 when the PR step succeeded, 1 otherwise.
# =============================================================================
set -euo pipefail

genome="${1:?usage: run-ingest.sh <genome> [manifest]}"
manifest="${2:-.ingest-manifest.json}"
SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Print the error result line (compact JSON) and stop. $1 = stage, $2 = reason.
fail() {
  jq -cn --arg stage "$1" --arg reason "$2" \
    '{status:"error", stage:$stage, reason:$reason}'
  exit 1
}

# The PR body lives in a temp file; remove it on every exit path.
body=""
cleanup() { if [[ -n "$body" ]]; then rm -f "$body"; fi; }
trap cleanup EXIT

command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq missing"}'; exit 1; }
command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)"
[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"
# Reject unparsable input here; otherwise the first jq call below would abort
# the script (set -e) before any result line is printed.
jq -e 'type == "object"' "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "manifest is not a JSON object: ${manifest}"

# --- read manifest scalars ---
raw_source="$(jq -r '.raw_source // ""' "$manifest")"
model="$(jq -r '.model // "unknown"' "$manifest")"
reasoning="$(jq -r '.reasoning // "Ingest."' "$manifest")"
pr_summary="$(jq -r '.pr_summary // "Ingest."' "$manifest")"
contradictions="$(jq -r '.contradictions // "None"' "$manifest")"

[[ -n "$raw_source" && "$raw_source" != "null" ]] || fail "manifest" "raw_source missing"

slug="$(bash "${SCRIPTS}/slug.sh" "$raw_source")" || fail "slug" "slug.sh failed for ${raw_source}"
[[ -n "$slug" ]] || fail "slug" "empty slug derived from ${raw_source}"

# --- collect touched paths (created first, then modified) ---
# One jq call fills the array directly, so no empty array is ever expanded
# (which is an "unbound variable" error under set -u on older bash).
mapfile -t all_paths < <(jq -r '
  (.pages // []) as $p
  | ($p | map(select(.status == "created"))) + ($p | map(select(.status == "modified")))
  | .[].path // empty' "$manifest")
[[ ${#all_paths[@]} -gt 0 ]] || fail "manifest" "no pages reported"

conflict_label=""

# --- 1. index entries (created pages only), inserted in order ---
# Fields are joined with US (0x1f), not TAB: TAB is IFS whitespace, so `read`
# would merge adjacent tabs and an empty summary would shift maturity left.
while IFS=$'\x1f' read -r path summary maturity; do
  [[ -z "$path" ]] && continue
  link="${path#wiki/}"; link="${link%.md}"          # e.g. sources/foo
  folder="${link%%/*}"
  case "$folder" in
    sources)  section="Sources" ;;
    entities) section="Entities" ;;
    concepts) section="Concepts" ;;
    queries)
      if [[ "$link" == queries/conflict-* ]]; then section="Conflicts"; conflict_label="CONFLICT"
      else section="Queries"; fi ;;
    *) section="Sources" ;;
  esac

  if [[ "$section" == "Conflicts" ]]; then
    entry="- [[${link}]]"                            # conflicts: slug only
  else
    entry="- [[${link}]] — ${summary} \`maturity: ${maturity}\`"
  fi

  # index-append.py reports progress on stdout; keep ours clean for n8n.
  python3 "${SCRIPTS}/index-append.py" --section "$section" --entry "$entry" >&2 \
    || fail "index" "index-append failed for ${path}"
done < <(jq -r '.pages[] | select(.status == "created")
  | [(.path // ""), (.summary // ""), (.maturity // "draft")]
  | map(tostring | gsub("[\t\r\n\u001f]+"; " ")) | join("\u001f")' "$manifest")

# --- 2. log entry ---
# Helpers are run through `bash`: this change set adds them with mode 100644,
# so executing them directly would fail with "Permission denied".
out="$(jq -r '[.pages[].path | "[[" + (sub("^wiki/";"") | sub("\\.md$";"")) + "]]"] | join(", ")' "$manifest")"
log_out="$(bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model" \
  --context "[[${raw_source}]]" --output "${out:-*(none)*}" --reasoning "$reasoning")" \
  || fail "log" "log-append failed"
run_id="$(printf '%s\n' "$log_out" | sed -n 's/^run_id=//p' | head -n1)"

# --- 3. scoped lint (capture findings for the PR; never aborts the run) ---
lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$?

# --- 4. assemble the PR body (manifest tables + lint results) ---
body="$(mktemp)"
{
  echo "## Summary"
  echo "$pr_summary"
  echo ""
  echo "## Pages"
  echo "| Path | Status | Maturity |"
  echo "|------|--------|----------|"
  jq -r '.pages[] | "| `\(.path)` | \(.status) | \(.maturity // "draft") |"' "$manifest"
  echo ""
  echo "## Contradictions"
  echo "$contradictions"
  echo ""
  echo "## Scoped Lint (post-ingest)"
  echo '```'
  echo "$lint_out"
  echo '```'
} > "$body"

# --- 5. open the PR ---
pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" )
[[ -n "$conflict_label" ]] && pr_args+=( --label "$conflict_label" )
pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$?
pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)"

# --- final result line for n8n (-c: compact, one line) ---
jq -cn \
  --arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \
  --arg slug "$slug" \
  --arg run_id "$run_id" \
  --arg pr_url "$pr_url" \
  --argjson lint_clean "$([[ $lint_rc -eq 0 ]] && echo true || echo false)" \
  --argjson conflict "$([[ -n "$conflict_label" ]] && echo true || echo false)" \
  --arg detail "$pr_out" \
  '{status:$status, slug:$slug, run_id:$run_id, pr_url:$pr_url, lint_clean:$lint_clean, conflict:$conflict, detail:$detail}'

[[ $pr_rc -eq 0 ]]
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/scoped-lint.sh
# Run the framework's validation on ONLY the files touched this session.
# Reuses lib/lint.sh + lib/output.sh — same checks as `make lint`, scoped.
#
#   KG_LIB_DIR=/opt/knowledge-genome-setup/lib \
#     scoped-lint.sh <genome> wiki/sources/x.md wiki/entities/y.md
#
# Exits non-zero if any hard error is found, so the caller notices.
# Findings are printed (stderr from the lint functions + a summary on stdout).
#
# NOTE(review): warn, lint_markdown_file, check_privacy_consistency,
# check_page_size, check_knowledge_decay and check_broken_links are expected to
# be defined by the two sourced lib files; their contracts are not visible here.
# =============================================================================
set -euo pipefail

: "${KG_LIB_DIR:?set KG_LIB_DIR to the framework lib/ dir (e.g. /opt/knowledge-genome-setup/lib)}"

# shellcheck source=/dev/null
source "${KG_LIB_DIR}/output.sh"
# shellcheck source=/dev/null
source "${KG_LIB_DIR}/lint.sh"

genome="${1:?usage: scoped-lint.sh <genome> <file>...}"
shift
[[ $# -gt 0 ]] || { echo "scoped-lint: no files given" >&2; exit 1; }

errors=0
stale=0
checked=0
missing=0

for f in "$@"; do
  if [[ ! -f "$f" ]]; then
    warn "scoped-lint: missing file (skipped): $f"
    missing=$(( missing + 1 ))
    continue
  fi
  checked=$(( checked + 1 ))

  # `cmd && x=0 || x=$?` records the status without tripping `set -e`.
  # The statuses are summed as-is — presumably each check returns its number
  # of findings (or 0/1); confirm against lib/lint.sh.
  lint_markdown_file "$f" "$genome" && fe=0  || fe=$?
  check_privacy_consistency "$f"    && pce=0 || pce=$?
  check_page_size "$f"              && pse=0 || pse=$?
  errors=$(( errors + fe + pce + pse ))

  check_knowledge_decay "$f" && st=0 || st=$?
  stale=$(( stale + st ))

  check_broken_links "$f" || true   # warnings only
done

# Count only files that were actually linted; skipped ones are reported
# separately so the summary does not overstate coverage.
summary="scoped-lint: ${errors} error(s), ${stale} stale across ${checked} file(s)"
if [[ $missing -gt 0 ]]; then
  summary+=" (${missing} missing, skipped)"
fi

echo ""
echo "$summary"

[[ $errors -eq 0 ]]
#!/usr/bin/env bash
# =============================================================================
# skills/ingest/scripts/slug.sh
# Derive a wiki slug from a path, filename, or title string.
#   slug.sh "raw/articles/My Source.md"   -> my-source
#   slug.sh "Some Concept Name"           -> some-concept-name
#
# Prints the slug on stdout. The result is empty when the input contains no
# ASCII letter or digit before its last dot.
# NOTE: everything up to the last "/" and everything from the last "." is
# dropped, so a title containing either character loses that part
# ("Notes v1.2 draft" -> notes-v1).
# =============================================================================
set -euo pipefail

input="${1:?usage: slug.sh <path-or-title>}"

# Strip directory and extension when given a path
base="${input##*/}"
base="${base%.*}"

# LC_ALL=C pins the character classes: in other locales `[:upper:]` and the
# range a-z can cover non-ASCII letters, so the same input could yield
# different slugs on different hosts. The `+` already folds runs of separators
# into one hyphen; the remaining rules trim hyphens at both ends.
printf '%s\n' "$base" \
  | LC_ALL=C tr '[:upper:]' '[:lower:]' \
  | LC_ALL=C sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//'