Compare commits

...

14 commits

5 changed files with 178 additions and 27 deletions

View file

@ -1,5 +1,5 @@
# =============================================================================
# Knowledge Genome - Makefile v. 1.5.0
# Knowledge Genome - Makefile v. 1.6.0
# Orchestrates the setup and management of the knowledge base.
# =============================================================================

View file

@ -1,13 +1,11 @@
#!/bin/bash
# genome-raw-commit <genome>
#
# Commits raw files synchronized by Syncthing into the vault and pushes them to origin/<base>.
# - Committer = n8n-bot (robotic identity responsible for pushing)
# - Author = deduced from the Syncthing device ID (modifiedBy field), resolved via .authors.json.
# Falls back to default values if unknown.
# - One commit per author/device to ensure clear attribution.
# - No-op if no changes are present. Excludes infrastructure files and private folders.
# Commit the raw files that Syncthing has placed in the vault and push them to origin/<base>.
# - Committer = n8n-bot (sole pusher); Author = the person who wrote it (Syncthing modifiedBy -> .authors.json)
# - One commit per author (single-device => one commit). No-op if there is nothing.
# - JSON output built with jq (safe escaping), with a `files` array:
# for each raw -> file, author, local_path, local_url (file://), remote_url (Forgejo web).
set -euo pipefail
genome="${1:?usage: genome-raw-commit <genome>}"
@ -20,6 +18,7 @@ set -a; . "${HOME}/.config/knowledge-genome.env"; set +a
: "${FORGEJO_USER:=n8n-bot}"
: "${FORGEJO_HOST:=127.0.0.1:3001}"
: "${FORGEJO_OWNER:=Keru}"
: "${FORGEJO_WEB_BASE:=https://git.keruhomelab.com}" # human-facing URL for remote links (not the loopback)
: "${SYNCTHING_URL:=http://127.0.0.1:8384}"
: "${COMMITTER_NAME:=n8n-bot}"
: "${COMMITTER_EMAIL:=n8n-bot@homelab}"
@ -38,43 +37,42 @@ git config user.name "$COMMITTER_NAME"
git config user.email "$COMMITTER_EMAIL"
git config commit.gpgsign false
# Scope restricted to raw/ directory. raw/.stignore is omitted via .git/info/exclude
git add -A -- raw/
git reset -q -- raw/.stignore 2>/dev/null || true
grep -qxF 'raw/.stignore' "${vault}/.git/info/exclude" 2>/dev/null || echo 'raw/.stignore' >> "${vault}/.git/info/exclude"
grep -qxF 'raw/.stfolder' "${vault}/.git/info/exclude" 2>/dev/null || echo 'raw/.stfolder' >> "${vault}/.git/info/exclude"
git add -A -- raw/
git reset -q -- raw/.stignore raw/.stfolder 2>/dev/null || true
if git diff --cached --quiet; then
printf '{"status":"noop","genome":"%s"}\n' "$genome"
exit 0
fi
# Map Syncthing device ID to author information (name, email)
resolve_dev() {
# $1 = file path relative to the vault root (e.g., raw/file.txt)
# resolve_dev <path>
#   $1 - staged path relative to the vault, e.g. raw/notes/foo.md
# Asks Syncthing's /rest/db/file endpoint about the file (the leading "raw/" is
# stripped before the lookup) and prints `.local.modifiedBy` from the response.
# Prints nothing when SYNCTHING_API_KEY is unset/empty, when the request fails,
# or when the field is absent. Always returns 0 (best-effort lookup).
resolve_dev() {
  if [[ -z "${SYNCTHING_API_KEY:-}" ]]; then
    return 0
  fi

  local -a request=(
    -fsS
    -H "X-API-Key: ${SYNCTHING_API_KEY}"
    --get "${SYNCTHING_URL}/rest/db/file"
    --data-urlencode "folder=${fid}"
    --data-urlencode "file=${1#raw/}"
  )

  # Errors from either stage are deliberately swallowed: an unknown device just
  # means the caller falls back to the default author.
  curl "${request[@]}" 2>/dev/null \
    | jq -r '.local.modifiedBy // empty' 2>/dev/null \
    || true
}
author_for_dev() {
# $1 = device ID
# author_for_dev <device-id>
#   $1 - device id as returned by resolve_dev (may be empty)
# Prints exactly one record "name<TAB>email" (no trailing newline).
# Reads globals: authors_map (path to the JSON map keyed by device id),
#                DEFAULT_AUTHOR_NAME, DEFAULT_AUTHOR_EMAIL.
# The defaults are used when the device id is empty, the map file is missing or
# unparsable, the device has no entry, or the entry lacks the given field.
author_for_dev() {
  local dev="$1" name="$DEFAULT_AUTHOR_NAME" email="$DEFAULT_AUTHOR_EMAIL"
  local -a fields=()

  if [[ -n "$dev" && -f "$authors_map" ]]; then
    # Single jq pass: line 1 = name, line 2 = email; missing fields become empty
    # lines. A jq failure (bad JSON, non-object entry) simply yields no lines,
    # so it can never abort the caller under `set -e`.
    mapfile -t fields < <(
      jq -r --arg d "$dev" '(.[$d] // {}) | (.name // ""), (.email // "")' \
        "$authors_map" 2>/dev/null
    )
    if [[ -n "${fields[0]:-}" ]]; then
      name="${fields[0]}"
    fi
    if [[ -n "${fields[1]:-}" ]]; then
      email="${fields[1]}"
    fi
  fi

  printf '%s\t%s' "$name" "$email"
}
# Group staged files by author identity
declare -A G_FILES G_NAME G_EMAIL G_DEV
# Collect per-file (relpath, author) and group by author for committing
declare -A G_FILES G_NAME G_EMAIL
declare -a ROWS
while IFS= read -r f; do
[[ -z "$f" ]] && continue
dev="$(resolve_dev "$f")"
IFS=$'\t' read -r aname aemail adev <<< "$(author_for_dev "$dev")"
IFS=$'\t' read -r aname aemail <<< "$(author_for_dev "$dev")"
ROWS+=("${f}"$'\t'"${aname}")
key="${aname} <${aemail}>"
G_FILES["$key"]+="${f}"$'\n'
G_NAME["$key"]="$aname"; G_EMAIL["$key"]="$aemail"; G_DEV["$key"]="$adev"
G_NAME["$key"]="$aname"; G_EMAIL["$key"]="$aemail"
done < <(git diff --cached --name-only -- raw/)
ts="$(date +%Y-%m-%dT%H:%M:%S%z)"
@ -82,20 +80,34 @@ commits=0; summary=""
for key in "${!G_FILES[@]}"; do
mapfile -t files < <(printf '%s' "${G_FILES[$key]}")
short="$(printf '%s\n' "${files[@]}" | sed 's#^raw/##' | paste -sd, -)"
msg="$(printf 'raw(%s): sync %s\n\nAdded-by-device: %s\nSyncthing-device-id: %s\nSource: syncthing-autocommit\nSynced-at: %s\n' \
"$genome" "$short" "${G_DEV[$key]}" "${G_DEV[$key]}" "$ts")"
msg="$(printf 'raw(%s): sync %s\n\nAdded-by: %s\nSource: syncthing-autocommit\nSynced-at: %s\n' \
"$genome" "$short" "${G_NAME[$key]}" "$ts")"
git commit -q --author="$key" -m "$msg" -- "${files[@]}"
commits=$((commits+1))
summary="${summary}${summary:+; }${G_NAME[$key]}:${short}"
done
# Fetch updates from origin to merge upstream modifications before pushing
# Pull in any remote advances (e.g. a merged wiki PR), then push
git fetch -q origin
if git show-ref --verify --quiet "refs/remotes/origin/${GENOME_BASE}"; then
git rebase -q "origin/${GENOME_BASE}" \
|| { git rebase --abort 2>/dev/null || true; printf '{"status":"error","reason":"rebase-conflict","genome":"%s"}\n' "$genome"; exit 1; }
fi
git push -q "$clone_url" "HEAD:${GENOME_BASE}"
head="$(git rev-parse --short HEAD)"
printf '{"status":"ok","genome":"%s","base":"%s","commits":%d","head":"%s","summary":"%s"}\n' \
"$genome" "$GENOME_BASE" $commits "$(git rev-parse --short HEAD)" "$summary"
# `files` array: local (file://) and remote (Forgejo web) link for each committed raw.
#
# build_files_json
#   Reads globals: ROWS (entries "relpath<TAB>author"), vault, FORGEJO_WEB_BASE,
#                  FORGEJO_OWNER, genome, GENOME_BASE.
#   Prints a JSON array with one object per row:
#     file, author, local_path, local_url, remote_url
#   Every path segment of both URLs is percent-encoded, so raw files whose names
#   contain spaces or non-ASCII characters still produce valid links; local_path
#   stays the plain filesystem path.
build_files_json() {
  # All rows go through ONE jq process (one line per row) instead of one jq per
  # row plus a slurp. The select() drops the single empty line printf emits
  # when ROWS has no elements.
  printf '%s\n' "${ROWS[@]}" \
    | jq -R -n \
        --arg vault "$vault" \
        --arg web "$FORGEJO_WEB_BASE" \
        --arg owner "$FORGEJO_OWNER" \
        --arg genome "$genome" \
        --arg base "$GENOME_BASE" '
        def enc: split("/") | map(@uri) | join("/");
        [ inputs
          | select(length > 0)
          | split("\t")
          | .[0] as $file
          | (.[1:] | join("\t")) as $author
          | ($vault + "/" + $file) as $lpath
          | { file: $file,
              author: $author,
              local_path: $lpath,
              local_url: ("file://" + ($lpath | enc)),
              remote_url: ($web + "/"
                           + ([$owner, $genome, "src/branch", $base, $file]
                              | join("/") | enc)) }
        ]'
}

files_json="$(build_files_json)"

# Final result object; jq does all the escaping.
jq -n --arg genome "$genome" --arg base "$GENOME_BASE" --argjson commits "$commits" \
  --arg head "$head" --arg summary "$summary" --argjson files "$files_json" \
  '{status:"ok", genome:$genome, base:$base, commits:$commits, head:$head, summary:$summary, files:$files}'

60
deploy/vm101/README.md Normal file
View file

@ -0,0 +1,60 @@
# deploy/vm101
System artifacts deployed to **vm101** (the GPU ingest node). The repo is the
source of truth; the live copies live in `/usr/local/bin/`. Edit here, then
`sudo ./install.sh` on vm101 to push changes.
## Contents
- `n8n-pi-wrap` — forced-command wrapper that fronts every n8n→vm101 SSH call.
- `install.sh` — installs the wrapper(s) into `/usr/local/bin` (idempotent).
## n8n-pi-wrap
The only entry point for the `n8n-runner` identity onto vm101. n8n never gets a
shell here: whatever it sends arrives as `SSH_ORIGINAL_COMMAND`, and a `case`
whitelist decides what runs. Anything outside the whitelist is denied and logged.
Allowed commands:
| Command | What it does |
|---|---|
| `pi run` | one-shot prompt via stdin (proof-of-life / health) |
| `pi ingest <genome> <raw_path>` | the real two-phase ingest (below) |
| `ollama list` / `ollama ps` | model introspection |
### The two-phase ingest
`pi ingest` runs the clean-start + two phases, then stops:
1. **Clean start** — `git fetch && switch <INGEST_BASE> && reset --hard origin/<base>`.
Destroys only vm101's *scratch* checkout (never a shared branch, never a
force-push) — this determinism is by design.
2. **Semantic** — `skills/ingest/scripts/ingest-semantic.py <genome> <raw_path>`
drives `pi` to WRITE `wiki/*` pages + `.ingest-manifest.json`.
NOTE: this is the script, NOT `pi -p "/skill:ingest ..."` (that form makes the
model reply in chat and write nothing — the classic "manifest not found" trap).
3. **Mechanical** — `skills/ingest/scripts/run-ingest.sh <genome>` validates the
manifest, then index/log/scoped-lint/commit on `feat/ai-ingest-<slug>` and opens
a PR onto `<INGEST_BASE>`. Emits one JSON line `{status,slug,pr_url,...}`.
The PR then waits for the human gate. One raw per session, sequential.
### Input hardening
Both inputs come from `SSH_ORIGINAL_COMMAND`, so both are validated:
- `genome` — kebab lowercase `^[a-z0-9-]+$`.
- `raw_path` — must be under `raw/`, no `..` traversal, restricted charset
`[A-Za-z0-9._/-]`, and the file must exist. Rejected paths return a JSON error.
Config (`INGEST_BASE`, `GENOMES_ROOT`, `INGEST_MODEL`, Forgejo token) is sourced
from `~/.config/knowledge-genome.env` (0600, owner-only).
## Install / update
```bash
# on vm101
cd ~/knowledge-genome-orchestrator/deploy/vm101
sudo ./install.sh
```

8
deploy/vm101/install.sh Executable file
View file

@ -0,0 +1,8 @@
#!/bin/bash
# deploy/vm101/install.sh — install vm101 wrappers from repo -> /usr/local/bin (idempotent).
# Run ON vm101 with sudo: sudo ./install.sh
#
# The wrapper is syntax-checked BEFORE it is copied, so a broken edit can never
# replace the live /usr/local/bin/n8n-pi-wrap.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
src="${here}/n8n-pi-wrap"
dest="/usr/local/bin/n8n-pi-wrap"

if [[ ! -f "$src" ]]; then
  printf 'error: wrapper not found: %s\n' "$src" >&2
  exit 1
fi

# Validate the repo copy first; abort without touching the installed wrapper.
if ! bash -n "$src"; then
  printf 'error: syntax check failed, nothing installed: %s\n' "$src" >&2
  exit 1
fi
echo "syntax: ok"

install -m 0755 "$src" "$dest"
echo "installed: ${dest}"

71
deploy/vm101/n8n-pi-wrap Executable file
View file

@ -0,0 +1,71 @@
#!/bin/bash
# n8n-pi-wrap — command whitelist driven by SSH_ORIGINAL_COMMAND.
#
# The command requested over SSH is matched against a fixed `case` whitelist;
# anything else is logged via logger(1) and refused with exit 1.
#
#   pi run                          prompt read from stdin, handed to `pi -p`
#   pi ingest <genome> <raw_path>   clean start -> semantic step -> mechanical step
#   ollama list | ollama ps         exec the matching ollama subcommand
#
# Validation and stage failures on the ingest path print one JSON error object
# on stdout and exit 1.
set -eu

cmd="${SSH_ORIGINAL_COMMAND:-}"

case "$cmd" in
  "pi run")
    logger -t n8n-pi-wrap "ok: pi run (prompt via stdin)"
    prompt=$(cat)
    exec /usr/local/bin/pi --no-tools --mode json -p "$prompt" </dev/null
    ;;

  "pi ingest "*)
    # Strict positional parse: EXACTLY `pi ingest <genome> <raw_path>` (two tokens).
    rest="${cmd#pi ingest }"
    genome="${rest%% *}"
    raw_path="${rest#* }"
    # reject: missing second token, or any extra token (a space left in raw_path)
    if [ "$genome" = "$rest" ] || [ -z "$raw_path" ] || [ "$raw_path" != "${raw_path#* }" ]; then
      echo '{"status":"error","reason":"usage: pi ingest <genome> <raw_path>"}'
      exit 1
    fi

    # genome slug: kebab lowercase only
    case "$genome" in
      "" | *[!a-z0-9-]*)
        echo '{"status":"error","reason":"invalid genome name"}'
        exit 1
        ;;
    esac

    # raw_path whitelist: MUST live under raw/, no traversal, restricted charset.
    #  - must start with "raw/" (which also excludes absolute paths)
    #  - no ".." and no "//" anywhere
    #  - allowed chars: [A-Za-z0-9._/-] (kebab slugs + subdirs like raw/articles/foo.md)
    case "$raw_path" in
      raw/*) : ;;
      *)
        echo '{"status":"error","reason":"raw_path must be under raw/"}'
        exit 1
        ;;
    esac
    case "$raw_path" in
      *..* | *//*)
        echo '{"status":"error","reason":"raw_path traversal"}'
        exit 1
        ;;
    esac
    case "$raw_path" in
      *[!A-Za-z0-9._/-]*)
        echo '{"status":"error","reason":"raw_path illegal chars"}'
        exit 1
        ;;
    esac

    logger -t n8n-pi-wrap "ok: pi ingest ${genome} ${raw_path}"

    set -a
    . "${HOME}/.config/knowledge-genome.env"
    set +a

    # An empty GENOMES_ROOT would turn the cd below into "/<genome>".
    if [ -z "${GENOMES_ROOT:-}" ]; then
      echo '{"status":"error","reason":"GENOMES_ROOT not set"}'
      exit 1
    fi
    cd "${GENOMES_ROOT}/${genome}" || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }

    # Clean start on the configured base, pinned to the remote. Destroys only this
    # scratch checkout (never a shared branch, never a force-push) — this is by design.
    # Each step is checked explicitly: `set -e` does NOT abort when a non-final
    # command of an `&&` list fails, so a failed fetch/switch would otherwise let
    # the ingest run silently on a stale tree.
    base="${INGEST_BASE:-main}"
    if ! git fetch -q origin \
      || ! git switch -q "$base" 2>/dev/null \
      || ! git reset -q --hard "origin/${base}"; then
      echo '{"status":"error","stage":"clean-start","reason":"clean start failed"}'
      exit 1
    fi

    # The raw file must actually exist under the genome's raw/ dir. Checked AFTER
    # the reset so the test is made against the tree that is about to be ingested.
    [ -f "$raw_path" ] || { echo '{"status":"error","reason":"raw file not found"}'; exit 1; }

    # SEMANTIC step: dedicated script drives pi to WRITE wiki pages + manifest.
    # (NOT `pi -p "/skill:ingest ..."`, which makes the model reply in chat and write nothing.)
    # The log file is kept on failure and its path is reported in the error JSON.
    log="$(mktemp -t pi-ingest.XXXXXX.log)"
    "${HOME}/.pi/agent/skills/ingest/scripts/ingest-semantic.py" "${genome}" "${raw_path}" \
      >"$log" 2>&1 \
      || { echo "{\"status\":\"error\",\"stage\":\"semantic\",\"reason\":\"ingest-semantic failed\",\"log\":\"${log}\"}"; exit 1; }

    # MECHANICAL step: validate manifest -> index/log/scoped-lint/commit/PR -> 1 JSON line
    exec "${HOME}/.pi/agent/skills/ingest/scripts/run-ingest.sh" "${genome}"
    ;;

  "ollama list")
    logger -t n8n-pi-wrap "ok: ollama list"
    exec /usr/local/bin/ollama list
    ;;

  "ollama ps")
    logger -t n8n-pi-wrap "ok: ollama ps"
    exec /usr/local/bin/ollama ps
    ;;

  *)
    logger -t n8n-pi-wrap "denied: ${cmd:-<empty>}"
    echo "unauthorized command" >&2
    exit 1
    ;;
esac