feat: enhance ingest workflow with validation, PR control, and robustness

This commit is contained in:
Matteo Cherubini 2026-06-05 10:37:09 +02:00
parent ddf34944e0
commit 93bc5bb007
3 changed files with 62 additions and 4 deletions

View file

@ -80,7 +80,15 @@ case "$code" in
echo "PR opened: ${url}"
;;
409)
  # HTTP 409: a PR for this branch already exists. Look it up so the caller
  # still receives a "PR opened: <url>" line on stdout. The lookup is strictly
  # best-effort: any curl/jq failure falls back to a stderr notice and this arm
  # still exits 0.
  # NOTE(review): only the first page of open PRs is inspected; on a repo with
  # more open PRs than the server's page size the URL may not be found and the
  # fallback notice is printed instead — confirm whether paging is needed.
  existing=""
  # -f makes curl fail on HTTP errors instead of handing an error body to jq.
  if pulls_json="$(curl --max-time 15 -sf \
      -H "Authorization: token ${FORGEJO_TOKEN}" \
      "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls?state=open")"; then
    # first(...) keeps only the first matching PR; "// empty" drops a null URL.
    existing="$(jq -r --arg b "$branch" \
      'first(.[] | select(.head.ref == $b) | .html_url // empty)' \
      <<<"$pulls_json" 2>/dev/null)" || existing=""
  fi
  if [[ -n "$existing" ]]; then
    echo "PR opened: ${existing}"
  else
    echo "open-pr: a PR for '${branch}' already exists (push updated the branch)." >&2
  fi
  exit 0
  ;;
401)

View file

@ -25,6 +25,18 @@ command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq mis
# --- preflight: required tools and manifest presence ---
command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)"
[[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"
# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) ---
# Both checks are positive assertions ("jq -e <all good> || fail"), so a jq
# runtime error (malformed JSON, unexpected element type) is treated exactly
# like a failed check: validation fails closed.
# 1) well-formed JSON object with a string raw_source and an array of pages
jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \
  "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "invalid manifest: need object with string raw_source and array pages"
# 2) every page must be an object whose .path is a string, lives under wiki/,
#    and contains no '..' (no traversal). An empty pages array passes.
jq -e 'all(.pages[];
           type=="object"
           and (.path | (type=="string"
                         and startswith("wiki/")
                         and (test("\\.\\.") | not))))' \
  "$manifest" >/dev/null 2>&1 \
  || fail "manifest" "unsafe page path (must be a string under wiki/, no '..')"
# --- read manifest scalars ---
raw_source="$(jq -r '.raw_source' "$manifest")"
# model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent cannot know its
@ -88,6 +100,7 @@ lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )
# --- 4. assemble the PR body (manifest tables + lint results) ---
# The body is staged in a private temp file. If mktemp fails, stop with a
# structured error instead of later redirecting into an empty filename.
# NOTE(review): "pr-body" follows the fail "<stage>" "<reason>" call shape used
# elsewhere in this script — confirm downstream consumers accept a new stage label.
body="$(mktemp)" || fail "pr-body" "mktemp failed: cannot create PR body temp file"
trap 'rm -f -- "$body"' EXIT   # auto-clean on any exit (success, fail(), or crash)
{
echo "## Summary"
echo "$pr_summary"
@ -107,13 +120,11 @@ body="$(mktemp)"
} > "$body"
# --- 5. open the PR ---
# Arguments for open-pr.sh. The PR base branch defaults to "main" and can be
# overridden through INGEST_BASE.
pr_args=(
  --slug "$slug"
  --title "feat: ingest ${slug}"
  --body-file "$body"
  --base "${INGEST_BASE:-main}"
)
if [[ -n "$conflict_label" ]]; then
  pr_args+=( --label "$conflict_label" )
fi
# Capture open-pr.sh's combined stdout+stderr and its exit status; the "|| pr_rc=$?"
# form records a failure without aborting, so the result line can still be emitted.
pr_rc=0
pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" || pr_rc=$?
# The PR URL is taken from the first "PR opened: <url>" line, if any.
pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)"
# No explicit rm of "$body" here: the EXIT trap set when it was created removes it.
# --- final result line for n8n ---
jq -nc \
--arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \

View file

@ -132,3 +132,42 @@ EOF
[[ "$output" == *'"status":"ok"'* ]]
grep -q 'qwen-test-tag' wiki/log.md
}
# Regression test for manifest path validation: a page path that begins with
# wiki/ but climbs out of it through '..' must make run-ingest.sh fail.
@test "run-ingest: rejects a manifest path that escapes wiki/ (traversal)" {
command -v jq >/dev/null 2>&1 || skip "jq not installed"
# make_fixture_genome is defined outside this view; presumably it creates a
# throwaway genome directory and prints its path — confirm against the helper.
G="$(make_fixture_genome)"; cd "$G"
# Manifest with a single page whose path is wiki/../etc/passwd.
cat > .ingest-manifest.json <<'EOF'
{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None",
"pages":[{"path":"wiki/../etc/passwd","summary":"x","maturity":"draft","status":"created"}] }
EOF
# Placeholder Forgejo settings; DRY_RUN=1 presumably keeps the run offline — TODO confirm.
export KG_LIB_DIR="$LIB_DIR" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1
run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
# Expect a non-zero exit status and an error status in the captured output.
[ "$status" -ne 0 ]
[[ "$output" == *'"status":"error"'* ]]
}
# Checks that an INGEST_BASE override reaches the run: with INGEST_BASE=develop
# the script must succeed and its output must mention "develop".
@test "run-ingest: honours INGEST_BASE for the PR base" {
command -v jq >/dev/null 2>&1 || skip "jq not installed"
# make_fixture_genome is defined outside this view; presumably it creates a
# throwaway genome directory and prints its path — confirm against the helper.
G="$(make_fixture_genome)"; cd "$G"
# Source page with front matter; its path is the one the manifest below refers to.
cat > wiki/sources/test-source.md <<'EOF'
---
title: "Test Source"
type: source
domain: genome-test
tags: [t]
maturity: draft
last_updated: 2026-06-04
private: false
---
body
EOF
# Manifest listing that single page, with a path under wiki/ and no '..'.
cat > .ingest-manifest.json <<'EOF'
{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None",
"pages":[{"path":"wiki/sources/test-source.md","summary":"s","maturity":"draft","status":"created"}] }
EOF
# Placeholder Forgejo settings; DRY_RUN=1 presumably keeps the run offline — TODO confirm.
export KG_LIB_DIR="$LIB_DIR" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1
# Override the PR base branch for this run.
export INGEST_BASE="develop"
run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
[ "$status" -eq 0 ]
# NOTE(review): this only checks that "develop" appears somewhere in the output,
# not that it was passed as the PR base — consider a tighter match.
[[ "$output" == *"develop"* ]]
}