From 93bc5bb0075f7f6121f9c796530fe72127bf1c76 Mon Sep 17 00:00:00 2001
From: Matteo Cherubini
Date: Fri, 5 Jun 2026 10:37:09 +0200
Subject: [PATCH] feat: enhance ingest workflow with validation, PR control, and robustness

open-pr.sh: when the forge answers 409 (a PR for the branch already
exists), look up the open PR for that branch and print
"PR opened: <url>" so the caller still receives the URL. The lookup is
best effort: a failed or empty lookup falls back to the stderr notice
and the branch still exits 0.

run-ingest.sh:
- validate the manifest before reading any field. The page check fails
  closed: a jq error (for example a pages entry that is not an object)
  rejects the manifest instead of slipping through validation.
- remove the temporary PR body file through an EXIT trap.
- pass --base "${INGEST_BASE:-main}" to open-pr.sh.

tests: cover rejection of a page path escaping wiki/ and the
INGEST_BASE override.
---
 skills/ingest/scripts/open-pr.sh    | 10 +++++++-
 skills/ingest/scripts/run-ingest.sh | 17 ++++++++++---
 tests/run-ingest.bats               | 39 +++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/skills/ingest/scripts/open-pr.sh b/skills/ingest/scripts/open-pr.sh
index 8f8f23b..3e979ff 100644
--- a/skills/ingest/scripts/open-pr.sh
+++ b/skills/ingest/scripts/open-pr.sh
@@ -80,7 +80,15 @@ case "$code" in
     echo "PR opened: ${url}"
     ;;
   409)
-    echo "open-pr: a PR for '${branch}' already exists — push updated the branch." >&2
+    # PR already exists — best-effort URL lookup; a failed lookup must never fail this branch.
+    existing="$(curl --max-time 15 -fs -H "Authorization: token ${FORGEJO_TOKEN}" \
+      "${FORGEJO_URL}/api/v1/repos/${FORGEJO_USER}/${repo}/pulls?state=open" \
+      | jq -r --arg b "$branch" 'first(.[] | select(.head.ref==$b) | .html_url // empty)' 2>/dev/null || true)"
+    if [[ -n "$existing" ]]; then
+      echo "PR opened: ${existing}"
+    else
+      echo "open-pr: a PR for '${branch}' already exists (push updated the branch)." >&2
+    fi
     exit 0
     ;;
   401)
diff --git a/skills/ingest/scripts/run-ingest.sh b/skills/ingest/scripts/run-ingest.sh
index 4ffbd1f..ae5b3af 100644
--- a/skills/ingest/scripts/run-ingest.sh
+++ b/skills/ingest/scripts/run-ingest.sh
@@ -25,6 +25,18 @@ command -v jq >/dev/null 2>&1 || { echo '{"status":"error","reason":"jq mis
 command -v python3 >/dev/null 2>&1 || fail "deps" "python3 missing (needed by index-append.py)"
 [[ -f "$manifest" ]] || fail "manifest" "manifest not found: ${manifest}"
 
+# --- validate the manifest BEFORE trusting any field (LLM output is stochastic) ---
+# 1) well-formed JSON object with a string raw_source and an array of pages
+jq -e 'type=="object" and (.raw_source|type=="string") and (.pages|type=="array")' \
+  "$manifest" >/dev/null 2>&1 \
+  || fail "manifest" "invalid manifest: need object with string raw_source and array pages"
+# 2) fail closed: every page must be an object whose .path is a string under wiki/ without '..'
+if ! jq -e 'all(.pages[]; type=="object" and (.path|type=="string")
+    and (.path|startswith("wiki/")) and (.path|test("\\.\\.")|not))' \
+  "$manifest" >/dev/null 2>&1; then
+  fail "manifest" "unsafe page path (must be a string under wiki/, no '..')"
+fi
+
 # --- read manifest scalars ---
 raw_source="$(jq -r '.raw_source' "$manifest")"
 # model name comes from the orchestrator/wrapper (INGEST_MODEL); the agent cannot know its
@@ -88,6 +100,7 @@ lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )
 
 # --- 4. assemble the PR body (manifest tables + lint results) ---
 body="$(mktemp)"
+trap 'rm -f "$body"' EXIT   # remove the temp body on every exit path (success, fail(), or error exit)
 {
   echo "## Summary"
   echo "$pr_summary"
@@ -107,13 +120,11 @@ body="$(mktemp)"
 } > "$body"
 
 # --- 5. open the PR ---
-pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" )
+pr_args=( --slug "$slug" --title "feat: ingest ${slug}" --body-file "$body" --base "${INGEST_BASE:-main}" )
 [[ -n "$conflict_label" ]] && pr_args+=( --label "$conflict_label" )
 pr_out="$( bash "${SCRIPTS}/open-pr.sh" "${pr_args[@]}" 2>&1 )" && pr_rc=0 || pr_rc=$?
 pr_url="$(printf '%s\n' "$pr_out" | sed -n 's/^PR opened: //p' | head -n1)"
 
-rm -f "$body"
-
 # --- final result line for n8n ---
 jq -nc \
   --arg status "$([[ $pr_rc -eq 0 ]] && echo ok || echo pr_failed)" \
diff --git a/tests/run-ingest.bats b/tests/run-ingest.bats
index 65fdb43..6b7c30b 100644
--- a/tests/run-ingest.bats
+++ b/tests/run-ingest.bats
@@ -132,3 +132,42 @@ EOF
   [[ "$output" == *'"status":"ok"'* ]]
   grep -q 'qwen-test-tag' wiki/log.md
 }
+
+@test "run-ingest: rejects a manifest path that escapes wiki/ (traversal)" {
+  command -v jq >/dev/null 2>&1 || skip "jq not installed"
+  G="$(make_fixture_genome)"; cd "$G"
+  cat > .ingest-manifest.json <<'EOF'
+{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None",
+  "pages":[{"path":"wiki/../etc/passwd","summary":"x","maturity":"draft","status":"created"}] }
+EOF
+  export KG_LIB_DIR="$LIB_DIR" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1
+  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
+  [ "$status" -ne 0 ]
+  [[ "$output" == *'"status":"error"'* ]]
+}
+
+@test "run-ingest: honours INGEST_BASE for the PR base" {
+  command -v jq >/dev/null 2>&1 || skip "jq not installed"
+  G="$(make_fixture_genome)"; cd "$G"
+  cat > wiki/sources/test-source.md <<'EOF'
+---
+title: "Test Source"
+type: source
+domain: genome-test
+tags: [t]
+maturity: draft
+last_updated: 2026-06-04
+private: false
+---
+body
+EOF
+  cat > .ingest-manifest.json <<'EOF'
+{ "raw_source":"raw/articles/test.md","reasoning":"r","pr_summary":"s","contradictions":"None",
+  "pages":[{"path":"wiki/sources/test-source.md","summary":"s","maturity":"draft","status":"created"}] }
+EOF
+  export KG_LIB_DIR="$LIB_DIR" FORGEJO_URL=http://x FORGEJO_USER=u FORGEJO_TOKEN=t DRY_RUN=1
+  export INGEST_BASE="develop"
+  run bash "$SKILL_SCRIPTS/run-ingest.sh" genome-test
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"develop"* ]]
+}