From 918d632b41512e91c371005cf60aa560aafae70f Mon Sep 17 00:00:00 2001
From: Matteo Cherubini
Date: Sat, 27 Jun 2026 12:15:59 +0200
Subject: [PATCH] feat: Implement `pending-raw.sh` to identify changed sources

Classify every raw/articles/*.md of a genome as new or modified against
wiki/sources/<slug>.md and print the result as a JSON envelope.

---
Review note: hunk sizes and the diffstat reflect the revised script; the blob
id on the pending-raw.sh index line was not recomputed.

 skills/ingest/scripts/pending-raw.sh | 85 ++++++++++++++++++++++++++
 tests/pending-raw.bats               | 90 ++++++++++++++++++++++++++++
 tests/permissions.bats               |  1 +
 3 files changed, 176 insertions(+)
 create mode 100755 skills/ingest/scripts/pending-raw.sh
 create mode 100644 tests/pending-raw.bats

diff --git a/skills/ingest/scripts/pending-raw.sh b/skills/ingest/scripts/pending-raw.sh
new file mode 100755
index 0000000..0bd1a21
--- /dev/null
+++ b/skills/ingest/scripts/pending-raw.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# =============================================================================
+# pending-raw.sh — deterministic "what needs ingesting" calculator.
+# Reads the clean base checkout and classifies each raw/articles/*.md as:
+#   new      -> no wiki/sources/<slug>.md
+#   modified -> page exists but its source_sha256 != current file hash
+# Emits the same JSON envelope as changed-raw (drop-in), plus detail[] for ntfy.
+#
+# Usage: pending-raw.sh <genome>
+# Env:   GENOMES_ROOT  directory holding the genome checkouts (default: $HOME/genomes)
+#        INGEST_BASE   branch that is checked out and hard-reset to origin (default: main)
+# Exit:  1 with {"status":"error",...} if the genome is unknown or the clean start fails.
+# =============================================================================
+set -euo pipefail
+
+genome="${1:?usage: pending-raw.sh <genome>}"
+base_dir="${GENOMES_ROOT:-${HOME}/genomes}"
+base_ref="${INGEST_BASE:-main}"
+
+# Resolve our own directory before any cd: BASH_SOURCE may be a relative path,
+# which would no longer resolve once we are inside the genome checkout.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SLUG="${SCRIPT_DIR}/slug.sh"
+
+cd "${base_dir}/${genome}" 2>/dev/null || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }
+
+# Clean start on the configured base (Step 3 will extract this to lib/clean-start.sh).
+# Checked explicitly: `set -e` ignores a failure of any command in an && list
+# except the last one, so a failed fetch/switch/reset would otherwise go
+# unnoticed and a stale or dirty tree would be classified.
+if ! { git fetch -q origin \
+    && git switch -q "$base_ref" 2>/dev/null \
+    && git reset -q --hard "origin/${base_ref}" \
+    && git clean -q -fd; }; then
+  echo '{"status":"error","reason":"clean start failed"}'
+  exit 1
+fi
+
+declare -a NEW=()
+declare -a MOD=()
+declare -A SEEN_SLUG=()
+
+if [[ -d raw/articles ]]; then
+  while IFS= read -r -d '' f; do
+    rel="${f#./}"
+    case "$rel" in
+      */.stfolder/*|*/.stignore|*/.gitkeep) continue ;;
+    esac
+    slug="$("$SLUG" --raw "$rel")" || continue
+    # An empty slug names no page and is not a valid associative-array subscript.
+    [[ -n "$slug" ]] || continue
+
+    # Residual collision (two distinct raws -> same slug): warn, do not silence.
+    # Best effort: a missing or failing logger must not abort the run.
+    if [[ -n "${SEEN_SLUG[$slug]:-}" && "${SEEN_SLUG[$slug]}" != "$rel" ]]; then
+      logger -t pending-raw "warn: slug collision ${slug}: ${SEEN_SLUG[$slug]} <-> ${rel}" \
+        2>/dev/null || true
+    fi
+    SEEN_SLUG[$slug]="$rel"
+
+    page="wiki/sources/${slug}.md"
+    if [[ ! -f "$page" ]]; then
+      NEW+=("$rel")
+    else
+      cur="$(sha256sum "$rel" | cut -d' ' -f1)"
+      rec="$(sed -n 's/^source_sha256:[[:space:]]*//p' "$page" | tr -d '\r' | head -n1)"
+      if [[ "$cur" != "$rec" ]]; then
+        MOD+=("$rel")
+      fi
+    fi
+  # Byte-wise sort: find's traversal order is filesystem-dependent.
+  done < <(find raw/articles -type f -name '*.md' -print0 2>/dev/null | LC_ALL=C sort -z)
+fi
+
+if [[ ${#NEW[@]} -eq 0 && ${#MOD[@]} -eq 0 ]]; then
+  jq -cn --arg g "$genome" '{status: "ok", genome: $g, count: 0, files: [], detail: []}'
+else
+  {
+    # Guarded: expanding an empty array trips `set -u` on bash older than 4.4.
+    if (( ${#NEW[@]} > 0 )); then printf '%s\tnew\n' "${NEW[@]}"; fi
+    if (( ${#MOD[@]} > 0 )); then printf '%s\tmodified\n' "${MOD[@]}"; fi
+  } | jq -R 'split("\t") | {path: .[0], reason: .[1]}' \
+    | jq -s --arg g "$genome" \
+        '{status: "ok", genome: $g, count: length, files: [.[].path], detail: .}'
+fi
diff --git a/tests/pending-raw.bats b/tests/pending-raw.bats
new file mode 100644
index 0000000..f43237f
--- /dev/null
+++ b/tests/pending-raw.bats
@@ -0,0 +1,90 @@
+#!/usr/bin/env bats
+
+setup() {
+  load 'helpers'
+  export PENDING="${SKILL_SCRIPTS}/pending-raw.sh"
+  export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
+  export INGEST_BASE="main"
+
+  g_src="$(make_fixture_genome)"
+  export g_name="fixture-genome"
+  mv "$g_src" "${GENOMES_ROOT}/${g_name}"
+  export g="${GENOMES_ROOT}/${g_name}"
+
+  # FIX: make_fixture_genome ships raw/articles/test.md with no source page, which would
+  # otherwise count as a permanent 'new' and break every count assertion. Clear it so each
+  # test controls exactly what is pending (verified: count base becomes 0).
+  ( cd "$g" && rm -f raw/articles/test.md && git add -A \
+    && git commit -q -m "test: clear default raw" && git push -q )
+}
+
+@test "pending-raw: detects a brand new raw file" {
+  echo "new content" > "${g}/raw/articles/new-file.md"
+  ( cd "$g" && git add . && git commit -q -m "add raw" && git push -q )
+  run bash "$PENDING" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 1'
+  echo "$output" | jq -e '.detail[0].path == "raw/articles/new-file.md"'
+  echo "$output" | jq -e '.detail[0].reason == "new"'
+}
+
+@test "pending-raw: skips up-to-date files" {
+  echo "ok content" > "${g}/raw/articles/ok-file.md"
+  hash_ok="$(sha256sum "${g}/raw/articles/ok-file.md" | cut -d' ' -f1)"
+  cat > "${g}/wiki/sources/ok-file.md" <<EOF
+---
+source_sha256: ${hash_ok}
+---
+EOF
+  ( cd "$g" && git add . && git commit -q -m "ok" && git push -q )
+  run bash "$PENDING" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 0'
+}
+
+@test "pending-raw: detects modified raw file" {
+  echo "v1 content" > "${g}/raw/articles/mod-file.md"
+  hash_v1="$(sha256sum "${g}/raw/articles/mod-file.md" | cut -d' ' -f1)"
+  cat > "${g}/wiki/sources/mod-file.md" <<EOF
+---
+source_sha256: ${hash_v1}
+---
+EOF
+  ( cd "$g" && git add . && git commit -q -m "v1" && git push -q )
+  echo "v2 content" > "${g}/raw/articles/mod-file.md"
+  ( cd "$g" && git add . && git commit -q -m "v2" && git push -q )
+  run bash "$PENDING" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 1'
+  echo "$output" | jq -e '.detail[0].reason == "modified"'
+}
+
+@test "pending-raw: nested subdirectory yields prefixed slug" {
+  mkdir -p "${g}/raw/articles/sub-b"
+  echo "subdir content" > "${g}/raw/articles/sub-b/file.md"
+  ( cd "$g" && git add . && git commit -q -m "subdir" && git push -q )
+  run bash "$PENDING" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 1'
+  echo "$output" | jq -e '.files[0] == "raw/articles/sub-b/file.md"'
+}
+
+@test "pending-raw: excludes noise (.stfolder, .gitkeep)" {
+  touch "${g}/raw/articles/.gitkeep"
+  mkdir -p "${g}/raw/articles/.stfolder"
+  touch "${g}/raw/articles/.stfolder/sync.log"
+  ( cd "$g" && git add . && git commit -q -m "noise" && git push -q )
+  run bash "$PENDING" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 0'
+}
+
+@test "pending-raw: reports both files on a slug collision" {
+  mkdir -p "${g}/raw/articles/cibo"
+  echo "c1" > "${g}/raw/articles/cibo-pane.md"
+  echo "c2" > "${g}/raw/articles/cibo/pane.md"
+  ( cd "$g" && git add . && git commit -q -m "collision" && git push -q )
+  run bash "$PENDING" "$g_name"
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.count == 2'
+}
diff --git a/tests/permissions.bats b/tests/permissions.bats
index c71d32f..ebe9888 100644
--- a/tests/permissions.bats
+++ b/tests/permissions.bats
@@ -17,6 +17,7 @@ EXECUTABLES=(
   skills/ingest/scripts/open-pr.sh
   skills/ingest/scripts/log-append.sh
   skills/ingest/scripts/slug.sh
+  skills/ingest/scripts/pending-raw.sh
   skills/ingest/scripts/index-append.py
   scripts/add-genome.sh
   scripts/setup.sh