feat: Implement pending-raw.sh to identify changed sources
This commit is contained in:
parent
0ff98e1ebd
commit
918d632b41
3 changed files with 155 additions and 0 deletions
64
skills/ingest/scripts/pending-raw.sh
Executable file
64
skills/ingest/scripts/pending-raw.sh
Executable file
|
|
@ -0,0 +1,64 @@
|
||||||
|
#!/usr/bin/env bash
# =============================================================================
# pending-raw.sh — deterministic "what needs ingesting" calculator.
#
# Usage: pending-raw.sh <genome>
#
# Reads the clean base checkout and classifies each raw/articles/*.md as:
#   new      -> no wiki/sources/<slug>.md
#   modified -> page exists but its source_sha256 != current file hash
# Emits the same JSON envelope as changed-raw (drop-in), plus detail[] for ntfy.
#
# Environment:
#   GENOMES_ROOT  directory containing the genome checkouts
#                 (default: $HOME/genomes)
#   INGEST_BASE   branch to switch to and hard-reset onto origin/<branch>
#                 (default: main)
#
# NOTE: the clean start runs `git reset --hard` and `git clean -fd` inside the
# genome checkout, so uncommitted local changes there are discarded.
# =============================================================================
set -euo pipefail

genome="${1:?usage: pending-raw.sh <genome>}"
base_dir="${GENOMES_ROOT:-${HOME}/genomes}"

# Resolve the helper location BEFORE changing directory: BASH_SOURCE[0] may be
# a relative path, which would no longer resolve once we are inside the genome.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SLUG="${SCRIPT_DIR}/slug.sh"

cd "${base_dir}/${genome}" 2>/dev/null \
  || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }

# Clean start on the configured base (Step 3 will extract this to lib/clean-start.sh).
# `set -e` does not fire for a failure in the middle of an && list, so the
# chain's status is checked explicitly; otherwise a failed fetch/switch/reset
# would silently leave us classifying a stale or dirty checkout.
base_ref="${INGEST_BASE:-main}"
if ! {
  git fetch -q origin \
    && git switch -q "$base_ref" 2>/dev/null \
    && git reset -q --hard "origin/${base_ref}" \
    && git clean -q -fd
}; then
  echo '{"status":"error","reason":"clean start failed"}'
  exit 1
fi

# Result accumulators, filled by the classification loop further down.
declare -a NEW=()          # raw files with no source page yet
declare -a MOD=()          # raw files whose recorded hash is out of date
declare -A SEEN_SLUG=()    # slug -> raw path, used for collision warnings
|
||||||
|
|
||||||
|
# pending_warn MESSAGE — report a non-fatal problem.
# Goes to syslog via logger (tag: pending-raw). If logger is missing or fails,
# the message is written to stderr instead, so a warning can never abort the
# run under `set -e` and is never lost.
pending_warn() {
  logger -t pending-raw "warn: $1" 2>/dev/null \
    || printf 'pending-raw: warn: %s\n' "$1" >&2
}

# classify_raw — walk raw/articles (relative to the current directory) and
# append each pending *.md file to NEW or MOD.
# Globals: SLUG (read; helper invoked as `"$SLUG" --raw <path>`),
#          SEEN_SLUG (read/written), NEW and MOD (appended).
# Files are visited in byte-wise sorted order so the output is reproducible.
classify_raw() {
  local f rel slug page cur rec

  [[ -d raw/articles ]] || return 0

  while IFS= read -r -d '' f; do
    rel="${f#./}"
    case "$rel" in
      */.stfolder/* | */.stignore | */.gitkeep) continue ;;
    esac

    # A failed or empty slug cannot be mapped to a page (and an empty key is
    # not a valid associative-array subscript): warn and skip this file.
    if ! slug="$("$SLUG" --raw "$rel")" || [[ -z "$slug" ]]; then
      pending_warn "no slug for ${rel}, skipping"
      continue
    fi

    # Residual collision (two distinct raws -> same slug): warn, do not silence.
    # Both files are still classified, against the same source page.
    if [[ -n "${SEEN_SLUG[$slug]:-}" && "${SEEN_SLUG[$slug]}" != "$rel" ]]; then
      pending_warn "slug collision ${slug}: ${SEEN_SLUG[$slug]} <-> ${rel}"
    fi
    SEEN_SLUG[$slug]="$rel"

    page="wiki/sources/${slug}.md"
    if [[ ! -f "$page" ]]; then
      NEW+=("$rel")
      continue
    fi

    # Hash via stdin: when given a file NAME containing a backslash or newline,
    # sha256sum prefixes the digest with '\', which would never match.
    cur="$(sha256sum < "$rel" | cut -d' ' -f1)"
    # First source_sha256 line of the page; trailing whitespace (including a
    # CR from CRLF files) is dropped before comparing.
    rec="$(sed -n '/^source_sha256:/{s/^source_sha256:[[:space:]]*//;s/[[:space:]]*$//;p;q;}' "$page")"
    [[ "$cur" == "$rec" ]] || MOD+=("$rel")
  done < <(find raw/articles -type f -name '*.md' -print0 2>/dev/null \
    | LC_ALL=C sort -z)
}

classify_raw
|
||||||
|
|
||||||
|
# emit_envelope GENOME — print the JSON result envelope on stdout.
# Globals: NEW, MOD (read) — arrays of raw paths classified as new / modified.
# Output:  {status, genome, count, files[], detail[{path, reason}]}, with the
#          NEW entries first and the MOD entries after them.
# The genome name is always passed through jq so it is JSON-escaped, and the
# arrays are only expanded when non-empty (expanding an empty array is an
# "unbound variable" error under `set -u` on bash < 4.4).
emit_envelope() {
  local g="$1"

  if (( ${#NEW[@]} == 0 && ${#MOD[@]} == 0 )); then
    # Compact single-line form for the empty result.
    jq -cn --arg g "$g" \
      '{status: "ok", genome: $g, count: 0, files: [], detail: []}'
    return
  fi

  {
    # printf reuses its format for every argument: one TSV row per path.
    if (( ${#NEW[@]} )); then printf '%s\tnew\n' "${NEW[@]}"; fi
    if (( ${#MOD[@]} )); then printf '%s\tmodified\n' "${MOD[@]}"; fi
  } | jq -R 'split("\t") | {path: .[0], reason: .[1]}' \
    | jq -s --arg g "$g" \
      '{status: "ok", genome: $g, count: length, files: [.[].path], detail: .}'
}

emit_envelope "$genome"
|
||||||
90
tests/pending-raw.bats
Normal file
90
tests/pending-raw.bats
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
# Per-test fixture: builds a throwaway genome under BATS_TEST_TMPDIR and points
# pending-raw.sh at it through GENOMES_ROOT / INGEST_BASE.
setup() {
  load 'helpers'

  export PENDING="${SKILL_SCRIPTS}/pending-raw.sh"
  export GENOMES_ROOT="${BATS_TEST_TMPDIR}"
  export INGEST_BASE="main"

  g_src="$(make_fixture_genome)"
  export g_name="fixture-genome"
  mv "$g_src" "${GENOMES_ROOT}/${g_name}"
  export g="${GENOMES_ROOT}/${g_name}"

  # make_fixture_genome ships raw/articles/test.md without a source page; left
  # in place it would always be reported as 'new' and skew every count
  # assertion. Remove it and publish the removal so each test starts from a
  # pending count of 0 and controls exactly what is pending.
  rm -f "${g}/raw/articles/test.md"
  git -C "$g" add -A
  git -C "$g" commit -q -m "test: clear default raw"
  git -C "$g" push -q
}
|
||||||
|
|
||||||
|
@test "pending-raw: detects a brand new raw file" {
  # Only the raw article is added; this test creates no wiki/sources page for it.
  printf '%s\n' "new content" > "${g}/raw/articles/new-file.md"
  git -C "$g" add .
  git -C "$g" commit -q -m "add raw"
  git -C "$g" push -q

  run bash "$PENDING" "$g_name"

  [ "$status" -eq 0 ]
  jq -e '.count == 1' <<<"$output"
  jq -e '.detail[0].path == "raw/articles/new-file.md"' <<<"$output"
  jq -e '.detail[0].reason == "new"' <<<"$output"
}
|
||||||
|
|
||||||
|
@test "pending-raw: skips up-to-date files" {
  raw_file="${g}/raw/articles/ok-file.md"
  printf '%s\n' "ok content" > "$raw_file"
  hash_ok="$(sha256sum "$raw_file" | cut -d' ' -f1)"

  # Source page whose recorded hash matches the raw file exactly.
  printf '%s\n' '---' "source_sha256: ${hash_ok}" '---' \
    > "${g}/wiki/sources/ok-file.md"

  git -C "$g" add .
  git -C "$g" commit -q -m "add ok"
  git -C "$g" push -q

  run bash "$PENDING" "$g_name"

  [ "$status" -eq 0 ]
  jq -e '.count == 0' <<<"$output"
}
|
||||||
|
|
||||||
|
@test "pending-raw: flags modified files" {
  raw_file="${g}/raw/articles/mod-file.md"

  # v1: raw file plus a source page recording the v1 hash.
  printf '%s\n' "content v1" > "$raw_file"
  hash_v1="$(sha256sum "$raw_file" | cut -d' ' -f1)"
  printf '%s\n' '---' "source_sha256: ${hash_v1}" '---' \
    > "${g}/wiki/sources/mod-file.md"
  git -C "$g" add .
  git -C "$g" commit -q -m "v1"
  git -C "$g" push -q

  # v2: the raw content changes while the page still records the v1 hash.
  printf '%s\n' "content v2" > "$raw_file"
  git -C "$g" add .
  git -C "$g" commit -q -m "v2"
  git -C "$g" push -q

  run bash "$PENDING" "$g_name"

  [ "$status" -eq 0 ]
  jq -e '.count == 1' <<<"$output"
  jq -e '.detail[0].reason == "modified"' <<<"$output"
}
|
||||||
|
|
||||||
|
@test "pending-raw: nested subdirectory yields prefixed slug" {
  # The assertions below check that the nested file is reported with its full
  # relative path; the slug itself is not inspected here.
  nested_dir="${g}/raw/articles/sub-b"
  mkdir -p "$nested_dir"
  printf '%s\n' "subdir content" > "${nested_dir}/file.md"
  git -C "$g" add .
  git -C "$g" commit -q -m "subdir"
  git -C "$g" push -q

  run bash "$PENDING" "$g_name"

  [ "$status" -eq 0 ]
  jq -e '.count == 1' <<<"$output"
  jq -e '.files[0] == "raw/articles/sub-b/file.md"' <<<"$output"
}
|
||||||
|
|
||||||
|
@test "pending-raw: excludes noise (.stfolder, .gitkeep)" {
  touch "${g}/raw/articles/.gitkeep"
  mkdir -p "${g}/raw/articles/.stfolder"
  touch "${g}/raw/articles/.stfolder/sync.log"
  # The two files above do not end in .md, so the script's find filter drops
  # them before its exclusion list is consulted. A markdown file inside
  # .stfolder is what actually exercises that exclusion: without it this test
  # would still pass if the filter were removed.
  echo "sync residue" > "${g}/raw/articles/.stfolder/conflict.md"
  ( cd "$g" && git add . && git commit -q -m "noise" && git push -q )

  run bash "$PENDING" "$g_name"

  [ "$status" -eq 0 ]
  echo "$output" | jq -e '.count == 0'
}
|
||||||
|
|
||||||
|
@test "pending-raw: reports both files on a slug collision" {
  # Two distinct raw paths (flat "cibo-pane.md" and nested "cibo/pane.md");
  # both must appear in the result.
  articles="${g}/raw/articles"
  mkdir -p "${articles}/cibo"
  printf '%s\n' "c1" > "${articles}/cibo-pane.md"
  printf '%s\n' "c2" > "${articles}/cibo/pane.md"
  git -C "$g" add .
  git -C "$g" commit -q -m "collision"
  git -C "$g" push -q

  run bash "$PENDING" "$g_name"

  [ "$status" -eq 0 ]
  jq -e '.count == 2' <<<"$output"
}
|
||||||
|
|
@ -17,6 +17,7 @@ EXECUTABLES=(
|
||||||
skills/ingest/scripts/open-pr.sh
|
skills/ingest/scripts/open-pr.sh
|
||||||
skills/ingest/scripts/log-append.sh
|
skills/ingest/scripts/log-append.sh
|
||||||
skills/ingest/scripts/slug.sh
|
skills/ingest/scripts/slug.sh
|
||||||
|
skills/ingest/scripts/pending-raw.sh
|
||||||
skills/ingest/scripts/index-append.py
|
skills/ingest/scripts/index-append.py
|
||||||
scripts/add-genome.sh
|
scripts/add-genome.sh
|
||||||
scripts/setup.sh
|
scripts/setup.sh
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue