knowledge-genome-orchestrator/skills/ingest/scripts/pending-raw.sh

64 lines
2.5 KiB
Bash
Executable file

#!/usr/bin/env bash
# =============================================================================
# pending-raw.sh — deterministic "what needs ingesting" calculator.
# Reads the clean base checkout and classifies every *.md file found
# (recursively) under raw/articles/ as:
#   new      -> no wiki/sources/<slug>.md
#   modified -> page exists but its source_sha256 != current file hash
# Emits the same JSON envelope as changed-raw (drop-in), plus detail[] for ntfy.
# =============================================================================
set -euo pipefail

# Required positional argument: the genome, a directory name under the
# genomes root (GENOMES_ROOT, defaulting to ~/genomes).
genome="${1:?usage: pending-raw.sh <genome>}"
base_dir="${GENOMES_ROOT:-${HOME}/genomes}"

# Resolve this script's own directory BEFORE changing directory. When the
# script is invoked through a relative path, BASH_SOURCE[0] is relative to the
# caller's working directory; resolving it after the cd below would point
# slug.sh at the wrong place, every raw would be skipped, and the script would
# report "count":0. CDPATH is cleared so cd cannot echo a directory name into
# the command substitution.
pending_raw_self_dir="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" \
  || { echo '{"status":"error","reason":"cannot resolve script directory"}'; exit 1; }

# Everything below uses paths relative to the genome checkout.
cd "${base_dir}/${genome}" 2>/dev/null \
  || { echo '{"status":"error","reason":"unknown genome"}'; exit 1; }

# Clean start on the configured base (single source of truth in lib/clean-start.sh).
: "${KG_LIB_DIR:=${HOME}/knowledge-genome-orchestrator/lib}"
# shellcheck source=/dev/null
source "${KG_LIB_DIR}/clean-start.sh" 2>/dev/null \
  || { echo '{"status":"error","reason":"clean-start.sh not found"}'; exit 1; }
clean_start || { echo '{"status":"error","reason":"clean-start failed"}'; exit 1; }

# Assigned at the same point as before (after the library has been sourced),
# now from the directory captured up front.
SCRIPT_DIR="${pending_raw_self_dir}"
SLUG="${SCRIPT_DIR}/slug.sh"

# A missing helper would otherwise be indistinguishable from "nothing pending",
# because the per-file slug call is best-effort and skips on failure.
[[ -f "${SLUG}" && -x "${SLUG}" ]] \
  || { echo '{"status":"error","reason":"slug.sh not found or not executable"}'; exit 1; }
# Classification state, read by the JSON emitter at the end of the script:
#   NEW        raw files with no wiki/sources/<slug>.md page
#   MOD        raw files whose page records a different source_sha256
#   SEEN_SLUG  slug -> raw path that last produced it (collision detection)
declare -a NEW=()
declare -a MOD=()
declare -A SEEN_SLUG=()

if [[ -d raw/articles ]]; then
  while IFS= read -r -d '' f; do
    rel="${f#./}"

    # Skip marker/placeholder paths.
    case "$rel" in
      */.stfolder/* | */.stignore | */.gitkeep) continue ;;
    esac

    # Best effort: a raw whose slug cannot be computed is skipped. An empty
    # slug is skipped as well, because an empty string is not a valid
    # associative-array subscript and would abort the script.
    slug="$("$SLUG" --raw "$rel")" || continue
    [[ -n "$slug" ]] || continue

    # Residual collision (two distinct raws -> same slug): warn, do not silence.
    # The warning must never abort the run, so fall back to stderr when
    # logger is unavailable or fails.
    if [[ -n "${SEEN_SLUG[$slug]:-}" && "${SEEN_SLUG[$slug]}" != "$rel" ]]; then
      collision_msg="warn: slug collision ${slug}: ${SEEN_SLUG[$slug]} <-> ${rel}"
      logger -t pending-raw "$collision_msg" 2>/dev/null \
        || printf 'pending-raw: %s\n' "$collision_msg" >&2
    fi
    SEEN_SLUG[$slug]="$rel"

    page="wiki/sources/${slug}.md"
    if [[ ! -f "$page" ]]; then
      NEW+=("$rel")
    else
      # Hash from stdin: when given a file NAME containing a backslash or
      # newline, sha256sum prefixes the digest with '\', which would make the
      # comparison below fail forever for such files.
      cur="$(sha256sum < "$rel" | cut -d' ' -f1)"

      # First source_sha256: line of the page. sed quits after the first match
      # itself, so no downstream `head` can close the pipe early and trip
      # pipefail with SIGPIPE. All whitespace (CR, trailing blanks) is
      # dropped; a hex digest never contains any.
      rec="$(sed -n '/^source_sha256:/{s/^source_sha256:[[:space:]]*//p;q;}' "$page" \
        | tr -d '[:space:]')"

      if [[ "$cur" != "$rec" ]]; then
        MOD+=("$rel")
      fi
    fi
  # Byte-wise sort so the output order does not depend on directory order.
  done < <(find raw/articles -type f -name '*.md' -print0 2>/dev/null | LC_ALL=C sort -z)
fi
# Emit the result envelope on stdout:
#   {status, genome, count, files: [path, ...], detail: [{path, reason}, ...]}
# Every string is handed to jq as data (--arg), never spliced into JSON text
# or sent through a tab/newline-delimited text protocol. A genome name or path
# containing quotes, backslashes, tabs or newlines therefore still yields
# valid JSON with the exact original value.
if [[ ${#NEW[@]} -eq 0 && ${#MOD[@]} -eq 0 ]]; then
  # -c keeps the single-line form this branch has always printed.
  jq -cn --arg g "$genome" \
    '{status: "ok", genome: $g, count: 0, files: [], detail: []}'
else
  {
    # ${arr[@]+"${arr[@]}"}: expanding an EMPTY array is an "unbound variable"
    # error under `set -u` on bash < 4.4; this form expands to nothing instead.
    for x in ${NEW[@]+"${NEW[@]}"}; do
      jq -cn --arg p "$x" '{path: $p, reason: "new"}'
    done
    for x in ${MOD[@]+"${MOD[@]}"}; do
      jq -cn --arg p "$x" '{path: $p, reason: "modified"}'
    done
  } | jq -s --arg g "$genome" \
    '{status: "ok", genome: $g, count: length, files: [.[].path], detail: .}'
fi