# ---------------------------------------------------------------------------
# Provenance: additions to lib/lint.sh (placed after check_broken_links), from
# commit f703498fd952b0a669b11b5a37ca84695eb2733c
# Author: Matteo Cherubini    Date: Fri, 19 Jun 2026 05:44:58 +0200
# Subject: feat: Add Levenshtein distance, string similarity, and duplicate
#          slug advisory
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# levenshtein
#   Classic edit distance computed with a two-row rolling buffer, so every
#   array subscript is a single integer. (An earlier implementation used comma
#   subscripts, d[i,j]; in bash arithmetic the comma operator collapses that to
#   one dimension, so the table aliased onto itself and produced wrong
#   distances.) Uses indexed arrays only, so it runs on bash 3.2.
#
# Arguments: $1 - first string
#            $2 - second string
# Outputs:   the integer edit distance on stdout
# Returns:   0
# ---------------------------------------------------------------------------
levenshtein() {
  local s1="${1-}" s2="${2-}"
  local len1=${#s1} len2=${#s2}

  # Trivial cases: identical strings need no table, and the distance to an
  # empty string is simply the length of the other one.
  if [[ "$s1" == "$s2" ]]; then
    printf '%s\n' 0
    return 0
  fi
  if (( len1 == 0 )); then
    printf '%s\n' "$len2"
    return 0
  fi
  if (( len2 == 0 )); then
    printf '%s\n' "$len1"
    return 0
  fi

  local -a prev=() curr=()
  local i j ch cost del ins sub min

  # Row 0: turning the empty prefix of s1 into s2[0..j) costs j insertions.
  for (( j = 0; j <= len2; j++ )); do
    prev[j]=$j
  done

  for (( i = 1; i <= len1; i++ )); do
    # Column 0: turning s1[0..i) into the empty string costs i deletions.
    curr[0]=$i
    ch=${s1:i-1:1}   # loop-invariant for the inner loop
    for (( j = 1; j <= len2; j++ )); do
      cost=1
      if [[ "$ch" == "${s2:j-1:1}" ]]; then
        cost=0
      fi
      del=$(( prev[j] + 1 ))
      ins=$(( curr[j-1] + 1 ))
      sub=$(( prev[j-1] + cost ))
      min=$del
      if (( ins < min )); then min=$ins; fi
      if (( sub < min )); then min=$sub; fi
      curr[j]=$min
    done
    # curr always holds exactly len2+1 elements here, so the copy is complete.
    prev=( "${curr[@]}" )
  done

  printf '%s\n' "${prev[len2]}"
}

# ---------------------------------------------------------------------------
# similarity
#   Percentage similarity derived from the edit distance:
#     100 - (distance * 100 / length of the longer string)   (integer division)
#   100 means identical, 0 means entirely different. Two empty strings are
#   treated as identical (100), which also keeps the division guarded.
#
# Arguments: $1 - first string
#            $2 - second string
# Outputs:   the integer percentage on stdout
# Returns:   0, or the status of levenshtein if it fails
# ---------------------------------------------------------------------------
similarity() {
  local s1="${1-}" s2="${2-}"
  local maxlen=${#s1}
  if (( ${#s2} > maxlen )); then
    maxlen=${#s2}
  fi
  if (( maxlen == 0 )); then
    printf '%s\n' 100
    return 0
  fi

  local dist
  dist=$(levenshtein "$s1" "$s2") || return
  printf '%s\n' "$(( 100 - (dist * 100 / maxlen) ))"
}

# ---------------------------------------------------------------------------
# check_duplicates
#   Advisory only: warns when a page created this run has a slug suspiciously
#   close to an entity/concept already listed in wiki/index.md, so a human can
#   merge them in the PR rather than grow two near-identical pages. Never fails
#   the lint: it always returns 0, and silently does nothing when the manifest
#   is missing or jq is not installed.
#
#   Exact self-matches are skipped. Per the original design note, step 1 of
#   run-ingest.sh appends this run's new slugs to the index BEFORE the lint
#   runs, so without the skip every new slug would match itself at 100%. A page
#   that genuinely collides with a pre-existing file is reported by the
#   manifest as 'modified', not 'created', so the skip does not mask it.
#
# Globals:   KG_DUP_THRESHOLD (read) - warn when similarity is strictly greater
#              than this percentage; default 70. Must be a non-negative decimal
#              integer; anything else is reported on stderr and replaced by 70.
#            warn (called) - the file's warning helper.
# Arguments: $1 - path to the run manifest (JSON with a .pages[] array whose
#                 items carry .status and .path)
# Reads:     wiki/index.md, relative to the current directory
# Returns:   0 always
# ---------------------------------------------------------------------------
check_duplicates() {
  local manifest="${1-}"
  [[ -f "$manifest" ]] || return 0
  command -v jq >/dev/null 2>&1 || return 0

  # New leaf slugs from pages created this run. jq errors (malformed manifest,
  # missing .path) are deliberately discarded: this check is best-effort.
  local -a new_slugs=()
  local slug
  while IFS= read -r slug; do
    if [[ -n "$slug" ]]; then
      new_slugs+=("$slug")
    fi
  done < <(jq -r '.pages[]? | select(.status=="created") | .path
                  | split("/")[-1] | sub("\\.md$";"")' "$manifest" 2>/dev/null)

  # Existing entity/concept slugs already catalogued in the index. The pattern
  # is kept in a variable so =~ treats it the same on every bash version.
  local -a existing_slugs=()
  local link_re='\[\[(entities|concepts)/([a-z0-9-]+)\]\]'
  if [[ -f "wiki/index.md" ]]; then
    local line
    # '|| [[ -n $line ]]' keeps the final line when the file does not end with
    # a newline; plain 'read' would drop the last catalogued slug.
    while IFS= read -r line || [[ -n "$line" ]]; do
      if [[ "$line" =~ $link_re ]]; then
        existing_slugs+=("${BASH_REMATCH[2]}")
      fi
    done < "wiki/index.md"
  fi

  (( ${#new_slugs[@]} && ${#existing_slugs[@]} )) || return 0

  # Validate the threshold before it reaches arithmetic context: arbitrary text
  # would be evaluated as an expression, and a leading zero ("090") would be
  # parsed as octal. 10# forces base ten.
  local threshold="${KG_DUP_THRESHOLD:-70}"
  if [[ "$threshold" =~ ^[0-9]+$ ]]; then
    threshold=$(( 10#$threshold ))
  else
    printf 'check_duplicates: ignoring invalid KG_DUP_THRESHOLD=%s; using 70\n' \
      "$threshold" >&2
    threshold=70
  fi

  local new exist sim new_len exist_len maxlen gap
  for new in "${new_slugs[@]}"; do
    new_len=${#new}
    for exist in "${existing_slugs[@]}"; do
      [[ "$new" == "$exist" ]] && continue   # skip exact self-match (see header)

      # Cheap pre-filter. The edit distance is never smaller than the length
      # gap, so 100 - gap*100/maxlen is an upper bound on the similarity. When
      # even that bound cannot beat the threshold, skip the two subshells the
      # real computation would cost. Both slugs are non-empty, so maxlen >= 1.
      exist_len=${#exist}
      if (( new_len > exist_len )); then
        maxlen=$new_len
        gap=$(( new_len - exist_len ))
      else
        maxlen=$exist_len
        gap=$(( exist_len - new_len ))
      fi
      (( 100 - (gap * 100 / maxlen) > threshold )) || continue

      sim=$(similarity "$new" "$exist") || continue
      if (( sim > threshold )); then
        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
      fi
    done
  done
  return 0
}