feat: Add Levenshtein distance, string similarity, and duplicate slug advisory
This commit is contained in:
parent
b808f0fc8f
commit
f703498fd9
1 changed files with 108 additions and 0 deletions
108
lib/lint.sh
108
lib/lint.sh
|
|
@ -208,3 +208,111 @@ check_broken_links() {
|
||||||
fi
|
fi
|
||||||
done <<< "$links"
|
done <<< "$links"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# levenshtein <s1> <s2>
#   Classic edit distance (insert / delete / substitute, each costing 1),
#   computed with a two-row rolling buffer so every array subscript is a
#   single integer. The previous implementation used comma subscripts
#   (d[i,j]); in bash arithmetic the comma operator collapses to one
#   dimension, so the table aliased onto itself and returned wrong distances —
#   it could not even score two identical strings as 0. This form is portable
#   to bash 3.2 (no associative arrays).
#
# Arguments: $1, $2 - the strings to compare (either may be empty)
# Outputs:   the integer distance on stdout
# Returns:   0
# ---------------------------------------------------------------------------
levenshtein() {
  local s1="$1" s2="$2"
  local len1=${#s1} len2=${#s2}

  # Fast paths: identical strings need no table at all, and when one side is
  # empty the distance is simply the length of the other side.
  if [[ "$s1" == "$s2" ]]; then
    printf '0\n'
    return 0
  fi
  if (( len1 == 0 )); then
    printf '%d\n' "$len2"
    return 0
  fi
  if (( len2 == 0 )); then
    printf '%d\n' "$len1"
    return 0
  fi

  local -a prev=() curr=()
  local i j cost del ins sub min ch1

  # Row 0: turning the empty prefix of s1 into s2[0..j) takes j insertions.
  for (( j = 0; j <= len2; j++ )); do
    prev[j]=$j
  done

  for (( i = 1; i <= len1; i++ )); do
    # Column 0: turning s1[0..i) into the empty string takes i deletions.
    curr[0]=$i
    # The s1 character is invariant across the inner loop; extract it once.
    ch1=${s1:i-1:1}
    for (( j = 1; j <= len2; j++ )); do
      # Quoted right-hand side forces a literal (non-glob) comparison.
      if [[ "$ch1" == "${s2:j-1:1}" ]]; then
        cost=0
      else
        cost=1
      fi
      del=$(( prev[j] + 1 ))
      ins=$(( curr[j-1] + 1 ))
      sub=$(( prev[j-1] + cost ))
      min=$del
      if (( ins < min )); then min=$ins; fi
      if (( sub < min )); then min=$sub; fi
      curr[j]=$min
    done
    # Roll the buffer: the row just computed becomes the "previous" row.
    prev=( "${curr[@]}" )
  done

  printf '%d\n' "${prev[len2]}"
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# similarity <s1> <s2>
#   Percentage similarity derived from the edit distance:
#       100 - (distance * 100 / max(len(s1), len(s2)))
#   100 = identical, 0 = entirely different. The division is integer division,
#   so a fractional percentage is rounded UP in favour of similarity
#   (distance 1 over length 3 yields 67, not 66).
#   Two empty strings are treated as identical (100), so the divide is always
#   guarded against a zero denominator.
#
# Arguments: $1, $2 - the strings to compare
# Outputs:   the integer percentage on stdout
# Returns:   0 on success; 1 (with no output) if levenshtein fails or does not
#            produce a non-negative integer
# ---------------------------------------------------------------------------
similarity() {
  local s1="$1" s2="$2"

  # Denominator is the longer of the two lengths.
  local maxlen=${#s1}
  if (( ${#s2} > maxlen )); then
    maxlen=${#s2}
  fi
  if (( maxlen == 0 )); then
    printf '100\n'
    return 0
  fi

  # Declaration is separate from assignment so the helper's exit status is
  # not masked by 'local'.
  local dist
  dist=$(levenshtein "$s1" "$s2") || return 1
  # An empty or non-numeric distance would turn the arithmetic below into a
  # syntax error; fail cleanly instead.
  [[ "$dist" =~ ^[0-9]+$ ]] || return 1

  printf '%d\n' "$(( 100 - (dist * 100 / maxlen) ))"
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# check_duplicates <manifest> [index]
#   Advisory only: warns when a page created this run has a slug suspiciously
#   close to an entity/concept already listed in the index, so a human can
#   merge them in the PR rather than grow two near-identical pages. Never
#   fails the lint (always returns 0), exactly like check_broken_links.
#
#   The threshold is tunable via KG_DUP_THRESHOLD (default 70); a pair is
#   reported when its similarity is strictly greater than the threshold. A
#   value that is not a non-negative integer is rejected with a warning and
#   the default is used. Exact self-matches are skipped: step 1 of
#   run-ingest.sh appends this run's new slugs to the index BEFORE the lint
#   runs, so without the skip every new slug would match itself at 100%. A
#   page that genuinely collides with a pre-existing file is reported by the
#   manifest as 'modified', not 'created', so skipping created==existing pairs
#   can never mask a real collision.
#
# Globals:   KG_DUP_THRESHOLD (read)
# Arguments: $1 - path to the JSON run manifest (.pages[] with .status/.path)
#            $2 - optional path to the index file (default: wiki/index.md,
#                 resolved against the current working directory)
# Outputs:   one warn() message per suspicious pair
# Returns:   0 always; silently does nothing when the manifest is missing or
#            jq is not installed
# ---------------------------------------------------------------------------
check_duplicates() {
  local manifest="$1"
  local index="${2:-wiki/index.md}"
  [[ -f "$manifest" ]] || return 0
  command -v jq >/dev/null 2>&1 || return 0

  # New leaf slugs (basename without .md) from pages created this run.
  # Entries that are not objects, or whose .path is not a string, are filtered
  # out rather than allowed to raise a jq error: jq stops at the first error,
  # which would silently drop every later entry. Errors from a wholly
  # unparseable manifest stay suppressed on purpose — this check is advisory.
  local -a new_slugs=()
  local slug
  while IFS= read -r slug; do
    if [[ -n "$slug" ]]; then
      new_slugs+=("$slug")
    fi
  done < <(jq -r '
    .pages[]?
    | select(type == "object")
    | select(.status == "created")
    | .path
    | select(type == "string")
    | split("/")[-1]
    | sub("\\.md$"; "")
  ' "$manifest" 2>/dev/null)

  # Existing entity/concept slugs already catalogued in the index.
  local -a existing_slugs=()
  if [[ -f "$index" ]]; then
    # Keeping the regex in a variable behaves the same across bash versions.
    local link_re='\[\[(entities|concepts)/([a-z0-9-]+)\]\]'
    local line rest
    local seen=' '   # space-delimited set of slugs already collected
    # '|| [[ -n $line ]]' keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
      rest=$line
      # A line may hold several links: consume them left to right.
      while [[ $rest =~ $link_re ]]; do
        slug=${BASH_REMATCH[2]}
        # Slugs cannot contain spaces, so a padded substring test is an exact
        # membership check; it stops a repeated slug from warning twice.
        if [[ "$seen" != *" ${slug} "* ]]; then
          existing_slugs+=("$slug")
          seen+="${slug} "
        fi
        # Drop everything up to and including the link just matched.
        rest=${rest#*"${BASH_REMATCH[0]}"}
      done
    done < "$index"
  fi

  # Nothing to compare unless both lists are non-empty.
  if (( ${#new_slugs[@]} == 0 || ${#existing_slugs[@]} == 0 )); then
    return 0
  fi

  # Validate the threshold before it reaches arithmetic context, where an
  # arbitrary string would be evaluated as an expression.
  local threshold="${KG_DUP_THRESHOLD:-70}"
  if [[ ! "$threshold" =~ ^[0-9]+$ ]]; then
    warn "KG_DUP_THRESHOLD='${threshold}' is not a non-negative integer — using 70"
    threshold=70
  fi
  # Force base 10 so a leading zero (e.g. 085) is not parsed as octal.
  threshold=$(( 10#$threshold ))

  local new exist sim
  for new in "${new_slugs[@]}"; do
    for exist in "${existing_slugs[@]}"; do
      # Skip exact self-match (see header).
      [[ "$new" == "$exist" ]] && continue
      # A failed or non-numeric score is skipped, never treated as a match.
      sim=$(similarity "$new" "$exist") || continue
      [[ "$sim" =~ ^-?[0-9]+$ ]] || continue
      if (( sim > threshold )); then
        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
      fi
    done
  done
  return 0
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue