feat: Add Levenshtein distance, string similarity, and duplicate slug advisory

2026-06-19 05:44:58 +02:00 · 2026-06-19 05:44:58 +02:00 · f703498fd9
commit f703498fd9
parent b808f0fc8f
1 changed files with 108 additions and 0 deletions
--- a/lib/lint.sh
+++ b/lib/lint.sh
@ -208,3 +208,111 @@ check_broken_links() {
    fi
  done <<< "$links"
 }
 # ---------------------------------------------------------------------------
 # levenshtein <s1> <s2>
 # Prints the edit distance between the two arguments: the minimum number of
 # single-character insertions, deletions and substitutions (each costing 1)
 # that turn <s1> into <s2>. Output is a bare integer on stdout.
 #
 # The DP table is kept as ONE row plus a scalar carrying the upper-left
 # neighbour, so every array subscript is a plain integer. Do not reintroduce
 # d[i,j]-style subscripts: in bash arithmetic the comma is an operator, the
 # subscript collapses to a single dimension, and the table aliases onto itself
 # (the earlier comma-subscript version could not even score two identical
 # strings as 0). Indexed arrays only, so this stays usable on bash 3.2.
 # ---------------------------------------------------------------------------
 levenshtein() {
  local a="$1" b="$2"
  local m=${#a} n=${#b}
  # An empty side means the distance is simply the other side's length.
  if (( m == 0 )); then
    echo "$n"
    return
  fi
  if (( n == 0 )); then
    echo "$m"
    return
  fi
  local -a row=()
  local r c ch diag above best cand
  # Row 0: turning the empty prefix of <s1> into b[0..c) takes c insertions.
  c=0
  while (( c <= n )); do
    row[c]=$c
    c=$(( c + 1 ))
  done
  r=1
  while (( r <= m )); do
    ch=${a:r-1:1}
    # diag holds the previous row's value one column to the left.
    diag=${row[0]}
    row[0]=$r
    c=1
    while (( c <= n )); do
      # row[c] still holds the previous row's value until overwritten below.
      above=${row[c]}
      if [[ "$ch" == "${b:c-1:1}" ]]; then
        cand=$diag
      else
        cand=$(( diag + 1 ))
      fi
      best=$(( above + 1 ))
      if (( row[c-1] + 1 < best )); then
        best=$(( row[c-1] + 1 ))
      fi
      if (( cand < best )); then
        best=$cand
      fi
      row[c]=$best
      diag=$above
      c=$(( c + 1 ))
    done
    r=$(( r + 1 ))
  done
  echo "${row[n]}"
 }
 # ---------------------------------------------------------------------------
 # similarity <s1> <s2>
 # Prints an integer percentage derived from the edit distance, scaled by the
 # longer of the two strings: 100 means identical, 0 means nothing in common.
 # The division truncates, so the score is 100 minus the floored percentage of
 # characters that had to change. A pair of empty strings short-circuits to
 # 100, which also keeps the divisor from ever being zero.
 # ---------------------------------------------------------------------------
 similarity() {
  local left="$1" right="$2"
  local longest
  if (( ${#right} > ${#left} )); then
    longest=${#right}
  else
    longest=${#left}
  fi
  if (( longest == 0 )); then
    echo "100"
    return
  fi
  local edits
  edits=$(levenshtein "$left" "$right")
  echo $(( 100 - edits * 100 / longest ))
 }
 # ---------------------------------------------------------------------------
 # check_duplicates <manifest>
 # Advisory only: warns when a page created this run has a slug suspiciously
 # close to an entity/concept already listed in wiki/index.md, so a human can
 # merge them in the PR rather than grow two near-identical pages. Never fails
 # the lint (always returns 0), exactly like check_broken_links.
 #
 # Arguments: $1 - path to the run manifest (JSON; reads .pages[].status and
 #                 .pages[].path).
 # Reads:     wiki/index.md relative to the current directory, and the
 #            KG_DUP_THRESHOLD environment variable.
 # Outputs:   one warn per (new, existing) pair whose similarity is strictly
 #            greater than the threshold.
 # Does nothing when the manifest is missing, jq is not installed, or either
 # slug list ends up empty.
 #
 # The threshold is tunable via KG_DUP_THRESHOLD (default 70). It must be a
 # plain non-negative decimal integer; anything else is reported once and the
 # default is used. Without that check the raw value would be evaluated by
 # (( )): a word like "abc" silently becomes 0 (every pair warns), "70%" is a
 # syntax error on every pair, and "090" is rejected as invalid octal.
 #
 # Exact self-matches are skipped: step 1 of run-ingest.sh appends this run's
 # new slugs to the index BEFORE the lint runs, so without the skip every new
 # slug would match itself at 100%. A page that genuinely collides with a
 # pre-existing file is reported by the manifest as 'modified', not 'created',
 # so skipping created==existing pairs can never mask a real collision.
 # ---------------------------------------------------------------------------
 check_duplicates() {
  local manifest="$1"
  [[ -f "$manifest" ]] || return 0
  command -v jq >/dev/null 2>&1 || return 0

  # New leaf slugs from pages created this run (basename without ".md").
  # jq errors are deliberately discarded: a malformed manifest just means
  # there is nothing to advise on.
  local -a new_slugs=()
  local slug
  while IFS= read -r slug; do
    if [[ -n "$slug" ]]; then
      new_slugs+=("$slug")
    fi
  done < <(jq -r '.pages[]? | select(.status=="created") | .path
                  | split("/")[-1] | sub("\\.md$";"")' "$manifest" 2>/dev/null)

  # Existing entity/concept slugs already catalogued in the index. The regex
  # is kept in a variable so it is parsed the same way on every bash version;
  # only the first [[entities/...]] / [[concepts/...]] link on a line is read.
  local -a existing_slugs=()
  local link_re='\[\[(entities|concepts)/([a-z0-9-]+)\]\]'
  if [[ -f "wiki/index.md" ]]; then
    local line
    while IFS= read -r line; do
      if [[ $line =~ $link_re ]]; then
        existing_slugs+=("${BASH_REMATCH[2]}")
      fi
    done < "wiki/index.md"
  fi

  (( ${#new_slugs[@]} && ${#existing_slugs[@]} )) || return 0

  # Validate the tunable before it reaches arithmetic context.
  local default_threshold=70
  local threshold="${KG_DUP_THRESHOLD:-$default_threshold}"
  if [[ "$threshold" =~ ^[0-9]+$ ]]; then
    # Force base 10 so a leading zero ("090") is not read as octal.
    threshold=$(( 10#$threshold ))
  else
    warn "Ignoring invalid KG_DUP_THRESHOLD '${threshold}'; using default ${default_threshold}"
    threshold=$default_threshold
  fi

  local new exist sim
  for new in "${new_slugs[@]}"; do
    for exist in "${existing_slugs[@]}"; do
      [[ "$new" == "$exist" ]] && continue   # skip exact self-match (see header)
      sim=$(similarity "$new" "$exist")
      if (( sim > threshold )); then
        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
      fi
    done
  done
  return 0
 }