#!/usr/bin/env bash
# =============================================================================
# lib/lint.sh
# Validation logic for Knowledge Genome files.
# =============================================================================

# Allowed values for the 'type' frontmatter field.
# Keep this list in sync with the type list in templates/agents-genome.md.
#   - 'index' and 'log' are wiki-level singleton files (wiki/index.md,
#     wiki/log.md).
#   - 'conflict' has no dedicated scaffold directory: it is a cross-cutting
#     type that can live under any wiki/ subdirectory.
VALID_TYPES=(
  "source"
  "entity"
  "concept"
  "query"
  "conflict"
  "private"
  "index"
  "log"
)
# ---------------------------------------------------------------------------
# lint_markdown_file <file> <genome_name>
# Validates the YAML frontmatter of a Markdown page:
#   1. the first line is the opening '---' delimiter;
#   2. every mandatory field (title, type, domain, maturity, last_updated)
#      starts some line of the file;
#   3. the value of the first 'domain:' line equals <genome_name> exactly
#      (surrounding whitespace and double quotes are ignored);
#   4. the 'type' value is allowed (delegated to check_valid_type).
# Every violation is reported through error().
# Returns the number of errors found (0 = clean).
# ---------------------------------------------------------------------------
lint_markdown_file() {
  local file="$1"
  local genome_name="$2"
  local errors=0

  # 1. Check the opening frontmatter delimiter
  if [[ $(head -n 1 "$file") != "---" ]]; then
    error "Missing frontmatter start (---) in: $file"
    errors=$((errors + 1))
  fi

  # 2. Check mandatory fields
  local mandatory_fields=("title:" "type:" "domain:" "maturity:" "last_updated:")
  local field # keep the loop variable out of the caller's scope
  for field in "${mandatory_fields[@]}"; do
    if ! grep -q "^${field}" "$file"; then
      error "Missing mandatory field '${field}' in: $file"
      errors=$((errors + 1))
    fi
  done

  # 3. Check domain matches genome name.
  # The value is extracted and compared as a literal string: a prefix/regex
  # match would let genome 'foo' accept 'domain: foobar' and would treat
  # regex metacharacters in the genome name as pattern syntax.
  # A missing domain line is already reported by step 2, so it is skipped here.
  if grep -q "^domain:" "$file"; then
    local domain_value
    domain_value=$(grep "^domain:" "$file" | head -n 1 \
      | sed -e 's/^domain:[[:space:]]*//' -e 's/[[:space:]]*$//' \
      | tr -d '"')
    if [[ "$domain_value" != "$genome_name" ]]; then
      error "Domain mismatch in $file (expected '${genome_name}')"
      errors=$((errors + 1))
    fi
  fi

  # 4. Validate type value against allowed list
  local type_errors
  check_valid_type "$file"
  type_errors=$?
  errors=$((errors + type_errors))

  return $errors
}
# ---------------------------------------------------------------------------
# check_valid_type <file>
# Verifies that the first 'type:' line of <file> holds a value from
# VALID_TYPES. Surrounding whitespace (including a trailing CR from CRLF
# files) and single or double quotes around the value are ignored, so
# `type: "entity"`, `type: 'entity'` and `type: entity ` are all accepted.
# Reports an invalid value through error().
# Returns 1 if invalid, 0 if valid or if the field is absent/empty
# (absence is reported by lint_markdown_file).
# ---------------------------------------------------------------------------
check_valid_type() {
  local file="$1"

  local type_value
  type_value=$(grep "^type:" "$file" | head -n 1 \
    | sed -e 's/^type:[[:space:]]*//' -e 's/[[:space:]]*$//' \
    | tr -d "\"'")

  [[ -z "$type_value" ]] && return 0 # absence is caught upstream

  local t # keep the loop variable out of the caller's scope
  for t in "${VALID_TYPES[@]}"; do
    [[ "$type_value" == "$t" ]] && return 0
  done

  error "Invalid type value '${type_value}' in: $file"
  error " Valid types: ${VALID_TYPES[*]}"
  return 1
}
# ---------------------------------------------------------------------------
# check_privacy_consistency <file>
# Cross-checks a page's location against its 'private: true' frontmatter flag:
#   - a file whose path has a 'private' directory component must carry
#     'private: true'; otherwise an error is reported (possible privacy leak);
#   - a file outside any private/ directory that carries 'private: true'
#     only triggers a warning.
# Absolute paths and relative paths (including ones that start with
# 'private/') are handled alike.
# Returns the number of errors found (0 or 1).
# ---------------------------------------------------------------------------
check_privacy_consistency() {
  local file="$1"
  local errors=0

  local marked_private=0
  if grep -q "^private: true" "$file"; then
    marked_private=1
  fi

  # Prefix a slash before matching so that a relative path beginning with
  # 'private/' (no leading slash of its own) is still recognised as private.
  if [[ "/$file" == *"/private/"* ]]; then
    if (( ! marked_private )); then
      error "Privacy leak: $file is in a private/ directory but lacks 'private: true'."
      errors=$((errors + 1))
    fi
  elif (( marked_private )); then
    warn "Metadata mismatch: $file is marked 'private: true' but is in a public directory."
  fi

  return $errors
}
# ---------------------------------------------------------------------------
# check_knowledge_decay <file>
# Flags pages whose 'last_updated' frontmatter date is older than the
# staleness threshold for their 'maturity' (thresholds per agents-genome.md):
#     maturity: stable  -> stale when last_updated is more than 180 days ago
#     maturity: draft   -> stale when last_updated is more than  90 days ago
# Any other maturity value is not decay-tracked.
#
# Reports a stale page through error().
# Returns 1 if the file is stale, 0 otherwise.
# Files with a missing or unparseable field are skipped silently (return 0).
# ---------------------------------------------------------------------------
check_knowledge_decay() {
  local file="$1"

  # First occurrence of each field, with the key and any double quotes removed.
  local stamp level
  stamp=$(grep "^last_updated:" "$file" \
    | head -1 \
    | sed 's/^last_updated:[[:space:]]*//' \
    | tr -d '"')
  level=$(grep "^maturity:" "$file" \
    | head -1 \
    | sed 's/^maturity:[[:space:]]*//' \
    | tr -d '"')

  # Nothing to evaluate when either field is absent.
  if [[ -z "$stamp" || -z "$level" ]]; then
    return 0
  fi

  # Pick the staleness limit; maturities without one are not tracked.
  local limit
  case "$level" in
    stable) limit=180 ;;
    draft) limit=90 ;;
    *) return 0 ;;
  esac

  # Convert the date to epoch seconds. Only GNU date understands --version,
  # which is used to tell it apart from BSD date (macOS).
  local epoch
  if date --version >/dev/null 2>&1; then
    epoch=$(date -d "$stamp" +%s 2>/dev/null)
  else
    epoch=$(date -j -f "%Y-%m-%d" "$stamp" +%s 2>/dev/null)
  fi

  # Unparseable date: skip silently.
  if [[ -z "$epoch" ]]; then
    return 0
  fi

  local current age_days
  current=$(date +%s)
  age_days=$(( (current - epoch) / 86400 ))

  if (( age_days <= limit )); then
    return 0
  fi

  error "STALE: $file"
  error " maturity: ${level} | last_updated: ${stamp} | ${age_days} days ago (threshold: ${limit})"
  return 1
}
# ---------------------------------------------------------------------------
# check_page_size <file>
# Enforces the page length limits defined in agents-genome.md:
#     soft cap: 400 lines -> warn
#     hard cap: 800 lines -> error
# These limits ensure pages fit within the LLM context window without
# attention degradation and keep the wiki atomically navigable.
# The line count is the number of newline characters, as reported by `wc -l`.
# Returns 1 when the hard cap is exceeded, 0 otherwise (including when the
# file cannot be counted).
# ---------------------------------------------------------------------------
check_page_size() {
  local file="$1"
  local lines
  lines=$(wc -l < "$file")

  # BSD wc (macOS) pads the count with leading blanks; strip all whitespace so
  # the number is printed cleanly inside the messages below.
  lines=${lines//[[:space:]]/}

  # No usable count (e.g. the file could not be read): nothing to enforce.
  [[ "$lines" =~ ^[0-9]+$ ]] || return 0

  if (( lines > 800 )); then
    error "Page too long (${lines} lines, hard cap 800): $file"
    error " Split this page into focused sub-pages and link them."
    return 1
  elif (( lines > 400 )); then
    warn "Page approaching limit (${lines} lines, soft cap 400): $file"
  fi

  return 0
}
# ---------------------------------------------------------------------------
# check_broken_links <file>
# Basic check for internal [[wikilinks]] that cannot be resolved locally.
# For every [[Target]], [[Target|Alias]] or [[Target#Heading]] in <file>, the
# target (with '.md' appended when missing) is looked up in the file's own
# directory and in its parent directory.
# Only emits warnings -- cross-genome links may legitimately not resolve here.
# Always returns 0.
# ---------------------------------------------------------------------------
check_broken_links() {
  local file="$1"
  local base_dir
  base_dir=$(dirname "$file")

  # Extract link targets, stopping at the closing bracket or the alias
  # separator: [[Link|Alias]] -> Link.
  # NOTE: inside a POSIX bracket expression a backslash is a literal
  # character, so ']' must be listed first ("[^]|]") rather than written as
  # "\]"; the escaped form silently matches almost nothing.
  local links
  links=$(grep -oE '\[\[[^]|]+' "$file" 2>/dev/null | sed 's/^\[\[//')

  local link target
  while IFS= read -r link; do
    [[ -z "$link" ]] && continue

    # Cross-genome links (../other-genome/...) are not resolvable from a
    # single genome checkout; they would always fall through the two-level
    # lookup and produce non-actionable warnings, so they are skipped.
    [[ "$link" == ../* ]] && continue

    # Drop a heading anchor ([[page#section]] -> page). A bare same-page
    # anchor ([[#section]]) has no file to resolve.
    target="${link%%#*}"
    [[ -z "$target" ]] && continue

    [[ "$target" != *.md ]] && target="${target}.md"

    if [[ ! -f "${base_dir}/${target}" && ! -f "${base_dir}/../${target}" ]]; then
      warn "Potential broken link: [[$link]] in $file"
    fi
  done <<< "$links"

  return 0
}
# ---------------------------------------------------------------------------
# levenshtein <s1> <s2>
# Echoes the edit distance (insertions, deletions, substitutions) between the
# two strings as an integer.
#
# Only two rows of the dynamic-programming table are kept, each a plain
# indexed array with a single integer subscript. Comma subscripts such as
# d[i,j] must not be used: in bash arithmetic the comma operator collapses
# them to one dimension, which aliases cells and yields wrong distances.
# No associative arrays are needed, so this runs on bash 3.2.
# ---------------------------------------------------------------------------
levenshtein() {
  local a="$1" b="$2"
  local rows=${#a} cols=${#b}

  # Against an empty string the distance is the other string's length.
  if (( rows == 0 )); then
    echo "$cols"
    return
  fi
  if (( cols == 0 )); then
    echo "$rows"
    return
  fi

  local -a above=() here=()
  local r c best candidate

  # Row 0: building the first c characters of b from "" costs c insertions.
  c=0
  while (( c <= cols )); do
    above[c]=$c
    c=$(( c + 1 ))
  done

  r=1
  while (( r <= rows )); do
    # Column 0: deleting the first r characters of a.
    here=( "$r" )
    c=1
    while (( c <= cols )); do
      # Substitution, free when the two characters already match.
      if [[ "${a:r-1:1}" == "${b:c-1:1}" ]]; then
        best=${above[c-1]}
      else
        best=$(( above[c-1] + 1 ))
      fi
      # Deletion.
      candidate=$(( above[c] + 1 ))
      if (( candidate < best )); then
        best=$candidate
      fi
      # Insertion.
      candidate=$(( here[c-1] + 1 ))
      if (( candidate < best )); then
        best=$candidate
      fi
      here[c]=$best
      c=$(( c + 1 ))
    done
    # The finished row becomes the reference row for the next character.
    above=( "${here[@]}" )
    r=$(( r + 1 ))
  done

  echo "${above[cols]}"
}
# ---------------------------------------------------------------------------
# similarity <s1> <s2>
# Echoes an integer percentage derived from the edit distance, relative to
# the longer of the two strings: 100 = identical, 0 = entirely different.
# The division is integer division, so the subtracted share is truncated.
# Two empty strings count as identical (100), which also keeps the divisor
# from ever being zero.
# ---------------------------------------------------------------------------
similarity() {
  local left="$1" right="$2"

  # Length of the longer input.
  local span=${#right}
  if (( ${#left} > span )); then
    span=${#left}
  fi

  if (( span == 0 )); then
    echo "100"
    return
  fi

  local edits
  edits=$(levenshtein "$left" "$right")
  echo $(( 100 - edits * 100 / span ))
}
# ---------------------------------------------------------------------------
# check_duplicates <manifest>
# Advisory only: warns when a page created this run has a slug suspiciously
# close to an entity/concept already listed in wiki/index.md (resolved
# relative to the current directory), so a human can merge them in the PR
# rather than grow two near-identical pages. Never fails the lint: it always
# returns 0, and silently does nothing when the manifest is missing or jq is
# not installed.
#
# A pair is reported when similarity() exceeds the threshold, tunable via
# KG_DUP_THRESHOLD (default 70). The value must be a non-negative decimal
# integer; anything else is ignored with a warning and the default is used,
# so stray text is never handed to arithmetic evaluation.
#
# Exact self-matches are skipped: step 1 of run-ingest.sh appends this run's
# new slugs to the index BEFORE the lint runs, so without the skip every new
# slug would match itself at 100%. A page that genuinely collides with a
# pre-existing file is reported by the manifest as 'modified', not 'created',
# so skipping created==existing pairs can never mask a real collision.
# ---------------------------------------------------------------------------
check_duplicates() {
  local manifest="$1"
  [[ -f "$manifest" ]] || return 0
  command -v jq >/dev/null 2>&1 || return 0

  # New leaf slugs (file name without directory or .md) of pages created
  # this run.
  local -a new_slugs=()
  local slug
  while IFS= read -r slug; do
    [[ -n "$slug" ]] && new_slugs+=("$slug")
  done < <(jq -r '.pages[]? | select(.status=="created") | .path
                  | split("/")[-1] | sub("\\.md$";"")' "$manifest" 2>/dev/null)

  # Existing entity/concept slugs already catalogued in the index
  # (first [[entities/...]] or [[concepts/...]] link on each line).
  local -a existing_slugs=()
  local link_re='\[\[(entities|concepts)/([a-z0-9-]+)\]\]'
  if [[ -f "wiki/index.md" ]]; then
    local line
    while IFS= read -r line; do
      if [[ $line =~ $link_re ]]; then
        existing_slugs+=("${BASH_REMATCH[2]}")
      fi
    done < "wiki/index.md"
  fi

  (( ${#new_slugs[@]} && ${#existing_slugs[@]} )) || return 0

  # Validate the threshold before it reaches (( )): a non-numeric value would
  # be evaluated as a variable name (usually 0, flagging every pair), and a
  # leading zero would be read as octal.
  local threshold="${KG_DUP_THRESHOLD:-70}"
  if [[ "$threshold" =~ ^[0-9]+$ ]]; then
    threshold=$(( 10#$threshold ))
  else
    warn "Ignoring invalid KG_DUP_THRESHOLD '${threshold}' (expected a non-negative integer); using 70"
    threshold=70
  fi

  local new exist sim
  for new in "${new_slugs[@]}"; do
    for exist in "${existing_slugs[@]}"; do
      [[ "$new" == "$exist" ]] && continue # skip exact self-match (see header)
      sim=$(similarity "$new" "$exist")
      if (( sim > threshold )); then
        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
      fi
    done
  done
  return 0
}