#!/usr/bin/env bash
# =============================================================================
# lib/lint.sh
# Validation logic for Knowledge Genome files.
# =============================================================================

# Valid values for the 'type' frontmatter field.
# Must stay in sync with the type list in templates/agents-genome.md.
# Note: 'index' and 'log' are wiki-level singleton files (wiki/index.md, wiki/log.md).
# 'conflict' has no dedicated scaffold directory — it is a cross-cutting type
# that can live under any wiki/ subdirectory.
VALID_TYPES=("source" "entity" "concept" "query" "conflict" "private" "index" "log")

# ---------------------------------------------------------------------------
# lint_markdown_file
# Validates YAML frontmatter: opening delimiter, mandatory fields, domain,
# and type value.
#
# Arguments:
#   $1 - path of the markdown file to lint
#   $2 - genome name the 'domain' field must equal
# Outputs:
#   One error (via the 'error' helper) per problem found.
# Returns:
#   The number of errors found (0 = clean).
# ---------------------------------------------------------------------------
lint_markdown_file() {
  local file="$1"
  local genome_name="$2"
  local errors=0

  # 1. The very first line must open the frontmatter block.
  if [[ "$(head -n 1 "$file")" != "---" ]]; then
    error "Missing frontmatter start (---) in: $file"
    errors=$((errors + 1))
  fi

  # 2. Every mandatory key must appear at the start of some line.
  local field
  local mandatory_fields=("title:" "type:" "domain:" "maturity:" "last_updated:")
  for field in "${mandatory_fields[@]}"; do
    if ! grep -q "^${field}" "$file"; then
      error "Missing mandatory field '${field}' in: $file"
      errors=$((errors + 1))
    fi
  done

  # 3. The domain must equal the genome name exactly. The value is extracted
  #    and compared as a plain string: a regex prefix match would accept
  #    'foo-bar' for genome 'foo' and would interpret dots in the name.
  #    An inline comment, trailing whitespace/CR and one pair of surrounding
  #    quotes are removed before comparing. An absent field is already
  #    reported by step 2 and is not reported again here.
  if grep -q "^domain:" "$file"; then
    local domain_value
    domain_value=$(
      grep "^domain:" "$file" \
        | head -n 1 \
        | sed -e 's/^domain:[[:space:]]*//' \
              -e 's/[[:space:]]#.*$//' \
              -e 's/[[:space:]]*$//' \
              -e "s/^[\"']//" \
              -e "s/[\"']\$//"
    )
    if [[ "$domain_value" != "$genome_name" ]]; then
      error "Domain mismatch in $file (expected '${genome_name}')"
      errors=$((errors + 1))
    fi
  fi

  # 4. Validate type value against allowed list (adds 0 or 1).
  local type_errors=0
  check_valid_type "$file" || type_errors=$?
  errors=$((errors + type_errors))

  return "$errors"
}

# ---------------------------------------------------------------------------
# check_valid_type
# Verifies that the 'type' field contains a value from VALID_TYPES.
# Returns 1 if invalid, 0 if valid or field absent (absence caught by lint_markdown_file).
# The value is normalised before comparison: an inline comment, trailing
# whitespace/CR and one pair of surrounding single or double quotes are removed.
#
# Arguments:
#   $1 - path of the markdown file to inspect
# Globals:
#   VALID_TYPES (read)
# ---------------------------------------------------------------------------
check_valid_type() {
  local file="$1"
  local type_value candidate

  # Only the first 'type:' line is considered.
  type_value=$(
    grep "^type:" "$file" \
      | head -n 1 \
      | sed -e 's/^type:[[:space:]]*//' \
            -e 's/[[:space:]]#.*$//' \
            -e 's/[[:space:]]*$//' \
            -e "s/^[\"']//" \
            -e "s/[\"']\$//"
  )

  [[ -z "$type_value" ]] && return 0 # absence is caught upstream

  for candidate in "${VALID_TYPES[@]}"; do
    [[ "$type_value" == "$candidate" ]] && return 0
  done

  error "Invalid type value '${type_value}' in: $file"
  error " Valid types: ${VALID_TYPES[*]}"
  return 1
}

# ---------------------------------------------------------------------------
# check_privacy_consistency
# Ensures files in private/ directories carry 'private: true'.
# Warns if a public file is incorrectly marked private.
#
# Arguments:
#   $1 - path of the markdown file (absolute or relative)
# Returns:
#   1 if a file under a private/ directory lacks the marker, 0 otherwise
#   (the public-but-marked-private case only warns).
# ---------------------------------------------------------------------------
check_privacy_consistency() {
  local file="$1"
  local errors=0
  local marked_private=0

  # Accept any amount of whitespace after the colon, optional quotes, an
  # optional trailing comment and a trailing CR.
  if grep -Eq "^private:[[:space:]]+[\"']?true[\"']?[[:space:]]*(#.*)?\$" "$file"; then
    marked_private=1
  fi

  # A relative path such as 'private/notes.md' has no leading slash before
  # the directory name, so it needs its own pattern.
  if [[ "$file" == private/* || "$file" == */private/* ]]; then
    if (( marked_private == 0 )); then
      error "Privacy leak: $file is in a private/ directory but lacks 'private: true'."
      errors=$((errors + 1))
    fi
  elif (( marked_private == 1 )); then
    warn "Metadata mismatch: $file is marked 'private: true' but is in a public directory."
  fi

  return "$errors"
}

# ---------------------------------------------------------------------------
# check_knowledge_decay
# Reads 'maturity' and 'last_updated' from frontmatter and compares against
# the staleness thresholds defined in agents-genome.md:
#   maturity: stable → flag if last_updated > 180 days ago
#   maturity: draft  → flag if last_updated > 90 days ago
#
# Returns 1 if the file is stale, 0 otherwise.
# Silently skips files with missing or unparseable date fields.
# Values are normalised first (trailing whitespace/CR and one pair of
# surrounding single or double quotes removed), so quoted YAML scalars are
# tracked rather than skipped.
#
# Arguments:
#   $1 - path of the markdown file to inspect
# ---------------------------------------------------------------------------
check_knowledge_decay() {
  local file="$1"
  local last_updated maturity

  # Shared sed clean-up applied to both extracted values.
  local -a normalize=(
    -e 's/[[:space:]]*$//'
    -e "s/^[\"']//"
    -e "s/[\"']\$//"
  )

  last_updated=$(
    grep "^last_updated:" "$file" \
      | head -n 1 \
      | sed -e 's/^last_updated:[[:space:]]*//' "${normalize[@]}"
  )
  maturity=$(
    grep "^maturity:" "$file" \
      | head -n 1 \
      | sed -e 's/^maturity:[[:space:]]*//' "${normalize[@]}"
  )

  # Skip if either field is absent or maturity is not decay-trackable.
  [[ -z "$last_updated" || -z "$maturity" ]] && return 0

  local threshold
  case "$maturity" in
    stable) threshold=180 ;;
    draft) threshold=90 ;;
    *) return 0 ;;
  esac

  # Parse date — handle both GNU date (Linux) and BSD date (macOS).
  local updated_ts
  if date --version >/dev/null 2>&1; then
    # GNU date
    updated_ts=$(date -d "$last_updated" +%s 2>/dev/null)
  else
    # BSD date (macOS)
    updated_ts=$(date -j -f "%Y-%m-%d" "$last_updated" +%s 2>/dev/null)
  fi

  # Unparseable date — skip silently. Anything that is not a plain integer
  # is rejected before it reaches the arithmetic below.
  [[ "$updated_ts" =~ ^-?[0-9]+$ ]] || return 0

  local now days_old
  now=$(date +%s)
  days_old=$(( (now - updated_ts) / 86400 ))

  if (( days_old > threshold )); then
    error "STALE: $file"
    error " maturity: ${maturity} | last_updated: ${last_updated} | ${days_old} days ago (threshold: ${threshold})"
    return 1
  fi

  return 0
}

# ---------------------------------------------------------------------------
# check_page_size
# Enforces the page length limits defined in agents-genome.md:
#   soft cap: 400 lines → warn
#   hard cap: 800 lines → error
# These limits ensure pages fit within the LLM context window without
# attention degradation and keep the wiki atomically navigable.
#
# Arguments:
#   $1 - path of the markdown file to measure
# Returns:
#   1 when the hard cap is exceeded, 0 otherwise.
# ---------------------------------------------------------------------------
check_page_size() {
  local file="$1"
  local lines

  lines=$(wc -l < "$file")
  # BSD wc pads its output with spaces; re-evaluating as an integer strips
  # the padding so messages read cleanly (an empty result becomes 0).
  lines=$(( lines ))

  if (( lines > 800 )); then
    error "Page too long (${lines} lines, hard cap 800): $file"
    error " Split this page into focused sub-pages and link them."
    return 1
  fi

  if (( lines > 400 )); then
    warn "Page approaching limit (${lines} lines, soft cap 400): $file"
  fi

  return 0
}

# ---------------------------------------------------------------------------
# check_broken_links
# Basic check for internal [[wikilinks]] that cannot be resolved locally.
# Only emits warnings — cross-genome links may legitimately not resolve here.
#
# A target resolves if '<target>.md' exists next to the file or one directory
# above it. Aliases ([[Link|Alias]]) and heading anchors ([[Link#Section]])
# are stripped before the lookup.
#
# Arguments:
#   $1 - path of the markdown file to scan
# Returns:
#   Always 0.
# ---------------------------------------------------------------------------
check_broken_links() {
  local file="$1"
  local base_dir
  base_dir=$(dirname "$file")

  # Extract link targets. Inside a POSIX bracket expression a backslash is a
  # literal character, so ']' must come first in the set to be matched
  # literally; '[^\]]+' would only ever match one-character targets.
  # Stopping at '|' and '#' drops aliases and anchors in the same pass.
  local links
  links=$(grep -oE '\[\[[^]|#]+' "$file" 2>/dev/null | sed 's/^\[\[//')

  # Cross-genome links (../other-genome/…) are not resolvable from a single
  # genome checkout and are skipped — they would always fall
  # through the two-level lookup and produce non-actionable warnings.
  local link target
  while IFS= read -r link; do
    [[ -z "$link" ]] && continue
    [[ "$link" == ../* ]] && continue

    target="$link"
    [[ "$target" != *.md ]] && target="${target}.md"

    if [[ ! -f "${base_dir}/${target}" && ! -f "${base_dir}/../${target}" ]]; then
      warn "Potential broken link: [[$link]] in $file"
    fi
  done <<< "$links"

  return 0
}

# ---------------------------------------------------------------------------
# levenshtein
# Classic edit distance via a two-row rolling buffer, so every array subscript
# is a single integer. The previous implementation used comma subscripts
# (d[i,j]); in bash arithmetic the comma operator collapses to one dimension,
# so the table aliased onto itself and returned wrong distances — it could not
# even score two identical strings as 0. This form is portable to bash 3.2
# (no associative arrays). Echoes the integer distance.
#
# Arguments:
#   $1, $2 - the two strings to compare
# Outputs:
#   The edit distance (insertions, deletions, substitutions) on stdout.
# ---------------------------------------------------------------------------
levenshtein() {
  local s1="$1" s2="$2"
  local len1=${#s1} len2=${#s2}

  # Fast paths: identical strings, or one side empty (distance is then the
  # length of the other side).
  [[ "$s1" == "$s2" ]] && { echo 0; return; }
  (( len1 == 0 )) && { echo "$len2"; return; }
  (( len2 == 0 )) && { echo "$len1"; return; }

  # prev = distances for the previous row of the DP table, curr = the row
  # being filled in.
  local -a prev=() curr=()
  local i j cost del ins sub min

  # Row 0: turning the empty prefix of s1 into s2[0..j] costs j insertions.
  for (( j = 0; j <= len2; j++ )); do
    prev[j]=$j
  done

  for (( i = 1; i <= len1; i++ )); do
    curr[0]=$i
    for (( j = 1; j <= len2; j++ )); do
      cost=1
      [[ "${s1:i-1:1}" == "${s2:j-1:1}" ]] && cost=0

      del=$(( prev[j] + 1 ))
      ins=$(( curr[j-1] + 1 ))
      sub=$(( prev[j-1] + cost ))

      min=$del
      (( ins < min )) && min=$ins
      (( sub < min )) && min=$sub
      curr[j]=$min
    done
    prev=( "${curr[@]}" )
  done

  echo "${prev[len2]}"
}

# ---------------------------------------------------------------------------
# similarity
# Percentage similarity from the edit distance: 100 = identical, 0 = entirely
# different. Two empty strings are treated as identical (100), so the divide
# is always guarded.
#
# Arguments:
#   $1, $2 - the two strings to compare
# Outputs:
#   An integer 0..100 on stdout, computed as 100 - (distance * 100 / maxlen)
#   with integer division.
# ---------------------------------------------------------------------------
similarity() {
  local s1="$1" s2="$2"
  local maxlen=${#s1}

  (( ${#s2} > maxlen )) && maxlen=${#s2}
  (( maxlen == 0 )) && { echo "100"; return; }

  local dist
  dist=$(levenshtein "$s1" "$s2")
  echo $(( 100 - (dist * 100 / maxlen) ))
}

# ---------------------------------------------------------------------------
# check_duplicates
# Advisory only: warns when a page created this run has a slug suspiciously
# close to an entity/concept already listed in wiki/index.md, so a human can
# merge them in the PR rather than grow two near-identical pages. Never fails
# the lint (always returns 0), exactly like check_broken_links.
#
# The threshold is tunable via KG_DUP_THRESHOLD (default 70). Exact self-matches
# are skipped: step 1 of run-ingest.sh appends this run's new slugs to the index
# BEFORE the lint runs, so without the skip every new slug would match itself at
# 100%.
# A page that genuinely collides with a pre-existing file is reported by
# the manifest as 'modified', not 'created', so skipping created==existing pairs
# can never mask a real collision.
#
# Arguments:
#   $1 - path of the run manifest (JSON with a 'pages' array of
#        {path, status} objects)
#   $2 - optional path of the index file (default: wiki/index.md)
# Environment:
#   KG_DUP_THRESHOLD - similarity percentage above which a pair is reported;
#                      anything that is not a plain non-negative integer
#                      falls back to 70.
# Returns:
#   Always 0. Silently does nothing if the manifest is missing or jq is not
#   installed.
# ---------------------------------------------------------------------------
check_duplicates() {
  local manifest="$1"
  local index_file="${2:-wiki/index.md}"

  [[ -f "$manifest" ]] || return 0
  command -v jq >/dev/null 2>&1 || return 0

  # New leaf slugs from pages created this run.
  local -a new_slugs=()
  local slug
  while IFS= read -r slug; do
    [[ -n "$slug" ]] && new_slugs+=("$slug")
  done < <(jq -r '.pages[]? | select(.status=="created") | .path | split("/")[-1] | sub("\\.md$";"")' "$manifest" 2>/dev/null)

  # Existing entity/concept slugs already catalogued in the index. The inner
  # loop consumes the line match by match so that several links on one line
  # are all collected; the slug may be closed by ']]' or by an '|alias'.
  local -a existing_slugs=()
  if [[ -f "$index_file" ]]; then
    local link_re='\[\[(entities|concepts)/([a-z0-9-]+)(\||\]\])'
    local line rest
    while IFS= read -r line || [[ -n "$line" ]]; do
      rest=$line
      while [[ $rest =~ $link_re ]]; do
        existing_slugs+=("${BASH_REMATCH[2]}")
        # Drop everything up to and including this match, then rescan.
        rest=${rest#*"${BASH_REMATCH[0]}"}
      done
    done < "$index_file"
  fi

  (( ${#new_slugs[@]} && ${#existing_slugs[@]} )) || return 0

  # The threshold ends up inside (( )), where an arbitrary string would be
  # evaluated as an expression. Accept digits only, and force base 10 so a
  # value such as '080' is not parsed as (invalid) octal.
  local threshold="${KG_DUP_THRESHOLD:-70}"
  [[ "$threshold" =~ ^[0-9]+$ ]] || threshold=70
  threshold=$(( 10#$threshold ))

  local new exist sim
  for new in "${new_slugs[@]}"; do
    for exist in "${existing_slugs[@]}"; do
      [[ "$new" == "$exist" ]] && continue # skip exact self-match (see header)
      sim=$(similarity "$new" "$exist")
      if (( sim > threshold )); then
        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
      fi
    done
  done

  return 0
}