knowledge-genome-orchestrator/lib/lint.sh

201 lines
6.9 KiB
Bash

#!/usr/bin/env bash
# =============================================================================
# lib/lint.sh
# Validation logic for Knowledge Genome files.
# =============================================================================
# Allowed values for the 'type' frontmatter field.
# Keep this list in sync with the type list in templates/agents-genome.md.
#   - 'index' and 'log' are wiki-level singleton files (wiki/index.md,
#     wiki/log.md).
#   - 'conflict' has no dedicated scaffold directory; it is a cross-cutting
#     type that can live under any wiki/ subdirectory.
VALID_TYPES=(
  "source"
  "entity"
  "concept"
  "query"
  "conflict"
  "private"
  "index"
  "log"
)
# ---------------------------------------------------------------------------
# lint_markdown_file <file> <genome_name>
# Validates YAML frontmatter: opening delimiter, mandatory fields, domain,
# and type value. Every problem found is reported through warn.
#
# Arguments:
#   $1 - path of the markdown file to lint
#   $2 - genome name that the 'domain' field must equal; when empty, the
#        domain comparison is skipped
# Returns the number of errors found (0 when the file is clean).
# ---------------------------------------------------------------------------
lint_markdown_file() {
  local file="$1"
  local genome_name="$2"
  local errors=0

  # 1. Check the opening frontmatter delimiter: line 1 must be exactly '---'.
  if [[ "$(head -n 1 "$file")" != "---" ]]; then
    warn "Missing frontmatter start (---) in: $file"
    errors=$((errors + 1))
  fi

  # 2. Check mandatory fields (each must start a line somewhere in the file).
  #    'field' is declared local so the loop cannot clobber a caller's variable.
  local field
  local mandatory_fields=("title:" "type:" "domain:" "maturity:" "last_updated:")
  for field in "${mandatory_fields[@]}"; do
    if ! grep -q "^${field}" "$file"; then
      warn "Missing mandatory field '${field}' in: $file"
      errors=$((errors + 1))
    fi
  done

  # 3. Check domain matches genome name.
  #    The value of the first 'domain:' line is extracted (surrounding
  #    whitespace and double quotes removed) and compared as a plain string.
  #    An exact comparison avoids two pitfalls of a regex prefix match:
  #    genome "ai" accepting "domain: ai-ethics", and regex metacharacters in
  #    the genome name being interpreted as a pattern.
  if [[ -n "$genome_name" ]] && grep -q "^domain:" "$file"; then
    local domain_value
    domain_value=$(grep "^domain:" "$file" | head -n 1 \
      | sed -e 's/^domain:[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -d '"')
    if [[ "$domain_value" != "$genome_name" ]]; then
      warn "Domain mismatch in $file (expected '${genome_name}')"
      errors=$((errors + 1))
    fi
  fi

  # 4. Validate type value against allowed list.
  #    '|| type_errors=$?' captures the status without tripping errexit when
  #    the calling script runs under 'set -e'.
  local type_errors=0
  check_valid_type "$file" || type_errors=$?
  errors=$((errors + type_errors))

  return "$errors"
}
# ---------------------------------------------------------------------------
# check_valid_type <file>
# Verifies that the 'type' field contains a value from VALID_TYPES.
# Only the first 'type:' line is considered; double quotes and surrounding
# whitespace (including a trailing CR from CRLF files) are ignored.
#
# Arguments:
#   $1 - path of the markdown file to inspect
# Globals:
#   VALID_TYPES (read)
# Returns 1 if invalid, 0 if valid or field absent (absence is reported by
# lint_markdown_file).
# ---------------------------------------------------------------------------
check_valid_type() {
  local file="$1"
  local type_value candidate

  type_value=$(grep "^type:" "$file" | head -n 1 \
    | sed -e 's/^type:[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -d '"')

  # Absence (or an empty value) is caught upstream by the mandatory-field check.
  if [[ -z "$type_value" ]]; then
    return 0
  fi

  # 'candidate' is local so the loop never overwrites a caller's variable.
  for candidate in "${VALID_TYPES[@]}"; do
    if [[ "$type_value" == "$candidate" ]]; then
      return 0
    fi
  done

  warn "Invalid type value '${type_value}' in: $file"
  warn " Valid types: ${VALID_TYPES[*]}"
  return 1
}
# ---------------------------------------------------------------------------
# check_privacy_consistency <file>
# Cross-checks a file's location against its 'private: true' marker:
#   - path contains /private/ but the marker is missing -> error, returns 1
#   - path is outside private/ but the marker is present -> warning only
# Returns 1 for a privacy leak, 0 in every other case.
# ---------------------------------------------------------------------------
check_privacy_consistency() {
  local file="$1"
  local marked_private=0

  # Look for the marker once; both branches below depend on it.
  if grep -q "^private: true" "$file"; then
    marked_private=1
  fi

  case "$file" in
    *"/private/"*)
      if [[ $marked_private -eq 0 ]]; then
        error "Privacy leak: $file is in a private/ directory but lacks 'private: true'."
        return 1
      fi
      ;;
    *)
      if [[ $marked_private -eq 1 ]]; then
        warn "Metadata mismatch: $file is marked 'private: true' but is in a public directory."
      fi
      ;;
  esac

  return 0
}
# ---------------------------------------------------------------------------
# check_knowledge_decay <file>
# Reads 'maturity' and 'last_updated' from frontmatter and compares against
# the staleness thresholds defined in agents-genome.md:
#   maturity: stable -> flag if last_updated > 180 days ago
#   maturity: draft  -> flag if last_updated > 90 days ago
#
# Only the first occurrence of each field is used. Double quotes and
# surrounding whitespace (including the CR of CRLF files) are stripped, so
# such files are evaluated instead of being silently skipped.
#
# Arguments:
#   $1 - path of the markdown file to inspect
# Returns 1 if the file is stale, 0 otherwise.
# Silently skips files with missing or unparseable date fields, and files
# whose maturity is neither 'stable' nor 'draft'.
# ---------------------------------------------------------------------------
check_knowledge_decay() {
  local file="$1"
  local last_updated maturity

  last_updated=$(grep "^last_updated:" "$file" | head -n 1 \
    | sed -e 's/^last_updated:[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -d '"')
  maturity=$(grep "^maturity:" "$file" | head -n 1 \
    | sed -e 's/^maturity:[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -d '"')

  # Skip if either field is absent.
  if [[ -z "$last_updated" || -z "$maturity" ]]; then
    return 0
  fi

  # Pick the threshold; any other maturity is not decay-trackable.
  local threshold
  case "$maturity" in
    stable) threshold=180 ;;
    draft) threshold=90 ;;
    *) return 0 ;;
  esac

  # Parse date — handle both GNU date (Linux) and BSD date (macOS).
  # A failing 'date' means the value is unparseable: skip silently. Handling
  # the failure explicitly also keeps the function safe under 'set -e'.
  local updated_ts
  if date --version >/dev/null 2>&1; then
    # GNU date
    updated_ts=$(date -d "$last_updated" +%s 2>/dev/null) || return 0
  else
    # BSD date (macOS)
    updated_ts=$(date -j -f "%Y-%m-%d" "$last_updated" +%s 2>/dev/null) || return 0
  fi
  # Guard the arithmetic below against anything that is not an integer.
  [[ "$updated_ts" =~ ^-?[0-9]+$ ]] || return 0

  local now days_old
  now=$(date +%s)
  days_old=$(( (now - updated_ts) / 86400 ))

  if [[ $days_old -gt $threshold ]]; then
    warn "STALE: $file"
    warn " maturity: ${maturity} | last_updated: ${last_updated} | ${days_old} days ago (threshold: ${threshold})"
    return 1
  fi
  return 0
}
# ---------------------------------------------------------------------------
# check_page_size <file>
# Enforces the page length limits defined in agents-genome.md:
#   soft cap: 400 lines -> warn
#   hard cap: 800 lines -> error
# These limits ensure pages fit within the LLM context window without
# attention degradation and keep the wiki atomically navigable.
#
# Arguments:
#   $1 - path of the page to measure
# Returns 1 when the hard cap is exceeded, 0 otherwise (including when only
# the soft cap is exceeded).
# ---------------------------------------------------------------------------
check_page_size() {
  local file="$1"
  local lines

  # BSD/macOS 'wc -l' left-pads its output with blanks. Passing the value
  # through arithmetic expansion yields a bare integer, so the messages below
  # read "(801 lines" on every platform. An empty result evaluates to 0.
  lines=$(( $(wc -l < "$file") ))

  if [[ $lines -gt 800 ]]; then
    error "Page too long (${lines} lines, hard cap 800): $file"
    error " Split this page into focused sub-pages and link them."
    return 1
  fi

  if [[ $lines -gt 400 ]]; then
    warn "Page approaching limit (${lines} lines, soft cap 400): $file"
  fi
  return 0
}
# ---------------------------------------------------------------------------
# check_broken_links <file>
# Basic check for internal [[wikilinks]] that cannot be resolved locally.
# A link resolves when "<link>.md" (or the link itself if it already ends in
# .md) exists next to the file or in its parent directory.
# Only emits warnings — cross-genome links may legitimately not resolve here.
#
# Arguments:
#   $1 - path of the markdown file to scan
# Always returns 0.
# ---------------------------------------------------------------------------
check_broken_links() {
  local file="$1"
  local base_dir link target
  base_dir=$(dirname -- "$file")

  # Links are read line by line rather than via an unquoted 'for' expansion,
  # so names containing spaces or glob characters stay intact.
  while IFS= read -r link; do
    # '[[|Alias]]' leaves an empty target; nothing to resolve.
    [[ -z "$link" ]] && continue

    target="$link"
    [[ "$target" != *.md ]] && target="${target}.md"

    if [[ ! -f "${base_dir}/${target}" && ! -f "${base_dir}/../${target}" ]]; then
      warn "Potential broken link: [[$link]] in $file"
    fi
  done < <(
    # Extract link targets, stripping aliases: [[Link|Alias]] -> Link.
    # In a POSIX bracket expression a backslash is literal, so ']' must be
    # listed first ('[^]]') to mean "any character except ']'".
    grep -oE '\[\[[^]]+' "$file" 2>/dev/null | sed 's/^\[\[//' | cut -d'|' -f1
  )

  return 0
}