Merge branch 'release/1.3.1' into main

2026-06-19 05:53:48 +02:00 · 2026-06-19 05:53:48 +02:00 · 4ae1a3de5f
commit 4ae1a3de5f
parent 4b99b0acd2 b0de520f9d
8 changed files with 342 additions and 71 deletions
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 # =============================================================================
-# Knowledge Genome - Makefile v. 1.3.0
+# Knowledge Genome - Makefile v. 1.3.1
 # Orchestrates the setup and management of the knowledge base.
 # =============================================================================

--- a/README.md
+++ b/README.md
@ -77,6 +77,11 @@ master-knowledge-genome/              ← Root orchestrator (submodule registry)
 └── AGENTS.md                         ← Global coordination schema (cross-genome rules)
 ```

+> The genome names above (`genome-dev`, `genome-finance`, `genome-homelab`) are
+> **illustrative** — they show the kind of multi-domain layout this orchestrator targets.
+> The shipped `registry.sh` defines a single disposable sandbox, **`genome-test`**; you
+> create real genomes yourself with `make add-genome` (see the registry examples below).
+
 Each genome is an independent git repository:

 ```text
@ -175,6 +180,11 @@ knowledge-genome-orchestrator/        ← This repository (setup tooling)
 > The `skills/ingest/` directory is version-controlled here but **deployed** to the AI
 > node (vm101) under `~/.pi/agent/skills/ingest`. The agent (`pi`) does only semantic work
 > and writes a manifest; `run-ingest.sh` does the mechanical steps. See [Workflows → Ingest](#ingest).
+>
+> `ingest-semantic.py` makes one schema-constrained call to the local model and gets JSON
+> back, then deterministically conforms that JSON into wiki pages and writes the manifest.
+> `run-ingest.sh` handles the mechanical follow-up: index, log, lint, and PR.
+>
+> To deploy, run `cp skills/ingest/* ~/.pi/agent/skills/ingest/` after `make setup`. Updates
+> arrive via `git pull` on the laptop and are pushed to vm101 over SSH in the n8n flow.

 ---

@ -807,7 +817,10 @@ model must not waste context on:
 8. Appends the `INGEST | <slug>` entry to `wiki/log.md` (the model name comes from the
   orchestrator via `INGEST_MODEL` — the agent cannot reliably know its own tag)
 9. Runs scoped lint on exactly the pages touched this run (`scoped-lint.sh`, reusing
-   `lib/lint.sh`)
+   `lib/lint.sh`), including a **duplicate-slug advisory**: a slug created this run that is
+   highly similar to an entity/concept already in `wiki/index.md` is flagged in the PR so a
+   human can merge them. It is advisory only — it never fails the lint or blocks the PR
+   (threshold tunable via `KG_DUP_THRESHOLD`, default 70)
 10. Commits **only `wiki/`** on `feat/ai-ingest-<slug>` and opens a PR against the integration
    base (`INGEST_BASE`, default `main`); the body matches the `templates/pr-description.md`
    structure (Summary / Pages / Contradictions / Scoped Lint)
@ -960,6 +973,10 @@ The agent proposes re-validation but does not change `maturity` without new sour

 ### Cross-genome references

+> **Status: planned.** The cross-genome collector and **navigation skill** described in this
+> section are specified but **not yet implemented** in this release — only the `ingest` skill
+> ships today. What follows documents the intended design and the boundary contract it will honour.
+
 Cross-domain knowledge moves by **pull, never push**: the genome you are working in draws
 material _in_; nothing is ever written into another genome. There are **no cross-genome
 wikilinks** — submodule pointers make relative paths brittle.
@ -1062,7 +1079,7 @@ grep "^## \[" wiki/log.md | grep "CONFLICT"  # All conflicts
 grep "^## \[2026-05" wiki/log.md             # Entries from a specific month
 ```

-The orchestrator always injects only `tail -n 20 wiki/log.md` into agent context.
+`ingest-semantic.py` receives the source text plus the existing entity/concept names (read from `wiki/index.md`) as prompt context.
 The LLM never loads the full log.

 ---
@ -1122,6 +1139,8 @@ Note: `.obsidian/` is in `.gitignore`. Workspace and plugin settings are local

 ### n8n automation

+Pipeline at a glance: n8n → SSH → `ingest-semantic.py <genome> <raw>` → `run-ingest.sh <genome>`.
+
 n8n (running on the storage node) can automate the ingest pipeline:

 1. Forgejo webhook fires on push to a genome's `raw/` directory
--- a/lib/lint.sh
+++ b/lib/lint.sh
@ -208,3 +208,111 @@ check_broken_links() {
    fi
  done <<< "$links"
 }
+
+# ---------------------------------------------------------------------------
+# levenshtein <s1> <s2>
+# Classic edit distance via a two-row rolling buffer, so every array subscript
+# is a single integer. The previous implementation used comma subscripts
+# (d[i,j]); in bash arithmetic the comma operator collapses to one dimension,
+# so the table aliased onto itself and returned wrong distances — it could not
+# even score two identical strings as 0. This form is portable to bash 3.2
+# (no associative arrays). Echoes the integer distance.
+#
+# Arguments: $1, $2 - the two strings to compare (either may be empty).
+# Outputs:   the edit distance as a decimal integer on stdout.
+# ---------------------------------------------------------------------------
+levenshtein() {
+  local s1="$1" s2="$2"
+  local len1=${#s1} len2=${#s2}
+  # Against an empty string the distance is simply the other string's length.
+  (( len1 == 0 )) && { echo "$len2"; return; }
+  (( len2 == 0 )) && { echo "$len1"; return; }
+
+  # prev holds the finished row for the first i-1 characters of s1;
+  # curr is the row being filled for the first i characters.
+  local -a prev=() curr=()
+  local i j cost del ins sub min
+  # Row 0: reaching the first j characters of s2 from nothing costs j inserts.
+  for (( j = 0; j <= len2; j++ )); do prev[j]=$j; done
+
+  for (( i = 1; i <= len1; i++ )); do
+    # Column 0: i characters of s1 against an empty prefix of s2.
+    curr[0]=$i
+    for (( j = 1; j <= len2; j++ )); do
+      # Substitution is free when the two current characters already match.
+      cost=1
+      [[ "${s1:i-1:1}" == "${s2:j-1:1}" ]] && cost=0
+      del=$(( prev[j] + 1 ))
+      ins=$(( curr[j-1] + 1 ))
+      sub=$(( prev[j-1] + cost ))
+      # Keep the cheapest of the three candidate edits.
+      min=$del
+      (( ins < min )) && min=$ins
+      (( sub < min )) && min=$sub
+      curr[j]=$min
+    done
+    # The finished row becomes the previous row for the next character of s1.
+    prev=( "${curr[@]}" )
+  done
+
+  # After the last row, the final cell is the distance between the full strings.
+  echo "${prev[len2]}"
+}
+
+# ---------------------------------------------------------------------------
+# similarity <s1> <s2>
+# Converts the edit distance between two strings into a percentage, scaled by
+# the longer string's length: 100 means identical, 0 means nothing in common.
+# A pair of empty strings counts as identical (100), which also keeps the
+# division below from ever seeing a zero divisor. Echoes the integer score.
+# ---------------------------------------------------------------------------
+similarity() {
+  local left="$1" right="$2"
+  local longest distance
+
+  # The longer of the two lengths is the scale for the percentage.
+  if (( ${#left} >= ${#right} )); then
+    longest=${#left}
+  else
+    longest=${#right}
+  fi
+
+  if (( longest == 0 )); then
+    echo "100"
+    return
+  fi
+
+  distance=$(levenshtein "$left" "$right")
+  echo $(( 100 - (distance * 100 / longest) ))
+}
+
+# ---------------------------------------------------------------------------
+# check_duplicates <manifest>
+# Advisory only: warns when a page created this run has a slug suspiciously
+# close to an entity/concept already listed in wiki/index.md, so a human can
+# merge them in the PR rather than grow two near-identical pages. Never fails
+# the lint (always returns 0), exactly like check_broken_links.
+#
+# The threshold is tunable via KG_DUP_THRESHOLD (default 70); a pair is flagged
+# when its similarity is strictly greater than the threshold. A value that is
+# not a plain non-negative integer is reported and replaced by the default, so
+# a typo such as "70%" cannot break the arithmetic comparison, and a leading
+# zero ("070") is read as decimal rather than octal.
+#
+# Exact self-matches are skipped: step 1 of run-ingest.sh appends this run's
+# new slugs to the index BEFORE the lint runs, so without the skip every new
+# slug would match itself at 100%. A page that genuinely collides with a
+# pre-existing file is reported by the manifest as 'modified', not 'created',
+# so skipping created==existing pairs can never mask a real collision.
+#
+# Arguments: $1 - path to the ingest manifest (JSON); a missing file is a no-op.
+# Globals:   KG_DUP_THRESHOLD (read). wiki/index.md is read relative to the cwd.
+# Requires:  jq (the check is silently skipped when it is absent) and warn().
+# Returns:   always 0.
+# ---------------------------------------------------------------------------
+check_duplicates() {
+  local manifest="$1"
+  [[ -f "$manifest" ]] || return 0
+  command -v jq >/dev/null 2>&1 || return 0
+
+  # New leaf slugs from pages created this run.
+  local -a new_slugs=()
+  local slug
+  while IFS= read -r slug; do
+    [[ -n "$slug" ]] && new_slugs+=("$slug")
+  done < <(jq -r '.pages[]? | select(.status=="created") | .path
+                  | split("/")[-1] | sub("\\.md$";"")' "$manifest" 2>/dev/null)
+
+  # Existing entity/concept slugs already catalogued in the index. A slug that
+  # appears on several index lines is kept once, otherwise the same advisory
+  # would be printed once per occurrence. "seen" is a space-delimited set, which
+  # is safe because slugs only contain [a-z0-9-] and works on bash 3.2.
+  local -a existing_slugs=()
+  local seen=" "
+  if [[ -f "wiki/index.md" ]]; then
+    local line
+    # The "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
+    while IFS= read -r line || [[ -n "$line" ]]; do
+      if [[ $line =~ \[\[(entities|concepts)/([a-z0-9-]+)\]\] ]]; then
+        slug="${BASH_REMATCH[2]}"
+        [[ "$seen" == *" ${slug} "* ]] && continue
+        seen+="${slug} "
+        existing_slugs+=("$slug")
+      fi
+    done < "wiki/index.md"
+  fi
+
+  (( ${#new_slugs[@]} && ${#existing_slugs[@]} )) || return 0
+
+  # Validate the threshold once, before it reaches an arithmetic context.
+  local threshold="${KG_DUP_THRESHOLD:-70}"
+  if [[ "$threshold" =~ ^[0-9]+$ ]]; then
+    threshold=$(( 10#$threshold ))   # force base 10 so "070" is not octal
+  else
+    warn "KG_DUP_THRESHOLD='${threshold}' is not a non-negative integer — using 70"
+    threshold=70
+  fi
+
+  local new exist sim
+  for new in "${new_slugs[@]}"; do
+    for exist in "${existing_slugs[@]}"; do
+      [[ "$new" == "$exist" ]] && continue   # skip exact self-match (see header)
+      sim=$(similarity "$new" "$exist")
+      if (( sim > threshold )); then
+        warn "Possible duplicate: '${new}' ≈ '${exist}' (${sim}% similar) — review in PR"
+      fi
+    done
+  done
+  return 0
+}
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@ -1,24 +1,23 @@
 #!/usr/bin/env python3
 # =============================================================================
 # skills/ingest/scripts/ingest-semantic.py
-# Phase 1 (semantic) of the Knowledge Genome ingest — the LIGHT version.
+# Phase 1 (semantic) of the Knowledge Genome ingest — light agent + deterministic conform.
+#
+# - FIXED: Add 'title:' field to frontmatter (lint was complaining about missing title)
+# - NEW: Inject existing index (entity/concept names) into prompt to prevent duplicates
+# - NEW: Richer prompt asking for 2-4 sentences per description (not 1-2), with concrete details
+# - Enhanced schema to handle longer descriptions naturally
 #
 # The model does ONLY semantic extraction and returns ONE schema-constrained JSON
 # object (no tools, no file writing, no git, no frontmatter, no slugs). This script
 # then CONFORMS that output deterministically into wiki pages with enforced
 # frontmatter + kebab-case paths, and writes a .ingest-manifest.json in EXACTLY the
-# schema run-ingest.sh expects. run-ingest.sh (phase 2) then does index / log /
-# scoped-lint / PR, unchanged.
+# schema run-ingest.sh expects.
 #
 #   cd <genome checkout>
 #   ingest-semantic.py <genome> raw/articles/<file>.md      # phase 1 (this)
 #   run-ingest.sh      <genome>                             # phase 2 (deterministic)
 #
-# Why this shape: local tool-calling via pi/ollama proved fragile, and a small
-# model does not reliably honour folders / naming / frontmatter / manifest schema
-# when it writes files itself. Here the model cannot break the contract because it
-# never touches the filesystem — the script owns all structure. Stdlib only.
-#
 # Emits a single JSON status line on stdout (for n8n / logs).
 # =============================================================================
 import json, os, re, sys, datetime, urllib.request, urllib.error
@ -57,7 +56,84 @@ if not source_text.strip():
    die("preflight", "source is empty: " + raw_rel)


-# --- the semantic contract (authoritative copy; SKILL.md documents it) ---
+# --- read existing index to avoid duplicate slugs ---
+existing_entities = set()
+existing_concepts = set()
+if os.path.isfile("wiki/index.md"):
+    try:
+        with open("wiki/index.md", "r", encoding="utf-8") as f:
+            idx_text = f.read()
+        # extract slugs from [[entities/slug]] and [[concepts/slug]] patterns
+        for m in re.finditer(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text):
+            existing_entities.add(m.group(1))
+        for m in re.finditer(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text):
+            existing_concepts.add(m.group(1))
+    except Exception:
+        pass  # index not readable or not found; that's OK
+
+
+def slugify(s):
+    s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
+    return re.sub(r"-+", "-", s).strip("-") or "untitled"
+
+
+def twords(s, n=20):
+    """Truncate at n words; used for index entry summaries."""
+    s = " ".join((s or "").split())
+    w = s.split(" ")
+    return s if len(w) <= n else " ".join(w[:n]) + "…"
+
+
+def yaml_dq(s):
+    """Render a value as a YAML double-quoted scalar.
+
+    Titles can contain characters that break a bare scalar — most commonly a
+    colon-space ('Conflict: X' would parse as a mapping), but also '#', leading
+    '-'/'?', quotes, etc. Double-quoting and escaping '\\' and '"' makes any
+    title valid YAML (and keeps Obsidian/Dataview/qmd happy). Newlines are
+    collapsed to spaces so the scalar stays on one line.
+    """
+    s = " ".join((s or "").split())
+    s = s.replace("\\", "\\\\").replace('"', '\\"')
+    return f'"{s}"'
+
+
+def frontmatter(ptype, title, tags):
+    """Return YAML frontmatter with title field."""
+    taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]"
+    return ("---\n"
+            f"title: {yaml_dq(title)}\n"
+            f"type: {ptype}\n"
+            f"domain: {genome}\n"
+            "maturity: draft\n"
+            f"last_updated: {TODAY}\n"
+            "private: false\n"
+            f"tags: {taglist}\n"
+            "---\n")
+
+
+def write_new(path, ptype, title, body, tags):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(frontmatter(ptype, title, tags))
+        f.write(f"\n# {title}\n\n{body}\n")
+
+
+def append_section(path, source_slug, body):
+    """Append a '## From [[sources/<source_slug>]]' section to an existing page.
+
+    After the append, the first line in the file that starts with
+    'last_updated:' is rewritten to carry TODAY. That second step is
+    best-effort: any exception it raises is ignored, so the appended section
+    is kept even when the date bump fails.
+    """
+    # never overwrite an existing page: accumulate, attributed to the new source
+    with open(path, "a", encoding="utf-8") as f:
+        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
+    try:  # best-effort bump of last_updated in the existing frontmatter
+        with open(path, "r", encoding="utf-8") as f:
+            txt = f.read()
+        # count=1: only the first matching line is touched.
+        txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1)
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(txt)
+    except Exception:
+        pass
+
+
+# --- the semantic contract ---
 SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
 Read the source and return ONLY structured data describing what it contains.
 You do not write files, you do not produce frontmatter, and you do not invent
@ -65,17 +141,29 @@ paths, slugs, branches, commits or PRs — a deterministic script does all of th

 Rules:
 - source_summary: a faithful, self-contained summary of the source, in the
-  source's own language. Plain prose, no markdown headings.
- key_points: the handful of concrete facts/claims worth indexing.
+  source's own language. Plain prose, NO markdown headings. 2-4 sentences,
+  with concrete details. Preserve the essence and nuance of the source.
+- key_points: 3-5 concrete facts or claims worth indexing; no padding.
 - entities: every person, tool, organisation or product the source names.
-  kind is one of person|tool|org|product. description is one or two factual
-  sentences. No markdown headings inside the description.
+  kind is one of person|tool|org|product. description is 2-3 factual sentences
+  with specifics. No markdown headings inside the description.
 - concepts: every pattern, theory, decision or named idea the source explains.
-  description is one or two factual sentences.
+  description is 2-3 factual sentences with concrete examples or context.
 - contradictions: ONLY when the source makes a claim that directly contradicts a
  widely-known fact or contradicts itself. Otherwise return an empty list.
 - Names must be the natural name of the thing; the script will normalise them.
-Do not pad. Be faithful to the source."""
+
+If the source references an entity or concept already in the wiki (see the list below),
+use the EXACT name already present; do not invent a variant. This prevents duplicates.
+
+Existing entities in this genome:
+{existing_entities}
+
+Existing concepts in this genome:
+{existing_concepts}
+
+Be faithful to the source. Be specific. Do not pad or improvise."""
+

 # --- JSON schema -> constrained decoding (Ollama structured outputs) ---
 SCHEMA = {
@ -118,10 +206,19 @@ SCHEMA = {


 def call_model():
+    # format existing names as a human-readable list
+    existing_ents = ", ".join(sorted(existing_entities)) or "(none yet)"
+    existing_conc = ", ".join(sorted(existing_concepts)) or "(none yet)"
+
+    prompt = SYSTEM_PROMPT.format(
+        existing_entities=existing_ents,
+        existing_concepts=existing_conc,
+    )
+
    payload = {
        "model": MODEL,
        "messages": [
-            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "system", "content": prompt},
            {"role": "user", "content":
                "Source path: " + raw_rel + "\n\n--- SOURCE START ---\n"
                + source_text + "\n--- SOURCE END ---\n\nReturn the JSON now."},
@ -152,51 +249,6 @@ def call_model():
        die("model", "model did not return valid JSON: " + str(e))


-# --- conform helpers (the script OWNS all structure) ---
-def slugify(s):
-    s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
-    return re.sub(r"-+", "-", s).strip("-") or "untitled"
-
-
-def twords(s, n=12):
-    s = " ".join((s or "").split())
-    w = s.split(" ")
-    return s if len(w) <= n else " ".join(w[:n]) + "…"
-
-
-def frontmatter(ptype, tags):
-    taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]"
-    return ("---\n"
-            f"type: {ptype}\n"
-            f"domain: {genome}\n"
-            "maturity: draft\n"
-            f"last_updated: {TODAY}\n"
-            "private: false\n"
-            f"tags: {taglist}\n"
-            "---\n")
-
-
-def write_new(path, ptype, title, body, tags):
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(frontmatter(ptype, tags))
-        f.write(f"\n# {title}\n\n{body}\n")
-
-
-def append_section(path, source_slug, body):
-    # never overwrite an existing page: accumulate, attributed to the new source
-    with open(path, "a", encoding="utf-8") as f:
-        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
-    try:  # best-effort bump of last_updated in the existing frontmatter
-        with open(path, "r", encoding="utf-8") as f:
-            txt = f.read()
-        txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1)
-        with open(path, "w", encoding="utf-8") as f:
-            f.write(txt)
-    except Exception:
-        pass
-
-
 # --- run the semantic pass ---
 sem = call_model()
 source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
@ -210,14 +262,15 @@ src_body   = (sem.get("source_summary") or "").strip()
 if kp_lines:
    src_body += "\n\n## Key points\n\n" + kp_lines
 src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
-src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
-            + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
+src_title  = sem.get('source_title') or source_slug
+src_tags   = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+              + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
 os.makedirs("wiki/sources", exist_ok=True)
 with open(src_path, "w", encoding="utf-8") as f:
-    f.write(frontmatter("source", src_tags))
-    f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n")
+    f.write(frontmatter("source", src_title, src_tags))
+    f.write(f"\n# {src_title}\n\n{src_body}\n")
 pages.append({"path": src_path,
-              "summary": twords(sem.get("source_title") or source_slug),
+              "summary": twords(src_title),
              "maturity": "draft", "status": src_status})


--- a/skills/ingest/scripts/run-ingest.sh
+++ b/skills/ingest/scripts/run-ingest.sh
@ -113,7 +113,14 @@ bash "${SCRIPTS}/log-append.sh" --type INGEST --subject "$slug" --model "$model"
  || fail "log" "log-append failed"

 # --- 3. scoped linter (capture findings for the PR; never aborts the run) ---
-lint_out="$( bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1 )" && lint_rc=0 || lint_rc=$?
+# Point scoped-lint at the same manifest we were handed so its duplicate
+# advisory reads the right file even when a non-default path arrives as $2.
+# (The dedup check lives inside lib/lint.sh and is invoked by scoped-lint —
+# there is no separate check-duplicates.sh script.)
+export INGEST_MANIFEST="$manifest"
+lint_out="$(
+  bash "${SCRIPTS}/scoped-lint.sh" "$genome" "${all_paths[@]}" 2>&1
+)" && lint_rc=0 || lint_rc=$?

 # --- 4. assemble the PR body (manifest tables + lint results) ---
 body="$(mktemp)"
--- a/skills/ingest/scripts/scoped-lint.sh
+++ b/skills/ingest/scripts/scoped-lint.sh
@ -49,6 +49,13 @@ for f in "$@"; do
  check_broken_links        "$f" || true   # warnings only
 done

+# Cross-page duplicate advisory: runs ONCE over the whole manifest (not per
+# file) — it compares this run's created slugs against the index, so repeating
+# it for every file would only print the same warnings N times. Warn-only;
+# never affects the exit status. INGEST_MANIFEST lets run-ingest.sh point us at
+# a non-default manifest path; falls back to the conventional name otherwise.
+check_duplicates "${INGEST_MANIFEST:-.ingest-manifest.json}"
+
 echo ""
 echo "scoped-lint: ${errors} error(s), ${stale} stale across ${count} file(s)"

--- a/tests/README.md
+++ b/tests/README.md
@ -9,7 +9,7 @@ they run anywhere (laptop, CI, a git hook). They do **not** belong on vm101 or i
 | File | Covers |
 |------|--------|
 | `scripts.bats`    | `slug.sh`, `log-append.sh`, `index-append.py` (insert, sort, bump, idempotent) |
-| `lint.bats`       | `lib/lint.sh` validators + `scoped-lint.sh` reuse |
+| `lint.bats`       | `lib/lint.sh` validators + `scoped-lint.sh` reuse + duplicate-slug advisory (edit-distance math, self-match skip, once-per-run) |
 | `structure.bats`  | `lib/structure.sh` report/sync |
 | `run-ingest.bats` | `run-ingest.sh` end-to-end (DRY_RUN, local bare remote) — needs `jq` |

--- a/tests/lint.bats
+++ b/tests/lint.bats
@ -69,3 +69,80 @@ EOF
  run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test wiki/sources/good.md
  [ "$status" -eq 0 ]
 }
+
+# --- duplicate-slug advisory (check_duplicates + its distance helpers) --------
+# These guard the dedup feature: correct edit-distance math, the warn-only
+# contract, the exact-self-match skip (run-ingest appends new slugs to the
+# index before lint runs), and that the advisory fires once per run, not once
+# per file.
+
+@test "levenshtein: identical strings have distance 0" {
+  run levenshtein cat cat
+  [ "$status" -eq 0 ]
+  [ "$output" -eq 0 ]
+}
+
+@test "levenshtein: kitten→sitting is 3 (textbook case)" {
+  # Two substitutions (k→s, e→i) plus one insertion (g) give a distance of 3.
+  run levenshtein kitten sitting
+  # Assert the exit status as well, like the identical-strings test does, so a
+  # helper that fails is reported as a failure rather than by its output alone.
+  [ "$status" -eq 0 ]
+  [ "$output" -eq 3 ]
+}
+
+@test "similarity: identical strings score 100" {
+  # Distance 0 over a non-empty string must map to exactly 100 percent.
+  run similarity gpu-pricing gpu-pricing
+  # Assert the exit status too, matching the sibling levenshtein test.
+  [ "$status" -eq 0 ]
+  [ "$output" -eq 100 ]
+}
+
+@test "check_duplicates: warns on a near-duplicate of an indexed concept" {
+  G="$(make_fixture_genome)"; cd "$G"
+  printf -- '- [[concepts/llm-routing]] — x\n' >> wiki/index.md
+  cat > .ingest-manifest.json <<'JSON'
+{"raw_source":"src","pages":[{"path":"wiki/concepts/llm-routings.md","status":"created"}]}
+JSON
+  run check_duplicates .ingest-manifest.json
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"≈"* ]]
+  [[ "$output" == *"llm-routings"* ]]
+}
+
+@test "check_duplicates: silent when the new slug is unlike anything indexed" {
+  G="$(make_fixture_genome)"; cd "$G"
+  printf -- '- [[concepts/llm-routing]] — x\n' >> wiki/index.md
+  cat > .ingest-manifest.json <<'JSON'
+{"raw_source":"src","pages":[{"path":"wiki/concepts/budget-hardware.md","status":"created"}]}
+JSON
+  run check_duplicates .ingest-manifest.json
+  [ "$status" -eq 0 ]
+  [[ "$output" != *"≈"* ]]
+}
+
+@test "check_duplicates: an exact self-match is not flagged (index already has the slug)" {
+  G="$(make_fixture_genome)"; cd "$G"
+  # run-ingest step 1 inserts this run's slug into the index BEFORE lint runs;
+  # the slug must not be reported as a duplicate of itself.
+  printf -- '- [[concepts/llm-routing]] — x\n' >> wiki/index.md
+  cat > .ingest-manifest.json <<'JSON'
+{"raw_source":"src","pages":[{"path":"wiki/concepts/llm-routing.md","status":"created"}]}
+JSON
+  run check_duplicates .ingest-manifest.json
+  [ "$status" -eq 0 ]
+  [[ "$output" != *"≈"* ]]
+}
+
+@test "scoped-lint: duplicate advisory fires once across multiple files, not per file" {
+  G="$(make_fixture_genome)"
+  write_page "$G/wiki/concepts/data-pipelines.md" concept genome-test
+  write_page "$G/wiki/concepts/other-topic.md"    concept genome-test
+  printf -- '- [[concepts/data-pipeline]] — x\n' >> "$G/wiki/index.md"
+  cat > "$G/.ingest-manifest.json" <<'JSON'
+{"raw_source":"src","pages":[
+  {"path":"wiki/concepts/data-pipelines.md","status":"created"},
+  {"path":"wiki/concepts/other-topic.md","status":"created"}
+]}
+JSON
+  cd "$G"
+  export KG_LIB_DIR="$LIB_DIR"
+  run bash "$SKILL_SCRIPTS/scoped-lint.sh" genome-test \
+      wiki/concepts/data-pipelines.md wiki/concepts/other-topic.md
+  [ "$status" -eq 0 ]
+  [ "$(grep -c "≈" <<< "$output")" -eq 1 ]
+}