From e8980b55260c90b9d5d0f35208094f2b892d1e93 Mon Sep 17 00:00:00 2001 From: Matteo Cherubini Date: Fri, 5 Jun 2026 09:59:18 +0200 Subject: [PATCH] feat(ingest): Implement wikilink-based deduplication for index entries --- skills/ingest/scripts/index-append.py | 29 +++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/skills/ingest/scripts/index-append.py b/skills/ingest/scripts/index-append.py index e70009a..a4fc718 100644 --- a/skills/ingest/scripts/index-append.py +++ b/skills/ingest/scripts/index-append.py @@ -18,6 +18,7 @@ import re import sys ENTRY_RE = re.compile(r"^- \[\[") +LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]") HEADER_RE = re.compile(r"^## ") @@ -71,11 +72,31 @@ def main() -> int: intro = [ln for ln in body if not ENTRY_RE.match(ln)] entries = [ln for ln in body if ENTRY_RE.match(ln)] - if args.entry in entries: - print(f"index-append: entry already present, skipping") - return 0 + # Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed + # summary/maturity should UPDATE the existing entry, not add a duplicate line. + new_m = LINK_RE.match(args.entry) + new_link = new_m.group(1) if new_m else None + + if new_link is not None: + replaced = False + for idx, ln in enumerate(entries): + m = LINK_RE.match(ln) + if m and m.group(1) == new_link: + if ln == args.entry: + print("index-append: entry already present, skipping") + return 0 + entries[idx] = args.entry # same page, refreshed text + replaced = True + break + if not replaced: + entries.append(args.entry) + else: + # No parseable wikilink — fall back to exact-line dedup. + if args.entry in entries: + print("index-append: entry already present, skipping") + return 0 + entries.append(args.entry) - entries.append(args.entry) entries.sort(key=str.casefold) # Normalise intro: drop trailing blanks, keep header + comment(s)