feat(ingest): Implement wikilink-based deduplication for index entries

This commit is contained in:
Matteo Cherubini 2026-06-05 09:59:18 +02:00
parent 203fbadd63
commit e8980b5526

View file

@ -18,6 +18,7 @@ import re
import sys
# Matches any index entry line, i.e. a line that starts with "- [[".
ENTRY_RE = re.compile(r"^- \[\[")
# Captures the text between the opening "[[" and the closing "]]" of an entry
# line (one or more characters that are not "]").
# NOTE(review): the capture includes any "|alias" or "#anchor" text that sits
# inside the brackets — confirm index entries never carry those, otherwise two
# lines pointing at the same page would yield different captured values.
LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]")
# Matches lines that start with "## " (presumably Markdown section headers —
# confirm against the index file format).
HEADER_RE = re.compile(r"^## ")
@ -71,11 +72,31 @@ def main() -> int:
intro = [ln for ln in body if not ENTRY_RE.match(ln)]
entries = [ln for ln in body if ENTRY_RE.match(ln)]
if args.entry in entries:
print(f"index-append: entry already present, skipping")
return 0
# Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed
# summary/maturity should UPDATE the existing entry, not add a duplicate line.
new_m = LINK_RE.match(args.entry)
new_link = new_m.group(1) if new_m else None
if new_link is not None:
replaced = False
for idx, ln in enumerate(entries):
m = LINK_RE.match(ln)
if m and m.group(1) == new_link:
if ln == args.entry:
print("index-append: entry already present, skipping")
return 0
entries[idx] = args.entry # same page, refreshed text
replaced = True
break
if not replaced:
entries.append(args.entry)
else:
# No parseable wikilink — fall back to exact-line dedup.
if args.entry in entries:
print("index-append: entry already present, skipping")
return 0
entries.append(args.entry)
entries.append(args.entry)
entries.sort(key=str.casefold)
# Normalise intro: drop trailing blanks, keep header + comment(s)