feat(ingest): Implement wikilink-based deduplication for index entries
This commit is contained in:
parent
203fbadd63
commit
e8980b5526
1 changed files with 25 additions and 4 deletions
|
|
@ -18,6 +18,7 @@ import re
|
|||
import sys
|
||||
|
||||
# Matches a line that begins with "- [[" (a bullet that opens with a wikilink).
# main() uses it to split the body into entry lines and non-entry (intro) lines.
ENTRY_RE = re.compile(r"^- \[\[")
|
||||
# Captures, as group 1, the text between the leading "- [[" and the closing
# "]]" of an entry line (one or more characters other than "]").
# main() compares this captured text to decide whether a new entry refers to
# the same link as an existing entry.
# NOTE(review): the capture is everything inside the brackets, so any "|alias"
# or "#anchor" suffix is part of the compared key — entries that differ only
# there are treated as different links. Confirm this is intended.
LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]")
|
||||
# Matches a line that begins with "## " (presumably a Markdown level-2
# section heading — confirm against where this pattern is used).
HEADER_RE = re.compile(r"^## ")
|
||||
|
||||
|
||||
|
|
@ -71,11 +72,31 @@ def main() -> int:
|
|||
intro = [ln for ln in body if not ENTRY_RE.match(ln)]
|
||||
entries = [ln for ln in body if ENTRY_RE.match(ln)]
|
||||
|
||||
if args.entry in entries:
|
||||
print(f"index-append: entry already present, skipping")
|
||||
return 0
|
||||
# Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed
|
||||
# summary/maturity should UPDATE the existing entry, not add a duplicate line.
|
||||
new_m = LINK_RE.match(args.entry)
|
||||
new_link = new_m.group(1) if new_m else None
|
||||
|
||||
if new_link is not None:
|
||||
replaced = False
|
||||
for idx, ln in enumerate(entries):
|
||||
m = LINK_RE.match(ln)
|
||||
if m and m.group(1) == new_link:
|
||||
if ln == args.entry:
|
||||
print("index-append: entry already present, skipping")
|
||||
return 0
|
||||
entries[idx] = args.entry # same page, refreshed text
|
||||
replaced = True
|
||||
break
|
||||
if not replaced:
|
||||
entries.append(args.entry)
|
||||
else:
|
||||
# No parseable wikilink — fall back to exact-line dedup.
|
||||
if args.entry in entries:
|
||||
print("index-append: entry already present, skipping")
|
||||
return 0
|
||||
entries.append(args.entry)
|
||||
|
||||
entries.append(args.entry)
|
||||
entries.sort(key=str.casefold)
|
||||
|
||||
# Normalise intro: drop trailing blanks, keep header + comment(s)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue