feat(ingest): Implement wikilink-based deduplication for index entries
This commit is contained in:
parent
203fbadd63
commit
e8980b5526
1 changed file with 25 additions and 4 deletions
|
|
@@ -18,6 +18,7 @@ import re
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
ENTRY_RE = re.compile(r"^- \[\[")
|
ENTRY_RE = re.compile(r"^- \[\[")
|
||||||
|
LINK_RE = re.compile(r"^- \[\[([^\]]+)\]\]")
|
||||||
HEADER_RE = re.compile(r"^## ")
|
HEADER_RE = re.compile(r"^## ")
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -71,11 +72,31 @@ def main() -> int:
|
||||||
intro = [ln for ln in body if not ENTRY_RE.match(ln)]
|
intro = [ln for ln in body if not ENTRY_RE.match(ln)]
|
||||||
entries = [ln for ln in body if ENTRY_RE.match(ln)]
|
entries = [ln for ln in body if ENTRY_RE.match(ln)]
|
||||||
|
|
||||||
if args.entry in entries:
|
# Deduplicate by wikilink PATH, not by exact line: a re-ingest with a changed
|
||||||
print(f"index-append: entry already present, skipping")
|
# summary/maturity should UPDATE the existing entry, not add a duplicate line.
|
||||||
return 0
|
new_m = LINK_RE.match(args.entry)
|
||||||
|
new_link = new_m.group(1) if new_m else None
|
||||||
|
|
||||||
|
if new_link is not None:
|
||||||
|
replaced = False
|
||||||
|
for idx, ln in enumerate(entries):
|
||||||
|
m = LINK_RE.match(ln)
|
||||||
|
if m and m.group(1) == new_link:
|
||||||
|
if ln == args.entry:
|
||||||
|
print("index-append: entry already present, skipping")
|
||||||
|
return 0
|
||||||
|
entries[idx] = args.entry # same page, refreshed text
|
||||||
|
replaced = True
|
||||||
|
break
|
||||||
|
if not replaced:
|
||||||
entries.append(args.entry)
|
entries.append(args.entry)
|
||||||
|
else:
|
||||||
|
# No parseable wikilink — fall back to exact-line dedup.
|
||||||
|
if args.entry in entries:
|
||||||
|
print("index-append: entry already present, skipping")
|
||||||
|
return 0
|
||||||
|
entries.append(args.entry)
|
||||||
|
|
||||||
entries.sort(key=str.casefold)
|
entries.sort(key=str.casefold)
|
||||||
|
|
||||||
# Normalise intro: drop trailing blanks, keep header + comment(s)
|
# Normalise intro: drop trailing blanks, keep header + comment(s)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue