diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py index 337d9f2..dd974cd 100644 --- a/skills/ingest/scripts/ingest-semantic.py +++ b/skills/ingest/scripts/ingest-semantic.py @@ -57,7 +57,84 @@ if not source_text.strip(): die("preflight", "source is empty: " + raw_rel) -# --- the semantic contract (authoritative copy; SKILL.md documents it) --- +# --- read existing index to avoid duplicate slugs --- +existing_entities = set() +existing_concepts = set() +if os.path.isfile("wiki/index.md"): + try: + with open("wiki/index.md", "r", encoding="utf-8") as f: + idx_text = f.read() + # extract slugs from [[entities/slug]] and [[concepts/slug]] patterns + for m in re.finditer(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text): + existing_entities.add(m.group(1)) + for m in re.finditer(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text): + existing_concepts.add(m.group(1)) + except Exception: + pass # index not readable or not found; that's OK + + +def slugify(s): + s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower()) + return re.sub(r"-+", "-", s).strip("-") or "untitled" + + +def twords(s, n=20): + """Truncate at n words; used for index entry summaries.""" + s = " ".join((s or "").split()) + w = s.split(" ") + return s if len(w) <= n else " ".join(w[:n]) + "…" + + +def yaml_dq(s): + """Render a value as a YAML double-quoted scalar. + + Titles can contain characters that break a bare scalar — most commonly a + colon-space ('Conflict: X' would parse as a mapping), but also '#', leading + '-'/'?', quotes, etc. Double-quoting and escaping '\\' and '"' makes any + title valid YAML (and keeps Obsidian/Dataview/qmd happy). Newlines are + collapsed to spaces so the scalar stays on one line. + """ + s = " ".join((s or "").split()) + s = s.replace("\\", "\\\\").replace('"', '\\"') + return f'"{s}"' + + +def frontmatter(ptype, title, tags): + """Return YAML frontmatter with title field.""" + taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]" + return ("---\n" + f"title: {yaml_dq(title)}\n" + f"type: {ptype}\n" + f"domain: {genome}\n" + "maturity: draft\n" + f"last_updated: {TODAY}\n" + "private: false\n" + f"tags: {taglist}\n" + "---\n") + + +def write_new(path, ptype, title, body, tags): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.write(frontmatter(ptype, title, tags)) + f.write(f"\n# {title}\n\n{body}\n") + + +def append_section(path, source_slug, body): + # never overwrite an existing page: accumulate, attributed to the new source + with open(path, "a", encoding="utf-8") as f: + f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n") + try: # best-effort bump of last_updated in the existing frontmatter + with open(path, "r", encoding="utf-8") as f: + txt = f.read() + txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1) + with open(path, "w", encoding="utf-8") as f: + f.write(txt) + except Exception: + pass + + +# --- the semantic contract --- SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki. Read the source and return ONLY structured data describing what it contains. You do not write files, you do not produce frontmatter, and you do not invent @@ -152,51 +229,6 @@ def call_model(): die("model", "model did not return valid JSON: " + str(e)) -# --- conform helpers (the script OWNS all structure) --- -def slugify(s): - s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower()) - return re.sub(r"-+", "-", s).strip("-") or "untitled" - - -def twords(s, n=12): - s = " ".join((s or "").split()) - w = s.split(" ") - return s if len(w) <= n else " ".join(w[:n]) + "…" - - -def frontmatter(ptype, tags): - taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]" - return ("---\n" - f"type: {ptype}\n" - f"domain: {genome}\n" - "maturity: draft\n" - f"last_updated: {TODAY}\n" - "private: false\n" - f"tags: {taglist}\n" - "---\n") - - -def write_new(path, ptype, title, body, tags): - os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, "w", encoding="utf-8") as f: - f.write(frontmatter(ptype, tags)) - f.write(f"\n# {title}\n\n{body}\n") - - -def append_section(path, source_slug, body): - # never overwrite an existing page: accumulate, attributed to the new source - with open(path, "a", encoding="utf-8") as f: - f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n") - try: # best-effort bump of last_updated in the existing frontmatter - with open(path, "r", encoding="utf-8") as f: - txt = f.read() - txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1) - with open(path, "w", encoding="utf-8") as f: - f.write(txt) - except Exception: - pass - - # --- run the semantic pass --- sem = call_model() source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0]) @@ -210,14 +242,15 @@ src_body = (sem.get("source_summary") or "").strip() if kp_lines: src_body += "\n\n## Key points\n\n" + kp_lines src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n" -src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])] - + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8] +src_title = sem.get('source_title') or source_slug +src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])] + + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8] os.makedirs("wiki/sources", exist_ok=True) with open(src_path, "w", encoding="utf-8") as f: - f.write(frontmatter("source", src_tags)) - f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n") + f.write(frontmatter("source", src_title, src_tags)) + f.write(f"\n# {src_title}\n\n{src_body}\n") pages.append({"path": src_path, - "summary": twords(sem.get("source_title") or source_slug), + "summary": twords(src_title), "maturity": "draft", "status": src_status})