feat: Implement shared page generation helpers and frontmatter title support

2026-06-19 05:41:41 +02:00 · 2026-06-19 05:41:41 +02:00 · cdab1e089e
commit cdab1e089e
parent ed63895fea
1 changed file with 84 additions and 51 deletions
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@@ -57,7 +57,84 @@ if not source_text.strip():
    die("preflight", "source is empty: " + raw_rel)


-# --- the semantic contract (authoritative copy; SKILL.md documents it) ---
+# --- read existing index to avoid duplicate slugs ---
+existing_entities = set()
+existing_concepts = set()
+if os.path.isfile("wiki/index.md"):
+    try:
+        with open("wiki/index.md", "r", encoding="utf-8") as f:
+            idx_text = f.read()
+        # extract slugs from [[entities/slug]] and [[concepts/slug]] patterns
+        for m in re.finditer(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text):
+            existing_entities.add(m.group(1))
+        for m in re.finditer(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text):
+            existing_concepts.add(m.group(1))
+    except Exception:
+        pass  # index not readable or not found; that's OK
+
+
+def slugify(s):
+    s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
+    return re.sub(r"-+", "-", s).strip("-") or "untitled"
+
+
+def twords(s, n=20):
+    """Truncate at n words; used for index entry summaries."""
+    s = " ".join((s or "").split())
+    w = s.split(" ")
+    return s if len(w) <= n else " ".join(w[:n]) + "…"
+
+
+def yaml_dq(s):
+    """Render a value as a YAML double-quoted scalar.
+
+    Titles can contain characters that break a bare scalar — most commonly a
+    colon-space ('Conflict: X' would parse as a mapping), but also '#', leading
+    '-'/'?', quotes, etc. Double-quoting and escaping '\\' and '"' makes any
+    title valid YAML (and keeps Obsidian/Dataview/qmd happy). Newlines are
+    collapsed to spaces so the scalar stays on one line.
+    """
+    s = " ".join((s or "").split())
+    s = s.replace("\\", "\\\\").replace('"', '\\"')
+    return f'"{s}"'
+
+
+def frontmatter(ptype, title, tags):
+    """Return YAML frontmatter with title field."""
+    taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]"
+    return ("---\n"
+            f"title: {yaml_dq(title)}\n"
+            f"type: {ptype}\n"
+            f"domain: {genome}\n"
+            "maturity: draft\n"
+            f"last_updated: {TODAY}\n"
+            "private: false\n"
+            f"tags: {taglist}\n"
+            "---\n")
+
+
+def write_new(path, ptype, title, body, tags):
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(frontmatter(ptype, title, tags))
+        f.write(f"\n# {title}\n\n{body}\n")
+
+
+def append_section(path, source_slug, body):
+    # never overwrite an existing page: accumulate, attributed to the new source
+    with open(path, "a", encoding="utf-8") as f:
+        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
+    try:  # best-effort bump of last_updated in the existing frontmatter
+        with open(path, "r", encoding="utf-8") as f:
+            txt = f.read()
+        txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1)
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(txt)
+    except Exception:
+        pass
+
+
+# --- the semantic contract ---
 SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
 Read the source and return ONLY structured data describing what it contains.
 You do not write files, you do not produce frontmatter, and you do not invent
@@ -152,51 +229,6 @@ def call_model():
        die("model", "model did not return valid JSON: " + str(e))


-# --- conform helpers (the script OWNS all structure) ---
-def slugify(s):
-    s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
-    return re.sub(r"-+", "-", s).strip("-") or "untitled"
-
-
-def twords(s, n=12):
-    s = " ".join((s or "").split())
-    w = s.split(" ")
-    return s if len(w) <= n else " ".join(w[:n]) + "…"
-
-
-def frontmatter(ptype, tags):
-    taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]"
-    return ("---\n"
-            f"type: {ptype}\n"
-            f"domain: {genome}\n"
-            "maturity: draft\n"
-            f"last_updated: {TODAY}\n"
-            "private: false\n"
-            f"tags: {taglist}\n"
-            "---\n")
-
-
-def write_new(path, ptype, title, body, tags):
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    with open(path, "w", encoding="utf-8") as f:
-        f.write(frontmatter(ptype, tags))
-        f.write(f"\n# {title}\n\n{body}\n")
-
-
-def append_section(path, source_slug, body):
-    # never overwrite an existing page: accumulate, attributed to the new source
-    with open(path, "a", encoding="utf-8") as f:
-        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
-    try:  # best-effort bump of last_updated in the existing frontmatter
-        with open(path, "r", encoding="utf-8") as f:
-            txt = f.read()
-        txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1)
-        with open(path, "w", encoding="utf-8") as f:
-            f.write(txt)
-    except Exception:
-        pass
-
-
 # --- run the semantic pass ---
 sem = call_model()
 source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
@@ -210,14 +242,15 @@ src_body   = (sem.get("source_summary") or "").strip()
 if kp_lines:
    src_body += "\n\n## Key points\n\n" + kp_lines
 src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
-src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
-            + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
+src_title  = sem.get('source_title') or source_slug
+src_tags   = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+              + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
 os.makedirs("wiki/sources", exist_ok=True)
 with open(src_path, "w", encoding="utf-8") as f:
-    f.write(frontmatter("source", src_tags))
-    f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n")
+    f.write(frontmatter("source", src_title, src_tags))
+    f.write(f"\n# {src_title}\n\n{src_body}\n")
 pages.append({"path": src_path,
-              "summary": twords(sem.get("source_title") or source_slug),
+              "summary": twords(src_title),
              "maturity": "draft", "status": src_status})