feat: Implement shared page generation helpers and frontmatter title support

2026-06-19 05:41:41 +02:00 · 2026-06-19 05:41:41 +02:00 · cdab1e089e
commit cdab1e089e
parent ed63895fea
1 changed files with 84 additions and 51 deletions
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@ -57,7 +57,84 @@ if not source_text.strip():
    die("preflight", "source is empty: " + raw_rel)
-# --- the semantic contract (authoritative copy; SKILL.md documents it) ---
+# --- read existing index to avoid duplicate slugs ---
 # Collect the slugs already linked from the wiki index so that duplicate
 # entity/concept slugs can be avoided.
 existing_entities = set()
 existing_concepts = set()
 try:
    with open("wiki/index.md", "r", encoding="utf-8") as f:
        idx_text = f.read()
 except (OSError, UnicodeError):
    # Best-effort: a missing or unreadable index simply means no known slugs.
    # Only I/O and decoding failures are tolerated; anything else is a bug
    # and should surface.
    idx_text = ""
 # extract slugs from [[entities/slug]] and [[concepts/slug]] patterns
 existing_entities.update(re.findall(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text))
 existing_concepts.update(re.findall(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text))
 def slugify(s):
    """Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Each run of characters outside a-z / 0-9 becomes one hyphen, and leading
    and trailing hyphens are removed. None, empty input, or input with no
    usable characters yields "untitled".
    """
    lowered = (s or "").strip().lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    slug = re.sub(r"-+", "-", hyphenated).strip("-")
    if not slug:
        return "untitled"
    return slug
 def twords(s, n=20):
    """Collapse whitespace and keep at most n words.

    When the text has more than n words, the first n are returned followed
    by an ellipsis character; otherwise the whitespace-normalized text is
    returned unchanged. None is treated as an empty string.
    """
    normalized = " ".join((s or "").split())
    words = normalized.split(" ")
    if len(words) > n:
        return " ".join(words[:n]) + "…"
    return normalized
 def yaml_dq(s):
    """Render a value as a single-line YAML double-quoted scalar.

    Titles can contain characters that break a bare scalar — most commonly a
    colon-space ('Conflict: X' would parse as a mapping), but also '#', leading
    '-'/'?', quotes, etc. Double-quoting and escaping '\\' and '"' keeps such
    titles valid YAML. Whitespace runs (newlines included) are collapsed to
    single spaces so the scalar stays on one line.

    Control characters that survive the whitespace collapse (C0, DEL, C1) may
    not appear literally in a YAML stream, so they are written as \\xNN
    escapes.

    Args:
        s: text to quote; None or an empty string produces '""'.

    Returns:
        The quoted scalar, including the surrounding double quotes.
    """
    s = " ".join((s or "").split())
    s = s.replace("\\", "\\\\").replace('"', '\\"')
    # Done after the backslash pass so the backslashes introduced here are
    # not doubled.
    s = "".join(
        f"\\x{ord(ch):02x}" if (ord(ch) < 0x20 or 0x7F <= ord(ch) <= 0x9F) else ch
        for ch in s
    )
    return f'"{s}"'
 def frontmatter(ptype, title, tags):
    """Build the YAML frontmatter block for a page, including its title.

    The title is quoted via yaml_dq. Falsy tags are dropped and the rest are
    de-duplicated, sorted and rendered as a flow sequence. `domain` and
    `last_updated` come from the module-level `genome` and `TODAY`; maturity
    is always "draft" and private is always false.

    Returns the block as a string, from the opening '---' line through the
    closing '---' line and its newline.
    """
    unique_tags = sorted({t for t in tags if t})
    taglist = "[" + ", ".join(unique_tags) + "]"
    fields = [
        "---",
        f"title: {yaml_dq(title)}",
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        f"tags: {taglist}",
        "---",
    ]
    return "\n".join(fields) + "\n"
 def write_new(path, ptype, title, body, tags):
    """Write a page file: frontmatter, an H1 with the title, then the body.

    Args:
        path: destination file; missing parent directories are created.
        ptype: page type placed in the frontmatter.
        title: page title, used in the frontmatter and as the H1 heading.
        body: markdown body written after the heading.
        tags: iterable of tag strings passed to frontmatter().

    An existing file at `path` is overwritten (it is opened in "w" mode).
    """
    # Build the whole page first so a failure in frontmatter() cannot leave a
    # truncated, empty file behind.
    content = frontmatter(ptype, title, tags) + f"\n# {title}\n\n{body}\n"
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises, so skip it for a bare filename
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
 def append_section(path, source_slug, body):
    """Append a source-attributed section to a page and bump its date.

    Args:
        path: page file to extend (opened in append mode).
        source_slug: slug of the source page, linked in the section heading.
        body: markdown text placed under the new heading.

    After appending, the first line starting with "last_updated:" is
    rewritten to the module-level TODAY. That bump is best-effort: I/O or
    decoding failures while re-reading or rewriting the file are ignored.
    """
    # never overwrite an existing page: accumulate, attributed to the new source
    with open(path, "a", encoding="utf-8") as f:
        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
    try:  # best-effort bump of last_updated in the existing frontmatter
        with open(path, "r", encoding="utf-8") as f:
            txt = f.read()
        # A callable replacement keeps TODAY literal (no regex escape
        # processing); the f-string also accepts a non-str TODAY.
        bumped = re.sub(r"(?m)^last_updated:.*$",
                        lambda _m: f"last_updated: {TODAY}", txt, count=1)
        if bumped != txt:  # skip the rewrite when nothing changed
            with open(path, "w", encoding="utf-8") as f:
                f.write(bumped)
    except (OSError, UnicodeError):
        pass
 # --- the semantic contract ---
 SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
 Read the source and return ONLY structured data describing what it contains.
 You do not write files, you do not produce frontmatter, and you do not invent
@ -152,51 +229,6 @@ def call_model():
        die("model", "model did not return valid JSON: " + str(e))
 # --- conform helpers (the script OWNS all structure) ---
 def slugify(s):
    """Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Each run of characters outside a-z / 0-9 becomes one hyphen, and leading
    and trailing hyphens are removed. None, empty input, or input with no
    usable characters yields "untitled".
    """
    cleaned = (s or "").strip().lower()
    dashed = re.sub(r"[^a-z0-9]+", "-", cleaned)
    result = re.sub(r"-+", "-", dashed).strip("-")
    return result if result else "untitled"
 def twords(s, n=12):
    """Collapse whitespace and keep at most n words.

    When the text has more than n words, the first n are returned followed
    by an ellipsis character; otherwise the whitespace-normalized text is
    returned unchanged. None is treated as an empty string.
    """
    flat = " ".join((s or "").split())
    parts = flat.split(" ")
    if len(parts) > n:
        return " ".join(parts[:n]) + "…"
    return flat
 def frontmatter(ptype, tags):
    """Build the YAML frontmatter block for a page.

    Falsy tags are dropped and the rest are de-duplicated, sorted and
    rendered as a flow sequence. `domain` and `last_updated` come from the
    module-level `genome` and `TODAY`; maturity is always "draft" and private
    is always false.

    Returns the block as a string, from the opening '---' line through the
    closing '---' line and its newline.
    """
    unique_tags = sorted({t for t in tags if t})
    taglist = "[" + ", ".join(unique_tags) + "]"
    fields = [
        "---",
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        f"tags: {taglist}",
        "---",
    ]
    return "\n".join(fields) + "\n"
 def write_new(path, ptype, title, body, tags):
    """Write a page file: frontmatter, an H1 with the title, then the body.

    Args:
        path: destination file; missing parent directories are created.
        ptype: page type placed in the frontmatter.
        title: page title, used as the H1 heading.
        body: markdown body written after the heading.
        tags: iterable of tag strings passed to frontmatter().

    An existing file at `path` is overwritten (it is opened in "w" mode).
    """
    # Build the whole page first so a failure in frontmatter() cannot leave a
    # truncated, empty file behind.
    content = frontmatter(ptype, tags) + f"\n# {title}\n\n{body}\n"
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises, so skip it for a bare filename
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
 def append_section(path, source_slug, body):
    """Append a source-attributed section to a page and bump its date.

    Args:
        path: page file to extend (opened in append mode).
        source_slug: slug of the source page, linked in the section heading.
        body: markdown text placed under the new heading.

    After appending, the first line starting with "last_updated:" is
    rewritten to the module-level TODAY. That bump is best-effort: I/O or
    decoding failures while re-reading or rewriting the file are ignored.
    """
    # never overwrite an existing page: accumulate, attributed to the new source
    with open(path, "a", encoding="utf-8") as f:
        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
    try:  # best-effort bump of last_updated in the existing frontmatter
        with open(path, "r", encoding="utf-8") as f:
            txt = f.read()
        # A callable replacement keeps TODAY literal (no regex escape
        # processing); the f-string also accepts a non-str TODAY.
        bumped = re.sub(r"(?m)^last_updated:.*$",
                        lambda _m: f"last_updated: {TODAY}", txt, count=1)
        if bumped != txt:  # skip the rewrite when nothing changed
            with open(path, "w", encoding="utf-8") as f:
                f.write(bumped)
    except (OSError, UnicodeError):
        pass
 # --- run the semantic pass ---
 sem = call_model()
 source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
@ -210,14 +242,15 @@ src_body   = (sem.get("source_summary") or "").strip()
 if kp_lines:
    src_body += "\n\n## Key points\n\n" + kp_lines
 src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
-src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+src_title  = sem.get('source_title') or source_slug
-            + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
+src_tags   = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
              + [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
 os.makedirs("wiki/sources", exist_ok=True)
 with open(src_path, "w", encoding="utf-8") as f:
-    f.write(frontmatter("source", src_tags))
+    f.write(frontmatter("source", src_title, src_tags))
-    f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n")
+    f.write(f"\n# {src_title}\n\n{src_body}\n")
 pages.append({"path": src_path,
-              "summary": twords(sem.get("source_title") or source_slug),
+              "summary": twords(src_title),
              "maturity": "draft", "status": src_status})