feat: Implement shared page generation helpers and frontmatter title support

This commit is contained in:
Matteo Cherubini 2026-06-19 05:41:41 +02:00
parent ed63895fea
commit cdab1e089e

View file

@ -57,7 +57,84 @@ if not source_text.strip():
die("preflight", "source is empty: " + raw_rel) die("preflight", "source is empty: " + raw_rel)
# --- the semantic contract (authoritative copy; SKILL.md documents it) --- # --- read existing index to avoid duplicate slugs ---
existing_entities = set()
existing_concepts = set()
if os.path.isfile("wiki/index.md"):
try:
with open("wiki/index.md", "r", encoding="utf-8") as f:
idx_text = f.read()
# extract slugs from [[entities/slug]] and [[concepts/slug]] patterns
for m in re.finditer(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text):
existing_entities.add(m.group(1))
for m in re.finditer(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text):
existing_concepts.add(m.group(1))
except Exception:
pass # index not readable or not found; that's OK
def slugify(s):
s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
return re.sub(r"-+", "-", s).strip("-") or "untitled"
def twords(s, n=20):
"""Truncate at n words; used for index entry summaries."""
s = " ".join((s or "").split())
w = s.split(" ")
return s if len(w) <= n else " ".join(w[:n]) + ""
def yaml_dq(s):
"""Render a value as a YAML double-quoted scalar.
Titles can contain characters that break a bare scalar most commonly a
colon-space ('Conflict: X' would parse as a mapping), but also '#', leading
'-'/'?', quotes, etc. Double-quoting and escaping '\\' and '"' makes any
title valid YAML (and keeps Obsidian/Dataview/qmd happy). Newlines are
collapsed to spaces so the scalar stays on one line.
"""
s = " ".join((s or "").split())
s = s.replace("\\", "\\\\").replace('"', '\\"')
return f'"{s}"'
def frontmatter(ptype, title, tags):
"""Return YAML frontmatter with title field."""
taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]"
return ("---\n"
f"title: {yaml_dq(title)}\n"
f"type: {ptype}\n"
f"domain: {genome}\n"
"maturity: draft\n"
f"last_updated: {TODAY}\n"
"private: false\n"
f"tags: {taglist}\n"
"---\n")
def write_new(path, ptype, title, body, tags):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
f.write(frontmatter(ptype, title, tags))
f.write(f"\n# {title}\n\n{body}\n")
def append_section(path, source_slug, body):
# never overwrite an existing page: accumulate, attributed to the new source
with open(path, "a", encoding="utf-8") as f:
f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
try: # best-effort bump of last_updated in the existing frontmatter
with open(path, "r", encoding="utf-8") as f:
txt = f.read()
txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1)
with open(path, "w", encoding="utf-8") as f:
f.write(txt)
except Exception:
pass
# --- the semantic contract ---
SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki. SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
Read the source and return ONLY structured data describing what it contains. Read the source and return ONLY structured data describing what it contains.
You do not write files, you do not produce frontmatter, and you do not invent You do not write files, you do not produce frontmatter, and you do not invent
@ -152,51 +229,6 @@ def call_model():
die("model", "model did not return valid JSON: " + str(e)) die("model", "model did not return valid JSON: " + str(e))
# --- conform helpers (the script OWNS all structure) ---
def slugify(s):
s = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
return re.sub(r"-+", "-", s).strip("-") or "untitled"
def twords(s, n=12):
s = " ".join((s or "").split())
w = s.split(" ")
return s if len(w) <= n else " ".join(w[:n]) + ""
def frontmatter(ptype, tags):
taglist = "[" + ", ".join(sorted(set(t for t in tags if t))) + "]"
return ("---\n"
f"type: {ptype}\n"
f"domain: {genome}\n"
"maturity: draft\n"
f"last_updated: {TODAY}\n"
"private: false\n"
f"tags: {taglist}\n"
"---\n")
def write_new(path, ptype, title, body, tags):
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
f.write(frontmatter(ptype, tags))
f.write(f"\n# {title}\n\n{body}\n")
def append_section(path, source_slug, body):
# never overwrite an existing page: accumulate, attributed to the new source
with open(path, "a", encoding="utf-8") as f:
f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
try: # best-effort bump of last_updated in the existing frontmatter
with open(path, "r", encoding="utf-8") as f:
txt = f.read()
txt = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY, txt, count=1)
with open(path, "w", encoding="utf-8") as f:
f.write(txt)
except Exception:
pass
# --- run the semantic pass --- # --- run the semantic pass ---
sem = call_model() sem = call_model()
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0]) source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
@ -210,14 +242,15 @@ src_body = (sem.get("source_summary") or "").strip()
if kp_lines: if kp_lines:
src_body += "\n\n## Key points\n\n" + kp_lines src_body += "\n\n## Key points\n\n" + kp_lines
src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n" src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])] src_title = sem.get('source_title') or source_slug
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8] src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
os.makedirs("wiki/sources", exist_ok=True) os.makedirs("wiki/sources", exist_ok=True)
with open(src_path, "w", encoding="utf-8") as f: with open(src_path, "w", encoding="utf-8") as f:
f.write(frontmatter("source", src_tags)) f.write(frontmatter("source", src_title, src_tags))
f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n") f.write(f"\n# {src_title}\n\n{src_body}\n")
pages.append({"path": src_path, pages.append({"path": src_path,
"summary": twords(sem.get("source_title") or source_slug), "summary": twords(src_title),
"maturity": "draft", "status": src_status}) "maturity": "draft", "status": src_status})