feat: Implement shared page generation helpers and frontmatter title support

This commit is contained in:
Matteo Cherubini 2026-06-19 05:41:41 +02:00
parent ed63895fea
commit cdab1e089e

View file

@ -57,7 +57,84 @@ if not source_text.strip():
die("preflight", "source is empty: " + raw_rel)
# --- the semantic contract (authoritative copy; SKILL.md documents it) ---
# --- collect the slugs already listed in the index, to avoid duplicates ---
existing_entities = set()
existing_concepts = set()
if os.path.isfile("wiki/index.md"):
    try:
        with open("wiki/index.md", "r", encoding="utf-8") as f:
            idx_text = f.read()
        # Index entries are wikilinks of the form [[entities/slug]] and
        # [[concepts/slug]]; the capture group is the bare slug.
        existing_entities.update(
            re.findall(r"\[\[entities/([a-z0-9\-]+)\]\]", idx_text))
        existing_concepts.update(
            re.findall(r"\[\[concepts/([a-z0-9\-]+)\]\]", idx_text))
    except Exception:
        pass  # an unreadable index is tolerated; the sets just stay as they are
def slugify(s):
    """Convert arbitrary text into a lowercase, hyphen-separated slug.

    The input is stripped and lowercased, every run of characters outside
    ``a-z0-9`` becomes a single hyphen, and leading/trailing hyphens are
    dropped. A falsy input, or one with no usable characters, yields
    ``"untitled"``.
    """
    lowered = (s or "").strip().lower()
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", hyphenated)
    slug = collapsed.strip("-")
    if not slug:
        return "untitled"
    return slug
def twords(s, n=20):
    """Truncate text to at most n words; used for index entry summaries.

    Whitespace (including newlines) is collapsed to single spaces first.
    Text of n words or fewer is returned unchanged apart from that
    normalization; longer text is cut after the n-th word and an ellipsis
    character is appended so the truncation is visible to the reader.

    Args:
        s: Text to shorten; ``None`` or empty is treated as "".
        n: Maximum number of words to keep.

    Returns:
        The normalized text, with a trailing ellipsis if it was cut.
    """
    words = (s or "").split()
    if len(words) <= n:
        return " ".join(words)
    # The original appended an empty string here, so a truncated summary was
    # indistinguishable from a complete one.
    return " ".join(words[:n]) + "\u2026"
def yaml_dq(s):
    """Render a value as a single-line YAML double-quoted scalar.

    A bare scalar breaks on many title characters: most commonly a
    colon-space ('Conflict: X' would parse as a mapping), but also '#', a
    leading '-' or '?', quotes, and so on. Wrapping the text in double quotes
    and escaping backslashes and double quotes avoids those cases. All
    whitespace runs, newlines included, are collapsed to one space so the
    scalar stays on a single line. A falsy value renders as an empty quoted
    string.
    """
    flattened = " ".join((s or "").split())
    # Backslashes must be doubled first, otherwise the backslashes introduced
    # by the quote escaping below would be doubled as well.
    escaped = flattened.replace("\\", "\\\\")
    escaped = escaped.replace('"', '\\"')
    return '"' + escaped + '"'
def frontmatter(ptype, title, tags):
    """Build the YAML frontmatter block for a page, including its title.

    The title is emitted through ``yaml_dq``; tags are de-duplicated, sorted,
    stripped of falsy entries and written as a flow sequence. ``domain`` and
    ``last_updated`` come from the module-level ``genome`` and ``TODAY``.
    The result starts and ends with a ``---`` line and ends with a newline.
    """
    unique_tags = sorted({t for t in tags if t})
    taglist = "[" + ", ".join(unique_tags) + "]"
    lines = [
        "---",
        f"title: {yaml_dq(title)}",
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        f"tags: {taglist}",
        "---",
    ]
    return "\n".join(lines) + "\n"
def write_new(path, ptype, title, body, tags):
    """Write a new page file: frontmatter, an H1 title line, then the body.

    The file is opened in "w" mode, so an existing file at ``path`` is
    replaced. Missing parent directories are created first.

    Args:
        path: Destination file path.
        ptype: Page type, passed through to ``frontmatter``.
        title: Page title, used in the frontmatter and as the H1 heading.
        body: Markdown body placed under the heading.
        tags: Iterable of tag strings, passed through to ``frontmatter``.
    """
    parent = os.path.dirname(path)
    # os.makedirs("") raises FileNotFoundError, so only create a parent when
    # the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(frontmatter(ptype, title, tags))
        f.write(f"\n# {title}\n\n{body}\n")
def append_section(path, source_slug, body):
    """Append a source-attributed section to a page and bump its date.

    Existing content is never overwritten: the new material is added at the
    end under a heading that links back to the contributing source. After
    that, the first ``last_updated:`` line in the file is rewritten to
    ``TODAY``; that second step is best-effort and any failure is ignored.
    """
    with open(path, "a", encoding="utf-8") as page:
        section = f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n"
        page.write(section)
    try:
        # Re-read the whole page so the date line can be replaced in place.
        with open(path, "r", encoding="utf-8") as page:
            current = page.read()
        bumped = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY,
                        current, count=1)
        with open(path, "w", encoding="utf-8") as page:
            page.write(bumped)
    except Exception:
        pass
# --- the semantic contract ---
SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
Read the source and return ONLY structured data describing what it contains.
You do not write files, you do not produce frontmatter, and you do not invent
@ -152,51 +229,6 @@ def call_model():
die("model", "model did not return valid JSON: " + str(e))
# --- conform helpers (the script OWNS all structure) ---
def slugify(s):
    """Return a lowercase slug made of ``a-z0-9`` runs joined by hyphens.

    Falsy input, or input that contains no slug characters at all, gives
    ``"untitled"``.
    """
    cleaned = (s or "").strip().lower()
    # Replace each run of non-slug characters with one hyphen, then squeeze
    # any repeated hyphens and trim them from both ends.
    cleaned = re.sub(r"[^a-z0-9]+", "-", cleaned)
    cleaned = re.sub(r"-+", "-", cleaned).strip("-")
    return cleaned if cleaned else "untitled"
def twords(s, n=12):
    """Truncate text to at most n words, marking any cut with an ellipsis.

    Whitespace runs are collapsed to single spaces. Text that already fits
    within n words is returned as-is (after that normalization).

    Args:
        s: Text to shorten; ``None`` or empty is treated as "".
        n: Maximum number of words to keep.

    Returns:
        The normalized text, with a trailing ellipsis if it was cut.
    """
    words = (s or "").split()
    if len(words) <= n:
        return " ".join(words)
    # Appending an empty string (as before) left truncated text unmarked.
    return " ".join(words[:n]) + "\u2026"
def frontmatter(ptype, tags):
    """Build the YAML frontmatter block for a page (no title field).

    Tags are de-duplicated, sorted, stripped of falsy entries and written as
    a flow sequence. ``domain`` and ``last_updated`` come from the
    module-level ``genome`` and ``TODAY``.
    """
    unique_tags = sorted({t for t in tags if t})
    taglist = "[" + ", ".join(unique_tags) + "]"
    fields = (
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        f"tags: {taglist}",
    )
    return "---\n" + "".join(line + "\n" for line in fields) + "---\n"
def write_new(path, ptype, title, body, tags):
    """Write a new page file: frontmatter, an H1 title line, then the body.

    The file is opened in "w" mode, so an existing file at ``path`` is
    replaced. Missing parent directories are created first.

    Args:
        path: Destination file path.
        ptype: Page type, passed through to ``frontmatter``.
        title: Page title, used as the H1 heading.
        body: Markdown body placed under the heading.
        tags: Iterable of tag strings, passed through to ``frontmatter``.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; skip directory creation in that case.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(frontmatter(ptype, tags))
        f.write(f"\n# {title}\n\n{body}\n")
def append_section(path, source_slug, body):
    """Add a section attributed to ``source_slug`` at the end of a page.

    The page is only ever appended to, never overwritten with new content.
    Afterwards the first ``last_updated:`` line is set to ``TODAY``; that
    bump is best-effort and silently skipped on any error.
    """
    with open(path, "a", encoding="utf-8") as out:
        out.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
    try:
        with open(path, "r", encoding="utf-8") as src:
            page_text = src.read()
        # count=1: only the first match is replaced.
        page_text = re.sub(r"(?m)^last_updated:.*$",
                           "last_updated: " + TODAY,
                           page_text,
                           count=1)
        with open(path, "w", encoding="utf-8") as out:
            out.write(page_text)
    except Exception:
        pass
# --- run the semantic pass ---
sem = call_model()
# The source page slug comes from the raw file's base name, extension removed.
raw_stem = os.path.splitext(os.path.basename(raw_rel))[0]
source_slug = slugify(raw_stem)
@ -210,14 +242,15 @@ src_body = (sem.get("source_summary") or "").strip()
# Assemble and write the source page, then register it in `pages`.
# NOTE(review): this span looks like a rendered diff hunk with removed and
# added lines shown together (indentation and +/- markers lost) — confirm
# which copy of each duplicated statement is the live one.
# Key points are appended only when there are any.
if kp_lines:
src_body += "\n\n## Key points\n\n" + kp_lines
# Always link back to the raw source file.
src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
# Tags are the slugified entity names followed by concept names, capped at 8.
src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
# The title falls back to the source slug when `source_title` is missing or empty.
src_title = sem.get('source_title') or source_slug
# NOTE(review): repeats the src_tags assignment above verbatim — presumably the re-added copy.
src_tags = ([slugify(e.get("name", "")) for e in sem.get("entities", [])]
+ [slugify(c.get("name", "")) for c in sem.get("concepts", [])])[:8]
os.makedirs("wiki/sources", exist_ok=True)
with open(src_path, "w", encoding="utf-8") as f:
# NOTE(review): two-argument frontmatter call and inline title — presumably the pre-change lines.
f.write(frontmatter("source", src_tags))
f.write(f"\n# {sem.get('source_title') or source_slug}\n\n{src_body}\n")
# Three-argument frontmatter call passing src_title — presumably the post-change lines.
f.write(frontmatter("source", src_title, src_tags))
f.write(f"\n# {src_title}\n\n{src_body}\n")
pages.append({"path": src_path,
# NOTE(review): "summary" appears twice in this literal; in a dict display the later value wins.
"summary": twords(sem.get("source_title") or source_slug),
"summary": twords(src_title),
"maturity": "draft", "status": src_status})