knowledge-genome-orchestrator/skills/ingest/scripts/ingest-semantic.py

#!/usr/bin/env python3
# =============================================================================
# skills/ingest/scripts/ingest-semantic.py
# Phase 1 (semantic) of the Knowledge Genome ingest — light agent + deterministic conform.
#
# - FIXED: Add 'title:' field to frontmatter (lint was complaining about missing title)
# - NEW: Inject existing index (entity/concept names) into prompt to prevent duplicates
# - NEW: Richer prompt asking for 2-4 sentences per description (not 1-2), with concrete details
# - Enhanced schema to handle longer descriptions naturally
#
# The model does ONLY semantic extraction and returns ONE schema-constrained JSON
# object (no tools, no file writing, no git, no frontmatter, no slugs). This script
# then CONFORMS that output deterministically into wiki pages with enforced
# frontmatter + kebab-case paths, and writes a .ingest-manifest.json in EXACTLY the
# schema run-ingest.sh expects.
#
#   cd <genome checkout>
#   ingest-semantic.py <genome> raw/articles/<file>.md      # phase 1 (this)
#   run-ingest.sh      <genome>                             # phase 2 (deterministic)
#
# Emits a single JSON status line on stdout (for n8n / logs).
# =============================================================================
import json, os, re, sys, datetime, urllib.request, urllib.error

# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
# Chat endpoint the request is POSTed to.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/api/chat")
# Model tag sent in the request payload.
MODEL = os.getenv("INGEST_MODEL", "qwen2.5:14b")
# Context window, forwarded as the "num_ctx" option.
NUM_CTX = int(os.getenv("INGEST_NUM_CTX", "16384"))
# Timeout handed to urlopen() for the model call.
TIMEOUT = int(os.getenv("INGEST_TIMEOUT", "600"))
# INGEST_THINK: "false" disables a reasoning model's thinking trace, so models like
# gemma / qwq / qwen3 emit only the structured JSON (no truncation from long thinking).
# Unset = omit the flag entirely (correct for plain instruct models such as qwen2.5).
THINK = os.getenv("INGEST_THINK")
# ISO date stamped into frontmatter and conflict-page file names.
TODAY = datetime.date.today().isoformat()


def die(stage, reason):
    """Print a one-line JSON error status on stdout and exit with code 1.

    stage  -- short label of the step that failed (e.g. "args", "preflight").
    reason -- human-readable explanation included in the status line.
    """
    failure = {"status": "error", "stage": stage, "reason": reason}
    print(json.dumps(failure))
    raise SystemExit(1)


# --- args + pre-flight (mirror the old skill's guards, enforced in code) ---
if len(sys.argv) < 3:
    die("args", "usage: ingest-semantic.py <genome> <raw/rel/path.md>")
genome = sys.argv[1]

# Drop literal leading "./" prefixes only. str.lstrip("./") strips a *character
# set*, so it would also rewrite "../raw/x.md" or "/raw/x.md" into "raw/x.md"
# and let a path outside the checkout slip past the raw/ guard below.
raw_rel = sys.argv[2]
while raw_rel.startswith("./"):
    raw_rel = raw_rel[2:]

# Guards run in a fixed order; each one terminates the script via die().
if "private/" in raw_rel or raw_rel.startswith("private"):
    die("preflight", "refusing private source: " + raw_rel)
if os.environ.get("PRIVATE_CONTEXT", "disabled") != "disabled":
    die("preflight", "PRIVATE_CONTEXT must be disabled")
if not raw_rel.startswith("raw/"):
    die("preflight", "source must live under raw/: " + raw_rel)
if ".." in raw_rel.split("/"):
    # "raw/../x" starts with raw/ but resolves outside of it.
    die("preflight", "source must live under raw/: " + raw_rel)
if not os.path.isfile(raw_rel):
    die("preflight", "source not found in cwd: " + raw_rel)

with open(raw_rel, "r", encoding="utf-8") as fh:
    source_text = fh.read()
if not source_text.strip():
    die("preflight", "source is empty: " + raw_rel)


# --- read existing index to avoid duplicate slugs ---
# Slugs already linked from wiki/index.md; later injected into the model prompt.
existing_entities = set()
existing_concepts = set()
if os.path.isfile("wiki/index.md"):
    try:
        with open("wiki/index.md", "r", encoding="utf-8") as index_file:
            index_body = index_file.read()
        # wiki-links of the form [[entities/<slug>]] and [[concepts/<slug>]]
        existing_entities.update(
            re.findall(r"\[\[entities/([a-z0-9\-]+)\]\]", index_body))
        existing_concepts.update(
            re.findall(r"\[\[concepts/([a-z0-9\-]+)\]\]", index_body))
    except Exception:
        # best-effort: an unreadable index just means no duplicate hints
        pass


def slugify(s):
    """Return a kebab-case ASCII slug for *s*, or "untitled" if nothing is left.

    Every run of characters outside [a-z0-9] (after lower-casing) becomes a
    single "-", and leading/trailing dashes are removed. None is treated as "".
    """
    # NOTE(review): non-ASCII letters are dropped, so accented or non-Latin
    # names lose characters (or collapse to "untitled") — confirm this is wanted.
    lowered = (s or "").strip().lower()
    # One substitution is enough: a maximal run of non-alphanumerics (dashes
    # included) is replaced by exactly one "-", so no "--" can remain.
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
    return dashed if dashed else "untitled"


def twords(s, n=20):
    """Collapse whitespace in *s* and cut it to at most *n* words.

    Used for index entry summaries. When words are cut, an ellipsis character
    ("…") is appended; otherwise the whitespace-normalised text is returned.
    None is treated as the empty string.
    """
    normalised = " ".join((s or "").split())
    words = normalised.split(" ")
    if len(words) > n:
        return " ".join(words[:n]) + "…"
    return normalised


def yaml_dq(s):
    """Return *s* as a single-line YAML double-quoted scalar.

    A bare scalar breaks on many title characters: 'Conflict: X' would parse
    as a mapping, and '#', a leading '-' or '?', or quotes cause similar
    trouble. Wrapping in double quotes with backslash and double-quote escaped
    keeps any title valid YAML. All whitespace runs (newlines included) are
    collapsed to one space so the value stays on one line. None becomes '""'.
    """
    one_line = " ".join((s or "").split())
    # Single pass: each backslash or double quote gets a backslash prefix.
    escaped = one_line.translate({ord("\\"): "\\\\", ord('"'): '\\"'})
    return '"' + escaped + '"'


def frontmatter(ptype, title, tags):
    """Build the YAML frontmatter block for a wiki page.

    ptype -- page type written to the "type" field.
    title -- page title; emitted as a double-quoted YAML scalar.
    tags  -- iterable of tag strings; empty ones are dropped, the rest are
             de-duplicated and sorted.

    Returns the block as a string, delimited by "---" lines and ending with a
    newline. "domain" comes from the module-level genome, "last_updated" from
    TODAY; maturity is always "draft" and private is always "false".
    """
    unique_tags = sorted({tag for tag in tags if tag})
    fields = [
        "---",
        f"title: {yaml_dq(title)}",
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        "tags: [" + ", ".join(unique_tags) + "]",
        "---",
    ]
    return "\n".join(fields) + "\n"


def write_new(path, ptype, title, body, tags):
    """Create (or overwrite) a wiki page at *path*.

    The page is the frontmatter for (ptype, title, tags), a blank line, a
    "# title" heading and then *body*. Parent directories are created as
    needed.
    """
    parent_dir = os.path.dirname(path)
    os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as page:
        page.write(frontmatter(ptype, title, tags) + f"\n# {title}\n\n{body}\n")


def append_section(path, source_slug, body):
    """Append a source-attributed section to an existing page.

    The existing content is never replaced: a "## From [[sources/<slug>]]"
    section holding *body* is added at the end. Afterwards the first
    "last_updated:" line in the file is bumped to TODAY on a best-effort
    basis; any failure in that second step is ignored.
    """
    section = f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n"
    with open(path, "a", encoding="utf-8") as page:
        page.write(section)

    # best-effort bump of last_updated in the existing frontmatter
    try:
        with open(path, "r", encoding="utf-8") as page:
            current = page.read()
        bumped = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY,
                        current, count=1)
        with open(path, "w", encoding="utf-8") as page:
            page.write(bumped)
    except Exception:
        pass


# --- the semantic contract ---
# System prompt template for the model call. call_model() fills the two
# placeholders {existing_entities} and {existing_concepts} via str.format(),
# so any literal brace added to this text must be doubled ("{{" / "}}").
# The values injected are the slugs harvested from wiki/index.md (or the
# string "(none yet)" when there are none).
SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
Read the source and return ONLY structured data describing what it contains.
You do not write files, you do not produce frontmatter, and you do not invent
paths, slugs, branches, commits or PRs — a deterministic script does all of that.

Rules:
- source_summary: a faithful, self-contained summary of the source, in the
  source's own language. Plain prose, NO markdown headings. 2-4 sentences,
  with concrete details. Preserve the essence and nuance of the source.
- key_points: 3-5 concrete facts or claims worth indexing; no padding.
- entities: every person, tool, organisation or product the source names.
  kind is one of person|tool|org|product. description is 2-3 factual sentences
  with specifics. No markdown headings inside the description.
- concepts: every pattern, theory, decision or named idea the source explains.
  description is 2-3 factual sentences with concrete examples or context.
- contradictions: ONLY when the source makes a claim that directly contradicts a
  widely-known fact or contradicts itself. Otherwise return an empty list.
- Names must be the natural name of the thing; the script will normalise them.

If the source references an entity or concept already in the wiki (see the list below),
use the EXACT name already present; do not invent a variant. This prevents duplicates.

Existing entities in this genome:
{existing_entities}

Existing concepts in this genome:
{existing_concepts}

Be faithful to the source. Be specific. Do not pad or improvise."""


# --- JSON schema -> constrained decoding (Ollama structured outputs) ---
# Sent as the "format" field of the chat request by call_model(). Fields that
# are not listed under "required" may be missing from the reply, so the conform
# code reads them with .get() and a fallback.
SCHEMA = {
    "type": "object",
    "properties": {
        # title and body of the wiki/sources/<slug>.md page
        "source_title":   {"type": "string"},
        "source_summary": {"type": "string"},
        # rendered as the "## Key points" bullet list on the source page
        "key_points":     {"type": "array", "items": {"type": "string"}},
        # one wiki/entities/<slug>.md page per item
        "entities": {"type": "array", "items": {
            "type": "object",
            "properties": {
                "name":        {"type": "string"},
                # optional; not read by the conform code visible in this file
                "kind":        {"type": "string",
                                "enum": ["person", "tool", "org", "product"]},
                "description": {"type": "string"},
            },
            "required": ["name", "description"],
        }},
        # one wiki/concepts/<slug>.md page per item
        "concepts": {"type": "array", "items": {
            "type": "object",
            "properties": {
                "name":        {"type": "string"},
                "description": {"type": "string"},
            },
            "required": ["name", "description"],
        }},
        # each item becomes a wiki/queries/conflict-<slug>-<date>.md page
        "contradictions": {"type": "array", "items": {
            "type": "object",
            "properties": {
                "concept":     {"type": "string"},
                "description": {"type": "string"},
            },
            "required": ["concept", "description"],
        }},
        # copied into the manifest; defaults are substituted when absent/empty
        "reasoning":  {"type": "string"},
        "pr_summary": {"type": "string"},
    },
    "required": ["source_title", "source_summary", "entities", "concepts"],
}


def call_model():
    """Run the semantic pass against the model and return the decoded object.

    Builds the system prompt (injecting the slugs already present in the wiki
    index), sends the source text as the user message with schema-constrained
    decoding, and parses the reply.

    Returns:
        dict: the JSON object the model produced under SCHEMA.

    Never raises for transport or decoding problems: every such failure goes
    through die("model", ...), which prints a JSON error status and exits, so
    the script keeps its "single JSON status line on stdout" contract.
    """
    import http.client  # local import: only needed for the except clause below

    # format existing names as a human-readable list
    existing_ents = ", ".join(sorted(existing_entities)) or "(none yet)"
    existing_conc = ", ".join(sorted(existing_concepts)) or "(none yet)"

    prompt = SYSTEM_PROMPT.format(
        existing_entities=existing_ents,
        existing_concepts=existing_conc,
    )

    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": prompt},
            {"role": "user", "content":
                "Source path: " + raw_rel + "\n\n--- SOURCE START ---\n"
                + source_text + "\n--- SOURCE END ---\n\nReturn the JSON now."},
        ],
        "format": SCHEMA,          # schema-constrained generation
        "stream": False,
        # low temperature for stable extraction; repetition penalties OFF for
        # structured output
        "options": {"temperature": 0.2, "repeat_penalty": 1.0, "num_ctx": NUM_CTX},
    }
    if THINK is not None:
        payload["think"] = THINK.strip().lower() in ("1", "true", "yes", "on")
    data = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(
        OLLAMA_URL, data=data, headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            resp = json.loads(r.read().decode("utf-8"))
    except (OSError, ValueError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError plus timeouts and resets raised
        # while reading the body; ValueError covers a non-UTF-8 or non-JSON
        # body; HTTPException covers truncated/invalid HTTP responses.
        die("model", "ollama request failed: " + str(e))
    if not isinstance(resp, dict):
        die("model", "unexpected ollama response type: " + type(resp).__name__)
    if resp.get("error"):
        # surface a server-side error instead of a misleading JSON parse failure
        die("model", "ollama error: " + str(resp["error"]))

    content = ((resp.get("message") or {}).get("content") or "").strip()
    # schema-constrained, but stay defensive if a model wraps it in a fence
    if content.startswith("```"):
        content = content.strip("`")
        brace = content.find("{")
        if brace >= 0:
            content = content[brace:]
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as e:
        die("model", "model did not return valid JSON: " + str(e))
    if not isinstance(parsed, dict):
        # callers use .get() on the result; a list/string would crash later
        die("model", "model did not return a JSON object")
    return parsed


# --- run the semantic pass ---
sem = call_model()
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
pages = []  # manifest entries, one per page written or modified

# 1. source page — canonical summary of THIS source (re)written
src_path   = f"wiki/sources/{source_slug}.md"
src_status = "modified" if os.path.exists(src_path) else "created"
# One bullet per non-blank key point; inner whitespace/newlines are collapsed
# so a multi-line point cannot break out of its list item.
kp_lines   = "\n".join("- " + " ".join(p.split())
                       for p in (sem.get("key_points") or []) if p and p.strip())
src_body   = (sem.get("source_summary") or "").strip()
if kp_lines:
    src_body += "\n\n## Key points\n\n" + kp_lines
src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"
# Keep the title on one line (it is used in a "# ..." heading); fall back to the
# slug when the model returned nothing usable.
src_title  = " ".join((sem.get("source_title") or "").split()) or source_slug
# Tags come from the named entities/concepts only — nameless items are skipped
# (as handle() does) instead of contributing an "untitled" tag — and duplicates
# are removed before capping the list at 8.
tag_names  = [(it.get("name") or "").strip()
              for it in (sem.get("entities") or []) + (sem.get("concepts") or [])]
src_tags   = list(dict.fromkeys(slugify(n) for n in tag_names if n))[:8]
# Same page layout as every other page: reuse the shared writer.
write_new(src_path, "source", src_title, src_body, src_tags)
pages.append({"path": src_path,
              "summary": twords(src_title),
              "maturity": "draft", "status": src_status})


def handle(kind_dir, ptype, items):
    """Conform a list of extracted items into pages under wiki/<kind_dir>/.

    kind_dir -- sub-directory of wiki/ ("entities" or "concepts" at the call sites).
    ptype    -- page type written to the frontmatter of newly created pages.
    items    -- list of dicts with "name" and "description"; None is accepted.

    Items without a name are skipped. An existing page gets a new attributed
    section appended; otherwise a new page is created. Each processed item adds
    one entry to the module-level pages list.
    """
    for item in items or []:
        title = (item.get("name") or "").strip()
        if not title:
            continue
        page_path = f"wiki/{kind_dir}/{slugify(title)}.md"
        text = (item.get("description") or "").strip()
        entry = {"path": page_path, "summary": twords(text)}
        if not os.path.exists(page_path):
            new_body = text + f"\n\n## Sources\n\n- [[sources/{source_slug}]]\n"
            write_new(page_path, ptype, title, new_body, [genome, ptype])
            entry["maturity"] = "draft"
            entry["status"] = "created"
        else:
            # existing page: accumulate rather than overwrite
            append_section(page_path, source_slug, text)
            entry["status"] = "modified"
        pages.append(entry)


# 2. entities, 3. concepts — same conform routine; the JSON key doubles as the
# wiki sub-directory name
for section_dir, page_type in (("entities", "entity"), ("concepts", "concept")):
    handle(section_dir, page_type, sem.get(section_dir, []))

# 4. contradictions -> conflict pages (run-ingest routes wiki/queries/conflict-*)
conflicts = sem.get("contradictions") or []
conf_slugs = []               # one slug per conflict file written in this run
conflict_paths_seen = set()   # paths already written during this run
for c in conflicts:
    cslug = slugify(c.get("concept", "unknown"))
    path = f"wiki/queries/conflict-{cslug}-{TODAY}.md"
    desc = (c.get("description") or "").strip()
    if path in conflict_paths_seen:
        # Two contradictions mapping to the same concept slug share one file.
        # Writing it again would erase the first one and list the path twice
        # in the manifest, so the extra description is appended instead.
        append_section(path, source_slug, desc)
        continue
    conflict_paths_seen.add(path)
    conf_slugs.append(cslug)
    write_new(path, "query", f"Conflict: {c.get('concept', '')}",
              desc + f"\n\n## Source\n\n- [[sources/{source_slug}]]\n",
              [genome, "conflict"])
    pages.append({"path": path, "summary": "", "maturity": "draft",
                  "status": "created"})

# Human-readable summary for the manifest; the count is the number of files.
contradictions_str = ("None" if not conflicts
                      else f"{len(conf_slugs)} conflict file(s) created — "
                           + ", ".join(conf_slugs))

# --- write the manifest in EXACTLY run-ingest.sh's schema ---
# Model-supplied reasoning / pr_summary are used when present and non-empty;
# otherwise a generic sentence naming the source is substituted.
fallback_reasoning = "Ingest of " + raw_rel
fallback_pr_summary = "Semantic ingest of " + raw_rel
manifest = {
    "raw_source": raw_rel,
    "reasoning": sem.get("reasoning") or fallback_reasoning,
    "pr_summary": sem.get("pr_summary") or fallback_pr_summary,
    "contradictions": contradictions_str,
    "pages": pages,
}
with open(".ingest-manifest.json", "w", encoding="utf-8") as manifest_file:
    manifest_file.write(json.dumps(manifest, indent=2, ensure_ascii=False))

# single JSON status line on stdout (for n8n / logs)
ok_status = {
    "status": "ok",
    "stage": "semantic",
    "pages": len(pages),
    "model": MODEL,
    "manifest": ".ingest-manifest.json",
}
print(json.dumps(ok_status))