knowledge-genome-orchestrator/skills/ingest/scripts/ingest-semantic.py

#!/usr/bin/env python3
# =============================================================================
# skills/ingest/scripts/ingest-semantic.py
# Phase 1 (semantic) of the Knowledge Genome ingest — the LIGHT version.
#
# The model does ONLY semantic extraction and returns ONE schema-constrained JSON
# object (no tools, no file writing, no git, no frontmatter, no slugs). This script
# then CONFORMS that output deterministically into wiki pages with enforced
# frontmatter + kebab-case paths, and writes a .ingest-manifest.json in EXACTLY the
# schema run-ingest.sh expects. run-ingest.sh (phase 2) then does index / log /
# scoped-lint / PR, unchanged.
#
#   cd <genome checkout>
#   ingest-semantic.py <genome> raw/articles/<file>.md      # phase 1 (this)
#   run-ingest.sh      <genome>                             # phase 2 (deterministic)
#
# Why this shape: local tool-calling via pi/ollama proved fragile, and a small
# model does not reliably honour folders / naming / frontmatter / manifest schema
# when it writes files itself. Here the model cannot break the contract because it
# never touches the filesystem — the script owns all structure. Stdlib only.
#
# Emits a single JSON status line on stdout (for n8n / logs).
# =============================================================================
import json, os, re, sys, datetime, urllib.request, urllib.error

# --- config (override via env; these live in ~/.config/knowledge-genome.env) ---
def env_int(name, default):
    """Return the integer value of environment variable *name*.

    A missing or blank variable falls back to *default*. A value that is not an
    integer ends the run with the same one-line JSON error status the rest of
    the script emits, instead of an uncaught ValueError traceback.
    """
    raw = (os.environ.get(name) or "").strip() or default
    try:
        return int(raw)
    except ValueError:
        # The shared error helper is defined further down the file, so the
        # status line is produced inline here.
        print(json.dumps({"status": "error", "stage": "config",
                          "reason": f"{name} must be an integer, got {raw!r}"}))
        sys.exit(1)


OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/chat")
MODEL      = os.environ.get("INGEST_MODEL", "qwen2.5:14b")
NUM_CTX    = env_int("INGEST_NUM_CTX", "16384")
TIMEOUT    = env_int("INGEST_TIMEOUT", "600")
TODAY      = datetime.date.today().isoformat()


def die(stage, reason):
    """Report a failure as one JSON line on stdout and exit with status 1.

    stage  -- short label for the step that failed.
    reason -- human-readable explanation.
    """
    status = {"status": "error", "stage": stage, "reason": reason}
    sys.stdout.write(json.dumps(status) + "\n")
    raise SystemExit(1)


# --- args + pre-flight (mirror the old skill's guards, enforced in code) ---
if len(sys.argv) < 3:
    die("args", "usage: ingest-semantic.py <genome> <raw/rel/path.md>")
genome  = sys.argv[1]
raw_arg = sys.argv[2]

# str.lstrip("./") strips a *character set*, not a prefix: it would silently turn
# "../raw/x.md" or "/raw/x.md" into "raw/x.md". Normalise instead, so "./raw/x.md"
# still works and ".." segments are resolved before any guard looks at the path.
if os.path.isabs(raw_arg):
    die("preflight", "source must be a relative path under raw/: " + raw_arg)
raw_rel = os.path.normpath(raw_arg).replace(os.sep, "/")

# Privacy gate: deliberately broad, and applied to both the typed and the
# normalised spelling so normalisation can never make a path look less private.
for candidate in (raw_arg, raw_rel):
    if "private/" in candidate or candidate.startswith("private"):
        die("preflight", "refusing private source: " + raw_rel)
if os.environ.get("PRIVATE_CONTEXT", "disabled") != "disabled":
    die("preflight", "PRIVATE_CONTEXT must be disabled")
if not raw_rel.startswith("raw/"):
    die("preflight", "source must live under raw/: " + raw_rel)
if not os.path.isfile(raw_rel):
    die("preflight", "source not found in cwd: " + raw_rel)

try:
    with open(raw_rel, "r", encoding="utf-8") as fh:
        source_text = fh.read()
except (OSError, UnicodeDecodeError) as e:
    # keep the one-JSON-line contract even for unreadable / non-UTF-8 sources
    die("preflight", "cannot read source " + raw_rel + ": " + str(e))
if not source_text.strip():
    die("preflight", "source is empty: " + raw_rel)


# --- the semantic contract (authoritative copy; SKILL.md documents it) ---
# Sent verbatim as the "system" message by call_model(). It asks the model for
# data only; paths, slugs and frontmatter are produced by this script.
# NOTE(review): SCHEMA also declares source_title, reasoning and pr_summary, but
# the rules below do not describe them — confirm that omission is intentional.
SYSTEM_PROMPT = """You perform the SEMANTIC PASS of a single source into a knowledge wiki.
Read the source and return ONLY structured data describing what it contains.
You do not write files, you do not produce frontmatter, and you do not invent
paths, slugs, branches, commits or PRs — a deterministic script does all of that.

Rules:
- source_summary: a faithful, self-contained summary of the source, in the
  source's own language. Plain prose, no markdown headings.
- key_points: the handful of concrete facts/claims worth indexing.
- entities: every person, tool, organisation or product the source names.
  kind is one of person|tool|org|product. description is one or two factual
  sentences. No markdown headings inside the description.
- concepts: every pattern, theory, decision or named idea the source explains.
  description is one or two factual sentences.
- contradictions: ONLY when the source makes a claim that directly contradicts a
  widely-known fact or contradicts itself. Otherwise return an empty list.
- Names must be the natural name of the thing; the script will normalise them.
Do not pad. Be faithful to the source."""

# --- JSON schema -> constrained decoding (Ollama structured outputs) ---
def _str_field():
    """Return a fresh JSON-schema node describing a plain string."""
    return {"type": "string"}


def _object_list(properties, required):
    """Return a JSON-schema node for an array of objects with these fields."""
    return {"type": "array", "items": {
        "type": "object",
        "properties": properties,
        "required": required,
    }}


# Shape of the single JSON object the model must return; passed to the chat
# request as its "format" so generation is constrained to it.
SCHEMA = {
    "type": "object",
    "properties": {
        "source_title":   _str_field(),
        "source_summary": _str_field(),
        "key_points":     {"type": "array", "items": _str_field()},
        "entities": _object_list(
            {
                "name":        _str_field(),
                "kind":        {"type": "string",
                                "enum": ["person", "tool", "org", "product"]},
                "description": _str_field(),
            },
            ["name", "description"],
        ),
        "concepts": _object_list(
            {"name": _str_field(), "description": _str_field()},
            ["name", "description"],
        ),
        "contradictions": _object_list(
            {"concept": _str_field(), "description": _str_field()},
            ["concept", "description"],
        ),
        "reasoning":  _str_field(),
        "pr_summary": _str_field(),
    },
    "required": ["source_title", "source_summary", "entities", "concepts"],
}


def call_model():
    """Run the semantic pass against the chat endpoint and return its JSON object.

    Posts SYSTEM_PROMPT plus the source text to OLLAMA_URL with SCHEMA as the
    response format, then decodes the assistant message content as JSON.

    Returns the decoded dict. Every failure (transport error, timeout,
    unparsable response, content that is not a JSON object) goes through
    die("model", ...), so the caller only ever sees a dict.
    """
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content":
                "Source path: " + raw_rel + "\n\n--- SOURCE START ---\n"
                + source_text + "\n--- SOURCE END ---\n\nReturn the JSON now."},
        ],
        "format": SCHEMA,          # schema-constrained generation
        "stream": False,
        # low temperature for stable extraction (not strictly deterministic);
        # repetition penalties OFF for structured output
        "options": {"temperature": 0.2, "repeat_penalty": 1.0, "num_ctx": NUM_CTX},
    }
    data = json.dumps(payload).encode("utf-8")
    req = urllib.request.Request(
        OLLAMA_URL, data=data, headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=TIMEOUT) as r:
            resp = json.loads(r.read().decode("utf-8"))
    except (OSError, ValueError) as e:
        # OSError covers URLError/HTTPError and socket timeouts raised while
        # reading the body; ValueError covers a malformed URL and a body that is
        # not UTF-8 / not JSON (UnicodeDecodeError, JSONDecodeError).
        die("model", "ollama request failed: " + str(e))
    if not isinstance(resp, dict):
        die("model", "ollama response is not a JSON object")
    content = ((resp.get("message") or {}).get("content") or "").strip()
    # schema-constrained, but stay defensive if a model wraps it in a fence
    if content.startswith("```"):
        content = content.strip("`")
        brace = content.find("{")
        if brace >= 0:
            content = content[brace:]
    try:
        result = json.loads(content)
    except json.JSONDecodeError as e:
        die("model", "model did not return valid JSON: " + str(e))
    if not isinstance(result, dict):
        # callers use .get() on the result; a list or scalar would crash later
        die("model", "model returned JSON that is not an object")
    return result


# --- conform helpers (the script OWNS all structure) ---
def slugify(s):
    """Return a kebab-case ASCII slug for *s*, or "untitled" if nothing is left.

    Accented and compatibility characters are folded to their base form first
    (so "Café" becomes "cafe", not "caf"); every remaining run of characters
    outside [a-z0-9] becomes a single "-", and leading/trailing "-" are removed.
    Pure-ASCII input yields the same slug as before. None is treated as "".
    """
    import unicodedata  # local import keeps this helper self-contained

    decomposed = unicodedata.normalize("NFKD", (s or "").strip())
    # drop the combining marks NFKD split off; other symbols stay and become "-"
    base = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # lower-case after normalising: compatibility forms can expand to capitals
    return re.sub(r"[^a-z0-9]+", "-", base.lower()).strip("-") or "untitled"


def twords(s, n=12):
    """Collapse whitespace in *s* and keep at most *n* words.

    Returns the collapsed text unchanged when it has *n* words or fewer;
    otherwise the first *n* words followed by an ellipsis. None counts as "".
    """
    collapsed = " ".join((s or "").split())
    tokens = collapsed.split(" ")
    if len(tokens) > n:
        return " ".join(tokens[:n]) + "…"
    return collapsed


def frontmatter(ptype, tags):
    """Build the frontmatter block written at the top of every wiki page.

    ptype -- value for the "type" field.
    tags  -- iterable of tag strings; empty ones are dropped and the rest are
             de-duplicated and sorted.

    Returns the block as a string, "---" fences included, ending in a newline.
    The domain and date come from the module-level genome and TODAY.
    """
    unique_tags = sorted({tag for tag in tags if tag})
    lines = [
        "---",
        f"type: {ptype}",
        f"domain: {genome}",
        "maturity: draft",
        f"last_updated: {TODAY}",
        "private: false",
        "tags: [" + ", ".join(unique_tags) + "]",
        "---",
    ]
    return "\n".join(lines) + "\n"


def write_new(path, ptype, title, body, tags):
    """Write a wiki page at *path*: frontmatter, a "# title" heading, then *body*.

    Parent directories are created as needed. An existing file at *path* is
    overwritten. *title* is collapsed onto one line so a stray newline in it
    cannot split the heading.
    """
    parent = os.path.dirname(path)
    if parent:  # os.makedirs("") raises FileNotFoundError for a bare filename
        os.makedirs(parent, exist_ok=True)
    heading = " ".join((title or "").split())
    with open(path, "w", encoding="utf-8") as f:
        f.write(frontmatter(ptype, tags))
        f.write(f"\n# {heading}\n\n{body}\n")


def append_section(path, source_slug, body):
    """Append a section attributed to *source_slug* to the existing page at *path*.

    The page is never overwritten: *body* is added under a
    "## From [[sources/<slug>]]" heading. Afterwards the first "last_updated:"
    line in the file is set to TODAY on a best-effort basis; if that bump fails
    the appended section is kept and a warning goes to stderr (stdout is
    reserved for the JSON status line).
    """
    # never overwrite an existing page: accumulate, attributed to the new source
    with open(path, "a", encoding="utf-8") as f:
        f.write(f"\n\n## From [[sources/{source_slug}]]\n\n{body}\n")
    try:  # best-effort bump of last_updated in the existing frontmatter
        with open(path, "r", encoding="utf-8") as f:
            txt = f.read()
        bumped = re.sub(r"(?m)^last_updated:.*$", "last_updated: " + TODAY,
                        txt, count=1)
        if bumped != txt:  # skip the rewrite when the date is already current
            with open(path, "w", encoding="utf-8") as f:
                f.write(bumped)
    except (OSError, UnicodeError) as e:
        # only I/O and decoding problems are expected here; anything else is a
        # bug and should surface instead of being swallowed
        sys.stderr.write(f"warning: could not bump last_updated in {path}: {e}\n")


# --- run the semantic pass ---
sem = call_model()
source_slug = slugify(os.path.splitext(os.path.basename(raw_rel))[0])
pages = []

# 1. source page — canonical summary of THIS source (re)written
src_path   = f"wiki/sources/{source_slug}.md"
src_status = "modified" if os.path.exists(src_path) else "created"
src_title  = (sem.get("source_title") or "").strip() or source_slug

# key points: keep non-blank strings only, one bullet each
key_points = [p.strip() for p in (sem.get("key_points") or [])
              if isinstance(p, str) and p.strip()]
src_body = (sem.get("source_summary") or "").strip()
if key_points:
    src_body += "\n\n## Key points\n\n" + "\n".join("- " + p for p in key_points)
src_body += f"\n\n## Source\n\n- [[{raw_rel}]]\n"

# tags: first 8 named entities/concepts. Nameless items are skipped up front,
# otherwise slugify() would turn them into a meaningless "untitled" tag.
tag_names = [(it.get("name") or "").strip()
             for it in list(sem.get("entities") or []) + list(sem.get("concepts") or [])]
src_tags = [slugify(n) for n in tag_names if n][:8]

# same page layout as every other page, so go through the shared writer
write_new(src_path, "source", src_title, src_body, src_tags)
pages.append({"path": src_path,
              "summary": twords(src_title),
              "maturity": "draft", "status": src_status})


def handle(kind_dir, ptype, items):
    """Create or extend one wiki page per named item and record it in pages.

    kind_dir -- sub-directory of wiki/ the pages live in.
    ptype    -- page type written to the frontmatter of new pages.
    items    -- list of dicts with "name" and "description"; None is allowed.

    Items without a name are skipped. Items whose names produce the same slug
    are merged into one page and one manifest entry, so a single run never
    creates a page and then appends to it again. An existing page gets a new
    attributed section; a missing one is created with a Sources list.
    """
    merged = {}  # slug -> {"name": first spelling seen, "descs": [unique texts]}
    for it in items or []:
        name = (it.get("name") or "").strip()
        if not name:
            continue
        entry = merged.setdefault(slugify(name), {"name": name, "descs": []})
        desc = (it.get("description") or "").strip()
        if desc and desc not in entry["descs"]:
            entry["descs"].append(desc)

    for slug, entry in merged.items():
        path = f"wiki/{kind_dir}/{slug}.md"
        desc = "\n\n".join(entry["descs"])
        if os.path.exists(path):
            append_section(path, source_slug, desc)
            pages.append({"path": path, "summary": twords(desc), "status": "modified"})
        else:
            body = desc + f"\n\n## Sources\n\n- [[sources/{source_slug}]]\n"
            write_new(path, ptype, entry["name"], body, [genome, ptype])
            pages.append({"path": path, "summary": twords(desc),
                          "maturity": "draft", "status": "created"})


# 2. entities, 3. concepts
handle("entities", "entity", sem.get("entities", []))
handle("concepts", "concept", sem.get("concepts", []))

# 4. contradictions -> conflict pages (run-ingest routes wiki/queries/conflict-*)
conflicts = sem.get("contradictions") or []
conf_slugs = []
used_conflict_paths = set()
for c in conflicts:
    cslug = slugify(c.get("concept", "unknown"))
    conf_slugs.append(cslug)
    # Two contradictions about the same concept would otherwise share one path
    # and the second would overwrite the first; give later ones a -2, -3… suffix.
    base = f"wiki/queries/conflict-{cslug}-{TODAY}"
    path = base + ".md"
    suffix = 2
    while path in used_conflict_paths:
        path = f"{base}-{suffix}.md"
        suffix += 1
    used_conflict_paths.add(path)
    # report an overwrite of a file left by an earlier run as "modified",
    # the same way the source page does
    status = "modified" if os.path.exists(path) else "created"
    write_new(path, "query", f"Conflict: {c.get('concept', '')}",
              (c.get("description") or "").strip()
              + f"\n\n## Source\n\n- [[sources/{source_slug}]]\n",
              [genome, "conflict"])
    pages.append({"path": path, "summary": "", "maturity": "draft",
                  "status": status})

contradictions_str = ("None" if not conflicts
                      else f"{len(conflicts)} conflict file(s) created — "
                           + ", ".join(conf_slugs))

# --- write the manifest in EXACTLY run-ingest.sh's schema ---
manifest = {
    "raw_source":     raw_rel,
    "reasoning":      sem.get("reasoning") or ("Ingest of " + raw_rel),
    "pr_summary":     sem.get("pr_summary") or ("Semantic ingest of " + raw_rel),
    "contradictions": contradictions_str,
    "pages":          pages,
}
with open(".ingest-manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, indent=2, ensure_ascii=False)

# the single status line promised on stdout
print(json.dumps({"status": "ok", "stage": "semantic",
                  "pages": len(pages), "model": MODEL,
                  "manifest": ".ingest-manifest.json"}))