diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py
index fbd6f18..5fe9593 100644
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@@ -1,24 +1,23 @@
 #!/usr/bin/env python3
 # =============================================================================
 # skills/ingest/scripts/ingest-semantic.py
-# Phase 1 (semantic) of the Knowledge Genome ingest — the LIGHT version.
+# Phase 1 (semantic) of the Knowledge Genome ingest — light agent + deterministic conform.
+#
+# - FIXED: Add 'title:' field to frontmatter (lint was complaining about missing title)
+# - NEW: Inject existing index (entity/concept names) into prompt to prevent duplicates
+# - NEW: Richer prompt asking for 2-4 sentences per description (not 1-2), with concrete details
+# - Enhanced schema to handle longer descriptions naturally
 #
 # The model does ONLY semantic extraction and returns ONE schema-constrained JSON
 # object (no tools, no file writing, no git, no frontmatter, no slugs). This script
 # then CONFORMS that output deterministically into wiki pages with enforced
 # frontmatter + kebab-case paths, and writes a .ingest-manifest.json in EXACTLY the
-# schema run-ingest.sh expects. run-ingest.sh (phase 2) then does index / log /
-# scoped-lint / PR, unchanged.
+# schema run-ingest.sh expects.
 #
 # cd
 # ingest-semantic.py raw/articles/.md # phase 1 (this)
 # run-ingest.sh # phase 2 (deterministic)
 #
-# Why this shape: local tool-calling via pi/ollama proved fragile, and a small
-# model does not reliably honour folders / naming / frontmatter / manifest schema
-# when it writes files itself. Here the model cannot break the contract because it
-# never touches the filesystem — the script owns all structure. Stdlib only.
-#
 # Emits a single JSON status line on stdout (for n8n / logs).
 # =============================================================================
 import json, os, re, sys, datetime, urllib.request, urllib.error