diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py index dd974cd..fbd6f18 100644 --- a/skills/ingest/scripts/ingest-semantic.py +++ b/skills/ingest/scripts/ingest-semantic.py @@ -142,17 +142,29 @@ paths, slugs, branches, commits or PRs — a deterministic script does all of th Rules: - source_summary: a faithful, self-contained summary of the source, in the - source's own language. Plain prose, no markdown headings. -- key_points: the handful of concrete facts/claims worth indexing. + source's own language. Plain prose, NO markdown headings. 2-4 sentences, + with concrete details. Preserve the essence and nuance of the source. +- key_points: 3-5 concrete facts or claims worth indexing; no padding. - entities: every person, tool, organisation or product the source names. - kind is one of person|tool|org|product. description is one or two factual - sentences. No markdown headings inside the description. + kind is one of person|tool|org|product. description is 2-3 factual sentences + with specifics. No markdown headings inside the description. - concepts: every pattern, theory, decision or named idea the source explains. - description is one or two factual sentences. + description is 2-3 factual sentences with concrete examples or context. - contradictions: ONLY when the source makes a claim that directly contradicts a widely-known fact or contradicts itself. Otherwise return an empty list. - Names must be the natural name of the thing; the script will normalise them. -Do not pad. Be faithful to the source.""" + +If the source references an entity or concept already in the wiki (see the list below), +use the EXACT name already present; do not invent a variant. This prevents duplicates. + +Existing entities in this genome: +{existing_entities} + +Existing concepts in this genome: +{existing_concepts} + +Be faithful to the source. Be specific. Do not pad or improvise.""" + # --- JSON schema -> constrained decoding (Ollama structured outputs) --- SCHEMA = { @@ -195,10 +207,19 @@ SCHEMA = { def call_model(): + # format existing names as a human-readable list + existing_ents = ", ".join(sorted(existing_entities)) or "(none yet)" + existing_conc = ", ".join(sorted(existing_concepts)) or "(none yet)" + + prompt = SYSTEM_PROMPT.format( + existing_entities=existing_ents, + existing_concepts=existing_conc, + ) + payload = { "model": MODEL, "messages": [ - {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "system", "content": prompt}, {"role": "user", "content": "Source path: " + raw_rel + "\n\n--- SOURCE START ---\n" + source_text + "\n--- SOURCE END ---\n\nReturn the JSON now."},