From 95b38665498f34acebf0771626dda54226e40606 Mon Sep 17 00:00:00 2001
From: Matteo Cherubini
Date: Wed, 1 Jul 2026 18:24:19 +0200
Subject: [PATCH] feat(ingest-semantic.py): Add pre-flight context window check

---
Review notes (this section is ignored by `git am`):
- Added a guard for NUM_CTX <= SAFETY_MARGIN. Without it the computed limit is
  zero or negative and every source is rejected as "too large".
- Hunk and diffstat counts were updated for the extra lines; the post-image
  blob id on the `index` line was not recomputed.
- Trailing context is limited to the two lines that could be recovered.

 skills/ingest/scripts/ingest-semantic.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py
index a7b527a..fd4888d 100755
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@@ -60,5 +60,20 @@ with open(raw_rel, "r", encoding="utf-8") as fh:
 if not source_text.strip():
     die("preflight", "source is empty: " + raw_rel)
 
+# --- pre-flight check: if the prompt exceeds context window, exit cleanly with stage:input ---
+# Conservative estimate: ~4 chars/token for mixed IT/EN text
+SAFETY_MARGIN = 4096  # room for system prompt + JSON response
+MAX_SOURCE_TOKENS = NUM_CTX - SAFETY_MARGIN
+MAX_SOURCE_CHARS = MAX_SOURCE_TOKENS * 4
+
+# NUM_CTX at or below the margin leaves no room for any source text; report that
+# as a setup problem instead of a misleading "source too large" with a limit <= 0.
+if MAX_SOURCE_TOKENS <= 0:
+    die("preflight", f"context window too small: NUM_CTX={NUM_CTX} must exceed "
+                     f"SAFETY_MARGIN={SAFETY_MARGIN}")
+
+if len(source_text) > MAX_SOURCE_CHARS:
+    die("input", f"source too large ({len(source_text)} chars, limit ~{MAX_SOURCE_CHARS}). "
+                 f"Use the SPLIT directive or divide the document.")
 # --- read existing index to avoid duplicate slugs ---
 existing_entities = set()