diff --git a/skills/ingest/scripts/ingest-semantic.py b/skills/ingest/scripts/ingest-semantic.py
index a7b527a..fd4888d 100755
--- a/skills/ingest/scripts/ingest-semantic.py
+++ b/skills/ingest/scripts/ingest-semantic.py
@@ -60,6 +60,23 @@ with open(raw_rel, "r", encoding="utf-8") as fh:
 if not source_text.strip():
     die("preflight", "source is empty: " + raw_rel)
 
+# --- pre-flight check: if the prompt exceeds context window, exit cleanly with stage:input ---
+# Conservative estimate: ~4 chars/token for mixed IT/EN text
+# NOTE(review): this is a character-count heuristic, not a tokenizer count.
+SAFETY_MARGIN = 4096  # room for system prompt + JSON response
+CHARS_PER_TOKEN = 4  # ratio used by the estimate above
+MAX_SOURCE_TOKENS = NUM_CTX - SAFETY_MARGIN
+MAX_SOURCE_CHARS = MAX_SOURCE_TOKENS * CHARS_PER_TOKEN
+
+# If NUM_CTX does not exceed the safety margin the budget is zero or negative
+# and every source would be rejected as "too large"; report the configuration
+# instead of blaming the input.
+if MAX_SOURCE_TOKENS <= 0:
+    die("preflight", f"context window too small: NUM_CTX={NUM_CTX} leaves no room "
+        f"for the source after the {SAFETY_MARGIN}-token safety margin")
+
+if len(source_text) > MAX_SOURCE_CHARS:
+    die("input", f"source too large ({len(source_text)} chars, limit ~{MAX_SOURCE_CHARS}). "
+        f"Use the SPLIT directive or divide the document.")
 
 # --- read existing index to avoid duplicate slugs ---
 existing_entities = set()