feat(ingest-semantic.py): Add pre-flight context window check

This commit is contained in:
Matteo Cherubini 2026-07-01 18:24:19 +02:00
parent 111ffd266a
commit 95b3866549

View file

@ -60,6 +60,15 @@ with open(raw_rel, "r", encoding="utf-8") as fh:
# --- pre-flight check: refuse an empty / whitespace-only source ---
if not source_text.strip():
    die("preflight", "source is empty: " + raw_rel)

# --- pre-flight check: if the prompt exceeds context window, exit cleanly with stage:input ---
# Character-based token estimate: ~4 chars/token for mixed IT/EN text.
# NOTE(review): this is a heuristic, not a tokenizer count -- confirm the ratio
# against the model actually in use; a lower real chars/token ratio would make
# this check too permissive.
CHARS_PER_TOKEN = 4
SAFETY_MARGIN = 4096  # tokens reserved for system prompt + JSON response
MAX_SOURCE_TOKENS = NUM_CTX - SAFETY_MARGIN
MAX_SOURCE_CHARS = MAX_SOURCE_TOKENS * CHARS_PER_TOKEN

# A context window no larger than the safety margin leaves a zero or negative
# budget. Without this guard every non-empty source would be rejected below as
# "too large" with a negative limit and advice to split the document, which can
# never succeed. Report the misconfiguration explicitly instead.
# (presumably die() terminates the script -- verify against its definition)
if MAX_SOURCE_TOKENS <= 0:
    die("preflight", f"context window too small: NUM_CTX={NUM_CTX} leaves no room for "
                     f"source text after the {SAFETY_MARGIN}-token safety margin")

# Source does not fit the estimated budget: stop before sending the prompt.
if len(source_text) > MAX_SOURCE_CHARS:
    die("input", f"source too large ({len(source_text)} chars, limit ~{MAX_SOURCE_CHARS}). "
                 f"Use the SPLIT directive or divide the document.")
# --- read existing index to avoid duplicate slugs ---
existing_entities = set()