feat(ingest-semantic.py): Add pre-flight context window check
This commit is contained in:
parent
111ffd266a
commit
95b3866549
1 changed file with 9 additions and 0 deletions
|
|
@@ -60,6 +60,15 @@ with open(raw_rel, "r", encoding="utf-8") as fh:
|
|||
# Nothing remains once surrounding whitespace is stripped, so there is no
# content to process: report it under the "preflight" stage.
if source_text.strip() == "":
    die("preflight", "source is empty: " + raw_rel)
|
||||
|
||||
# --- pre-flight check: if the prompt exceeds context window, exit cleanly with stage:input ---
# The token count is only estimated from the character count (~4 chars/token
# is the working assumption for mixed IT/EN text), so the limit is approximate.
CHARS_PER_TOKEN = 4
SAFETY_MARGIN = 4096  # room for system prompt + JSON response
MAX_SOURCE_TOKENS = NUM_CTX - SAFETY_MARGIN
MAX_SOURCE_CHARS = MAX_SOURCE_TOKENS * CHARS_PER_TOKEN

# A context window no larger than the reserved margin leaves no room for any
# source text. Report that as a configuration problem rather than as an
# oversized document: splitting the source could never make it fit.
if MAX_SOURCE_TOKENS <= 0:
    die("preflight", f"context window too small: NUM_CTX={NUM_CTX} must exceed "
                     f"the safety margin of {SAFETY_MARGIN} tokens")

if len(source_text) > MAX_SOURCE_CHARS:
    die("input", f"source too large ({len(source_text)} chars, limit ~{MAX_SOURCE_CHARS}). "
                 "Use the SPLIT directive or divide the document.")
|
||||
|
||||
# --- read existing index to avoid duplicate slugs ---
|
||||
existing_entities = set()
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue