feat(ingest-semantic.py): Add pre-flight context window check
This commit is contained in:
parent
111ffd266a
commit
95b3866549
1 changed file with 9 additions and 0 deletions
|
|
@@ -60,6 +60,15 @@ with open(raw_rel, "r", encoding="utf-8") as fh:
|
||||||
if not source_text.strip():
|
if not source_text.strip():
|
||||||
die("preflight", "source is empty: " + raw_rel)
|
die("preflight", "source is empty: " + raw_rel)
|
||||||
|
|
||||||
|
# --- pre-flight check: if the prompt exceeds the context window, exit cleanly with stage:input ---
|
||||||
|
# Conservative estimate: ~4 chars/token for mixed IT/EN text
|
||||||
|
SAFETY_MARGIN = 4096 # room for system prompt + JSON response
|
||||||
|
MAX_SOURCE_TOKENS = NUM_CTX - SAFETY_MARGIN
|
||||||
|
MAX_SOURCE_CHARS = MAX_SOURCE_TOKENS * 4
|
||||||
|
|
||||||
|
if len(source_text) > MAX_SOURCE_CHARS:
|
||||||
|
die("input", f"source too large ({len(source_text)} chars, limit ~{MAX_SOURCE_CHARS}). "
|
||||||
|
f"Use the SPLIT directive or divide the document.")
|
||||||
|
|
||||||
# --- read existing index to avoid duplicate slugs ---
|
# --- read existing index to avoid duplicate slugs ---
|
||||||
existing_entities = set()
|
existing_entities = set()
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue