---
server:
  port: "8080"  # kept as string — consumer presumably reads it as text; confirm
  read_timeout: 35m  # Long for reasoning models (o3, GPT-5.2)
  write_timeout: 35m
  idle_timeout: 120s
  coalescence_window: 50ms  # Batch streaming tokens (0 = disabled)
# Redis connection settings
redis:
  address: "localhost:6379"  # quoted: bare host:port risks YAML 1.1 sexagesimal parse
  username: ""
  password: ""  # NOTE(review): prefer injecting via env/secret store, not VCS
  database: 0
  pool_size: 10
  max_retries: 3
  dial_timeout: 5s
# Local SQLite-style database settings
database:
  path: "./data/ensemble.db"
  max_open_conns: 25
  max_idle_conns: 5
# Response-cache / session-affinity settings
cache:
  enable_session_affinity: true
  max_cache_entries: 100000
  crc_algorithm: crc32
  cache_wait_threshold: 0.25  # >$0.25 estimated value: strong affinity
  load_balance_threshold: 0.05  # <$0.05: prefer least-utilized endpoint
  # Per-provider entry TTLs — assumed to nest under cache; confirm against loader
  ttls:
    anthropic: 8m
    openai: 24h
    gemini: 6m
# Distributed (Redis-backed) rate limiting
rate_limit:
  window_size: 1m
  ttl_seconds: 65
  sync_interval: 1s  # Background Redis sync interval
  default_rpm: 1000
  default_tpm: 1000000
  redis_eval_timeout: 50ms
  redis_rollback_timeout: 25ms
# Provider configs are in separate files under config/providers/
# anthropic.yaml, openai.yaml, gemini.yaml, xai.yaml, openrouter.yaml,
# bedrock.yaml, vertex.yaml, fireworks.yaml, self-hosted-*.yaml
# Per-model streaming timeouts (stall vs overall)
streaming_timeouts:
  "o1":
    stall_timeout: 20m
    overall_timeout: 25m
  "o3":
    stall_timeout: 20m
    overall_timeout: 30m
  "gpt-5":
    stall_timeout: 20m
    overall_timeout: 30m
# Provider HTTP client timeouts
provider_timeouts:
  default: 60s
  bedrock: 90s
  api_call_default: 15m
  api_call_extended: 50m  # GPT-5.2 Pro reasoning
# YAML-driven parameter validation per model pattern
parameter_validation:
  enable: true
  # Parameters unconditionally dropped for matching models
  model_drop_rules:
    "gpt-5": ["temperature"]
    "o1": ["temperature", "top_p"]
  # Parameters dropped only when another parameter is present
  conditional_rules:
    "claude*opus*":
      - if_parameter: "temperature"
        drop_parameters: ["top_p"]