# =============================================================================
# Ollama upstream
# =============================================================================
# Base URL for Ollama’s HTTP API; the proxy forwards traffic here.
# Alternative env name: OLLAMA_HOST (OLLAMA_API_BASE wins if both are set).
# Typical: http://localhost:11434 (default Ollama). Remote: http://<host>:11434 or HTTPS if terminated in front.
OLLAMA_API_BASE=http://localhost:11434
# Maximum number of requests the proxy will run against Ollama concurrently.
# Raise for more throughput on strong GPUs; lower if Ollama OOMs or thrashes. Sensible range: 1–16 (hardware-dependent).
OLLAMA_MAX_PARALLEL=3
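# Quick reachability check for the base URL above (run in a shell, not part of
# this file): /api/tags is Ollama's model-list endpoint, so a JSON response
# confirms the proxy can reach it.
#   curl http://localhost:11434/api/tags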
# =============================================================================
# Proxy HTTP server
# =============================================================================
# Address to bind the FastAPI/uvicorn server. 0.0.0.0 = all interfaces; 127.0.0.1 = localhost only (safer on shared hosts).
PROXY_HOST=0.0.0.0
# TCP port for the proxy (dashboard and /proxy/* live here). Avoid colliding with Ollama (11434).
PROXY_PORT=8003
# Seconds to wait for a complete upstream (Ollama) response before failing the request.
# Long generations may need 300–600+; interactive chat often 120–300.
REQUEST_TIMEOUT=300
# =============================================================================
# VRAM / scheduling hints
# =============================================================================
# Total GPU VRAM in megabytes, used by the scheduler to estimate parallel load and swaps.
# Set to your GPU’s capacity (e.g. 8192, 12288, 24576, 49152, 80000 for an 80 GB card).
TOTAL_VRAM_MB=80000
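# On NVIDIA GPUs, one way to read the capacity to put here (run in a shell):
#   nvidia-smi --query-gpu=memory.total --format=csv,noheader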
# Seconds between VRAM usage polls. Lower = more reactive, slightly more overhead; typical 2–10.
VRAM_POLL_INTERVAL=2
# =============================================================================
# Queue priority (lower score = served sooner). Tunable bases and modifiers.
# If unset, the code defaults are 100 / 200 / 400 / 800 for the four PRIORITY_BASE_* vars (this file shows a tighter example).
# =============================================================================
# Base cost when the requested model is already loaded (prefer keeping hot models).
PRIORITY_BASE_LOADED=0
# Base cost when another request can run in parallel without unloading models.
PRIORITY_BASE_PARALLEL=150
# Base cost for a “small” model swap (below PRIORITY_BASE_LARGE_SWAP_THRESHOLD_GB in smart_proxy, default 40 GB).
PRIORITY_BASE_SMALL_SWAP=300
# Base cost for a “large” model swap.
PRIORITY_BASE_LARGE_SWAP=500
# Score adjustment per second a request has waited; negative values raise priority over time (default -1 subtracts 1 from the score per second waited).
PRIORITY_WAIT_TIME_MULTIPLIER=-1
# Penalty multiplier for clients that are rate-limited (queued + recent traffic in RATE_LIMIT_WINDOW); raises effective cost.
PRIORITY_RATE_LIMIT_MULTIPLIER=5
# Seconds over which rate-limit / “recent activity” is evaluated (default in code: 600 = 10 minutes).
RATE_LIMIT_WINDOW=600
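# Worked example with the values above (a sketch; the exact formula lives in
# the scheduler code): a request for an already-loaded model that has waited
# 30 s scores 0 + 30 * -1 = -30, so it is served before a fresh parallel
# request at 150 or a small swap at 300.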
# =============================================================================
# Logging
# =============================================================================
# Application log format: json (structured, good for prod) or human (readable in a terminal).
LOG_FORMAT=json
# Root log level: DEBUG, INFO, WARNING, ERROR, CRITICAL.
LOG_LEVEL=INFO
# Uvicorn/access-style noise: WARNING hides routine health/queue lines; INFO for full access logs when debugging.
ACCESS_LOG_LEVEL=WARNING
# =============================================================================
# Database
# =============================================================================
# sqlite (file, default) or postgres (set DB_HOST, DB_PORT, DB_NAME, DB_USER, DB_PASSWORD).
DB_TYPE=sqlite
# Path to the SQLite database file (ignored when DB_TYPE=postgres).
SQLITE_DB_PATH=./db/smart_proxy.db
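# Example PostgreSQL settings (uncomment and fill in when DB_TYPE=postgres;
# the values below are illustrative placeholders, not code defaults):
# DB_HOST=localhost
# DB_PORT=5432
# DB_NAME=smart_proxy
# DB_USER=smart_proxy
# DB_PASSWORD=changeme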
# SQLAlchemy pool for PostgreSQL: connections to keep open and extra burst connections.
DB_POOL_SIZE=10
DB_MAX_OVERFLOW=20
# Days to retain raw request log rows before cleanup (0 = keep forever, which is the code default if unset).
LOG_RETENTION_DAYS=7
# How long to keep precomputed hourly rollup rows (analytics). Independent of LOG_RETENTION_DAYS.
ANALYTICS_HOURLY_RETENTION_DAYS=8
# How long to keep precomputed daily rollup rows.
ANALYTICS_DAILY_RETENTION_DAYS=91
# For GET /proxy/analytics, use rollup tables when the requested window is at least this many hours wide (faster on large ranges).
ANALYTICS_FROM_ROLLUPS_HOURS_THRESHOLD=72
# Seconds to cache identical /proxy/analytics responses in memory.
ANALYTICS_CACHE_TTL=60
# Parallel fan-out for analytics queries: true/1/yes or false/0/no. If omitted: false for sqlite, true for postgres.
# ANALYTICS_PARALLEL=true
# =============================================================================
# Security (admin API, dashboard auth, Ollama admin routes)
# =============================================================================
# Shared secret for /proxy/admin/* and related auth. If unset, admin endpoints are unprotected (see logs at startup).
# Use a long random string in production; never commit real secrets: copy this file to .env and replace this value there.
PROXY_ADMIN_KEY=changeme
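# One way to generate a suitable random value (run in a shell):
#   openssl rand -hex 32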