import os import shlex import subprocess import sys from pathlib import Path from dotenv import load_dotenv # Default env file. Override with ENV_FILE if needed. DEFAULT_ENV_FILE = ".env.gemma4_26B" def as_bool(value: str) -> bool: return str(value).strip().lower() in {"1", "true", "yes", "on"} def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path: raw = os.getenv(env_name, "").strip() if not raw: return (base_dir / default_relative).resolve() path = Path(raw).expanduser() if path.is_absolute(): return path.resolve() return (base_dir / path).resolve() def resolve_optional_path(raw_path: str, base_dir: Path) -> Path: path = Path(raw_path).expanduser() if path.is_absolute(): return path.resolve() return (base_dir / path).resolve() def main() -> None: script_dir = Path(__file__).resolve().parent env_path = (script_dir / (os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE)).resolve() if not env_path.exists(): raise FileNotFoundError(f"Environment file does not exist: {env_path}") load_dotenv(env_path) cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip() model_dir = resolve_path("MODEL_DIR", "models/google_gemma-4-E4B-it", script_dir) host = os.getenv("HOST", "0.0.0.0") port = os.getenv("PORT", "9527") tensor_parallel_size = os.getenv("TENSOR_PARALLEL_SIZE", "1") max_model_len = os.getenv("MAX_MODEL_LEN", "32768") gpu_memory_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.80") trust_remote_code = as_bool(os.getenv("TRUST_REMOTE_CODE", "true")) enable_auto_tool_choice = as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true")) tool_call_parser = os.getenv("TOOL_CALL_PARSER", "auto").strip() reasoning_parser = os.getenv("REASONING_PARSER", "auto").strip() enable_log_requests_raw = os.getenv("ENABLE_LOG_REQUESTS", "").strip() if enable_log_requests_raw: enable_log_requests = as_bool(enable_log_requests_raw) else: enable_log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false")) vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip() default_chat_template_kwargs = os.getenv( "DEFAULT_CHAT_TEMPLATE_KWARGS", '{"enable_thinking": true}' ).strip() chat_template = os.getenv("CHAT_TEMPLATE", "").strip() api_key = os.getenv("API_KEY", "your-secret-api-key").strip() log_dir = resolve_path("LOG_DIR", "logs", script_dir) max_num_seqs = os.getenv("MAX_NUM_SEQS", "64").strip() max_num_batched_tokens = os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096").strip() if not model_dir.exists(): raise FileNotFoundError( f"Model directory does not exist: {model_dir}\n" "Run `python download_model.py` first." ) log_dir.mkdir(parents=True, exist_ok=True) if cuda_visible_devices: os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices if vllm_logging_level: os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level # Avoid passing non-vLLM env keys through subprocess environment. # These custom keys trigger "Unknown vLLM environment variable" warnings. cmd = [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", str(model_dir), "--served-model-name", os.getenv("MODEL_ID", "google/google_gemma-4-E4B-it"), "--host", host, "--port", port, "--tensor-parallel-size", tensor_parallel_size, "--max-model-len", max_model_len, "--gpu-memory-utilization", gpu_memory_utilization, ] if trust_remote_code: cmd.append("--trust-remote-code") if enable_log_requests: cmd.append("--enable-log-requests") if enable_auto_tool_choice: cmd.append("--enable-auto-tool-choice") if tool_call_parser: cmd.extend(["--tool-call-parser", tool_call_parser]) if reasoning_parser: cmd.extend(["--reasoning-parser", reasoning_parser]) if default_chat_template_kwargs: cmd.extend(["--default-chat-template-kwargs", default_chat_template_kwargs]) resolved_chat_template: Path | None = None if chat_template: resolved_chat_template = resolve_optional_path(chat_template, script_dir) if not resolved_chat_template.exists(): raise FileNotFoundError( f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n" "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template." ) if resolved_chat_template is not None: cmd.extend(["--chat-template", str(resolved_chat_template)]) if api_key: cmd.extend(["--api-key", api_key]) if max_num_seqs: cmd.extend(["--max-num-seqs", max_num_seqs]) if max_num_batched_tokens: cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens]) # Force prefill tuning flags directly in script (do not rely on env parsing). cmd.extend( [ "--enable-chunked-prefill", "--max-num-partial-prefills=1", ] ) print("[INFO] starting vLLM server with command:") print(" ".join(shlex.quote(item) for item in cmd)) if enable_auto_tool_choice: print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}") print(f"[INFO] enable_log_requests={enable_log_requests}") if vllm_logging_level: print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}") if reasoning_parser: print(f"[INFO] reasoning_parser={reasoning_parser}") if resolved_chat_template is not None: print(f"[INFO] chat_template={resolved_chat_template}") else: print("[INFO] chat_template=(model default)") if cuda_visible_devices: print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}") print(f"[INFO] resolved model_dir={model_dir}") print(f"[INFO] resolved log_dir={log_dir}") print(f"[INFO] env_file={env_path}") subprocess.run(cmd, check=True) if __name__ == "__main__": main()