From 18fe5908a1389aa96a2e32f418161f22a41fb70d Mon Sep 17 00:00:00 2001 From: Bifang <915779419@qq.com> Date: Fri, 5 Jun 2026 15:53:47 +0800 Subject: [PATCH] first commit --- .dockerignore | 10 +++ .env | 71 ++++++++++++++++ Dockerfile | 10 +++ compose.yaml | 29 +++++++ model_download.py | 113 +++++++++++++++++++++++++ serve.py | 208 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 441 insertions(+) create mode 100644 .dockerignore create mode 100644 .env create mode 100644 Dockerfile create mode 100644 compose.yaml create mode 100644 model_download.py create mode 100644 serve.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..97f3586 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,10 @@ +__pycache__/ +*.pyc +*.pyo +*.pyd +.git/ +.agents/ +.codex/ +models/ +modelscope_cache/ +logs/ diff --git a/.env b/.env new file mode 100644 index 0000000..5a94032 --- /dev/null +++ b/.env @@ -0,0 +1,71 @@ +MODELSCOPE_CACHE=./modelscope_cache + +# 模型名称(可自定义) +MODEL_ID=Qwen3-9B + +# 模型文件路径 +MODEL_DIR=./models/Qwen3.5-9B + +HOST=0.0.0.0 +PORT=9527 + +# 指定加载到哪些显卡0,1,2,3,4 +CUDA_VISIBLE_DEVICES=0 +# 张量并行卡数 +TENSOR_PARALLEL_SIZE=1 + +# 上下文长度 +MAX_MODEL_LEN=32768 + +# 显存占用比例。默认参数0.9,多余显存分配个KV Cache以支持高并发 +GPU_MEMORY_UTILIZATION=0.4 + +# 计算精度 +# DTYPE=bfloat16 + +# KV Cache 精度(auto/fp8) +# KV_CACHE_DTYPE=auto + +# 最大并发序列数 +MAX_NUM_SEQS=32 + +# 单批最大 token 数,根据并发和实际上下文需求配置,默认自动分配 +MAX_NUM_BATCHED_TOKENSMAX=16384 + + +# 其他运行开关 +DISABLE_LOG_REQUESTS=False +ENABLE_LOG_REQUESTS=true + +# VLLM运行模式DEBUG\INFO +VLLM_LOGGING_LEVEL=INFO + +# Tool calling 配置,需要和模型配套,否则可能出现工具调用失败 +ENABLE_AUTO_TOOL_CHOICE=true +TOOL_CALL_PARSER=qwen3_xml +REASONING_PARSER=qwen3 + +# 思考标记开关,QWEN3.5-9B不匹配 +# DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}' + +# 留空时使用模型自带 chat_template;如需官方工具模板可填绝对路径 +# CHAT_TEMPLATE= + +# enable-chunked-prefill分块预填空,避免瞬时大量占用内存 +# max-num-partial-prefills 1限制同一时刻最多只有 1 个 处于“分块预填充中”的请求,提高稳定性。 + +TRUST_REMOTE_CODE=true +API_KEY=unis123 + +# 
采样参数 +# TEMPERATURE=1.0 +# TOP_P=0.95 +# TOP_K=64 + +LOG_DIR=./logs + +# Auto download model when MODEL_DIR is missing on container start. +AUTO_DOWNLOAD_MODEL=true +MODEL_SOURCE=Qwen/Qwen3.5-9B +DOWNLOAD_CACHE_DIR=./modelscope_cache +SKIP_MODEL_DOWNLOAD_IF_EXISTS=true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..792f80b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM registry.iluvatar.com.cn:10443/customer/sz/vllm0.17.0-4.4.0-x86:v5 + +WORKDIR /workspace/vllm + +COPY . /workspace/vllm + +ENV ENV_FILE=.env +ENV PYTHONUNBUFFERED=1 + +CMD ["python", "serve.py"] diff --git a/compose.yaml b/compose.yaml new file mode 100644 index 0000000..a7def9f --- /dev/null +++ b/compose.yaml @@ -0,0 +1,29 @@ +services: + vllm: + build: + context: . + dockerfile: Dockerfile + image: local/vllm-qwen3-9b:latest + container_name: vllm-qwen3-9b + working_dir: /workspace/vllm + env_file: + - .env + environment: + ENV_FILE: .env + PYTHONUNBUFFERED: "1" + network_mode: host + ipc: host + pid: host + privileged: true + cap_add: + - ALL + restart: unless-stopped + volumes: + - ./models:/workspace/vllm/models + - ./modelscope_cache:/workspace/vllm/modelscope_cache + - ./logs:/workspace/vllm/logs + - /usr/src:/usr/src + - /lib/modules:/lib/modules + - /dev:/dev + - /home:/home + - /data:/data diff --git a/model_download.py b/model_download.py new file mode 100644 index 0000000..a3cdaec --- /dev/null +++ b/model_download.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +""" +Standalone model download script. 
# model_download.py -- standalone ModelScope model download script.
#
# Usage:
#     python model_download.py

import os
from pathlib import Path

DEFAULT_ENV_FILE = ".env"
DEFAULT_MODEL_ID = "Qwen/Qwen3.5-9B"
DEFAULT_MODEL_DIR = "./models/Qwen3.5-9B"
DEFAULT_CACHE_DIR = "./modelscope_cache"

# Spellings treated as "true" by env_flag (compared case-insensitively).
TRUTHY_VALUES = frozenset({"1", "true", "yes", "on"})


def resolve_path(raw: str, base_dir: Path) -> Path:
    """Return *raw* as an absolute, normalized path.

    ``~`` is expanded first. A relative path is interpreted relative to
    *base_dir*; an absolute path ignores *base_dir*.
    """
    path = Path(raw).expanduser()
    if not path.is_absolute():
        path = base_dir / path
    return path.resolve()


def first_env(*names: str, default: str = "") -> str:
    """Return the first non-blank value among the environment variables *names*.

    Values are stripped before the check, so a variable that is set but blank
    (for example ``MODEL_DIR=`` in the env file) does not shadow the next
    candidate. *default* is returned when every candidate is unset or blank.
    """
    for name in names:
        value = os.getenv(name, "").strip()
        if value:
            return value
    return default


def load_runtime_env(script_dir: Path) -> Path:
    """Load the env file that sits next to the script, if it exists.

    The file name comes from ``ENV_FILE`` and falls back to
    ``DEFAULT_ENV_FILE`` when that variable is unset or blank.

    Returns:
        The resolved env-file path, whether or not the file was found.
    """
    # Strip before applying the fallback: a whitespace-only ENV_FILE would
    # otherwise resolve to the script directory itself.
    env_name = os.getenv("ENV_FILE", "").strip() or DEFAULT_ENV_FILE
    env_path = (script_dir / env_name).resolve()
    if env_path.is_file():
        # Imported only when there is a file to load.
        from dotenv import load_dotenv

        load_dotenv(env_path)
    return env_path


def env_flag(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean.

    Returns *default* when the variable is unset or blank; otherwise True only
    for the spellings listed in ``TRUTHY_VALUES``.
    """
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    return raw.lower() in TRUTHY_VALUES


def download_model(
    model_id: str,
    model_dir: Path,
    cache_dir: Path,
    revision: str = "",
    skip_if_exists: bool = False,
) -> Path:
    """Download *model_id* into *model_dir* via ModelScope ``snapshot_download``.

    Args:
        model_id: Model identifier passed to ``snapshot_download``.
        model_dir: Target directory, passed as ``local_dir``.
        cache_dir: Cache directory, passed as ``cache_dir``.
        revision: Optional revision; omitted from the call when empty.
        skip_if_exists: When True and *model_dir* is already a non-empty
            directory, return immediately without downloading.

    Returns:
        The resolved path of the model directory.

    Raises:
        RuntimeError: If the ``modelscope`` package cannot be imported.
    """
    # Check the skip condition before importing modelscope so the no-op path
    # does not need the package. is_dir() (rather than exists()) keeps
    # iterdir() from raising when the path is a regular file.
    if skip_if_exists and model_dir.is_dir() and any(model_dir.iterdir()):
        print(f"[INFO] model already exists, skip download: {model_dir}")
        return model_dir.resolve()

    try:
        from modelscope.hub.snapshot_download import snapshot_download
    except ImportError as exc:
        raise RuntimeError(
            "Missing dependencies. Please install first:\n"
            "  pip install modelscope"
        ) from exc

    model_dir.parent.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] model_id={model_id}")
    print(f"[INFO] model_dir={model_dir}")
    print(f"[INFO] cache_dir={cache_dir}")
    if revision:
        print(f"[INFO] revision={revision}")

    kwargs = {
        "model_id": model_id,
        "local_dir": str(model_dir),
        "cache_dir": str(cache_dir),
    }
    if revision:
        kwargs["revision"] = revision

    downloaded_path = snapshot_download(**kwargs)
    print(f"[OK] download complete: {downloaded_path}")
    return Path(downloaded_path).resolve()


def main() -> None:
    """Read download settings from the environment and fetch the model.

    Each setting has a specific variable and a shared fallback
    (``DOWNLOAD_MODEL_ID``/``MODEL_SOURCE``, ``DOWNLOAD_SAVE_DIR``/``MODEL_DIR``,
    ``DOWNLOAD_CACHE_DIR``/``MODELSCOPE_CACHE``). Blank values are skipped, so
    each setting always ends up with the built-in default at worst.
    """
    script_dir = Path(__file__).resolve().parent
    env_path = load_runtime_env(script_dir)

    model_id = first_env("DOWNLOAD_MODEL_ID", "MODEL_SOURCE", default=DEFAULT_MODEL_ID)
    model_dir_raw = first_env("DOWNLOAD_SAVE_DIR", "MODEL_DIR", default=DEFAULT_MODEL_DIR)
    cache_dir_raw = first_env(
        "DOWNLOAD_CACHE_DIR", "MODELSCOPE_CACHE", default=DEFAULT_CACHE_DIR
    )
    revision = os.getenv("DOWNLOAD_REVISION", "").strip()
    skip_if_exists = env_flag("SKIP_MODEL_DOWNLOAD_IF_EXISTS", True)

    # Relative paths from the env file are anchored at the script directory,
    # not at the current working directory.
    model_dir = resolve_path(model_dir_raw, script_dir)
    cache_dir = resolve_path(cache_dir_raw, script_dir)
    print(f"[INFO] env_file={env_path}")
    download_model(
        model_id=model_id,
        model_dir=model_dir,
        cache_dir=cache_dir,
        revision=revision,
        skip_if_exists=skip_if_exists,
    )


if __name__ == "__main__":
    main()
# serve.py -- launch the vLLM OpenAI-compatible API server from env settings.

import os
import shlex
import subprocess
import sys
from pathlib import Path

# Default env file. Override with ENV_FILE if needed.
DEFAULT_ENV_FILE = ".env"

# Fallbacks used when MODEL_DIR / MODEL_ID are unset. They match the defaults
# of model_download.py so both scripts agree on where the model lives.
DEFAULT_MODEL_DIR = "models/Qwen3.5-9B"
DEFAULT_SERVED_MODEL_NAME = "Qwen/Qwen3.5-9B"

# Spellings treated as "true" by as_bool (compared case-insensitively).
TRUTHY_VALUES = frozenset({"1", "true", "yes", "on"})

# Command-line flags whose value must never be written to the log.
SECRET_FLAGS = ("--api-key",)


def as_bool(value: str) -> bool:
    """Return True when *value* spells a truthy flag (see ``TRUTHY_VALUES``)."""
    return str(value).strip().lower() in TRUTHY_VALUES


def env_str(name: str, default: str, *, allow_blank: bool = False) -> str:
    """Return the stripped value of environment variable *name*.

    An unset variable yields *default*. A variable that is set but blank also
    yields *default*, unless *allow_blank* is True; in that case the empty
    string is returned so callers can treat "set to nothing" as "disabled".
    """
    value = os.getenv(name, default).strip()
    if value or allow_blank:
        return value
    return default


def resolve_optional_path(raw_path: str, base_dir: Path) -> Path:
    """Return *raw_path* as an absolute, normalized path.

    ``~`` is expanded; relative paths are anchored at *base_dir*.
    """
    path = Path(raw_path).expanduser()
    if not path.is_absolute():
        path = base_dir / path
    return path.resolve()


def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path:
    """Resolve the path stored in environment variable *env_name*.

    Falls back to *default_relative* (relative to *base_dir*) when the
    variable is unset or blank.
    """
    raw = os.getenv(env_name, "").strip()
    return resolve_optional_path(raw or default_relative, base_dir)


def ensure_model_ready(script_dir: Path, model_dir: Path) -> Path:
    """Make sure *model_dir* holds a model, downloading it when allowed.

    Returns *model_dir* unchanged when it is already a non-empty directory.
    Otherwise a download is started if ``AUTO_DOWNLOAD_MODEL`` is truthy and
    ``MODEL_SOURCE`` names the model.

    Raises:
        FileNotFoundError: The directory is missing or empty and auto download
            is disabled.
        ValueError: Auto download is enabled but ``MODEL_SOURCE`` is blank.
    """
    # is_dir() rather than exists(): iterdir() raises on a regular file.
    if model_dir.is_dir() and any(model_dir.iterdir()):
        return model_dir

    if not as_bool(os.getenv("AUTO_DOWNLOAD_MODEL", "false")):
        raise FileNotFoundError(
            f"Model directory is missing or empty: {model_dir}\n"
            "Run `python model_download.py` first, or set AUTO_DOWNLOAD_MODEL=true."
        )

    model_source = os.getenv("MODEL_SOURCE", "").strip()
    if not model_source:
        raise ValueError(
            "AUTO_DOWNLOAD_MODEL=true but MODEL_SOURCE is empty.\n"
            "Example: MODEL_SOURCE=Qwen/Qwen3.5-9B"
        )

    # Blank values fall through to the next candidate; resolving an empty
    # string would silently place the cache in the script directory.
    cache_dir_raw = (
        os.getenv("DOWNLOAD_CACHE_DIR", "").strip()
        or os.getenv("MODELSCOPE_CACHE", "").strip()
        or "./modelscope_cache"
    )
    revision = os.getenv("DOWNLOAD_REVISION", "").strip()

    # Imported here so the downloader is only loaded when a download is needed.
    from model_download import download_model

    print("[INFO] model directory missing, start auto download")
    download_model(
        model_id=model_source,
        model_dir=model_dir,
        cache_dir=resolve_optional_path(cache_dir_raw, script_dir),
        revision=revision,
        skip_if_exists=True,
    )
    return model_dir


def build_vllm_command(model_dir: Path, script_dir: Path) -> list[str]:
    """Assemble the ``vllm.entrypoints.openai.api_server`` command line.

    All options are read from the environment. *script_dir* anchors a relative
    ``CHAT_TEMPLATE`` path.

    Raises:
        FileNotFoundError: ``CHAT_TEMPLATE`` is set but the file does not exist.
    """
    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        str(model_dir),
        "--served-model-name",
        env_str("MODEL_ID", DEFAULT_SERVED_MODEL_NAME),
        "--host",
        env_str("HOST", "0.0.0.0"),
        "--port",
        env_str("PORT", "9527"),
        "--tensor-parallel-size",
        env_str("TENSOR_PARALLEL_SIZE", "1"),
        "--max-model-len",
        env_str("MAX_MODEL_LEN", "32768"),
        "--gpu-memory-utilization",
        env_str("GPU_MEMORY_UTILIZATION", "0.90"),
    ]

    if as_bool(os.getenv("TRUST_REMOTE_CODE", "true")):
        cmd.append("--trust-remote-code")

    # ENABLE_LOG_REQUESTS wins when set; otherwise the legacy
    # DISABLE_LOG_REQUESTS switch is honoured (logging on by default).
    explicit_log_requests = os.getenv("ENABLE_LOG_REQUESTS", "").strip()
    if explicit_log_requests:
        log_requests = as_bool(explicit_log_requests)
    else:
        log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false"))
    if log_requests:
        cmd.append("--enable-log-requests")

    # For the options below a blank value means "do not pass the flag".
    if as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true")):
        cmd.append("--enable-auto-tool-choice")
        tool_call_parser = env_str("TOOL_CALL_PARSER", "auto", allow_blank=True)
        if tool_call_parser:
            cmd.extend(["--tool-call-parser", tool_call_parser])

    reasoning_parser = env_str("REASONING_PARSER", "auto", allow_blank=True)
    if reasoning_parser:
        cmd.extend(["--reasoning-parser", reasoning_parser])

    # NOTE: DEFAULT_CHAT_TEMPLATE_KWARGS is deliberately not forwarded; the
    # original script had this option disabled.

    chat_template = os.getenv("CHAT_TEMPLATE", "").strip()
    if chat_template:
        resolved_chat_template = resolve_optional_path(chat_template, script_dir)
        if not resolved_chat_template.exists():
            raise FileNotFoundError(
                f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n"
                "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template."
            )
        cmd.extend(["--chat-template", str(resolved_chat_template)])

    api_key = env_str("API_KEY", "your-secret-api-key", allow_blank=True)
    if api_key:
        cmd.extend(["--api-key", api_key])

    max_num_seqs = env_str("MAX_NUM_SEQS", "64", allow_blank=True)
    if max_num_seqs:
        cmd.extend(["--max-num-seqs", max_num_seqs])

    # Prefer the correctly spelled key; keep the historical misspelling
    # (MAX_NUM_BATCHED_TOKENSMAX) working for existing env files.
    max_num_batched_tokens = os.getenv("MAX_NUM_BATCHED_TOKENS")
    if max_num_batched_tokens is None:
        max_num_batched_tokens = os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096")
    max_num_batched_tokens = max_num_batched_tokens.strip()
    if max_num_batched_tokens:
        cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens])

    # Force prefill tuning flags directly in script (do not rely on env parsing).
    cmd.extend(["--enable-chunked-prefill", "--max-num-partial-prefills=1"])
    return cmd


def redact_command(cmd: list[str], secret_flags: tuple[str, ...] = SECRET_FLAGS) -> list[str]:
    """Return a copy of *cmd* that is safe to log.

    The argument following any flag in *secret_flags* is replaced by ``***``;
    the ``--flag=value`` spelling is handled as well.
    """
    redacted = []
    hide_next = False
    for item in cmd:
        if hide_next:
            redacted.append("***")
            hide_next = False
            continue
        flag, sep, _ = item.partition("=")
        if sep and flag in secret_flags:
            redacted.append(f"{flag}=***")
            continue
        hide_next = item in secret_flags
        redacted.append(item)
    return redacted


def option_value(cmd: list[str], flag: str) -> str:
    """Return the argument that follows *flag* in *cmd*, or "" when absent."""
    try:
        return cmd[cmd.index(flag) + 1]
    except (ValueError, IndexError):
        return ""


def main() -> None:
    """Load the env file, make sure the model is present, then run vLLM.

    Raises:
        FileNotFoundError: The env file, the model directory (with auto
            download disabled) or the chat template is missing.
        subprocess.CalledProcessError: The vLLM server exits with a non-zero
            status.
    """
    script_dir = Path(__file__).resolve().parent
    env_name = os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE
    env_path = (script_dir / env_name).resolve()
    if not env_path.exists():
        raise FileNotFoundError(f"Environment file does not exist: {env_path}")

    # Imported only once the env file is known to exist.
    from dotenv import load_dotenv

    load_dotenv(env_path)

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip()
    vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip()
    model_dir = resolve_path("MODEL_DIR", DEFAULT_MODEL_DIR, script_dir)
    log_dir = resolve_path("LOG_DIR", "logs", script_dir)

    model_dir = ensure_model_ready(script_dir, model_dir)
    log_dir.mkdir(parents=True, exist_ok=True)

    # subprocess.run() below passes no env=, so the child inherits os.environ;
    # exporting the defaults here is how they reach the vLLM process.
    if cuda_visible_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    if vllm_logging_level:
        os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level

    cmd = build_vllm_command(model_dir, script_dir)

    print("[INFO] starting vLLM server with command:")
    # Log the redacted form so the API key does not end up in container logs.
    print(" ".join(shlex.quote(item) for item in redact_command(cmd)))
    if "--enable-auto-tool-choice" in cmd:
        tool_call_parser = option_value(cmd, "--tool-call-parser")
        print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}")
    print(f"[INFO] enable_log_requests={'--enable-log-requests' in cmd}")
    if vllm_logging_level:
        print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}")
    reasoning_parser = option_value(cmd, "--reasoning-parser")
    if reasoning_parser:
        print(f"[INFO] reasoning_parser={reasoning_parser}")
    chat_template = option_value(cmd, "--chat-template")
    if chat_template:
        print(f"[INFO] chat_template={chat_template}")
    else:
        print("[INFO] chat_template=(model default)")
    if cuda_visible_devices:
        print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}")
    print(f"[INFO] resolved model_dir={model_dir}")
    print(f"[INFO] resolved log_dir={log_dir}")
    print(f"[INFO] env_file={env_path}")

    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    main()