first commit
parent
96ab29eb40
commit
18fe5908a1
|
|
@ -0,0 +1,10 @@
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
.git/
|
||||||
|
.agents/
|
||||||
|
.codex/
|
||||||
|
models/
|
||||||
|
modelscope_cache/
|
||||||
|
logs/
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
MODELSCOPE_CACHE=./modelscope_cache
|
||||||
|
|
||||||
|
# 模型名称(可自定义)
|
||||||
|
MODEL_ID=Qwen3-9B
|
||||||
|
|
||||||
|
# 模型文件路径
|
||||||
|
MODEL_DIR=./models/Qwen3.5-9B
|
||||||
|
|
||||||
|
HOST=0.0.0.0
|
||||||
|
PORT=9527
|
||||||
|
|
||||||
|
# 指定加载到哪些显卡0,1,2,3,4
|
||||||
|
CUDA_VISIBLE_DEVICES=0
|
||||||
|
# 张量并行卡数
|
||||||
|
TENSOR_PARALLEL_SIZE=1
|
||||||
|
|
||||||
|
# 上下文长度
|
||||||
|
MAX_MODEL_LEN=32768
|
||||||
|
|
||||||
|
# 显存占用比例。默认参数0.9,多余显存分配给KV Cache以支持高并发
|
||||||
|
GPU_MEMORY_UTILIZATION=0.4
|
||||||
|
|
||||||
|
# 计算精度
|
||||||
|
# DTYPE=bfloat16
|
||||||
|
|
||||||
|
# KV Cache 精度(auto/fp8)
|
||||||
|
# KV_CACHE_DTYPE=auto
|
||||||
|
|
||||||
|
# 最大并发序列数
|
||||||
|
MAX_NUM_SEQS=32
|
||||||
|
|
||||||
|
# 单批最大 token 数,根据并发和实际上下文需求配置,默认自动分配
|
||||||
|
MAX_NUM_BATCHED_TOKENSMAX=16384
|
||||||
|
|
||||||
|
|
||||||
|
# 其他运行开关
|
||||||
|
DISABLE_LOG_REQUESTS=False
|
||||||
|
ENABLE_LOG_REQUESTS=true
|
||||||
|
|
||||||
|
# vLLM 日志级别(DEBUG/INFO)
|
||||||
|
VLLM_LOGGING_LEVEL=INFO
|
||||||
|
|
||||||
|
# Tool calling 配置,需要和模型配套,否则可能出现工具调用失败
|
||||||
|
ENABLE_AUTO_TOOL_CHOICE=true
|
||||||
|
TOOL_CALL_PARSER=qwen3_xml
|
||||||
|
REASONING_PARSER=qwen3
|
||||||
|
|
||||||
|
# 思考标记开关,QWEN3.5-9B不匹配
|
||||||
|
# DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}'
|
||||||
|
|
||||||
|
# 留空时使用模型自带 chat_template;如需官方工具模板可填绝对路径
|
||||||
|
# CHAT_TEMPLATE=
|
||||||
|
|
||||||
|
# enable-chunked-prefill分块预填充,避免瞬时大量占用内存
|
||||||
|
# max-num-partial-prefills 1限制同一时刻最多只有 1 个 处于“分块预填充中”的请求,提高稳定性。
|
||||||
|
|
||||||
|
TRUST_REMOTE_CODE=true
|
||||||
|
API_KEY=unis123
|
||||||
|
|
||||||
|
# 采样参数
|
||||||
|
# TEMPERATURE=1.0
|
||||||
|
# TOP_P=0.95
|
||||||
|
# TOP_K=64
|
||||||
|
|
||||||
|
LOG_DIR=./logs
|
||||||
|
|
||||||
|
# Auto download model when MODEL_DIR is missing on container start.
|
||||||
|
AUTO_DOWNLOAD_MODEL=true
|
||||||
|
MODEL_SOURCE=Qwen/Qwen3.5-9B
|
||||||
|
DOWNLOAD_CACHE_DIR=./modelscope_cache
|
||||||
|
SKIP_MODEL_DOWNLOAD_IF_EXISTS=true
|
||||||
|
|
@ -0,0 +1,10 @@
|
||||||
|
FROM registry.iluvatar.com.cn:10443/customer/sz/vllm0.17.0-4.4.0-x86:v5
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
COPY . /workspace/vllm
|
||||||
|
|
||||||
|
ENV ENV_FILE=.env
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
CMD ["python", "serve.py"]
|
||||||
|
|
@ -0,0 +1,29 @@
|
||||||
|
services:
|
||||||
|
vllm:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
image: local/vllm-qwen3-9b:latest
|
||||||
|
container_name: vllm-qwen3-9b
|
||||||
|
working_dir: /workspace/vllm
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
environment:
|
||||||
|
ENV_FILE: .env
|
||||||
|
PYTHONUNBUFFERED: "1"
|
||||||
|
network_mode: host
|
||||||
|
ipc: host
|
||||||
|
pid: host
|
||||||
|
privileged: true
|
||||||
|
cap_add:
|
||||||
|
- ALL
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ./models:/workspace/vllm/models
|
||||||
|
- ./modelscope_cache:/workspace/vllm/modelscope_cache
|
||||||
|
- ./logs:/workspace/vllm/logs
|
||||||
|
- /usr/src:/usr/src
|
||||||
|
- /lib/modules:/lib/modules
|
||||||
|
- /dev:/dev
|
||||||
|
- /home:/home
|
||||||
|
- /data:/data
|
||||||
|
|
@ -0,0 +1,113 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
"""
|
||||||
|
Standalone model download script.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python model_download.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
DEFAULT_ENV_FILE = ".env"
|
||||||
|
DEFAULT_MODEL_ID = "Qwen/Qwen3.5-9B"
|
||||||
|
DEFAULT_MODEL_DIR = "./models/Qwen3.5-9B"
|
||||||
|
DEFAULT_CACHE_DIR = "./modelscope_cache"
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_path(raw: str, base_dir: Path) -> Path:
    """Turn a path string into an absolute, resolved ``Path``.

    ``~`` is expanded first. An absolute input is resolved as-is; a relative
    input is anchored at ``base_dir`` before being resolved.
    """
    candidate = Path(raw).expanduser()
    anchored = candidate if candidate.is_absolute() else base_dir / candidate
    return anchored.resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def load_runtime_env(script_dir: Path) -> Path:
    """Load variables from the env file located relative to ``script_dir``.

    The file name is read from ``ENV_FILE`` and falls back to
    ``DEFAULT_ENV_FILE`` when that variable is unset, empty, or only
    whitespace.

    Args:
        script_dir: Directory the env file name is resolved against.

    Returns:
        The resolved env file path, whether or not a file exists there.
    """
    # Strip before applying the fallback: a whitespace-only ENV_FILE would
    # otherwise collapse to "" and make env_path point at script_dir itself.
    env_name = os.getenv("ENV_FILE", "").strip() or DEFAULT_ENV_FILE
    env_path = (script_dir / env_name).resolve()
    # is_file() rather than exists() so a directory is never handed to dotenv.
    if env_path.is_file():
        load_dotenv(env_path)
    return env_path
|
||||||
|
|
||||||
|
|
||||||
|
def env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean switch from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty, or only
            whitespace.

    Returns:
        True when the value (case-insensitive, surrounding whitespace ignored)
        is one of ``1``, ``true``, ``yes``, ``on``; False for any other
        non-blank value; ``default`` otherwise.
    """
    # Strip first so a blank-but-not-empty value falls back to the default
    # instead of being read as an explicit False.
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    return raw.lower() in {"1", "true", "yes", "on"}
|
||||||
|
|
||||||
|
|
||||||
|
def download_model(
    model_id: str,
    model_dir: Path,
    cache_dir: Path,
    revision: str = "",
    skip_if_exists: bool = False,
) -> Path:
    """Download a model snapshot via ModelScope into ``model_dir``.

    Args:
        model_id: Identifier passed to ``snapshot_download`` as ``model_id``.
        model_dir: Target directory, passed as ``local_dir``.
        cache_dir: Cache directory, passed as ``cache_dir``; created if missing.
        revision: Optional revision; only forwarded when non-empty.
        skip_if_exists: When True and ``model_dir`` is a non-empty directory,
            return immediately without downloading.

    Returns:
        The resolved ``model_dir`` when the download is skipped, otherwise the
        resolved path returned by ``snapshot_download``.

    Raises:
        RuntimeError: If ``modelscope`` cannot be imported when a download is
            actually required.
    """
    model_dir.parent.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Decide whether to skip *before* importing modelscope, so an existing
    # model can be reused on machines where the downloader is not installed.
    # is_dir() guards iterdir() against model_dir being a regular file.
    if skip_if_exists and model_dir.is_dir() and any(model_dir.iterdir()):
        print(f"[INFO] model already exists, skip download: {model_dir}")
        return model_dir.resolve()

    try:
        from modelscope.hub.snapshot_download import snapshot_download
    except ImportError as exc:
        raise RuntimeError(
            "Missing dependencies. Please install first:\n"
            "  pip install -r requirements.txt"
        ) from exc

    print(f"[INFO] model_id={model_id}")
    print(f"[INFO] model_dir={model_dir}")
    print(f"[INFO] cache_dir={cache_dir}")
    if revision:
        print(f"[INFO] revision={revision}")

    kwargs = {
        "model_id": model_id,
        "local_dir": str(model_dir),
        "cache_dir": str(cache_dir),
    }
    if revision:
        kwargs["revision"] = revision

    downloaded_path = snapshot_download(**kwargs)
    print(f"[OK] download complete: {downloaded_path}")
    return Path(downloaded_path).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Read download settings from the environment and fetch the model.

    Each setting is looked up under a download-specific variable first, then
    a shared variable, then a built-in default. Any setting that ends up
    empty after stripping raises ``ValueError``.
    """
    script_dir = Path(__file__).resolve().parent
    env_path = load_runtime_env(script_dir)

    # (preferred variable, fallback variable, built-in default)
    lookups = (
        ("DOWNLOAD_MODEL_ID", "MODEL_SOURCE", DEFAULT_MODEL_ID),
        ("DOWNLOAD_SAVE_DIR", "MODEL_DIR", DEFAULT_MODEL_DIR),
        ("DOWNLOAD_CACHE_DIR", "MODELSCOPE_CACHE", DEFAULT_CACHE_DIR),
    )
    settings = [
        os.getenv(preferred, os.getenv(fallback, builtin)).strip()
        for preferred, fallback, builtin in lookups
    ]
    revision = os.getenv("DOWNLOAD_REVISION", "").strip()
    skip_if_exists = env_flag("SKIP_MODEL_DOWNLOAD_IF_EXISTS", True)

    # Validate in declaration order so the first empty setting is reported.
    for (preferred, fallback, _), value in zip(lookups, settings):
        if not value:
            raise ValueError(f"{preferred}/{fallback} is empty.")

    model_id, model_dir_raw, cache_dir_raw = settings
    model_dir = resolve_path(model_dir_raw, script_dir)
    cache_dir = resolve_path(cache_dir_raw, script_dir)
    print(f"[INFO] env_file={env_path}")
    download_model(
        model_id=model_id,
        model_dir=model_dir,
        cache_dir=cache_dir,
        revision=revision,
        skip_if_exists=skip_if_exists,
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -0,0 +1,208 @@
|
||||||
|
import os
|
||||||
|
import shlex
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from model_download import download_model
|
||||||
|
|
||||||
|
# Default env file. Override with ENV_FILE if needed.
|
||||||
|
DEFAULT_ENV_FILE = ".env"
|
||||||
|
|
||||||
|
|
||||||
|
def as_bool(value: str) -> bool:
    """Interpret ``value`` as a boolean switch.

    The value is converted with ``str()``, stripped and lower-cased; only
    ``1``, ``true``, ``yes`` and ``on`` count as True.
    """
    normalized = str(value).strip().lower()
    return normalized in ("1", "true", "yes", "on")
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path:
    """Resolve a directory setting read from the environment.

    When ``env_name`` is unset or blank, ``default_relative`` is used under
    ``base_dir``. Otherwise the configured value has ``~`` expanded and is
    anchored at ``base_dir`` unless it is already absolute.
    """
    configured = os.getenv(env_name, "").strip()
    if configured:
        target = Path(configured).expanduser()
    else:
        target = Path(default_relative)
    # Joining onto base_dir leaves an absolute target unchanged.
    return (base_dir / target).resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_optional_path(raw_path: str, base_dir: Path) -> Path:
    """Resolve ``raw_path`` to an absolute path.

    ``~`` is expanded; a relative result is placed under ``base_dir`` before
    the final ``resolve()``.
    """
    expanded = Path(raw_path).expanduser()
    if not expanded.is_absolute():
        expanded = base_dir / expanded
    return expanded.resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_model_ready(script_dir: Path, model_dir: Path) -> Path:
    """Return ``model_dir``, downloading the model into it first if needed.

    A ``model_dir`` that exists and contains at least one entry is returned
    untouched. Otherwise a download is attempted, which requires
    ``AUTO_DOWNLOAD_MODEL`` to be truthy and ``MODEL_SOURCE`` to be set.

    Raises:
        FileNotFoundError: The model is missing and auto download is off.
        ValueError: Auto download is on but ``MODEL_SOURCE`` is empty.
    """
    already_present = model_dir.exists() and any(model_dir.iterdir())
    if already_present:
        return model_dir

    if not as_bool(os.getenv("AUTO_DOWNLOAD_MODEL", "false")):
        raise FileNotFoundError(
            f"Model directory does not exist: {model_dir}\n"
            "Run `python model_download.py` first, or set AUTO_DOWNLOAD_MODEL=true."
        )

    source = os.getenv("MODEL_SOURCE", "").strip()
    if not source:
        raise ValueError(
            "AUTO_DOWNLOAD_MODEL=true but MODEL_SOURCE is empty.\n"
            "Example: MODEL_SOURCE=Qwen/Qwen3.5-9B"
        )

    # DOWNLOAD_CACHE_DIR wins over MODELSCOPE_CACHE; both fall back to a
    # directory next to the script.
    cache_setting = os.getenv(
        "DOWNLOAD_CACHE_DIR",
        os.getenv("MODELSCOPE_CACHE", "./modelscope_cache"),
    ).strip()
    wanted_revision = os.getenv("DOWNLOAD_REVISION", "").strip()
    cache_dir = resolve_optional_path(cache_setting, script_dir)

    print("[INFO] model directory missing, start auto download")
    download_model(
        model_id=source,
        model_dir=model_dir,
        cache_dir=cache_dir,
        revision=wanted_revision,
        skip_if_exists=True,
    )
    return model_dir
|
||||||
|
|
||||||
|
|
||||||
|
def _redact_command(cmd: list, secret_flags: tuple = ("--api-key",)) -> list:
    """Return a copy of ``cmd`` with the value after each secret flag masked."""
    redacted = []
    mask_next = False
    for item in cmd:
        redacted.append("***" if mask_next else item)
        # A value that was just masked is never itself treated as a flag.
        mask_next = item in secret_flags and not mask_next
    return redacted


def main() -> None:
    """Load the env file, make sure the model is present, and run vLLM.

    Settings are read from the environment after loading the env file named
    by ``ENV_FILE`` (default ``DEFAULT_ENV_FILE``) next to this script. The
    vLLM OpenAI-compatible API server is then started as a subprocess and
    this function blocks until it exits.

    Raises:
        FileNotFoundError: The env file or the configured ``CHAT_TEMPLATE``
            does not exist (``ensure_model_ready`` may raise it as well).
        subprocess.CalledProcessError: The server exits with a non-zero code.
    """
    script_dir = Path(__file__).resolve().parent
    env_path = (script_dir / (os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE)).resolve()
    if not env_path.exists():
        raise FileNotFoundError(f"Environment file does not exist: {env_path}")
    load_dotenv(env_path)

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip()
    model_dir = resolve_path("MODEL_DIR", "models/google_gemma-4-E4B-it", script_dir)
    host = os.getenv("HOST", "0.0.0.0")
    port = os.getenv("PORT", "9527")
    tensor_parallel_size = os.getenv("TENSOR_PARALLEL_SIZE", "1")
    max_model_len = os.getenv("MAX_MODEL_LEN", "32768")
    gpu_memory_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.90")
    trust_remote_code = as_bool(os.getenv("TRUST_REMOTE_CODE", "true"))
    enable_auto_tool_choice = as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true"))
    tool_call_parser = os.getenv("TOOL_CALL_PARSER", "auto").strip()
    reasoning_parser = os.getenv("REASONING_PARSER", "auto").strip()

    # ENABLE_LOG_REQUESTS takes precedence when set; otherwise the older
    # DISABLE_LOG_REQUESTS switch is honoured (inverted).
    enable_log_requests_raw = os.getenv("ENABLE_LOG_REQUESTS", "").strip()
    if enable_log_requests_raw:
        enable_log_requests = as_bool(enable_log_requests_raw)
    else:
        enable_log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false"))

    vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip()

    chat_template = os.getenv("CHAT_TEMPLATE", "").strip()

    api_key = os.getenv("API_KEY", "your-secret-api-key").strip()
    log_dir = resolve_path("LOG_DIR", "logs", script_dir)
    max_num_seqs = os.getenv("MAX_NUM_SEQS", "64").strip()
    # The variable name (with its trailing "MAX") matches the key used in .env.
    max_num_batched_tokens = os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096").strip()

    model_dir = ensure_model_ready(script_dir, model_dir)

    # The directory is only created here; nothing in this function writes to it.
    log_dir.mkdir(parents=True, exist_ok=True)

    if cuda_visible_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    if vllm_logging_level:
        os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level
    # NOTE(review): an earlier comment here said custom (non-vLLM) env keys
    # should not be passed to the subprocess, but the child started below
    # inherits the full environment. Confirm whether filtering is needed.

    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        str(model_dir),
        "--served-model-name",
        os.getenv("MODEL_ID", "google/google_gemma-4-E4B-it"),
        "--host",
        host,
        "--port",
        port,
        "--tensor-parallel-size",
        tensor_parallel_size,
        "--max-model-len",
        max_model_len,
        "--gpu-memory-utilization",
        gpu_memory_utilization,
    ]

    if trust_remote_code:
        cmd.append("--trust-remote-code")

    if enable_log_requests:
        cmd.append("--enable-log-requests")

    if enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")
        if tool_call_parser:
            cmd.extend(["--tool-call-parser", tool_call_parser])

    if reasoning_parser:
        cmd.extend(["--reasoning-parser", reasoning_parser])

    # NOTE: DEFAULT_CHAT_TEMPLATE_KWARGS is deliberately not forwarded as
    # --default-chat-template-kwargs; that flag was already disabled here.

    resolved_chat_template: Path | None = None
    if chat_template:
        resolved_chat_template = resolve_optional_path(chat_template, script_dir)
        if not resolved_chat_template.exists():
            raise FileNotFoundError(
                f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n"
                "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template."
            )

    if resolved_chat_template is not None:
        cmd.extend(["--chat-template", str(resolved_chat_template)])

    if api_key:
        cmd.extend(["--api-key", api_key])

    if max_num_seqs:
        cmd.extend(["--max-num-seqs", max_num_seqs])

    if max_num_batched_tokens:
        cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens])

    # Force prefill tuning flags directly in script (do not rely on env parsing).
    cmd.extend(
        [
            "--enable-chunked-prefill",
            "--max-num-partial-prefills=1",
        ]
    )

    print("[INFO] starting vLLM server with command:")
    # Log a redacted copy so the API key never lands in container/stdout logs.
    print(" ".join(shlex.quote(item) for item in _redact_command(cmd)))
    if enable_auto_tool_choice:
        print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}")
    print(f"[INFO] enable_log_requests={enable_log_requests}")
    if vllm_logging_level:
        print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}")
    if reasoning_parser:
        print(f"[INFO] reasoning_parser={reasoning_parser}")
    if resolved_chat_template is not None:
        print(f"[INFO] chat_template={resolved_chat_template}")
    else:
        print("[INFO] chat_template=(model default)")
    if cuda_visible_devices:
        print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}")
    print(f"[INFO] resolved model_dir={model_dir}")
    print(f"[INFO] resolved log_dir={log_dir}")
    print(f"[INFO] env_file={env_path}")

    subprocess.run(cmd, check=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Reference in New Issue