first commit

2026-06-05 15:53:47 +08:00 · 2026-06-05 15:53:47 +08:00 · 18fe5908a1
parent 96ab29eb40
commit 18fe5908a1
6 changed files with 441 additions and 0 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -0,0 +1,10 @@
 __pycache__/
 *.pyc
 *.pyo
 *.pyd
 .git/
 .agents/
 .codex/
 models/
 modelscope_cache/
 logs/
--- a/.env
+++ b/.env
@ -0,0 +1,71 @@
 MODELSCOPE_CACHE=./modelscope_cache
 # 模型名称（可自定义）
 MODEL_ID=Qwen3-9B
 # 模型文件路径
 MODEL_DIR=./models/Qwen3.5-9B
 HOST=0.0.0.0
 PORT=9527
 # GPUs to load the model onto, as a comma-separated list of device indices (e.g. 0,1,2,3,4)
 CUDA_VISIBLE_DEVICES=0
 # 张量并行卡数
 TENSOR_PARALLEL_SIZE=1
 # 上下文长度
 MAX_MODEL_LEN=32768
 # GPU memory utilization ratio. The default is 0.9; the memory left over is allocated to the KV cache to support high concurrency
 GPU_MEMORY_UTILIZATION=0.4
 # 计算精度
 # DTYPE=bfloat16
 # KV Cache 精度（auto/fp8）
 # KV_CACHE_DTYPE=auto
 # 最大并发序列数
 MAX_NUM_SEQS=32
 # 单批最大 token 数，根据并发和实际上下文需求配置，默认自动分配
 MAX_NUM_BATCHED_TOKENSMAX=16384
 # 其他运行开关
 DISABLE_LOG_REQUESTS=False
 ENABLE_LOG_REQUESTS=true
 # vLLM logging level (DEBUG or INFO)
 VLLM_LOGGING_LEVEL=INFO
 # Tool calling 配置，需要和模型配套，否则可能出现工具调用失败
 ENABLE_AUTO_TOOL_CHOICE=true
 TOOL_CALL_PARSER=qwen3_xml
 REASONING_PARSER=qwen3
 # 思考标记开关，QWEN3.5-9B不匹配
 # DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}'
 # 留空时使用模型自带 chat_template；如需官方工具模板可填绝对路径
 # CHAT_TEMPLATE=
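 # enable-chunked-prefill: split prefill into chunks to avoid a sudden spike in memory usage
 # max-num-partial-prefills 1: allow at most 1 request to be in the "chunked prefill in progress" state at any moment, which improves stability.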
 TRUST_REMOTE_CODE=true
 API_KEY=unis123
 # 采样参数
 # TEMPERATURE=1.0
 # TOP_P=0.95
 # TOP_K=64
 LOG_DIR=./logs
 # Auto download model when MODEL_DIR is missing on container start.
 AUTO_DOWNLOAD_MODEL=true
 MODEL_SOURCE=Qwen/Qwen3.5-9B
 DOWNLOAD_CACHE_DIR=./modelscope_cache
 SKIP_MODEL_DOWNLOAD_IF_EXISTS=true
--- a/10
+++ b/10
@ -0,0 +1,10 @@
 # Base image pulled from the Iluvatar registry (the tag suggests a vLLM 0.17.0 build).
 FROM registry.iluvatar.com.cn:10443/customer/sz/vllm0.17.0-4.4.0-x86:v5
 WORKDIR /workspace/vllm
 # Copy the build context into the working directory.
 # NOTE(review): .env is not listed in .dockerignore, so it (including API_KEY) is baked into the image - confirm this is intended.
 COPY . /workspace/vllm
 # ENV_FILE names the dotenv file that serve.py loads at startup.
 ENV ENV_FILE=.env
 # Turn off Python output buffering.
 ENV PYTHONUNBUFFERED=1
 # Start the launcher script.
 CMD ["python", "serve.py"]
--- a/compose.yaml
+++ b/compose.yaml
@ -0,0 +1,29 @@
 # Compose definition for the single vLLM serving container.
 services:
  vllm:
    build:
      context: .
      dockerfile: Dockerfile
    image: local/vllm-qwen3-9b:latest
    container_name: vllm-qwen3-9b
    working_dir: /workspace/vllm
    # Every key in .env is injected into the container environment.
    env_file:
      - .env
    environment:
      # serve.py reads ENV_FILE to locate its dotenv file.
      ENV_FILE: .env
      PYTHONUNBUFFERED: "1"
    # Host network/IPC/PID namespaces are shared with the container; no port mapping is declared.
    network_mode: host
    ipc: host
    pid: host
    # NOTE(review): privileged mode plus all capabilities gives the container broad
    # host access; presumably required for accelerator access - confirm and narrow if possible.
    privileged: true
    cap_add:
      - ALL
    restart: unless-stopped
    volumes:
      # Model files, download cache and logs are bind-mounted from the project directory.
      - ./models:/workspace/vllm/models
      - ./modelscope_cache:/workspace/vllm/modelscope_cache
      - ./logs:/workspace/vllm/logs
      # NOTE(review): host system and data directories are mounted read-write;
      # presumably for driver/kernel-module and data access - confirm each is needed.
      - /usr/src:/usr/src
      - /lib/modules:/lib/modules
      - /dev:/dev
      - /home:/home
      - /data:/data
--- a/model_download.py
+++ b/model_download.py
@ -0,0 +1,113 @@
 from __future__ import annotations
 """
 Standalone model download script.
 Usage:
  python model_download.py
 """
 import os
 from pathlib import Path
 from dotenv import load_dotenv
 # Fallback values used when the environment does not provide a setting.
 # Env file name; load_runtime_env() resolves it against the script directory.
 DEFAULT_ENV_FILE = ".env"
 # Model identifier handed to the downloader when no model id variable is set.
 DEFAULT_MODEL_ID = "Qwen/Qwen3.5-9B"
 # Target directory for the downloaded model files.
 DEFAULT_MODEL_DIR = "./models/Qwen3.5-9B"
 # Cache directory handed to the downloader.
 DEFAULT_CACHE_DIR = "./modelscope_cache"
 def resolve_path(raw: str, base_dir: Path) -> Path:
    """Turn a raw path string into an absolute, resolved path.

    ``~`` is expanded first. An absolute path is resolved as-is; a relative
    path is anchored at ``base_dir`` before being resolved.
    """
    candidate = Path(raw).expanduser()
    anchored = candidate if candidate.is_absolute() else base_dir / candidate
    return anchored.resolve()
 def load_runtime_env(script_dir: Path) -> Path:
    """Load the dotenv file selected by ``ENV_FILE`` and return its path.

    The file name comes from the ``ENV_FILE`` environment variable and falls
    back to ``DEFAULT_ENV_FILE`` when the variable is unset, empty, or only
    whitespace. The name is joined onto ``script_dir`` and resolved.

    Args:
        script_dir: Directory the env file name is resolved against.

    Returns:
        The resolved env file path, whether or not the file exists.
    """
    # Strip before applying the fallback so a whitespace-only ENV_FILE does not
    # collapse to "" and resolve to ``script_dir`` itself (serve.py does the same).
    env_name = os.getenv("ENV_FILE", "").strip() or DEFAULT_ENV_FILE
    env_path = (script_dir / env_name).resolve()
    # A missing env file is tolerated here; only a regular file is loaded.
    if env_path.is_file():
        load_dotenv(env_path)
    return env_path
 def env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean flag from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty, or contains
            only whitespace.

    Returns:
        ``True`` when the stripped, lower-cased value is one of ``1``,
        ``true``, ``yes`` or ``on``; ``False`` for any other non-blank value.
    """
    # Strip first so a whitespace-only value counts as "not set" and yields
    # ``default`` instead of silently evaluating to False.
    raw = os.getenv(name, "").strip()
    if not raw:
        return default
    return raw.lower() in {"1", "true", "yes", "on"}
 def download_model(
    model_id: str,
    model_dir: Path,
    cache_dir: Path,
    revision: str = "",
    skip_if_exists: bool = False,
 ) -> Path:
    """Download a model snapshot with ModelScope into ``model_dir``.

    Args:
        model_id: Model identifier passed to ``snapshot_download``.
        model_dir: Directory the model files are written to (``local_dir``).
        cache_dir: Cache directory passed to ``snapshot_download``.
        revision: Optional revision; only forwarded when non-empty.
        skip_if_exists: When true and ``model_dir`` is a non-empty directory,
            no download is attempted.

    Returns:
        The resolved ``model_dir`` when the download is skipped, otherwise the
        resolved path returned by ``snapshot_download``.

    Raises:
        RuntimeError: If ``modelscope`` cannot be imported when a download is
            actually required.
    """
    # Decide whether to skip BEFORE importing modelscope, so an already
    # downloaded model can be reused on hosts without the downloader installed.
    # ``is_dir()`` also avoids NotADirectoryError from ``iterdir()`` on a file.
    if skip_if_exists and model_dir.is_dir() and any(model_dir.iterdir()):
        print(f"[INFO] model already exists, skip download: {model_dir}")
        return model_dir.resolve()

    try:
        from modelscope.hub.snapshot_download import snapshot_download
    except Exception as exc:
        # Kept broad on purpose: any failure while importing the downloader is
        # reported as a missing/broken dependency, with the cause chained.
        raise RuntimeError(
            "Missing dependencies. Please install first:\n"
            "  pip install -r requirements.txt"
        ) from exc

    model_dir.parent.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] model_id={model_id}")
    print(f"[INFO] model_dir={model_dir}")
    print(f"[INFO] cache_dir={cache_dir}")
    if revision:
        print(f"[INFO] revision={revision}")

    kwargs = {
        "model_id": model_id,
        "local_dir": str(model_dir),
        "cache_dir": str(cache_dir),
    }
    if revision:
        kwargs["revision"] = revision

    downloaded_path = snapshot_download(**kwargs)
    print(f"[OK] download complete: {downloaded_path}")
    return Path(downloaded_path).resolve()
 def main() -> None:
    """Read the download settings from the environment and fetch the model.

    The env file is loaded first. Each setting then has a primary variable,
    a fallback variable, and a built-in default.

    Raises:
        ValueError: If the model id, the target directory, or the cache
            directory ends up as an empty string.
    """
    here = Path(__file__).resolve().parent
    env_file = load_runtime_env(here)
    env = os.environ
    source_id = env.get("DOWNLOAD_MODEL_ID", env.get("MODEL_SOURCE", DEFAULT_MODEL_ID)).strip()
    target_raw = env.get("DOWNLOAD_SAVE_DIR", env.get("MODEL_DIR", DEFAULT_MODEL_DIR)).strip()
    cache_raw = env.get("DOWNLOAD_CACHE_DIR", env.get("MODELSCOPE_CACHE", DEFAULT_CACHE_DIR)).strip()
    pinned_revision = env.get("DOWNLOAD_REVISION", "").strip()
    reuse_existing = env_flag("SKIP_MODEL_DOWNLOAD_IF_EXISTS", True)
    # Checked in this order so the first empty setting is the one reported.
    required = (
        (source_id, "DOWNLOAD_MODEL_ID/MODEL_SOURCE is empty."),
        (target_raw, "DOWNLOAD_SAVE_DIR/MODEL_DIR is empty."),
        (cache_raw, "DOWNLOAD_CACHE_DIR/MODELSCOPE_CACHE is empty."),
    )
    for value, complaint in required:
        if not value:
            raise ValueError(complaint)
    target_dir = resolve_path(target_raw, here)
    cache_root = resolve_path(cache_raw, here)
    print(f"[INFO] env_file={env_file}")
    download_model(
        model_id=source_id,
        model_dir=target_dir,
        cache_dir=cache_root,
        revision=pinned_revision,
        skip_if_exists=reuse_existing,
    )
 # Run the download only when this file is executed as a script; importing the
 # module (serve.py imports download_model from it) must not trigger a download.
 if __name__ == "__main__":
    main()
--- a/serve.py
+++ b/serve.py
@ -0,0 +1,208 @@
 import os
 import shlex
 import subprocess
 import sys
 from pathlib import Path
 from dotenv import load_dotenv
 from model_download import download_model
 # Name of the dotenv file that main() loads, resolved against this script's
 # directory. Override it with the ENV_FILE environment variable.
 DEFAULT_ENV_FILE = ".env"
 def as_bool(value: str) -> bool:
    """Interpret a textual flag: ``1``/``true``/``yes``/``on`` mean True.

    The value is converted with ``str()``, stripped and lower-cased before the
    comparison; anything else (including an empty string) is False.
    """
    token = str(value).strip().lower()
    return token in ("1", "true", "yes", "on")
 def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path:
    """Resolve a directory setting from the environment to an absolute path.

    When ``env_name`` is unset or blank, ``default_relative`` is joined onto
    ``base_dir``. Otherwise the value has ``~`` expanded and, if it is still
    relative, is joined onto ``base_dir``. The result is always resolved.
    """
    configured = os.getenv(env_name, "").strip()
    if configured:
        target = Path(configured).expanduser()
        if not target.is_absolute():
            target = base_dir / target
    else:
        target = base_dir / default_relative
    return target.resolve()
 def resolve_optional_path(raw_path: str, base_dir: Path) -> Path:
    """Resolve ``raw_path`` to an absolute path, anchoring relative values at ``base_dir``.

    ``~`` is expanded before the absolute/relative decision is made.
    """
    expanded = Path(raw_path).expanduser()
    if not expanded.is_absolute():
        expanded = base_dir / expanded
    return expanded.resolve()
 def ensure_model_ready(script_dir: Path, model_dir: Path) -> Path:
    """Return ``model_dir``, downloading the model first when it is missing.

    A non-empty ``model_dir`` is returned untouched. Otherwise the model is
    downloaded, provided ``AUTO_DOWNLOAD_MODEL`` is truthy and ``MODEL_SOURCE``
    is set. The cache directory comes from ``DOWNLOAD_CACHE_DIR``, then
    ``MODELSCOPE_CACHE``, then ``./modelscope_cache``; a relative value is
    resolved against ``script_dir``.

    Raises:
        FileNotFoundError: The model is missing and auto download is disabled.
        ValueError: Auto download is enabled but ``MODEL_SOURCE`` is empty.
    """
    # Fast path: any content in the directory counts as a ready model.
    if model_dir.exists() and any(model_dir.iterdir()):
        return model_dir
    if not as_bool(os.getenv("AUTO_DOWNLOAD_MODEL", "false")):
        raise FileNotFoundError(
            f"Model directory does not exist: {model_dir}\n"
            "Run `python model_download.py` first, or set AUTO_DOWNLOAD_MODEL=true."
        )
    source = os.getenv("MODEL_SOURCE", "").strip()
    if not source:
        raise ValueError(
            "AUTO_DOWNLOAD_MODEL=true but MODEL_SOURCE is empty.\n"
            "Example: MODEL_SOURCE=Qwen/Qwen3.5-9B"
        )
    fallback_cache = os.getenv("MODELSCOPE_CACHE", "./modelscope_cache")
    cache_raw = os.getenv("DOWNLOAD_CACHE_DIR", fallback_cache).strip()
    cache_root = resolve_optional_path(cache_raw, script_dir)
    pinned_revision = os.getenv("DOWNLOAD_REVISION", "").strip()
    print("[INFO] model directory missing, start auto download")
    download_model(
        model_id=source,
        model_dir=model_dir,
        cache_dir=cache_root,
        revision=pinned_revision,
        skip_if_exists=True,
    )
    return model_dir
 def _format_command_for_log(cmd: list) -> str:
    """Return ``cmd`` as one shell-quoted line with the ``--api-key`` value masked."""
    shown = list(cmd)
    for index, item in enumerate(shown[:-1]):
        if item == "--api-key":
            shown[index + 1] = "***"
    return " ".join(shlex.quote(item) for item in shown)


 def main() -> None:
    """Load the env file, make sure the model is present, and launch vLLM.

    Settings are read from the environment after the dotenv file named by
    ``ENV_FILE`` (default ``DEFAULT_ENV_FILE``) has been loaded. They are
    turned into command-line flags for ``vllm.entrypoints.openai.api_server``,
    which runs as a child process until it exits.

    Raises:
        FileNotFoundError: If the env file or a configured ``CHAT_TEMPLATE``
            does not exist (``ensure_model_ready`` may raise it as well).
        subprocess.CalledProcessError: If the vLLM process exits non-zero.
    """
    script_dir = Path(__file__).resolve().parent
    env_path = (script_dir / (os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE)).resolve()
    if not env_path.exists():
        raise FileNotFoundError(f"Environment file does not exist: {env_path}")
    load_dotenv(env_path)

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip()
    model_dir = resolve_path("MODEL_DIR", "models/google_gemma-4-E4B-it", script_dir)
    host = os.getenv("HOST", "0.0.0.0")
    port = os.getenv("PORT", "9527")
    tensor_parallel_size = os.getenv("TENSOR_PARALLEL_SIZE", "1")
    max_model_len = os.getenv("MAX_MODEL_LEN", "32768")
    gpu_memory_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.90")
    trust_remote_code = as_bool(os.getenv("TRUST_REMOTE_CODE", "true"))
    enable_auto_tool_choice = as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true"))
    tool_call_parser = os.getenv("TOOL_CALL_PARSER", "auto").strip()
    reasoning_parser = os.getenv("REASONING_PARSER", "auto").strip()
    # ENABLE_LOG_REQUESTS wins when set; otherwise fall back to the inverse of
    # the DISABLE_LOG_REQUESTS switch.
    enable_log_requests_raw = os.getenv("ENABLE_LOG_REQUESTS", "").strip()
    if enable_log_requests_raw:
        enable_log_requests = as_bool(enable_log_requests_raw)
    else:
        enable_log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false"))
    vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip()
    # NOTE: DEFAULT_CHAT_TEMPLATE_KWARGS is deliberately not read or forwarded;
    # the --default-chat-template-kwargs pass-through was already disabled.
    chat_template = os.getenv("CHAT_TEMPLATE", "").strip()
    api_key = os.getenv("API_KEY", "your-secret-api-key").strip()
    log_dir = resolve_path("LOG_DIR", "logs", script_dir)
    max_num_seqs = os.getenv("MAX_NUM_SEQS", "64").strip()
    # Prefer the correctly spelled MAX_NUM_BATCHED_TOKENS; the historical
    # MAX_NUM_BATCHED_TOKENSMAX key is still honoured so existing env files work.
    max_num_batched_tokens = (
        os.getenv("MAX_NUM_BATCHED_TOKENS", "").strip()
        or os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096").strip()
    )

    model_dir = ensure_model_ready(script_dir, model_dir)
    log_dir.mkdir(parents=True, exist_ok=True)

    if cuda_visible_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    if vllm_logging_level:
        os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level

    # Settings reach vLLM as CLI flags rather than as custom environment keys.
    # NOTE(review): the child process still inherits os.environ as-is, including
    # every key loaded from the env file; nothing is filtered out here.
    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        str(model_dir),
        "--served-model-name",
        os.getenv("MODEL_ID", "google/google_gemma-4-E4B-it"),
        "--host",
        host,
        "--port",
        port,
        "--tensor-parallel-size",
        tensor_parallel_size,
        "--max-model-len",
        max_model_len,
        "--gpu-memory-utilization",
        gpu_memory_utilization,
    ]
    if trust_remote_code:
        cmd.append("--trust-remote-code")
    if enable_log_requests:
        cmd.append("--enable-log-requests")
    if enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")
        # The parser flag is only passed together with auto tool choice.
        if tool_call_parser:
            cmd.extend(["--tool-call-parser", tool_call_parser])
    if reasoning_parser:
        cmd.extend(["--reasoning-parser", reasoning_parser])

    resolved_chat_template: Path | None = None
    if chat_template:
        resolved_chat_template = resolve_optional_path(chat_template, script_dir)
        if not resolved_chat_template.exists():
            raise FileNotFoundError(
                f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n"
                "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template."
            )
    if resolved_chat_template is not None:
        cmd.extend(["--chat-template", str(resolved_chat_template)])
    if api_key:
        cmd.extend(["--api-key", api_key])
    if max_num_seqs:
        cmd.extend(["--max-num-seqs", max_num_seqs])
    if max_num_batched_tokens:
        cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens])
    # Force prefill tuning flags directly in script (do not rely on env parsing).
    cmd.extend(
        [
            "--enable-chunked-prefill",
            "--max-num-partial-prefills=1",
        ]
    )

    print("[INFO] starting vLLM server with command:")
    # The API key is masked so the secret does not end up in container logs.
    print(_format_command_for_log(cmd))
    if enable_auto_tool_choice:
        print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}")
    print(f"[INFO] enable_log_requests={enable_log_requests}")
    if vllm_logging_level:
        print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}")
    if reasoning_parser:
        print(f"[INFO] reasoning_parser={reasoning_parser}")
    if resolved_chat_template is not None:
        print(f"[INFO] chat_template={resolved_chat_template}")
    else:
        print("[INFO] chat_template=(model default)")
    if cuda_visible_devices:
        print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}")
    print(f"[INFO] resolved model_dir={model_dir}")
    print(f"[INFO] resolved log_dir={log_dir}")
    print(f"[INFO] env_file={env_path}")

    # Blocks until the server exits; a non-zero exit status raises.
    subprocess.run(cmd, check=True)
 # Start the server only when this file is executed as a script.
 if __name__ == "__main__":
    main()