first commit

2026-06-10 09:40:21 +08:00 · 2026-06-10 09:40:21 +08:00 · 2f630641af
commit 2f630641af
5 changed files with 400 additions and 0 deletions
--- a/.env.gemma4_26B
+++ b/.env.gemma4_26B
@ -0,0 +1,60 @@
+MODELSCOPE_CACHE=./modelscope_cache
+
+MODEL_ID=gemma-4-26B
+MODEL_DIR=./models/gemma-4-26B-A4B-it-FP8
+PORT=9527
+
+# 指定加载到哪些显卡0,1,2,3,4
+CUDA_VISIBLE_DEVICES=0
+# 张量并行卡数
+TENSOR_PARALLEL_SIZE=1
+
+# 上下文长度
+MAX_MODEL_LEN=81920
+
+# 显存占用比例。默认参数0.9，多余显存分配给KV Cache以支持高并发
+GPU_MEMORY_UTILIZATION=0.30
+
+# 计算精度
+# DTYPE=bfloat16
+
+# KV Cache 精度（auto/fp8）
+# KV_CACHE_DTYPE=auto
+
+# 最大并发序列数
+MAX_NUM_SEQS=64
+
+# 单批最大 token 数，根据并发和实际上下文需求配置；未设置时 serve.py 默认使用 4096
+MAX_NUM_BATCHED_TOKENSMAX=8192
+
+
+# 其他运行开关
+DISABLE_LOG_REQUESTS=False
+ENABLE_LOG_REQUESTS=true
+
+# vLLM 日志级别（DEBUG/INFO）
+VLLM_LOGGING_LEVEL=INFO
+
+# Tool calling 配置，需要和模型配套，否则可能出现工具调用失败
+ENABLE_AUTO_TOOL_CHOICE=true
+TOOL_CALL_PARSER=gemma4
+REASONING_PARSER=gemma4
+
+# Gemma专用推理链额外标记开关
+DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}'
+
+# 留空时使用模型自带 chat_template；如需官方工具模板可填绝对路径
+# CHAT_TEMPLATE=
+
+# 以下两个参数在 serve.py 中固定启用，不从本文件读取：
+# enable-chunked-prefill：分块预填充，避免瞬时大量占用内存
+# max-num-partial-prefills 1：限制同一时刻最多只有 1 个处于“分块预填充中”的请求，提高稳定性。
+
+TRUST_REMOTE_CODE=true
+API_KEY=unis123
+
+# 采样参数
+# TEMPERATURE=1.0
+# TOP_P=0.95
+# TOP_K=64
+
+LOG_DIR=./logs
--- a/download_model.py
+++ b/download_model.py
@ -0,0 +1,75 @@
+from __future__ import annotations
+
+"""
+Standalone model download script.
+
+Usage:
+  python download_model.py
+"""
+
+from pathlib import Path
+
+# =========================
+# User Config
+# Modify these variables directly, then run:
+#   python download_model.py
+# =========================
+# Model identifier passed to snapshot_download as `model_id`.
+DOWNLOAD_MODEL_ID = "kuohao/gemma-4-26B-A4B-it-FP8"
+# Directory passed to snapshot_download as `local_dir`; a relative value is
+# resolved against this script's directory.
+DOWNLOAD_SAVE_DIR = "./models/gemma-4-26B-A4B-it-FP8"
+# Directory passed to snapshot_download as `cache_dir`; a relative value is
+# resolved against this script's directory.
+DOWNLOAD_CACHE_DIR = "./modelscope_cache"
+# Optional revision; when left empty no `revision` argument is passed.
+DOWNLOAD_REVISION = ""
+
+def resolve_path(raw: str, base_dir: Path) -> Path:
+    path = Path(raw).expanduser()
+    if path.is_absolute():
+        return path.resolve()
+    return (base_dir / path).resolve()
+
+
+def main() -> None:
+    try:
+        from modelscope.hub.snapshot_download import snapshot_download
+    except Exception as exc:
+        raise RuntimeError(
+            "Missing dependencies. Please install first:\n"
+            "  pip install -r requirements.txt"
+        ) from exc
+
+    script_dir = Path(__file__).resolve().parent
+    model_id = DOWNLOAD_MODEL_ID.strip()
+    model_dir_raw = DOWNLOAD_SAVE_DIR.strip()
+    cache_dir_raw = DOWNLOAD_CACHE_DIR.strip()
+    revision = DOWNLOAD_REVISION.strip()
+
+    if not model_id:
+        raise ValueError("DOWNLOAD_MODEL_ID is empty.")
+    if not model_dir_raw:
+        raise ValueError("DOWNLOAD_SAVE_DIR is empty.")
+    if not cache_dir_raw:
+        raise ValueError("DOWNLOAD_CACHE_DIR is empty.")
+
+    model_dir = resolve_path(model_dir_raw, script_dir)
+    cache_dir = resolve_path(cache_dir_raw, script_dir)
+    model_dir.parent.mkdir(parents=True, exist_ok=True)
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"[INFO] model_id={model_id}")
+    print(f"[INFO] model_dir={model_dir}")
+    print(f"[INFO] cache_dir={cache_dir}")
+    if revision:
+        print(f"[INFO] revision={revision}")
+
+    kwargs = {
+        "model_id": model_id,
+        "local_dir": str(model_dir),
+        "cache_dir": str(cache_dir),
+    }
+    if revision:
+        kwargs["revision"] = revision
+
+    downloaded_path = snapshot_download(**kwargs)
+    print(f"[OK] download complete: {downloaded_path}")
+
+
+# Run the download only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/evirement.md
+++ b/evirement.md
@ -0,0 +1,71 @@
+# cu128 Manual Backup Plan
+
+This backup plan is for Linux x86_64 machines with an NVIDIA GPU.
+Preferred CUDA target: 12.8.
+
+## 1. Recommended requirements source
+
+The project requirements are pinned to CUDA 12.8:
+
+- PyTorch index: `https://download.pytorch.org/whl/cu128`
+- vLLM index: `https://wheels.vllm.ai/nightly/cu128`
+
+Install with:
+
+```bash
+pip install -r requirements.txt
+```
+
+## 2. Manual install plan
+
+If `pip install -r requirements.txt` is slow or fails, install in this order.
+
+### Step 1: install PyTorch trio for cu128
+
+```bash
+pip install \
+  --index-url https://pypi.org/simple \
+  --extra-index-url https://download.pytorch.org/whl/cu128 \
+  torch==2.11.0 \
+  torchvision==0.26.0 \
+  torchaudio==2.11.0
+```
+
+### Step 2: install vLLM for cu128
+
+Note:
+- `vllm 0.19.0` for `cu128 x86_64` was not found as a GitHub release wheel.
+- Use the official vLLM `cu128` nightly wheel index as the fallback source.
+
+```bash
+pip install \
+  --index-url https://pypi.org/simple \
+  --extra-index-url https://download.pytorch.org/whl/cu128 \
+  --extra-index-url https://wheels.vllm.ai/nightly/cu128 \
+  vllm==0.19.0
+```
+
+### Step 3: install project runtime helpers
+
+```bash
+pip install python-dotenv modelscope
+```
+
+## 3. Quick verification
+
+```bash
+python -c "import torch, vllm; print(torch.__version__); print(torch.version.cuda); print(vllm.__version__)"
+```
+
+Expected:
+- `torch.version.cuda` should be `12.8`
+- `vllm.__version__` should start with `0.19.0`
+
+## 4. If install still fails
+
+Check these items first:
+
+- `nvidia-smi` is available
+- driver supports CUDA 12.8 runtime
+- machine is `Linux x86_64`, not native Windows
+- Python version is compatible with the downloaded wheels
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,16 @@
+--index-url https://pypi.org/simple
+--extra-index-url https://download.pytorch.org/whl/cu128
+--extra-index-url https://wheels.vllm.ai/nightly/cu128
+
+# x86_64 Linux + NVIDIA CUDA 12.8
+# PyTorch trio is pinned to the official cu128 build.
+torch==2.11.0
+torchvision==0.26.0
+torchaudio==2.11.0
+
+# vLLM 0.19.0 does not provide a cu128 x86_64 release wheel on GitHub releases,
+# so install it from the official cu128 nightly wheel index.
+vllm==0.19.0
+
+python-dotenv
+modelscope
--- a/serve.py
+++ b/serve.py
@ -0,0 +1,178 @@
+import os
+import shlex
+import subprocess
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+# Default env file, looked up relative to this script's directory.
+# Override with the ENV_FILE environment variable if needed.
+DEFAULT_ENV_FILE = ".env.gemma4_26B"
+
+
+def as_bool(value: str) -> bool:
+    return str(value).strip().lower() in {"1", "true", "yes", "on"}
+
+
+def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path:
+    raw = os.getenv(env_name, "").strip()
+    if not raw:
+        return (base_dir / default_relative).resolve()
+
+    path = Path(raw).expanduser()
+    if path.is_absolute():
+        return path.resolve()
+    return (base_dir / path).resolve()
+
+
+def resolve_optional_path(raw_path: str, base_dir: Path) -> Path:
+    path = Path(raw_path).expanduser()
+    if path.is_absolute():
+        return path.resolve()
+    return (base_dir / path).resolve()
+
+
+def main() -> None:
+    script_dir = Path(__file__).resolve().parent
+    env_path = (script_dir / (os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE)).resolve()
+    if not env_path.exists():
+        raise FileNotFoundError(f"Environment file does not exist: {env_path}")
+    load_dotenv(env_path)
+
+    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip()
+    model_dir = resolve_path("MODEL_DIR", "models/google_gemma-4-E4B-it", script_dir)
+    host = os.getenv("HOST", "0.0.0.0")
+    port = os.getenv("PORT", "9527")
+    tensor_parallel_size = os.getenv("TENSOR_PARALLEL_SIZE", "1")
+    max_model_len = os.getenv("MAX_MODEL_LEN", "32768")
+    gpu_memory_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.80")
+    trust_remote_code = as_bool(os.getenv("TRUST_REMOTE_CODE", "true"))
+    enable_auto_tool_choice = as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true"))
+    tool_call_parser = os.getenv("TOOL_CALL_PARSER", "auto").strip()
+    reasoning_parser = os.getenv("REASONING_PARSER", "auto").strip()
+
+    enable_log_requests_raw = os.getenv("ENABLE_LOG_REQUESTS", "").strip()
+    if enable_log_requests_raw:
+        enable_log_requests = as_bool(enable_log_requests_raw)
+    else:
+        enable_log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false"))
+
+    vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip()
+
+    default_chat_template_kwargs = os.getenv(
+        "DEFAULT_CHAT_TEMPLATE_KWARGS", '{"enable_thinking": true}'
+    ).strip()
+
+    chat_template = os.getenv("CHAT_TEMPLATE", "").strip()
+
+    api_key = os.getenv("API_KEY", "your-secret-api-key").strip()
+    log_dir = resolve_path("LOG_DIR", "logs", script_dir)
+    max_num_seqs = os.getenv("MAX_NUM_SEQS", "64").strip()
+    max_num_batched_tokens = os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096").strip()
+
+    if not model_dir.exists():
+        raise FileNotFoundError(
+            f"Model directory does not exist: {model_dir}\n"
+            "Run `python download_model.py` first."
+        )
+
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    if cuda_visible_devices:
+        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+    if vllm_logging_level:
+        os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level
+    # Avoid passing non-vLLM env keys through subprocess environment.
+    # These custom keys trigger "Unknown vLLM environment variable" warnings.
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "vllm.entrypoints.openai.api_server",
+        "--model",
+        str(model_dir),
+        "--served-model-name",
+        os.getenv("MODEL_ID", "google/google_gemma-4-E4B-it"),
+        "--host",
+        host,
+        "--port",
+        port,
+        "--tensor-parallel-size",
+        tensor_parallel_size,
+        "--max-model-len",
+        max_model_len,
+        "--gpu-memory-utilization",
+        gpu_memory_utilization,
+    ]
+
+    if trust_remote_code:
+        cmd.append("--trust-remote-code")
+
+    if enable_log_requests:
+        cmd.append("--enable-log-requests")
+
+    if enable_auto_tool_choice:
+        cmd.append("--enable-auto-tool-choice")
+        if tool_call_parser:
+            cmd.extend(["--tool-call-parser", tool_call_parser])
+
+    if reasoning_parser:
+        cmd.extend(["--reasoning-parser", reasoning_parser])
+
+    if default_chat_template_kwargs:
+        cmd.extend(["--default-chat-template-kwargs", default_chat_template_kwargs])
+
+    resolved_chat_template: Path | None = None
+    if chat_template:
+        resolved_chat_template = resolve_optional_path(chat_template, script_dir)
+        if not resolved_chat_template.exists():
+            raise FileNotFoundError(
+                f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n"
+                "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template."
+            )
+
+    if resolved_chat_template is not None:
+        cmd.extend(["--chat-template", str(resolved_chat_template)])
+
+    if api_key:
+        cmd.extend(["--api-key", api_key])
+
+    if max_num_seqs:
+        cmd.extend(["--max-num-seqs", max_num_seqs])
+
+    if max_num_batched_tokens:
+        cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens])
+
+
+    # Force prefill tuning flags directly in script (do not rely on env parsing).
+    cmd.extend(
+        [
+            "--enable-chunked-prefill",
+            "--max-num-partial-prefills=1",
+        ]
+    )
+
+    print("[INFO] starting vLLM server with command:")
+    print(" ".join(shlex.quote(item) for item in cmd))
+    if enable_auto_tool_choice:
+        print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}")
+    print(f"[INFO] enable_log_requests={enable_log_requests}")
+    if vllm_logging_level:
+        print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}")
+    if reasoning_parser:
+        print(f"[INFO] reasoning_parser={reasoning_parser}")
+    if resolved_chat_template is not None:
+        print(f"[INFO] chat_template={resolved_chat_template}")
+    else:
+        print("[INFO] chat_template=(model default)")
+    if cuda_visible_devices:
+        print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}")
+    print(f"[INFO] resolved model_dir={model_dir}")
+    print(f"[INFO] resolved log_dir={log_dir}")
+    print(f"[INFO] env_file={env_path}")
+
+    subprocess.run(cmd, check=True)
+
+
+# Start the server only when this file is executed as a script.
+if __name__ == "__main__":
+    main()