first commit

main
Bifang 2026-06-10 09:40:21 +08:00
commit 2f630641af
5 changed files with 400 additions and 0 deletions

60
.env.gemma4_26B 100644
View File

@ -0,0 +1,60 @@
MODELSCOPE_CACHE=./modelscope_cache
MODEL_ID=gemma-4-26B
MODEL_DIR=./models/gemma-4-26B-A4B-it-FP8
PORT=9527
# 指定加载到哪些显卡,例如 0,1,2,3,4
CUDA_VISIBLE_DEVICES=0
# 张量并行卡数
TENSOR_PARALLEL_SIZE=1
# 上下文长度
MAX_MODEL_LEN=81920
# 显存占用比例。默认值 0.9;多余显存会分配给 KV Cache 以支持高并发
GPU_MEMORY_UTILIZATION=0.30
# 计算精度
# DTYPE=bfloat16
# KV Cache 精度auto/fp8
# KV_CACHE_DTYPE=auto
# 最大并发序列数
MAX_NUM_SEQS=64
# 单批最大 token 数,根据并发和实际上下文需求配置,默认自动分配
MAX_NUM_BATCHED_TOKENSMAX=8192
# 其他运行开关
DISABLE_LOG_REQUESTS=False
ENABLE_LOG_REQUESTS=true
# vLLM 日志级别:DEBUG / INFO
VLLM_LOGGING_LEVEL=INFO
# Tool calling 配置,需要和模型配套,否则可能出现工具调用失败
ENABLE_AUTO_TOOL_CHOICE=true
TOOL_CALL_PARSER=gemma4
REASONING_PARSER=gemma4
# Gemma专用推理链额外标记开关
DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}'
# 留空时使用模型自带 chat_template;如需官方工具模板可填绝对路径
# CHAT_TEMPLATE=
# enable-chunked-prefill:分块预填充,避免瞬时大量占用内存
# max-num-partial-prefills 1限制同一时刻最多只有 1 个 处于“分块预填充中”的请求,提高稳定性。
TRUST_REMOTE_CODE=true
API_KEY=unis123
# 采样参数
# TEMPERATURE=1.0
# TOP_P=0.95
# TOP_K=64
LOG_DIR=./logs

75
download_model.py 100644
View File

@ -0,0 +1,75 @@
from __future__ import annotations
"""
Standalone model download script.
Usage:
python download_model.py
"""
from pathlib import Path
# =========================
# User Config
# Modify these variables directly, then run:
# python download_model.py
# =========================
# Model identifier handed to snapshot_download() as `model_id`.
DOWNLOAD_MODEL_ID = "kuohao/gemma-4-26B-A4B-it-FP8"
# Target directory for the model files (passed as `local_dir`); a relative
# value is resolved against the directory containing this script.
DOWNLOAD_SAVE_DIR = "./models/gemma-4-26B-A4B-it-FP8"
# Download cache directory (passed as `cache_dir`); a relative value is
# resolved against the directory containing this script.
DOWNLOAD_CACHE_DIR = "./modelscope_cache"
# Optional revision; when left empty, no `revision` argument is passed.
DOWNLOAD_REVISION = ""
def resolve_path(raw: str, base_dir: Path) -> Path:
    """Turn a user-supplied path string into a resolved absolute path.

    ``~`` is expanded first. An absolute result is used as-is; anything
    else is interpreted relative to *base_dir*.
    """
    candidate = Path(raw).expanduser()
    if not candidate.is_absolute():
        candidate = base_dir / candidate
    return candidate.resolve()
def main() -> None:
    """Download the configured model snapshot via ModelScope.

    Reads the module-level ``DOWNLOAD_*`` settings, resolves the save and
    cache directories relative to this script's directory, creates the
    needed parent/cache directories and calls ``snapshot_download``.

    Raises:
        RuntimeError: if ``modelscope`` cannot be imported.
        ValueError: if the model id, save dir or cache dir setting is blank.
    """
    # Imported lazily so a missing package produces an actionable message.
    # Only ImportError is translated: any other failure while importing is
    # not a "missing dependency" and should surface with its own traceback.
    try:
        from modelscope.hub.snapshot_download import snapshot_download
    except ImportError as exc:
        raise RuntimeError(
            "Missing dependencies. Please install first:\n"
            " pip install -r requirements.txt"
        ) from exc

    script_dir = Path(__file__).resolve().parent

    model_id = DOWNLOAD_MODEL_ID.strip()
    model_dir_raw = DOWNLOAD_SAVE_DIR.strip()
    cache_dir_raw = DOWNLOAD_CACHE_DIR.strip()
    revision = DOWNLOAD_REVISION.strip()

    # The revision is optional; the other three settings are required.
    required = {
        "DOWNLOAD_MODEL_ID": model_id,
        "DOWNLOAD_SAVE_DIR": model_dir_raw,
        "DOWNLOAD_CACHE_DIR": cache_dir_raw,
    }
    for setting_name, setting_value in required.items():
        if not setting_value:
            raise ValueError(f"{setting_name} is empty.")

    model_dir = resolve_path(model_dir_raw, script_dir)
    cache_dir = resolve_path(cache_dir_raw, script_dir)

    # Only the parent of the model directory is created here; the model
    # directory itself is left to snapshot_download.
    model_dir.parent.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(parents=True, exist_ok=True)

    print(f"[INFO] model_id={model_id}")
    print(f"[INFO] model_dir={model_dir}")
    print(f"[INFO] cache_dir={cache_dir}")
    if revision:
        print(f"[INFO] revision={revision}")

    kwargs = {
        "model_id": model_id,
        "local_dir": str(model_dir),
        "cache_dir": str(cache_dir),
    }
    if revision:
        kwargs["revision"] = revision

    downloaded_path = snapshot_download(**kwargs)
    print(f"[OK] download complete: {downloaded_path}")


# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

71
evirement.md 100644
View File

@ -0,0 +1,71 @@
# cu128 Manual Backup Plan
This backup plan is for Linux x86_64 machines with NVIDIA GPU.
Preferred CUDA target: 12.8.
## 1. Recommended requirements source
The project requirements are pinned to CUDA 12.8:
- PyTorch index: `https://download.pytorch.org/whl/cu128`
- vLLM index: `https://wheels.vllm.ai/nightly/cu128`
Install with:
```bash
pip install -r requirements.txt
```
## 2. Manual install plan
If `pip install -r requirements.txt` is slow or fails, install in this order.
### Step 1: install PyTorch trio for cu128
```bash
pip install \
--index-url https://pypi.org/simple \
--extra-index-url https://download.pytorch.org/whl/cu128 \
torch==2.11.0 \
torchvision==0.26.0 \
torchaudio==2.11.0
```
### Step 2: install vLLM for cu128
Note:
- `vllm 0.19.0` for `cu128 x86_64` was not found as a GitHub release wheel.
- Use the official vLLM `cu128` nightly wheel index as the fallback source.
```bash
pip install \
--index-url https://pypi.org/simple \
--extra-index-url https://download.pytorch.org/whl/cu128 \
--extra-index-url https://wheels.vllm.ai/nightly/cu128 \
vllm==0.19.0
```
### Step 3: install project runtime helpers
```bash
pip install python-dotenv modelscope
```
## 3. Quick verification
```bash
python -c "import torch, vllm; print(torch.__version__); print(torch.version.cuda); print(vllm.__version__)"
```
Expected:
- `torch.version.cuda` should be `12.8`
- `vllm.__version__` should start with `0.19.0`
## 4. If install still fails
Check these items first:
- `nvidia-smi` is available
- driver supports CUDA 12.8 runtime
- machine is `Linux x86_64`, not native Windows
- Python version is compatible with the downloaded wheels

16
requirements.txt 100644
View File

@ -0,0 +1,16 @@
--index-url https://pypi.org/simple
--extra-index-url https://download.pytorch.org/whl/cu128
--extra-index-url https://wheels.vllm.ai/nightly/cu128
# x86_64 Linux + NVIDIA CUDA 12.8
# PyTorch trio is pinned to the official cu128 build.
torch==2.11.0
torchvision==0.26.0
torchaudio==2.11.0
# vLLM 0.19.0 does not provide a cu128 x86_64 release wheel on GitHub releases,
# so install it from the official cu128 nightly wheel index.
vllm==0.19.0
python-dotenv
modelscope

178
serve.py 100644
View File

@ -0,0 +1,178 @@
import os
import shlex
import subprocess
import sys
from pathlib import Path
from dotenv import load_dotenv
# Default env file name, looked up relative to this script's directory.
# Override it by setting the ENV_FILE environment variable.
DEFAULT_ENV_FILE = ".env.gemma4_26B"
def as_bool(value: str) -> bool:
    """Interpret a config string as a boolean flag.

    The value is stringified, trimmed and lower-cased; ``1``, ``true``,
    ``yes`` and ``on`` count as true, everything else as false.
    """
    normalized = str(value).strip().lower()
    return normalized in ("1", "true", "yes", "on")
def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path:
    """Resolve a directory setting taken from the environment.

    When *env_name* is unset or blank, *default_relative* joined onto
    *base_dir* is returned. Otherwise the configured value has ``~``
    expanded and, unless it is absolute, is joined onto *base_dir*. The
    result is always resolved.
    """
    configured = os.getenv(env_name, "").strip()
    if configured:
        candidate = Path(configured).expanduser()
        if not candidate.is_absolute():
            candidate = base_dir / candidate
    else:
        candidate = base_dir / default_relative
    return candidate.resolve()
def resolve_optional_path(raw_path: str, base_dir: Path) -> Path:
    """Resolve *raw_path* to an absolute path.

    ``~`` is expanded; a path that is still relative afterwards is taken
    relative to *base_dir*.
    """
    expanded = Path(raw_path).expanduser()
    anchored = expanded if expanded.is_absolute() else base_dir / expanded
    return anchored.resolve()
def main() -> None:
    """Start the vLLM OpenAI-compatible API server as a child process.

    Settings are loaded from an env file (``ENV_FILE`` or
    ``DEFAULT_ENV_FILE``, looked up relative to this script's directory),
    translated into command-line flags for
    ``vllm.entrypoints.openai.api_server``, echoed to stdout, and the
    server is run in the foreground until it exits.

    Raises:
        FileNotFoundError: if the env file, the model directory, or a
            configured ``CHAT_TEMPLATE`` file does not exist.
        subprocess.CalledProcessError: if the server exits with a
            non-zero status.
    """
    script_dir = Path(__file__).resolve().parent
    env_file_name = os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE
    env_path = (script_dir / env_file_name).resolve()
    if not env_path.exists():
        raise FileNotFoundError(f"Environment file does not exist: {env_path}")
    load_dotenv(env_path)

    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip()
    model_dir = resolve_path("MODEL_DIR", "models/google_gemma-4-E4B-it", script_dir)
    host = os.getenv("HOST", "0.0.0.0")
    port = os.getenv("PORT", "9527")
    tensor_parallel_size = os.getenv("TENSOR_PARALLEL_SIZE", "1")
    max_model_len = os.getenv("MAX_MODEL_LEN", "32768")
    gpu_memory_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.80")
    trust_remote_code = as_bool(os.getenv("TRUST_REMOTE_CODE", "true"))
    enable_auto_tool_choice = as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true"))
    # NOTE(review): the "auto" fallbacks are forwarded verbatim to vLLM;
    # confirm the installed vLLM accepts "auto" as a parser name.
    tool_call_parser = os.getenv("TOOL_CALL_PARSER", "auto").strip()
    reasoning_parser = os.getenv("REASONING_PARSER", "auto").strip()

    # ENABLE_LOG_REQUESTS wins when set; otherwise fall back to the
    # inverse of DISABLE_LOG_REQUESTS.
    enable_log_requests_raw = os.getenv("ENABLE_LOG_REQUESTS", "").strip()
    if enable_log_requests_raw:
        enable_log_requests = as_bool(enable_log_requests_raw)
    else:
        enable_log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false"))

    vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip()
    default_chat_template_kwargs = os.getenv(
        "DEFAULT_CHAT_TEMPLATE_KWARGS", '{"enable_thinking": true}'
    ).strip()
    chat_template = os.getenv("CHAT_TEMPLATE", "").strip()
    api_key = os.getenv("API_KEY", "your-secret-api-key").strip()
    log_dir = resolve_path("LOG_DIR", "logs", script_dir)
    max_num_seqs = os.getenv("MAX_NUM_SEQS", "64").strip()
    # MAX_NUM_BATCHED_TOKENSMAX is the historical (misspelled) key and is
    # still honoured; the correctly spelled key takes precedence when set.
    max_num_batched_tokens = (
        os.getenv("MAX_NUM_BATCHED_TOKENS", "").strip()
        or os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096").strip()
    )

    if not model_dir.exists():
        raise FileNotFoundError(
            f"Model directory does not exist: {model_dir}\n"
            "Run `python download_model.py` first."
        )
    log_dir.mkdir(parents=True, exist_ok=True)

    if cuda_visible_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    if vllm_logging_level:
        os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level

    # Settings are handed to vLLM as command-line flags. Note that the
    # child process still inherits os.environ (no `env=` is passed below),
    # so keys loaded from the env file remain visible to it.
    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        str(model_dir),
        "--served-model-name",
        os.getenv("MODEL_ID", "google/google_gemma-4-E4B-it"),
        "--host",
        host,
        "--port",
        port,
        "--tensor-parallel-size",
        tensor_parallel_size,
        "--max-model-len",
        max_model_len,
        "--gpu-memory-utilization",
        gpu_memory_utilization,
    ]
    if trust_remote_code:
        cmd.append("--trust-remote-code")
    if enable_log_requests:
        cmd.append("--enable-log-requests")
    if enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")
    if tool_call_parser:
        cmd.extend(["--tool-call-parser", tool_call_parser])
    if reasoning_parser:
        cmd.extend(["--reasoning-parser", reasoning_parser])
    if default_chat_template_kwargs:
        cmd.extend(["--default-chat-template-kwargs", default_chat_template_kwargs])

    resolved_chat_template: Path | None = None
    if chat_template:
        resolved_chat_template = resolve_optional_path(chat_template, script_dir)
        if not resolved_chat_template.exists():
            raise FileNotFoundError(
                f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n"
                "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template."
            )
        cmd.extend(["--chat-template", str(resolved_chat_template)])

    if api_key:
        cmd.extend(["--api-key", api_key])
    if max_num_seqs:
        cmd.extend(["--max-num-seqs", max_num_seqs])
    if max_num_batched_tokens:
        cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens])

    # Force prefill tuning flags directly in script (do not rely on env parsing).
    cmd.extend(
        [
            "--enable-chunked-prefill",
            "--max-num-partial-prefills=1",
        ]
    )

    # Echo the command with the API key masked so the secret does not end
    # up in terminal scrollback or captured logs. The real `cmd` is untouched.
    printable_cmd = [
        "***" if index > 0 and cmd[index - 1] == "--api-key" else item
        for index, item in enumerate(cmd)
    ]
    print("[INFO] starting vLLM server with command:")
    print(" ".join(shlex.quote(item) for item in printable_cmd))
    if enable_auto_tool_choice:
        print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}")
    print(f"[INFO] enable_log_requests={enable_log_requests}")
    if vllm_logging_level:
        print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}")
    if reasoning_parser:
        print(f"[INFO] reasoning_parser={reasoning_parser}")
    if resolved_chat_template is not None:
        print(f"[INFO] chat_template={resolved_chat_template}")
    else:
        print("[INFO] chat_template=(model default)")
    if cuda_visible_devices:
        print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}")
    print(f"[INFO] resolved model_dir={model_dir}")
    print(f"[INFO] resolved log_dir={log_dir}")
    print(f"[INFO] env_file={env_path}")

    # Runs in the foreground; a non-zero exit raises CalledProcessError.
    subprocess.run(cmd, check=True)


# Launch the server only when executed as a script, not when imported.
if __name__ == "__main__":
    main()