commit 2f630641af82cffd2a378c207974f55fa4ad9a34 Author: Bifang <915779419@qq.com> Date: Wed Jun 10 09:40:21 2026 +0800 first commit diff --git a/.env.gemma4_26B b/.env.gemma4_26B new file mode 100644 index 0000000..1cfc75f --- /dev/null +++ b/.env.gemma4_26B @@ -0,0 +1,60 @@ +MODELSCOPE_CACHE=./modelscope_cache + +MODEL_ID=gemma-4-26B +MODEL_DIR=./models/gemma-4-26B-A4B-it-FP8 +PORT=9527 + +# 指定加载到哪些显卡0,1,2,3,4 +CUDA_VISIBLE_DEVICES=0 +# 张量并行卡数 +TENSOR_PARALLEL_SIZE=1 + +# 上下文长度 +MAX_MODEL_LEN=81920 + +# 显存占用比例。默认参数0.9,多余显存分配个KV Cache以支持高并发 +GPU_MEMORY_UTILIZATION=0.30 + +# 计算精度 +# DTYPE=bfloat16 + +# KV Cache 精度(auto/fp8) +# KV_CACHE_DTYPE=auto + +# 最大并发序列数 +MAX_NUM_SEQS=64 + +# 单批最大 token 数,根据并发和实际上下文需求配置,默认自动分配 +MAX_NUM_BATCHED_TOKENSMAX=8192 + + +# 其他运行开关 +DISABLE_LOG_REQUESTS=False +ENABLE_LOG_REQUESTS=true + +# VLLM运行模式DEBUG\INFO +VLLM_LOGGING_LEVEL=INFO + +# Tool calling 配置,需要和模型配套,否则可能出现工具调用失败 +ENABLE_AUTO_TOOL_CHOICE=true +TOOL_CALL_PARSER=gemma4 +REASONING_PARSER=gemma4 + +# Gemma专用推理链额外标记开关 +DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}' + +# 留空时使用模型自带 chat_template;如需官方工具模板可填绝对路径 +# CHAT_TEMPLATE= + +# enable-chunked-prefill分块预填空,避免瞬时大量占用内存 +# max-num-partial-prefills 1限制同一时刻最多只有 1 个 处于“分块预填充中”的请求,提高稳定性。 + +TRUST_REMOTE_CODE=true +API_KEY=unis123 + +# 采样参数 +# TEMPERATURE=1.0 +# TOP_P=0.95 +# TOP_K=64 + +LOG_DIR=./logs diff --git a/download_model.py b/download_model.py new file mode 100644 index 0000000..a2bc338 --- /dev/null +++ b/download_model.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +""" +Standalone model download script. 
+ +Usage: + python download_model_new.py +""" + +from pathlib import Path + +# ========================= +# User Config +# Modify these variables directly, then run: +# python download_model_new.py +# ========================= +DOWNLOAD_MODEL_ID = "kuohao/gemma-4-26B-A4B-it-FP8" +DOWNLOAD_SAVE_DIR = "./models/gemma-4-26B-A4B-it-FP8" +DOWNLOAD_CACHE_DIR = "./modelscope_cache" +DOWNLOAD_REVISION = "" + +def resolve_path(raw: str, base_dir: Path) -> Path: + path = Path(raw).expanduser() + if path.is_absolute(): + return path.resolve() + return (base_dir / path).resolve() + + +def main() -> None: + try: + from modelscope.hub.snapshot_download import snapshot_download + except Exception as exc: + raise RuntimeError( + "Missing dependencies. Please install first:\n" + " pip install -r requirements.txt" + ) from exc + + script_dir = Path(__file__).resolve().parent + model_id = DOWNLOAD_MODEL_ID.strip() + model_dir_raw = DOWNLOAD_SAVE_DIR.strip() + cache_dir_raw = DOWNLOAD_CACHE_DIR.strip() + revision = DOWNLOAD_REVISION.strip() + + if not model_id: + raise ValueError("DOWNLOAD_MODEL_ID is empty.") + if not model_dir_raw: + raise ValueError("DOWNLOAD_SAVE_DIR is empty.") + if not cache_dir_raw: + raise ValueError("DOWNLOAD_CACHE_DIR is empty.") + + model_dir = resolve_path(model_dir_raw, script_dir) + cache_dir = resolve_path(cache_dir_raw, script_dir) + model_dir.parent.mkdir(parents=True, exist_ok=True) + cache_dir.mkdir(parents=True, exist_ok=True) + + print(f"[INFO] model_id={model_id}") + print(f"[INFO] model_dir={model_dir}") + print(f"[INFO] cache_dir={cache_dir}") + if revision: + print(f"[INFO] revision={revision}") + + kwargs = { + "model_id": model_id, + "local_dir": str(model_dir), + "cache_dir": str(cache_dir), + } + if revision: + kwargs["revision"] = revision + + downloaded_path = snapshot_download(**kwargs) + print(f"[OK] download complete: {downloaded_path}") + + +if __name__ == "__main__": + main() diff --git a/evirement.md b/evirement.md new file mode 
100644 index 0000000..ba62e6d --- /dev/null +++ b/evirement.md @@ -0,0 +1,71 @@ +# cu128 Manual Backup Plan + +This backup plan is for Linux x86_64 machines with NVIDIA GPU. +Preferred CUDA target: 12.8. + +## 1. Recommended requirements source + +The project requirements are pinned to CUDA 12.8: + +- PyTorch index: `https://download.pytorch.org/whl/cu128` +- vLLM index: `https://wheels.vllm.ai/nightly/cu128` + +Install with: + +```bash +pip install -r requirements.txt +``` + +## 2. Manual install plan + +If `pip install -r requirements.txt` is slow or fails, install in this order. + +### Step 1: install PyTorch trio for cu128 + +```bash +pip install \ + --index-url https://pypi.org/simple \ + --extra-index-url https://download.pytorch.org/whl/cu128 \ + torch==2.11.0 \ + torchvision==0.26.0 \ + torchaudio==2.11.0 +``` + +### Step 2: install vLLM for cu128 + +Note: +- `vllm 0.19.0` for `cu128 x86_64` was not found as a GitHub release wheel. +- Use the official vLLM `cu128` nightly wheel index as the fallback source. + +```bash +pip install \ + --index-url https://pypi.org/simple \ + --extra-index-url https://download.pytorch.org/whl/cu128 \ + --extra-index-url https://wheels.vllm.ai/nightly/cu128 \ + vllm==0.19.0 +``` + +### Step 3: install project runtime helpers + +```bash +pip install python-dotenv modelscope +``` + +## 3. Quick verification + +```bash +python -c "import torch, vllm; print(torch.__version__); print(torch.version.cuda); print(vllm.__version__)" +``` + +Expected: +- `torch.version.cuda` should be `12.8` +- `vllm.__version__` should start with `0.19.0` + +## 4. 
If install still fails + +Check these items first: + +- `nvidia-smi` is available +- driver supports CUDA 12.8 runtime +- machine is `Linux x86_64`, not native Windows +- Python version is compatible with the downloaded wheels diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3c3d754 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +--index-url https://pypi.org/simple +--extra-index-url https://download.pytorch.org/whl/cu128 +--extra-index-url https://wheels.vllm.ai/nightly/cu128 + +# x86_64 Linux + NVIDIA CUDA 12.8 +# PyTorch trio is pinned to the official cu128 build. +torch==2.11.0 +torchvision==0.26.0 +torchaudio==2.11.0 + +# vLLM 0.19.0 does not provide a cu128 x86_64 release wheel on GitHub releases, +# so install it from the official cu128 nightly wheel index. +vllm==0.19.0 + +python-dotenv +modelscope diff --git a/serve.py b/serve.py new file mode 100644 index 0000000..3eb4bb6 --- /dev/null +++ b/serve.py @@ -0,0 +1,178 @@ +import os +import shlex +import subprocess +import sys +from pathlib import Path + +from dotenv import load_dotenv + +# Default env file. Override with ENV_FILE if needed. 
DEFAULT_ENV_FILE = ".env.gemma4_26B"

# Fallback used when API_KEY is not configured. It is a well-known placeholder,
# so a server started with it is effectively unprotected.
PLACEHOLDER_API_KEY = "your-secret-api-key"

# CLI flags whose value must never be echoed when the command line is logged.
SECRET_FLAGS = frozenset({"--api-key"})


def as_bool(value: str) -> bool:
    """Return True when *value* spells an enabled flag.

    The accepted spellings are ``1``, ``true``, ``yes`` and ``on``, compared
    case-insensitively after surrounding whitespace is removed. Anything
    else, including the empty string, is False.
    """
    return str(value).strip().lower() in {"1", "true", "yes", "on"}


def env_or_default(name: str, default: str) -> str:
    """Return the stripped value of environment variable *name*.

    Falls back to *default* when the variable is unset or blank, so that a
    line such as ``PORT=`` in the env file does not turn into an empty
    command-line argument.
    """
    return os.getenv(name, "").strip() or default


def resolve_path(env_name: str, default_relative: str, base_dir: Path) -> Path:
    """Resolve the path stored in environment variable *env_name*.

    When the variable is unset or blank, *default_relative* is used instead.
    Relative paths are anchored at *base_dir*; ``~`` is expanded.
    """
    raw = os.getenv(env_name, "").strip()
    return resolve_optional_path(raw or default_relative, base_dir)


def resolve_optional_path(raw_path: str, base_dir: Path) -> Path:
    """Resolve *raw_path*, anchoring relative paths at *base_dir*.

    ``~`` is expanded first. Absolute paths are returned resolved and
    otherwise unchanged.
    """
    path = Path(raw_path).expanduser()
    if path.is_absolute():
        return path.resolve()
    return (base_dir / path).resolve()


def format_command_for_log(cmd: list[str]) -> str:
    """Render *cmd* as a shell-quoted string with secret values masked.

    The argument following any flag in ``SECRET_FLAGS`` (or the part after
    ``=`` in the ``--flag=value`` form) is replaced by ``******`` so that
    credentials do not end up in terminal scrollback or captured logs.
    """
    mask = "******"
    rendered = []
    mask_next = False
    for item in cmd:
        if mask_next:
            rendered.append(mask)
            mask_next = False
            continue
        flag, sep, _ = item.partition("=")
        if sep and flag in SECRET_FLAGS:
            rendered.append(f"{flag}={mask}")
            continue
        rendered.append(shlex.quote(item))
        mask_next = item in SECRET_FLAGS
    return " ".join(rendered)


def main() -> None:
    """Load the env file, build the vLLM API-server command and run it.

    Raises:
        FileNotFoundError: the env file, the model directory, or a configured
            CHAT_TEMPLATE file does not exist.
        subprocess.CalledProcessError: the vLLM process exited non-zero.
    """
    script_dir = Path(__file__).resolve().parent
    env_file = os.getenv("ENV_FILE", DEFAULT_ENV_FILE).strip() or DEFAULT_ENV_FILE
    env_path = (script_dir / env_file).resolve()
    if not env_path.exists():
        raise FileNotFoundError(f"Environment file does not exist: {env_path}")
    load_dotenv(env_path)

    # Values that are always passed to vLLM: a blank setting falls back to
    # the default instead of producing an empty CLI argument.
    model_dir = resolve_path("MODEL_DIR", "models/google_gemma-4-E4B-it", script_dir)
    served_model_name = env_or_default("MODEL_ID", "google/google_gemma-4-E4B-it")
    host = env_or_default("HOST", "0.0.0.0")
    port = env_or_default("PORT", "9527")
    tensor_parallel_size = env_or_default("TENSOR_PARALLEL_SIZE", "1")
    max_model_len = env_or_default("MAX_MODEL_LEN", "32768")
    gpu_memory_utilization = env_or_default("GPU_MEMORY_UTILIZATION", "0.80")

    # Optional values: an explicitly blank setting means "omit the flag".
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "0").strip()
    trust_remote_code = as_bool(os.getenv("TRUST_REMOTE_CODE", "true"))
    enable_auto_tool_choice = as_bool(os.getenv("ENABLE_AUTO_TOOL_CHOICE", "true"))
    tool_call_parser = os.getenv("TOOL_CALL_PARSER", "auto").strip()
    reasoning_parser = os.getenv("REASONING_PARSER", "auto").strip()

    # ENABLE_LOG_REQUESTS wins when present; otherwise the legacy
    # DISABLE_LOG_REQUESTS switch is inverted.
    enable_log_requests_raw = os.getenv("ENABLE_LOG_REQUESTS", "").strip()
    if enable_log_requests_raw:
        enable_log_requests = as_bool(enable_log_requests_raw)
    else:
        enable_log_requests = not as_bool(os.getenv("DISABLE_LOG_REQUESTS", "false"))

    vllm_logging_level = os.getenv("VLLM_LOGGING_LEVEL", "INFO").strip()

    default_chat_template_kwargs = os.getenv(
        "DEFAULT_CHAT_TEMPLATE_KWARGS", '{"enable_thinking": true}'
    ).strip()

    chat_template = os.getenv("CHAT_TEMPLATE", "").strip()

    api_key = os.getenv("API_KEY", PLACEHOLDER_API_KEY).strip()
    log_dir = resolve_path("LOG_DIR", "logs", script_dir)
    max_num_seqs = os.getenv("MAX_NUM_SEQS", "64").strip()
    # Prefer the correctly spelled key; keep honouring the historical
    # MAX_NUM_BATCHED_TOKENSMAX spelling so existing env files still work.
    max_num_batched_tokens = (
        os.getenv("MAX_NUM_BATCHED_TOKENS", "").strip()
        or os.getenv("MAX_NUM_BATCHED_TOKENSMAX", "4096").strip()
    )

    if not model_dir.exists():
        raise FileNotFoundError(
            f"Model directory does not exist: {model_dir}\n"
            "Run `python download_model.py` first."
        )

    # NOTE: the directory is only created here; nothing in this script
    # writes into it.
    log_dir.mkdir(parents=True, exist_ok=True)

    if cuda_visible_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
    if vllm_logging_level:
        os.environ["VLLM_LOGGING_LEVEL"] = vllm_logging_level
    # The child process inherits os.environ unchanged, which includes every
    # key that load_dotenv exported from the env file.

    cmd = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        str(model_dir),
        "--served-model-name",
        served_model_name,
        "--host",
        host,
        "--port",
        port,
        "--tensor-parallel-size",
        tensor_parallel_size,
        "--max-model-len",
        max_model_len,
        "--gpu-memory-utilization",
        gpu_memory_utilization,
    ]

    if trust_remote_code:
        cmd.append("--trust-remote-code")

    if enable_log_requests:
        cmd.append("--enable-log-requests")

    if enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")
        if tool_call_parser:
            cmd.extend(["--tool-call-parser", tool_call_parser])

    if reasoning_parser:
        cmd.extend(["--reasoning-parser", reasoning_parser])

    if default_chat_template_kwargs:
        cmd.extend(["--default-chat-template-kwargs", default_chat_template_kwargs])

    resolved_chat_template: Path | None = None
    if chat_template:
        resolved_chat_template = resolve_optional_path(chat_template, script_dir)
        if not resolved_chat_template.exists():
            raise FileNotFoundError(
                f"CHAT_TEMPLATE does not exist: {resolved_chat_template}\n"
                "Use an absolute path, or remove CHAT_TEMPLATE to let vLLM use model default template."
            )
        cmd.extend(["--chat-template", str(resolved_chat_template)])

    if api_key:
        cmd.extend(["--api-key", api_key])

    if max_num_seqs:
        cmd.extend(["--max-num-seqs", max_num_seqs])

    if max_num_batched_tokens:
        cmd.extend(["--max-num-batched-tokens", max_num_batched_tokens])

    # Force prefill tuning flags directly in script (do not rely on env parsing).
    cmd.extend(
        [
            "--enable-chunked-prefill",
            "--max-num-partial-prefills=1",
        ]
    )

    print("[INFO] starting vLLM server with command:")
    # The API key is masked so it does not leak into console or captured logs.
    print(format_command_for_log(cmd))
    if api_key == PLACEHOLDER_API_KEY:
        print("[WARN] API_KEY is not configured; using the built-in placeholder key.")
    if enable_auto_tool_choice:
        print(f"[INFO] tool_call_parser={tool_call_parser or '(empty)'}")
    print(f"[INFO] enable_log_requests={enable_log_requests}")
    if vllm_logging_level:
        print(f"[INFO] VLLM_LOGGING_LEVEL={vllm_logging_level}")
    if reasoning_parser:
        print(f"[INFO] reasoning_parser={reasoning_parser}")
    if resolved_chat_template is not None:
        print(f"[INFO] chat_template={resolved_chat_template}")
    else:
        print("[INFO] chat_template=(model default)")
    if cuda_visible_devices:
        print(f"[INFO] CUDA_VISIBLE_DEVICES={cuda_visible_devices}")
    print(f"[INFO] resolved model_dir={model_dir}")
    print(f"[INFO] resolved log_dir={log_dir}")
    print(f"[INFO] env_file={env_path}")

    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    main()