llm/.env.gemma4_26B

61 lines
1.4 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Model cache location, model identity and serving port.
# NOTE(review): presumably the ModelScope download cache directory; confirm against the launcher.
MODELSCOPE_CACHE=./modelscope_cache
# Model identifier. NOTE(review): presumably the name exposed to API clients; confirm.
MODEL_ID=gemma-4-26B
# Model directory. The path is relative, so it depends on the launcher's working directory.
MODEL_DIR=./models/gemma-4-26B-A4B-it-FP8
# NOTE(review): presumably the TCP port the inference server listens on; confirm.
PORT=9527
# GPUs to load the model onto, as comma-separated device indices (e.g. 0,1,2,3,4)
CUDA_VISIBLE_DEVICES=0
# Number of GPUs used for tensor parallelism
TENSOR_PARALLEL_SIZE=1
# Context length
MAX_MODEL_LEN=81920
# Fraction of GPU memory to use. The default is 0.9; the spare memory is
# allocated to the KV cache to support high concurrency.
GPU_MEMORY_UTILIZATION=0.30
# Compute precision
# DTYPE=bfloat16
# KV cache precision: auto or fp8
# KV_CACHE_DTYPE=auto
# Maximum number of concurrent sequences
MAX_NUM_SEQS=64
# Maximum number of tokens per batch; size it for the expected concurrency and
# actual context needs. Allocated automatically when unset.
# NOTE(review): this key was previously spelled MAX_NUM_BATCHED_TOKENSMAX (stray
# trailing "MAX"), so the limit was likely never applied. Confirm the launcher
# reads MAX_NUM_BATCHED_TOKENS.
MAX_NUM_BATCHED_TOKENS=8192
# Other runtime switches
# NOTE(review): the two request-logging flags below overlap (one disables, one
# enables) and use different boolean casing ("False" vs "true"). Both currently
# express "log requests"; confirm which one the launcher reads and how it
# parses booleans before normalising or removing either.
DISABLE_LOG_REQUESTS=False
ENABLE_LOG_REQUESTS=true
# vLLM logging level: DEBUG or INFO
VLLM_LOGGING_LEVEL=INFO
# Tool-calling configuration. It must match the model, otherwise tool calls may fail.
ENABLE_AUTO_TOOL_CHOICE=true
TOOL_CALL_PARSER=gemma4
REASONING_PARSER=gemma4
# Gemma-specific switch for the extra reasoning-chain (thinking) markers.
# The value is a JSON object wrapped in single quotes.
# NOTE(review): whether the quotes are stripped depends on the tool that loads this file; confirm.
DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}'
# When left empty, the model's bundled chat_template is used; to use the
# official tool template, set this to its absolute path.
# CHAT_TEMPLATE=
# enable-chunked-prefill: chunked prefill, which avoids a sudden large spike in memory use.
# max-num-partial-prefills 1: allows at most one request to be in the
# "chunked prefill in progress" state at any moment, which improves stability.
# NOTE(review): no key in the visible part of this file sets these two options;
# presumably they are passed by the launcher. Confirm.
TRUST_REMOTE_CODE=true
# NOTE(review): plaintext credential stored with the config; consider supplying
# it from an untracked file or a secret store instead.
API_KEY=unis123
# Sampling parameters
# TEMPERATURE=1.0
# TOP_P=0.95
# TOP_K=64
# Log directory. The path is relative, so it depends on the launcher's working directory.
LOG_DIR=./logs