tianshu_vllm/.env

72 lines
1.7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# NOTE(review): presumably the ModelScope download cache; it is the same path as DOWNLOAD_CACHE_DIR.
MODELSCOPE_CACHE=./modelscope_cache
# Model name (customizable).
# NOTE(review): this reads Qwen3-9B while MODEL_DIR and MODEL_SOURCE read Qwen3.5-9B; confirm the difference is intentional.
MODEL_ID=Qwen3-9B
# Path to the model files.
MODEL_DIR=./models/Qwen3.5-9B
# NOTE(review): presumably the address and port the server binds to; confirm against the launcher.
HOST=0.0.0.0
PORT=9527
# GPUs to load the model onto, e.g. 0,1,2,3,4
CUDA_VISIBLE_DEVICES=0
# Number of GPUs used for tensor parallelism.
TENSOR_PARALLEL_SIZE=1
# Context length.
MAX_MODEL_LEN=32768
# Fraction of GPU memory to use. The default is 0.9; surplus GPU memory is
# allocated to the KV cache to support high concurrency.
GPU_MEMORY_UTILIZATION=0.4
# Compute precision.
# DTYPE=bfloat16
# KV cache precision: auto / fp8
# KV_CACHE_DTYPE=auto
# Maximum number of concurrent sequences.
MAX_NUM_SEQS=32
# Maximum number of tokens per batch. Size it for the expected concurrency and
# the context length actually needed; when unset it is allocated automatically.
# NOTE(review): this key used to be spelled MAX_NUM_BATCHED_TOKENSMAX, so the
# limit was probably never applied. Grep the launcher for the old spelling, and
# confirm 16384 is accepted alongside MAX_MODEL_LEN=32768 (vLLM may require
# chunked prefill for a batch limit smaller than the context length).
MAX_NUM_BATCHED_TOKENS=16384
# Other runtime switches.
# Boolean values are written in lowercase, like every other flag in this file.
# NOTE(review): the two flags below look like the same setting with opposite
# polarity; confirm which one the launcher actually reads.
DISABLE_LOG_REQUESTS=false
ENABLE_LOG_REQUESTS=true
# vLLM log level: DEBUG / INFO
VLLM_LOGGING_LEVEL=INFO
# Tool-calling configuration. It must match the model, otherwise tool calls may fail.
ENABLE_AUTO_TOOL_CHOICE=true
TOOL_CALL_PARSER=qwen3_xml
REASONING_PARSER=qwen3
# Thinking-mode switch; noted as not matching Qwen3.5-9B, so it is left disabled.
# DEFAULT_CHAT_TEMPLATE_KWARGS='{"enable_thinking": true}'
# Leave empty to use the model's built-in chat_template; set an absolute path to use the official tool template.
# CHAT_TEMPLATE=
# enable-chunked-prefill: chunked prefill, which avoids a sudden large spike in memory use.
# max-num-partial-prefills 1: allows at most one request in the "chunked prefill in progress" state at any moment, for better stability.
# NOTE(review): no variable in this file sets the two options above; confirm where they are configured.
TRUST_REMOTE_CODE=true
# NOTE(review): a literal credential is committed in this file. Prefer injecting
# it from the deployment environment or a secret store, and rotate this value.
API_KEY=unis123
# Sampling parameters (all commented out).
# TEMPERATURE=1.0
# TOP_P=0.95
# TOP_K=64
# NOTE(review): presumably the directory log files are written to; confirm against the launcher.
LOG_DIR=./logs
# Auto download model when MODEL_DIR is missing on container start.
AUTO_DOWNLOAD_MODEL=true
# NOTE(review): presumably the repository ID to download from; confirm which hub the launcher uses.
MODEL_SOURCE=Qwen/Qwen3.5-9B
# Same path as MODELSCOPE_CACHE.
DOWNLOAD_CACHE_DIR=./modelscope_cache
SKIP_MODEL_DOWNLOAD_IF_EXISTS=true