Rico committed on
Commit aacd42c · 1 Parent(s): 86778cb

[UPDATE] update deploy_guidance

Files changed (1)
  1. docs/deploy_guidance.md +25 -9
docs/deploy_guidance.md CHANGED
@@ -18,7 +18,7 @@ The smallest deployment unit for this version is 16xH20 with either Tensor Paral
 
 ### vLLM Deployment
 
-Please make sure to use the nightly version of vLLM. For details, please refer to the [vllm nightly installation doc](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#pre-built-wheels).
+Please make sure to use a nightly version of vLLM built after this [PR](https://github.com/vllm-project/vllm/pull/21998) is merged. For details, please refer to the [vllm nightly installation doc](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#pre-built-wheels).
 ```bash
 uv pip install -U vllm \
     --torch-backend=auto \
@@ -42,6 +42,7 @@ vllm serve /path/to/step3 \
     --enable-auto-tool-choice \
     --tool-call-parser step3 \
     --trust-remote-code \
+    --max-num-batched-tokens 4096 \
     --port $PORT_SERVING
 ```
 
@@ -58,6 +59,7 @@ vllm serve /path/to/step3 \
     --reasoning-parser step3 \
     --enable-auto-tool-choice \
     --tool-call-parser step3 \
+    --max-num-batched-tokens 4096 \
     --trust-remote-code \
 ```
 
@@ -71,6 +73,7 @@ vllm serve /path/to/step3-fp8 \
     --enable-auto-tool-choice \
     --tool-call-parser step3 \
     --gpu-memory-utilization 0.85 \
+    --max-num-batched-tokens 4096 \
     --trust-remote-code \
 ```
 
@@ -83,6 +86,7 @@ vllm serve /path/to/step3-fp8 \
     --reasoning-parser step3 \
     --enable-auto-tool-choice \
     --tool-call-parser step3 \
+    --max-num-batched-tokens 4096 \
     --trust-remote-code \
 ```
 
@@ -104,15 +108,27 @@ pip3 install "sglang[all]>=0.4.10"
 ##### Tensor Parallelism(Serving on 16xH20):
 
 ```bash
-# start ray on node 0 and node 1
-
-# node 0:
+# node 1
 python -m sglang.launch_server \
-    --model-path /path/to/step3 \
-    --trust-remote-code \
-    --tool-call-parser step3 \
-    --reasoning-parser step3 \
-    --tp 16
+    --model-path stepfun-ai/step3 \
+    --dist-init-addr master_ip:5000 \
+    --trust-remote-code \
+    --tool-call-parser step3 \
+    --reasoning-parser step3 \
+    --tp 16 \
+    --nnodes 2 \
+    --node-rank 0
+
+# node 2
+python -m sglang.launch_server \
+    --model-path stepfun-ai/step3 \
+    --dist-init-addr master_ip:5000 \
+    --trust-remote-code \
+    --tool-call-parser step3 \
+    --reasoning-parser step3 \
+    --tp 16 \
+    --nnodes 2 \
+    --node-rank 1
 ```
 
 #### FP8 Model
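After either server comes up, both vLLM and SGLang expose an OpenAI-compatible HTTP API, so a quick smoke test can confirm the deployment before wiring in clients. The sketch below is a minimal example under assumptions: the server is reachable on `localhost`, `$PORT_SERVING` matches the value passed at launch, and the model id is whatever `/v1/models` reports (it may be the model path rather than a short name).

```bash
# Minimal smoke test: host, port, and model id here are assumptions and
# must match the actual launch command above.

# List the served model id; use the returned "id" in the "model" field below.
curl -s http://localhost:${PORT_SERVING}/v1/models

# Send a simple chat completion request to the OpenAI-compatible endpoint.
curl -s http://localhost:${PORT_SERVING}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "stepfun-ai/step3",
        "messages": [{"role": "user", "content": "Briefly introduce yourself."}],
        "max_tokens": 128
      }'
```

With `--reasoning-parser step3` enabled, the response message may carry the model's reasoning in a separate field alongside the final content.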
 
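Since the launch commands enable `--enable-auto-tool-choice` with `--tool-call-parser step3`, tool calling can be exercised through the same endpoint. This is a hedged example only: the `get_weather` function is invented purely for illustration, and the model id again has to match what `/v1/models` reports.

```bash
# Hypothetical tool-calling check; "get_weather" is a made-up function
# used only to verify that tool calls are parsed and returned.
curl -s http://localhost:${PORT_SERVING}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "stepfun-ai/step3",
        "messages": [{"role": "user", "content": "What is the weather in Paris right now?"}],
        "tools": [{
          "type": "function",
          "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
              "type": "object",
              "properties": {"city": {"type": "string"}},
              "required": ["city"]
            }
          }
        }],
        "tool_choice": "auto"
      }'
```

If the parser is working, the assistant message in the response should contain a `tool_calls` entry rather than plain text content.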