yusufs committed on
Commit 13a5c22 · 1 Parent(s): 493a5f1

feat(reduce-max-num-batched-tokens): Reduce max-num-batched-tokens even though the error suggests reducing max_model_len

Files changed (2):
  1. download_model.py +4 -0
  2. run.sh +7 -0
download_model.py CHANGED
@@ -1,5 +1,6 @@
 import os
 from huggingface_hub import snapshot_download
+from transformers.utils.hub import move_cache
 
 hf_token: str = os.getenv("HF_TOKEN")
 if hf_token is None:
@@ -14,3 +15,6 @@ snapshot_download(
     revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
     token=hf_token,
 )
+
+# https://github.com/huggingface/transformers/issues/20428
+move_cache()
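
For context, a minimal sketch of what download_model.py looks like after this change. The diff elides the middle of the file, so the repo_id (taken from the --model flag in run.sh) and the error handling for a missing token are assumptions, not the committed code:

import os

from huggingface_hub import snapshot_download
from transformers.utils.hub import move_cache

hf_token: str = os.getenv("HF_TOKEN")
if hf_token is None:
    # Assumed error handling; the committed lines are not shown in the diff.
    raise ValueError("HF_TOKEN is not set")

# Download the model snapshot pinned to a fixed revision.
snapshot_download(
    repo_id="sail/Sailor-4B-Chat",  # assumed from run.sh's --model flag
    revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
    token=hf_token,
)

# One-time migration of the legacy transformers cache layout; see
# https://github.com/huggingface/transformers/issues/20428
move_cache()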
run.sh CHANGED
@@ -15,11 +15,18 @@ printf "Running vLLM OpenAI compatible API Server at port %s\n" "7860"
 # --gpu-memory-utilization 0.85
 
 
+# Reducing max-num-batched-tokens to 7536 because of this error:
+# INFO 11-27 15:32:01 model_runner.py:1077] Loading model weights took 7.4150 GB
+# INFO 11-27 15:32:09 worker.py:232] Memory profiling results: total_gpu_memory=14.58GiB initial_memory_usage=7.61GiB peak_torch_memory=9.31GiB memory_usage_post_profile=7.62GiB non_torch_memory=0.20GiB kv_cache_size=2.88GiB gpu_memory_utilization=0.85
+# INFO 11-27 15:32:10 gpu_executor.py:113] # GPU blocks: 471, # CPU blocks: 655
+# INFO 11-27 15:32:10 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 0.23x
+# ERROR 11-27 15:32:10 engine.py:366] The model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (7536). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
 python -u /app/openai_compatible_api_server.py \
   --model sail/Sailor-4B-Chat \
   --revision 89a866a7041e6ec023dd462adeca8e28dd53c83e \
   --host 0.0.0.0 \
   --port 7860 \
+  --max-num-batched-tokens 7536 \
   --dtype half \
   --enforce-eager \
   --gpu-memory-utilization 0.85
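
As a sanity check on where 7536 comes from: it is the KV-cache capacity that vLLM's profiler reports, i.e. the number of GPU blocks times the tokens per block. The small sketch below redoes that arithmetic with the numbers copied from the log, assuming vLLM's default block size of 16 tokens per block:

# Numbers copied from the error log above.
gpu_blocks = 471         # "# GPU blocks: 471" (gpu_executor.py:113)
block_size = 16          # assumed vLLM default --block-size of 16 tokens per block
max_model_len = 32768    # the model's max seq len from the error message

kv_cache_tokens = gpu_blocks * block_size
print(kv_cache_tokens)                            # 7536, the value passed to --max-num-batched-tokens
print(f"{kv_cache_tokens / max_model_len:.2f}x")  # 0.23x, the reported maximum concurrency

The error itself suggests raising gpu_memory_utilization or lowering max_model_len; this commit instead keeps both as they were and caps --max-num-batched-tokens at the 7536 tokens the KV cache can actually hold.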