yusufs committed
Commit c41cdb4 · Parent: d51e450

feat(max_model_len): reducing max_model_len for T4 support

Files changed (1)

main.py CHANGED (+15, -8)
@@ -40,17 +40,22 @@ engine_llama_3_2: LLM = LLM(
  model='meta-llama/Llama-3.2-3B-Instruct',
  revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
  # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
- max_num_batched_tokens=512, # Reduced for T4
+ max_num_batched_tokens=32768, # Reduced for T4; must be at least max_model_len
  max_num_seqs=16, # Reduced for T4
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
  tensor_parallel_size=1,
+
  # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
  # 32k tokens is roughly 24k words (about 3/4 of 32k); each page averages 500 (0.5k) words,
  # so that's basically 24k / 0.5k = 48 pages.
  # Because when we use the maximum token length, it is slower and the memory is not enough for a T4.
  # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
  # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
- # max_model_len=32768,
+ # [rank0]: raise ValueError(
+ # [rank0]: ValueError: The model's max seq len (131072)
+ # is larger than the maximum number of tokens that can be stored in KV cache (57056).
+ # Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
+ max_model_len=32768, # Reduced for T4
  enforce_eager=True, # Disable CUDA graph

  # File "/home/user/.local/lib/python3.12/site-packages/vllm/worker/worker.py",
@@ -59,6 +64,7 @@ engine_llama_3_2: LLM = LLM(
  # Your Tesla T4 GPU has compute capability 7.5.
  # You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.
  dtype='half', # Use 'half' for T4
+ use_cached_outputs=True, # Enable caching
  )

  # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
@@ -67,13 +73,14 @@ engine_llama_3_2: LLM = LLM(
  engine_sailor_chat: LLM = LLM(
  model='sail/Sailor-4B-Chat',
  revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
- max_num_batched_tokens=512, # Reduced for T4
- max_num_seqs=16, # Reduced for T4
- gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
+ max_num_batched_tokens=32768, # Reduced for T4
+ max_num_seqs=16, # Reduced for T4
+ gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
  tensor_parallel_size=1,
- # max_model_len=32768,
- enforce_eager=True, # Disable CUDA graph
- dtype='half', # Use 'half' for T4
+ max_model_len=32768,
+ enforce_eager=True, # Disable CUDA graph
+ dtype='half', # Use 'half' for T4
+ use_cached_outputs=True, # Enable caching
  )
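
For context, a minimal, self-contained sketch of how the committed settings fit together: the 32k context budget, the page-count estimate from the comment in main.py, and a generation call against the resized engine. The prompt, the sampling parameters, and the printed capacity estimate are illustrative assumptions, not part of this commit; the engine arguments mirror the ones added above, except use_cached_outputs, which is omitted because this sketch only uses arguments that are verifiable against vLLM's public EngineArgs interface.

# Sketch only (not part of this commit): assumes vllm v0.6.x and a single T4-class GPU.
from vllm import LLM, SamplingParams

MAX_MODEL_LEN = 32768  # same value as max_model_len / max_num_batched_tokens in the diff

# Capacity estimate from the comment in main.py: ~3/4 word per token, ~500 words per page.
approx_words = int(MAX_MODEL_LEN * 0.75)   # 24576 words
approx_pages = approx_words // 500         # 49 pages, in line with the "~48 pages" estimate

engine = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
    max_num_batched_tokens=MAX_MODEL_LEN,  # must not be smaller than max_model_len
    max_num_seqs=16,
    gpu_memory_utilization=0.85,
    tensor_parallel_size=1,
    max_model_len=MAX_MODEL_LEN,           # keeps the KV cache within T4 memory
    enforce_eager=True,                    # disable CUDA graphs to save memory
    dtype='half',                          # T4 (compute capability 7.5) has no bfloat16
)

# Prompt tokens plus max_tokens have to fit within max_model_len.
params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = engine.generate(["Summarize this document in one paragraph: ..."], params)
print(f"~{approx_pages} pages of input fit in the {MAX_MODEL_LEN}-token window")
print(outputs[0].outputs[0].text)

The same pattern applies to engine_sailor_chat; only the model and revision change.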