yusufs committed
Commit c41cdb4 · Parent: d51e450

feat(max_model_len): reducing max_model_len for T4 support

Files changed (1)

main.py CHANGED (+15, -8)
@@ -40,17 +40,22 @@ engine_llama_3_2: LLM = LLM(
  model='meta-llama/Llama-3.2-3B-Instruct',
  revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
  # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L1062-L1065
- max_num_batched_tokens=512, # Reduced for T4
+ max_num_batched_tokens=32768, # Reduced for T4; must be at least max_model_len
  max_num_seqs=16, # Reduced for T4
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
  tensor_parallel_size=1,
+
  # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
  # 32k tokens is roughly 24k words (about 3/4 of 32k); each page averages 500 (0.5k) words,
  # so that's basically 24k / 0.5k = 48 pages.
  # Because when we use the maximum token length, it is slower and the memory is not enough for a T4.
  # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
  # https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
- # max_model_len=32768,
+ # [rank0]: raise ValueError(
+ # [rank0]: ValueError: The model's max seq len (131072)
+ # is larger than the maximum number of tokens that can be stored in KV cache (57056).
+ # Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
+ max_model_len=32768, # Reduced for T4
  enforce_eager=True, # Disable CUDA graph

  # File "/home/user/.local/lib/python3.12/site-packages/vllm/worker/worker.py",
@@ -59,6 +64,7 @@ engine_llama_3_2: LLM = LLM(
  # Your Tesla T4 GPU has compute capability 7.5.
  # You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.
  dtype='half', # Use 'half' for T4
+ use_cached_outputs=True, # Enable caching
  )

  # ValueError: max_num_batched_tokens (512) is smaller than max_model_len (32768).
@@ -67,13 +73,14 @@ engine_llama_3_2: LLM = LLM(
  engine_sailor_chat: LLM = LLM(
  model='sail/Sailor-4B-Chat',
  revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
- max_num_batched_tokens=512, # Reduced for T4
- max_num_seqs=16, # Reduced for T4
- gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
+ max_num_batched_tokens=32768, # Reduced for T4
+ max_num_seqs=16, # Reduced for T4
+ gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
  tensor_parallel_size=1,
- # max_model_len=32768,
- enforce_eager=True, # Disable CUDA graph
- dtype='half', # Use 'half' for T4
+ max_model_len=32768,
+ enforce_eager=True, # Disable CUDA graph
+ dtype='half', # Use 'half' for T4
+ use_cached_outputs=True, # Enable caching
  )
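
For context, a minimal, self-contained sketch of how the committed settings fit together: the 32k context budget, the page-count estimate from the comment in main.py, and a generation call against the resized engine. The prompt, the sampling parameters, and the printed capacity estimate are illustrative assumptions, not part of this commit; the engine arguments mirror the ones added above, except use_cached_outputs, which is omitted because this sketch only uses arguments that are verifiable against vLLM's public EngineArgs interface.

# Sketch only (not part of this commit): assumes vllm v0.6.x and a single T4-class GPU.
from vllm import LLM, SamplingParams

MAX_MODEL_LEN = 32768  # same value as max_model_len / max_num_batched_tokens in the diff

# Capacity estimate from the comment in main.py: ~3/4 word per token, ~500 words per page.
approx_words = int(MAX_MODEL_LEN * 0.75)   # 24576 words
approx_pages = approx_words // 500         # 49 pages, in line with the "~48 pages" estimate

engine = LLM(
    model='meta-llama/Llama-3.2-3B-Instruct',
    revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
    max_num_batched_tokens=MAX_MODEL_LEN,  # must not be smaller than max_model_len
    max_num_seqs=16,
    gpu_memory_utilization=0.85,
    tensor_parallel_size=1,
    max_model_len=MAX_MODEL_LEN,           # keeps the KV cache within T4 memory
    enforce_eager=True,                    # disable CUDA graphs to save memory
    dtype='half',                          # T4 (compute capability 7.5) has no bfloat16
)

# Prompt tokens plus max_tokens have to fit within max_model_len.
params = SamplingParams(temperature=0.7, max_tokens=512)
outputs = engine.generate(["Summarize this document in one paragraph: ..."], params)
print(f"~{approx_pages} pages of input fit in the {MAX_MODEL_LEN}-token window")
print(outputs[0].outputs[0].text)

The same pattern applies to engine_sailor_chat; only the model and revision change.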