Spaces:
Paused
Paused
feat(quantization): T4 does not support bfloat16
Browse files
- run-llama.sh +1 -2
- run-sailor.sh +1 -2
run-llama.sh
CHANGED
@@ -25,7 +25,6 @@ python -u /app/openai_compatible_api_server.py \
|
|
25 |
--port 7860 \
|
26 |
--max-num-batched-tokens 32768 \
|
27 |
--max-model-len 32768 \
|
28 |
-
--dtype
|
29 |
-
--kv-cache-dtype fp8 \
|
30 |
--enforce-eager \
|
31 |
--gpu-memory-utilization 0.85
|
|
|
25 |
--port 7860 \
|
26 |
--max-num-batched-tokens 32768 \
|
27 |
--max-model-len 32768 \
|
28 |
+
--dtype float16 \
|
|
|
29 |
--enforce-eager \
|
30 |
--gpu-memory-utilization 0.85
|
run-sailor.sh
CHANGED
@@ -25,7 +25,6 @@ python -u /app/openai_compatible_api_server.py \
|
|
25 |
--port 7860 \
|
26 |
--max-num-batched-tokens 32768 \
|
27 |
--max-model-len 32768 \
|
28 |
-
--dtype
|
29 |
-
--kv-cache-dtype fp8 \
|
30 |
--enforce-eager \
|
31 |
--gpu-memory-utilization 0.9
|
|
|
25 |
--port 7860 \
|
26 |
--max-num-batched-tokens 32768 \
|
27 |
--max-model-len 32768 \
|
28 |
+
--dtype float16 \
|
|
|
29 |
--enforce-eager \
|
30 |
--gpu-memory-utilization 0.9
|