yusufs committed on
Commit
0345d26
·
1 Parent(s): 38d356a

feat(quantization): T4 does not support bfloat16

Browse files
Files changed (2) hide show
  1. run-llama.sh +1 -2
  2. run-sailor.sh +1 -2
run-llama.sh CHANGED
@@ -25,7 +25,6 @@ python -u /app/openai_compatible_api_server.py \
25
  --port 7860 \
26
  --max-num-batched-tokens 32768 \
27
  --max-model-len 32768 \
28
- --dtype bfloat16 \
29
- --kv-cache-dtype fp8 \
30
  --enforce-eager \
31
  --gpu-memory-utilization 0.85
 
25
  --port 7860 \
26
  --max-num-batched-tokens 32768 \
27
  --max-model-len 32768 \
28
+ --dtype float16 \
 
29
  --enforce-eager \
30
  --gpu-memory-utilization 0.85
run-sailor.sh CHANGED
@@ -25,7 +25,6 @@ python -u /app/openai_compatible_api_server.py \
25
  --port 7860 \
26
  --max-num-batched-tokens 32768 \
27
  --max-model-len 32768 \
28
- --dtype bfloat16 \
29
- --kv-cache-dtype fp8 \
30
  --enforce-eager \
31
  --gpu-memory-utilization 0.9
 
25
  --port 7860 \
26
  --max-num-batched-tokens 32768 \
27
  --max-model-len 32768 \
28
+ --dtype float16 \
 
29
  --enforce-eager \
30
  --gpu-memory-utilization 0.9