feat(add-model): always download model during build, it will be cached in the consecutive builds
- Dockerfile +5 -3
- README.md +3 -4
- download_model.py +10 -0
- run-llama.sh +31 -0
- run.sh → run-sailor.sh +6 -16
Dockerfile (CHANGED)

@@ -1,4 +1,4 @@
-FROM python:3.12
+FROM python:3.12.7-slim-bookworm
 
 RUN useradd -m -u 1000 user
 USER user
@@ -27,5 +27,7 @@ EXPOSE 7860
 
 #CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
-RUN chmod +x /app/run.sh
-
+RUN chmod +x /app/run-llama.sh
+RUN chmod +x /app/run-sailor.sh
+
+CMD ["/app/run-sailor.sh"]

README.md (CHANGED)

@@ -21,14 +21,13 @@ poetry export -f requirements.txt --output requirements.txt --without-hashes
 
 > References: https://huggingface.co/spaces/sofianhw/ai/tree/c6527a750644a849b6705bb6fe2fcea4e54a8196
 
-
+This `api_server.py` file is exact copy version from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/entrypoints/openai/api_server.py
+
+Changes (use diff tool to see the exact changes of the file):
 
 * [x] change everything route in api_server.py that start (“/v1/xxx”) to (“/api/v1/xxx”).
   and just run the python api_server.py with arguments. https://discuss.huggingface.co/t/run-vllm-docker-on-space/70228/5?u=yusufs
 
-This `api_server.py` file is exact copy version from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/entrypoints/openai/api_server.py
-
-
 
 ## Documentation about config
 
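Side note (not part of the commit): with the route prefix change described above, the usual vLLM OpenAI-compatible endpoints are served under /api/v1 instead of /v1. A minimal sketch of calling the chat completions route of the running Space, assuming the standard chat-completions path with the /api prefix applied and port 7860 from the run scripts:

    # Sketch only: the /api/v1 prefix and port 7860 are taken from the README
    # and run scripts; the payload follows the standard OpenAI chat format.
    import requests

    resp = requests.post(
        "http://localhost:7860/api/v1/chat/completions",
        json={
            "model": "meta-llama/Llama-3.2-1B-Instruct",
            "messages": [{"role": "user", "content": "Hello!"}],
            "max_tokens": 64,
        },
        timeout=60,
    )
    print(resp.json())
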
download_model.py (CHANGED)

@@ -10,6 +10,16 @@ hf_token = hf_token.strip()
 if hf_token == "":
     raise ValueError("HF_TOKEN is empty")
 
+
+# This is about 2.47 GB
+snapshot_download(
+    repo_id="meta-llama/Llama-3.2-1B-Instruct",
+    revision="9213176726f574b556790deb65791e0c5aa438b6",
+    token=hf_token,
+)
+
+
+# This is about 3.67 GB
 snapshot_download(
     repo_id="sail/Sailor-4B-Chat",
     revision="89a866a7041e6ec023dd462adeca8e28dd53c83e",
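For context on the "cached in the consecutive builds" part of the commit message: huggingface_hub's snapshot_download writes the files into the local Hugging Face cache and returns the snapshot path, so a later call with the same repo_id and pinned revision reuses the already-downloaded files instead of fetching them again. A minimal sketch, mirroring download_model.py rather than extending it:

    import os
    from huggingface_hub import snapshot_download

    # Resolve the pinned revision; a repeat call with the same revision is
    # served from the local Hugging Face cache instead of re-downloading ~2.47 GB.
    hf_token = os.environ.get("HF_TOKEN", "").strip()
    local_path = snapshot_download(
        repo_id="meta-llama/Llama-3.2-1B-Instruct",
        revision="9213176726f574b556790deb65791e0c5aa438b6",
        token=hf_token or None,
    )
    print(local_path)  # path inside the Hugging Face cache directory
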
run-llama.sh (ADDED)

@@ -0,0 +1,31 @@
+#!/bin/sh
+
+
+printf "Running meta-llama/Llama-3.2-1B-Instruct using vLLM OpenAI compatible API Server at port %s\n" "7860"
+
+# Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
+# 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
+# so that's basically 24k / .5k = 24 x 2 =~48 pages.
+# Because when we use maximum token length, it will be slower and the memory is not enough for T4.
+# https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L85-L86
+# https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/config.py#L98-L102
+# [rank0]: raise ValueError(
+# [rank0]: ValueError: The model's max seq len (131072)
+# is larger than the maximum number of tokens that can be stored in KV cache (57056).
+# Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
+#
+# Actually, the meta-llama/Llama-3.2-3B-Instruct rev 0cb88a4f764b7a12671c53f0838cd831a0843b95
+# is enough with T4 16GB, but for the sake of the performance and comparing with the same
+# params with the sail/Sailor-1.8B-Chat, I use the
+# meta-llama/Llama-3.2-1B-Instruct rev 9213176726f574b556790deb65791e0c5aa438b6
+python -u /app/openai_compatible_api_server.py \
+    --model meta-llama/Llama-3.2-1B-Instruct \
+    --revision 9213176726f574b556790deb65791e0c5aa438b6 \
+    --host 0.0.0.0 \
+    --port 7860 \
+    --max-num-batched-tokens 32768 \
+    --max-model-len 32768 \
+    --dtype half \
+    --enforce-eager \
+    --gpu-memory-utilization 0.85
+
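The "~48 pages" figure in the comments above is simple arithmetic on the 32k context window (roughly 3/4 of a word per token, about 500 words per page). A quick check of that estimate, not part of the script:

    # Back-of-the-envelope page estimate for a 32k-token context window.
    max_model_len = 32768
    words = max_model_len * 3 / 4      # ~3/4 word per token
    pages = words / 500                # ~500 words per page
    print(round(pages))                # ~49, i.e. the "~48 pages" in the comment
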
run.sh → run-sailor.sh (RENAMED)

@@ -1,20 +1,9 @@
 #!/bin/sh
 
 
-printf "Running vLLM OpenAI compatible API Server at port %s\n" "7860"
-
-#python -u /app/openai_compatible_api_server.py \
-# --model meta-llama/Llama-3.2-3B-Instruct \
-# --revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 \
-# --host 0.0.0.0 \
-# --port 7860 \
-# --max-num-batched-tokens 32768 \
-# --max-model-len 32768 \
-# --dtype half \
-# --enforce-eager \
-# --gpu-memory-utilization 0.85
-
+printf "Running sail/Sailor-1.8B-Chat using vLLM OpenAI compatible API Server at port %s\n" "7860"
 
+# Using the 1.8B version because the 4B version cannot be loaded due to memory constraints.
 # Reducing max-num-batched-tokens to 7536 because got this error:
 # INFO 11-27 15:32:01 model_runner.py:1077] Loading model weights took 7.4150 GB
 # INFO 11-27 15:32:09 worker.py:232] Memory profiling results: total_gpu_memory=14.58GiB initial_memory_usage=7.61GiB peak_torch_memory=9.31GiB memory_usage_post_profile=7.62GiB non_torch_memory=0.20GiB kv_cache_size=2.88GiB gpu_memory_utilization=0.85
@@ -22,11 +11,12 @@ printf "Running vLLM OpenAI compatible API Server at port %s\n" "7860"
 # INFO 11-27 15:32:10 gpu_executor.py:117] Maximum concurrency for 32768 tokens per request: 0.23x
 # ERROR 11-27 15:32:10 engine.py:366] The model's max seq len (32768) is larger than the maximum number of tokens that can be stored in KV cache (7536). Try increasing `gpu_memory_utilization` or decreasing `max_model_len` when initializing the engine.
 python -u /app/openai_compatible_api_server.py \
-    --model sail/Sailor-
-    --revision
+    --model sail/Sailor-1.8B-Chat \
+    --revision 04b86803d4011d4bfd80f3cd3841b005eb899987 \
     --host 0.0.0.0 \
     --port 7860 \
-    --max-num-batched-tokens
+    --max-num-batched-tokens 32768 \
+    --max-model-len 32768 \
     --dtype half \
     --enforce-eager \
     --gpu-memory-utilization 0.85

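The "Maximum concurrency ... 0.23x" figure quoted in the comments is the KV-cache capacity divided by the requested context length; the ERROR line fires because a single request at max_model_len does not fit (ratio below 1.0). A quick check against the logged numbers, for illustration only:

    # Numbers taken from the INFO/ERROR lines quoted in run-sailor.sh.
    kv_cache_tokens = 7536    # tokens that fit in the KV cache (T4, gpu_memory_utilization=0.85)
    max_model_len = 32768     # requested max sequence length
    print(round(kv_cache_tokens / max_model_len, 2))  # ~0.23, matching the log
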