run tests again on Modal (#1289) [skip ci]
Browse files* run tests again on Modal
* make sure to run the full suite of tests on modal
* run cicd steps via shell script
* run tests in different runs
* increase timeout
* split tests into steps on modal
* increase workflow timeout
* retry doing this with only a single script
* fix yml launch for modal ci
* reorder tests to run on modal
* skip dpo tests on modal
* run on L4s, A10G takes too long
* increase CPU and RAM for modal test
* run modal tests on A100s
* skip phi test on modal
* env not arg in modal dockerfile
* upgrade pydantic and fastapi for modal tests
* cleanup stray character
* use A10s instead of A100 for modal
- .github/workflows/tests.yml +18 -33
- cicd/Dockerfile.jinja +39 -0
- cicd/cicd.sh +5 -0
- cicd/tests.py +75 -0
- requirements.txt +1 -1
- tests/e2e/test_dpo.py +3 -0
- tests/e2e/test_phi.py +3 -0
.github/workflows/tests.yml
CHANGED
@@ -58,8 +58,8 @@ jobs:
|
|
58 |
docker-e2e-tests:
|
59 |
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
60 |
# this job needs to be run on self-hosted GPU runners...
|
61 |
-
runs-on: [self-hosted,
|
62 |
-
timeout-minutes:
|
63 |
needs: [pre-commit, pytest]
|
64 |
|
65 |
strategy:
|
@@ -71,45 +71,30 @@ jobs:
|
|
71 |
python_version: "3.10"
|
72 |
pytorch: 2.1.2
|
73 |
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
|
|
|
74 |
- cuda: 121
|
75 |
cuda_version: 12.1.0
|
76 |
python_version: "3.10"
|
77 |
pytorch: 2.1.2
|
|
|
78 |
steps:
|
79 |
- name: Checkout
|
80 |
uses: actions/checkout@v4
|
81 |
-
- name:
|
82 |
-
|
83 |
-
uses: docker/metadata-action@v5
|
84 |
with:
|
85 |
-
|
86 |
-
- name:
|
87 |
-
run: |
|
88 |
-
# Set up build arguments
|
89 |
-
BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
|
90 |
-
CUDA="${{ matrix.cuda }}"
|
91 |
-
AXOLOTL_ARGS="${{ matrix.axolotl_args }}"
|
92 |
-
PYTORCH_VERSION="${{ matrix.pytorch }}"
|
93 |
-
# Build the Docker image
|
94 |
-
docker build . \
|
95 |
-
--file ./docker/Dockerfile-tests \
|
96 |
-
--build-arg BASE_TAG=$BASE_TAG \
|
97 |
-
--build-arg AXOLOTL_ARGS="$AXOLOTL_ARGS" \
|
98 |
-
--build-arg CUDA=$CUDA \
|
99 |
-
--build-arg GITHUB_REF=$GITHUB_REF \
|
100 |
-
--build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
|
101 |
-
--tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
|
102 |
-
--no-cache
|
103 |
-
- name: Unit Tests w docker image
|
104 |
-
run: |
|
105 |
-
docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
|
106 |
-
- name: GPU Unit Tests w docker image
|
107 |
run: |
|
108 |
-
|
109 |
-
|
|
|
110 |
run: |
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
114 |
run: |
|
115 |
-
|
|
|
58 |
docker-e2e-tests:
|
59 |
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
60 |
# this job needs to be run on self-hosted GPU runners...
|
61 |
+
runs-on: [self-hosted, modal]
|
62 |
+
timeout-minutes: 60
|
63 |
needs: [pre-commit, pytest]
|
64 |
|
65 |
strategy:
|
|
|
71 |
python_version: "3.10"
|
72 |
pytorch: 2.1.2
|
73 |
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
|
74 |
+
num_gpus: 1
|
75 |
- cuda: 121
|
76 |
cuda_version: 12.1.0
|
77 |
python_version: "3.10"
|
78 |
pytorch: 2.1.2
|
79 |
+
num_gpus: 1
|
80 |
steps:
|
81 |
- name: Checkout
|
82 |
uses: actions/checkout@v4
|
83 |
+
- name: Install Python
|
84 |
+
uses: actions/setup-python@v5
|
|
|
85 |
with:
|
86 |
+
python-version: "3.10"
|
87 |
+
- name: Install Modal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
run: |
|
89 |
+
python -m pip install --upgrade pip
|
90 |
+
pip install modal jinja2
|
91 |
+
- name: Update env vars
|
92 |
run: |
|
93 |
+
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
94 |
+
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
95 |
+
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
96 |
+
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
97 |
+
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
98 |
+
- name: Run tests job on Modal
|
99 |
run: |
|
100 |
+
modal run cicd.tests
|
cicd/Dockerfile.jinja
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM winglian/axolotl-base:{{ BASE_TAG }}
|
2 |
+
|
3 |
+
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
|
4 |
+
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
|
5 |
+
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
|
6 |
+
ENV CUDA="{{ CUDA }}"
|
7 |
+
ENV BNB_CUDA_VERSION="{{ CUDA }}"
|
8 |
+
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
|
9 |
+
ENV GITHUB_REF="{{ GITHUB_REF }}"
|
10 |
+
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
|
11 |
+
|
12 |
+
RUN apt-get update && \
|
13 |
+
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
|
14 |
+
|
15 |
+
WORKDIR /workspace
|
16 |
+
|
17 |
+
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
18 |
+
|
19 |
+
WORKDIR /workspace/axolotl
|
20 |
+
|
21 |
+
RUN git fetch origin +$GITHUB_REF && \
|
22 |
+
git checkout FETCH_HEAD
|
23 |
+
|
24 |
+
# If AXOLOTL_EXTRAS is set, append it in brackets
|
25 |
+
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
26 |
+
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
27 |
+
else \
|
28 |
+
pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
|
29 |
+
fi
|
30 |
+
|
31 |
+
# So we can test the Docker image
|
32 |
+
RUN pip install pytest
|
33 |
+
|
34 |
+
# fix so that git fetch/pull from remote works
|
35 |
+
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
|
36 |
+
git config --get remote.origin.fetch
|
37 |
+
|
38 |
+
# helper for huggingface-login cli
|
39 |
+
RUN git config --global credential.helper store
|
cicd/cicd.sh
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
|
4 |
+
pytest /workspace/axolotl/tests/e2e/patched/
|
5 |
+
pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
|
cicd/tests.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
modal application to run axolotl gpu tests in Modal
|
3 |
+
"""
|
4 |
+
import os
|
5 |
+
import pathlib
|
6 |
+
import tempfile
|
7 |
+
|
8 |
+
import jinja2
|
9 |
+
import modal
|
10 |
+
from jinja2 import select_autoescape
|
11 |
+
from modal import Image, Stub
|
12 |
+
|
13 |
+
cicd_path = pathlib.Path(__file__).parent.resolve()
|
14 |
+
|
15 |
+
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
16 |
+
template_env = jinja2.Environment(
|
17 |
+
loader=template_loader, autoescape=select_autoescape()
|
18 |
+
)
|
19 |
+
df_template = template_env.get_template("Dockerfile.jinja")
|
20 |
+
|
21 |
+
df_args = {
|
22 |
+
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
23 |
+
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
24 |
+
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
|
25 |
+
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
|
26 |
+
"CUDA": os.environ.get("CUDA", "118"),
|
27 |
+
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
28 |
+
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
29 |
+
}
|
30 |
+
|
31 |
+
dockerfile_contents = df_template.render(**df_args)
|
32 |
+
|
33 |
+
temp_dir = tempfile.mkdtemp()
|
34 |
+
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
35 |
+
f.write(dockerfile_contents)
|
36 |
+
|
37 |
+
cicd_image = (
|
38 |
+
Image.from_dockerfile(
|
39 |
+
pathlib.Path(temp_dir) / "Dockerfile",
|
40 |
+
force_build=True,
|
41 |
+
gpu="A10G",
|
42 |
+
)
|
43 |
+
.env(df_args)
|
44 |
+
.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
|
45 |
+
)
|
46 |
+
|
47 |
+
stub = Stub("Axolotl CI/CD", secrets=[])
|
48 |
+
|
49 |
+
|
50 |
+
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
51 |
+
GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
|
52 |
+
|
53 |
+
|
54 |
+
def run_cmd(cmd: str, run_folder: str):
|
55 |
+
import subprocess # nosec
|
56 |
+
|
57 |
+
# Propagate errors from subprocess.
|
58 |
+
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
59 |
+
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
60 |
+
|
61 |
+
|
62 |
+
@stub.function(
|
63 |
+
image=cicd_image,
|
64 |
+
gpu=GPU_CONFIG,
|
65 |
+
timeout=45 * 60,
|
66 |
+
cpu=8.0,
|
67 |
+
memory=131072,
|
68 |
+
)
|
69 |
+
def cicd_pytest():
|
70 |
+
run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
|
71 |
+
|
72 |
+
|
73 |
+
@stub.local_entrypoint()
|
74 |
+
def main():
|
75 |
+
cicd_pytest.remote()
|
requirements.txt
CHANGED
@@ -6,7 +6,7 @@ tokenizers==0.15.0
|
|
6 |
bitsandbytes>=0.41.1
|
7 |
accelerate==0.26.1
|
8 |
deepspeed==0.13.1
|
9 |
-
pydantic
|
10 |
addict
|
11 |
fire
|
12 |
PyYAML>=6.0
|
|
|
6 |
bitsandbytes>=0.41.1
|
7 |
accelerate==0.26.1
|
8 |
deepspeed==0.13.1
|
9 |
+
pydantic==2.6.3
|
10 |
addict
|
11 |
fire
|
12 |
PyYAML>=6.0
|
tests/e2e/test_dpo.py
CHANGED
@@ -7,6 +7,8 @@ import os
|
|
7 |
import unittest
|
8 |
from pathlib import Path
|
9 |
|
|
|
|
|
10 |
from axolotl.cli import load_rl_datasets
|
11 |
from axolotl.common.cli import TrainerCliArgs
|
12 |
from axolotl.train import train
|
@@ -19,6 +21,7 @@ LOG = logging.getLogger("axolotl.tests.e2e")
|
|
19 |
os.environ["WANDB_DISABLED"] = "true"
|
20 |
|
21 |
|
|
|
22 |
class TestDPOLlamaLora(unittest.TestCase):
|
23 |
"""
|
24 |
Test case for DPO Llama models using LoRA
|
|
|
7 |
import unittest
|
8 |
from pathlib import Path
|
9 |
|
10 |
+
import pytest
|
11 |
+
|
12 |
from axolotl.cli import load_rl_datasets
|
13 |
from axolotl.common.cli import TrainerCliArgs
|
14 |
from axolotl.train import train
|
|
|
21 |
os.environ["WANDB_DISABLED"] = "true"
|
22 |
|
23 |
|
24 |
+
@pytest.mark.skip(reason="doesn't seem to work on modal")
|
25 |
class TestDPOLlamaLora(unittest.TestCase):
|
26 |
"""
|
27 |
Test case for DPO Llama models using LoRA
|
tests/e2e/test_phi.py
CHANGED
@@ -7,6 +7,8 @@ import os
|
|
7 |
import unittest
|
8 |
from pathlib import Path
|
9 |
|
|
|
|
|
10 |
from axolotl.cli import load_datasets
|
11 |
from axolotl.common.cli import TrainerCliArgs
|
12 |
from axolotl.train import train
|
@@ -19,6 +21,7 @@ LOG = logging.getLogger("axolotl.tests.e2e")
|
|
19 |
os.environ["WANDB_DISABLED"] = "true"
|
20 |
|
21 |
|
|
|
22 |
class TestPhi(unittest.TestCase):
|
23 |
"""
|
24 |
Test case for Phi2 models
|
|
|
7 |
import unittest
|
8 |
from pathlib import Path
|
9 |
|
10 |
+
import pytest
|
11 |
+
|
12 |
from axolotl.cli import load_datasets
|
13 |
from axolotl.common.cli import TrainerCliArgs
|
14 |
from axolotl.train import train
|
|
|
21 |
os.environ["WANDB_DISABLED"] = "true"
|
22 |
|
23 |
|
24 |
+
@pytest.mark.skip(reason="doesn't seem to work on modal")
|
25 |
class TestPhi(unittest.TestCase):
|
26 |
"""
|
27 |
Test case for Phi2 models
|