winglian committed (unverified)
Commit 36d053f · Parent: af29d81

run PR e2e docker CI tests in Modal (#1217) [skip ci]


* wip modal for ci

* handle falcon layernorms better

* update

* rebuild the template each time with the pseudo-ARGS

* fix ref

* update tests to use modal

* cleanup ci script

* make sure to install jinja2 also

* kickoff the gh action on gh hosted runners and specify num gpus

.github/workflows/tests.yml CHANGED
@@ -58,10 +58,15 @@ jobs:
   docker-e2e-tests:
     if: github.repository_owner == 'OpenAccess-AI-Collective'
     # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, gpu, docker]
+    runs-on: ubuntu-latest
     timeout-minutes: 30
     needs: [pre-commit, pytest]
 
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+      MODAL_ENVIRONMENT: axolotl-ci-cd
+
     strategy:
       fail-fast: false
       matrix:
@@ -70,43 +75,29 @@ jobs:
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.0.1
+           num_gpus: 1
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
+           num_gpus: 1
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Docker metadata
-      id: metadata
-      uses: docker/metadata-action@v5
+    - name: Install Python
+      uses: actions/setup-python@v5
       with:
-        images: winglian/axolotl-tests
-    - name: Build Docker image
-      run: |
-        # Set up build arguments
-        BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
-        CUDA="${{ matrix.cuda }}"
-        PYTORCH_VERSION="${{ matrix.pytorch }}"
-        # Build the Docker image
-        docker build . \
-          --file ./docker/Dockerfile-tests \
-          --build-arg BASE_TAG=$BASE_TAG \
-          --build-arg CUDA=$CUDA \
-          --build-arg GITHUB_REF=$GITHUB_REF \
-          --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
-          --tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
-          --no-cache
-    - name: Unit Tests w docker image
-      run: |
-        docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
-    - name: GPU Unit Tests w docker image
+        python-version: "3.10"
+    - name: Install Modal
       run: |
-        docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
-    - name: GPU Unit Tests monkeypatched w docker image
+        python -m pip install --upgrade pip
+        pip install modal jinja2
+    - name: Update env vars
       run: |
-        docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
-    - name: Prune image from docker
-      if: github.ref != 'refs/heads/main'
+        echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+        echo "PYTORCH_VERSION=${{ matrix.pytorch }}" >> $GITHUB_ENV
+        echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+        echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+    - name: Run training job on Modal
       run: |
-        docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+        modal run cicd.tests
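The `Update env vars` step is the bridge between the build matrix and Modal: values appended to `$GITHUB_ENV` become environment variables for later steps, so `modal run cicd.tests` sees them at import time. A minimal sketch of the receiving side (variable names come from the workflow above; the defaults mirror the ones in `cicd/tests.py` below, so the script also runs locally outside CI):

import os

# In CI these are exported via $GITHUB_ENV by the workflow;
# locally the defaults apply.
BASE_TAG = os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1")
CUDA = os.environ.get("CUDA", "118")
PYTORCH_VERSION = os.environ.get("PYTORCH_VERSION", "2.0.1")
N_GPUS = int(os.environ.get("N_GPUS", 1))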
cicd/Dockerfile.jinja ADDED
@@ -0,0 +1,38 @@
+FROM winglian/axolotl-base:{{ BASE_TAG }}
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV CUDA="{{ CUDA }}"
+ENV BNB_CUDA_VERSION="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+RUN git fetch origin +$GITHUB_REF && \
+    git checkout FETCH_HEAD
+
+# If AXOLOTL_EXTRAS is set, append it in brackets
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
+    else \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
+    fi
+
+# So we can test the Docker image
+RUN pip install pytest
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
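This template replaces the build args of the old `docker/Dockerfile-tests`: the `{{ ... }}` placeholders (the "pseudo-ARGS" from the commit message) are substituted by Jinja before the Dockerfile ever reaches Modal. A minimal standalone sketch of that render step, assuming it is run from the repo root (the real logic lives in `cicd/tests.py` below):

import jinja2

# Bake in the values a Docker build would normally receive via --build-arg.
env = jinja2.Environment(loader=jinja2.FileSystemLoader("cicd"))
template = env.get_template("Dockerfile.jinja")
dockerfile = template.render(
    BASE_TAG="main-base-py3.10-cu118-2.0.1",
    CUDA="118",
    PYTORCH_VERSION="2.0.1",
    AXOLOTL_EXTRAS="",
    GITHUB_REF="refs/heads/main",
    GITHUB_SHA="",
)
print(dockerfile)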
cicd/tests.py ADDED
@@ -0,0 +1,69 @@
+"""
+modal application to run axolotl gpu tests in Modal
+"""
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import Image, Stub
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
+    "CUDA": os.environ.get("CUDA", "118"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+stub = Stub("Axolotl CI/CD", secrets=[])
+
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+
+
+@stub.function(
+    image=cicd_image,
+    gpu=GPU_CONFIG,
+    timeout=60 * 30,
+)
+def cicd_pytest():
+    cmd = "pytest /workspace/axolotl/tests/e2e/patched/"
+    run_cmd(cmd, "/workspace/axolotl")
+
+
+@stub.local_entrypoint()
+def main():
+    cicd_pytest.remote()
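One design note on `run_cmd`: `subprocess.call` returns the command's exit code, and calling `exit()` with a non-zero value inside the remote function makes `modal run` fail, which fails the GitHub job in turn. A hypothetical equivalent (not what the file uses) with the same effect, via an exception instead of an explicit exit:

import subprocess

def run_cmd_checked(cmd: str, run_folder: str) -> None:
    # check=True raises CalledProcessError on a non-zero exit code,
    # and an exception raised in a Modal function fails `modal run`.
    subprocess.run(cmd.split(), cwd=run_folder, check=True)  # nosec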
docker/{Dockerfile-tests → Dockerfile-modal} RENAMED
@@ -1,14 +1,11 @@
-ARG BASE_TAG=main-base
-FROM winglian/axolotl-base:$BASE_TAG
-
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ARG AXOLOTL_EXTRAS=""
-ARG CUDA="118"
-ENV BNB_CUDA_VERSION=$CUDA
-ARG PYTORCH_VERSION="2.0.1"
-ARG GITHUB_REF="main"
-
-ENV PYTORCH_VERSION=$PYTORCH_VERSION
+FROM winglian/axolotl-base:main-base
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ENV AXOLOTL_EXTRAS=""
+ENV CUDA="118"
+ENV BNB_CUDA_VERSION="118"
+ENV PYTORCH_VERSION="2.0.1"
+ENV GITHUB_REF="main"
 
 RUN apt-get update && \
     apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
+jinja2
 packaging==23.2
 peft==0.7.1
 transformers==4.37.0
src/axolotl/utils/models.py CHANGED
@@ -645,7 +645,10 @@ def load_model(
     if not cfg.fsdp:
         # FSDP doesn't like mixed Float and BFloat16
         for name, module in model.named_modules():
-            if any(m in name for m in ["norm", "gate"]):
+            if (
+                any(m in name for m in ["norm", "gate"])
+                or "LayerNorm" in module.__class__.__name__
+            ):
                 module.to(torch.float32)
     if model_config.model_type == "btlm":
         # don't upcast lm_head for btlm
@@ -684,7 +687,7 @@ def load_model(
     if needs_fa2_dtype or cfg.flash_attention:
         LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
         for name, module in model.named_modules():
-            if "norm" in name:
+            if "norm" in name or "LayerNorm" in module.__class__.__name__:
                 module.to(cfg.torch_dtype)
             if any(m in name for m in embedding_modules):
                 if hasattr(module, "weight"):
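This is the "handle falcon layernorms better" change: Falcon's newer decoder blocks name their layernorms `ln_attn` and `ln_mlp` (plus the final `ln_f`), and none of those contain the substring "norm", so the old name-only test skipped them and left them in a mixed dtype. Matching on the module's class name catches them regardless of naming convention. A toy sketch of the difference (the module names are illustrative, following Falcon's convention):

import torch
from torch import nn

class ToyFalconBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.ln_attn = nn.LayerNorm(16)          # missed by the old name-only check
        self.input_layernorm = nn.LayerNorm(16)  # llama-style name, caught either way

model = ToyFalconBlock().to(torch.bfloat16)
for name, module in model.named_modules():
    by_name = any(m in name for m in ["norm", "gate"])
    by_class = "LayerNorm" in module.__class__.__name__
    print(f"{name or '<root>'}: name-match={by_name}, class-match={by_class}")
    if by_name or by_class:  # the broadened condition from the diff
        module.to(torch.float32)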