Spaces:
Sleeping
Sleeping
Hugo Flores
committed on
Commit
·
50f034f
0
Parent(s):
first commit
Browse files- .dockerignore +2 -0
- .gitattributes +4 -0
- .gitignore +173 -0
- .pre-commit-config.yaml +15 -0
- Dockerfile +36 -0
- README.md +287 -0
- changelog.md +11 -0
- conf/vampnet-c2f.yml +9 -0
- conf/vampnet.yml +56 -0
- docker-compose.yml +90 -0
- env/alias.sh +3 -0
- env/data.sh +36 -0
- env/entry_script.sh +41 -0
- env/setup.py +123 -0
- requirements.txt +29 -0
- scripts/generative/eval.py +124 -0
- scripts/generative/train.py +662 -0
- scripts/utils/README.md +28 -0
- scripts/utils/stage.py +30 -0
- setup.py +40 -0
- vampnet/__init__.py +6 -0
- vampnet/enchilada.py +179 -0
- vampnet/modules/__init__.py +4 -0
- vampnet/modules/base.py +461 -0
- vampnet/modules/modules.py +168 -0
- vampnet/modules/transformer.py +606 -0
- vampnet/scheduler.py +47 -0
.dockerignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*.wav
|
2 |
+
runs/
|
.gitattributes
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
2 |
+
tests/assets/cpu_test_file.pt filter=lfs diff=lfs merge=lfs -text
|
3 |
+
tests/assets/quick.pth filter=lfs diff=lfs merge=lfs -text
|
4 |
+
tests/assets/slow.pth filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
pip-wheel-metadata/
|
24 |
+
share/python-wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
# Usually these files are written by a python script from a template
|
32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
+
*.manifest
|
34 |
+
*.spec
|
35 |
+
|
36 |
+
# Installer logs
|
37 |
+
pip-log.txt
|
38 |
+
pip-delete-this-directory.txt
|
39 |
+
|
40 |
+
# Unit test / coverage reports
|
41 |
+
htmlcov/
|
42 |
+
.tox/
|
43 |
+
.nox/
|
44 |
+
.coverage
|
45 |
+
.coverage.*
|
46 |
+
.cache
|
47 |
+
nosetests.xml
|
48 |
+
coverage.xml
|
49 |
+
*.cover
|
50 |
+
*.py,cover
|
51 |
+
.hypothesis/
|
52 |
+
.pytest_cache/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
.python-version
|
86 |
+
|
87 |
+
# pipenv
|
88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
91 |
+
# install all needed dependencies.
|
92 |
+
#Pipfile.lock
|
93 |
+
|
94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
95 |
+
__pypackages__/
|
96 |
+
|
97 |
+
# Celery stuff
|
98 |
+
celerybeat-schedule
|
99 |
+
celerybeat.pid
|
100 |
+
|
101 |
+
# SageMath parsed files
|
102 |
+
*.sage.py
|
103 |
+
|
104 |
+
# Environments
|
105 |
+
.env
|
106 |
+
.venv
|
107 |
+
env/env.sh
|
108 |
+
venv/
|
109 |
+
env.bak/
|
110 |
+
venv.bak/
|
111 |
+
|
112 |
+
# Spyder project settings
|
113 |
+
.spyderproject
|
114 |
+
.spyproject
|
115 |
+
|
116 |
+
# Rope project settings
|
117 |
+
.ropeproject
|
118 |
+
|
119 |
+
# mkdocs documentation
|
120 |
+
/site
|
121 |
+
|
122 |
+
# mypy
|
123 |
+
.mypy_cache/
|
124 |
+
.dmypy.json
|
125 |
+
dmypy.json
|
126 |
+
|
127 |
+
# Pyre type checker
|
128 |
+
.pyre/
|
129 |
+
|
130 |
+
# Files created by experiments
|
131 |
+
output/
|
132 |
+
snapshot/
|
133 |
+
*.m4a
|
134 |
+
*.wav
|
135 |
+
notebooks/scratch.ipynb
|
136 |
+
notebooks/inspect.ipynb
|
137 |
+
notebooks/effects.ipynb
|
138 |
+
notebooks/*.ipynb
|
139 |
+
notebooks/*.gif
|
140 |
+
notebooks/*.wav
|
141 |
+
notebooks/*.mp4
|
142 |
+
*runs/
|
143 |
+
boards/
|
144 |
+
samples/
|
145 |
+
*.ipynb
|
146 |
+
|
147 |
+
results.json
|
148 |
+
metrics.csv
|
149 |
+
mprofile_*
|
150 |
+
mem.png
|
151 |
+
|
152 |
+
results/
|
153 |
+
mprofile*
|
154 |
+
*.png
|
155 |
+
# do not ignore the test wav file
|
156 |
+
!tests/audio/short_test_audio.wav
|
157 |
+
!tests/audio/output.wav
|
158 |
+
*/.DS_Store
|
159 |
+
.DS_Store
|
160 |
+
env.sh
|
161 |
+
_codebraid/
|
162 |
+
**/*.html
|
163 |
+
**/*.exec.md
|
164 |
+
flagged/
|
165 |
+
log.txt
|
166 |
+
ckpt/
|
167 |
+
.syncthing*
|
168 |
+
tests/assets/
|
169 |
+
archived/
|
170 |
+
|
171 |
+
scratch/
|
172 |
+
|
173 |
+
runs-archive
|
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
repos:
|
2 |
+
- repo: https://github.com/asottile/reorder_python_imports
|
3 |
+
rev: v2.5.0
|
4 |
+
hooks:
|
5 |
+
- id: reorder-python-imports
|
6 |
+
- repo: https://github.com/psf/black
|
7 |
+
rev: 23.1.0
|
8 |
+
hooks:
|
9 |
+
- id: black
|
10 |
+
language_version: python3
|
11 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
12 |
+
rev: v4.0.1
|
13 |
+
hooks:
|
14 |
+
- id: end-of-file-fixer
|
15 |
+
- id: trailing-whitespace
|
Dockerfile
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM us.gcr.io/lyrebird-research/research-image/audio:beta
|
2 |
+
|
3 |
+
COPY requirements.txt requirements.txt
|
4 |
+
ARG GITHUB_TOKEN
|
5 |
+
RUN echo machine github.com login ${GITHUB_TOKEN} > ~/.netrc
|
6 |
+
|
7 |
+
COPY env/alias.sh /alias.sh
|
8 |
+
COPY env/entry_script.sh /entry_script.sh
|
9 |
+
RUN cat /alias.sh >> ~/.zshrc
|
10 |
+
|
11 |
+
# USER researcher
|
12 |
+
RUN pip install --upgrade -r requirements.txt
|
13 |
+
RUN pip install --upgrade tensorflow
|
14 |
+
RUN pip install --upgrade librosa
|
15 |
+
RUN pip install --upgrade numba
|
16 |
+
ENV PYTHONPATH "$PYTHONPATH:/u/home/src"
|
17 |
+
ENV NUMBA_CACHE_DIR=/tmp/
|
18 |
+
|
19 |
+
USER root
|
20 |
+
RUN wget https://github.com/jgm/pandoc/releases/download/2.18/pandoc-2.18-1-amd64.deb
|
21 |
+
RUN dpkg -i pandoc-2.18-1-amd64.deb
|
22 |
+
RUN apt-get update && apt-get install task-spooler
|
23 |
+
|
24 |
+
RUN head -n -1 /entry_script.sh > /entry_script_jupyter.sh
|
25 |
+
RUN head -n -1 /entry_script.sh > /entry_script_tensorboard.sh
|
26 |
+
RUN head -n -1 /entry_script.sh > /entry_script_gradio.sh
|
27 |
+
|
28 |
+
RUN echo \
|
29 |
+
'su -p ${USER} -c "source ~/.zshrc && jupyter lab --ip=0.0.0.0"' >> \
|
30 |
+
/entry_script_jupyter.sh
|
31 |
+
RUN echo \
|
32 |
+
'su -p ${USER} -c "source ~/.zshrc && tensorboard --logdir=$TENSORBOARD_PATH --samples_per_plugin audio=500 --bind_all"' >> \
|
33 |
+
/entry_script_tensorboard.sh
|
34 |
+
RUN echo \
|
35 |
+
'su -p ${USER} -c "source ~/.zshrc && python app.py --args.load=conf/app.yml"' >> \
|
36 |
+
/entry_script_gradio.sh
|
README.md
ADDED
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Lyrebird Wav2Wav
|
2 |
+
|
3 |
+
This repository contains recipes for training Wav2Wav models.
|
4 |
+
|
5 |
+
## Install hooks
|
6 |
+
|
7 |
+
First install the pre-commit util:
|
8 |
+
|
9 |
+
https://pre-commit.com/#install
|
10 |
+
|
11 |
+
pip install pre-commit # with pip
|
12 |
+
brew install pre-commit # on Mac
|
13 |
+
|
14 |
+
Then install the git hooks
|
15 |
+
|
16 |
+
pre-commit install
|
17 |
+
# check .pre-commit-config.yaml for details of hooks
|
18 |
+
|
19 |
+
Upon `git commit`, the pre-commit hooks will be run automatically on the staged files (i.e. those added by `git add`).
|
20 |
+
|
21 |
+
**N.B. By default, pre-commit checks only run on staged files**
|
22 |
+
|
23 |
+
If you need to run it on all files:
|
24 |
+
|
25 |
+
pre-commit run --all-files
|
26 |
+
|
27 |
+
## Usage & model zoo
|
28 |
+
|
29 |
+
To download the model, one must be authenticated to the `lyrebird-research` project on Google Cloud.
|
30 |
+
To see all available models, run
|
31 |
+
|
32 |
+
```bash
|
33 |
+
python -m wav2wav.list_models
|
34 |
+
```
|
35 |
+
|
36 |
+
which outputs something like this:
|
37 |
+
|
38 |
+
```
|
39 |
+
gs://research-models/wav2wav
|
40 |
+
└── prod
|
41 |
+
└── v3
|
42 |
+
└── ckpt
|
43 |
+
├── best
|
44 |
+
│ └── generator
|
45 |
+
│ ├── ❌ model.onnx
|
46 |
+
│ ├── ❌ nvidia_geforce_rtx_2080_ti_11_7.trt
|
47 |
+
│ ├── ✅ package.pth
|
48 |
+
│ ├── ❌ tesla_t4_11_7.trt
|
49 |
+
│ └── ✅ weights.pth
|
50 |
+
└── latest
|
51 |
+
└── generator
|
52 |
+
├── ❌ package.pth
|
53 |
+
└── ❌ weights.pth
|
54 |
+
└── v2
|
55 |
+
...
|
56 |
+
└── dev
|
57 |
+
...
|
58 |
+
```
|
59 |
+
|
60 |
+
This will show all the models that are available on GCP. Models that are available locally are marked with a ✅, while those not available locally
|
61 |
+
are marked with ❌. `.onnx` indicates a model that must be run with
|
62 |
+
the `ONNX` runtime, while `.trt` indicates models that have been optimized
|
63 |
+
with TensorRT. Note that TensorRT models are specific to GPU and CUDA
|
64 |
+
runtime, and their file names indicate what to use to run them.
|
65 |
+
|
66 |
+
`package.pth` is a version of the model that is saved using `torch.package`,
|
67 |
+
and contains a copy of the model code within it, which allows it to work
|
68 |
+
even if the model code in `wav2wav/modules/generator.py` changes. `weights.pth`
|
69 |
+
contains the model weights, and the code must match the code used
|
70 |
+
to create the model.
|
71 |
+
|
72 |
+
To use a model from this list, simply write its path and give it to the `enhance` script,
|
73 |
+
like so:
|
74 |
+
|
75 |
+
```
|
76 |
+
python -m wav2wav.interface \
|
77 |
+
[input_path]
|
78 |
+
--model_path=prod/v3/ckpt/best/generator/weights.pth
|
79 |
+
--output_path [output_path]
|
80 |
+
```
|
81 |
+
|
82 |
+
Models are downloaded to the location set by the environment variable `MODEL_LOCAL_PATH`, which defaults to `~/.wav2wav/models`. Similarly,
|
83 |
+
the model bucket is determined by `MODEL_GCS_PATH` and defaults to
|
84 |
+
`gs://research-models/wav2wav/`.
|
85 |
+
|
86 |
+
## Development
|
87 |
+
### Setting everything up
|
88 |
+
|
89 |
+
Run the setup script to set up your environment via:
|
90 |
+
|
91 |
+
```bash
|
92 |
+
python env/setup.py
|
93 |
+
```
|
94 |
+
|
95 |
+
The setup script does not require any dependencies beyond just Python.
|
96 |
+
Once run, follow the instructions it prints out to create your
|
97 |
+
environment file, which will be at `env/env.sh`.
|
98 |
+
|
99 |
+
Note that if this is a new machine, and
|
100 |
+
the data is not downloaded somewhere on it already, it will ask you
|
101 |
+
for a directory to download the data to.
|
102 |
+
|
103 |
+
For Github setup, if you don't have a .netrc token, create one by going to your Github profile -> Developer settings -> Personal access tokens -> Generate new token. Copy the token and [keep it secret, keep it safe](https://www.youtube.com/watch?v=iThtELZvfPs).
|
104 |
+
|
105 |
+
When complete, run:
|
106 |
+
|
107 |
+
```bash
|
108 |
+
source env/env.sh
|
109 |
+
```
|
110 |
+
|
111 |
+
Now build and launch the Docker containers:
|
112 |
+
|
113 |
+
```bash
|
114 |
+
docker compose up -d
|
115 |
+
```
|
116 |
+
|
117 |
+
This builds and runs a Jupyter notebook and Tensorboard
|
118 |
+
in the background, which points to your `TENSORBOARD_PATH`
|
119 |
+
env. variable.
|
120 |
+
|
121 |
+
Now, launch your development environment via:
|
122 |
+
|
123 |
+
```bash
|
124 |
+
docker compose run dev
|
125 |
+
```
|
126 |
+
|
127 |
+
To tear down your development environment, just do
|
128 |
+
|
129 |
+
```bash
|
130 |
+
docker compose down
|
131 |
+
```
|
132 |
+
|
133 |
+
### Downloading data and pre-processing
|
134 |
+
Next, from within the Docker environment (or an appropriately configured Conda environment with environment variables set as above), do the following:
|
135 |
+
|
136 |
+
```
|
137 |
+
python -m wav2wav.preprocess.download
|
138 |
+
```
|
139 |
+
|
140 |
+
This will download all the necessary data, which are referenced by
|
141 |
+
the CSV files in `conf/audio/*`. These CSVs were generated via
|
142 |
+
`python -m wav2wav.preprocess.organize`.
|
143 |
+
|
144 |
+
### Launching an experiment
|
145 |
+
|
146 |
+
Experiments are first _staged_ by running the `stage` command (which corresponds to the script `scripts/utils/stage.py`).
|
147 |
+
|
148 |
+
`stage` creates a directory with a copy of all of the Git-tracked files in the root repository. `stage` launches a shell into said directory, so all commands are run on the
|
149 |
+
copy of the original repository code. This is useful for rewinding to an old experiment
|
150 |
+
and resuming it, for example. Even if the repository code changes, the snapshot in the experiment directory is unchanged from the original run, so it can be re-used.
|
151 |
+
|
152 |
+
Then, the experiment can be run via:
|
153 |
+
|
154 |
+
```bash
|
155 |
+
torchrun --nproc_per_node gpu \
|
156 |
+
scripts/exp/train.py \
|
157 |
+
--args.load=conf/args.yml
|
158 |
+
```
|
159 |
+
|
160 |
+
The full settings are in [conf/daps/train.yml](conf/daps/train.yml).
|
161 |
+
|
162 |
+
### Evaluating an experiment
|
163 |
+
|
164 |
+
There are two ways to evaluate an experiment: quantitative and qualitative.
|
165 |
+
For the first, we can use the `scripts/exp/evaluate.py` script. This script evaluates the model over the `val_data` and `test_data`, defined in your
|
166 |
+
`train` script, and takes as input an experiment directory. The metrics
|
167 |
+
computed by this script are saved to the same folder.
|
168 |
+
|
169 |
+
The other way is via a preference test. Let's say we want to compare
|
170 |
+
the v3 prod model against the v2 prod model. To do this, we use the
|
171 |
+
`scripts/exp/qa.py` script. This script creates a zip file containing all
|
172 |
+
the samples and an HTML page for easy viewing. It also creates a Papaya
|
173 |
+
preference test. Use it like this:
|
174 |
+
|
175 |
+
```bash
|
176 |
+
WAV2WAV_MODELS=a,b python scripts/exp/qa.py \
|
177 |
+
--a/model_path prod/v3/ckpt/best/generator/package.pth \
|
178 |
+
--b/model_path prod/v2/ckpt/best/generator/package.pth \
|
179 |
+
--a/name "v3" --b/name "v2" \
|
180 |
+
--device cuda:0 \
|
181 |
+
--n_samples 20 \
|
182 |
+
--zip_path "samples/out.zip"
|
183 |
+
```
|
184 |
+
|
185 |
+
### Useful commands
|
186 |
+
|
187 |
+
#### Monitoring the machine
|
188 |
+
|
189 |
+
There's a useful `tmux` workspace that you can launch via:
|
190 |
+
|
191 |
+
```bash
|
192 |
+
tmuxp load ./workspace.yml
|
193 |
+
```
|
194 |
+
|
195 |
+
which will have a split pane with a shell to launch commands on the left,
|
196 |
+
and GPU monitoring, `htop`, and a script that watches for changes in your
|
197 |
+
directory on the right, in three split panes.
|
198 |
+
|
199 |
+
#### Cleaning up after a run
|
200 |
+
|
201 |
+
Sometimes DDP runs fail to clear themselves out of the machine. To fix this, run
|
202 |
+
|
203 |
+
```bash
|
204 |
+
cleanup
|
205 |
+
```
|
206 |
+
|
207 |
+
### Deploying a new model to production
|
208 |
+
|
209 |
+
Okay, so you ran a model and it seems promising and you want to upload it
|
210 |
+
to GCS so it can be QA'd fully, and then shipped. First, upload
|
211 |
+
your experiment to the `dev` bucket on GCS via:
|
212 |
+
|
213 |
+
```bash
|
214 |
+
gsutil cp -r /path/to/{exp_name} gs://research-models/wav2wav/dev/{exp_name}
|
215 |
+
```
|
216 |
+
|
217 |
+
Once uploaded, QA can access the models by specifying
|
218 |
+
`model_path=dev/{exp_name}/ckpt/{best,latest}/generator/package.pth` when using the
|
219 |
+
`wav2wav.interface.enhance` function. If it passes QA, and is scheduled to
|
220 |
+
ship to production, then next we have to generate the TensorRT model file,
|
221 |
+
which requires us to have a machine that matches that of a production machine.
|
222 |
+
|
223 |
+
There is a script that automates this procedure, that does not require any
|
224 |
+
fiddling from our end. Navigate to the repository root and run:
|
225 |
+
|
226 |
+
```
|
227 |
+
python scripts/utils/convert_on_gcp.py dev/{exp_name}/ckpt/{best,latest}/generator/weights.pth
|
228 |
+
```
|
229 |
+
|
230 |
+
This will provision the machine, download the relevant model from GCS, optimize it on
|
231 |
+
the production GPU with the correct CUDA runtime, and then upload the generated `.trt`
|
232 |
+
and `.onnx` models back to the bucket.
|
233 |
+
|
234 |
+
Finally, copy the model to the `prod` bucket, incrementing the version number by one:
|
235 |
+
|
236 |
+
```bash
|
237 |
+
gsutil cp -r gs://research-models/wav2wav/dev/{exp_name} gs://research-models/wav2wav/prod/v{N}
|
238 |
+
```
|
239 |
+
|
240 |
+
where `N` is the next version (e.g. if v3 is the latest, the new one is v4). Then, update
|
241 |
+
the model table in [Notion](https://www.notion.so/descript/fc04de4b46e6417eba1d06bdc8de6c75?v=e56db4e6b37c4d9b9eca8d9be15c826a) with the new model.
|
242 |
+
|
243 |
+
Once the above is all done, we update the code in two places:
|
244 |
+
|
245 |
+
1. In `interface.py`, we update `PROD_MODEL_PATH` to point to the `weights.pth`
|
246 |
+
for whichever tag ended up shipping (either `best` or `latest`).
|
247 |
+
2. In `interface.py`, we update `PROD_TRT_PATH` to point to the
|
248 |
+
TensorRT checkpoint generated by the script above.
|
249 |
+
|
250 |
+
After merging to master, a new Docker image will be created, and one can update the relevant lines
|
251 |
+
in descript-workflows like in this [PR](https://github.com/descriptinc/descript-workflows/pull/477/files).
|
252 |
+
|
253 |
+
We have Github action workflows in [.github/workflows/deploy.yml](.github/workflows/deploy.yml) to build and deploy new docker images. Two images are built - one for staging and another for production.
|
254 |
+
To deploy a new release version, follow the instructions in [this coda doc](https://coda.io/d/Research-Engineering_dOABAWL46p-/Deploying-Services_su1am#_lu7E8).
|
255 |
+
|
256 |
+
The Coda doc with information about deploying the speech-enhance worker is [here](https://coda.io/d/Research-Engineering_dOABAWL46p-/Deploying-Services_su1am#_lu7E8).
|
257 |
+
|
258 |
+
And that's it! Once the new staging is built, you're done.
|
259 |
+
|
260 |
+
## Testing
|
261 |
+
|
262 |
+
### Profiling and Regression testing
|
263 |
+
|
264 |
+
- The [profiling script](tests/profile_inference.py) profiles the `wav2wav.interface.enhance` function.
|
265 |
+
- NOTE: ALWAYS run the profiler on a T4 GPU. ALWAYS run the profiling in isolation, i.e., kill all other processes on the GPU. The recommended VM size on GCP is `n1-standard-32`, as the stress test of six hours of audio requires ~35GB of system memory.
|
266 |
+
- To run profiling use the [profiling script](tests/profile_inference.py) via command `python3 -m tests.profile_inference`. Results will be printed after `1` run.
|
267 |
+
- Use the [test_regression.py](tests/test_regression.py) script to run tests that
|
268 |
+
- compare performance stats of current model with known best model
|
269 |
+
- test for output deviation from the last model
|
270 |
+
- Run `git lfs checkout` to checkout input file and model weights required for testing the model.
|
271 |
+
- To launch these tests, run `python3 -m pytest tests/test_regression.py -v`.
|
272 |
+
- As a side effect, this will update the `tests/stat.csv` file if the current model performs better than last best known model as per `tests/stat.csv`.
|
273 |
+
- NOTE: In case of an architecture change, purge the weights files (`tests/assets/{quick|slow}.pth`) and the reference stat file (`tests/assets/baseline.json`). Running the [test_regression.py](tests/test_regression.py) script in the absence of the reference stat file will generate new baseline reference stats as well as append new performance stats to the stats file. In the absence of saved weights, new weights are generated and saved to disk. Make sure to commit these files (stat.csv, baseline.json, *.pth) when the model architecture changes.
|
274 |
+
|
275 |
+
### Unit tests
|
276 |
+
Regular unit tests that test functionality such as training resume etc. These are run on CPU. Update them when new features are added.
|
277 |
+
|
278 |
+
### Profiling tests
|
279 |
+
These tests profile the model's resource consumption. They are run on T4 GPU with 32 cores and >35GB memory. Their usage is reported in the above sections.
|
280 |
+
|
281 |
+
### Functional tests
|
282 |
+
These tests detect deviation from a known baseline model. One category of these tests ensures that a new PyTorch model doesn't deviate from the previous one. Another category ensures that the TensorRT version of the current PyTorch model doesn't deviate from it. These tests are marked with the marker `output_qa` and can be run via the command line `python3 -m pytest -v -m output_qa`. Some of these tests require a GPU.
|
283 |
+
|
284 |
+
### CI tests
|
285 |
+
- The tests are divided into two categories depending on the platform requirement - CPU tests and GPU tests.
|
286 |
+
- The CPU tests contain unit tests.
|
287 |
+
- The GPU tests contain a subset of functional tests. These tests can be run by the command `python3 -m pytest -v -m gpu_ci_test`.
|
changelog.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# v0.3.4
|
2 |
+
- Upgraded nussl, removing unneeded code from interface, and improving interaction.
|
3 |
+
|
4 |
+
# v0.3.1
|
5 |
+
- Rich interaction with the model.
|
6 |
+
|
7 |
+
# v0.2.0
|
8 |
+
- Release with script to process zipfiles for QA + gsutil sync for model weights.
|
9 |
+
|
10 |
+
# v0.1.0
|
11 |
+
- Initial release.
|
conf/vampnet-c2f.yml
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$include:
|
2 |
+
- conf/vampnet.yml
|
3 |
+
|
4 |
+
VampNet.n_codebooks: 9
|
5 |
+
VampNet.n_conditioning_codebooks: 3
|
6 |
+
|
7 |
+
train/AudioDataset.duration: 3
|
8 |
+
val/AudioDataset.duration: 3
|
9 |
+
test/AudioDataset.duration: 3
|
conf/vampnet.yml
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
wav2wav_ckpt: /u/home/src/runs/codec-ckpt/codec.pth
|
3 |
+
save_path: ckpt
|
4 |
+
max_epochs: 1000000
|
5 |
+
epoch_length: 1000
|
6 |
+
save_audio_epochs: 2
|
7 |
+
val_idx: [0,1,2,3,4,5,6,7,8,9]
|
8 |
+
|
9 |
+
prefix_amt: 0.0
|
10 |
+
suffix_amt: 0.0
|
11 |
+
prefix_dropout: 0.1
|
12 |
+
suffix_dropout: 0.1
|
13 |
+
|
14 |
+
batch_size: 120
|
15 |
+
num_workers: 80
|
16 |
+
|
17 |
+
# Optimization
|
18 |
+
detect_anomaly: false
|
19 |
+
amp: false
|
20 |
+
|
21 |
+
CrossEntropyLoss.label_smoothing: 0.1
|
22 |
+
|
23 |
+
AdamW.lr: 0.001
|
24 |
+
|
25 |
+
NoamScheduler.factor: 2.0
|
26 |
+
NoamScheduler.warmup: 10000
|
27 |
+
|
28 |
+
VampNet.vocab_size: 1024
|
29 |
+
VampNet.n_codebooks: 3
|
30 |
+
VampNet.n_conditioning_codebooks: 0
|
31 |
+
VampNet.r_cond_dim: 64
|
32 |
+
VampNet.embedding_dim: 1280
|
33 |
+
VampNet.n_layers: 16
|
34 |
+
VampNet.n_heads: 20
|
35 |
+
VampNet.flash_attn: false
|
36 |
+
VampNet.dropout: 0.05
|
37 |
+
|
38 |
+
AudioLoader.relative_path: /data/
|
39 |
+
AudioDataset.loudness_cutoff: -30.0
|
40 |
+
AudioDataset.without_replacement: true
|
41 |
+
AudioLoader.shuffle: true
|
42 |
+
|
43 |
+
train/AudioDataset.duration: 5.0
|
44 |
+
train/AudioDataset.n_examples: 10000000
|
45 |
+
train/AudioLoader.sources:
|
46 |
+
- /data/spotdl/audio/train
|
47 |
+
|
48 |
+
val/AudioDataset.duration: 5.0
|
49 |
+
val/AudioDataset.n_examples: 2000
|
50 |
+
val/AudioLoader.sources:
|
51 |
+
- /data/spotdl/audio/val
|
52 |
+
|
53 |
+
test/AudioDataset.duration: 5.0
|
54 |
+
test/AudioDataset.n_examples: 1000
|
55 |
+
test/AudioLoader.sources:
|
56 |
+
- /data/spotdl/audio/test
|
docker-compose.yml
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
version: "3.5"
|
3 |
+
services:
|
4 |
+
tensorrt:
|
5 |
+
build:
|
6 |
+
context: .
|
7 |
+
dockerfile: ./deployment_build/dockerfile
|
8 |
+
args:
|
9 |
+
GITHUB_TOKEN: ${GITHUB_TOKEN}
|
10 |
+
profiles:
|
11 |
+
- tensorrt
|
12 |
+
volumes:
|
13 |
+
- ./:/u/home/src
|
14 |
+
- ~/.config/gcloud:/root/.config/gcloud
|
15 |
+
deploy:
|
16 |
+
resources:
|
17 |
+
limits:
|
18 |
+
# match production limits
|
19 |
+
cpus: '7'
|
20 |
+
memory: 25000M
|
21 |
+
reservations:
|
22 |
+
devices:
|
23 |
+
- driver: nvidia
|
24 |
+
count: 1
|
25 |
+
capabilities: [gpu]
|
26 |
+
working_dir: /u/home/src
|
27 |
+
entrypoint:
|
28 |
+
- python
|
29 |
+
- -m
|
30 |
+
- wav2wav.converter
|
31 |
+
base:
|
32 |
+
build:
|
33 |
+
context: .
|
34 |
+
dockerfile: ./Dockerfile
|
35 |
+
args:
|
36 |
+
GITHUB_TOKEN: ${GITHUB_TOKEN}
|
37 |
+
volumes:
|
38 |
+
- .:/u/home/src
|
39 |
+
- ~/.wav2wav:/u/home/.wav2wav
|
40 |
+
- ${PATH_TO_DATA}:/data
|
41 |
+
- ${PATH_TO_RUNS}:/runs
|
42 |
+
- ~/.config/gcloud:/u/home/.config/gcloud
|
43 |
+
- ~/.zsh_history:/u/home/.zsh_history
|
44 |
+
environment:
|
45 |
+
- GITHUB_TOKEN
|
46 |
+
- DISCOURSE_API_USERNAME
|
47 |
+
- DISCOURSE_SERVER
|
48 |
+
- DISCOURSE_API_KEY
|
49 |
+
- HOST_USER_ID
|
50 |
+
- HOST_USER_GID
|
51 |
+
- JUPYTER_TOKEN
|
52 |
+
- PATH_TO_DATA=/data
|
53 |
+
- PATH_TO_RUNS=/runs
|
54 |
+
- TENSORBOARD_PATH
|
55 |
+
- MPLCONFIGDIR=/u/home/.mplconfig
|
56 |
+
shm_size: 32G
|
57 |
+
working_dir: /u/home/src
|
58 |
+
deploy:
|
59 |
+
resources:
|
60 |
+
reservations:
|
61 |
+
devices:
|
62 |
+
- driver: nvidia
|
63 |
+
capabilities: [gpu]
|
64 |
+
dev:
|
65 |
+
extends: base
|
66 |
+
profiles:
|
67 |
+
- interactive
|
68 |
+
stdin_open: true
|
69 |
+
tty: true
|
70 |
+
jupyter:
|
71 |
+
extends: base
|
72 |
+
ports:
|
73 |
+
- ${JUPYTER_PORT}:8888
|
74 |
+
entrypoint:
|
75 |
+
- /bin/bash
|
76 |
+
- /entry_script_jupyter.sh
|
77 |
+
tensorboard:
|
78 |
+
extends: base
|
79 |
+
ports:
|
80 |
+
- ${TENSORBOARD_PORT}:6006
|
81 |
+
entrypoint:
|
82 |
+
- /bin/bash
|
83 |
+
- /entry_script_tensorboard.sh
|
84 |
+
gradio:
|
85 |
+
extends: base
|
86 |
+
ports:
|
87 |
+
- 7860:7860
|
88 |
+
entrypoint:
|
89 |
+
- /bin/bash
|
90 |
+
- /entry_script_gradio.sh
|
env/alias.sh
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
alias cleanup="pkill python && echo -en '\e[?25h'"
|
2 |
+
alias stage="python ./scripts/utils/stage.py"
|
3 |
+
alias fix_cursor="echo -en '\e[?25h'"
|
env/data.sh
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
export PATH_TO_DATA=~/data
|
2 |
+
|
3 |
+
if [[ $(hostname) == "oon17" ]]; then
|
4 |
+
export PATH_TO_DATA=/home/prem/shared/data/
|
5 |
+
fi
|
6 |
+
|
7 |
+
if [[ $(hostname) == "oon19" ]]; then
|
8 |
+
export PATH_TO_DATA=/home/prem/shared/data/
|
9 |
+
fi
|
10 |
+
|
11 |
+
if [[ $(hostname) == "lucas-ssound-trt-vm" ]]; then
|
12 |
+
export PATH_TO_DATA=~/data
|
13 |
+
fi
|
14 |
+
|
15 |
+
if [[ $(hostname) == "a100-ssound" ]]; then
|
16 |
+
export PATH_TO_DATA=~/data
|
17 |
+
fi
|
18 |
+
|
19 |
+
if [[ $(hostname) == "oon25" ]]; then
|
20 |
+
export PATH_TO_DATA=/data
|
21 |
+
fi
|
22 |
+
|
23 |
+
if [[ $(hostname) == "macbook-pro-2.lan" ]]; then
|
24 |
+
export PATH_TO_DATA=~/data
|
25 |
+
fi
|
26 |
+
|
27 |
+
if [[ $(hostname) == "oon11" ]]; then
|
28 |
+
export PATH_TO_DATA=/data2/syncthing_lucas/data
|
29 |
+
fi
|
30 |
+
|
31 |
+
if [[ $(hostname) == "oon12" ]]; then
|
32 |
+
export PATH_TO_DATA=/data
|
33 |
+
fi
|
34 |
+
if [[ $(hostname) == "oon26" ]]; then
|
35 |
+
export PATH_TO_DATA=/data
|
36 |
+
fi
|
env/entry_script.sh
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
set -e
|
3 |
+
|
4 |
+
if [ -z "${USER}" ]; then
|
5 |
+
echo "We need USER to be set!"; exit 100
|
6 |
+
fi
|
7 |
+
|
8 |
+
# check if host uid and gid are set
|
9 |
+
if [ -z "${HOST_USER_ID}" ]; then
|
10 |
+
echo "Please set HOST_USER_ID env. variables to continue." ; exit 0
|
11 |
+
fi
|
12 |
+
|
13 |
+
if [ -z "${HOST_USER_GID}" ]; then
|
14 |
+
echo "Please set HOST_USER_GID env. variables to continue." ; exit 0
|
15 |
+
fi
|
16 |
+
|
17 |
+
USER_ID=$HOST_USER_ID
|
18 |
+
USER_GID=$HOST_USER_GID
|
19 |
+
USER_HOME=/u/home
|
20 |
+
|
21 |
+
# modify uid and gid to match host
|
22 |
+
sed -i -e "s/^${USER}:\([^:]*\):[0-9]*:[0-9]*/${USER}:\1:${USER_ID}:${USER_GID}/" /etc/passwd
|
23 |
+
|
24 |
+
# create a group for host gid
|
25 |
+
groupadd -f --gid "${USER_GID}" "host_group"
|
26 |
+
|
27 |
+
chown $USER_ID $USER_HOME
|
28 |
+
chown $USER_ID /u/home/.zshrc
|
29 |
+
chown $USER_ID /u/home/.oh-my-zsh
|
30 |
+
|
31 |
+
mkdir -p /u/home/.cache
|
32 |
+
chown -R $USER_ID:$USER_GID /u/home/.cache/
|
33 |
+
|
34 |
+
_term() {
|
35 |
+
echo "Caught SIGTERM signal!"
|
36 |
+
kill -TERM "$child" 2>/dev/null
|
37 |
+
}
|
38 |
+
|
39 |
+
trap _term SIGTERM
|
40 |
+
|
41 |
+
su -p "${USER}"
|
env/setup.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This script guides the user through setting up their env.sh
|
2 |
+
# if env.sh does not exist. Should have no dependencies other
|
3 |
+
# than Python standard library.
|
4 |
+
import shlex
|
5 |
+
import socket
|
6 |
+
import subprocess
|
7 |
+
import textwrap
|
8 |
+
|
9 |
+
|
10 |
+
def run(cmd):
    """Run ``cmd`` without a shell and return its stdout as text.

    The command string is tokenised with ``shlex.split``; a non-zero exit
    status propagates as ``subprocess.CalledProcessError``.
    """
    argv = shlex.split(cmd)
    raw_output = subprocess.check_output(argv)
    return raw_output.decode("utf-8")
|
12 |
+
|
13 |
+
|
14 |
+
print("1. Setting up Google Cloud access")
print("---------------------------------")
try:
    gcloud_authorized = "gs://research-data-raw" in run("gsutil ls")
except subprocess.CalledProcessError:
    # A failing `gsutil ls` is treated as "not authorized yet" so that the
    # login flow below still runs instead of aborting the whole setup.
    gcloud_authorized = False
if not gcloud_authorized:
    run("gcloud auth login")

run("gcloud config set project lyrebird-research")
run("gcloud auth configure-docker")

print()
print("2. Setting up Github access")
print("---------------------------")

lines = textwrap.wrap(
    "First, let's get your Github token, so all "
    "packages can be installed. Create one by going to your "
    "Github profile -> Developer settings -> Personal access tokens -> "
    "Generate new token. Copy the token below."
)
for line in lines:
    print(line)

GITHUB_TOKEN = input("\nGithub token: ") or "undefined"

print()
print("3. Setting up Jupyter and Tensorboard")
print("-------------------------------------")

JUPYTER_TOKEN = input("Password for Jupyter server (default:password): ") or "password"
JUPYTER_PORT = input("Jupyter port to run on (default:8888): ") or "8888"
TENSORBOARD_PORT = input("Tensorboard port to run on (default:6006): ") or "6006"

print()
print("4. Setting up paths.")
print("--------------------")

PATH_TO_RUNS = input("Where runs should go (default:./runs/): ") or "./runs/"
TENSORBOARD_PATH = (
    input("Bucket/dir for tensorboard logs (default=PATH_TO_RUNS): ") or PATH_TO_RUNS
)

with open("env/data.sh") as f:
    data_script = f.read()

# Only append a PATH_TO_DATA stanza when this host is not mentioned yet.
hostname = socket.gethostname()
write_to_data_sh = False
if hostname not in data_script:
    print("Looks like the data path for this machine is not setup.")
    PATH_TO_DATA = input(f"Path to data on {hostname}: ") or "~/data"

    # The path is left unquoted on purpose so that `~` still expands when
    # the generated script is sourced.
    data_command = f"""
if [[ $(hostname) == "{hostname}" ]]; then
    export PATH_TO_DATA={PATH_TO_DATA}
fi
"""
    write_to_data_sh = True


print()
print("5. Setting up Papaya")
print("-----------------------------------------")

PAPAYA_USER_TOKEN = input("Papaya user token: ") or "undefined"

# Tokens are free-form secrets, so they are shell-quoted before being written
# into a file that gets sourced; plain alphanumeric values are left unchanged
# by shlex.quote.
env_script = f"""
source env/alias.sh
source env/data.sh
export GITHUB_TOKEN={shlex.quote(GITHUB_TOKEN)}

export PAPAYA_USER_TOKEN={shlex.quote(PAPAYA_USER_TOKEN)}

export HOST_USER_ID=$(id -u)
export HOST_USER_GID=$(id -g)

export JUPYTER_TOKEN={shlex.quote(JUPYTER_TOKEN)}
export JUPYTER_PORT={JUPYTER_PORT}
export TENSORBOARD_PORT={TENSORBOARD_PORT}

export PATH_TO_RUNS={PATH_TO_RUNS}
export TENSORBOARD_PATH={TENSORBOARD_PATH}
"""

print()
print("6. Potential file contents.")
print("---------------------------")

print("env/env.sh: \n")
print("##################")
print(env_script)
print("##################")

if write_to_data_sh:
    data_script += data_command

    print("env/data.sh:")
    print("##################")
    print(data_script)
    print("##################")

print()
write_to_files = input("Write to file [yn]? ").strip().lower() or "n"
if write_to_files == "y":
    # Terminate both files with a newline so they are well-formed text files.
    with open("env/env.sh", "w") as f:
        f.write(env_script.strip() + "\n")
    with open("env/data.sh", "w") as f:
        f.write(data_script.strip() + "\n")

print()
print("7. Finalize setup.")
print("------------------")
print("Run the following command to complete setup.")
print("source env/env.sh")
|
requirements.txt
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
argbind>=0.3.1
|
2 |
+
pytorch-ignite
|
3 |
+
rich
|
4 |
+
audiotools @ git+https://github.com/descriptinc/[email protected]
|
5 |
+
tqdm
|
6 |
+
tensorboard
|
7 |
+
google-cloud-logging==2.2.0
|
8 |
+
pytest
|
9 |
+
pytest-cov
|
10 |
+
papaya_client @ git+https://github.com/descriptinc/lyrebird-papaya.git@master
|
11 |
+
pynvml
|
12 |
+
psutil
|
13 |
+
pandas
|
14 |
+
onnx
|
15 |
+
onnx-simplifier
|
16 |
+
seaborn
|
17 |
+
jupyterlab
|
18 |
+
jupyterlab-link-share
|
19 |
+
pandas
|
20 |
+
watchdog
|
21 |
+
pesq
|
22 |
+
tabulate
|
23 |
+
torchmetrics
|
24 |
+
codebraid==0.5.0
|
25 |
+
jupyter-client==6.1.12
|
26 |
+
tensorboardX
|
27 |
+
gradio
|
28 |
+
einops
|
29 |
+
flash-attn
|
scripts/generative/eval.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import imp
|
3 |
+
import os
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import argbind
|
7 |
+
import audiotools
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
import torch
|
11 |
+
from flatten_dict import flatten
|
12 |
+
from rich.progress import track
|
13 |
+
from torch.utils.tensorboard import SummaryWriter
|
14 |
+
|
15 |
+
import wav2wav
|
16 |
+
|
17 |
+
# Load the train.py that sits next to this file as a module named "train",
# so that its build_datasets helper can be reused by evaluate() below.
# NOTE(review): the `imp` module is deprecated and was removed in Python 3.12;
# consider migrating to importlib.util.
train = imp.load_source("train", str(Path(__file__).absolute().parent / "train.py"))
|
18 |
+
|
19 |
+
|
20 |
+
@argbind.bind(without_prefix=True)
def evaluate(
    args,
    model_tag: str = "ckpt/best",
    device: str = "cuda",
    exp: str = None,
    overwrite: bool = False,
):
    """Compute per-item audio metrics for a trained experiment and log them.

    Parameters
    ----------
    args
        Parsed argbind arguments, forwarded to ``train.build_datasets``.
    model_tag : str
        Checkpoint folder, relative to ``exp``, holding the model packages.
    device : str
        Device the loaded models are moved to.
    exp : str
        Experiment directory; required. All relative paths are resolved
        inside it.
    overwrite : bool
        When False, a dataset whose metrics CSV already exists is skipped.

    NOTE(review): the per-item loop is unfinished (see the TODO inside) and
    references ``enhanced``, ``clean``, ``item`` and ``summary``, none of
    which is defined in this function or imported by this file as visible
    here. It will raise ``NameError`` until the inference step is written.
    """
    assert exp is not None, "exp is required"

    sisdr_loss = audiotools.metrics.distance.SISDRLoss()
    stft_loss = audiotools.metrics.spectral.MultiScaleSTFTLoss()
    mel_loss = audiotools.metrics.spectral.MelSpectrogramLoss()

    with audiotools.util.chdir(exp):
        vampnet = wav2wav.modules.vampnet.transformer.VampNet.load(
            f"{model_tag}/vampnet/package.pth"
        )
        vampnet = vampnet.to(device)
        if vampnet.cond_dim > 0:
            condnet = wav2wav.modules.condnet.transformer.CondNet.load(
                f"{model_tag}/condnet/package.pth"
            )
            condnet = condnet.to(device)
        else:
            condnet = None

        vqvae = wav2wav.modules.generator.Generator.load(
            f"{model_tag}/vqvae/package.pth"
        )

    _, _, test_data = train.build_datasets(args, vqvae.sample_rate)

    with audiotools.util.chdir(exp):
        datasets = {
            "test": test_data,
        }

        metrics_path = Path(f"{model_tag}/metrics")
        metrics_path.mkdir(parents=True, exist_ok=True)

        for key, dataset in datasets.items():
            csv_path = metrics_path / f"{key}.csv"
            if csv_path.exists() and not overwrite:
                # Skip only this dataset; any others still get evaluated.
                continue
            metrics = []
            for i in track(range(len(dataset))):
                # TODO: for coarse2fine
                # grab the signal
                # mask all the codebooks except the conditioning ones
                # and infer
                # then compute metrics
                # for a baseline, just use the coarsest codebook

                # NOTE(review): `enhanced` / `clean` are not defined yet.
                try:
                    visqol = audiotools.metrics.quality.visqol(
                        enhanced, clean, "audio"
                    ).item()
                except Exception:
                    # visqol is best-effort; record a missing value on failure.
                    visqol = None

                sisdr = sisdr_loss(enhanced, clean)
                stft = stft_loss(enhanced, clean)
                mel = mel_loss(enhanced, clean)

                metrics.append(
                    {
                        "visqol": visqol,
                        "sisdr": sisdr.item(),
                        "stft": stft.item(),
                        "mel": mel.item(),
                        "dataset": key,
                        "condition": exp,
                    }
                )
                print(metrics[-1])

                # Record scalar transform parameters alongside the metrics.
                # NOTE(review): `item` is not defined yet.
                transform_args = flatten(item["transform_args"], "dot")
                for k, v in transform_args.items():
                    if torch.is_tensor(v):
                        if len(v.shape) == 0:
                            metrics[-1][k] = v.item()

            metrics = pd.DataFrame.from_dict(metrics)
            with open(csv_path, "w") as f:
                metrics.to_csv(f)

        # NOTE(review): `summary` is not defined in the visible file.
        data = summary(model_tag).to_dict()
        metrics = {}
        for k1, v1 in data.items():
            for k2, v2 in v1.items():
                metrics[f"metrics/{k2}/{k1}"] = v2

        # Number of steps to record
        writer = SummaryWriter(log_dir=metrics_path)
        try:
            num_steps = 10
            for k, v in metrics.items():
                for i in range(num_steps):
                    writer.add_scalar(k, v, i)
        finally:
            # Flush pending events and release the event file.
            writer.close()
|
119 |
+
|
120 |
+
|
121 |
+
if __name__ == "__main__":
|
122 |
+
args = argbind.parse_args()
|
123 |
+
with argbind.scope(args):
|
124 |
+
evaluate(args)
|
scripts/generative/train.py
ADDED
@@ -0,0 +1,662 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import time
|
4 |
+
import warnings
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import Optional
|
7 |
+
|
8 |
+
import argbind
|
9 |
+
import audiotools as at
|
10 |
+
import torch
|
11 |
+
import torch.nn as nn
|
12 |
+
from audiotools import AudioSignal
|
13 |
+
from audiotools.data import transforms
|
14 |
+
from einops import rearrange
|
15 |
+
from rich import pretty
|
16 |
+
from rich.traceback import install
|
17 |
+
from tensorboardX import SummaryWriter
|
18 |
+
|
19 |
+
import vampnet
|
20 |
+
from vampnet.modules.transformer import VampNet
|
21 |
+
from lac.model.lac import LAC
|
22 |
+
|
23 |
+
|
24 |
+
# Enable cudnn autotuner to speed up training
|
25 |
+
# (can be altered by the funcs.seed function)
|
26 |
+
torch.backends.cudnn.benchmark = bool(int(os.getenv("CUDNN_BENCHMARK", 1)))
|
27 |
+
# Uncomment to trade memory for speed.
|
28 |
+
|
29 |
+
# Install to make things look nice
|
30 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
31 |
+
pretty.install()
|
32 |
+
install()
|
33 |
+
|
34 |
+
# optim
|
35 |
+
Accelerator = argbind.bind(at.ml.Accelerator, without_prefix=True)
|
36 |
+
CrossEntropyLoss = argbind.bind(nn.CrossEntropyLoss)
|
37 |
+
AdamW = argbind.bind(torch.optim.AdamW)
|
38 |
+
NoamScheduler = argbind.bind(vampnet.scheduler.NoamScheduler)
|
39 |
+
|
40 |
+
# transforms
|
41 |
+
filter_fn = lambda fn: hasattr(fn, "transform") and fn.__qualname__ not in [
|
42 |
+
"BaseTransform",
|
43 |
+
"Compose",
|
44 |
+
"Choose",
|
45 |
+
]
|
46 |
+
tfm = argbind.bind_module(transforms, "train", "val", filter_fn=filter_fn)
|
47 |
+
|
48 |
+
# model
|
49 |
+
VampNet = argbind.bind(VampNet)
|
50 |
+
|
51 |
+
|
52 |
+
# data
|
53 |
+
AudioLoader = argbind.bind(at.datasets.AudioLoader)
|
54 |
+
AudioDataset = argbind.bind(at.datasets.AudioDataset, "train", "val")
|
55 |
+
|
56 |
+
IGNORE_INDEX = -100
|
57 |
+
|
58 |
+
|
59 |
+
@argbind.bind("train", "val", without_prefix=True)
def build_transform():
    """Return the composed audio transform: volume norm, volume change, rescale."""
    stages = (
        tfm.VolumeNorm(("uniform", -32, -14)),
        tfm.VolumeChange(("uniform", -6, 3)),
        tfm.RescaleAudio(),
    )
    return transforms.Compose(*stages)
|
67 |
+
|
68 |
+
|
69 |
+
@torch.no_grad()
def apply_transform(transform_fn, batch):
    """Apply ``transform_fn`` to a copy of the batch's signal.

    ``batch["signal"]`` is cloned before being passed to the transform, and
    ``batch["transform_args"]`` is expanded as its keyword arguments. The
    transform's result is returned.
    """
    source = batch["signal"]
    transform_kwargs = batch["transform_args"]
    return transform_fn(source.clone(), **transform_kwargs)
|
76 |
+
|
77 |
+
|
78 |
+
def build_datasets(args, sample_rate: int):
    """Build the train, val and test datasets, each under its own argbind scope.

    Returns a ``(train_data, val_data, test_data)`` tuple.
    """

    def _make(scope_name):
        # Every split is constructed the same way; only the scope differs.
        with argbind.scope(args, scope_name):
            return AudioDataset(
                AudioLoader(), sample_rate, transform=build_transform()
            )

    train_data = _make("train")
    val_data = _make("val")
    test_data = _make("test")
    return train_data, val_data, test_data
|
90 |
+
|
91 |
+
|
92 |
+
def rand_float(shape, low, high, rng):
    """Draw ``shape`` values from ``rng`` and scale them into ``[low, high)``-style range.

    ``rng.draw(shape)`` is expected to return a 2-D array-like; only its
    first column is used.
    """
    span = high - low
    draws = rng.draw(shape)[:, 0]
    return draws * span + low
|
94 |
+
|
95 |
+
|
96 |
+
def flip_coin(shape, p, rng):
    """Return a boolean mask that is True where a draw from ``rng`` is below ``p``.

    Only the first column of ``rng.draw(shape)`` is used.
    """
    draws = rng.draw(shape)
    first_column = draws[:, 0]
    return first_column < p
|
98 |
+
|
99 |
+
|
100 |
+
@argbind.bind(without_prefix=True)
def load(
    args,
    accel: at.ml.Accelerator,
    save_path: str,
    resume: bool = False,
    tag: str = "latest",
    load_weights: bool = False,
):
    """Create (or resume) the model, codec, optimizer and scheduler.

    Parameters
    ----------
    args
        Parsed argbind arguments; ``args["codec_ckpt"]`` is the codec checkpoint.
    accel : at.ml.Accelerator
        Accelerator used to prepare the model.
    save_path : str
        Root folder containing checkpoints.
    resume : bool
        When True, try to restore from ``{save_path}/{tag}``.
    tag : str
        Checkpoint tag to resume from.
    load_weights : bool
        Passed (negated) as the ``package`` option of ``load_from_folder``.

    Returns
    -------
    dict
        Keys ``model``, ``codec``, ``optimizer``, ``scheduler`` and
        ``trainer_state``.
    """
    model, v_extra = None, {}

    if resume:
        kwargs = {
            "folder": f"{save_path}/{tag}",
            "map_location": "cpu",
            "package": not load_weights,
        }
        if (Path(kwargs["folder"]) / "model").exists():
            # `model` is still None at this point, so the loader has to be
            # called on the class rather than on the (missing) instance.
            model, v_extra = VampNet.load_from_folder(**kwargs)

    codec = LAC.load(args["codec_ckpt"], map_location="cpu")
    codec.eval()
    # Fall back to a freshly constructed model when nothing was restored.
    model = VampNet() if model is None else model

    model = accel.prepare_model(model)

    # assert accel.unwrap(model).n_codebooks == codec.quantizer.n_codebooks
    assert (
        accel.unwrap(model).vocab_size == codec.quantizer.quantizers[0].codebook_size
    )

    optimizer = AdamW(model.parameters(), use_zero=accel.use_ddp)
    scheduler = NoamScheduler(optimizer, d_model=accel.unwrap(model).embedding_dim)
    scheduler.step()

    trainer_state = {"state_dict": None, "start_idx": 0}

    # Restore whichever pieces of training state the checkpoint provided.
    if "optimizer.pth" in v_extra:
        optimizer.load_state_dict(v_extra["optimizer.pth"])
    if "scheduler.pth" in v_extra:
        scheduler.load_state_dict(v_extra["scheduler.pth"])
    if "trainer.pth" in v_extra:
        trainer_state = v_extra["trainer.pth"]

    return {
        "model": model,
        "codec": codec,
        "optimizer": optimizer,
        "scheduler": scheduler,
        "trainer_state": trainer_state,
    }
|
151 |
+
|
152 |
+
|
153 |
+
def get_gpu_memory_map():
    """Get the current gpu memory usage as reported by ``nvidia-smi``.

    Returns
    -------
    usage: dict
        Keys are strings of the form ``"gpu/<index>"``, where ``<index>`` is
        the position of the device in the ``nvidia-smi`` output.
        Values are floats: the ``memory.used`` reading divided by 1024.
        An empty dict is returned when ``nvidia-smi`` is not available or
        exits with an error, so callers that only log these values keep
        working on machines without NVIDIA tooling.
    """
    try:
        result = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
            encoding="utf-8",
        )
    except (OSError, subprocess.CalledProcessError):
        return {}

    # One reading per line; ignore blank lines (e.g. empty output).
    readings = [int(line) for line in result.splitlines() if line.strip()]
    return {f"gpu/{idx}": used / 1024 for idx, used in enumerate(readings)}
|
171 |
+
|
172 |
+
|
173 |
+
def num_params_hook(o, p):
    """Append a parameter count, expressed in millions, to the repr string ``o``."""
    millions = p / 1e6
    suffix = f" {millions:<.3f}M params."
    return o + suffix
|
175 |
+
|
176 |
+
|
177 |
+
def add_num_params_repr_hook(model):
    """Make every submodule's ``extra_repr`` also report its parameter count.

    For each module yielded by ``model.named_modules()``, the current
    ``extra_repr()`` text and the summed parameter sizes are captured, and
    ``extra_repr`` is replaced by a partial of ``num_params_hook`` bound to
    those two values.
    """
    from functools import partial

    import numpy as np

    for _name, module in model.named_modules():
        base_repr = module.extra_repr()
        n_params = sum(np.prod(weight.size()) for weight in module.parameters())

        module.extra_repr = partial(num_params_hook, o=base_repr, p=n_params)
|
186 |
+
|
187 |
+
|
188 |
+
def accuracy(
    preds: torch.Tensor,
    target: torch.Tensor,
    top_k: int = 1,
    ignore_index: Optional[int] = None,
    **kwargs,
) -> torch.Tensor:
    """Top-k accuracy of class scores against integer targets.

    ``preds`` is indexed as (batch, classes, sequence) and ``target`` as
    (batch, sequence). Positions whose target equals ``ignore_index`` are
    left out of the average. Extra keyword arguments are accepted and
    ignored.
    """
    n_batch, n_classes, n_steps = preds.shape
    # (batch, classes, seq) -> (batch * seq, classes)
    flat_preds = preds.permute(0, 2, 1).reshape(n_batch * n_steps, n_classes)
    t_batch, t_steps = target.shape
    # (batch, seq) -> (batch * seq,)
    flat_target = target.reshape(t_batch * t_steps)

    if ignore_index is not None:
        # Keep only the positions that should be scored.
        keep = flat_target != ignore_index
        flat_preds = flat_preds[keep]
        flat_target = flat_target[keep]

    # Indices of the k highest-scoring classes per position.
    top_classes = torch.topk(flat_preds, k=top_k, dim=-1).indices

    # A position is a hit when its target appears among its top-k classes.
    hits = torch.eq(top_classes, flat_target.unsqueeze(1)).sum(dim=1)

    return hits.float().mean()
|
217 |
+
|
218 |
+
|
219 |
+
@argbind.bind(without_prefix=True)
|
220 |
+
def train(
|
221 |
+
args,
|
222 |
+
accel: at.ml.Accelerator,
|
223 |
+
codec_ckpt: str = None,
|
224 |
+
seed: int = 0,
|
225 |
+
save_path: str = "ckpt",
|
226 |
+
max_epochs: int = int(100e3),
|
227 |
+
epoch_length: int = 1000,
|
228 |
+
save_audio_epochs: int = 10,
|
229 |
+
batch_size: int = 48,
|
230 |
+
grad_acc_steps: int = 1,
|
231 |
+
val_idx: list = [0, 1, 2, 3, 4],
|
232 |
+
num_workers: int = 20,
|
233 |
+
detect_anomaly: bool = False,
|
234 |
+
grad_clip_val: float = 5.0,
|
235 |
+
prefix_amt: float = 0.0,
|
236 |
+
suffix_amt: float = 0.0,
|
237 |
+
prefix_dropout: float = 0.1,
|
238 |
+
suffix_dropout: float = 0.1,
|
239 |
+
quiet: bool = False,
|
240 |
+
):
|
241 |
+
assert codec_ckpt is not None, "codec_ckpt is required"
|
242 |
+
|
243 |
+
at.util.seed(seed)
|
244 |
+
writer = None
|
245 |
+
|
246 |
+
if accel.local_rank == 0:
|
247 |
+
writer = SummaryWriter(log_dir=f"{save_path}/logs/")
|
248 |
+
argbind.dump_args(args, f"{save_path}/args.yml")
|
249 |
+
|
250 |
+
# load the codec model
|
251 |
+
loaded = load(args, accel, save_path)
|
252 |
+
model = loaded["model"]
|
253 |
+
codec = loaded["codec"]
|
254 |
+
optimizer = loaded["optimizer"]
|
255 |
+
scheduler = loaded["scheduler"]
|
256 |
+
trainer_state = loaded["trainer_state"]
|
257 |
+
|
258 |
+
sample_rate = codec.sample_rate
|
259 |
+
|
260 |
+
# a better rng for sampling from our schedule
|
261 |
+
rng = torch.quasirandom.SobolEngine(1, scramble=True)
|
262 |
+
|
263 |
+
# log a model summary w/ num params
|
264 |
+
if accel.local_rank == 0:
|
265 |
+
add_num_params_repr_hook(accel.unwrap(model))
|
266 |
+
with open(f"{save_path}/model.txt", "w") as f:
|
267 |
+
f.write(repr(accel.unwrap(model)))
|
268 |
+
|
269 |
+
# load the datasets
|
270 |
+
train_data, val_data, _ = build_datasets(args, sample_rate)
|
271 |
+
train_dataloader = accel.prepare_dataloader(
|
272 |
+
train_data,
|
273 |
+
start_idx=trainer_state["start_idx"],
|
274 |
+
num_workers=num_workers,
|
275 |
+
batch_size=batch_size,
|
276 |
+
collate_fn=train_data.collate,
|
277 |
+
)
|
278 |
+
val_dataloader = accel.prepare_dataloader(
|
279 |
+
val_data,
|
280 |
+
start_idx=0,
|
281 |
+
num_workers=num_workers,
|
282 |
+
batch_size=batch_size,
|
283 |
+
collate_fn=val_data.collate,
|
284 |
+
)
|
285 |
+
|
286 |
+
criterion = CrossEntropyLoss()
|
287 |
+
|
288 |
+
class Trainer(at.ml.BaseTrainer):
|
289 |
+
_last_grad_norm = 0.0
|
290 |
+
|
291 |
+
def metrics(self, vn, z_hat, r, target, flat_mask, output):
|
292 |
+
for r_range in [(0, 0.5), (0.5, 1.0)]:
|
293 |
+
unmasked_target = target.masked_fill(flat_mask.bool(), IGNORE_INDEX)
|
294 |
+
masked_target = target.masked_fill(~flat_mask.bool(), IGNORE_INDEX)
|
295 |
+
|
296 |
+
assert target.shape[0] == r.shape[0]
|
297 |
+
# grab the indices of the r values that are in the range
|
298 |
+
r_idx = (r >= r_range[0]) & (r < r_range[1])
|
299 |
+
|
300 |
+
# grab the target and z_hat values that are in the range
|
301 |
+
r_unmasked_target = unmasked_target[r_idx]
|
302 |
+
r_masked_target = masked_target[r_idx]
|
303 |
+
r_z_hat = z_hat[r_idx]
|
304 |
+
|
305 |
+
for topk in (1, 25):
|
306 |
+
s, e = r_range
|
307 |
+
tag = f"accuracy-{s}-{e}/top{topk}"
|
308 |
+
|
309 |
+
output[f"{tag}/unmasked"] = accuracy(
|
310 |
+
preds=r_z_hat,
|
311 |
+
target=r_unmasked_target,
|
312 |
+
ignore_index=IGNORE_INDEX,
|
313 |
+
top_k=topk,
|
314 |
+
task="multiclass",
|
315 |
+
num_classes=vn.vocab_size,
|
316 |
+
)
|
317 |
+
output[f"{tag}/masked"] = accuracy(
|
318 |
+
preds=r_z_hat,
|
319 |
+
target=r_masked_target,
|
320 |
+
ignore_index=IGNORE_INDEX,
|
321 |
+
top_k=topk,
|
322 |
+
task="multiclass",
|
323 |
+
num_classes=vn.vocab_size,
|
324 |
+
)
|
325 |
+
|
326 |
+
def train_loop(self, engine, batch):
|
327 |
+
|
328 |
+
model.train()
|
329 |
+
batch = at.util.prepare_batch(batch, accel.device)
|
330 |
+
signal = apply_transform(train_data.transform, batch)
|
331 |
+
|
332 |
+
output = {}
|
333 |
+
vn = accel.unwrap(model)
|
334 |
+
with accel.autocast():
|
335 |
+
with torch.inference_mode():
|
336 |
+
z = codec.encode(signal.samples, signal.sample_rate)["codes"]
|
337 |
+
z = z[:, : vn.n_codebooks, :]
|
338 |
+
|
339 |
+
n_batch = z.shape[0]
|
340 |
+
r = rng.draw(n_batch)[:, 0].to(accel.device)
|
341 |
+
|
342 |
+
if prefix_amt > 0.0:
|
343 |
+
prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
|
344 |
+
n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
|
345 |
+
else:
|
346 |
+
n_prefix = None
|
347 |
+
if suffix_amt > 0.0:
|
348 |
+
suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
|
349 |
+
n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
|
350 |
+
else:
|
351 |
+
n_suffix = None
|
352 |
+
|
353 |
+
z_mask, mask = vn.add_noise(
|
354 |
+
z, r, n_prefix=n_prefix, n_suffix=n_suffix
|
355 |
+
)
|
356 |
+
z_mask_latent = vn.embedding.from_codes(z_mask, codec)
|
357 |
+
|
358 |
+
dtype = torch.bfloat16 if accel.amp else None
|
359 |
+
with accel.autocast(dtype=dtype):
|
360 |
+
z_hat = model(z_mask_latent, r)
|
361 |
+
# for mask mode
|
362 |
+
z_hat = vn.add_truth_to_logits(z, z_hat, mask)
|
363 |
+
|
364 |
+
target = vn.embedding.flatten(
|
365 |
+
z[:, vn.n_conditioning_codebooks :, :],
|
366 |
+
n_codebooks=vn.n_predict_codebooks,
|
367 |
+
)
|
368 |
+
|
369 |
+
flat_mask = vn.embedding.flatten(
|
370 |
+
mask[:, vn.n_conditioning_codebooks :, :],
|
371 |
+
n_codebooks=vn.n_predict_codebooks,
|
372 |
+
)
|
373 |
+
|
374 |
+
if vn.noise_mode == "mask":
|
375 |
+
# replace target with ignore index for masked tokens
|
376 |
+
t_masked = target.masked_fill(~flat_mask.bool(), IGNORE_INDEX)
|
377 |
+
output["loss"] = criterion(z_hat, t_masked)
|
378 |
+
else:
|
379 |
+
output["loss"] = criterion(z_hat, target)
|
380 |
+
|
381 |
+
self.metrics(
|
382 |
+
vn=vn,
|
383 |
+
r=r,
|
384 |
+
z_hat=z_hat,
|
385 |
+
target=target,
|
386 |
+
flat_mask=flat_mask,
|
387 |
+
output=output,
|
388 |
+
)
|
389 |
+
|
390 |
+
|
391 |
+
accel.backward(output["loss"] / grad_acc_steps)
|
392 |
+
|
393 |
+
output["other/learning_rate"] = optimizer.param_groups[0]["lr"]
|
394 |
+
output["other/batch_size"] = z.shape[0]
|
395 |
+
|
396 |
+
output.update(get_gpu_memory_map())
|
397 |
+
|
398 |
+
if (
|
399 |
+
(engine.state.iteration % grad_acc_steps == 0)
|
400 |
+
or (engine.state.iteration % epoch_length == 0)
|
401 |
+
or (engine.state.iteration % epoch_length == 1)
|
402 |
+
): # (or we reached the end of the epoch)
|
403 |
+
accel.scaler.unscale_(optimizer)
|
404 |
+
output["other/grad_norm"] = torch.nn.utils.clip_grad_norm_(
|
405 |
+
model.parameters(), grad_clip_val
|
406 |
+
)
|
407 |
+
self._last_grad_norm = output["other/grad_norm"]
|
408 |
+
|
409 |
+
accel.step(optimizer)
|
410 |
+
optimizer.zero_grad()
|
411 |
+
|
412 |
+
scheduler.step()
|
413 |
+
accel.update()
|
414 |
+
else:
|
415 |
+
output["other/grad_norm"] = self._last_grad_norm
|
416 |
+
|
417 |
+
return {k: v for k, v in sorted(output.items())}
|
418 |
+
|
419 |
+
@torch.no_grad()
|
420 |
+
def val_loop(self, engine, batch):
|
421 |
+
model.eval()
|
422 |
+
codec.eval()
|
423 |
+
batch = at.util.prepare_batch(batch, accel.device)
|
424 |
+
signal = apply_transform(val_data.transform, batch)
|
425 |
+
|
426 |
+
vn = accel.unwrap(model)
|
427 |
+
z = codec.encode(signal.samples, signal.sample_rate)["codes"]
|
428 |
+
z = z[:, : vn.n_codebooks, :]
|
429 |
+
|
430 |
+
n_batch = z.shape[0]
|
431 |
+
r = rng.draw(n_batch)[:, 0].to(accel.device)
|
432 |
+
|
433 |
+
if prefix_amt > 0.0:
|
434 |
+
prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
|
435 |
+
n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
|
436 |
+
else:
|
437 |
+
n_prefix = None
|
438 |
+
if suffix_amt > 0.0:
|
439 |
+
suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
|
440 |
+
n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
|
441 |
+
else:
|
442 |
+
n_suffix = None
|
443 |
+
|
444 |
+
z_mask, mask = vn.add_noise(z, r, n_prefix=n_prefix, n_suffix=n_suffix)
|
445 |
+
z_mask_latent = vn.embedding.from_codes(z_mask, codec)
|
446 |
+
|
447 |
+
z_hat = model(z_mask_latent, r)
|
448 |
+
# for mask mode
|
449 |
+
z_hat = vn.add_truth_to_logits(z, z_hat, mask)
|
450 |
+
|
451 |
+
target = vn.embedding.flatten(
|
452 |
+
z[:, vn.n_conditioning_codebooks :, :],
|
453 |
+
n_codebooks=vn.n_predict_codebooks,
|
454 |
+
)
|
455 |
+
|
456 |
+
flat_mask = vn.embedding.flatten(
|
457 |
+
mask[:, vn.n_conditioning_codebooks :, :],
|
458 |
+
n_codebooks=vn.n_predict_codebooks,
|
459 |
+
)
|
460 |
+
|
461 |
+
output = {}
|
462 |
+
if vn.noise_mode == "mask":
|
463 |
+
# replace target with ignore index for masked tokens
|
464 |
+
t_masked = target.masked_fill(~flat_mask.bool(), IGNORE_INDEX)
|
465 |
+
output["loss"] = criterion(z_hat, t_masked)
|
466 |
+
else:
|
467 |
+
output["loss"] = criterion(z_hat, target)
|
468 |
+
|
469 |
+
self.metrics(
|
470 |
+
vn=vn,
|
471 |
+
r=r,
|
472 |
+
z_hat=z_hat,
|
473 |
+
target=target,
|
474 |
+
flat_mask=flat_mask,
|
475 |
+
output=output,
|
476 |
+
)
|
477 |
+
|
478 |
+
return output
|
479 |
+
|
480 |
+
def checkpoint(self, engine):
|
481 |
+
if accel.local_rank != 0:
|
482 |
+
print(f"ERROR:Skipping checkpoint on rank {accel.local_rank}")
|
483 |
+
return
|
484 |
+
|
485 |
+
metadata = {"logs": dict(engine.state.logs["epoch"])}
|
486 |
+
|
487 |
+
if self.state.epoch % save_audio_epochs == 0:
|
488 |
+
self.save_samples()
|
489 |
+
|
490 |
+
tags = ["latest"]
|
491 |
+
loss_key = "loss/val" if "loss/val" in metadata["logs"] else "loss/train"
|
492 |
+
self.print(f"Saving to {str(Path('.').absolute())}")
|
493 |
+
|
494 |
+
if self.is_best(engine, loss_key):
|
495 |
+
self.print(f"Best model so far")
|
496 |
+
tags.append("best")
|
497 |
+
|
498 |
+
for tag in tags:
|
499 |
+
model_extra = {
|
500 |
+
"optimizer.pth": optimizer.state_dict(),
|
501 |
+
"scheduler.pth": scheduler.state_dict(),
|
502 |
+
"trainer.pth": {
|
503 |
+
"start_idx": self.state.iteration * batch_size,
|
504 |
+
"state_dict": self.state_dict(),
|
505 |
+
},
|
506 |
+
"metadata.pth": metadata,
|
507 |
+
}
|
508 |
+
|
509 |
+
accel.unwrap(model).metadata = metadata
|
510 |
+
accel.unwrap(model).save_to_folder(
|
511 |
+
f"{save_path}/{tag}", model_extra
|
512 |
+
)
|
513 |
+
|
514 |
+
def save_sampled(self, z):
|
515 |
+
num_samples = z.shape[0]
|
516 |
+
|
517 |
+
for i in range(num_samples):
|
518 |
+
sampled = accel.unwrap(model).sample(
|
519 |
+
codec,
|
520 |
+
time_steps=z.shape[-1],
|
521 |
+
start_tokens=z[i : i + 1],
|
522 |
+
)
|
523 |
+
sampled.cpu().write_audio_to_tb(
|
524 |
+
f"sampled/{i}",
|
525 |
+
self.writer,
|
526 |
+
step=self.state.epoch,
|
527 |
+
plot_fn=None,
|
528 |
+
)
|
529 |
+
|
530 |
+
def save_imputation(self, z: torch.Tensor):
    """Log imputation examples: noisy input, model output and ground truth.

    The middle half of the time axis (from one quarter to three quarters of
    ``z.shape[-1]``) is marked for imputation. Three signals are logged per
    validation index: the tokens with that region replaced by random codes
    (``imputed_noisy``), the model's sample for that region (``imputed``)
    and the decoded original tokens (``imputed_true``).

    Args:
        z: token tensor indexed as ``(batch, codebook, time)`` by this code.
    """
    vn = accel.unwrap(model)
    n_steps = z.shape[-1]

    # Region to impute: [n_steps // 4, 3 * n_steps // 4)
    mask_begin = n_steps // 4
    mask_end = (n_steps * 3) // 4

    imp_mask = torch.zeros(z.shape[0], n_steps).to(accel.device).int()
    imp_mask[:, mask_begin:mask_end] = 1

    # Broadcast the (batch, time) mask over the codebook axis.
    region = imp_mask[:, None, :]
    random_codes = torch.randint_like(z, 0, vn.vocab_size)
    imp_noisy = z * (1 - region) + random_codes * region

    imputed_noisy = vn.to_signal(imp_noisy, codec)
    imputed_true = vn.to_signal(z, codec)

    # The sampler is invoked one item at a time.
    imputed = AudioSignal.batch(
        [
            vn.sample(
                codec,
                time_steps=n_steps,
                start_tokens=z[i][None, ...],
                mask=imp_mask[i][None, ...],
            )
            for i in range(len(z))
        ]
    )

    outputs = (
        ("imputed_noisy", imputed_noisy),
        ("imputed", imputed),
        ("imputed_true", imputed_true),
    )
    for i in range(len(val_idx)):
        for tag, signals in outputs:
            signals[i].cpu().write_audio_to_tb(
                f"{tag}/{i}",
                self.writer,
                step=self.state.epoch,
                plot_fn=None,
            )
|
577 |
+
|
578 |
+
@torch.no_grad()
def save_samples(self):
    """Render validation examples to TensorBoard.

    Encodes the validation items selected by ``val_idx``, corrupts them with
    ``add_noise`` at ratios spread linearly from 0.1 to 0.95, runs the model
    once, and logs four signals per item: the original audio, the decoded
    masked tokens, the decoded prediction and the codec reconstruction.
    Afterwards ``save_sampled`` and ``save_imputation`` are run on the same
    tokens.

    Both ``model`` and ``codec`` are switched to eval mode here; this method
    does not switch them back.
    """
    model.eval()
    codec.eval()
    vn = accel.unwrap(model)

    batch = [val_data[i] for i in val_idx]
    batch = at.util.prepare_batch(val_data.collate(batch), accel.device)

    signal = apply_transform(val_data.transform, batch)

    z = codec.encode(signal.samples, signal.sample_rate)["codes"]
    z = z[:, : vn.n_codebooks, :]

    # One masking ratio per logged example.
    r = torch.linspace(0.1, 0.95, len(val_idx)).to(accel.device)

    n_batch = z.shape[0]

    # Optional unmasked prefix/suffix, randomly dropped per item.
    if prefix_amt > 0.0:
        prefix_mask = flip_coin(n_batch, 1 - prefix_dropout, rng)
        n_prefix = int(prefix_amt * z.shape[-1]) * prefix_mask
    else:
        n_prefix = None
    if suffix_amt > 0.0:
        suffix_mask = flip_coin(n_batch, 1 - suffix_dropout, rng)
        n_suffix = int(suffix_amt * z.shape[-1]) * suffix_mask
    else:
        n_suffix = None

    z_mask, mask = vn.add_noise(z, r, n_prefix=n_prefix, n_suffix=n_suffix)
    z_mask_latent = vn.embedding.from_codes(z_mask, codec)

    z_hat = model(z_mask_latent, r)
    # for mask mode
    z_hat = vn.add_truth_to_logits(z, z_hat, mask)

    # argmax over the class axis; softmax is monotonic so it is not needed.
    z_pred = z_hat.argmax(dim=1)
    z_pred = vn.embedding.unflatten(z_pred, n_codebooks=vn.n_predict_codebooks)
    z_pred = torch.cat([z[:, : vn.n_conditioning_codebooks, :], z_pred], dim=1)

    generated = vn.to_signal(z_pred, codec)
    reconstructed = vn.to_signal(z, codec)
    masked = vn.to_signal(z_mask.squeeze(1), codec)

    for i in range(generated.batch_size):
        audio_dict = {
            "original": signal[i],
            "masked": masked[i],
            "generated": generated[i],
            "reconstructed": reconstructed[i],
        }
        for k, v in audio_dict.items():
            v.cpu().write_audio_to_tb(
                f"samples/_{i}.r={r[i]:0.2f}/{k}",
                self.writer,
                step=self.state.epoch,
                plot_fn=None,
            )

    self.save_sampled(z)
    self.save_imputation(z)
|
640 |
+
|
641 |
+
trainer = Trainer(writer=writer, quiet=quiet)
|
642 |
+
|
643 |
+
if trainer_state["state_dict"] is not None:
|
644 |
+
trainer.load_state_dict(trainer_state["state_dict"])
|
645 |
+
if hasattr(train_dataloader.sampler, "set_epoch"):
|
646 |
+
train_dataloader.sampler.set_epoch(trainer.trainer.state.epoch)
|
647 |
+
|
648 |
+
trainer.run(
|
649 |
+
train_dataloader,
|
650 |
+
val_dataloader,
|
651 |
+
num_epochs=max_epochs,
|
652 |
+
epoch_length=epoch_length,
|
653 |
+
detect_anomaly=detect_anomaly,
|
654 |
+
)
|
655 |
+
|
656 |
+
|
657 |
+
if __name__ == "__main__":
    # Parse CLI/config arguments; only the process with LOCAL_RANK 0
    # (or no LOCAL_RANK set) gets `args.debug` enabled.
    cli_args = argbind.parse_args()
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    cli_args["args.debug"] = local_rank == 0

    with argbind.scope(cli_args), Accelerator() as accel:
        train(cli_args, accel)
|
scripts/utils/README.md
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Scripts
|
2 |
+
|
3 |
+
## process_zip.py
|
4 |
+
|
5 |
+
Some requirements that may not be installed in the docker image:
|
6 |
+
* argbind
|
7 |
+
* wav2wav (pip install git+https://github.com/descriptinc/lyrebird-wav2wav.git or `pip install git+https://github.com/descriptinc/lyrebird-wav2wav.git@<branchname>`)
|
8 |
+
|
9 |
+
### zip folder structure
|
10 |
+
|
11 |
+
The zip folder should have the following internal structure:
|
12 |
+
|
13 |
+
```
|
14 |
+
base_folder/
|
15 |
+
test_case_1/
|
16 |
+
before.wav
|
17 |
+
test_case_2/
|
18 |
+
before.wav
|
19 |
+
...
|
20 |
+
test_case_n/
|
21 |
+
before.wav
|
22 |
+
```
|
23 |
+
|
24 |
+
Note: The output zip can come out wrong if the input zip's folder structure is deeper or shallower than the layout shown above. If you need to use a zip file with a different folder structure, adjust this line:
|
25 |
+
https://github.com/descriptinc/lyrebird-wav2wav/blob/136c923ce19df03876a515ca0ed83854710cfa30/scripts/utils/process_zip.py#L28
|
26 |
+
|
27 |
+
### Execution
|
28 |
+
`python process_zip.py <path/to/zip> -tag <string>`
|
scripts/utils/stage.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import argbind
|
6 |
+
import rich
|
7 |
+
from audiotools.ml import Experiment
|
8 |
+
|
9 |
+
|
10 |
+
@argbind.bind(without_prefix=True)
def run(
    run_dir: str = os.getenv("PATH_TO_RUNS", "runs"),
    name: str = None,
    recent: bool = False,
):
    """Snapshot the current code into an experiment directory.

    Parameters
    ----------
    run_dir : str
        Directory holding experiment runs. Defaults to the ``PATH_TO_RUNS``
        environment variable, or ``runs`` when it is unset.
    name : str
        Experiment name passed to ``Experiment``.
    recent : bool
        When set, ``name`` is replaced by the most recently modified
        sub-directory of ``run_dir`` (if there is one).
    """
    if recent:
        entries = list(Path(run_dir).iterdir())
        entries.sort(key=os.path.getmtime)
        run_names = [entry.name for entry in entries if entry.is_dir()]
        if run_names:
            # Sorted oldest to newest, so the last one is the most recent.
            name = run_names[-1]

    with Experiment(run_dir, name) as exp:
        exp.snapshot()
        rich.print(f"Created a snapshot of {exp.parent_directory} at {exp.exp_dir}")
|
25 |
+
|
26 |
+
|
27 |
+
if __name__ == "__main__":
    # Bind parsed CLI arguments to `run` for the duration of the call.
    cli_args = argbind.parse_args()
    with argbind.scope(cli_args):
        run()
|
setup.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import find_packages
|
2 |
+
from setuptools import setup
|
3 |
+
|
4 |
+
# Read the README as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform locale, which can fail on non-ASCII text.
with open("README.md", encoding="utf-8") as f:
    long_description = f.read()

setup(
    name="vampnet",
    version="0.0.1",
    classifiers=[
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Programming Language :: Python :: 3.7",
        "Topic :: Artistic Software",
        "Topic :: Multimedia",
        "Topic :: Multimedia :: Sound/Audio",
        "Topic :: Multimedia :: Sound/Audio :: Editors",
        "Topic :: Software Development :: Libraries",
    ],
    description="Generative Music Modeling.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="Hugo Flores García",
    author_email="[email protected]",
    url="https://github.com/descriptinc/lyrebird-vampnet",
    license="MIT",
    packages=find_packages(),
    install_requires=[
        "torch<=1.11.0",
        "argbind>=0.3.2",
        "pytorch-ignite",
        "rich",
        "audiotools @ git+https://github.com/descriptinc/[email protected]",
        "tqdm",
        "tensorboard",
        "google-cloud-logging==2.2.0",
        "torchmetrics>=0.7.3",
        "einops",
    ],
)
|
vampnet/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from . import modules
|
3 |
+
from . import scheduler
|
4 |
+
from . import enchilada
|
5 |
+
|
6 |
+
__version__ = "0.0.1"
|
vampnet/enchilada.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from audiotools import AudioSignal
|
6 |
+
|
7 |
+
from .modules.transformer import VampNet
|
8 |
+
from lac.model.lac import LAC
|
9 |
+
|
10 |
+
|
11 |
+
class TheWholeEnchilada:
    """Codec + coarse model + coarse-to-fine model, wired for "vamping".

    All three checkpoints are loaded in ``__init__``, put in eval mode and
    moved to ``device``.
    """

    def __init__(
        self,
        coarse_ckpt: str,
        coarse2fine_ckpt: str,
        codec_ckpt: str,
        device: str = "cpu",
    ):
        """Load the three checkpoints.

        Args:
            coarse_ckpt: path to the coarse ``VampNet`` checkpoint.
            coarse2fine_ckpt: path to the coarse-to-fine ``VampNet`` checkpoint.
            codec_ckpt: path to the ``LAC`` codec checkpoint.
            device: device every model is moved to.
        """
        self.codec = LAC.load(Path(codec_ckpt))
        self.codec.eval()
        self.codec.to(device)

        self.coarse = VampNet.load(location=Path(coarse_ckpt), map_location="cpu")
        self.coarse.to(device)
        self.coarse.eval()

        self.coarse2fine = VampNet.load(
            location=Path(coarse2fine_ckpt), map_location="cpu"
        )
        # FIXME
        print(
            f"WARNING: PATCHING coarse2fine seq_len to 288, for backwards compatibility with a specific jazzpop model. it used to be {self.coarse2fine.seq_len}"
        )
        self.coarse2fine.seq_len = 288

        self.coarse2fine.to(device)
        self.coarse2fine.eval()

        self.device = device

    def seconds_to_tokens(self, seconds: float):
        """Convert a duration in seconds to a token count.

        Uses the codec's ``sample_rate`` and ``hop_length``; the result is
        truncated towards zero.
        """
        return int(seconds * self.codec.sample_rate / self.codec.hop_length)

    def to(self, device):
        """Move all three models to ``device`` and return ``self``."""
        self.device = device
        self.coarse.to(device)
        self.coarse2fine.to(device)
        self.codec.to(device)
        return self

    def encode(self, signal: AudioSignal):
        """Encode ``signal`` with the codec and return its ``"codes"``."""
        with torch.inference_mode():
            # coarse z
            cz = self.codec.encode(signal.samples, signal.sample_rate)["codes"]

        return cz

    def vamp(
        self,
        signal,
        prefix_dur_s: float = 1.25,
        suffix_dur_s: float = 1.25,
        downsample_hint: bool = True,
        downsample_factor: int = 4,
        num_loops: int = 3,
        **kwargs,
    ):
        """
        Loop imputation of a signal.

        The coarse model regenerates the region between a kept prefix and a
        kept suffix ``num_loops`` times, each time sliding the freshly
        generated ``prefix_dur_s`` worth of tokens into the prefix slot. The
        coarse-to-fine model then fills in the remaining codebooks in
        windows of ``coarse2fine.seq_len`` tokens.

        Args:
            signal: input audio; it is moved to ``self.device``, resampled to
                the codec sample rate and mixed to mono before encoding.
            prefix_dur_s: seconds kept (unmasked) at the start; also the hop
                by which the vamp advances each loop.
            suffix_dur_s: seconds kept (unmasked) at the end.
            downsample_hint: when true, every ``downsample_factor``-th time
                step is also left unmasked as a hint.
            downsample_factor: stride of the hint.
            num_loops: number of coarse sampling passes; must be at least 1.
            **kwargs: forwarded to ``self.coarse.sample``.

        Returns:
            dict with ``"full"`` (decoded coarse + fine tokens) and
            ``"coarse"`` (decoded coarse tokens only).

        Raises:
            ValueError: if ``num_loops`` is smaller than 1.
        """
        if num_loops < 1:
            # Without at least one pass there is nothing to append as suffix.
            raise ValueError(f"num_loops must be >= 1, got {num_loops}")

        # NOTE(review): relies on these AudioSignal methods modifying
        # `signal` in place, since the chained result is discarded.
        signal.to(self.device).resample(self.codec.sample_rate).to_mono()

        z = self.encode(signal)

        cz = z[:, : self.coarse.n_codebooks, :].clone()
        original_cz = cz.clone()
        seq_len = original_cz.shape[-1]
        assert (
            seq_len == self.coarse.seq_len
        ), f"expected seq_len {self.coarse.seq_len}, got {seq_len} for token sequence length. Is your signal the same duration as the model was trained with? "

        vamp_hop_s = prefix_dur_s
        vamp_hop = self.seconds_to_tokens(vamp_hop_s)

        # 1 = regenerate this token, 0 = keep it.
        cmask = torch.ones_like(cz)

        if downsample_hint:
            # keep every `downsample_factor`-th time step as a hint
            cmask[:, :, ::downsample_factor] = 0

        if prefix_dur_s > 0:
            prefix_len = self.seconds_to_tokens(prefix_dur_s)
            cmask[:, :, :prefix_len] = 0
            print(f"prefix_len: {prefix_len}")
        else:
            prefix_len = 0

        if suffix_dur_s > 0:
            suffix_len = self.seconds_to_tokens(suffix_dur_s)
            # `-0:` would select the whole axis, so skip a zero-length suffix.
            if suffix_len > 0:
                cmask[:, :, -suffix_len:] = 0
            print(f"suffix_len: {suffix_len}")
        else:
            suffix_len = 0

        prefix_z = cz[:, :, :prefix_len]

        coarse_vamp = [prefix_z.clone()]
        for i in range(num_loops):
            sampled_cz = self.coarse.sample(
                codec=self.codec,
                time_steps=seq_len,
                mask=cmask,
                start_tokens=cz,
                return_signal=False,
                **kwargs,
            )

            new_prefix = sampled_cz[:, :, prefix_len : prefix_len + vamp_hop]
            coarse_vamp.append(new_prefix.clone())

            # replace the prefix in cz with the new prefix
            # don't worry about a copy of the prefix still being
            # in the mask area, since that will be masked out
            cz[:, :, :vamp_hop] = new_prefix.clone()

        # we're done, so add the suffix
        coarse_vamp.append(sampled_cz[:, :, prefix_len + vamp_hop :])

        # concatenate the vamps
        coarse_vamp = torch.cat(coarse_vamp, dim=-1)

        # Random fine codebooks, with the original fine tokens restored in
        # the kept prefix and suffix.
        fine_vamp = torch.randint(
            0,
            self.coarse2fine.vocab_size,
            (
                coarse_vamp.shape[0],
                self.coarse2fine.n_predict_codebooks,
                coarse_vamp.shape[-1],
            ),
        ).to(self.device)
        fine_vamp[:, :, :prefix_len] = z[:, self.coarse.n_codebooks :, :prefix_len]
        if suffix_len > 0:
            fine_vamp[:, :, -suffix_len:] = z[
                :, self.coarse.n_codebooks :, -suffix_len:
            ]

        vamp_z = torch.cat([coarse_vamp, fine_vamp], dim=1)

        # now we sample from the coarse2fine model
        # to get the fine details
        window = self.coarse2fine.seq_len
        total_len = vamp_z.shape[-1]

        c2f_vamp = []
        for start_pos in range(0, total_len, window):
            end_pos = min(start_pos + window, total_len)

            c2fz = vamp_z[:, :, start_pos:end_pos]
            sampled_c2fz = self.coarse2fine.sample(
                codec=self.codec,
                start_tokens=c2fz,
                return_signal=False,
                mask=None,
            )
            c2f_vamp.append(sampled_c2fz)

        c2f_vamp = torch.cat(c2f_vamp, dim=-1)

        # make it a signal
        vamp_signal = self.coarse2fine.to_signal(c2f_vamp, self.codec)

        return {
            "full": vamp_signal,
            "coarse": self.coarse.to_signal(coarse_vamp, self.codec),
        }
|
vampnet/modules/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import audiotools
|
2 |
+
|
3 |
+
audiotools.ml.BaseModel.INTERN += ["vampnet.modules.**"]
|
4 |
+
audiotools.ml.BaseModel.EXTERN += ["einops", "flash_attn.flash_attention"]
|
vampnet/modules/base.py
ADDED
@@ -0,0 +1,461 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
from typing import Optional
|
3 |
+
from typing import Tuple
|
4 |
+
from typing import Union
|
5 |
+
|
6 |
+
import audiotools as at
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
import torch.nn.functional as F
|
10 |
+
from einops import rearrange
|
11 |
+
from tqdm import tqdm
|
12 |
+
|
13 |
+
|
14 |
+
def log(t, eps=1e-20):
    """Return the natural log of ``t + eps``.

    ``eps`` is added before taking the log so that zeros in ``t`` give a
    large negative number instead of ``-inf``.
    """
    shifted = t + eps
    return shifted.log()
|
16 |
+
|
17 |
+
|
18 |
+
def gumbel_noise(t):
    """Return Gumbel-distributed noise with the same shape and dtype as ``t``.

    Built from uniform samples ``u`` in [0, 1) as ``-log(-log(u))``, using
    the epsilon-stabilised ``log`` helper for both logarithms.
    """
    uniform = torch.zeros_like(t)
    uniform.uniform_(0, 1)
    inner = -log(uniform)
    return -log(inner)
|
21 |
+
|
22 |
+
|
23 |
+
def gumbel_sample(t, temperature=1.0, dim=-1):
    """Draw indices along ``dim`` by adding Gumbel noise and taking argmax.

    ``t`` is divided by ``temperature`` (floored at 1e-10 to avoid division
    by zero) before the noise is added.
    """
    safe_temperature = max(temperature, 1e-10)
    perturbed = (t / safe_temperature) + gumbel_noise(t)
    return perturbed.argmax(dim=dim)
|
25 |
+
|
26 |
+
|
27 |
+
class VampBase(at.ml.BaseModel):
|
28 |
+
def forward(self, x: torch.Tensor, r: torch.Tensor):
    """Abstract forward pass; subclasses must override it.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
|
30 |
+
|
31 |
+
def add_noise(
    self,
    x: torch.Tensor,
    r: torch.Tensor,
    random_x: Optional[torch.Tensor] = None,
    mask: Optional[torch.Tensor] = None,
    n_prefix: Optional[torch.Tensor] = None,
    n_suffix: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Corrupt the tokens in ``x`` and return them with the mask used.

    When ``mask`` is not given, each position is selected independently with
    probability ``self.gamma(r)`` (one ratio per batch item). Positions in
    the first ``n_prefix[i]`` / last ``n_suffix[i]`` time steps of item ``i``
    and all conditioning codebooks are never selected. Selected positions
    are replaced by ``self.mask_token`` in "mask" mode, or by ``random_x``
    (random tokens when omitted) in "random" mode.

    Args:
        x: tokens, shape ``(batch, n_codebooks, seq)``.
        r: one ratio per batch item; only used when ``mask`` is None.
        random_x: replacement tokens for "random" mode; ignored in "mask" mode.
        mask: explicit 0/1 mask with the same shape as ``x``.
        n_prefix: per-item count of leading time steps to leave untouched.
        n_suffix: per-item count of trailing time steps to leave untouched.

    Returns:
        ``(noisy_x, mask)`` where ``mask`` is 1 at replaced positions.

    Raises:
        ValueError: if ``self.noise_mode`` is neither "mask" nor "random".
    """
    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"

    if mask is None:
        r = self.gamma(r)[:, None, None]
        probs = torch.ones_like(x) * r

        # if we have a prefix or suffix, set their mask prob to 0
        if n_prefix is not None:
            for i, n in enumerate(n_prefix):
                probs[i, :, :n] = 0.0
        if n_suffix is not None:
            for i, n in enumerate(n_suffix):
                # A zero-length suffix must be skipped: `-0:` is `0:` and
                # would clear the probabilities of the whole sequence.
                if int(n) > 0:
                    probs[i, :, -n:] = 0.0

        mask = torch.bernoulli(probs)
        mask = mask.round().long()

        # if we have any conditioning codebooks, set their mask to 0
        mask[:, : self.n_conditioning_codebooks, :] = 0
    else:
        assert mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
        assert mask.shape == x.shape, "mask must be same shape as x"

    if self.noise_mode == "mask":
        random_x = torch.full_like(x, self.mask_token)
    elif self.noise_mode == "random":
        # Only draw random tokens when the caller did not supply any.
        if random_x is None:
            random_x = torch.randint_like(x, 0, self.vocab_size)
    else:
        raise ValueError(f"invalid noise mode {self.noise_mode}")

    x = x * (1 - mask) + random_x * mask
    return x, mask
|
76 |
+
|
77 |
+
def add_truth_to_logits(
    self,
    z_true,
    z_hat,
    mask,
):
    """Overwrite the logits of unmasked positions with the known tokens.

    Only acts in "mask" noise mode; otherwise ``z_hat`` is returned as is.
    At positions where ``mask`` is 0 the logits are replaced by the one-hot
    encoding of the true token, and at positions where it is 1 they are kept.
    Conditioning codebooks are dropped from ``z_true`` and ``mask`` first.

    Args:
        z_true: ground-truth tokens, indexed ``(batch, codebook, time)``.
        z_hat: logits laid out as ``"b p (t c)"`` over the predicted codebooks.
        mask: 0/1 tensor with the same layout as ``z_true``.

    Returns:
        Logits in the same ``"b p (t c)"`` layout as the input.
    """
    if self.noise_mode == "mask":
        z_true = z_true[:, self.n_conditioning_codebooks :, :]
        mask = mask[:, self.n_conditioning_codebooks :, :]

        truth = F.one_hot(z_true, self.vocab_size)
        # Broadcast the mask over the class axis.
        mask = mask[:, :, :, None].expand(-1, -1, -1, self.vocab_size)
        z_hat = rearrange(
            z_hat,
            "b p (t c) -> b c t p",
            c=self.n_codebooks - self.n_conditioning_codebooks,
        )

        z_hat = z_hat * mask + truth * (1 - mask)

        z_hat = rearrange(z_hat, "b c t p -> b p (t c)")

    return z_hat
|
102 |
+
|
103 |
+
def gamma(self, r):
    """Cosine schedule: map ``r`` to ``cos(r * pi / 2)``.

    ``r = 0`` gives 1 and ``r = 1`` gives (approximately) 0.
    """
    angle = r * torch.pi / 2
    return torch.cos(angle)
|
105 |
+
|
106 |
+
def r_embed(self, r, max_positions=10000):
    """Embed the ratio ``r`` as sinusoidal features of width ``r_cond_dim``.

    ``r`` is first passed through ``self.gamma`` and scaled by
    ``max_positions``. The first half of the output holds sines and the
    second half cosines at geometrically spaced frequencies; an odd
    ``r_cond_dim`` is zero-padded by one column. When ``r_cond_dim`` is 0
    (or negative), ``r`` is returned unchanged.

    Args:
        r: 1-D tensor of ratios.
        max_positions: scale applied to ``gamma(r)`` and base of the
            frequency spacing.

    Returns:
        Tensor of shape ``(len(r), r_cond_dim)`` in ``r``'s dtype, or ``r``.
    """
    assert hasattr(self, "r_cond_dim"), "must set r_cond_dim before calling r_embed"

    if self.r_cond_dim > 0:
        dtype = r.dtype

        r = self.gamma(r) * max_positions
        half_dim = self.r_cond_dim // 2

        # With a single frequency (r_cond_dim of 2 or 3) `half_dim - 1` is
        # zero; clamp the divisor so that case yields frequency 1 instead of
        # raising ZeroDivisionError.
        emb = math.log(max_positions) / max(half_dim - 1, 1)
        emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()

        emb = r[:, None] * emb[None, :]
        emb = torch.cat([emb.sin(), emb.cos()], dim=1)

        if self.r_cond_dim % 2 == 1:  # zero pad
            emb = nn.functional.pad(emb, (0, 1), mode="constant")

        return emb.to(dtype)
    else:
        return r
|
128 |
+
|
129 |
+
@torch.no_grad()
def to_signal(self, z, vqvae):
    """Decode tokens ``z`` into an ``AudioSignal`` using the codec ``vqvae``.

    A 2-D ``z`` is first unflattened through ``self.embedding``; after that
    ``z`` must be 3-D. The tokens are turned into latents, passed through
    the codec quantizer's ``from_latents`` (first returned element) and
    decoded; the ``"audio"`` entry of the decoder output becomes the signal.

    Args:
        z: token tensor, 2-D (flattened) or 3-D.
        vqvae: codec providing ``quantizer``, ``decode`` and ``sample_rate``.

    Returns:
        An ``AudioSignal`` at ``vqvae.sample_rate``.
    """
    if z.ndim == 2:
        z = self.embedding.unflatten(z)
    assert z.ndim == 3

    latents = self.embedding.from_codes(z, vqvae)
    quantized = vqvae.quantizer.from_latents(latents)[0]
    audio = vqvae.decode(quantized)["audio"]

    return at.AudioSignal(audio, vqvae.sample_rate)
|
143 |
+
|
144 |
+
@torch.no_grad()
def sample(self, *args, **kwargs):
    """Dispatch to the sampler that matches ``self.noise_mode``.

    "mask" mode uses ``maskgit_sample``; every other mode uses
    ``paella_sample``. All arguments are forwarded unchanged, except that a
    ``codec`` keyword is accepted as an alias for the samplers' ``vqvae``
    parameter. The codec may also be passed positionally.

    Raises:
        TypeError: if ``codec`` is given together with ``vqvae`` or with a
            positional codec argument.
    """
    if "codec" in kwargs:
        if args or "vqvae" in kwargs:
            raise TypeError("sample() got the codec more than once")
        kwargs["vqvae"] = kwargs.pop("codec")

    if self.noise_mode == "mask":
        sampler = self.maskgit_sample
    else:
        sampler = self.paella_sample
    return sampler(*args, **kwargs)
|
150 |
+
|
151 |
+
def paella_sample(
    self,
    vqvae,
    time_steps: int = 400,
    sampling_steps: int = 12,
    start_tokens: Optional[torch.Tensor] = None,
    mask: Optional[torch.Tensor] = None,
    device: str = "cpu",
    temperature: Union[float, Tuple[float, float]] = 1.0,
    top_k: int = None,
    sample: str = "gumbel",
    renoise_mode: str = "start",
    renoise_steps=None,
    typical_filtering=True,
    typical_mass=0.2,
    typical_min_tokens=1,
    return_signal=True,
):
    """Iteratively sample tokens, re-noising between steps.

    Each step predicts all tokens from the current ones, samples new tokens
    from the logits, optionally re-noises them at the next ratio, and then
    restores the positions that ``mask`` marks as kept.

    Args:
        vqvae: codec used to embed tokens and to decode the result.
        time_steps: sequence length when ``start_tokens`` is not given.
        sampling_steps: number of refinement steps.
        start_tokens: initial tokens, shape ``(1, n_codebooks, seq_len)``.
            Required when the model has conditioning codebooks.
        mask: ``(batch, seq_len)`` tensor, 1 where tokens are regenerated.
            Defaults to regenerating everything.
        device: device for tensors created here.
        temperature: a float, or a ``(low, high)`` tuple that is linearly
            interpolated over the steps.
        top_k: forwarded to ``sample_from_logits``.
        sample: sampling strategy name forwarded to ``sample_from_logits``.
        renoise_mode: source of replacement tokens when re-noising:
            "prev" (tokens before this step), "start" (initial noisy tokens)
            or "rand" (fresh random tokens).
        renoise_steps: number of leading steps followed by re-noising;
            defaults to ``sampling_steps - 1``.
        typical_filtering, typical_mass, typical_min_tokens: forwarded to
            ``sample_from_logits``.
        return_signal: return decoded audio instead of tokens.

    Returns:
        The decoded signal, or the token tensor when ``return_signal`` is
        false.

    Raises:
        TypeError: if ``temperature`` is neither a float nor a tuple.
        ValueError: for an unknown ``renoise_mode``, or an unknown noise mode
            when ``start_tokens`` is not given.
    """
    r = torch.linspace(0, 1, sampling_steps + 1)[:-1][:, None].to(device)
    if renoise_steps is None:
        renoise_steps = sampling_steps - 1

    if isinstance(temperature, float):
        temperature = torch.tensor(temperature).repeat(sampling_steps)
    elif isinstance(temperature, tuple):
        assert len(temperature) == 2
        l, h = temperature
        temperature = torch.linspace(l, h, sampling_steps)
    else:
        raise TypeError(f"invalid type for temperature")

    if self.n_conditioning_codebooks > 0:
        assert (
            start_tokens is not None
        ), "must provide start_tokens if n_conditioning_codebooks > 0"

    if start_tokens is None:
        # "random" is the mode name add_noise uses; "noise" is kept for
        # backwards compatibility with the previous spelling here.
        if self.noise_mode in ("random", "noise"):
            z = torch.randint(
                0, self.vocab_size, size=(1, self.n_codebooks, time_steps)
            ).to(device)
        elif self.noise_mode == "mask":
            z = torch.full(
                (1, self.n_codebooks, time_steps), self.mask_token
            ).to(device)
        else:
            raise ValueError(f"invalid noise mode {self.noise_mode}")
    else:
        z = start_tokens
    assert (
        z.ndim == 3
    ), f"start_tokens must be shape (batch, n_codebooks, seq_len), got {z.shape}"
    assert z.shape[0] == 1, f"batch size must be 1"

    if mask is None:
        mask = torch.ones(z.shape[0], z.shape[-1]).to(device).int()

    # apply mask
    assert mask.shape == (
        z.shape[0],
        z.shape[-1],
    ), f"mask must be shape (batch, seq_len), got {mask.shape}"
    mask = mask[:, None, :]
    mask = mask.repeat(1, z.shape[1], 1)
    mask[:, : self.n_conditioning_codebooks, :] = 0.0

    # Snapshot of the clean tokens. It is needed in every noise mode: it is
    # passed to add_truth_to_logits and used to restore kept positions.
    z_true = z.clone()

    z, mask = self.add_noise(z, r=r[0], random_x=None, mask=mask)
    z_init = z.clone()
    for i, tmpt in enumerate(temperature):
        if renoise_mode == "prev":
            z_prev = z.clone()

        latents = self.embedding.from_codes(z, vqvae)
        logits = self.forward(latents, r[i])

        # for mask mode
        logits = self.add_truth_to_logits(z_true, logits, mask)

        logits = logits.permute(0, 2, 1)

        # Keywords are required here: sample_from_logits takes `top_k`
        # before `temperature`, so positional passing would swap them.
        z = self.sample_from_logits(
            logits,
            top_k=top_k,
            temperature=tmpt,
            sample=sample,
            typical_filtering=typical_filtering,
            typical_mass=typical_mass,
            typical_min_tokens=typical_min_tokens,
        )

        # add back in conditioning codebooks
        z = self.embedding.unflatten(z, n_codebooks=self.n_predict_codebooks)
        z = torch.cat(
            [z_init[:, : self.n_conditioning_codebooks, :], z], dim=1
        ).int()

        if i < renoise_steps:
            if renoise_mode == "prev":
                z, _ = self.add_noise(z, r[i + 1], random_x=z_prev)
            elif renoise_mode == "start":
                z, _ = self.add_noise(z, r[i + 1], random_x=z_init)
            elif renoise_mode == "rand":
                z, _ = self.add_noise(z, r[i + 1])
            else:
                raise ValueError(f"Invalid renoise_mode: {renoise_mode}")

        # Restore the kept positions from the clean tokens. `z_true` equals
        # `start_tokens` when those were given and also works when not.
        z = z_true * (1 - mask) + z * mask

    if return_signal:
        return self.to_signal(z, vqvae)
    else:
        return z
|
264 |
+
|
265 |
+
def maskgit_sample(
    self,
    vqvae,
    time_steps: int = 300,
    sampling_steps: int = 24,
    start_tokens: Optional[torch.Tensor] = None,
    mask: Optional[torch.Tensor] = None,
    temperature: Union[float, Tuple[float, float]] = 1.0,
    top_k: int = None,
    sample: str = "multinomial",
    typical_filtering=False,
    typical_mass=0.2,
    typical_min_tokens=1,
    return_signal=True,
):
    """Iterative confidence-based sampling for "mask" noise mode.

    Starting from the tokens that ``mask`` marks as kept, every step
    predicts all tokens, keeps the most confident ones according to the
    schedule ``gamma(1 - r)``, and re-masks the rest for the next step.

    Args:
        vqvae: codec used to embed tokens and to decode the result.
        time_steps: sequence length when ``start_tokens`` is not given.
        sampling_steps: number of refinement steps.
        start_tokens: initial tokens ``(batch, n_codebooks, seq_len)``;
            defaults to a single all-``mask_token`` sequence.
        mask: 1 where tokens are regenerated, 0 where they are kept. Either
            ``(batch, seq_len)`` or the full token shape. Defaults to
            regenerating everything. Conditioning codebooks are always kept.
        temperature: a float, or a ``(low, high)`` tuple that is linearly
            interpolated over the steps.
        top_k, sample, typical_filtering, typical_mass, typical_min_tokens:
            forwarded to ``sample_from_logits``.
        return_signal: return decoded audio instead of tokens.

    Returns:
        The decoded signal, or the token tensor when ``return_signal`` is
        false.

    Raises:
        TypeError: if ``temperature`` is neither a float nor a tuple.
    """
    if isinstance(temperature, float):
        temperature = torch.tensor(temperature).repeat(sampling_steps)
    elif isinstance(temperature, tuple):
        assert len(temperature) == 2
        l, h = temperature
        temperature = torch.linspace(l, h, sampling_steps)
    else:
        raise TypeError(f"invalid type for temperature")

    def flatten(codes):
        return rearrange(codes, "b c t -> b (t c)")

    def unflatten(codes, c):
        return rearrange(codes, "b (t c) -> b c t", c=c)

    z = start_tokens

    if z is None:
        z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
            self.device
        )

    if mask is None:
        mask = torch.ones_like(z).to(self.device).int()
    elif mask.ndim == 2:
        mask = mask[:, None, :].repeat(1, z.shape[1], 1)
    else:
        # Work on a copy so the caller's mask is not modified below.
        mask = mask.clone()
    # Conditioning codebooks are never regenerated, whichever mask was given
    # (paella_sample does the same).
    mask[:, : self.n_conditioning_codebooks, :] = 0

    # figure out which timesteps we're keeping
    keep_mask = flatten(1 - mask)

    # our r steps
    r_steps = torch.linspace(0, 1, sampling_steps + 1)[1:].to(self.device)

    # how many tokens did we keep on init? (plain int so it can be used as
    # the `k` of topk below)
    num_kept_on_init = int(keep_mask.sum())

    # how many codebooks are we inferring vs conditioning on?
    n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks

    # z keeps its shape across steps, so this count is loop-invariant
    num_tokens_to_infer = (z.shape[-1] * z.shape[-2]) - num_kept_on_init

    for i in tqdm(range(sampling_steps)):
        # our current temperature
        tmpt = temperature[i]

        # our current schedule step
        r = r_steps[i : i + 1]

        with torch.inference_mode():
            # mask our z
            keep_mask_unflat = unflatten(keep_mask, c=self.n_codebooks)
            z_masked = z.masked_fill(~keep_mask_unflat.bool(), self.mask_token)

            # get latents
            latents = self.embedding.from_codes(z_masked, vqvae)

            # infer from latents
            logits = self.forward(latents, r)
            logits = logits.permute(0, 2, 1)  # b, seq, prob

            # the schedule determines how many samples to keep
            num_to_keep = num_kept_on_init + int(
                num_tokens_to_infer * (self.gamma(1 - r))
            )

            # Always defined, so the log below is safe even when nothing is
            # kept at this step.
            probs = logits.softmax(dim=-1)

            # figure out which logits we wanna keep
            if num_to_keep > 0:
                keep_probs = F.one_hot(z, self.vocab_size)

                # prepend the (known) conditioning codebooks
                probs = rearrange(
                    probs, "b (t c) p -> b c t p", c=n_infer_codebooks
                )
                probs = torch.cat(
                    [keep_probs[:, : self.n_conditioning_codebooks, ...], probs],
                    dim=1,
                )

                keep_probs = rearrange(
                    keep_probs, "b c t p -> b (t c) p", c=self.n_codebooks
                )
                probs = rearrange(probs, "b c t p -> b (t c) p", c=self.n_codebooks)

                # kept positions get a one-hot distribution on their token
                keep_prob_mask = keep_mask.unsqueeze(-1).repeat(
                    1, 1, self.vocab_size
                )
                probs = (keep_prob_mask.long() * keep_probs) + (
                    1 - keep_prob_mask.long()
                ) * probs

                # keep the `num_to_keep` most confident positions
                highest_probs = probs.max(dim=-1, keepdim=False)[0]
                v, _ = highest_probs.topk(num_to_keep, dim=-1)

                keep_mask = torch.ones_like(keep_mask).bool().clone()
                keep_mask[highest_probs < v[..., [-1]]] = 0

            logits = torch.log(probs)

            z_inferred = self.sample_from_logits(
                logits=logits,
                top_k=top_k,
                temperature=tmpt,
                sample=sample,
                typical_filtering=typical_filtering,
                typical_mass=typical_mass,
                typical_min_tokens=typical_min_tokens,
            )

            z = rearrange(z_inferred, "b (t c) -> b c t", c=self.n_codebooks)

    if return_signal:
        return self.to_signal(z, vqvae)
    else:
        return z
|
406 |
+
|
407 |
+
def sample_from_logits(
    self,
    logits,
    top_k: int = None,
    temperature: float = 1.0,
    sample: str = "multinomial",
    typical_filtering=False,
    typical_mass=0.2,
    typical_min_tokens=1,
):
    """Draw one token index per position from ``logits``.

    Parameters
    ----------
    logits : Tensor
        Unnormalised log-probabilities with the vocabulary on the last
        axis. The typical-filtering and multinomial paths unpack/iterate
        it as a 3-D ``(batch, positions, vocab)`` tensor.
    top_k : int, optional
        If given, every logit below the k-th largest one (per position)
        is set to ``-inf`` before sampling.
    temperature : float
        Divisor applied to the logits before any filtering.
    sample : str
        ``"multinomial"``, ``"argmax"`` or ``"gumbel"``.
    typical_filtering : bool
        Enable typical filtering (mutually exclusive with ``top_k``).
    typical_mass : float
        Probability mass to keep when typical filtering is enabled.
    typical_min_tokens : int
        Minimum number of candidates kept by typical filtering.

    Returns
    -------
    Tensor
        Sampled indices, i.e. ``logits`` without its last axis.

    Raises
    ------
    ValueError
        If ``sample`` is not one of the supported methods.
    """
    # Temperature scaling; this also yields a fresh tensor, so the in-place
    # top-k masking below never touches the caller's logits.
    logits = logits / temperature

    # Top-k: drop everything strictly below the k-th best logit.
    if top_k is not None:
        kth_best, _ = logits.topk(top_k)
        logits[logits < kth_best[..., [-1]]] = -float("inf")

    if typical_filtering:
        assert top_k is None
        nb, nt, _ = logits.shape
        x_flat = rearrange(logits, "b t l -> (b t ) l")
        x_flat_norm = torch.nn.functional.log_softmax(x_flat, dim=-1)
        x_flat_norm_p = torch.exp(x_flat_norm)
        # nansum: -inf log-probs times zero probability produce NaN terms.
        entropy = -(x_flat_norm * x_flat_norm_p).nansum(-1, keepdim=True)

        # Rank candidates by how far their surprisal is from the entropy.
        c_flat_shifted = torch.abs((-x_flat_norm) - entropy)
        c_flat_sorted, x_flat_indices = torch.sort(c_flat_shifted, descending=False)
        x_flat_cumsum = (
            x_flat.gather(-1, x_flat_indices).softmax(dim=-1).cumsum(dim=-1)
        )

        # Index of the last candidate needed to reach `typical_mass`.
        # Clamp it: if the mass is never reached the count equals the
        # vocabulary size, which would be an out-of-range gather index.
        last_ind = (x_flat_cumsum < typical_mass).sum(dim=-1)
        last_ind = last_ind.clamp(max=x_flat.shape[-1] - 1)
        sorted_indices_to_remove = c_flat_sorted > c_flat_sorted.gather(
            1, last_ind.view(-1, 1)
        )
        if typical_min_tokens > 1:
            sorted_indices_to_remove[..., :typical_min_tokens] = 0
        # Map the removal mask from sorted order back to vocabulary order.
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, x_flat_indices, sorted_indices_to_remove
        )
        x_flat = x_flat.masked_fill(indices_to_remove, -float("Inf"))
        logits = rearrange(x_flat, "(b t) l -> b t l", t=nt)

    if sample == "multinomial":
        probs = torch.softmax(logits, dim=-1)
        inferred = torch.stack([pr.multinomial(1).squeeze(-1) for pr in probs])
    elif sample == "argmax":
        # The previous code read `probs`, which is only bound in the
        # multinomial branch (NameError). Softmax is monotonic, so the
        # argmax of the logits is the intended result.
        inferred = logits.argmax(dim=-1)
    elif sample == "gumbel":
        inferred = gumbel_sample(logits, dim=-1)
    else:
        raise ValueError(f"invalid sampling method: {sample}")

    return inferred
|
vampnet/modules/modules.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from typing import Optional
|
3 |
+
from typing import Tuple
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn as nn
|
7 |
+
import torch.nn.functional as F
|
8 |
+
from einops import rearrange
|
9 |
+
from torch.nn.utils import weight_norm
|
10 |
+
|
11 |
+
|
12 |
+
def num_params(model):
    """Return the total element count of ``model``'s trainable parameters.

    Parameters with ``requires_grad=False`` are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
|
14 |
+
|
15 |
+
|
16 |
+
def recurse_children(module, fn):
    """Generator over the direct children of ``module``.

    For every direct child it yields, in order:

    * one generator per entry if the child is an ``nn.ModuleList``,
    * one generator per value if the child is an ``nn.ModuleDict``,
    * a generator for the child itself,
    * ``fn(child)``.

    NOTE(review): the recursive calls are yielded as generator *objects*
    (plain ``yield``, not ``yield from``), so consumers receive un-iterated
    generators rather than the results of ``fn`` on deeper descendants.
    Callers that test the yielded values for truthiness see those
    generators as truthy. Confirm whether that is intended before changing.
    """
    for sub in module.children():
        if isinstance(sub, nn.ModuleList):
            yield from (recurse_children(item, fn) for item in sub)
        if isinstance(sub, nn.ModuleDict):
            yield from (recurse_children(item, fn) for item in sub.values())

        yield recurse_children(sub, fn)
        yield fn(sub)
|
27 |
+
|
28 |
+
|
29 |
+
# Snake activation: x + sin(alpha * x)^2 / (alpha + 1e-9).
# The input is viewed as (dim0, dim1, -1) for the computation and restored
# to its original shape afterwards; the 1e-9 keeps the reciprocal finite
# when alpha is zero.
# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    original_shape = x.shape
    flat = x.reshape(original_shape[0], original_shape[1], -1)
    inv_alpha = (alpha + 1e-9).reciprocal()
    flat = flat + inv_alpha * torch.sin(alpha * flat).pow(2)
    return flat.reshape(original_shape)
|
37 |
+
|
38 |
+
|
39 |
+
class Snake1d(nn.Module):
    """Module wrapper around :func:`snake` with a learnable ``alpha``.

    ``alpha`` is a parameter of shape ``(1, channels, 1)`` initialised to
    ones.
    """

    def __init__(self, channels):
        super().__init__()
        initial_alpha = torch.ones(1, channels, 1)
        self.alpha = nn.Parameter(initial_alpha)

    def forward(self, x):
        """Apply the snake activation to ``x`` using the learned ``alpha``."""
        activated = snake(x, self.alpha)
        return activated
|
46 |
+
|
47 |
+
|
48 |
+
def WNConv1d(*args, **kwargs):
    """Build an ``nn.Conv1d`` from the given arguments and wrap it in
    weight normalization."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
|
50 |
+
|
51 |
+
|
52 |
+
def WNConvTranspose1d(*args, **kwargs):
    """Build an ``nn.ConvTranspose1d`` from the given arguments and wrap it
    in weight normalization."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)
|
54 |
+
|
55 |
+
|
56 |
+
class SequentialWithFiLM(nn.Module):
    """
    handy wrapper for nn.Sequential that allows FiLM layers to be
    inserted in between other layers.

    Layers are applied in order. A layer that is, or contains, a ``FiLM``
    module is called as ``layer(x, cond)``; every other layer is called as
    ``layer(x)``.
    """

    def __init__(self, *layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    @staticmethod
    def has_film(module):
        """Return True if ``module`` is a ``FiLM`` or has one as a descendant.

        ``nn.Module.modules()`` walks the module itself and all of its
        descendants, so this is an exact check. The previous version tested
        the truthiness of nested generator objects, which reported True for
        any module with children and False for a parameter-less ``FiLM``.
        """
        return any(isinstance(m, FiLM) for m in module.modules())

    def forward(self, x, cond):
        """Run ``x`` through all layers, passing ``cond`` to FiLM-bearing ones."""
        for layer in self.layers:
            if self.has_film(layer):
                x = layer(x, cond)
            else:
                x = layer(x)
        return x
|
80 |
+
|
81 |
+
|
82 |
+
class FiLM(nn.Module):
    """Feature-wise linear modulation: ``x * (gamma(r) + 1) + beta(r)``.

    ``beta`` and ``gamma`` are linear maps from ``input_dim`` to
    ``output_dim``; they are only created when ``input_dim > 0``. With
    ``input_dim == 0`` the module is an identity on ``x``.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()

        self.input_dim, self.output_dim = input_dim, output_dim

        if input_dim > 0:
            self.beta = nn.Linear(input_dim, output_dim)
            self.gamma = nn.Linear(input_dim, output_dim)

    def forward(self, x, r):
        """Modulate ``x`` with the conditioning ``r``.

        The projected scale and shift are viewed as
        ``(x.size(0), output_dim, 1)`` so they broadcast over the last axis
        of ``x``.
        """
        if self.input_dim == 0:
            return x

        batch = x.size(0)
        shift = self.beta(r).view(batch, self.output_dim, 1)
        scale = self.gamma(r).view(batch, self.output_dim, 1)
        return x * (scale + 1) + shift
|
104 |
+
|
105 |
+
|
106 |
+
class CodebookEmbedding(nn.Module):
    """Embeds multi-codebook token grids via a VQ-VAE's codebook tables.

    Tokens are looked up in the (external) quantizer codebooks, optionally
    extended with learned special-token vectors, concatenated across
    codebooks and projected to ``emb_dim`` with a 1x1 convolution.

    Parameters
    ----------
    vocab_size : int
        Number of regular tokens per codebook; special tokens are assigned
        the indices ``vocab_size, vocab_size + 1, ...``.
    latent_dim : int
        Width of the learned special-token vectors (one per codebook).
    n_codebooks : int
        Default number of codebooks.
    emb_dim : int
        Output channels of the projection.
    special_tokens : sequence of str, optional
        Names of extra learned tokens. When empty or ``None`` the
        ``special`` / ``special_idxs`` attributes are not created.
    """

    def __init__(
        self,
        vocab_size: int,
        latent_dim: int,
        n_codebooks: int,
        emb_dim: int,
        special_tokens: Optional[Tuple[str, ...]] = None,
    ):
        super().__init__()
        self.n_codebooks = n_codebooks
        self.emb_dim = emb_dim
        self.latent_dim = latent_dim
        self.vocab_size = vocab_size

        # Build the special-token tables once. The previous code wrapped
        # this in a redundant `for tkn in special_tokens` loop that rebuilt
        # the whole dict on every iteration.
        if special_tokens:
            self.special = nn.ParameterDict(
                {
                    tkn: nn.Parameter(torch.randn(n_codebooks, self.latent_dim))
                    for tkn in special_tokens
                }
            )
            self.special_idxs = {
                tkn: i + vocab_size for i, tkn in enumerate(special_tokens)
            }

        self.out_proj = nn.Conv1d(n_codebooks * self.latent_dim, self.emb_dim, 1)

    def from_codes(self, codes: torch.Tensor, vqvae):
        """Look up latent vectors for ``codes``.

        ``codes`` is indexed as ``codes[:, i, :]`` per codebook ``i``; the
        lookup table for codebook ``i`` is
        ``vqvae.quantizer.quantizers[i].codebook.weight`` with the special
        token rows (if any) appended after it. The per-codebook results are
        transposed to channel-first and concatenated along dim 1.
        """
        n_codebooks = codes.shape[1]
        latent = []
        for i in range(n_codebooks):
            c = codes[:, i, :]

            lookup_table = vqvae.quantizer.quantizers[i].codebook.weight
            if hasattr(self, "special"):
                # Row i of every special token belongs to codebook i.
                special_lookup = torch.cat(
                    [self.special[tkn][i : i + 1] for tkn in self.special], dim=0
                )
                lookup_table = torch.cat([lookup_table, special_lookup], dim=0)

            codebook_latent = F.embedding(c, lookup_table).transpose(1, 2)
            latent.append(codebook_latent)

        latent = torch.cat(latent, dim=1)
        return latent

    def forward(self, latents: torch.Tensor):
        """Project concatenated codebook latents to ``emb_dim`` channels."""
        x = self.out_proj(latents)
        return x

    def flatten(self, tokens: torch.Tensor, n_codebooks: int = None):
        """Interleave ``(b, c, t)`` tokens into a ``(b, t * c)`` sequence."""
        n_c = n_codebooks if n_codebooks is not None else self.n_codebooks
        return rearrange(tokens, "b c t -> b (t c)", c=n_c)

    def unflatten(self, flat_tokens: torch.Tensor, n_codebooks: int = None):
        """Inverse of :meth:`flatten`: ``(b, t * c)`` back to ``(b, c, t)``."""
        n_c = n_codebooks if n_codebooks is not None else self.n_codebooks
        tokens = rearrange(flat_tokens, "b (t c) -> b c t", c=n_c)

        return tokens
|
vampnet/modules/transformer.py
ADDED
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from einops import rearrange
|
8 |
+
|
9 |
+
from .base import VampBase
|
10 |
+
from .modules import CodebookEmbedding
|
11 |
+
from .modules import FiLM
|
12 |
+
from .modules import SequentialWithFiLM
|
13 |
+
from .modules import WNConv1d
|
14 |
+
|
15 |
+
|
16 |
+
class RMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-feature gain.

    There is no mean subtraction and no bias: the input is divided by the
    root of its mean square over the last axis and scaled by ``weight``.
    """

    def __init__(self, hidden_size: int, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # Added to the mean square before the reciprocal square root.
        self.var_eps = eps

    def forward(self, x):
        """Returns root mean square normalized version of input `x`

        T5 uses a layer norm which only scales and doesn't shift, also known
        as Root Mean Square Layer Normalization
        (https://arxiv.org/abs/1910.07467); the variance is therefore
        calculated without the mean and there is no bias.

        Parameters
        ----------
        x : Tensor[B x T x D]

        Returns
        -------
        Tensor[B x T x D]
        """
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.var_eps)
        normalised = x * inv_rms

        return self.weight * normalised
|
38 |
+
|
39 |
+
|
40 |
+
def get_activation(name: str = "relu"):
    """Return the activation *class* (not an instance) registered as ``name``.

    Supported names: ``"relu"``, ``"gelu"``, ``"geglu"``, ``"snake"``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported names.
    """
    # Guard-clause form keeps each class lookup lazy: only the requested
    # name is resolved.
    if name == "relu":
        return nn.ReLU
    if name == "gelu":
        return NewGELU
    if name == "geglu":
        return GatedGELU
    if name == "snake":
        return Snake1d
    raise ValueError(f"Unrecognized activation {name}")
|
51 |
+
|
52 |
+
|
53 |
+
class NewGELU(nn.Module):
    """
    Tanh approximation of the GELU activation, as found in the Google BERT
    repo (identical to OpenAI GPT). See the Gaussian Error Linear Units
    paper: https://arxiv.org/abs/1606.08415
    """

    def forward(self, x):
        """Compute ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
        cubic = x + 0.044715 * torch.pow(x, 3.0)
        gate = 1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * cubic)
        return 0.5 * x * gate
|
71 |
+
|
72 |
+
|
73 |
+
class GatedGELU(nn.Module):
    """Gated GELU: split the input in two halves and gate one with the other.

    The output has half the size of the input along the split axis.
    """

    def __init__(self):
        super().__init__()
        self.gelu = NewGELU()

    def forward(self, x, dim: int = -1):
        """Return ``first_half * gelu(second_half)``, splitting along ``dim``."""
        halves = x.chunk(2, dim=dim)
        value, gate = halves
        return value * self.gelu(gate)
|
81 |
+
|
82 |
+
|
83 |
+
class Snake1d(nn.Module):
    """Snake activation ``x + sin(alpha * x)^2 / (alpha + 1e-9)``.

    ``alpha`` is a learnable vector of length ``channels`` (initialised to
    ones) that broadcasts against the trailing axis of the input.
    """

    def __init__(self, channels):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(channels))

    def forward(self, x):
        """Apply the activation element-wise to ``x``."""
        # 1e-9 keeps the reciprocal finite when alpha is zero
        inv_alpha = (self.alpha + 1e-9).reciprocal()
        periodic = torch.sin(self.alpha * x).pow(2)
        return x + inv_alpha * periodic
|
90 |
+
|
91 |
+
|
92 |
+
class FeedForward(nn.Module):
    """Position-wise feed-forward block: linear -> activation -> dropout -> linear.

    Parameters
    ----------
    d_model : int
        Input and output width. The hidden width is ``4 * d_model``.
    dropout : float
        Dropout probability applied after the activation.
    activation : str
        Name understood by ``get_activation``. ``"geglu"`` halves the hidden
        width seen by the second projection because the gated activation
        consumes half of its input as the gate.
    """

    def __init__(
        self, d_model: int = 512, dropout: float = 0.1, activation: str = "geglu"
    ):
        super().__init__()
        hidden = d_model * 4
        factor = 2 if activation == "geglu" else 1
        self.w_1 = nn.Linear(d_model, hidden, bias=False)
        self.w_2 = nn.Linear(hidden // factor, d_model, bias=False)
        self.drop = nn.Dropout(dropout)

        act_cls = get_activation(activation)
        # Snake1d is the only activation whose constructor needs the channel
        # count; calling it without arguments raised a TypeError before.
        self.act = act_cls(hidden) if activation == "snake" else act_cls()

    def forward(self, x):
        """Computes position-wise feed-forward layer

        Parameters
        ----------
        x : Tensor[B x T x D]

        Returns
        -------
        Tensor[B x T x D]
        """
        x = self.w_1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.w_2(x)
        return x
|
117 |
+
|
118 |
+
|
119 |
+
class MultiHeadRelativeAttention(nn.Module):
    """Multi-head attention with an optional bucketed relative position bias.

    The bias is a learned ``nn.Embedding(attention_num_buckets, n_head)``
    indexed by a bucketed relative position between query and key. A
    precomputed bias can also be passed in to ``forward``, in which case
    none is computed here.
    """

    def __init__(
        self,
        n_head: int = 8,
        d_model: int = 512,
        dropout: float = 0.1,
        bidirectional: bool = True,
        has_relative_attention_bias: bool = True,
        attention_num_buckets: int = 32,
        attention_max_distance: int = 128,
    ):
        super().__init__()
        self.n_head = n_head
        self.d_head = d_model // n_head
        self.bidirectional = bidirectional
        self.has_relative_attention_bias = has_relative_attention_bias
        self.attention_num_buckets = attention_num_buckets
        self.attention_max_distance = attention_max_distance

        # Bias-free linear projections for queries, keys and values
        self.w_qs = nn.Linear(d_model, d_model, bias=False)
        self.w_ks = nn.Linear(d_model, d_model, bias=False)
        self.w_vs = nn.Linear(d_model, d_model, bias=False)

        # Final output projection
        self.fc = nn.Linear(d_model, d_model, bias=False)

        # Dropout applied to the attention weights
        self.dropout = nn.Dropout(dropout)

        # Learned relative position bias table (only if enabled)
        if has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(attention_num_buckets, n_head)

    def _relative_position_bucket(self, relative_position):
        """Converts unbounded relative position into bounded set of buckets
        with half "exact" buckets (1 position = 1 bucket) and half "log-spaced"
        buckets

        Parameters
        ----------
        relative_position : Tensor[T_q x T_kv]
            Relative positions between queries and key_value items

        Returns
        -------
        Tensor[T_q x T_kv]
            Input relative positions converted into buckets
        """
        n_buckets = self.attention_num_buckets
        max_dist = self.attention_max_distance

        # Map (-inf, inf) onto [0, inf): negative = past, positive = future.
        if self.bidirectional:
            # Half of the buckets per direction; future positions are
            # shifted into the upper half.
            n_buckets //= 2
            offset = (relative_position > 0).to(torch.long) * n_buckets
            distance = torch.abs(relative_position)
        else:
            # Unidirectional: future positions collapse to distance 0.
            offset = 0
            distance = -torch.min(
                relative_position, torch.zeros_like(relative_position)
            )

        # First half of the buckets: one bucket per exact distance.
        exact_limit = n_buckets // 2

        # Second half: logarithmically growing bins up to `max_dist`.
        log_bucket = exact_limit + (
            torch.log(distance.float() / exact_limit)
            / math.log(max_dist / exact_limit)
            * (n_buckets - exact_limit)
        ).to(torch.long)

        # Everything beyond `max_dist` lands in the last bucket.
        log_bucket = log_bucket.clamp(max=n_buckets - 1)

        return offset + torch.where(distance < exact_limit, distance, log_bucket)

    def compute_bias(self, query_length, key_length):
        """Computes a position bias scalar for each index in query_length x key_length

        Parameters
        ----------
        query_length : int
        key_length : int

        Returns
        -------
        Tensor[heads x 1 x T_q x T_kv]
            Position bias to be applied on attention logits
        """
        rows = torch.arange(query_length, dtype=torch.long)[:, None]
        cols = torch.arange(key_length, dtype=torch.long)[None, :]

        # Bucket the key-minus-query offsets, then move the indices to the
        # device that holds the bias table.
        buckets = self._relative_position_bucket(cols - rows)
        buckets = buckets.to(self.relative_attention_bias.weight.device)

        bias = self.relative_attention_bias(buckets)
        return rearrange(bias, "q k h -> h 1 q k")

    def forward(self, q, k, v, mask=None, position_bias=None):
        """Computes attention over (keys, values) for every timestep in query

        Parameters
        ----------
        q : Tensor[B x T_q x d_model]
            Query vectors
        k : Tensor[B x T_kv x d_model]
            Key vectors to compute attention over
        v : Tensor[B x T_kv x d_model]
            Value vectors corresponding to the keys
        mask : Tensor[B x T_q x T_kv], optional
        position_bias: Tensor[head x 1 x T_q x T_kv]

        Returns
        -------
        Tensor[B x T_q x d_model]
            Outputs after attending (key, value) using queries
        Tensor
            The position bias that was added to the scores (the one passed
            in, a freshly computed one, or zeros)
        """
        heads = self.n_head

        # Project and split into heads (head axis first)
        q = rearrange(self.w_qs(q), "b l (head k) -> head b l k", head=heads)
        k = rearrange(self.w_ks(k), "b t (head k) -> head b t k", head=heads)
        v = rearrange(self.w_vs(v), "b t (head k) -> head b t k", head=heads)

        # Scaled dot-product scores
        scores = torch.einsum("hblk,hbtk->hblt", [q, k]) / np.sqrt(q.shape[-1])

        # Reuse the supplied bias, else build one (or zeros if disabled)
        if position_bias is None:
            if self.has_relative_attention_bias:
                position_bias = self.compute_bias(q.size(-2), k.size(-2))
            else:
                position_bias = torch.zeros_like(scores)
        scores += position_bias

        # Masked-out locations get a large negative score before softmax
        if mask is not None:
            scores = scores.masked_fill(mask[None] == 0, -1e9)

        weights = self.dropout(torch.softmax(scores, dim=3))

        # Weighted sum of values, heads merged back into the feature axis
        context = torch.einsum("hblt,hbtv->hblv", [weights, v])
        output = self.fc(rearrange(context, "head b l v -> b l (head v)"))

        return output, position_bias
|
289 |
+
|
290 |
+
|
291 |
+
class TransformerLayer(nn.Module):
    """One pre-norm transformer block.

    Sub-layers, in order: self-attention, cross-attention (only when
    ``is_decoder``), feed-forward. The input of every sub-layer is
    RMS-normalised and FiLM-modulated by ``cond``; its output goes through
    dropout and is added back to the residual stream.
    """

    def __init__(
        self,
        d_model: int = 512,
        d_cond: int = 64,
        n_heads: int = 8,
        bidirectional: bool = True,
        is_decoder: bool = False,
        has_relative_attention_bias: bool = False,
        flash_attn: bool = False,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.is_decoder = is_decoder

        # Self-attention sub-layer
        self.norm_1 = RMSNorm(d_model)
        self.film_1 = FiLM(d_cond, d_model)
        self.flash_attn = flash_attn

        if not flash_attn:
            self.self_attn = MultiHeadRelativeAttention(
                n_heads, d_model, dropout, bidirectional, has_relative_attention_bias
            )
        else:
            # Imported here so the package is only needed when requested
            from flash_attn.flash_attention import FlashMHA

            self.self_attn = FlashMHA(
                embed_dim=d_model,
                num_heads=n_heads,
                attention_dropout=dropout,
                causal=False,
            )

        # Cross-attention sub-layer (decoder only)
        if is_decoder:
            self.norm_2 = RMSNorm(d_model)
            self.film_2 = FiLM(d_cond, d_model)
            self.cross_attn = MultiHeadRelativeAttention(
                n_heads,
                d_model,
                dropout,
                bidirectional=True,
                has_relative_attention_bias=False,
            )

        # Feed-forward sub-layer
        self.norm_3 = RMSNorm(d_model)
        self.film_3 = FiLM(d_cond, d_model)
        self.feed_forward = FeedForward(d_model=d_model, dropout=dropout)

        # Dropout shared by all residual branches
        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x,
        x_mask,
        cond,
        src=None,
        src_mask=None,
        position_bias=None,
        encoder_decoder_position_bias=None,
    ):
        """Computes one transformer layer consisting of self attention, (op) cross attention
        and feedforward layer

        Parameters
        ----------
        x : Tensor[B x T_q x D]
        x_mask : Tensor[B x T_q]
        src : Tensor[B x T_kv x D], optional
        src_mask : Tensor[B x T_kv x D], optional
        position_bias : Tensor[heads x B x T_q x T_q], optional
            Relative position bias for self attention layer
        encoder_decoder_position_bias : Tensor[heads x B x T_q x T_kv], optional
            Relative position bias for cross attention layer

        Returns
        -------
        Tensor[B x T_q x D]
            followed by the self-attention and cross-attention position
            biases, so the caller can hand them to the next layer
        """

        def modulate(norm, film, hidden):
            # FiLM is applied channel-first, so the last two axes are
            # swapped around the call and swapped back afterwards.
            return film(norm(hidden).permute(0, 2, 1), cond).permute(0, 2, 1)

        # --- self attention ---
        y = modulate(self.norm_1, self.film_1, x)
        if self.flash_attn:
            # The flash path takes neither the mask nor a position bias.
            with torch.autocast(y.device.type, dtype=torch.bfloat16):
                y = self.self_attn(y)[0]
        else:
            y, position_bias = self.self_attn(y, y, y, x_mask, position_bias)
        x = x + self.dropout(y)

        # --- cross attention (decoder only) ---
        if self.is_decoder:
            y = modulate(self.norm_2, self.film_2, x)
            y, encoder_decoder_position_bias = self.cross_attn(
                y, src, src, src_mask, encoder_decoder_position_bias
            )
            x = x + self.dropout(y)

        # --- feed forward ---
        y = modulate(self.norm_3, self.film_3, x)
        y = self.feed_forward(y)
        x = x + self.dropout(y)

        return x, position_bias, encoder_decoder_position_bias
|
401 |
+
|
402 |
+
|
403 |
+
class TransformerStack(nn.Module):
    """A stack of ``TransformerLayer`` blocks with an optional final RMSNorm.

    Only the first layer is created with ``has_relative_attention_bias``;
    the position bias it returns is threaded through the remaining layers.
    """

    def __init__(
        self,
        d_model: int = 512,
        d_cond: int = 64,
        n_heads: int = 8,
        n_layers: int = 8,
        last_layer: bool = True,
        bidirectional: bool = True,
        flash_attn: bool = False,
        is_decoder: bool = False,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.bidirectional = bidirectional
        self.is_decoder = is_decoder

        # In T5, relative attention bias is shared by all layers in the
        # stack, so only layer 0 owns the bias table.
        blocks = []
        for idx in range(n_layers):
            blocks.append(
                TransformerLayer(
                    d_model,
                    d_cond,
                    n_heads,
                    bidirectional,
                    is_decoder,
                    has_relative_attention_bias=(idx == 0),
                    flash_attn=flash_attn,
                    dropout=dropout,
                )
            )
        self.layers = nn.ModuleList(blocks)

        # Optional normalisation of the stack output
        self.norm = RMSNorm(d_model) if last_layer else None

    def subsequent_mask(self, size):
        """Return a ``(1, size, size)`` boolean lower-triangular mask."""
        return torch.tril(torch.ones(1, size, size)).bool()

    def forward(self, x, x_mask, cond=None, src=None, src_mask=None):
        """Computes a full transformer stack

        Parameters
        ----------
        x : Tensor[B x T_q x D]
        x_mask : Tensor[B x T_q]
        src : Tensor[B x T_kv x D], optional
        src_mask : Tensor[B x T_kv], optional

        Returns
        -------
        Tensor[B x T_q x D]
        """
        # Cross-attention mask: outer product of query and source masks,
        # giving (B x T_q x T_kv)
        if self.is_decoder:
            src_mask = x_mask.unsqueeze(-1) * src_mask.unsqueeze(-2)

        # Self-attention mask: (B x 1 x T_q), broadcast over the query axis
        x_mask = x_mask.unsqueeze(-2)
        if not self.bidirectional:
            causal = self.subsequent_mask(x.size(1)).to(x_mask.device)
            x_mask = x_mask * causal

        # Biases start empty and are filled in by the layers
        position_bias = encoder_decoder_position_bias = None

        for block in self.layers:
            x, position_bias, encoder_decoder_position_bias = block(
                x=x,
                x_mask=x_mask,
                cond=cond,
                src=src,
                src_mask=src_mask,
                position_bias=position_bias,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
            )

        if self.norm is None:
            return x
        return self.norm(x)
|
484 |
+
|
485 |
+
|
486 |
+
class VampNet(VampBase):
    """Bidirectional transformer that predicts codebook tokens from latents.

    The model embeds (pre-looked-up) codebook latents, runs them through a
    non-causal ``TransformerStack`` conditioned on an embedding of ``cond``,
    and classifies every position into ``vocab_size`` classes for each of
    the ``n_codebooks - n_conditioning_codebooks`` predicted codebooks.
    """

    def __init__(
        self,
        n_heads: int = 20,
        n_layers: int = 16,
        r_cond_dim: int = 64,
        n_codebooks: int = 9,
        n_conditioning_codebooks: int = 0,
        latent_dim: int = 8,
        embedding_dim: int = 1280,
        vocab_size: int = 1024,
        flash_attn: bool = True,
        noise_mode: str = "mask",
        seq_len: int = 313,
    ):
        super().__init__()
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.r_cond_dim = r_cond_dim
        self.n_codebooks = n_codebooks
        self.n_conditioning_codebooks = n_conditioning_codebooks
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.latent_dim = latent_dim
        self.flash_attn = flash_attn
        self.noise_mode = noise_mode
        self.seq_len = seq_len

        # Only the "mask" noise mode needs a learned special token
        if noise_mode not in ("mask", "random"):
            raise ValueError(f"Unknown noise mode: {noise_mode}")
        uses_mask_token = noise_mode == "mask"
        special_tokens = ["MASK"] if uses_mask_token else None

        self.embedding = CodebookEmbedding(
            latent_dim=latent_dim,
            n_codebooks=n_codebooks,
            vocab_size=vocab_size,
            emb_dim=embedding_dim,
            special_tokens=special_tokens,
        )

        if uses_mask_token:
            self.mask_token = self.embedding.special_idxs["MASK"]

        self.transformer = TransformerStack(
            d_model=embedding_dim,
            d_cond=r_cond_dim,
            n_heads=n_heads,
            n_layers=n_layers,
            last_layer=True,
            bidirectional=True,
            flash_attn=flash_attn,
            is_decoder=False,
            dropout=0.1,
        )

        # Final 1x1 conv producing one vocab-sized logit group per
        # predicted codebook
        self.n_predict_codebooks = n_codebooks - n_conditioning_codebooks
        self.classifier = SequentialWithFiLM(
            WNConv1d(
                embedding_dim,
                vocab_size * self.n_predict_codebooks,
                kernel_size=1,
                padding="same",
                # groups=self.n_predict_codebooks,
            ),
        )

    def forward(self, x, cond):
        """Return classification logits for every predicted token.

        ``x`` is passed to the embedding projection and ``cond`` to
        ``self.r_embed`` (NOTE(review): ``r_embed`` is not defined in this
        class; presumably it comes from ``VampBase`` - confirm). The output
        is arranged as ``b p (t c)`` with ``c = n_predict_codebooks``.
        """
        x = self.embedding(x)

        # All-true mask with one entry per (batch, position)
        x_mask = torch.ones_like(x, dtype=torch.bool)[:, :1, :].squeeze(1)

        cond = self.r_embed(cond)

        # The transformer works sequence-first; the classifier channel-first
        hidden = rearrange(x, "b d n -> b n d")
        hidden = self.transformer(x=hidden, x_mask=x_mask, cond=cond)
        hidden = rearrange(hidden, "b n d -> b d n")

        logits = self.classifier(hidden, cond)

        return rearrange(
            logits, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks
        )
|
571 |
+
|
572 |
+
|
573 |
+
if __name__ == "__main__":
    # Smoke test: build a default VampNet, push random latents through it
    # and report the parameter count and prediction shape.
    # Because of the relative import below this must be run as a module
    # (``python -m ...``), not as a plain script.

    # `argbind` is used throughout this block; its import had been
    # commented out, which made the script fail with a NameError.
    import argbind

    from .modules import num_params

    VampNet = argbind.bind(VampNet)

    @argbind.bind(without_prefix=True)
    def try_model(device: str = "cuda", batch_size: int = 2, seq_len_s: float = 10.0):
        """Instantiate the model on ``device`` and run one forward pass."""
        # Number of token frames for `seq_len_s` seconds
        seq_len = int(32000 / 512 * seq_len_s)

        model = VampNet().to(device)

        z = torch.randint(
            0, model.vocab_size, size=(batch_size, model.n_codebooks, seq_len)
        ).to(device)

        r = torch.zeros(batch_size).to(device)
        z_mask, mask = model.add_noise(z, r)

        # The forward pass consumes latents, so feed random ones of the
        # right shape rather than the noised tokens.
        z_mask_latent = torch.rand(
            batch_size, model.latent_dim * model.n_codebooks, seq_len
        ).to(device)
        z_hat = model(z_mask_latent, r)

        pred = z_hat.argmax(dim=1)
        pred = model.embedding.unflatten(pred, n_codebooks=model.n_predict_codebooks)

        print(f"model has {num_params(model)/1e6:<.3f}M parameters")
        print(f"prediction has shape {pred.shape}")
        # Drop into the debugger for interactive inspection
        breakpoint()

    args = argbind.parse_args()
    with argbind.scope(args):
        try_model()
|
vampnet/scheduler.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
import torch
|
5 |
+
|
6 |
+
class NoamScheduler:
    """OG scheduler from transformer paper: https://arxiv.org/pdf/1706.03762.pdf
    Implementation from Annotated Transformer: https://nlp.seas.harvard.edu/2018/04/03/attention.html

    The learning rate after ``steps`` calls to :meth:`step` is
    ``factor * d_model ** -0.5 * min(steps ** -0.5, steps * warmup ** -1.5)``
    and is written into every param group of the wrapped optimizer.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        d_model: int = 512,
        factor: float = 1.0,
        warmup: int = 4000,
    ):
        # Hyper-parameters
        self.warmup = warmup
        self.factor = factor
        self.d_model = d_model

        # Mutable state: no learning rate until the first step
        self.lr = None
        self.steps = 0

        # Optimizer whose param groups are updated in `step`
        self.optimizer = optimizer

    def state_dict(self):
        """Return every attribute except the optimizer reference."""
        snapshot = dict(self.__dict__)
        snapshot.pop("optimizer", None)
        return snapshot

    def load_state_dict(self, state_dict):
        """Overwrite attributes with the entries of ``state_dict``."""
        self.__dict__.update(state_dict)

    def step(self):
        """Advance one step and push the new learning rate to the optimizer."""
        self.steps += 1
        decay = self.steps ** (-0.5)
        ramp = self.steps * self.warmup ** (-1.5)
        self.lr = self.factor * (self.d_model ** (-0.5) * min(decay, ramp))

        for group in self.optimizer.param_groups:
            group["lr"] = self.lr
|
47 |
+
|