muellerzr HF staff committed on
Commit
e8b88f6
·
1 Parent(s): da85501
Files changed (2) hide show
  1. index.html +5 -5
  2. llm_conf.html +0 -1337
index.html CHANGED
@@ -426,7 +426,7 @@
426
  <li>Backward ~= 2x the model size</li>
427
  <li>The optimizer step ~= 4x the model size (1x model, 1x gradients, 2x optimizer):</li>
428
  </ul>
429
- <div style="font-size: 50%;">
430
  <table>
431
  <thead>
432
  <tr class="header">
@@ -465,7 +465,7 @@
465
  <p>This works fine for small models, we have cards with anywhere from 12-24GB of GPU memory (on the GPU-poor side).</p>
466
  <p>But what happens as we scale?</p>
467
  <p>Here’s <code>llama-3-8B</code> (8.03B parameters)</p>
468
- <div style="font-size: 50%;">
469
  <table>
470
  <thead>
471
  <tr class="header">
@@ -698,7 +698,7 @@
698
  <li>Rely on <code>config.yaml</code> files</li>
699
  <li>Choose to either run <code>accelerate config</code> or write your own:</li>
700
  </ul>
701
- <div class="columns" style="font-size: 50%;padding-left:10%;">
702
  <div class="column" style="width:40%;">
703
  <div class="code-with-filename">
704
  <div class="code-with-filename-file">
@@ -804,7 +804,7 @@
804
  <ul>
805
  <li>Let’s tie that back up to the model estimator with neat tools like NVIDIA’s TransformerEngine</li>
806
  </ul>
807
- <div style="font-size: 60%;">
808
  <table style="width:100%;">
809
  <colgroup>
810
  <col style="width: 14%">
@@ -894,7 +894,7 @@
894
  <ul>
895
  <li>Extremely similar, however mostly used different naming conventions for items and slight tweaks in the implementation</li>
896
  </ul>
897
- <div style="font-size: 50%;">
898
  <table style="width:100%;">
899
  <colgroup>
900
  <col style="width: 16%">
 
426
  <li>Backward ~= 2x the model size</li>
427
  <li>The optimizer step ~= 4x the model size (1x model, 1x gradients, 2x optimizer):</li>
428
  </ul>
429
+ <div style="font-size: 50%;background-color: rgba(0,0,0,.1);">
430
  <table>
431
  <thead>
432
  <tr class="header">
 
465
  <p>This works fine for small models, we have cards with anywhere from 12-24GB of GPU memory (on the GPU-poor side).</p>
466
  <p>But what happens as we scale?</p>
467
  <p>Here’s <code>llama-3-8B</code> (8.03B parameters)</p>
468
+ <div style="font-size: 50%;background-color: rgba(0,0,0,.1);">
469
  <table>
470
  <thead>
471
  <tr class="header">
 
698
  <li>Rely on <code>config.yaml</code> files</li>
699
  <li>Choose to either run <code>accelerate config</code> or write your own:</li>
700
  </ul>
701
+ <div class="columns" style="font-size: 50%;padding-left:10%;background-color: rgba(0,0,0,.1);">
702
  <div class="column" style="width:40%;">
703
  <div class="code-with-filename">
704
  <div class="code-with-filename-file">
 
804
  <ul>
805
  <li>Let’s tie that back up to the model estimator with neat tools like NVIDIA’s TransformerEngine</li>
806
  </ul>
807
+ <div style="font-size: 60%;background-color: rgba(0,0,0,.1);">
808
  <table style="width:100%;">
809
  <colgroup>
810
  <col style="width: 14%">
 
894
  <ul>
895
  <li>Extremely similar, however mostly used different naming conventions for items and slight tweaks in the implementation</li>
896
  </ul>
897
+ <div style="font-size: 50%;background-color: rgba(0,0,0,.1);">
898
  <table style="width:100%;">
899
  <colgroup>
900
  <col style="width: 16%">
llm_conf.html DELETED
@@ -1,1337 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en"><head>
3
- <script src="llm_conf_files/libs/clipboard/clipboard.min.js"></script>
4
- <script src="llm_conf_files/libs/quarto-html/tabby.min.js"></script>
5
- <script src="llm_conf_files/libs/quarto-html/popper.min.js"></script>
6
- <script src="llm_conf_files/libs/quarto-html/tippy.umd.min.js"></script>
7
- <link href="llm_conf_files/libs/quarto-html/tippy.css" rel="stylesheet">
8
- <link href="llm_conf_files/libs/quarto-html/light-border.css" rel="stylesheet">
9
- <link href="llm_conf_files/libs/quarto-html/quarto-html.min.css" rel="stylesheet" data-mode="light">
10
- <link href="llm_conf_files/libs/quarto-html/quarto-syntax-highlighting-dark.css" rel="stylesheet" id="quarto-text-highlighting-styles"><meta charset="utf-8">
11
- <meta name="generator" content="quarto-99.9.9">
12
-
13
- <title>Scaling Model Training with More Compute, How Do They Do It?</title>
14
- <meta name="apple-mobile-web-app-capable" content="yes">
15
- <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
16
- <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
17
- <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/reset.css">
18
- <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/reveal.css">
19
- <style>
20
- code{white-space: pre-wrap;}
21
- span.smallcaps{font-variant: small-caps;}
22
- div.columns{display: flex; gap: min(4vw, 1.5em);}
23
- div.column{flex: auto; overflow-x: auto;}
24
- div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
25
- ul.task-list{list-style: none;}
26
- ul.task-list li input[type="checkbox"] {
27
- width: 0.8em;
28
- margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
29
- vertical-align: middle;
30
- }
31
- /* CSS for syntax highlighting */
32
- pre > code.sourceCode { white-space: pre; position: relative; }
33
- pre > code.sourceCode > span { line-height: 1.25; }
34
- pre > code.sourceCode > span:empty { height: 1.2em; }
35
- .sourceCode { overflow: visible; }
36
- code.sourceCode > span { color: inherit; text-decoration: inherit; }
37
- div.sourceCode { margin: 1em 0; }
38
- pre.sourceCode { margin: 0; }
39
- @media screen {
40
- div.sourceCode { overflow: auto; }
41
- }
42
- @media print {
43
- pre > code.sourceCode { white-space: pre-wrap; }
44
- pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
45
- }
46
- pre.numberSource code
47
- { counter-reset: source-line 0; }
48
- pre.numberSource code > span
49
- { position: relative; left: -4em; counter-increment: source-line; }
50
- pre.numberSource code > span > a:first-child::before
51
- { content: counter(source-line);
52
- position: relative; left: -1em; text-align: right; vertical-align: baseline;
53
- border: none; display: inline-block;
54
- -webkit-touch-callout: none; -webkit-user-select: none;
55
- -khtml-user-select: none; -moz-user-select: none;
56
- -ms-user-select: none; user-select: none;
57
- padding: 0 4px; width: 4em;
58
- }
59
- pre.numberSource { margin-left: 3em; padding-left: 4px; }
60
- div.sourceCode
61
- { color: #f8f8f2; }
62
- @media screen {
63
- pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
64
- }
65
- code span { color: #f8f8f2; } /* Normal */
66
- code span.al { color: #f07178; background-color: #2a0f15; font-weight: bold; } /* Alert */
67
- code span.an { color: #d4d0ab; } /* Annotation */
68
- code span.at { color: #00e0e0; } /* Attribute */
69
- code span.bn { color: #d4d0ab; } /* BaseN */
70
- code span.bu { color: #abe338; } /* BuiltIn */
71
- code span.cf { color: #ffa07a; font-weight: bold; } /* ControlFlow */
72
- code span.ch { color: #abe338; } /* Char */
73
- code span.cn { color: #ffd700; } /* Constant */
74
- code span.co { color: #f8f8f2; font-style: italic; } /* Comment */
75
- code span.cv { color: #ffd700; } /* CommentVar */
76
- code span.do { color: #f8f8f2; } /* Documentation */
77
- code span.dt { color: #ffa07a; } /* DataType */
78
- code span.dv { color: #d4d0ab; } /* DecVal */
79
- code span.er { color: #f07178; text-decoration: underline; } /* Error */
80
- code span.ex { color: #00e0e0; font-weight: bold; } /* Extension */
81
- code span.fl { color: #d4d0ab; } /* Float */
82
- code span.fu { color: #ffa07a; } /* Function */
83
- code span.im { color: #abe338; } /* Import */
84
- code span.in { color: #d4d0ab; } /* Information */
85
- code span.kw { color: #ffa07a; font-weight: bold; } /* Keyword */
86
- code span.op { color: #ffa07a; } /* Operator */
87
- code span.ot { color: #00e0e0; } /* Other */
88
- code span.pp { color: #dcc6e0; } /* Preprocessor */
89
- code span.re { color: #00e0e0; background-color: #f8f8f2; } /* RegionMarker */
90
- code span.sc { color: #abe338; } /* SpecialChar */
91
- code span.ss { color: #abe338; } /* SpecialString */
92
- code span.st { color: #abe338; } /* String */
93
- code span.va { color: #00e0e0; } /* Variable */
94
- code span.vs { color: #abe338; } /* VerbatimString */
95
- code span.wa { color: #dcc6e0; } /* Warning */
96
- </style>
97
- <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/theme/quarto.css">
98
- <link href="llm_conf_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.css" rel="stylesheet">
99
- <link href="llm_conf_files/libs/revealjs/plugin/reveal-menu/menu.css" rel="stylesheet">
100
- <link href="llm_conf_files/libs/revealjs/plugin/reveal-menu/quarto-menu.css" rel="stylesheet">
101
- <link href="llm_conf_files/libs/revealjs/plugin/quarto-support/footer.css" rel="stylesheet">
102
- <style type="text/css">
103
-
104
- .callout {
105
- margin-top: 1em;
106
- margin-bottom: 1em;
107
- border-radius: .25rem;
108
- }
109
-
110
- .callout.callout-style-simple {
111
- padding: 0em 0.5em;
112
- border-left: solid #acacac .3rem;
113
- border-right: solid 1px silver;
114
- border-top: solid 1px silver;
115
- border-bottom: solid 1px silver;
116
- display: flex;
117
- }
118
-
119
- .callout.callout-style-default {
120
- border-left: solid #acacac .3rem;
121
- border-right: solid 1px silver;
122
- border-top: solid 1px silver;
123
- border-bottom: solid 1px silver;
124
- }
125
-
126
- .callout .callout-body-container {
127
- flex-grow: 1;
128
- }
129
-
130
- .callout.callout-style-simple .callout-body {
131
- font-size: 1rem;
132
- font-weight: 400;
133
- }
134
-
135
- .callout.callout-style-default .callout-body {
136
- font-size: 0.9rem;
137
- font-weight: 400;
138
- }
139
-
140
- .callout.callout-titled.callout-style-simple .callout-body {
141
- margin-top: 0.2em;
142
- }
143
-
144
- .callout:not(.callout-titled) .callout-body {
145
- display: flex;
146
- }
147
-
148
- .callout:not(.no-icon).callout-titled.callout-style-simple .callout-content {
149
- padding-left: 1.6em;
150
- }
151
-
152
- .callout.callout-titled .callout-header {
153
- padding-top: 0.2em;
154
- margin-bottom: -0.2em;
155
- }
156
-
157
- .callout.callout-titled .callout-title p {
158
- margin-top: 0.5em;
159
- margin-bottom: 0.5em;
160
- }
161
-
162
- .callout.callout-titled.callout-style-simple .callout-content p {
163
- margin-top: 0;
164
- }
165
-
166
- .callout.callout-titled.callout-style-default .callout-content p {
167
- margin-top: 0.7em;
168
- }
169
-
170
- .callout.callout-style-simple div.callout-title {
171
- border-bottom: none;
172
- font-size: .9rem;
173
- font-weight: 600;
174
- opacity: 75%;
175
- }
176
-
177
- .callout.callout-style-default div.callout-title {
178
- border-bottom: none;
179
- font-weight: 600;
180
- opacity: 85%;
181
- font-size: 0.9rem;
182
- padding-left: 0.5em;
183
- padding-right: 0.5em;
184
- }
185
-
186
- .callout.callout-style-default div.callout-content {
187
- padding-left: 0.5em;
188
- padding-right: 0.5em;
189
- }
190
-
191
- .callout.callout-style-simple .callout-icon::before {
192
- height: 1rem;
193
- width: 1rem;
194
- display: inline-block;
195
- content: "";
196
- background-repeat: no-repeat;
197
- background-size: 1rem 1rem;
198
- }
199
-
200
- .callout.callout-style-default .callout-icon::before {
201
- height: 0.9rem;
202
- width: 0.9rem;
203
- display: inline-block;
204
- content: "";
205
- background-repeat: no-repeat;
206
- background-size: 0.9rem 0.9rem;
207
- }
208
-
209
- .callout-title {
210
- display: flex
211
- }
212
-
213
- .callout-icon::before {
214
- margin-top: 1rem;
215
- padding-right: .5rem;
216
- }
217
-
218
- .callout.no-icon::before {
219
- display: none !important;
220
- }
221
-
222
- .callout.callout-titled .callout-body > .callout-content > :last-child {
223
- padding-bottom: 0.5rem;
224
- margin-bottom: 0;
225
- }
226
-
227
- .callout.callout-titled .callout-icon::before {
228
- margin-top: .5rem;
229
- padding-right: .5rem;
230
- }
231
-
232
- .callout:not(.callout-titled) .callout-icon::before {
233
- margin-top: 1rem;
234
- padding-right: .5rem;
235
- }
236
-
237
- /* Callout Types */
238
-
239
- div.callout-note {
240
- border-left-color: #4582ec !important;
241
- }
242
-
243
- div.callout-note .callout-icon::before {
244
- background-image: url('');
245
- }
246
-
247
- div.callout-note.callout-style-default .callout-title {
248
- background-color: #dae6fb
249
- }
250
-
251
- div.callout-important {
252
- border-left-color: #d9534f !important;
253
- }
254
-
255
- div.callout-important .callout-icon::before {
256
- background-image: url('');
257
- }
258
-
259
- div.callout-important.callout-style-default .callout-title {
260
- background-color: #f7dddc
261
- }
262
-
263
- div.callout-warning {
264
- border-left-color: #f0ad4e !important;
265
- }
266
-
267
- div.callout-warning .callout-icon::before {
268
- background-image: url('');
269
- }
270
-
271
- div.callout-warning.callout-style-default .callout-title {
272
- background-color: #fcefdc
273
- }
274
-
275
- div.callout-tip {
276
- border-left-color: #02b875 !important;
277
- }
278
-
279
- div.callout-tip .callout-icon::before {
280
- background-image: url('');
281
- }
282
-
283
- div.callout-tip.callout-style-default .callout-title {
284
- background-color: #ccf1e3
285
- }
286
-
287
- div.callout-caution {
288
- border-left-color: #fd7e14 !important;
289
- }
290
-
291
- div.callout-caution .callout-icon::before {
292
- background-image: url('');
293
- }
294
-
295
- div.callout-caution.callout-style-default .callout-title {
296
- background-color: #ffe5d0
297
- }
298
-
299
- </style>
300
- <style type="text/css">
301
- .reveal div.sourceCode {
302
- margin: 0;
303
- overflow: auto;
304
- }
305
- .reveal div.hanging-indent {
306
- margin-left: 1em;
307
- text-indent: -1em;
308
- }
309
- .reveal .slide:not(.center) {
310
- height: 100%;
311
- }
312
- .reveal .slide.scrollable {
313
- overflow-y: auto;
314
- }
315
- .reveal .footnotes {
316
- height: 100%;
317
- overflow-y: auto;
318
- }
319
- .reveal .slide .absolute {
320
- position: absolute;
321
- display: block;
322
- }
323
- .reveal .footnotes ol {
324
- counter-reset: ol;
325
- list-style-type: none;
326
- margin-left: 0;
327
- }
328
- .reveal .footnotes ol li:before {
329
- counter-increment: ol;
330
- content: counter(ol) ". ";
331
- }
332
- .reveal .footnotes ol li > p:first-child {
333
- display: inline-block;
334
- }
335
- .reveal .slide ul,
336
- .reveal .slide ol {
337
- margin-bottom: 0.5em;
338
- }
339
- .reveal .slide ul li,
340
- .reveal .slide ol li {
341
- margin-top: 0.4em;
342
- margin-bottom: 0.2em;
343
- }
344
- .reveal .slide ul[role="tablist"] li {
345
- margin-bottom: 0;
346
- }
347
- .reveal .slide ul li > *:first-child,
348
- .reveal .slide ol li > *:first-child {
349
- margin-block-start: 0;
350
- }
351
- .reveal .slide ul li > *:last-child,
352
- .reveal .slide ol li > *:last-child {
353
- margin-block-end: 0;
354
- }
355
- .reveal .slide .columns:nth-child(3) {
356
- margin-block-start: 0.8em;
357
- }
358
- .reveal blockquote {
359
- box-shadow: none;
360
- }
361
- .reveal .tippy-content>* {
362
- margin-top: 0.2em;
363
- margin-bottom: 0.7em;
364
- }
365
- .reveal .tippy-content>*:last-child {
366
- margin-bottom: 0.2em;
367
- }
368
- .reveal .slide > img.stretch.quarto-figure-center,
369
- .reveal .slide > img.r-stretch.quarto-figure-center {
370
- display: block;
371
- margin-left: auto;
372
- margin-right: auto;
373
- }
374
- .reveal .slide > img.stretch.quarto-figure-left,
375
- .reveal .slide > img.r-stretch.quarto-figure-left {
376
- display: block;
377
- margin-left: 0;
378
- margin-right: auto;
379
- }
380
- .reveal .slide > img.stretch.quarto-figure-right,
381
- .reveal .slide > img.r-stretch.quarto-figure-right {
382
- display: block;
383
- margin-left: auto;
384
- margin-right: 0;
385
- }
386
- </style>
387
- <script src="llm_conf_files/libs/quarto-diagram/mermaid.min.js"></script>
388
- <script src="llm_conf_files/libs/quarto-diagram/mermaid-init.js"></script>
389
- <link href="llm_conf_files/libs/quarto-diagram/mermaid.css" rel="stylesheet">
390
- </head>
391
- <body class="quarto-dark">
392
- <div class="reveal">
393
- <div class="slides">
394
-
395
- <section id="title-slide" class="quarto-title-block center">
396
- <h1 class="title">Scaling Model Training with More Compute, How Do They Do It?</h1>
397
-
398
- <div class="quarto-title-authors">
399
- </div>
400
-
401
- </section>
402
- <section id="who-am-i" class="slide level2">
403
- <h2>Who am I?</h2>
404
- <ul>
405
- <li>Zachary Mueller</li>
406
- <li>Technical Lead for the 🤗 Accelerate project</li>
407
- <li>API design geek</li>
408
- </ul>
409
- </section>
410
- <section id="understanding-gpu-usage" class="slide level2">
411
- <h2>Understanding GPU Usage</h2>
412
- <ul>
413
- <li>We can somewhat estimate the memory usage in vanilla full-fine-tuning of models</li>
414
- <li>Requires certain assumptions (that I’ll be covering):
415
- <ul>
416
- <li>Adam optimizer</li>
417
- <li>Batch size of 1</li>
418
- </ul></li>
419
- </ul>
420
- </section>
421
- <section id="understanding-gpu-usage-1" class="slide level2">
422
- <h2>Understanding GPU Usage</h2>
423
- <p>General estimate (<code>bert-base-cased</code>, 108M params):</p>
424
- <ul>
425
- <li>Each parameter is 4 bytes</li>
426
- <li>Backward ~= 2x the model size</li>
427
- <li>The optimizer step ~= 4x the model size (1x model, 1x gradients, 2x optimizer):</li>
428
- </ul>
429
- <div style="font-size: 50%;background-color: rgba(0,0,0,.1);">
430
- <table>
431
- <thead>
432
- <tr class="header">
433
- <th>dtype</th>
434
- <th style="text-align: left;">Model</th>
435
- <th style="text-align: center;">Gradients</th>
436
- <th style="text-align: center;">Backward pass</th>
437
- <th style="text-align: center;">Optimizer step</th>
438
- <th style="text-align: center;">Highest</th>
439
- </tr>
440
- </thead>
441
- <tbody>
442
- <tr class="odd">
443
- <td>float32</td>
444
- <td style="text-align: left;">413.18 MB</td>
445
- <td style="text-align: center;">413.18 MB</td>
446
- <td style="text-align: center;">826.36 MB</td>
447
- <td style="text-align: center;">1.61 GB</td>
448
- <td style="text-align: center;">1.61 GB</td>
449
- </tr>
450
- <tr class="even">
451
- <td>float16</td>
452
- <td style="text-align: left;">413.18 MB*</td>
453
- <td style="text-align: center;">619.77 MB</td>
454
- <td style="text-align: center;">826.36 MB</td>
455
- <td style="text-align: center;">826.36 MB</td>
456
- <td style="text-align: center;">826.36 MB</td>
457
- </tr>
458
- </tbody>
459
- </table>
460
- <p>*All estimations were based off the <a href="https://huggingface.co/spaces/hf-accelerate/model-memory-usage">Model Estimator Tool</a></p>
461
- </div>
462
- </section>
463
- <section id="understanding-gpu-usage-2" class="slide level2">
464
- <h2>Understanding GPU Usage</h2>
465
- <p>This works fine for small models, we have cards with anywhere from 12-24GB of GPU memory (on the GPU-poor side).</p>
466
- <p>But what happens as we scale?</p>
467
- <p>Here’s <code>llama-3-8B</code> (8.03B parameters)</p>
468
- <div style="font-size: 50%;background-color: rgba(0,0,0,.1);">
469
- <table>
470
- <thead>
471
- <tr class="header">
472
- <th>dtype</th>
473
- <th style="text-align: left;">Model</th>
474
- <th style="text-align: center;">Gradients</th>
475
- <th style="text-align: center;">Backward pass</th>
476
- <th style="text-align: center;">Optimizer step</th>
477
- <th style="text-align: center;">Highest</th>
478
- </tr>
479
- </thead>
480
- <tbody>
481
- <tr class="odd">
482
- <td>float32</td>
483
- <td style="text-align: left;">28.21 GB</td>
484
- <td style="text-align: center;">28.21 GB</td>
485
- <td style="text-align: center;">56.43 GB</td>
486
- <td style="text-align: center;">112.84 GB</td>
487
- <td style="text-align: center;">112.84 GB</td>
488
- </tr>
489
- <tr class="even">
490
- <td>float16</td>
491
- <td style="text-align: left;">28.21 GB*</td>
492
- <td style="text-align: center;">42.32 GB</td>
493
- <td style="text-align: center;">56.43 GB</td>
494
- <td style="text-align: center;">56.43 GB</td>
495
- <td style="text-align: center;">56.43 GB</td>
496
- </tr>
497
- </tbody>
498
- </table>
499
- </div>
500
- <p>Well, <em>I</em> don’t have 56GB of GPU memory in a single card, let alone 112GB.</p>
501
- <p>What can we do?</p>
502
- </section>
503
- <section>
504
- <section id="distributed-training" class="title-slide slide level1 center">
505
- <h1>Distributed Training</h1>
506
-
507
- </section>
508
- <section id="kinds-of-training" class="slide level2">
509
- <h2>Kinds of Training</h2>
510
- <ul>
511
- <li>Single GPU:
512
- <ul>
513
- <li>No distributed techniques at play</li>
514
- </ul></li>
515
- <li>DDP:
516
- <ul>
517
- <li>A full copy of the model exists on each device, but data is chunked between each GPU</li>
518
- </ul></li>
519
- <li>FSDP &amp; DeepSpeed:
520
- <ul>
521
- <li>Split chunks of the model and optimizer states across GPUs, allowing for training bigger models on smaller (multiple) GPUs</li>
522
- </ul></li>
523
- </ul>
524
- </section></section>
525
- <section>
526
- <section id="fully-sharded-data-parallelism" class="title-slide slide level1 center">
527
- <h1>Fully Sharded Data Parallelism</h1>
528
-
529
- </section>
530
- <section id="fully-sharded-data-parallelism-1" class="slide level2">
531
- <h2>Fully Sharded Data Parallelism</h2>
532
-
533
- <img data-src="fsdp.png" id="fig-539a35d47e664c97a50115a146a7f1bd-1" class="r-stretch quarto-figure-center"><aside class="notes">
534
- <ul>
535
- <li>Take the model and split it across <code>n</code> GPUs</li>
536
- <li>Each GPU computes the shard’s gradients</li>
537
- <li>At the end, all gradients are synchronized and the final full model gradient is calculated</li>
538
- <li>The backward pass can then be performed</li>
539
- </ul>
540
- <style type="text/css">
541
- span.MJX_Assistive_MathML {
542
- position:absolute!important;
543
- clip: rect(1px, 1px, 1px, 1px);
544
- padding: 1px 0 0 0!important;
545
- border: 0!important;
546
- height: 1px!important;
547
- width: 1px!important;
548
- overflow: hidden!important;
549
- display:block!important;
550
- }</style></aside>
551
- </section>
552
- <section id="fsdp-getting-parameter-specific" class="slide level2">
553
- <h2>FSDP: Getting parameter specific</h2>
554
- <ul>
555
- <li>Different parameters can dictate how much memory is needed for total GPU training across multiple GPUs</li>
556
- <li>These include how model weights are sharded, gradients, and more.</li>
557
- <li>I’ll cover some important ones I needed when doing a Full-Fine-Tune of Llama-3-8B <em>without PEFT</em> on 2x4090’s</li>
558
- </ul>
559
- </section>
560
- <section id="sharding_strategy" class="slide level2">
561
- <h2><code>sharding_strategy</code></h2>
562
- <ul>
563
- <li>Dictates the level of divvying resources to perform
564
- <ul>
565
- <li><code>FULL_SHARD</code>: Includes optimizer states, gradients, and parameters</li>
566
- <li><code>SHARD_GRAD_OP</code>: Includes optimizer states and gradients</li>
567
- <li><code>NO_SHARD</code>: Normal DDP</li>
568
- <li><code>HYBRID_SHARD</code>: Includes optimizer states, gradients, and parameters but each node has the full model</li>
569
- </ul>
570
- <aside class="notes">
571
- <pre><code>FULL_SHARD:
572
- Parameters, Gradients, Optimizer States: All are sharded.
573
- Parameters Handling: Unshard before forward pass, reshard after forward pass, unshard before backward pass, reshard after backward pass.
574
- Gradients Handling: Synchronize and shard after backward pass.
575
- Optimizer States: Updated locally per rank.</code></pre>
576
- <p>SHARD_GRAD_OP: Gradients and Optimizer States: Sharded during computation. Parameters: Unshard before forward pass, remain unsharded during forward pass, reshard after backward pass. Inside no_sync(): Parameters are not resharded after backward computation. Optimizer States: Updated locally per rank.</p>
577
- <p>NO_SHARD: Parameters, Gradients, Optimizer States: Not sharded, replicated across ranks. Gradients Handling: Synchronized via all-reduce after backward pass. Optimizer States: Updated locally per rank.</p>
578
- <p>HYBRID_SHARD: Parameters, Gradients, Optimizer States: Combines FULL_SHARD within a node and replicates parameters across nodes. Communication: Expensive operations like all-gathers and reduce-scatters are limited to within a node, enhancing performance for medium-sized models.</p>
579
- <style type="text/css">
580
- span.MJX_Assistive_MathML {
581
- position:absolute!important;
582
- clip: rect(1px, 1px, 1px, 1px);
583
- padding: 1px 0 0 0!important;
584
- border: 0!important;
585
- height: 1px!important;
586
- width: 1px!important;
587
- overflow: hidden!important;
588
- display:block!important;
589
- }</style></aside></li>
590
- </ul>
591
- </section>
592
- <section id="auto_wrap_policy" class="slide level2">
593
- <h2><code>auto_wrap_policy</code>:</h2>
594
- <ul>
595
- <li>How the model should be split</li>
596
- <li>Can be either <code>TRANSFORMER_BASED_WRAP</code> or <code>SIZE_BASED_WRAP</code></li>
597
- <li><code>TRANSFORMER</code>/<code>fsdp_transformers_layer_cls_to_wrap</code>:
598
- <ul>
599
- <li>Need to declare the layer</li>
600
- <li>Generally <code>transformers</code> has good defaults</li>
601
- </ul></li>
602
- <li><code>SIZE</code>/<code>fsdp_min_num_param</code>:
603
- <ul>
604
- <li>Number of total parameters in a shard</li>
605
- </ul></li>
606
- </ul>
607
- </section>
608
- <section id="offload_params" class="slide level2">
609
- <h2><code>offload_params</code>:</h2>
610
- <ul>
611
- <li>Offloads the parameters and gradients to the CPU if they can’t fit into memory</li>
612
- <li>Allows you to train much larger models locally, but will be much slower</li>
613
- </ul>
614
- <blockquote>
615
- <p>Case: FFT of Llama-3-8B with <code>fsdp_offload_params</code> on 2x4090 GPUs was 72hrs, vs ~an hour or two when using 1xH100</p>
616
- </blockquote>
617
- </section>
618
- <section id="cpu_ram_efficient_loading-and-sync_module_states" class="slide level2">
619
- <h2><code>cpu_ram_efficient_loading</code> and <code>sync_module_states</code></h2>
620
- <ul>
621
- <li>Uses the idea behind big model inference/the <code>meta</code> device to load in the model to the GPU in a low-ram scenario</li>
622
- <li>Rather than needing <code>model_size</code> * <code>n_gpus</code> RAM, we can load the model on a single node and then send the weights directly to each shard when the time is right via <code>sync_module_states</code></li>
623
- </ul>
624
- </section></section>
625
- <section>
626
- <section id="tying-this-to-accelerate" class="title-slide slide level1 center">
627
- <h1>Tying this to 🤗 Accelerate</h1>
628
-
629
- </section>
630
- <section id="tying-this-to-accelerate-1" class="slide level2">
631
- <h2>Tying this to 🤗 Accelerate</h2>
632
- <ul>
633
- <li>So far we’ve covered the theory, but how do we put it into practice?</li>
634
- <li>By using a library that’s at the heart of the entire open-source ecosystem</li>
635
- </ul>
636
- <div style="font-size: 60%;padding-left:10%;padding-top:0%;">
637
- <ul>
638
- <li>Nearly all of 🤗</li>
639
- <li><code>axolotl</code></li>
640
- <li><code>fastai</code></li>
641
- <li><code>FastChat</code></li>
642
- <li><code>lucidrains</code></li>
643
- <li><code>kornia</code></li>
644
- </ul>
645
- </div>
646
- <p>Are you using it and you don’t even know?</p>
647
- </section>
648
- <section id="what-is-accelerate" class="slide level2">
649
- <h2>What is 🤗 Accelerate?</h2>
650
- <div class="cell" data-reveal="true" data-fig-height="6">
651
- <div class="cell-output-display">
652
- <div>
653
- <div>
654
- <pre class="mermaid mermaid-js">graph LR
655
- A(("🤗 Accelerate#32;"))
656
- A --&gt; B["CLI Interface#32;"]
657
- A --&gt; C["Training Library#32;"]
658
- A --&gt; D["Big Model&lt;br&gt;Inference#32;"]
659
- </pre>
660
- </div>
661
- </div>
662
- </div>
663
- </div>
664
- </section>
665
- <section id="a-cli-interface" class="slide level2">
666
- <h2>A CLI Interface</h2>
667
- <ul>
668
- <li><code>accelerate config</code>
669
- <ul>
670
- <li>Configure the environment</li>
671
- </ul></li>
672
- <li><code>accelerate estimate-memory</code>
673
- <ul>
674
- <li>How to guess vRAM requirements</li>
675
- </ul></li>
676
- <li><code>accelerate launch</code>
677
- <ul>
678
- <li>How to run your script</li>
679
- </ul></li>
680
- </ul>
681
- </section>
682
- <section id="launching-distributed-training-is-hard" class="slide level2">
683
- <h2>Launching distributed training is hard</h2>
684
- <ul>
685
- <li><div class="sourceCode" id="cb2"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1"></a><span class="ex">python</span> script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
686
- <li><div class="sourceCode" id="cb3"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1"></a><span class="ex">torchrun</span> <span class="at">--nnodes</span><span class="op">=</span>1 <span class="at">--nproc_per_node</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
687
- <li><div class="sourceCode" id="cb4"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1"></a><span class="ex">deepspeed</span> <span class="at">--num_gpus</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
688
- </ul>
689
- <p>How can we make this better?</p>
690
- </section>
691
- <section id="accelerate-launch" class="slide level2">
692
- <h2><code>accelerate launch</code></h2>
693
- <div class="sourceCode" id="cb5"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1"></a><span class="ex">accelerate</span> launch script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
694
- </section>
695
- <section id="accelerate-config" class="slide level2">
696
- <h2><code>accelerate config</code></h2>
697
- <ul>
698
- <li>Rely on <code>config.yaml</code> files</li>
699
- <li>Choose to either run <code>accelerate config</code> or write your own:</li>
700
- </ul>
701
- <div class="columns" style="font-size: 50%;padding-left:10%;background-color: rgba(0,0,0,.1);">
702
- <div class="column" style="width:40%;">
703
- <div class="code-with-filename">
704
- <div class="code-with-filename-file">
705
- <pre><strong>ddp_config.yaml</strong></pre>
706
- </div>
707
- <div class="sourceCode" id="cb6" data-filename="ddp_config.yaml"><pre class="sourceCode numberSource yaml number-lines code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
708
- <span id="cb6-2"><a href="#cb6-2"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> MULTI_GPU</span></span>
709
- <span id="cb6-3"><a href="#cb6-3"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
710
- <span id="cb6-4"><a href="#cb6-4"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
711
- <span id="cb6-5"><a href="#cb6-5"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
712
- <span id="cb6-6"><a href="#cb6-6"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
713
- </div>
714
- </div><div class="column" style="width:40%;">
715
- <div class="code-with-filename">
716
- <div class="code-with-filename-file">
717
- <pre><strong>fsdp_config.yaml</strong></pre>
718
- </div>
719
- <div class="sourceCode" id="cb7" data-filename="fsdp_config.yaml"><pre class="sourceCode numberSource yaml number-lines code-with-copy"><code class="sourceCode yaml"><span id="cb7-1"><a href="#cb7-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
720
- <span id="cb7-2"><a href="#cb7-2"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> FSDP</span></span>
721
- <span id="cb7-3"><a href="#cb7-3"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
722
- <span id="cb7-4"><a href="#cb7-4"></a><span class="at"> </span><span class="fu">fsdp_auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
723
- <span id="cb7-5"><a href="#cb7-5"></a><span class="at"> </span><span class="fu">fsdp_backward_prefetch</span><span class="kw">:</span><span class="at"> BACKWARD_PRE</span></span>
724
- <span id="cb7-6"><a href="#cb7-6"></a><span class="at"> </span><span class="fu">fsdp_cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
725
- <span id="cb7-7"><a href="#cb7-7"></a><span class="at"> </span><span class="fu">fsdp_forward_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
726
- <span id="cb7-8"><a href="#cb7-8"></a><span class="at"> </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
727
- <span id="cb7-9"><a href="#cb7-9"></a><span class="at"> </span><span class="fu">fsdp_sharding_strategy</span><span class="kw">:</span><span class="at"> FULL_SHARD</span></span>
728
- <span id="cb7-10"><a href="#cb7-10"></a><span class="at"> </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> SHARDED_STATE_DICT</span></span>
729
- <span id="cb7-11"><a href="#cb7-11"></a><span class="at"> </span><span class="fu">fsdp_sync_module_states</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
730
- <span id="cb7-12"><a href="#cb7-12"></a><span class="at"> </span><span class="fu">fsdp_use_orig_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
731
- <span id="cb7-13"><a href="#cb7-13"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
732
- <span id="cb7-14"><a href="#cb7-14"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
733
- <span id="cb7-15"><a href="#cb7-15"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
734
- <span id="cb7-16"><a href="#cb7-16"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
735
- </div>
736
- </div>
737
- </div>
738
- </section></section>
739
- <section>
740
- <section id="a-training-library" class="title-slide slide level1 center">
741
- <h1>A Training Library</h1>
742
-
743
- </section>
744
- <section id="a-training-library-the-code" class="slide level2">
745
- <h2>A Training Library: The Code</h2>
746
- <div class="columns" style="font-size: 50%;">
747
- <div class="column">
748
- <p><br><br><br></p>
749
- <div class="sourceCode" id="cb8" data-code-line-numbers="5-6,9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a><span class="co"># For alignment purposes</span></span>
750
- <span id="cb8-2"><a href="#cb8-2"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
751
- <span id="cb8-3"><a href="#cb8-3"></a> optimizer.zero_grad()</span>
752
- <span id="cb8-4"><a href="#cb8-4"></a> inputs, targets <span class="op">=</span> batch</span>
753
- <span id="cb8-5"><a href="#cb8-5"></a> inputs <span class="op">=</span> inputs.to(device)</span>
754
- <span id="cb8-6"><a href="#cb8-6"></a> targets <span class="op">=</span> targets.to(device)</span>
755
- <span id="cb8-7"><a href="#cb8-7"></a> outputs <span class="op">=</span> model(inputs)</span>
756
- <span id="cb8-8"><a href="#cb8-8"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
757
- <span id="cb8-9"><a href="#cb8-9"></a> loss.backward()</span>
758
- <span id="cb8-10"><a href="#cb8-10"></a> optimizer.step()</span>
759
- <span id="cb8-11"><a href="#cb8-11"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
760
- </div><div class="column">
761
- <div class="sourceCode" id="cb9" data-code-line-numbers="1-7,12-13,16"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="im">from</span> accelerate <span class="im">import</span> Accelerator</span>
762
- <span id="cb9-2"><a href="#cb9-2"></a>accelerator <span class="op">=</span> Accelerator()</span>
763
- <span id="cb9-3"><a href="#cb9-3"></a>dataloader, model, optimizer, scheduler <span class="op">=</span> (</span>
764
- <span id="cb9-4"><a href="#cb9-4"></a> accelerator.prepare(</span>
765
- <span id="cb9-5"><a href="#cb9-5"></a> dataloader, model, optimizer, scheduler</span>
766
- <span id="cb9-6"><a href="#cb9-6"></a> )</span>
767
- <span id="cb9-7"><a href="#cb9-7"></a>)</span>
768
- <span id="cb9-8"><a href="#cb9-8"></a></span>
769
- <span id="cb9-9"><a href="#cb9-9"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
770
- <span id="cb9-10"><a href="#cb9-10"></a> optimizer.zero_grad()</span>
771
- <span id="cb9-11"><a href="#cb9-11"></a> inputs, targets <span class="op">=</span> batch</span>
772
- <span id="cb9-12"><a href="#cb9-12"></a> <span class="co"># inputs = inputs.to(device)</span></span>
773
- <span id="cb9-13"><a href="#cb9-13"></a> <span class="co"># targets = targets.to(device)</span></span>
774
- <span id="cb9-14"><a href="#cb9-14"></a> outputs <span class="op">=</span> model(inputs)</span>
775
- <span id="cb9-15"><a href="#cb9-15"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
776
- <span id="cb9-16"><a href="#cb9-16"></a> accelerator.backward(loss) <span class="co"># loss.backward()</span></span>
777
- <span id="cb9-17"><a href="#cb9-17"></a> optimizer.step()</span>
778
- <span id="cb9-18"><a href="#cb9-18"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
779
- </div>
780
- </div>
781
- </section>
782
- <section id="a-training-library-how-scaling-works" class="slide level2">
783
- <h2>A Training Library: How Scaling Works</h2>
784
- <ul>
785
- <li>Accelerate’s DataLoaders and schedulers work off of a sharding mindset</li>
786
- <li>Rather than repeating the same data across <code>n</code> nodes, we instead split it</li>
787
- <li>Speeds up training linearly</li>
788
- <li>Given a batch size of 16 on a single GPU, to recreate this across 8 GPUs you would use a batch size of 2</li>
789
- <li>This also means the scheduler will be stepped <code>n</code> times (once per GPU) per “global step”</li>
790
- </ul>
791
- </section>
792
- <section id="a-training-library-mixed-precision" class="slide level2">
793
- <h2>A Training Library: Mixed Precision</h2>
794
- <ul>
795
- <li>This may be a bit different than your “normal” idea of mixed precision.</li>
796
- <li>We do <strong>not</strong> convert the model weights to BF16/FP16</li>
797
- <li>Instead we <strong>wrap the forward pass</strong> with <code>autocast</code> to convert the gradients automatically</li>
798
- <li>This preserves the original precision of the weights, which leads to stable training and better fine-tuning later on.</li>
799
- <li><strong>If you use <code>.bf16()</code> weights, you are STUCK in bf16 permanently</strong></li>
800
- </ul>
801
- </section>
802
- <section id="a-training-library-mixed-precision-1" class="slide level2">
803
- <h2>A Training Library: Mixed Precision</h2>
804
- <ul>
805
- <li>Let’s tie that back up to the model estimator with neat tools like NVIDIA’s TransformerEngine</li>
806
- </ul>
807
- <div style="font-size: 60%;background-color: rgba(0,0,0,.1);">
808
- <table style="width:100%;">
809
- <colgroup>
810
- <col style="width: 14%">
811
- <col style="width: 14%">
812
- <col style="width: 14%">
813
- <col style="width: 14%">
814
- <col style="width: 14%">
815
- <col style="width: 14%">
816
- <col style="width: 14%">
817
- </colgroup>
818
- <thead>
819
- <tr class="header">
820
- <th>Optimization Level</th>
821
- <th>Computation (GEMM)</th>
822
- <th>Comm</th>
823
- <th>Weight</th>
824
- <th>Master Weight</th>
825
- <th>Weight Gradient</th>
826
- <th>Optimizer States</th>
827
- </tr>
828
- </thead>
829
- <tbody>
830
- <tr class="odd">
831
- <td>FP16 AMP</td>
832
- <td>FP16</td>
833
- <td>FP32</td>
834
- <td>FP32</td>
835
- <td>N/A</td>
836
- <td>FP32</td>
837
- <td>FP32+FP32</td>
838
- </tr>
839
- <tr class="even">
840
- <td>Nvidia TE</td>
841
- <td>FP8</td>
842
- <td>FP32</td>
843
- <td>FP32</td>
844
- <td>N/A</td>
845
- <td>FP32</td>
846
- <td>FP32+FP32</td>
847
- </tr>
848
- <tr class="odd">
849
- <td>MS-AMP O1</td>
850
- <td>FP8</td>
851
- <td>FP8</td>
852
- <td>FP16</td>
853
- <td>N/A</td>
854
- <td>FP8</td>
855
- <td>FP32+FP32</td>
856
- </tr>
857
- <tr class="even">
858
- <td>MS-AMP O2</td>
859
- <td>FP8</td>
860
- <td>FP8</td>
861
- <td>FP16</td>
862
- <td>N/A</td>
863
- <td>FP8</td>
864
- <td>FP8+FP16</td>
865
- </tr>
866
- <tr class="odd">
867
- <td>MS-AMP O3</td>
868
- <td>FP8</td>
869
- <td>FP8</td>
870
- <td>FP8</td>
871
- <td>FP16</td>
872
- <td>FP8</td>
873
- <td>FP8+FP16</td>
874
- </tr>
875
- </tbody>
876
- </table>
877
- </div>
878
- <aside class="notes">
879
- <p>What is actually happening:</p><ul><li>Linear Layers and certain other compatible layers are wrapped in a special version that allows for FP8 computation</li><li>The general forward pass is wrapped around BF16</li><li>This means that the most memory saved is done during the gradients of the model, <em>not</em> the model itself.</li><li>With tools like <code>MS-AMP</code> we can convert more chunks into lower precision, but again like before stable training occurs when the model's weights are in full precision and the backprop happens in full precision too.</li></ul>
880
- <style type="text/css">
881
- span.MJX_Assistive_MathML {
882
- position:absolute!important;
883
- clip: rect(1px, 1px, 1px, 1px);
884
- padding: 1px 0 0 0!important;
885
- border: 0!important;
886
- height: 1px!important;
887
- width: 1px!important;
888
- overflow: hidden!important;
889
- display:block!important;
890
- }</style></aside>
891
- </section>
892
- <section id="deepspeed-vs-fully-sharded-data-parallelism" class="slide level2">
893
- <h2>DeepSpeed vs Fully Sharded Data Parallelism</h2>
894
- <ul>
895
- <li>Extremely similar; they mostly differ in the naming conventions used for items and in slight tweaks to the implementation</li>
896
- </ul>
897
- <div style="font-size: 50%;background-color: rgba(0,0,0,.1);">
898
- <table style="width:100%;">
899
- <colgroup>
900
- <col style="width: 16%">
901
- <col style="width: 16%">
902
- <col style="width: 16%">
903
- <col style="width: 16%">
904
- <col style="width: 16%">
905
- <col style="width: 16%">
906
- </colgroup>
907
- <thead>
908
- <tr class="header">
909
- <th>Framework</th>
910
- <th>Model Loading (<code>torch_dtype</code>)</th>
911
- <th>Mixed Precision</th>
912
- <th>Preparation (Local)</th>
913
- <th>Training</th>
914
- <th>Optimizer (Local)</th>
915
- </tr>
916
- </thead>
917
- <tbody>
918
- <tr class="odd">
919
- <td>FSDP</td>
920
- <td>bf16</td>
921
- <td>default (none)</td>
922
- <td>bf16</td>
923
- <td>bf16</td>
924
- <td>bf16</td>
925
- </tr>
926
- <tr class="even">
927
- <td>FSDP</td>
928
- <td>bf16</td>
929
- <td>bf16</td>
930
- <td>fp32</td>
931
- <td>bf16</td>
932
- <td>fp32</td>
933
- </tr>
934
- <tr class="odd">
935
- <td>DeepSpeed</td>
936
- <td>bf16</td>
937
- <td>bf16</td>
938
- <td>fp32</td>
939
- <td>bf16</td>
940
- <td>fp32</td>
941
- </tr>
942
- </tbody>
943
- </table>
944
- </div>
945
- <p>To learn more, check out the <a href="https://huggingface.co/docs/accelerate/concept_guides/fsdp_and_deepspeed">documentation</a> or join my office hours</p>
946
- </section>
947
- <section id="key-takeaways" class="slide level2">
948
- <h2>Key Takeaways:</h2>
949
- <ul>
950
- <li>You can scale out training with <code>accelerate</code>, FSDP, and DeepSpeed across multiple GPUs to train bigger models</li>
951
- <li>Techniques like <code>FP8</code> can help speed up training some and reduce computational overhead</li>
952
- <li>Comes at a cost of end-precision and locking model weights for further fine-tunes if not careful</li>
953
- </ul>
954
- </section>
955
- <section id="some-handy-resources" class="slide level2">
956
- <h2>Some Handy Resources</h2>
957
- <ul>
958
- <li><a href="https://hf.co/docs/accelerate">🤗 Accelerate documentation</a></li>
959
- <li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch">Launching distributed code</a></li>
960
- <li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/notebook">Distributed code and Jupyter Notebooks</a></li>
961
- <li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/migration">Migrating to 🤗 Accelerate easily</a></li>
962
- <li><a href="https://huggingface.co/docs/accelerate/usage_guides/big_modeling">Big Model Inference tutorial</a></li>
963
- <li><a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed">DeepSpeed and 🤗 Accelerate</a></li>
964
- <li><a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp">Fully Sharded Data Parallelism and 🤗 Accelerate</a></li>
965
- <li><a href="https://huggingface.co/docs/accelerate/concept_guides/fsdp_and_deepspeed">FSDP vs DeepSpeed In-Depth</a></li>
966
- </ul>
967
- <div class="footer footer-default">
968
-
969
- </div>
970
- </section></section>
971
- </div>
972
- </div>
973
-
974
- <script>window.backupDefine = window.define; window.define = undefined;</script>
975
- <script src="llm_conf_files/libs/revealjs/dist/reveal.js"></script>
976
- <!-- reveal.js plugins -->
977
- <script src="llm_conf_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.js"></script>
978
- <script src="llm_conf_files/libs/revealjs/plugin/pdf-export/pdfexport.js"></script>
979
- <script src="llm_conf_files/libs/revealjs/plugin/reveal-menu/menu.js"></script>
980
- <script src="llm_conf_files/libs/revealjs/plugin/reveal-menu/quarto-menu.js"></script>
981
- <script src="llm_conf_files/libs/revealjs/plugin/quarto-support/support.js"></script>
982
-
983
-
984
- <script src="llm_conf_files/libs/revealjs/plugin/notes/notes.js"></script>
985
- <script src="llm_conf_files/libs/revealjs/plugin/search/search.js"></script>
986
- <script src="llm_conf_files/libs/revealjs/plugin/zoom/zoom.js"></script>
987
- <script src="llm_conf_files/libs/revealjs/plugin/math/math.js"></script>
988
- <script>window.define = window.backupDefine; window.backupDefine = undefined;</script>
989
-
990
- <script>
991
-
992
- // Full list of configuration options available at:
993
- // https://revealjs.com/config/
994
- Reveal.initialize({
995
- 'controlsAuto': true,
996
- 'previewLinksAuto': false,
997
- 'pdfSeparateFragments': false,
998
- 'autoAnimateEasing': "ease",
999
- 'autoAnimateDuration': 1,
1000
- 'autoAnimateUnmatched': true,
1001
- 'menu': {"side":"left","useTextContentForMissingTitles":true,"markers":false,"loadIcons":false,"custom":[{"title":"Tools","icon":"<i class=\"fas fa-gear\"></i>","content":"<ul class=\"slide-menu-items\">\n<li class=\"slide-tool-item active\" data-item=\"0\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.fullscreen(event)\"><kbd>f</kbd> Fullscreen</a></li>\n<li class=\"slide-tool-item\" data-item=\"1\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.speakerMode(event)\"><kbd>s</kbd> Speaker View</a></li>\n<li class=\"slide-tool-item\" data-item=\"2\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.overview(event)\"><kbd>o</kbd> Slide Overview</a></li>\n<li class=\"slide-tool-item\" data-item=\"3\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.togglePdfExport(event)\"><kbd>e</kbd> PDF Export Mode</a></li>\n<li class=\"slide-tool-item\" data-item=\"4\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.keyboardHelp(event)\"><kbd>?</kbd> Keyboard Help</a></li>\n</ul>"}],"openButton":true},
1002
- 'smaller': false,
1003
-
1004
- // Display controls in the bottom right corner
1005
- controls: false,
1006
-
1007
- // Help the user learn the controls by providing hints, for example by
1008
- // bouncing the down arrow when they first encounter a vertical slide
1009
- controlsTutorial: false,
1010
-
1011
- // Determines where controls appear, "edges" or "bottom-right"
1012
- controlsLayout: 'edges',
1013
-
1014
- // Visibility rule for backwards navigation arrows; "faded", "hidden"
1015
- // or "visible"
1016
- controlsBackArrows: 'faded',
1017
-
1018
- // Display a presentation progress bar
1019
- progress: true,
1020
-
1021
- // Display the page number of the current slide
1022
- slideNumber: false,
1023
-
1024
- // 'all', 'print', or 'speaker'
1025
- showSlideNumber: 'all',
1026
-
1027
- // Add the current slide number to the URL hash so that reloading the
1028
- // page/copying the URL will return you to the same slide
1029
- hash: true,
1030
-
1031
- // Start with 1 for the hash rather than 0
1032
- hashOneBasedIndex: false,
1033
-
1034
- // Flags if we should monitor the hash and change slides accordingly
1035
- respondToHashChanges: true,
1036
-
1037
- // Push each slide change to the browser history
1038
- history: true,
1039
-
1040
- // Enable keyboard shortcuts for navigation
1041
- keyboard: true,
1042
-
1043
- // Enable the slide overview mode
1044
- overview: true,
1045
-
1046
- // Disables the default reveal.js slide layout (scaling and centering)
1047
- // so that you can use custom CSS layout
1048
- disableLayout: false,
1049
-
1050
- // Vertical centering of slides
1051
- center: false,
1052
-
1053
- // Enables touch navigation on devices with touch input
1054
- touch: true,
1055
-
1056
- // Loop the presentation
1057
- loop: false,
1058
-
1059
- // Change the presentation direction to be RTL
1060
- rtl: false,
1061
-
1062
- // see https://revealjs.com/vertical-slides/#navigation-mode
1063
- navigationMode: 'linear',
1064
-
1065
- // Randomizes the order of slides each time the presentation loads
1066
- shuffle: false,
1067
-
1068
- // Turns fragments on and off globally
1069
- fragments: true,
1070
-
1071
- // Flags whether to include the current fragment in the URL,
1072
- // so that reloading brings you to the same fragment position
1073
- fragmentInURL: false,
1074
-
1075
- // Flags if the presentation is running in an embedded mode,
1076
- // i.e. contained within a limited portion of the screen
1077
- embedded: false,
1078
-
1079
- // Flags if we should show a help overlay when the questionmark
1080
- // key is pressed
1081
- help: true,
1082
-
1083
- // Flags if it should be possible to pause the presentation (blackout)
1084
- pause: true,
1085
-
1086
- // Flags if speaker notes should be visible to all viewers
1087
- showNotes: false,
1088
-
1089
- // Global override for autoplaying embedded media (null/true/false)
1090
- autoPlayMedia: null,
1091
-
1092
- // Global override for preloading lazy-loaded iframes (null/true/false)
1093
- preloadIframes: null,
1094
-
1095
- // Number of milliseconds between automatically proceeding to the
1096
- // next slide, disabled when set to 0, this value can be overwritten
1097
- // by using a data-autoslide attribute on your slides
1098
- autoSlide: 0,
1099
-
1100
- // Stop auto-sliding after user input
1101
- autoSlideStoppable: true,
1102
-
1103
- // Use this method for navigation when auto-sliding
1104
- autoSlideMethod: null,
1105
-
1106
- // Specify the average time in seconds that you think you will spend
1107
- // presenting each slide. This is used to show a pacing timer in the
1108
- // speaker view
1109
- defaultTiming: null,
1110
-
1111
- // Enable slide navigation via mouse wheel
1112
- mouseWheel: false,
1113
-
1114
- // The display mode that will be used to show slides
1115
- display: 'block',
1116
-
1117
- // Hide cursor if inactive
1118
- hideInactiveCursor: true,
1119
-
1120
- // Time before the cursor is hidden (in ms)
1121
- hideCursorTime: 5000,
1122
-
1123
- // Opens links in an iframe preview overlay
1124
- previewLinks: false,
1125
-
1126
- // Transition style (none/fade/slide/convex/concave/zoom)
1127
- transition: 'none',
1128
-
1129
- // Transition speed (default/fast/slow)
1130
- transitionSpeed: 'default',
1131
-
1132
- // Transition style for full page slide backgrounds
1133
- // (none/fade/slide/convex/concave/zoom)
1134
- backgroundTransition: 'none',
1135
-
1136
- // Number of slides away from the current that are visible
1137
- viewDistance: 3,
1138
-
1139
- // Number of slides away from the current that are visible on mobile
1140
- // devices. It is advisable to set this to a lower number than
1141
- // viewDistance in order to save resources.
1142
- mobileViewDistance: 2,
1143
-
1144
- // The "normal" size of the presentation, aspect ratio will be preserved
1145
- // when the presentation is scaled to fit different resolutions. Can be
1146
- // specified using percentage units.
1147
- width: 1050,
1148
-
1149
- height: 700,
1150
-
1151
- // Factor of the display size that should remain empty around the content
1152
- margin: 0.1,
1153
-
1154
- math: {
1155
- mathjax: 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js',
1156
- config: 'TeX-AMS_HTML-full',
1157
- tex2jax: {
1158
- inlineMath: [['\\(','\\)']],
1159
- displayMath: [['\\[','\\]']],
1160
- balanceBraces: true,
1161
- processEscapes: false,
1162
- processRefs: true,
1163
- processEnvironments: true,
1164
- preview: 'TeX',
1165
- skipTags: ['script','noscript','style','textarea','pre','code'],
1166
- ignoreClass: 'tex2jax_ignore',
1167
- processClass: 'tex2jax_process'
1168
- },
1169
- },
1170
-
1171
- // reveal.js plugins
1172
- plugins: [QuartoLineHighlight, PdfExport, RevealMenu, QuartoSupport,
1173
-
1174
- RevealMath,
1175
- RevealNotes,
1176
- RevealSearch,
1177
- RevealZoom
1178
- ]
1179
- });
1180
- </script>
1181
- <script id="quarto-html-after-body" type="application/javascript">
1182
- window.document.addEventListener("DOMContentLoaded", function (event) {
1183
- const toggleBodyColorMode = (bsSheetEl) => {
1184
- const mode = bsSheetEl.getAttribute("data-mode");
1185
- const bodyEl = window.document.querySelector("body");
1186
- if (mode === "dark") {
1187
- bodyEl.classList.add("quarto-dark");
1188
- bodyEl.classList.remove("quarto-light");
1189
- } else {
1190
- bodyEl.classList.add("quarto-light");
1191
- bodyEl.classList.remove("quarto-dark");
1192
- }
1193
- }
1194
- const toggleBodyColorPrimary = () => {
1195
- const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
1196
- if (bsSheetEl) {
1197
- toggleBodyColorMode(bsSheetEl);
1198
- }
1199
- }
1200
- toggleBodyColorPrimary();
1201
- const tabsets = window.document.querySelectorAll(".panel-tabset-tabby")
1202
- tabsets.forEach(function(tabset) {
1203
- const tabby = new Tabby('#' + tabset.id);
1204
- });
1205
- const isCodeAnnotation = (el) => {
1206
- for (const clz of el.classList) {
1207
- if (clz.startsWith('code-annotation-')) {
1208
- return true;
1209
- }
1210
- }
1211
- return false;
1212
- }
1213
- const clipboard = new window.ClipboardJS('.code-copy-button', {
1214
- text: function(trigger) {
1215
- const codeEl = trigger.previousElementSibling.cloneNode(true);
1216
- for (const childEl of codeEl.children) {
1217
- if (isCodeAnnotation(childEl)) {
1218
- childEl.remove();
1219
- }
1220
- }
1221
- return codeEl.innerText;
1222
- }
1223
- });
1224
- clipboard.on('success', function(e) {
1225
- // button target
1226
- const button = e.trigger;
1227
- // don't keep focus
1228
- button.blur();
1229
- // flash "checked"
1230
- button.classList.add('code-copy-button-checked');
1231
- var currentTitle = button.getAttribute("title");
1232
- button.setAttribute("title", "Copied!");
1233
- let tooltip;
1234
- if (window.bootstrap) {
1235
- button.setAttribute("data-bs-toggle", "tooltip");
1236
- button.setAttribute("data-bs-placement", "left");
1237
- button.setAttribute("data-bs-title", "Copied!");
1238
- tooltip = new bootstrap.Tooltip(button,
1239
- { trigger: "manual",
1240
- customClass: "code-copy-button-tooltip",
1241
- offset: [0, -8]});
1242
- tooltip.show();
1243
- }
1244
- setTimeout(function() {
1245
- if (tooltip) {
1246
- tooltip.hide();
1247
- button.removeAttribute("data-bs-title");
1248
- button.removeAttribute("data-bs-toggle");
1249
- button.removeAttribute("data-bs-placement");
1250
- }
1251
- button.setAttribute("title", currentTitle);
1252
- button.classList.remove('code-copy-button-checked');
1253
- }, 1000);
1254
- // clear code selection
1255
- e.clearSelection();
1256
- });
1257
- function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
1258
- const config = {
1259
- allowHTML: true,
1260
- maxWidth: 500,
1261
- delay: 100,
1262
- arrow: false,
1263
- appendTo: function(el) {
1264
- return el.closest('section.slide') || el.parentElement;
1265
- },
1266
- interactive: true,
1267
- interactiveBorder: 10,
1268
- theme: 'light-border',
1269
- placement: 'bottom-start',
1270
- };
1271
- if (contentFn) {
1272
- config.content = contentFn;
1273
- }
1274
- if (onTriggerFn) {
1275
- config.onTrigger = onTriggerFn;
1276
- }
1277
- if (onUntriggerFn) {
1278
- config.onUntrigger = onUntriggerFn;
1279
- }
1280
- config['offset'] = [0,0];
1281
- config['maxWidth'] = 700;
1282
- window.tippy(el, config);
1283
- }
1284
- const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
1285
- for (var i=0; i<noterefs.length; i++) {
1286
- const ref = noterefs[i];
1287
- tippyHover(ref, function() {
1288
- // use id or data attribute instead here
1289
- let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
1290
- try { href = new URL(href).hash; } catch {}
1291
- const id = href.replace(/^#\/?/, "");
1292
- const note = window.document.getElementById(id);
1293
- return note.innerHTML;
1294
- });
1295
- }
1296
- const findCites = (el) => {
1297
- const parentEl = el.parentElement;
1298
- if (parentEl) {
1299
- const cites = parentEl.dataset.cites;
1300
- if (cites) {
1301
- return {
1302
- el,
1303
- cites: cites.split(' ')
1304
- };
1305
- } else {
1306
- return findCites(el.parentElement)
1307
- }
1308
- } else {
1309
- return undefined;
1310
- }
1311
- };
1312
- var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
1313
- for (var i=0; i<bibliorefs.length; i++) {
1314
- const ref = bibliorefs[i];
1315
- const citeInfo = findCites(ref);
1316
- if (citeInfo) {
1317
- tippyHover(citeInfo.el, function() {
1318
- var popup = window.document.createElement('div');
1319
- citeInfo.cites.forEach(function(cite) {
1320
- var citeDiv = window.document.createElement('div');
1321
- citeDiv.classList.add('hanging-indent');
1322
- citeDiv.classList.add('csl-entry');
1323
- var biblioDiv = window.document.getElementById('ref-' + cite);
1324
- if (biblioDiv) {
1325
- citeDiv.innerHTML = biblioDiv.innerHTML;
1326
- }
1327
- popup.appendChild(citeDiv);
1328
- });
1329
- return popup.innerHTML;
1330
- });
1331
- }
1332
- }
1333
- });
1334
- </script>
1335
-
1336
-
1337
- </body></html>