muellerzr HF staff committed on
Commit
b06653e
·
1 Parent(s): fbff59d
Files changed (2) hide show
  1. index.html +594 -346
  2. llm_conf.html +0 -1337
index.html CHANGED
@@ -1,21 +1,21 @@
1
  <!DOCTYPE html>
2
  <html lang="en"><head>
3
- <script src="Accelerate_files/libs/clipboard/clipboard.min.js"></script>
4
- <script src="Accelerate_files/libs/quarto-html/tabby.min.js"></script>
5
- <script src="Accelerate_files/libs/quarto-html/popper.min.js"></script>
6
- <script src="Accelerate_files/libs/quarto-html/tippy.umd.min.js"></script>
7
- <link href="Accelerate_files/libs/quarto-html/tippy.css" rel="stylesheet">
8
- <link href="Accelerate_files/libs/quarto-html/quarto-html.min.css" rel="stylesheet" data-mode="light">
9
- <link href="Accelerate_files/libs/quarto-html/quarto-syntax-highlighting-dark.css" rel="stylesheet" id="quarto-text-highlighting-styles"><meta charset="utf-8">
10
- <meta name="generator" content="quarto-1.2.237">
11
-
12
- <meta name="author" content="Zachary Mueller">
13
- <title>Accelerate, Three Powerful Sublibraries for PyTorch</title>
14
  <meta name="apple-mobile-web-app-capable" content="yes">
15
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
16
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
17
- <link rel="stylesheet" href="Accelerate_files/libs/revealjs/dist/reset.css">
18
- <link rel="stylesheet" href="Accelerate_files/libs/revealjs/dist/reveal.css">
19
  <style>
20
  code{white-space: pre-wrap;}
21
  span.smallcaps{font-variant: small-caps;}
@@ -25,11 +25,12 @@
25
  ul.task-list{list-style: none;}
26
  ul.task-list li input[type="checkbox"] {
27
  width: 0.8em;
28
- margin: 0 0.8em 0.2em -1.6em;
29
  vertical-align: middle;
30
  }
 
31
  pre > code.sourceCode { white-space: pre; position: relative; }
32
- pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
33
  pre > code.sourceCode > span:empty { height: 1.2em; }
34
  .sourceCode { overflow: visible; }
35
  code.sourceCode > span { color: inherit; text-decoration: inherit; }
@@ -93,11 +94,11 @@
93
  code span.vs { color: #abe338; } /* VerbatimString */
94
  code span.wa { color: #dcc6e0; } /* Warning */
95
  </style>
96
- <link rel="stylesheet" href="Accelerate_files/libs/revealjs/dist/theme/quarto.css" id="theme">
97
- <link href="Accelerate_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.css" rel="stylesheet">
98
- <link href="Accelerate_files/libs/revealjs/plugin/reveal-menu/menu.css" rel="stylesheet">
99
- <link href="Accelerate_files/libs/revealjs/plugin/reveal-menu/quarto-menu.css" rel="stylesheet">
100
- <link href="Accelerate_files/libs/revealjs/plugin/quarto-support/footer.css" rel="stylesheet">
101
  <style type="text/css">
102
 
103
  .callout {
@@ -136,44 +137,44 @@
136
  font-weight: 400;
137
  }
138
 
139
- .callout.callout-captioned.callout-style-simple .callout-body {
140
  margin-top: 0.2em;
141
  }
142
 
143
- .callout:not(.callout-captioned) .callout-body {
144
  display: flex;
145
  }
146
 
147
- .callout:not(.no-icon).callout-captioned.callout-style-simple .callout-content {
148
  padding-left: 1.6em;
149
  }
150
 
151
- .callout.callout-captioned .callout-header {
152
  padding-top: 0.2em;
153
  margin-bottom: -0.2em;
154
  }
155
 
156
- .callout.callout-captioned .callout-caption p {
157
  margin-top: 0.5em;
158
  margin-bottom: 0.5em;
159
  }
160
 
161
- .callout.callout-captioned.callout-style-simple .callout-content p {
162
  margin-top: 0;
163
  }
164
 
165
- .callout.callout-captioned.callout-style-default .callout-content p {
166
  margin-top: 0.7em;
167
  }
168
 
169
- .callout.callout-style-simple div.callout-caption {
170
  border-bottom: none;
171
  font-size: .9rem;
172
  font-weight: 600;
173
  opacity: 75%;
174
  }
175
 
176
- .callout.callout-style-default div.callout-caption {
177
  border-bottom: none;
178
  font-weight: 600;
179
  opacity: 85%;
@@ -205,7 +206,7 @@
205
  background-size: 0.9rem 0.9rem;
206
  }
207
 
208
- .callout-caption {
209
  display: flex
210
  }
211
 
@@ -218,16 +219,17 @@
218
  display: none !important;
219
  }
220
 
221
- .callout.callout-captioned .callout-body > .callout-content > :last-child {
222
- margin-bottom: 0.5rem;
 
223
  }
224
 
225
- .callout.callout-captioned .callout-icon::before {
226
  margin-top: .5rem;
227
  padding-right: .5rem;
228
  }
229
 
230
- .callout:not(.callout-captioned) .callout-icon::before {
231
  margin-top: 1rem;
232
  padding-right: .5rem;
233
  }
@@ -242,7 +244,7 @@
242
  background-image: url('');
243
  }
244
 
245
- div.callout-note.callout-style-default .callout-caption {
246
  background-color: #dae6fb
247
  }
248
 
@@ -254,7 +256,7 @@
254
  background-image: url('');
255
  }
256
 
257
- div.callout-important.callout-style-default .callout-caption {
258
  background-color: #f7dddc
259
  }
260
 
@@ -266,7 +268,7 @@
266
  background-image: url('');
267
  }
268
 
269
- div.callout-warning.callout-style-default .callout-caption {
270
  background-color: #fcefdc
271
  }
272
 
@@ -278,7 +280,7 @@
278
  background-image: url('');
279
  }
280
 
281
- div.callout-tip.callout-style-default .callout-caption {
282
  background-color: #ccf1e3
283
  }
284
 
@@ -290,7 +292,7 @@
290
  background-image: url('');
291
  }
292
 
293
- div.callout-caution.callout-style-default .callout-caption {
294
  background-color: #ffe5d0
295
  }
296
 
@@ -382,23 +384,18 @@
382
  margin-right: 0;
383
  }
384
  </style>
385
- <script src="Accelerate_files/libs/quarto-diagram/mermaid.min.js"></script>
386
- <script src="Accelerate_files/libs/quarto-diagram/mermaid-init.js"></script>
387
- <link href="Accelerate_files/libs/quarto-diagram/mermaid.css" rel="stylesheet">
388
  </head>
389
  <body class="quarto-dark">
390
  <div class="reveal">
391
  <div class="slides">
392
 
393
  <section id="title-slide" class="quarto-title-block center">
394
- <h1 class="title">Accelerate, Three Powerful Sublibraries for PyTorch</h1>
395
 
396
  <div class="quarto-title-authors">
397
- <div class="quarto-title-author">
398
- <div class="quarto-title-author-name">
399
- Zachary Mueller
400
- </div>
401
- </div>
402
  </div>
403
 
404
  </section>
@@ -406,324 +403,554 @@ Zachary Mueller
406
  <h2>Who am I?</h2>
407
  <ul>
408
  <li>Zachary Mueller</li>
409
- <li>Deep Learning Software Engineer at 🤗</li>
410
  <li>API design geek</li>
411
  </ul>
412
  </section>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  <section id="what-is-accelerate" class="slide level2">
414
  <h2>What is 🤗 Accelerate?</h2>
415
  <div class="cell" data-reveal="true" data-fig-height="6">
416
  <div class="cell-output-display">
417
  <div>
418
- <p>
419
- </p><pre class="mermaid mermaid-js" data-tooltip-selector="#mermaid-tooltip-1">graph LR
420
- A{"🤗 Accelerate#32;"}
421
- A --&gt; B["Launching&lt;br&gt;Interface#32;"]
422
  A --&gt; C["Training Library#32;"]
423
  A --&gt; D["Big Model&lt;br&gt;Inference#32;"]
424
  </pre>
425
- <div id="mermaid-tooltip-1" class="mermaidTooltip">
426
-
427
  </div>
428
- <p></p>
429
  </div>
430
  </div>
431
  </div>
432
  </section>
433
- <section>
434
- <section id="a-launching-interface" class="title-slide slide level1 center">
435
- <h1>A Launching Interface</h1>
436
- <p>Can’t I just use <code>python do_the_thing.py</code>?</p>
 
 
 
 
 
 
 
 
 
 
 
 
437
  </section>
438
- <section id="a-launching-interface-1" class="slide level2">
439
- <h2>A Launching Interface</h2>
440
- <p>Launching scripts in different environments is complicated:</p>
441
  <ul>
442
- <li><div class="sourceCode" id="cb1"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1"></a><span class="ex">python</span> script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
443
- <li><div class="sourceCode" id="cb2"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1"></a><span class="ex">torchrun</span> <span class="at">--nnodes</span><span class="op">=</span>1 <span class="at">--nproc_per_node</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
444
- <li><div class="sourceCode" id="cb3"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1"></a><span class="ex">deepspeed</span> <span class="at">--num_gpus</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
445
  </ul>
446
- <p>And more!</p>
447
  </section>
448
- <section id="a-launching-interface-2" class="slide level2">
449
- <h2>A Launching Interface</h2>
450
- <p>But it doesn’t have to be:</p>
451
- <div class="sourceCode" id="cb4"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1"></a><span class="ex">accelerate</span> launch script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
452
- <p>A single command to launch with <code>DeepSpeed</code>, Fully Sharded Data Parallelism, across single and multi CPUs and GPUs, and to train on TPUs<sup>1</sup> too!</p>
453
- <aside><ol class="aside-footnotes"><li id="fn1"><p>Without needing to modify your code and create a <code>_mp_fn</code></p></li></ol></aside></section>
454
- <section id="a-launching-interface-3" class="slide level2">
455
- <h2>A Launching Interface</h2>
456
- <p>Generate a device-specific configuration through <code>accelerate config</code></p>
457
-
458
- <img data-src="CLI.gif" class="r-stretch"></section>
459
- <section id="a-launching-interface-4" class="slide level2">
460
- <h2>A Launching Interface</h2>
461
- <p>Or don’t. <code>accelerate config</code> doesn’t <em>have</em> to be done!</p>
462
- <div class="sourceCode" id="cb5"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1"></a><span class="ex">torchrun</span> <span class="at">--nnodes</span><span class="op">=</span>1 <span class="at">--nproc_per_node</span><span class="op">=</span>2 script.py</span>
463
- <span id="cb5-2"><a href="#cb5-2"></a><span class="ex">accelerate</span> launch <span class="at">--multi_gpu</span> <span class="at">--nproc_per_node</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
464
- <p>A quick default configuration can be made too:</p>
465
- <div class="sourceCode" id="cb6"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1"></a><span class="ex">accelerate</span> config default</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
466
  </section>
467
- <section id="a-launching-interface-5" class="slide level2">
468
- <h2>A Launching Interface</h2>
469
- <p>With the <code>notebook_launcher</code> it’s also possible to launch code directly from your Jupyter environment too!</p>
470
- <div class="sourceCode" id="cb7"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1"></a><span class="im">from</span> accelerate <span class="im">import</span> notebook_launcher</span>
471
- <span id="cb7-2"><a href="#cb7-2"></a>notebook_launcher(</span>
472
- <span id="cb7-3"><a href="#cb7-3"></a> training_loop_function, </span>
473
- <span id="cb7-4"><a href="#cb7-4"></a> args, </span>
474
- <span id="cb7-5"><a href="#cb7-5"></a> num_processes<span class="op">=</span><span class="dv">2</span></span>
475
- <span id="cb7-6"><a href="#cb7-6"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
476
- <div class="sourceCode" id="cb8"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a>Launching training on <span class="dv">2</span> GPUs.</span>
477
- <span id="cb8-2"><a href="#cb8-2"></a>epoch <span class="dv">0</span>: <span class="fl">88.12</span></span>
478
- <span id="cb8-3"><a href="#cb8-3"></a>epoch <span class="dv">1</span>: <span class="fl">91.73</span></span>
479
- <span id="cb8-4"><a href="#cb8-4"></a>epoch <span class="dv">2</span>: <span class="fl">92.58</span></span>
480
- <span id="cb8-5"><a href="#cb8-5"></a>epoch <span class="dv">3</span>: <span class="fl">93.90</span></span>
481
- <span id="cb8-6"><a href="#cb8-6"></a>epoch <span class="dv">4</span>: <span class="fl">94.71</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  </section></section>
483
  <section>
484
  <section id="a-training-library" class="title-slide slide level1 center">
485
  <h1>A Training Library</h1>
486
- <p>Okay, will <code>accelerate launch</code> make <code>do_the_thing.py</code> use all my GPUs magically?</p>
487
- </section>
488
- <section id="a-training-library-1" class="slide level2">
489
- <h2>A Training Library</h2>
490
- <ul>
491
- <li>Just showed that it’s possible using <code>accelerate launch</code> to <em>launch</em> a Python script in various distributed environments</li>
492
- <li>This does <em>not</em> mean that the script will just “use” that code and still run on the new compute efficiently.</li>
493
- <li>Training on different computes often means <em>many</em> lines of code changed for each specific compute.</li>
494
- <li>🤗 <code>accelerate</code> solves this by ensuring the same code can be run on a CPU or GPU, multiples, and on TPUs!</li>
495
- </ul>
496
- </section>
497
- <section id="a-training-library-2" class="slide level2">
498
- <h2>A Training Library</h2>
499
- <div class="sourceCode" id="cb9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
500
- <span id="cb9-2"><a href="#cb9-2"></a> optimizer.zero_grad()</span>
501
- <span id="cb9-3"><a href="#cb9-3"></a> inputs, targets <span class="op">=</span> batch</span>
502
- <span id="cb9-4"><a href="#cb9-4"></a> inputs <span class="op">=</span> inputs.to(device)</span>
503
- <span id="cb9-5"><a href="#cb9-5"></a> targets <span class="op">=</span> targets.to(device)</span>
504
- <span id="cb9-6"><a href="#cb9-6"></a> outputs <span class="op">=</span> model(inputs)</span>
505
- <span id="cb9-7"><a href="#cb9-7"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
506
- <span id="cb9-8"><a href="#cb9-8"></a> loss.backward()</span>
507
- <span id="cb9-9"><a href="#cb9-9"></a> optimizer.step()</span>
508
- <span id="cb9-10"><a href="#cb9-10"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
509
  </section>
510
- <section id="a-training-library-3" class="slide level2 smaller">
511
- <h2>A Training Library</h2>
512
- <div class="columns">
513
- <div class="column" style="width:43%;">
514
  <p><br><br><br></p>
515
- <div class="sourceCode" id="cb10" data-code-line-numbers="5-6,9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1"></a><span class="co"># For alignment purposes</span></span>
516
- <span id="cb10-2"><a href="#cb10-2"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
517
- <span id="cb10-3"><a href="#cb10-3"></a> optimizer.zero_grad()</span>
518
- <span id="cb10-4"><a href="#cb10-4"></a> inputs, targets <span class="op">=</span> batch</span>
519
- <span id="cb10-5"><a href="#cb10-5"></a> inputs <span class="op">=</span> inputs.to(device)</span>
520
- <span id="cb10-6"><a href="#cb10-6"></a> targets <span class="op">=</span> targets.to(device)</span>
521
- <span id="cb10-7"><a href="#cb10-7"></a> outputs <span class="op">=</span> model(inputs)</span>
522
- <span id="cb10-8"><a href="#cb10-8"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
523
- <span id="cb10-9"><a href="#cb10-9"></a> loss.backward()</span>
524
- <span id="cb10-10"><a href="#cb10-10"></a> optimizer.step()</span>
525
- <span id="cb10-11"><a href="#cb10-11"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
526
- </div><div class="column" style="width:57%;">
527
- <div class="sourceCode" id="cb11" data-code-line-numbers="1-7,12-13,16"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1"></a><span class="im">from</span> accelerate <span class="im">import</span> Accelerator</span>
528
- <span id="cb11-2"><a href="#cb11-2"></a>accelerator <span class="op">=</span> Accelerator()</span>
529
- <span id="cb11-3"><a href="#cb11-3"></a>dataloader, model, optimizer, scheduler <span class="op">=</span> (</span>
530
- <span id="cb11-4"><a href="#cb11-4"></a> accelerator.prepare(</span>
531
- <span id="cb11-5"><a href="#cb11-5"></a> dataloader, model, optimizer, scheduler</span>
532
- <span id="cb11-6"><a href="#cb11-6"></a> )</span>
533
- <span id="cb11-7"><a href="#cb11-7"></a>)</span>
534
- <span id="cb11-8"><a href="#cb11-8"></a></span>
535
- <span id="cb11-9"><a href="#cb11-9"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
536
- <span id="cb11-10"><a href="#cb11-10"></a> optimizer.zero_grad()</span>
537
- <span id="cb11-11"><a href="#cb11-11"></a> inputs, targets <span class="op">=</span> batch</span>
538
- <span id="cb11-12"><a href="#cb11-12"></a> <span class="co"># inputs = inputs.to(device)</span></span>
539
- <span id="cb11-13"><a href="#cb11-13"></a> <span class="co"># targets = targets.to(device)</span></span>
540
- <span id="cb11-14"><a href="#cb11-14"></a> outputs <span class="op">=</span> model(inputs)</span>
541
- <span id="cb11-15"><a href="#cb11-15"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
542
- <span id="cb11-16"><a href="#cb11-16"></a> accelerator.backward(loss) <span class="co"># loss.backward()</span></span>
543
- <span id="cb11-17"><a href="#cb11-17"></a> optimizer.step()</span>
544
- <span id="cb11-18"><a href="#cb11-18"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
545
  </div>
546
  </div>
547
  </section>
548
- <section id="a-training-library-4" class="slide level2">
549
- <h2>A Training Library</h2>
550
- <p>What all happened in <code>Accelerator.prepare</code>?</p>
551
- <div>
552
- <ol type="1">
553
- <li class="fragment"><code>Accelerator</code> looked at the configuration</li>
554
- <li class="fragment">The <code>dataloader</code> was converted into one that can dispatch each batch onto a separate GPU</li>
555
- <li class="fragment">The <code>model</code> was wrapped with the appropriate DDP wrapper from either <code>torch.distributed</code> or <code>torch_xla</code></li>
556
- <li class="fragment">The <code>optimizer</code> and <code>scheduler</code> were both converted into an <code>AcceleratedOptimizer</code> and <code>AcceleratedScheduler</code> which knows how to handle any distributed scenario</li>
557
- </ol>
558
- </div>
559
  </section>
560
  <section id="a-training-library-mixed-precision" class="slide level2">
561
- <h2>A Training Library, Mixed Precision</h2>
562
- <p>🤗 <code>accelerate</code> also supports <em>automatic mixed precision</em>.</p>
563
- <p>Through a single flag to the <code>Accelerator</code> object when calling <code>accelerator.backward()</code> the mixed precision of your choosing (such as <code>bf16</code> or <code>fp16</code>) will be applied:</p>
564
- <div class="sourceCode" id="cb12" data-code-line-numbers="2,9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb12-1"><a href="#cb12-1"></a><span class="im">from</span> accelerate <span class="im">import</span> Accelerator</span>
565
- <span id="cb12-2"><a href="#cb12-2"></a>accelerator <span class="op">=</span> Accelerator(mixed_precision<span class="op">=</span><span class="st">"fp16"</span>)</span>
566
- <span id="cb12-3"><a href="#cb12-3"></a>...</span>
567
- <span id="cb12-4"><a href="#cb12-4"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
568
- <span id="cb12-5"><a href="#cb12-5"></a> optimizer.zero_grad()</span>
569
- <span id="cb12-6"><a href="#cb12-6"></a> inputs, targets <span class="op">=</span> batch</span>
570
- <span id="cb12-7"><a href="#cb12-7"></a> outputs <span class="op">=</span> model(inputs)</span>
571
- <span id="cb12-8"><a href="#cb12-8"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
572
- <span id="cb12-9"><a href="#cb12-9"></a> accelerator.backward(loss)</span>
573
- <span id="cb12-10"><a href="#cb12-10"></a> optimizer.step()</span>
574
- <span id="cb12-11"><a href="#cb12-11"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
575
- </section>
576
- <section id="a-training-library-gradient-accumulation" class="slide level2">
577
- <h2>A Training Library, Gradient Accumulation</h2>
578
- <p>Gradient accumulation in distributed setups often needs extra care to ensure gradients are aligned when they need to be and the backward pass is computationally efficient.</p>
579
- <p>🤗 <code>accelerate</code> can just easily handle this for you:</p>
580
- <div class="sourceCode" id="cb13" data-code-line-numbers="2,5"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1"></a><span class="im">from</span> accelerate <span class="im">import</span> Accelerator</span>
581
- <span id="cb13-2"><a href="#cb13-2"></a>accelerator <span class="op">=</span> Accelerator(gradient_accumulation_steps<span class="op">=</span><span class="dv">4</span>)</span>
582
- <span id="cb13-3"><a href="#cb13-3"></a>...</span>
583
- <span id="cb13-4"><a href="#cb13-4"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
584
- <span id="cb13-5"><a href="#cb13-5"></a> <span class="cf">with</span> accelerator.accumulate(model):</span>
585
- <span id="cb13-6"><a href="#cb13-6"></a> optimizer.zero_grad()</span>
586
- <span id="cb13-7"><a href="#cb13-7"></a> inputs, targets <span class="op">=</span> batch</span>
587
- <span id="cb13-8"><a href="#cb13-8"></a> outputs <span class="op">=</span> model(inputs)</span>
588
- <span id="cb13-9"><a href="#cb13-9"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
589
- <span id="cb13-10"><a href="#cb13-10"></a> accelerator.backward(loss)</span>
590
- <span id="cb13-11"><a href="#cb13-11"></a> optimizer.step()</span>
591
- <span id="cb13-12"><a href="#cb13-12"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
592
- </section>
593
- <section id="a-training-library-gradient-accumulation-1" class="slide level2">
594
- <h2>A Training Library, Gradient Accumulation</h2>
595
- <div class="sourceCode" id="cb14" data-code-line-numbers="5-7,10,11,12,15"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb14-1"><a href="#cb14-1"></a>ddp_model, dataloader <span class="op">=</span> accelerator.prepare(model, dataloader)</span>
596
- <span id="cb14-2"><a href="#cb14-2"></a></span>
597
- <span id="cb14-3"><a href="#cb14-3"></a><span class="cf">for</span> index, batch <span class="kw">in</span> <span class="bu">enumerate</span>(dataloader):</span>
598
- <span id="cb14-4"><a href="#cb14-4"></a> inputs, targets <span class="op">=</span> batch</span>
599
- <span id="cb14-5"><a href="#cb14-5"></a> <span class="cf">if</span> index <span class="op">!=</span> (<span class="bu">len</span>(dataloader)<span class="op">-</span><span class="dv">1</span>) <span class="kw">or</span> (index <span class="op">%</span> <span class="dv">4</span>) <span class="op">!=</span> <span class="dv">0</span>:</span>
600
- <span id="cb14-6"><a href="#cb14-6"></a> <span class="co"># Gradients don't sync</span></span>
601
- <span id="cb14-7"><a href="#cb14-7"></a> <span class="cf">with</span> accelerator.no_sync(model):</span>
602
- <span id="cb14-8"><a href="#cb14-8"></a> outputs <span class="op">=</span> ddp_model(inputs)</span>
603
- <span id="cb14-9"><a href="#cb14-9"></a> loss <span class="op">=</span> loss_func(outputs, targets)</span>
604
- <span id="cb14-10"><a href="#cb14-10"></a> accelerator.backward(loss)</span>
605
- <span id="cb14-11"><a href="#cb14-11"></a> <span class="cf">else</span>:</span>
606
- <span id="cb14-12"><a href="#cb14-12"></a> <span class="co"># Gradients finally sync</span></span>
607
- <span id="cb14-13"><a href="#cb14-13"></a> outputs <span class="op">=</span> ddp_model(inputs)</span>
608
- <span id="cb14-14"><a href="#cb14-14"></a> loss <span class="op">=</span> loss_func(outputs, targets)</span>
609
- <span id="cb14-15"><a href="#cb14-15"></a> accelerator.backward(loss)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
610
- </section></section>
611
- <section>
612
- <section id="big-model-inference" class="title-slide slide level1 center">
613
- <h1>Big Model Inference</h1>
614
- <p>Stable Diffusion taking the world by storm</p>
615
- </section>
616
- <section id="bigger-models-higher-compute" class="slide level2">
617
- <h2>Bigger Models == Higher Compute</h2>
618
- <p>As more large models were being released, Hugging Face quickly realized there must be a way to continue our decentralization of Machine Learning and have the day-to-day programmer be able to leverage these big models.</p>
619
- <p>Born out of this effort by Sylvain Gugger:</p>
620
- <p>🤗 Accelerate: Big Model Inference.</p>
621
  </section>
622
- <section id="the-basic-premise" class="slide level2">
623
- <h2>The Basic Premise</h2>
624
- <div>
625
  <ul>
626
- <li class="fragment"><p>In PyTorch, there exists the <code>meta</code> device.</p></li>
627
- <li class="fragment"><p>Super small footprint to load in huge models quickly by not loading in their weights immediately.</p></li>
628
- <li class="fragment"><p>As an input gets passed through each layer, we can load and unload <em>parts</em> of the PyTorch model quickly so that only a small portion of the big model is loaded in at a single time.</p></li>
629
- <li class="fragment"><p>The end result? Stable Diffusion v1 can be run on &lt; 800mb of vRAM</p></li>
630
  </ul>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
632
  </section>
633
- <section id="the-code" class="slide level2">
634
- <h2>The Code</h2>
635
- <p>Generally you start with something like so:</p>
636
- <div class="sourceCode" id="cb15"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb15-1"><a href="#cb15-1"></a><span class="im">import</span> torch</span>
637
- <span id="cb15-2"><a href="#cb15-2"></a></span>
638
- <span id="cb15-3"><a href="#cb15-3"></a>my_model <span class="op">=</span> ModelClass(...)</span>
639
- <span id="cb15-4"><a href="#cb15-4"></a>state_dict <span class="op">=</span> torch.load(checkpoint_file)</span>
640
- <span id="cb15-5"><a href="#cb15-5"></a>my_model.load_state_dict(state_dict)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
641
- <p>But this has issues:</p>
642
- <ol type="1">
643
- <li>The full version of the model is loaded at <code>3</code></li>
644
- <li>Another version of the model is loaded into memory at <code>4</code></li>
645
- </ol>
646
- <p>If a 6 <em>billion</em> parameter model is being loaded, each model class has a dictionary of 24GB so 48GB of vRAM is needed</p>
647
- </section>
648
- <section id="empty-model-weights" class="slide level2">
649
- <h2>Empty Model Weights</h2>
650
- <p>We can fix step 1 by loading in an empty model skeleton at first:</p>
651
- <div class="sourceCode" id="cb16" data-code-line-numbers="1,3-4"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb16-1"><a href="#cb16-1"></a><span class="im">from</span> accelerate <span class="im">import</span> init_empty_weights</span>
652
- <span id="cb16-2"><a href="#cb16-2"></a></span>
653
- <span id="cb16-3"><a href="#cb16-3"></a><span class="cf">with</span> init_empty_weights():</span>
654
- <span id="cb16-4"><a href="#cb16-4"></a> my_model <span class="op">=</span> ModelClass(...)</span>
655
- <span id="cb16-5"><a href="#cb16-5"></a>state_dict <span class="op">=</span> torch.load(checkpoint_file)</span>
656
- <span id="cb16-6"><a href="#cb16-6"></a>my_model.load_state_dict(state_dict)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
657
- <div class="callout callout-important callout-captioned callout-style-default">
658
- <div class="callout-body">
659
- <div class="callout-caption">
660
- <div class="callout-icon-container">
661
- <i class="callout-icon"></i>
662
- </div>
663
- <p><strong>This code will not run</strong></p>
664
- </div>
665
- <div class="callout-content">
666
- <p>It is likely that just calling <code>my_model(x)</code> will fail as not all tensor operations are supported on the <code>meta</code> device.</p>
667
- </div>
668
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669
  </div>
 
670
  </section>
671
- <section id="sharded-checkpoints---the-concept" class="slide level2">
672
- <h2>Sharded Checkpoints - The Concept</h2>
673
- <p>The next step is to have “Sharded Checkpoints” saved for your model.</p>
674
- <p>Basically smaller chunks of your model weights stored that can be brought in at any particular time.</p>
675
- <p>This reduces the amount of memory step 2 takes in since we can just load in a “chunk” of the model at a time, then swap it out for a new chunk through PyTorch hooks</p>
676
- </section>
677
- <section id="sharded-checkpoints---the-code" class="slide level2">
678
- <h2>Sharded Checkpoints - The Code</h2>
679
- <div class="sourceCode" id="cb17" data-code-line-numbers="1,6-8"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb17-1"><a href="#cb17-1"></a><span class="im">from</span> accelerate <span class="im">import</span> init_empty_weights, load_checkpoint_and_dispatch</span>
680
- <span id="cb17-2"><a href="#cb17-2"></a></span>
681
- <span id="cb17-3"><a href="#cb17-3"></a><span class="cf">with</span> init_empty_weights():</span>
682
- <span id="cb17-4"><a href="#cb17-4"></a> my_model <span class="op">=</span> ModelClass(...)</span>
683
- <span id="cb17-5"><a href="#cb17-5"></a></span>
684
- <span id="cb17-6"><a href="#cb17-6"></a>my_model <span class="op">=</span> load_checkpoint_and_dispatch(</span>
685
- <span id="cb17-7"><a href="#cb17-7"></a> my_model, <span class="st">"sharded-weights"</span>, device_map<span class="op">=</span><span class="st">"auto"</span></span>
686
- <span id="cb17-8"><a href="#cb17-8"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
687
- <p><code>device_map="auto"</code> will tell 🤗 Accelerate that it should determine where to put each layer of the model:</p>
688
- <ol type="1">
689
- <li>Maximum space on the GPU(s)</li>
690
- <li>Maximum space on the CPU(s)</li>
691
- <li>Utilize disk space through memory-mapped tensors</li>
692
- </ol>
693
- </section>
694
- <section id="big-model-inference-put-together" class="slide level2">
695
- <h2>Big Model Inference Put Together</h2>
696
- <div class="sourceCode" id="cb18"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb18-1"><a href="#cb18-1"></a><span class="im">from</span> accelerate <span class="im">import</span> init_empty_weights, load_checkpoint_and_dispatch</span>
697
- <span id="cb18-2"><a href="#cb18-2"></a></span>
698
- <span id="cb18-3"><a href="#cb18-3"></a><span class="cf">with</span> init_empty_weights():</span>
699
- <span id="cb18-4"><a href="#cb18-4"></a> my_model <span class="op">=</span> ModelClass(...)</span>
700
- <span id="cb18-5"><a href="#cb18-5"></a></span>
701
- <span id="cb18-6"><a href="#cb18-6"></a>my_model <span class="op">=</span> load_checkpoint_and_dispatch(</span>
702
- <span id="cb18-7"><a href="#cb18-7"></a> my_model, <span class="st">"sharded-weights"</span>, device_map<span class="op">=</span><span class="st">"auto"</span></span>
703
- <span id="cb18-8"><a href="#cb18-8"></a>)</span>
704
- <span id="cb18-9"><a href="#cb18-9"></a>my_model.<span class="bu">eval</span>()</span>
705
- <span id="cb18-10"><a href="#cb18-10"></a></span>
706
- <span id="cb18-11"><a href="#cb18-11"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
707
- <span id="cb18-12"><a href="#cb18-12"></a> output <span class="op">=</span> my_model(batch)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
708
- </section>
709
- <section id="is-there-an-easier-way" class="slide level2">
710
- <h2>Is there an easier way?</h2>
711
- <p>The <code>transformers</code> library combined with the Hub makes all this code wrapping much easier for you with the <code>pipeline</code></p>
712
- <div class="sourceCode" id="cb19"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb19-1"><a href="#cb19-1"></a><span class="im">import</span> torch</span>
713
- <span id="cb19-2"><a href="#cb19-2"></a><span class="im">from</span> transformers <span class="im">import</span> pipeline</span>
714
- <span id="cb19-3"><a href="#cb19-3"></a>pipe <span class="op">=</span> pipeline(</span>
715
- <span id="cb19-4"><a href="#cb19-4"></a> task<span class="op">=</span><span class="st">"text-generation"</span>,</span>
716
- <span id="cb19-5"><a href="#cb19-5"></a> model<span class="op">=</span><span class="st">"EleutherAI/gpt-j-6B"</span>,</span>
717
- <span id="cb19-6"><a href="#cb19-6"></a> device_map<span class="op">=</span><span class="st">"auto"</span>,</span>
718
- <span id="cb19-7"><a href="#cb19-7"></a> torch_dtype<span class="op">=</span>torch.float16</span>
719
- <span id="cb19-8"><a href="#cb19-8"></a>)</span>
720
- <span id="cb19-9"><a href="#cb19-9"></a></span>
721
- <span id="cb19-10"><a href="#cb19-10"></a>text <span class="op">=</span> pipe(<span class="st">"This is some generated text, I think"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
722
- </section></section>
723
- <section>
724
- <section id="what-about-stable-diffusion" class="title-slide slide level1 center">
725
- <h1>What about Stable Diffusion?</h1>
726
- <p>A demo with <code>diffusers</code> &amp; Weights and Biases</p>
727
  </section>
728
  <section id="some-handy-resources" class="slide level2">
729
  <h2>Some Handy Resources</h2>
@@ -735,29 +962,29 @@ Zachary Mueller
735
  <li><a href="https://huggingface.co/docs/accelerate/usage_guides/big_modeling">Big Model Inference tutorial</a></li>
736
  <li><a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed">DeepSpeed and 🤗 Accelerate</a></li>
737
  <li><a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp">Fully Sharded Data Parallelism and 🤗 Accelerate</a></li>
 
738
  </ul>
739
  <div class="footer footer-default">
740
 
741
  </div>
742
  </section></section>
743
-
744
  </div>
745
  </div>
746
 
747
  <script>window.backupDefine = window.define; window.define = undefined;</script>
748
- <script src="Accelerate_files/libs/revealjs/dist/reveal.js"></script>
749
  <!-- reveal.js plugins -->
750
- <script src="Accelerate_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.js"></script>
751
- <script src="Accelerate_files/libs/revealjs/plugin/pdf-export/pdfexport.js"></script>
752
- <script src="Accelerate_files/libs/revealjs/plugin/reveal-menu/menu.js"></script>
753
- <script src="Accelerate_files/libs/revealjs/plugin/reveal-menu/quarto-menu.js"></script>
754
- <script src="Accelerate_files/libs/revealjs/plugin/quarto-support/support.js"></script>
755
 
756
 
757
- <script src="Accelerate_files/libs/revealjs/plugin/notes/notes.js"></script>
758
- <script src="Accelerate_files/libs/revealjs/plugin/search/search.js"></script>
759
- <script src="Accelerate_files/libs/revealjs/plugin/zoom/zoom.js"></script>
760
- <script src="Accelerate_files/libs/revealjs/plugin/math/math.js"></script>
761
  <script>window.define = window.backupDefine; window.backupDefine = undefined;</script>
762
 
763
  <script>
@@ -767,12 +994,11 @@ Zachary Mueller
767
  Reveal.initialize({
768
  'controlsAuto': true,
769
  'previewLinksAuto': false,
770
- 'smaller': false,
771
  'pdfSeparateFragments': false,
772
  'autoAnimateEasing': "ease",
773
  'autoAnimateDuration': 1,
774
  'autoAnimateUnmatched': true,
775
- 'menu': {"side":"left","useTextContentForMissingTitles":true,"markers":false,"loadIcons":false,"custom":[{"title":"Tools","icon":"<i class=\"fas fa-gear\"></i>","content":"<ul class=\"slide-menu-items\">\n<li class=\"slide-tool-item active\" data-item=\"0\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.fullscreen(event)\"><kbd>f</kbd> Fullscreen</a></li>\n<li class=\"slide-tool-item\" data-item=\"1\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.speakerMode(event)\"><kbd>s</kbd> Speaker View</a></li>\n<li class=\"slide-tool-item\" data-item=\"2\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.overview(event)\"><kbd>o</kbd> Slide Overview</a></li>\n<li class=\"slide-tool-item\" data-item=\"3\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.overview(event)\"><kbd>e</kbd> PDF Export Mode</a></li>\n<li class=\"slide-tool-item\" data-item=\"4\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.keyboardHelp(event)\"><kbd>?</kbd> Keyboard Help</a></li>\n</ul>"}],"openButton":true},
776
  'smaller': false,
777
 
778
  // Display controls in the bottom right corner
@@ -976,9 +1202,23 @@ Zachary Mueller
976
  tabsets.forEach(function(tabset) {
977
  const tabby = new Tabby('#' + tabset.id);
978
  });
 
 
 
 
 
 
 
 
979
  const clipboard = new window.ClipboardJS('.code-copy-button', {
980
- target: function(trigger) {
981
- return trigger.previousElementSibling;
 
 
 
 
 
 
982
  }
983
  });
984
  clipboard.on('success', function(e) {
@@ -1014,10 +1254,9 @@ Zachary Mueller
1014
  // clear code selection
1015
  e.clearSelection();
1016
  });
1017
- function tippyHover(el, contentFn) {
1018
  const config = {
1019
  allowHTML: true,
1020
- content: contentFn,
1021
  maxWidth: 500,
1022
  delay: 100,
1023
  arrow: false,
@@ -1026,9 +1265,18 @@ Zachary Mueller
1026
  },
1027
  interactive: true,
1028
  interactiveBorder: 10,
1029
- theme: 'quarto-reveal',
1030
- placement: 'bottom-start'
1031
  };
 
 
 
 
 
 
 
 
 
1032
  config['offset'] = [0,0];
1033
  config['maxWidth'] = 700;
1034
  window.tippy(el, config);
 
1
  <!DOCTYPE html>
2
  <html lang="en"><head>
3
+ <script src="llm_conf_files/libs/clipboard/clipboard.min.js"></script>
4
+ <script src="llm_conf_files/libs/quarto-html/tabby.min.js"></script>
5
+ <script src="llm_conf_files/libs/quarto-html/popper.min.js"></script>
6
+ <script src="llm_conf_files/libs/quarto-html/tippy.umd.min.js"></script>
7
+ <link href="llm_conf_files/libs/quarto-html/tippy.css" rel="stylesheet">
8
+ <link href="llm_conf_files/libs/quarto-html/light-border.css" rel="stylesheet">
9
+ <link href="llm_conf_files/libs/quarto-html/quarto-html.min.css" rel="stylesheet" data-mode="light">
10
+ <link href="llm_conf_files/libs/quarto-html/quarto-syntax-highlighting-dark.css" rel="stylesheet" id="quarto-text-highlighting-styles"><meta charset="utf-8">
11
+ <meta name="generator" content="quarto-99.9.9">
12
+
13
+ <title>Scaling Model Training with More Compute, How Do They Do It?</title>
14
  <meta name="apple-mobile-web-app-capable" content="yes">
15
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
16
  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
17
+ <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/reset.css">
18
+ <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/reveal.css">
19
  <style>
20
  code{white-space: pre-wrap;}
21
  span.smallcaps{font-variant: small-caps;}
 
25
  ul.task-list{list-style: none;}
26
  ul.task-list li input[type="checkbox"] {
27
  width: 0.8em;
28
+ margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
29
  vertical-align: middle;
30
  }
31
+ /* CSS for syntax highlighting */
32
  pre > code.sourceCode { white-space: pre; position: relative; }
33
+ pre > code.sourceCode > span { line-height: 1.25; }
34
  pre > code.sourceCode > span:empty { height: 1.2em; }
35
  .sourceCode { overflow: visible; }
36
  code.sourceCode > span { color: inherit; text-decoration: inherit; }
 
94
  code span.vs { color: #abe338; } /* VerbatimString */
95
  code span.wa { color: #dcc6e0; } /* Warning */
96
  </style>
97
+ <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/theme/quarto.css">
98
+ <link href="llm_conf_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.css" rel="stylesheet">
99
+ <link href="llm_conf_files/libs/revealjs/plugin/reveal-menu/menu.css" rel="stylesheet">
100
+ <link href="llm_conf_files/libs/revealjs/plugin/reveal-menu/quarto-menu.css" rel="stylesheet">
101
+ <link href="llm_conf_files/libs/revealjs/plugin/quarto-support/footer.css" rel="stylesheet">
102
  <style type="text/css">
103
 
104
  .callout {
 
137
  font-weight: 400;
138
  }
139
 
140
+ .callout.callout-titled.callout-style-simple .callout-body {
141
  margin-top: 0.2em;
142
  }
143
 
144
+ .callout:not(.callout-titled) .callout-body {
145
  display: flex;
146
  }
147
 
148
+ .callout:not(.no-icon).callout-titled.callout-style-simple .callout-content {
149
  padding-left: 1.6em;
150
  }
151
 
152
+ .callout.callout-titled .callout-header {
153
  padding-top: 0.2em;
154
  margin-bottom: -0.2em;
155
  }
156
 
157
+ .callout.callout-titled .callout-title p {
158
  margin-top: 0.5em;
159
  margin-bottom: 0.5em;
160
  }
161
 
162
+ .callout.callout-titled.callout-style-simple .callout-content p {
163
  margin-top: 0;
164
  }
165
 
166
+ .callout.callout-titled.callout-style-default .callout-content p {
167
  margin-top: 0.7em;
168
  }
169
 
170
+ .callout.callout-style-simple div.callout-title {
171
  border-bottom: none;
172
  font-size: .9rem;
173
  font-weight: 600;
174
  opacity: 75%;
175
  }
176
 
177
+ .callout.callout-style-default div.callout-title {
178
  border-bottom: none;
179
  font-weight: 600;
180
  opacity: 85%;
 
206
  background-size: 0.9rem 0.9rem;
207
  }
208
 
209
+ .callout-title {
210
  display: flex
211
  }
212
 
 
219
  display: none !important;
220
  }
221
 
222
+ .callout.callout-titled .callout-body > .callout-content > :last-child {
223
+ padding-bottom: 0.5rem;
224
+ margin-bottom: 0;
225
  }
226
 
227
+ .callout.callout-titled .callout-icon::before {
228
  margin-top: .5rem;
229
  padding-right: .5rem;
230
  }
231
 
232
+ .callout:not(.callout-titled) .callout-icon::before {
233
  margin-top: 1rem;
234
  padding-right: .5rem;
235
  }
 
244
  background-image: url('');
245
  }
246
 
247
+ div.callout-note.callout-style-default .callout-title {
248
  background-color: #dae6fb
249
  }
250
 
 
256
  background-image: url('');
257
  }
258
 
259
+ div.callout-important.callout-style-default .callout-title {
260
  background-color: #f7dddc
261
  }
262
 
 
268
  background-image: url('');
269
  }
270
 
271
+ div.callout-warning.callout-style-default .callout-title {
272
  background-color: #fcefdc
273
  }
274
 
 
280
  background-image: url('');
281
  }
282
 
283
+ div.callout-tip.callout-style-default .callout-title {
284
  background-color: #ccf1e3
285
  }
286
 
 
292
  background-image: url('');
293
  }
294
 
295
+ div.callout-caution.callout-style-default .callout-title {
296
  background-color: #ffe5d0
297
  }
298
 
 
384
  margin-right: 0;
385
  }
386
  </style>
387
+ <script src="llm_conf_files/libs/quarto-diagram/mermaid.min.js"></script>
388
+ <script src="llm_conf_files/libs/quarto-diagram/mermaid-init.js"></script>
389
+ <link href="llm_conf_files/libs/quarto-diagram/mermaid.css" rel="stylesheet">
390
  </head>
391
  <body class="quarto-dark">
392
  <div class="reveal">
393
  <div class="slides">
394
 
395
  <section id="title-slide" class="quarto-title-block center">
396
+ <h1 class="title">Scaling Model Training with More Compute, How Do They Do It?</h1>
397
 
398
  <div class="quarto-title-authors">
 
 
 
 
 
399
  </div>
400
 
401
  </section>
 
403
  <h2>Who am I?</h2>
404
  <ul>
405
  <li>Zachary Mueller</li>
406
+ <li>Technical Lead for the 🤗 Accelerate project</li>
407
  <li>API design geek</li>
408
  </ul>
409
  </section>
410
+ <section id="understanding-gpu-usage" class="slide level2">
411
+ <h2>Understanding GPU Usage</h2>
412
+ <ul>
413
+ <li>We can somewhat estimate the memory usage in vanilla full-fine-tuning of models</li>
414
+ <li>Requires certain assumptions (that I’ll be covering):
415
+ <ul>
416
+ <li>Adam optimizer</li>
417
+ <li>Batch size of 1</li>
418
+ </ul></li>
419
+ </ul>
420
+ </section>
421
+ <section id="understanding-gpu-usage-1" class="slide level2">
422
+ <h2>Understanding GPU Usage</h2>
423
+ <p>General estimate (<code>bert-base-cased</code>, 108M params):</p>
424
+ <ul>
425
+ <li>Each parameter is 4 bytes</li>
426
+ <li>Backward ~= 2x the model size</li>
427
+ <li>The optimizer step ~= 4x the model size (1x model, 1x gradients, 2x optimizer):</li>
428
+ </ul>
429
+ <div style="font-size: 50%;">
430
+ <table>
431
+ <thead>
432
+ <tr class="header">
433
+ <th>dtype</th>
434
+ <th style="text-align: left;">Model</th>
435
+ <th style="text-align: center;">Gradients</th>
436
+ <th style="text-align: center;">Backward pass</th>
437
+ <th style="text-align: center;">Optimizer step</th>
438
+ <th style="text-align: center;">Highest</th>
439
+ </tr>
440
+ </thead>
441
+ <tbody>
442
+ <tr class="odd">
443
+ <td>float32</td>
444
+ <td style="text-align: left;">413.18 MB</td>
445
+ <td style="text-align: center;">413.18 MB</td>
446
+ <td style="text-align: center;">826.36 MB</td>
447
+ <td style="text-align: center;">1.61 GB</td>
448
+ <td style="text-align: center;">1.61 GB</td>
449
+ </tr>
450
+ <tr class="even">
451
+ <td>float16</td>
452
+ <td style="text-align: left;">413.18 MB*</td>
453
+ <td style="text-align: center;">619.77 MB</td>
454
+ <td style="text-align: center;">826.36 MB</td>
455
+ <td style="text-align: center;">826.36 MB</td>
456
+ <td style="text-align: center;">826.36 MB</td>
457
+ </tr>
458
+ </tbody>
459
+ </table>
460
+ <p>*All estimations were based off the <a href="https://huggingface.co/spaces/hf-accelerate/model-memory-usage">Model Estimator Tool</a></p>
461
+ </div>
462
+ </section>
463
+ <section id="understanding-gpu-usage-2" class="slide level2">
464
+ <h2>Understanding GPU Usage</h2>
465
+ <p>This works fine for small models; we have cards with anywhere from 12-24GB of GPU memory (on the GPU-poor side).</p>
466
+ <p>But what happens as we scale?</p>
467
+ <p>Here’s <code>llama-3-8B</code> (8.03B parameters)</p>
468
+ <div style="font-size: 50%;">
469
+ <table>
470
+ <thead>
471
+ <tr class="header">
472
+ <th>dtype</th>
473
+ <th style="text-align: left;">Model</th>
474
+ <th style="text-align: center;">Gradients</th>
475
+ <th style="text-align: center;">Backward pass</th>
476
+ <th style="text-align: center;">Optimizer step</th>
477
+ <th style="text-align: center;">Highest</th>
478
+ </tr>
479
+ </thead>
480
+ <tbody>
481
+ <tr class="odd">
482
+ <td>float32</td>
483
+ <td style="text-align: left;">28.21 GB</td>
484
+ <td style="text-align: center;">28.21 GB</td>
485
+ <td style="text-align: center;">56.43 GB</td>
486
+ <td style="text-align: center;">112.84 GB</td>
487
+ <td style="text-align: center;">112.84 GB</td>
488
+ </tr>
489
+ <tr class="even">
490
+ <td>float16</td>
491
+ <td style="text-align: left;">28.21 GB*</td>
492
+ <td style="text-align: center;">42.32 GB</td>
493
+ <td style="text-align: center;">56.43 GB</td>
494
+ <td style="text-align: center;">56.43 GB</td>
495
+ <td style="text-align: center;">56.43 GB</td>
496
+ </tr>
497
+ </tbody>
498
+ </table>
499
+ </div>
500
+ <p>Well, <em>I</em> don’t have 56GB of GPU memory in a single card, let alone 112GB.</p>
501
+ <p>What can we do?</p>
502
+ </section>
503
+ <section>
504
+ <section id="distributed-training" class="title-slide slide level1 center">
505
+ <h1>Distributed Training</h1>
506
+
507
+ </section>
508
+ <section id="kinds-of-training" class="slide level2">
509
+ <h2>Kinds of Training</h2>
510
+ <ul>
511
+ <li>Single GPU:
512
+ <ul>
513
+ <li>No distributed techniques at play</li>
514
+ </ul></li>
515
+ <li>DDP:
516
+ <ul>
517
+ <li>A full copy of the model exists on each device, but data is chunked between each GPU</li>
518
+ </ul></li>
519
+ <li>FSDP &amp; DeepSpeed:
520
+ <ul>
521
+ <li>Split chunks of the model and optimizer states across GPUs, allowing for training bigger models on smaller (multiple) GPUs</li>
522
+ </ul></li>
523
+ </ul>
524
+ </section></section>
525
+ <section>
526
+ <section id="fully-sharded-data-parallelism" class="title-slide slide level1 center">
527
+ <h1>Fully Sharded Data Parallelism</h1>
528
+
529
+ </section>
530
+ <section id="fully-sharded-data-parallelism-1" class="slide level2">
531
+ <h2>Fully Sharded Data Parallelism</h2>
532
+
533
+ <img data-src="fsdp.png" id="fig-539a35d47e664c97a50115a146a7f1bd-1" class="r-stretch quarto-figure-center"><aside class="notes">
534
+ <ul>
535
+ <li>Take the model and split it across <code>n</code> GPUs</li>
536
+ <li>Each GPU computes the shard’s gradients</li>
537
+ <li>At the end, all gradients are synchronized and the final full model gradient is calculated</li>
538
+ <li>The backward pass can then be performed</li>
539
+ </ul>
540
+ <style type="text/css">
541
+ span.MJX_Assistive_MathML {
542
+ position:absolute!important;
543
+ clip: rect(1px, 1px, 1px, 1px);
544
+ padding: 1px 0 0 0!important;
545
+ border: 0!important;
546
+ height: 1px!important;
547
+ width: 1px!important;
548
+ overflow: hidden!important;
549
+ display:block!important;
550
+ }</style></aside>
551
+ </section>
552
+ <section id="fsdp-getting-parameter-specific" class="slide level2">
553
+ <h2>FSDP: Getting parameter specific</h2>
554
+ <ul>
555
+ <li>Different parameters can dictate how much memory is needed for total GPU training across multiple GPUs</li>
556
+ <li>These include how model weights are sharded, gradients, and more.</li>
557
+ <li>I’ll cover some important ones I needed when doing a Full-Fine-Tune of Llama-3-8B <em>without PEFT</em> on 2x4090’s</li>
558
+ </ul>
559
+ </section>
560
+ <section id="sharding_strategy" class="slide level2">
561
+ <h2><code>sharding_strategy</code></h2>
562
+ <ul>
563
+ <li>Dictates the level of divvying resources to perform
564
+ <ul>
565
+ <li><code>FULL_SHARD</code>: Includes optimizer states, gradients, and parameters</li>
566
+ <li><code>SHARD_GRAD_OP</code>: Includes optimizer states and gradients</li>
567
+ <li><code>NO_SHARD</code>: Normal DDP</li>
568
+ <li><code>HYBRID_SHARD</code>: Includes optimizer states, gradients, and parameters but each node has the full model</li>
569
+ </ul>
570
+ <aside class="notes">
571
+ <pre><code>FULL_SHARD:
572
+ Parameters, Gradients, Optimizer States: All are sharded.
573
+ Parameters Handling: Unshard before forward pass, reshard after forward pass, unshard before backward pass, reshard after backward pass.
574
+ Gradients Handling: Synchronize and shard after backward pass.
575
+ Optimizer States: Updated locally per rank.</code></pre>
576
+ <p>SHARD_GRAD_OP: Gradients and Optimizer States: Sharded during computation. Parameters: Unshard before forward pass, remain unsharded during forward pass, reshard after backward pass. Inside no_sync(): Parameters are not resharded after backward computation. Optimizer States: Updated locally per rank.</p>
577
+ <p>NO_SHARD: Parameters, Gradients, Optimizer States: Not sharded, replicated across ranks. Gradients Handling: Synchronized via all-reduce after backward pass. Optimizer States: Updated locally per rank.</p>
578
+ <p>HYBRID_SHARD: Parameters, Gradients, Optimizer States: Combines FULL_SHARD within a node and replicates parameters across nodes. Communication: Expensive operations like all-gathers and reduce-scatters are limited to within a node, enhancing performance for medium-sized models.</p>
579
+ <style type="text/css">
580
+ span.MJX_Assistive_MathML {
581
+ position:absolute!important;
582
+ clip: rect(1px, 1px, 1px, 1px);
583
+ padding: 1px 0 0 0!important;
584
+ border: 0!important;
585
+ height: 1px!important;
586
+ width: 1px!important;
587
+ overflow: hidden!important;
588
+ display:block!important;
589
+ }</style></aside></li>
590
+ </ul>
591
+ </section>
592
+ <section id="auto_wrap_policy" class="slide level2">
593
+ <h2><code>auto_wrap_policy</code>:</h2>
594
+ <ul>
595
+ <li>How the model should be split</li>
596
+ <li>Can be either <code>TRANSFORMER_BASED_WRAP</code> or <code>SIZE_BASED_WRAP</code></li>
597
+ <li><code>TRANSFORMER</code>/<code>fsdp_transformers_layer_cls_to_wrap</code>:
598
+ <ul>
599
+ <li>Need to declare the layer</li>
600
+ <li>Generally <code>transformers</code> has good defaults</li>
601
+ </ul></li>
602
+ <li><code>SIZE</code>/<code>fsdp_min_num_param</code>:
603
+ <ul>
604
+ <li>Number of total parameters in a shard</li>
605
+ </ul></li>
606
+ </ul>
607
+ </section>
608
+ <section id="offload_params" class="slide level2">
609
+ <h2><code>offload_params</code>:</h2>
610
+ <ul>
611
+ <li>Offloads the parameters and gradients to the CPU if they can’t fit into memory</li>
612
+ <li>Allows you to train much larger models locally, but will be much slower</li>
613
+ </ul>
614
+ <blockquote>
615
+ <p>Case: FFT of Llama-3-8B with <code>fsdp_offload_params</code> on 2x4090 GPUs was 72hrs, vs ~an hour or two when using 1xH100</p>
616
+ </blockquote>
617
+ </section>
618
+ <section id="cpu_ram_efficient_loading-and-sync_module_states" class="slide level2">
619
+ <h2><code>cpu_ram_efficient_loading</code> and <code>sync_module_states</code></h2>
620
+ <ul>
621
+ <li>Uses the idea behind big model inference/the <code>meta</code> device to load in the model to the GPU in a low-ram scenario</li>
622
+ <li>Rather than needing <code>model_size</code> * <code>n_gpus</code> RAM, we can load the model on a single node and then send the weights directly to each shard when the time is right via <code>sync_module_states</code></li>
623
+ </ul>
624
+ </section></section>
625
+ <section>
626
+ <section id="tying-this-to-accelerate" class="title-slide slide level1 center">
627
+ <h1>Tying this to 🤗 Accelerate</h1>
628
+
629
+ </section>
630
+ <section id="tying-this-to-accelerate-1" class="slide level2">
631
+ <h2>Tying this to 🤗 Accelerate</h2>
632
+ <ul>
633
+ <li>So far we’ve covered the theory, but how do we put it into practice?</li>
634
+ <li>By using a library that’s at the heart of the entire open-source ecosystem</li>
635
+ </ul>
636
+ <div style="font-size: 60%;padding-left:10%;padding-top:0%;">
637
+ <ul>
638
+ <li>Nearly all of 🤗</li>
639
+ <li><code>axolotl</code></li>
640
+ <li><code>fastai</code></li>
641
+ <li><code>FastChat</code></li>
642
+ <li><code>lucidrains</code></li>
643
+ <li><code>kornia</code></li>
644
+ </ul>
645
+ </div>
646
+ <p>Are you using it and you don’t even know?</p>
647
+ </section>
648
  <section id="what-is-accelerate" class="slide level2">
649
  <h2>What is 🤗 Accelerate?</h2>
650
  <div class="cell" data-reveal="true" data-fig-height="6">
651
  <div class="cell-output-display">
652
  <div>
653
+ <div>
654
+ <pre class="mermaid mermaid-js">graph LR
655
+ A(("🤗 Accelerate#32;"))
656
+ A --&gt; B["CLI Interface#32;"]
657
  A --&gt; C["Training Library#32;"]
658
  A --&gt; D["Big Model&lt;br&gt;Inference#32;"]
659
  </pre>
 
 
660
  </div>
 
661
  </div>
662
  </div>
663
  </div>
664
  </section>
665
+ <section id="a-cli-interface" class="slide level2">
666
+ <h2>A CLI Interface</h2>
667
+ <ul>
668
+ <li><code>accelerate config</code>
669
+ <ul>
670
+ <li>Configure the environment</li>
671
+ </ul></li>
672
+ <li><code>accelerate estimate-memory</code>
673
+ <ul>
674
+ <li>How to guess vRAM requirements</li>
675
+ </ul></li>
676
+ <li><code>accelerate launch</code>
677
+ <ul>
678
+ <li>How to run your script</li>
679
+ </ul></li>
680
+ </ul>
681
  </section>
682
+ <section id="launching-distributed-training-is-hard" class="slide level2">
683
+ <h2>Launching distributed training is hard</h2>
 
684
  <ul>
685
+ <li><div class="sourceCode" id="cb2"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1"></a><span class="ex">python</span> script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
686
+ <li><div class="sourceCode" id="cb3"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1"></a><span class="ex">torchrun</span> <span class="at">--nnodes</span><span class="op">=</span>1 <span class="at">--nproc_per_node</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
687
+ <li><div class="sourceCode" id="cb4"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1"></a><span class="ex">deepspeed</span> <span class="at">--num_gpus</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
688
  </ul>
689
+ <p>How can we make this better?</p>
690
  </section>
691
+ <section id="accelerate-launch" class="slide level2">
692
+ <h2><code>accelerate launch</code></h2>
693
+ <div class="sourceCode" id="cb5"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1"></a><span class="ex">accelerate</span> launch script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
  </section>
695
+ <section id="accelerate-config" class="slide level2">
696
+ <h2><code>accelerate config</code></h2>
697
+ <ul>
698
+ <li>Rely on <code>config.yaml</code> files</li>
699
+ <li>Choose to either run <code>accelerate config</code> or write your own:</li>
700
+ </ul>
701
+ <div class="columns" style="font-size: 50%;padding-left:10%;">
702
+ <div class="column" style="width:40%;">
703
+ <div class="code-with-filename">
704
+ <div class="code-with-filename-file">
705
+ <pre><strong>ddp_config.yaml</strong></pre>
706
+ </div>
707
+ <div class="sourceCode" id="cb6" data-filename="ddp_config.yaml"><pre class="sourceCode numberSource yaml number-lines code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
708
+ <span id="cb6-2"><a href="#cb6-2"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> MULTI_GPU</span></span>
709
+ <span id="cb6-3"><a href="#cb6-3"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
710
+ <span id="cb6-4"><a href="#cb6-4"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
711
+ <span id="cb6-5"><a href="#cb6-5"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
712
+ <span id="cb6-6"><a href="#cb6-6"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
713
+ </div>
714
+ </div><div class="column" style="width:40%;">
715
+ <div class="code-with-filename">
716
+ <div class="code-with-filename-file">
717
+ <pre><strong>fsdp_config.yaml</strong></pre>
718
+ </div>
719
+ <div class="sourceCode" id="cb7" data-filename="fsdp_config.yaml"><pre class="sourceCode numberSource yaml number-lines code-with-copy"><code class="sourceCode yaml"><span id="cb7-1"><a href="#cb7-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
720
+ <span id="cb7-2"><a href="#cb7-2"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> FSDP</span></span>
721
+ <span id="cb7-3"><a href="#cb7-3"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
722
+ <span id="cb7-4"><a href="#cb7-4"></a><span class="at"> </span><span class="fu">fsdp_auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
723
+ <span id="cb7-5"><a href="#cb7-5"></a><span class="at"> </span><span class="fu">fsdp_backward_prefetch</span><span class="kw">:</span><span class="at"> BACKWARD_PRE</span></span>
724
+ <span id="cb7-6"><a href="#cb7-6"></a><span class="at"> </span><span class="fu">fsdp_cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
725
+ <span id="cb7-7"><a href="#cb7-7"></a><span class="at"> </span><span class="fu">fsdp_forward_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
726
+ <span id="cb7-8"><a href="#cb7-8"></a><span class="at"> </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
727
+ <span id="cb7-9"><a href="#cb7-9"></a><span class="at"> </span><span class="fu">fsdp_sharding_strategy</span><span class="kw">:</span><span class="at"> FULL_SHARD</span></span>
728
+ <span id="cb7-10"><a href="#cb7-10"></a><span class="at"> </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> SHARDED_STATE_DICT</span></span>
729
+ <span id="cb7-11"><a href="#cb7-11"></a><span class="at"> </span><span class="fu">fsdp_sync_module_states</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
730
+ <span id="cb7-12"><a href="#cb7-12"></a><span class="at"> </span><span class="fu">fsdp_use_orig_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
731
+ <span id="cb7-13"><a href="#cb7-13"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
732
+ <span id="cb7-14"><a href="#cb7-14"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
733
+ <span id="cb7-15"><a href="#cb7-15"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
734
+ <span id="cb7-16"><a href="#cb7-16"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
735
+ </div>
736
+ </div>
737
+ </div>
738
  </section></section>
739
  <section>
740
  <section id="a-training-library" class="title-slide slide level1 center">
741
  <h1>A Training Library</h1>
742
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
743
  </section>
744
+ <section id="a-training-library-the-code" class="slide level2">
745
+ <h2>A Training Library: The Code</h2>
746
+ <div class="columns" style="font-size: 50%;">
747
+ <div class="column">
748
  <p><br><br><br></p>
749
+ <div class="sourceCode" id="cb8" data-code-line-numbers="5-6,9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a><span class="co"># For alignment purposes</span></span>
750
+ <span id="cb8-2"><a href="#cb8-2"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
751
+ <span id="cb8-3"><a href="#cb8-3"></a> optimizer.zero_grad()</span>
752
+ <span id="cb8-4"><a href="#cb8-4"></a> inputs, targets <span class="op">=</span> batch</span>
753
+ <span id="cb8-5"><a href="#cb8-5"></a> inputs <span class="op">=</span> inputs.to(device)</span>
754
+ <span id="cb8-6"><a href="#cb8-6"></a> targets <span class="op">=</span> targets.to(device)</span>
755
+ <span id="cb8-7"><a href="#cb8-7"></a> outputs <span class="op">=</span> model(inputs)</span>
756
+ <span id="cb8-8"><a href="#cb8-8"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
757
+ <span id="cb8-9"><a href="#cb8-9"></a> loss.backward()</span>
758
+ <span id="cb8-10"><a href="#cb8-10"></a> optimizer.step()</span>
759
+ <span id="cb8-11"><a href="#cb8-11"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
760
+ </div><div class="column">
761
+ <div class="sourceCode" id="cb9" data-code-line-numbers="1-7,12-13,16"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="im">from</span> accelerate <span class="im">import</span> Accelerator</span>
762
+ <span id="cb9-2"><a href="#cb9-2"></a>accelerator <span class="op">=</span> Accelerator()</span>
763
+ <span id="cb9-3"><a href="#cb9-3"></a>dataloader, model, optimizer, scheduler <span class="op">=</span> (</span>
764
+ <span id="cb9-4"><a href="#cb9-4"></a> accelerator.prepare(</span>
765
+ <span id="cb9-5"><a href="#cb9-5"></a> dataloader, model, optimizer, scheduler</span>
766
+ <span id="cb9-6"><a href="#cb9-6"></a> )</span>
767
+ <span id="cb9-7"><a href="#cb9-7"></a>)</span>
768
+ <span id="cb9-8"><a href="#cb9-8"></a></span>
769
+ <span id="cb9-9"><a href="#cb9-9"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
770
+ <span id="cb9-10"><a href="#cb9-10"></a> optimizer.zero_grad()</span>
771
+ <span id="cb9-11"><a href="#cb9-11"></a> inputs, targets <span class="op">=</span> batch</span>
772
+ <span id="cb9-12"><a href="#cb9-12"></a> <span class="co"># inputs = inputs.to(device)</span></span>
773
+ <span id="cb9-13"><a href="#cb9-13"></a> <span class="co"># targets = targets.to(device)</span></span>
774
+ <span id="cb9-14"><a href="#cb9-14"></a> outputs <span class="op">=</span> model(inputs)</span>
775
+ <span id="cb9-15"><a href="#cb9-15"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
776
+ <span id="cb9-16"><a href="#cb9-16"></a> accelerator.backward(loss) <span class="co"># loss.backward()</span></span>
777
+ <span id="cb9-17"><a href="#cb9-17"></a> optimizer.step()</span>
778
+ <span id="cb9-18"><a href="#cb9-18"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
779
  </div>
780
  </div>
781
  </section>
782
+ <section id="a-training-library-how-scaling-works" class="slide level2">
783
+ <h2>A Training Library: How Scaling Works</h2>
784
+ <ul>
785
+ <li>Accelerate’s DataLoaders and schedulers work off of a sharding mindset</li>
786
+ <li>Rather than repeating the same data across <code>n</code> nodes, we instead split it</li>
787
+ <li>Speeds up training linearly</li>
788
+ <li>Given a batch size of 16 on a single GPU, to recreate this across 8 GPUs you would use a batch size of 2</li>
789
+ <li>This also means the scheduler will be stepped <code>n</code> times (once per GPU) per “global step”</li>
790
+ </ul>
 
 
791
  </section>
792
  <section id="a-training-library-mixed-precision" class="slide level2">
793
+ <h2>A Training Library: Mixed Precision</h2>
794
+ <ul>
795
+ <li>This may be a bit different than your “normal” idea of mixed precision.</li>
796
+ <li>We do <strong>not</strong> convert the model weights to BF16/FP16</li>
797
+ <li>Instead we <strong>wrap the forward pass</strong> with <code>autocast</code> to convert the gradients automatically</li>
798
+ <li>This preserves the original precision of the weights, which leads to stable training and better fine-tuning later on.</li>
799
+ <li><strong>If you use <code>.bf16()</code> weights, you are STUCK in bf16 permanently</strong></li>
800
+ </ul>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  </section>
802
+ <section id="a-training-library-mixed-precision-1" class="slide level2">
803
+ <h2>A Training Library: Mixed Precision</h2>
 
804
  <ul>
805
+ <li>Let’s tie that back up to the model estimator with neat tools like NVIDIA’s TransformerEngine</li>
 
 
 
806
  </ul>
807
+ <div style="font-size: 60%;">
808
+ <table style="width:100%;">
809
+ <colgroup>
810
+ <col style="width: 14%">
811
+ <col style="width: 14%">
812
+ <col style="width: 14%">
813
+ <col style="width: 14%">
814
+ <col style="width: 14%">
815
+ <col style="width: 14%">
816
+ <col style="width: 14%">
817
+ </colgroup>
818
+ <thead>
819
+ <tr class="header">
820
+ <th>Optimization Level</th>
821
+ <th>Computation (GEMM)</th>
822
+ <th>Comm</th>
823
+ <th>Weight</th>
824
+ <th>Master Weight</th>
825
+ <th>Weight Gradient</th>
826
+ <th>Optimizer States</th>
827
+ </tr>
828
+ </thead>
829
+ <tbody>
830
+ <tr class="odd">
831
+ <td>FP16 AMP</td>
832
+ <td>FP16</td>
833
+ <td>FP32</td>
834
+ <td>FP32</td>
835
+ <td>N/A</td>
836
+ <td>FP32</td>
837
+ <td>FP32+FP32</td>
838
+ </tr>
839
+ <tr class="even">
840
+ <td>Nvidia TE</td>
841
+ <td>FP8</td>
842
+ <td>FP32</td>
843
+ <td>FP32</td>
844
+ <td>N/A</td>
845
+ <td>FP32</td>
846
+ <td>FP32+FP32</td>
847
+ </tr>
848
+ <tr class="odd">
849
+ <td>MS-AMP O1</td>
850
+ <td>FP8</td>
851
+ <td>FP8</td>
852
+ <td>FP16</td>
853
+ <td>N/A</td>
854
+ <td>FP8</td>
855
+ <td>FP32+FP32</td>
856
+ </tr>
857
+ <tr class="even">
858
+ <td>MS-AMP O2</td>
859
+ <td>FP8</td>
860
+ <td>FP8</td>
861
+ <td>FP16</td>
862
+ <td>N/A</td>
863
+ <td>FP8</td>
864
+ <td>FP8+FP16</td>
865
+ </tr>
866
+ <tr class="odd">
867
+ <td>MS-AMP O3</td>
868
+ <td>FP8</td>
869
+ <td>FP8</td>
870
+ <td>FP8</td>
871
+ <td>FP16</td>
872
+ <td>FP8</td>
873
+ <td>FP8+FP16</td>
874
+ </tr>
875
+ </tbody>
876
+ </table>
877
  </div>
878
+ <aside class="notes">
879
+ <p>What is actually happening: * Linear Layers and other certain compatible layers are wrapped in a special version that allows for FP8 computation * The general forward pass is wrapped around BF16 * This means that the most memory saved is done during the gradients of the model, <em>not</em> the model itself. * With tools like <code>MS-AMP</code> we can convert more chunks into lower precision, but again like before stable training occurs when the model’s weights are in full precision and the backprop happens in full precision too.</p>
880
+ <style type="text/css">
881
+ span.MJX_Assistive_MathML {
882
+ position:absolute!important;
883
+ clip: rect(1px, 1px, 1px, 1px);
884
+ padding: 1px 0 0 0!important;
885
+ border: 0!important;
886
+ height: 1px!important;
887
+ width: 1px!important;
888
+ overflow: hidden!important;
889
+ display:block!important;
890
+ }</style></aside>
891
  </section>
892
+ <section id="deepspeed-vs-fully-sharded-data-parallelism" class="slide level2">
893
+ <h2>DeepSpeed vs Fully Sharded Data Parallelism</h2>
894
+ <ul>
895
+ <li>Extremely similar; they mostly use different naming conventions for items and have slight tweaks in the implementation</li>
896
+ </ul>
897
+ <div style="font-size: 50%;">
898
+ <table style="width:100%;">
899
+ <colgroup>
900
+ <col style="width: 16%">
901
+ <col style="width: 16%">
902
+ <col style="width: 16%">
903
+ <col style="width: 16%">
904
+ <col style="width: 16%">
905
+ <col style="width: 16%">
906
+ </colgroup>
907
+ <thead>
908
+ <tr class="header">
909
+ <th>Framework</th>
910
+ <th>Model Loading (<code>torch_dtype</code>)</th>
911
+ <th>Mixed Precision</th>
912
+ <th>Preparation (Local)</th>
913
+ <th>Training</th>
914
+ <th>Optimizer (Local)</th>
915
+ </tr>
916
+ </thead>
917
+ <tbody>
918
+ <tr class="odd">
919
+ <td>FSDP</td>
920
+ <td>bf16</td>
921
+ <td>default (none)</td>
922
+ <td>bf16</td>
923
+ <td>bf16</td>
924
+ <td>bf16</td>
925
+ </tr>
926
+ <tr class="even">
927
+ <td>FSDP</td>
928
+ <td>bf16</td>
929
+ <td>bf16</td>
930
+ <td>fp32</td>
931
+ <td>bf16</td>
932
+ <td>fp32</td>
933
+ </tr>
934
+ <tr class="odd">
935
+ <td>DeepSpeed</td>
936
+ <td>bf16</td>
937
+ <td>bf16</td>
938
+ <td>fp32</td>
939
+ <td>bf16</td>
940
+ <td>fp32</td>
941
+ </tr>
942
+ </tbody>
943
+ </table>
944
  </div>
945
+ <p>To learn more, check out the <a href="https://huggingface.co/docs/accelerate/concept_guides/fsdp_and_deepspeed">documentation</a> or join my office hours</p>
946
  </section>
947
+ <section id="key-takeaways" class="slide level2">
948
+ <h2>Key Takeaways:</h2>
949
+ <ul>
950
+ <li>You can scale out training with <code>accelerate</code>, FSDP, and DeepSpeed across multiple GPUs to train bigger models</li>
951
+ <li>Techniques like <code>FP8</code> can help speed up training some and reduce computational overhead</li>
952
+ <li>Comes at a cost of end-precision and locking model weights for further fine-tunes if not careful</li>
953
+ </ul>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
  </section>
955
  <section id="some-handy-resources" class="slide level2">
956
  <h2>Some Handy Resources</h2>
 
962
  <li><a href="https://huggingface.co/docs/accelerate/usage_guides/big_modeling">Big Model Inference tutorial</a></li>
963
  <li><a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed">DeepSpeed and 🤗 Accelerate</a></li>
964
  <li><a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp">Fully Sharded Data Parallelism and 🤗 Accelerate</a></li>
965
+ <li><a href="https://huggingface.co/docs/accelerate/concept_guides/fsdp_and_deepspeed">FSDP vs DeepSpeed In-Depth</a></li>
966
  </ul>
967
  <div class="footer footer-default">
968
 
969
  </div>
970
  </section></section>
 
971
  </div>
972
  </div>
973
 
974
  <script>window.backupDefine = window.define; window.define = undefined;</script>
975
+ <script src="llm_conf_files/libs/revealjs/dist/reveal.js"></script>
976
  <!-- reveal.js plugins -->
977
+ <script src="llm_conf_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.js"></script>
978
+ <script src="llm_conf_files/libs/revealjs/plugin/pdf-export/pdfexport.js"></script>
979
+ <script src="llm_conf_files/libs/revealjs/plugin/reveal-menu/menu.js"></script>
980
+ <script src="llm_conf_files/libs/revealjs/plugin/reveal-menu/quarto-menu.js"></script>
981
+ <script src="llm_conf_files/libs/revealjs/plugin/quarto-support/support.js"></script>
982
 
983
 
984
+ <script src="llm_conf_files/libs/revealjs/plugin/notes/notes.js"></script>
985
+ <script src="llm_conf_files/libs/revealjs/plugin/search/search.js"></script>
986
+ <script src="llm_conf_files/libs/revealjs/plugin/zoom/zoom.js"></script>
987
+ <script src="llm_conf_files/libs/revealjs/plugin/math/math.js"></script>
988
  <script>window.define = window.backupDefine; window.backupDefine = undefined;</script>
989
 
990
  <script>
 
994
  Reveal.initialize({
995
  'controlsAuto': true,
996
  'previewLinksAuto': false,
 
997
  'pdfSeparateFragments': false,
998
  'autoAnimateEasing': "ease",
999
  'autoAnimateDuration': 1,
1000
  'autoAnimateUnmatched': true,
1001
+ 'menu': {"side":"left","useTextContentForMissingTitles":true,"markers":false,"loadIcons":false,"custom":[{"title":"Tools","icon":"<i class=\"fas fa-gear\"></i>","content":"<ul class=\"slide-menu-items\">\n<li class=\"slide-tool-item active\" data-item=\"0\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.fullscreen(event)\"><kbd>f</kbd> Fullscreen</a></li>\n<li class=\"slide-tool-item\" data-item=\"1\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.speakerMode(event)\"><kbd>s</kbd> Speaker View</a></li>\n<li class=\"slide-tool-item\" data-item=\"2\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.overview(event)\"><kbd>o</kbd> Slide Overview</a></li>\n<li class=\"slide-tool-item\" data-item=\"3\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.togglePdfExport(event)\"><kbd>e</kbd> PDF Export Mode</a></li>\n<li class=\"slide-tool-item\" data-item=\"4\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.keyboardHelp(event)\"><kbd>?</kbd> Keyboard Help</a></li>\n</ul>"}],"openButton":true},
1002
  'smaller': false,
1003
 
1004
  // Display controls in the bottom right corner
 
1202
  tabsets.forEach(function(tabset) {
1203
  const tabby = new Tabby('#' + tabset.id);
1204
  });
1205
// Reports whether `el` carries at least one CSS class whose name begins with
// 'code-annotation-'. Walks el.classList and returns true on the first match;
// returns false when no class has that prefix.
+ const isCodeAnnotation = (el) => {
1206
+ for (const clz of el.classList) {
1207
+ if (clz.startsWith('code-annotation-')) {
1208
+ return true;
1209
+ }
1210
+ }
1211
+ return false;
1212
+ }
1213
  // Attach ClipboardJS to every '.code-copy-button'. The `text` callback
  // decides what string is copied when a button is triggered.
  const clipboard = new window.ClipboardJS('.code-copy-button', {
1214
+ text: function(trigger) {
1215
  // Work on a deep clone of the element immediately preceding the button so
  // that stripping annotation nodes never touches the element on the page.
+ const codeEl = trigger.previousElementSibling.cloneNode(true);
1216
  // Drop direct children that isCodeAnnotation flags, so they are excluded
  // from the copied text.
  // NOTE(review): `children` is a live collection; removing a child while
  // iterating it may skip the following sibling — confirm whether adjacent
  // annotation elements can occur here.
+ for (const childEl of codeEl.children) {
1217
+ if (isCodeAnnotation(childEl)) {
1218
+ childEl.remove();
1219
+ }
1220
+ }
1221
  // The copied value is the text of whatever remains in the clone.
+ return codeEl.innerText;
1222
  }
1223
  });
1224
  clipboard.on('success', function(e) {
 
1254
  // clear code selection
1255
  e.clearSelection();
1256
  });
1257
+ function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
1258
  const config = {
1259
  allowHTML: true,
 
1260
  maxWidth: 500,
1261
  delay: 100,
1262
  arrow: false,
 
1265
  },
1266
  interactive: true,
1267
  interactiveBorder: 10,
1268
+ theme: 'light-border',
1269
+ placement: 'bottom-start',
1270
  };
1271
+ if (contentFn) {
1272
+ config.content = contentFn;
1273
+ }
1274
+ if (onTriggerFn) {
1275
+ config.onTrigger = onTriggerFn;
1276
+ }
1277
+ if (onUntriggerFn) {
1278
+ config.onUntrigger = onUntriggerFn;
1279
+ }
1280
  config['offset'] = [0,0];
1281
  config['maxWidth'] = 700;
1282
  window.tippy(el, config);
llm_conf.html DELETED
@@ -1,1337 +0,0 @@
1
- <!DOCTYPE html>
2
- <html lang="en"><head>
3
- <script src="llm_conf_files/libs/clipboard/clipboard.min.js"></script>
4
- <script src="llm_conf_files/libs/quarto-html/tabby.min.js"></script>
5
- <script src="llm_conf_files/libs/quarto-html/popper.min.js"></script>
6
- <script src="llm_conf_files/libs/quarto-html/tippy.umd.min.js"></script>
7
- <link href="llm_conf_files/libs/quarto-html/tippy.css" rel="stylesheet">
8
- <link href="llm_conf_files/libs/quarto-html/light-border.css" rel="stylesheet">
9
- <link href="llm_conf_files/libs/quarto-html/quarto-html.min.css" rel="stylesheet" data-mode="light">
10
- <link href="llm_conf_files/libs/quarto-html/quarto-syntax-highlighting-dark.css" rel="stylesheet" id="quarto-text-highlighting-styles"><meta charset="utf-8">
11
- <meta name="generator" content="quarto-99.9.9">
12
-
13
- <title>Scaling Model Training with More Compute, How Do They Do It?</title>
14
- <meta name="apple-mobile-web-app-capable" content="yes">
15
- <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
16
- <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, minimal-ui">
17
- <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/reset.css">
18
- <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/reveal.css">
19
- <style>
20
- code{white-space: pre-wrap;}
21
- span.smallcaps{font-variant: small-caps;}
22
- div.columns{display: flex; gap: min(4vw, 1.5em);}
23
- div.column{flex: auto; overflow-x: auto;}
24
- div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
25
- ul.task-list{list-style: none;}
26
- ul.task-list li input[type="checkbox"] {
27
- width: 0.8em;
28
- margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
29
- vertical-align: middle;
30
- }
31
- /* CSS for syntax highlighting */
32
- pre > code.sourceCode { white-space: pre; position: relative; }
33
- pre > code.sourceCode > span { line-height: 1.25; }
34
- pre > code.sourceCode > span:empty { height: 1.2em; }
35
- .sourceCode { overflow: visible; }
36
- code.sourceCode > span { color: inherit; text-decoration: inherit; }
37
- div.sourceCode { margin: 1em 0; }
38
- pre.sourceCode { margin: 0; }
39
- @media screen {
40
- div.sourceCode { overflow: auto; }
41
- }
42
- @media print {
43
- pre > code.sourceCode { white-space: pre-wrap; }
44
- pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
45
- }
46
- pre.numberSource code
47
- { counter-reset: source-line 0; }
48
- pre.numberSource code > span
49
- { position: relative; left: -4em; counter-increment: source-line; }
50
- pre.numberSource code > span > a:first-child::before
51
- { content: counter(source-line);
52
- position: relative; left: -1em; text-align: right; vertical-align: baseline;
53
- border: none; display: inline-block;
54
- -webkit-touch-callout: none; -webkit-user-select: none;
55
- -khtml-user-select: none; -moz-user-select: none;
56
- -ms-user-select: none; user-select: none;
57
- padding: 0 4px; width: 4em;
58
- }
59
- pre.numberSource { margin-left: 3em; padding-left: 4px; }
60
- div.sourceCode
61
- { color: #f8f8f2; }
62
- @media screen {
63
- pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
64
- }
65
- code span { color: #f8f8f2; } /* Normal */
66
- code span.al { color: #f07178; background-color: #2a0f15; font-weight: bold; } /* Alert */
67
- code span.an { color: #d4d0ab; } /* Annotation */
68
- code span.at { color: #00e0e0; } /* Attribute */
69
- code span.bn { color: #d4d0ab; } /* BaseN */
70
- code span.bu { color: #abe338; } /* BuiltIn */
71
- code span.cf { color: #ffa07a; font-weight: bold; } /* ControlFlow */
72
- code span.ch { color: #abe338; } /* Char */
73
- code span.cn { color: #ffd700; } /* Constant */
74
- code span.co { color: #f8f8f2; font-style: italic; } /* Comment */
75
- code span.cv { color: #ffd700; } /* CommentVar */
76
- code span.do { color: #f8f8f2; } /* Documentation */
77
- code span.dt { color: #ffa07a; } /* DataType */
78
- code span.dv { color: #d4d0ab; } /* DecVal */
79
- code span.er { color: #f07178; text-decoration: underline; } /* Error */
80
- code span.ex { color: #00e0e0; font-weight: bold; } /* Extension */
81
- code span.fl { color: #d4d0ab; } /* Float */
82
- code span.fu { color: #ffa07a; } /* Function */
83
- code span.im { color: #abe338; } /* Import */
84
- code span.in { color: #d4d0ab; } /* Information */
85
- code span.kw { color: #ffa07a; font-weight: bold; } /* Keyword */
86
- code span.op { color: #ffa07a; } /* Operator */
87
- code span.ot { color: #00e0e0; } /* Other */
88
- code span.pp { color: #dcc6e0; } /* Preprocessor */
89
- code span.re { color: #00e0e0; background-color: #f8f8f2; } /* RegionMarker */
90
- code span.sc { color: #abe338; } /* SpecialChar */
91
- code span.ss { color: #abe338; } /* SpecialString */
92
- code span.st { color: #abe338; } /* String */
93
- code span.va { color: #00e0e0; } /* Variable */
94
- code span.vs { color: #abe338; } /* VerbatimString */
95
- code span.wa { color: #dcc6e0; } /* Warning */
96
- </style>
97
- <link rel="stylesheet" href="llm_conf_files/libs/revealjs/dist/theme/quarto.css">
98
- <link href="llm_conf_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.css" rel="stylesheet">
99
- <link href="llm_conf_files/libs/revealjs/plugin/reveal-menu/menu.css" rel="stylesheet">
100
- <link href="llm_conf_files/libs/revealjs/plugin/reveal-menu/quarto-menu.css" rel="stylesheet">
101
- <link href="llm_conf_files/libs/revealjs/plugin/quarto-support/footer.css" rel="stylesheet">
102
- <style type="text/css">
103
-
104
- .callout {
105
- margin-top: 1em;
106
- margin-bottom: 1em;
107
- border-radius: .25rem;
108
- }
109
-
110
- .callout.callout-style-simple {
111
- padding: 0em 0.5em;
112
- border-left: solid #acacac .3rem;
113
- border-right: solid 1px silver;
114
- border-top: solid 1px silver;
115
- border-bottom: solid 1px silver;
116
- display: flex;
117
- }
118
-
119
- .callout.callout-style-default {
120
- border-left: solid #acacac .3rem;
121
- border-right: solid 1px silver;
122
- border-top: solid 1px silver;
123
- border-bottom: solid 1px silver;
124
- }
125
-
126
- .callout .callout-body-container {
127
- flex-grow: 1;
128
- }
129
-
130
- .callout.callout-style-simple .callout-body {
131
- font-size: 1rem;
132
- font-weight: 400;
133
- }
134
-
135
- .callout.callout-style-default .callout-body {
136
- font-size: 0.9rem;
137
- font-weight: 400;
138
- }
139
-
140
- .callout.callout-titled.callout-style-simple .callout-body {
141
- margin-top: 0.2em;
142
- }
143
-
144
- .callout:not(.callout-titled) .callout-body {
145
- display: flex;
146
- }
147
-
148
- .callout:not(.no-icon).callout-titled.callout-style-simple .callout-content {
149
- padding-left: 1.6em;
150
- }
151
-
152
- .callout.callout-titled .callout-header {
153
- padding-top: 0.2em;
154
- margin-bottom: -0.2em;
155
- }
156
-
157
- .callout.callout-titled .callout-title p {
158
- margin-top: 0.5em;
159
- margin-bottom: 0.5em;
160
- }
161
-
162
- .callout.callout-titled.callout-style-simple .callout-content p {
163
- margin-top: 0;
164
- }
165
-
166
- .callout.callout-titled.callout-style-default .callout-content p {
167
- margin-top: 0.7em;
168
- }
169
-
170
- .callout.callout-style-simple div.callout-title {
171
- border-bottom: none;
172
- font-size: .9rem;
173
- font-weight: 600;
174
- opacity: 75%;
175
- }
176
-
177
- .callout.callout-style-default div.callout-title {
178
- border-bottom: none;
179
- font-weight: 600;
180
- opacity: 85%;
181
- font-size: 0.9rem;
182
- padding-left: 0.5em;
183
- padding-right: 0.5em;
184
- }
185
-
186
- .callout.callout-style-default div.callout-content {
187
- padding-left: 0.5em;
188
- padding-right: 0.5em;
189
- }
190
-
191
- .callout.callout-style-simple .callout-icon::before {
192
- height: 1rem;
193
- width: 1rem;
194
- display: inline-block;
195
- content: "";
196
- background-repeat: no-repeat;
197
- background-size: 1rem 1rem;
198
- }
199
-
200
- .callout.callout-style-default .callout-icon::before {
201
- height: 0.9rem;
202
- width: 0.9rem;
203
- display: inline-block;
204
- content: "";
205
- background-repeat: no-repeat;
206
- background-size: 0.9rem 0.9rem;
207
- }
208
-
209
- .callout-title {
210
- display: flex
211
- }
212
-
213
- .callout-icon::before {
214
- margin-top: 1rem;
215
- padding-right: .5rem;
216
- }
217
-
218
- .callout.no-icon::before {
219
- display: none !important;
220
- }
221
-
222
- .callout.callout-titled .callout-body > .callout-content > :last-child {
223
- padding-bottom: 0.5rem;
224
- margin-bottom: 0;
225
- }
226
-
227
- .callout.callout-titled .callout-icon::before {
228
- margin-top: .5rem;
229
- padding-right: .5rem;
230
- }
231
-
232
- .callout:not(.callout-titled) .callout-icon::before {
233
- margin-top: 1rem;
234
- padding-right: .5rem;
235
- }
236
-
237
- /* Callout Types */
238
-
239
- div.callout-note {
240
- border-left-color: #4582ec !important;
241
- }
242
-
243
- div.callout-note .callout-icon::before {
244
- background-image: url('');
245
- }
246
-
247
- div.callout-note.callout-style-default .callout-title {
248
- background-color: #dae6fb
249
- }
250
-
251
- div.callout-important {
252
- border-left-color: #d9534f !important;
253
- }
254
-
255
- div.callout-important .callout-icon::before {
256
- background-image: url('');
257
- }
258
-
259
- div.callout-important.callout-style-default .callout-title {
260
- background-color: #f7dddc
261
- }
262
-
263
- div.callout-warning {
264
- border-left-color: #f0ad4e !important;
265
- }
266
-
267
- div.callout-warning .callout-icon::before {
268
- background-image: url('');
269
- }
270
-
271
- div.callout-warning.callout-style-default .callout-title {
272
- background-color: #fcefdc
273
- }
274
-
275
- div.callout-tip {
276
- border-left-color: #02b875 !important;
277
- }
278
-
279
- div.callout-tip .callout-icon::before {
280
- background-image: url('');
281
- }
282
-
283
- div.callout-tip.callout-style-default .callout-title {
284
- background-color: #ccf1e3
285
- }
286
-
287
- div.callout-caution {
288
- border-left-color: #fd7e14 !important;
289
- }
290
-
291
- div.callout-caution .callout-icon::before {
292
- background-image: url('');
293
- }
294
-
295
- div.callout-caution.callout-style-default .callout-title {
296
- background-color: #ffe5d0
297
- }
298
-
299
- </style>
300
- <style type="text/css">
301
- .reveal div.sourceCode {
302
- margin: 0;
303
- overflow: auto;
304
- }
305
- .reveal div.hanging-indent {
306
- margin-left: 1em;
307
- text-indent: -1em;
308
- }
309
- .reveal .slide:not(.center) {
310
- height: 100%;
311
- }
312
- .reveal .slide.scrollable {
313
- overflow-y: auto;
314
- }
315
- .reveal .footnotes {
316
- height: 100%;
317
- overflow-y: auto;
318
- }
319
- .reveal .slide .absolute {
320
- position: absolute;
321
- display: block;
322
- }
323
- .reveal .footnotes ol {
324
- counter-reset: ol;
325
- list-style-type: none;
326
- margin-left: 0;
327
- }
328
- .reveal .footnotes ol li:before {
329
- counter-increment: ol;
330
- content: counter(ol) ". ";
331
- }
332
- .reveal .footnotes ol li > p:first-child {
333
- display: inline-block;
334
- }
335
- .reveal .slide ul,
336
- .reveal .slide ol {
337
- margin-bottom: 0.5em;
338
- }
339
- .reveal .slide ul li,
340
- .reveal .slide ol li {
341
- margin-top: 0.4em;
342
- margin-bottom: 0.2em;
343
- }
344
- .reveal .slide ul[role="tablist"] li {
345
- margin-bottom: 0;
346
- }
347
- .reveal .slide ul li > *:first-child,
348
- .reveal .slide ol li > *:first-child {
349
- margin-block-start: 0;
350
- }
351
- .reveal .slide ul li > *:last-child,
352
- .reveal .slide ol li > *:last-child {
353
- margin-block-end: 0;
354
- }
355
- .reveal .slide .columns:nth-child(3) {
356
- margin-block-start: 0.8em;
357
- }
358
- .reveal blockquote {
359
- box-shadow: none;
360
- }
361
- .reveal .tippy-content>* {
362
- margin-top: 0.2em;
363
- margin-bottom: 0.7em;
364
- }
365
- .reveal .tippy-content>*:last-child {
366
- margin-bottom: 0.2em;
367
- }
368
- .reveal .slide > img.stretch.quarto-figure-center,
369
- .reveal .slide > img.r-stretch.quarto-figure-center {
370
- display: block;
371
- margin-left: auto;
372
- margin-right: auto;
373
- }
374
- .reveal .slide > img.stretch.quarto-figure-left,
375
- .reveal .slide > img.r-stretch.quarto-figure-left {
376
- display: block;
377
- margin-left: 0;
378
- margin-right: auto;
379
- }
380
- .reveal .slide > img.stretch.quarto-figure-right,
381
- .reveal .slide > img.r-stretch.quarto-figure-right {
382
- display: block;
383
- margin-left: auto;
384
- margin-right: 0;
385
- }
386
- </style>
387
- <script src="llm_conf_files/libs/quarto-diagram/mermaid.min.js"></script>
388
- <script src="llm_conf_files/libs/quarto-diagram/mermaid-init.js"></script>
389
- <link href="llm_conf_files/libs/quarto-diagram/mermaid.css" rel="stylesheet">
390
- </head>
391
- <body class="quarto-dark">
392
- <div class="reveal">
393
- <div class="slides">
394
-
395
- <section id="title-slide" class="quarto-title-block center">
396
- <h1 class="title">Scaling Model Training with More Compute, How Do They Do It?</h1>
397
-
398
- <div class="quarto-title-authors">
399
- </div>
400
-
401
- </section>
402
- <section id="who-am-i" class="slide level2">
403
- <h2>Who am I?</h2>
404
- <ul>
405
- <li>Zachary Mueller</li>
406
- <li>Technical Lead for the 🤗 Accelerate project</li>
407
- <li>API design geek</li>
408
- </ul>
409
- </section>
410
- <section id="understanding-gpu-usage" class="slide level2">
411
- <h2>Understanding GPU Usage</h2>
412
- <ul>
413
- <li>We can somewhat estimate the memory usage in vanilla full-fine-tuning of models</li>
414
- <li>Requires certain assumptions (that I’ll be covering):
415
- <ul>
416
- <li>Adam optimizer</li>
417
- <li>Batch size of 1</li>
418
- </ul></li>
419
- </ul>
420
- </section>
421
- <section id="understanding-gpu-usage-1" class="slide level2">
422
- <h2>Understanding GPU Usage</h2>
423
- <p>General estimate (<code>bert-base-cased</code>, 108M params):</p>
424
- <ul>
425
- <li>Each parameter is 4 bytes</li>
426
- <li>Backward ~= 2x the model size</li>
427
- <li>The optimizer step ~= 4x the model size (1x model, 1x gradients, 2x optimizer):</li>
428
- </ul>
429
- <div style="font-size: 50%;">
430
- <table>
431
- <thead>
432
- <tr class="header">
433
- <th>dtype</th>
434
- <th style="text-align: left;">Model</th>
435
- <th style="text-align: center;">Gradients</th>
436
- <th style="text-align: center;">Backward pass</th>
437
- <th style="text-align: center;">Optimizer step</th>
438
- <th style="text-align: center;">Highest</th>
439
- </tr>
440
- </thead>
441
- <tbody>
442
- <tr class="odd">
443
- <td>float32</td>
444
- <td style="text-align: left;">413.18 MB</td>
445
- <td style="text-align: center;">413.18 MB</td>
446
- <td style="text-align: center;">826.36 MB</td>
447
- <td style="text-align: center;">1.61 GB</td>
448
- <td style="text-align: center;">1.61 GB</td>
449
- </tr>
450
- <tr class="even">
451
- <td>float16</td>
452
- <td style="text-align: left;">413.18 MB*</td>
453
- <td style="text-align: center;">619.77 MB</td>
454
- <td style="text-align: center;">826.36 MB</td>
455
- <td style="text-align: center;">826.36 MB</td>
456
- <td style="text-align: center;">826.36 MB</td>
457
- </tr>
458
- </tbody>
459
- </table>
460
- <p>*All estimations were based off the <a href="https://huggingface.co/spaces/hf-accelerate/model-memory-usage">Model Estimator Tool</a></p>
461
- </div>
462
- </section>
463
- <section id="understanding-gpu-usage-2" class="slide level2">
464
- <h2>Understanding GPU Usage</h2>
465
- <p>This works fine for small models, we have cards with anywhere from 12-24GB of GPU memory (on the GPU-poor side).</p>
466
- <p>But what happens as we scale?</p>
467
- <p>Here’s <code>llama-3-8B</code> (8.03B parameters)</p>
468
- <div style="font-size: 50%;">
469
- <table>
470
- <thead>
471
- <tr class="header">
472
- <th>dtype</th>
473
- <th style="text-align: left;">Model</th>
474
- <th style="text-align: center;">Gradients</th>
475
- <th style="text-align: center;">Backward pass</th>
476
- <th style="text-align: center;">Optimizer step</th>
477
- <th style="text-align: center;">Highest</th>
478
- </tr>
479
- </thead>
480
- <tbody>
481
- <tr class="odd">
482
- <td>float32</td>
483
- <td style="text-align: left;">28.21 GB</td>
484
- <td style="text-align: center;">28.21 GB</td>
485
- <td style="text-align: center;">56.43 GB</td>
486
- <td style="text-align: center;">112.84 GB</td>
487
- <td style="text-align: center;">112.84 GB</td>
488
- </tr>
489
- <tr class="even">
490
- <td>float16</td>
491
- <td style="text-align: left;">28.21 GB*</td>
492
- <td style="text-align: center;">42.32 GB</td>
493
- <td style="text-align: center;">56.43 GB</td>
494
- <td style="text-align: center;">56.43 GB</td>
495
- <td style="text-align: center;">56.43 GB</td>
496
- </tr>
497
- </tbody>
498
- </table>
499
- </div>
500
- <p>Well, <em>I</em> don’t have 56GB of GPU memory in a single card, let alone 112GB.</p>
501
- <p>What can we do?</p>
502
- </section>
503
- <section>
504
- <section id="distributed-training" class="title-slide slide level1 center">
505
- <h1>Distributed Training</h1>
506
-
507
- </section>
508
- <section id="kinds-of-training" class="slide level2">
509
- <h2>Kinds of Training</h2>
510
- <ul>
511
- <li>Single GPU:
512
- <ul>
513
- <li>No distributed techniques at play</li>
514
- </ul></li>
515
- <li>DDP:
516
- <ul>
517
- <li>A full copy of the model exists on each device, but data is chunked between each GPU</li>
518
- </ul></li>
519
- <li>FSDP &amp; DeepSpeed:
520
- <ul>
521
- <li>Split chunks of the model and optimizer states across GPUs, allowing for training bigger models on smaller (multiple) GPUs</li>
522
- </ul></li>
523
- </ul>
524
- </section></section>
525
- <section>
526
- <section id="fully-sharded-data-parallelism" class="title-slide slide level1 center">
527
- <h1>Fully Sharded Data Parallelism</h1>
528
-
529
- </section>
530
- <section id="fully-sharded-data-parallelism-1" class="slide level2">
531
- <h2>Fully Sharded Data Parallelism</h2>
532
-
533
- <img data-src="fsdp.png" id="fig-539a35d47e664c97a50115a146a7f1bd-1" class="r-stretch quarto-figure-center"><aside class="notes">
534
- <ul>
535
- <li>Take the model and split it across <code>n</code> GPUs</li>
536
- <li>Each GPU computes the shard’s gradients</li>
537
- <li>At the end, all gradients are synchronized and the final full model gradient is calculated</li>
538
- <li>The backward pass can then be performed</li>
539
- </ul>
540
- <style type="text/css">
541
- span.MJX_Assistive_MathML {
542
- position:absolute!important;
543
- clip: rect(1px, 1px, 1px, 1px);
544
- padding: 1px 0 0 0!important;
545
- border: 0!important;
546
- height: 1px!important;
547
- width: 1px!important;
548
- overflow: hidden!important;
549
- display:block!important;
550
- }</style></aside>
551
- </section>
552
- <section id="fsdp-getting-parameter-specific" class="slide level2">
553
- <h2>FSDP: Getting parameter specific</h2>
554
- <ul>
555
- <li>Different parameters can dictate how much memory is needed for total GPU training across multiple GPUs</li>
556
- <li>These include how model weights are sharded, gradients, and more.</li>
557
- <li>I’ll cover some important ones I needed when doing a Full-Fine-Tune of Llama-3-8B <em>without PEFT</em> on 2x4090’s</li>
558
- </ul>
559
- </section>
560
- <section id="sharding_strategy" class="slide level2">
561
- <h2><code>sharding_strategy</code></h2>
562
- <ul>
563
- <li>Dictates the level of divvying up resources to perform
564
- <ul>
565
- <li><code>FULL_SHARD</code>: Includes optimizer states, gradients, and parameters</li>
566
- <li><code>SHARD_GRAD_OP</code>: Includes optimizer states and gradients</li>
567
- <li><code>NO_SHARD</code>: Normal DDP</li>
568
- <li><code>HYBRID_SHARD</code>: Includes optimizer states, gradients, and parameters but each node has the full model</li>
569
- </ul>
570
- <aside class="notes">
571
- <pre><code>FULL_SHARD:
572
- Parameters, Gradients, Optimizer States: All are sharded.
573
- Parameters Handling: Unshard before forward pass, reshard after forward pass, unshard before backward pass, reshard after backward pass.
574
- Gradients Handling: Synchronize and shard after backward pass.
575
- Optimizer States: Updated locally per rank.</code></pre>
576
- <p>SHARD_GRAD_OP: Gradients and Optimizer States: Sharded during computation. Parameters: Unshard before forward pass, remain unsharded during forward pass, reshard after backward pass. Inside no_sync(): Parameters are not resharded after backward computation. Optimizer States: Updated locally per rank.</p>
577
- <p>NO_SHARD: Parameters, Gradients, Optimizer States: Not sharded, replicated across ranks. Gradients Handling: Synchronized via all-reduce after backward pass. Optimizer States: Updated locally per rank.</p>
578
- <p>HYBRID_SHARD: Parameters, Gradients, Optimizer States: Combines FULL_SHARD within a node and replicates parameters across nodes. Communication: Expensive operations like all-gathers and reduce-scatters are limited to within a node, enhancing performance for medium-sized models.</p>
579
- <style type="text/css">
580
- span.MJX_Assistive_MathML {
581
- position:absolute!important;
582
- clip: rect(1px, 1px, 1px, 1px);
583
- padding: 1px 0 0 0!important;
584
- border: 0!important;
585
- height: 1px!important;
586
- width: 1px!important;
587
- overflow: hidden!important;
588
- display:block!important;
589
- }</style></aside></li>
590
- </ul>
591
- </section>
592
- <section id="auto_wrap_policy" class="slide level2">
593
- <h2><code>auto_wrap_policy</code>:</h2>
594
- <ul>
595
- <li>How the model should be split</li>
596
- <li>Can be either <code>TRANSFORMER_BASED_WRAP</code> or <code>SIZE_BASED_WRAP</code></li>
597
- <li><code>TRANSFORMER</code>/<code>fsdp_transformers_layer_cls_to_wrap</code>:
598
- <ul>
599
- <li>Need to declare the layer</li>
600
- <li>Generally <code>transformers</code> has good defaults</li>
601
- </ul></li>
602
- <li><code>SIZE</code>/<code>fsdp_min_num_param</code>:
603
- <ul>
604
- <li>Number of total parameters in a shard</li>
605
- </ul></li>
606
- </ul>
607
- </section>
608
- <section id="offload_params" class="slide level2">
609
- <h2><code>offload_params</code>:</h2>
610
- <ul>
611
- <li>Offloads the parameters and gradients to the CPU if they can’t fit into memory</li>
612
- <li>Allows you to train much larger models locally, but will be much slower</li>
613
- </ul>
614
- <blockquote>
615
- <p>Case: FFT of Llama-3-8B with <code>fsdp_offload_params</code> on 2x4090 GPUs was 72hrs, vs ~an hour or two when using 1xH100</p>
616
- </blockquote>
617
- </section>
618
- <section id="cpu_ram_efficient_loading-and-sync_module_states" class="slide level2">
619
- <h2><code>cpu_ram_efficient_loading</code> and <code>sync_module_states</code></h2>
620
- <ul>
621
- <li>Uses the idea behind big model inference/the <code>meta</code> device to load in the model to the GPU in a low-ram scenario</li>
622
- <li>Rather than needing <code>model_size</code> * <code>n_gpus</code> RAM, we can load the model on a single node and then send the weights directly to each shard when the time is right via <code>sync_module_states</code></li>
623
- </ul>
624
- </section></section>
625
- <section>
626
- <section id="tying-this-to-accelerate" class="title-slide slide level1 center">
627
- <h1>Tying this to 🤗 Accelerate</h1>
628
-
629
- </section>
630
- <section id="tying-this-to-accelerate-1" class="slide level2">
631
- <h2>Tying this to 🤗 Accelerate</h2>
632
- <ul>
633
- <li>So far we’ve covered the theory, but how do we put it into practice?</li>
634
- <li>By using a library that’s at the heart of the entire open-source ecosystem</li>
635
- </ul>
636
- <div style="font-size: 60%;padding-left:10%;padding-top:0%;">
637
- <ul>
638
- <li>Nearly all of 🤗</li>
639
- <li><code>axolotl</code></li>
640
- <li><code>fastai</code></li>
641
- <li><code>FastChat</code></li>
642
- <li><code>lucidrains</code></li>
643
- <li><code>kornia</code></li>
644
- </ul>
645
- </div>
646
- <p>Are you using it and you don’t even know?</p>
647
- </section>
648
- <section id="what-is-accelerate" class="slide level2">
649
- <h2>What is 🤗 Accelerate?</h2>
650
- <div class="cell" data-reveal="true" data-fig-height="6">
651
- <div class="cell-output-display">
652
- <div>
653
- <div>
654
- <pre class="mermaid mermaid-js">graph LR
655
- A(("🤗 Accelerate#32;"))
656
- A --&gt; B["CLI Interface#32;"]
657
- A --&gt; C["Training Library#32;"]
658
- A --&gt; D["Big Model&lt;br&gt;Inference#32;"]
659
- </pre>
660
- </div>
661
- </div>
662
- </div>
663
- </div>
664
- </section>
665
- <section id="a-cli-interface" class="slide level2">
666
- <h2>A CLI Interface</h2>
667
- <ul>
668
- <li><code>accelerate config</code>
669
- <ul>
670
- <li>Configure the environment</li>
671
- </ul></li>
672
- <li><code>accelerate estimate-memory</code>
673
- <ul>
674
- <li>How to guess vRAM requirements</li>
675
- </ul></li>
676
- <li><code>accelerate launch</code>
677
- <ul>
678
- <li>How to run your script</li>
679
- </ul></li>
680
- </ul>
681
- </section>
682
- <section id="launching-distributed-training-is-hard" class="slide level2">
683
- <h2>Launching distributed training is hard</h2>
684
- <ul>
685
- <li><div class="sourceCode" id="cb2"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb2-1"><a href="#cb2-1"></a><span class="ex">python</span> script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
686
- <li><div class="sourceCode" id="cb3"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1"></a><span class="ex">torchrun</span> <span class="at">--nnodes</span><span class="op">=</span>1 <span class="at">--nproc_per_node</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
687
- <li><div class="sourceCode" id="cb4"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb4-1"><a href="#cb4-1"></a><span class="ex">deepspeed</span> <span class="at">--num_gpus</span><span class="op">=</span>2 script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
688
- </ul>
689
- <p>How can we make this better?</p>
690
- </section>
691
- <section id="accelerate-launch" class="slide level2">
692
- <h2><code>accelerate launch</code></h2>
693
- <div class="sourceCode" id="cb5"><pre class="sourceCode numberSource bash number-lines code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1"></a><span class="ex">accelerate</span> launch script.py</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
694
- </section>
695
- <section id="accelerate-config" class="slide level2">
696
- <h2><code>accelerate config</code></h2>
697
- <ul>
698
- <li>Rely on <code>config.yaml</code> files</li>
699
- <li>Choose to either run <code>accelerate config</code> or write your own:</li>
700
- </ul>
701
- <div class="columns" style="font-size: 50%;padding-left:10%;">
702
- <div class="column" style="width:40%;">
703
- <div class="code-with-filename">
704
- <div class="code-with-filename-file">
705
- <pre><strong>ddp_config.yaml</strong></pre>
706
- </div>
707
- <div class="sourceCode" id="cb6" data-filename="ddp_config.yaml"><pre class="sourceCode numberSource yaml number-lines code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
708
- <span id="cb6-2"><a href="#cb6-2"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> MULTI_GPU</span></span>
709
- <span id="cb6-3"><a href="#cb6-3"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
710
- <span id="cb6-4"><a href="#cb6-4"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
711
- <span id="cb6-5"><a href="#cb6-5"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
712
- <span id="cb6-6"><a href="#cb6-6"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
713
- </div>
714
- </div><div class="column" style="width:40%;">
715
- <div class="code-with-filename">
716
- <div class="code-with-filename-file">
717
- <pre><strong>fsdp_config.yaml</strong></pre>
718
- </div>
719
- <div class="sourceCode" id="cb7" data-filename="fsdp_config.yaml"><pre class="sourceCode numberSource yaml number-lines code-with-copy"><code class="sourceCode yaml"><span id="cb7-1"><a href="#cb7-1"></a><span class="fu">compute_environment</span><span class="kw">:</span><span class="at"> LOCAL_MACHINE</span></span>
720
- <span id="cb7-2"><a href="#cb7-2"></a><span class="fu">distributed_type</span><span class="kw">:</span><span class="at"> FSDP</span></span>
721
- <span id="cb7-3"><a href="#cb7-3"></a><span class="fu">fsdp_config</span><span class="kw">:</span></span>
722
- <span id="cb7-4"><a href="#cb7-4"></a><span class="at"> </span><span class="fu">fsdp_auto_wrap_policy</span><span class="kw">:</span><span class="at"> TRANSFORMER_BASED_WRAP</span></span>
723
- <span id="cb7-5"><a href="#cb7-5"></a><span class="at"> </span><span class="fu">fsdp_backward_prefetch</span><span class="kw">:</span><span class="at"> BACKWARD_PRE</span></span>
724
- <span id="cb7-6"><a href="#cb7-6"></a><span class="at"> </span><span class="fu">fsdp_cpu_ram_efficient_loading</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
725
- <span id="cb7-7"><a href="#cb7-7"></a><span class="at"> </span><span class="fu">fsdp_forward_prefetch</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
726
- <span id="cb7-8"><a href="#cb7-8"></a><span class="at"> </span><span class="fu">fsdp_offload_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
727
- <span id="cb7-9"><a href="#cb7-9"></a><span class="at"> </span><span class="fu">fsdp_sharding_strategy</span><span class="kw">:</span><span class="at"> FULL_SHARD</span></span>
728
- <span id="cb7-10"><a href="#cb7-10"></a><span class="at"> </span><span class="fu">fsdp_state_dict_type</span><span class="kw">:</span><span class="at"> SHARDED_STATE_DICT</span></span>
729
- <span id="cb7-11"><a href="#cb7-11"></a><span class="at"> </span><span class="fu">fsdp_sync_module_states</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
730
- <span id="cb7-12"><a href="#cb7-12"></a><span class="at"> </span><span class="fu">fsdp_use_orig_params</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
731
- <span id="cb7-13"><a href="#cb7-13"></a><span class="fu">main_training_function</span><span class="kw">:</span><span class="at"> main</span></span>
732
- <span id="cb7-14"><a href="#cb7-14"></a><span class="fu">mixed_precision</span><span class="kw">:</span><span class="at"> bf16</span></span>
733
- <span id="cb7-15"><a href="#cb7-15"></a><span class="fu">num_machines</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
734
- <span id="cb7-16"><a href="#cb7-16"></a><span class="fu">num_processes</span><span class="kw">:</span><span class="at"> </span><span class="dv">8</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
735
- </div>
736
- </div>
737
- </div>
738
- </section></section>
739
- <section>
740
- <section id="a-training-library" class="title-slide slide level1 center">
741
- <h1>A Training Library</h1>
742
-
743
- </section>
744
- <section id="a-training-library-the-code" class="slide level2">
745
- <h2>A Training Library: The Code</h2>
746
- <div class="columns" style="font-size: 50%;">
747
- <div class="column">
748
- <p><br><br><br></p>
749
- <div class="sourceCode" id="cb8" data-code-line-numbers="5-6,9"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1"></a><span class="co"># For alignment purposes</span></span>
750
- <span id="cb8-2"><a href="#cb8-2"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
751
- <span id="cb8-3"><a href="#cb8-3"></a> optimizer.zero_grad()</span>
752
- <span id="cb8-4"><a href="#cb8-4"></a> inputs, targets <span class="op">=</span> batch</span>
753
- <span id="cb8-5"><a href="#cb8-5"></a> inputs <span class="op">=</span> inputs.to(device)</span>
754
- <span id="cb8-6"><a href="#cb8-6"></a> targets <span class="op">=</span> targets.to(device)</span>
755
- <span id="cb8-7"><a href="#cb8-7"></a> outputs <span class="op">=</span> model(inputs)</span>
756
- <span id="cb8-8"><a href="#cb8-8"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
757
- <span id="cb8-9"><a href="#cb8-9"></a> loss.backward()</span>
758
- <span id="cb8-10"><a href="#cb8-10"></a> optimizer.step()</span>
759
- <span id="cb8-11"><a href="#cb8-11"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
760
- </div><div class="column">
761
- <div class="sourceCode" id="cb9" data-code-line-numbers="1-7,12-13,16"><pre class="sourceCode numberSource python number-lines code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1"></a><span class="im">from</span> accelerate <span class="im">import</span> Accelerator</span>
762
- <span id="cb9-2"><a href="#cb9-2"></a>accelerator <span class="op">=</span> Accelerator()</span>
763
- <span id="cb9-3"><a href="#cb9-3"></a>dataloader, model, optimizer, scheduler <span class="op">=</span> (</span>
764
- <span id="cb9-4"><a href="#cb9-4"></a> accelerator.prepare(</span>
765
- <span id="cb9-5"><a href="#cb9-5"></a> dataloader, model, optimizer, scheduler</span>
766
- <span id="cb9-6"><a href="#cb9-6"></a> )</span>
767
- <span id="cb9-7"><a href="#cb9-7"></a>)</span>
768
- <span id="cb9-8"><a href="#cb9-8"></a></span>
769
- <span id="cb9-9"><a href="#cb9-9"></a><span class="cf">for</span> batch <span class="kw">in</span> dataloader:</span>
770
- <span id="cb9-10"><a href="#cb9-10"></a> optimizer.zero_grad()</span>
771
- <span id="cb9-11"><a href="#cb9-11"></a> inputs, targets <span class="op">=</span> batch</span>
772
- <span id="cb9-12"><a href="#cb9-12"></a> <span class="co"># inputs = inputs.to(device)</span></span>
773
- <span id="cb9-13"><a href="#cb9-13"></a> <span class="co"># targets = targets.to(device)</span></span>
774
- <span id="cb9-14"><a href="#cb9-14"></a> outputs <span class="op">=</span> model(inputs)</span>
775
- <span id="cb9-15"><a href="#cb9-15"></a> loss <span class="op">=</span> loss_function(outputs, targets)</span>
776
- <span id="cb9-16"><a href="#cb9-16"></a> accelerator.backward(loss) <span class="co"># loss.backward()</span></span>
777
- <span id="cb9-17"><a href="#cb9-17"></a> optimizer.step()</span>
778
- <span id="cb9-18"><a href="#cb9-18"></a> scheduler.step()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
779
- </div>
780
- </div>
781
- </section>
782
- <section id="a-training-library-how-scaling-works" class="slide level2">
783
- <h2>A Training Library: How Scaling Works</h2>
784
- <ul>
785
- <li>Accelerate’s DataLoaders and schedulers work off of a sharding mindset</li>
786
- <li>Rather than repeating the same data across <code>n</code> nodes, we instead split it</li>
787
- <li>Speeds up training linearly</li>
788
- <li>Given a batch size of 16 on a single GPU, to recreate this across 8 GPUs you would use a batch size of 2</li>
789
- <li>This also means the scheduler will be stepped <code>n</code> GPUs at a time per “global step”</li>
790
- </ul>
791
- </section>
792
- <section id="a-training-library-mixed-precision" class="slide level2">
793
- <h2>A Training Library: Mixed Precision</h2>
794
- <ul>
795
- <li>This may be a bit different than your “normal” idea of mixed precision.</li>
796
- <li>We do <strong>not</strong> convert the model weights to BF16/FP16</li>
797
- <li>Instead we <strong>wrap the forward pass</strong> with <code>autocast</code> to convert the gradients automatically</li>
798
- <li>This preserves the original precision of the weights, which leads to stable training and better fine-tuning later on.</li>
799
- <li><strong>If you use <code>.bf16()</code> weights, you are STUCK in bf16 permanently</strong></li>
800
- </ul>
801
- </section>
802
- <section id="a-training-library-mixed-precision-1" class="slide level2">
803
- <h2>A Training Library: Mixed Precision</h2>
804
- <ul>
805
- <li>Let’s tie that back up to the model estimator with neat tools like NVIDIA’s TransformerEngine</li>
806
- </ul>
807
- <div style="font-size: 60%;">
808
- <table style="width:100%;">
809
- <colgroup>
810
- <col style="width: 14%">
811
- <col style="width: 14%">
812
- <col style="width: 14%">
813
- <col style="width: 14%">
814
- <col style="width: 14%">
815
- <col style="width: 14%">
816
- <col style="width: 14%">
817
- </colgroup>
818
- <thead>
819
- <tr class="header">
820
- <th>Optimization Level</th>
821
- <th>Computation (GEMM)</th>
822
- <th>Comm</th>
823
- <th>Weight</th>
824
- <th>Master Weight</th>
825
- <th>Weight Gradient</th>
826
- <th>Optimizer States</th>
827
- </tr>
828
- </thead>
829
- <tbody>
830
- <tr class="odd">
831
- <td>FP16 AMP</td>
832
- <td>FP16</td>
833
- <td>FP32</td>
834
- <td>FP32</td>
835
- <td>N/A</td>
836
- <td>FP32</td>
837
- <td>FP32+FP32</td>
838
- </tr>
839
- <tr class="even">
840
- <td>Nvidia TE</td>
841
- <td>FP8</td>
842
- <td>FP32</td>
843
- <td>FP32</td>
844
- <td>N/A</td>
845
- <td>FP32</td>
846
- <td>FP32+FP32</td>
847
- </tr>
848
- <tr class="odd">
849
- <td>MS-AMP O1</td>
850
- <td>FP8</td>
851
- <td>FP8</td>
852
- <td>FP16</td>
853
- <td>N/A</td>
854
- <td>FP8</td>
855
- <td>FP32+FP32</td>
856
- </tr>
857
- <tr class="even">
858
- <td>MS-AMP O2</td>
859
- <td>FP8</td>
860
- <td>FP8</td>
861
- <td>FP16</td>
862
- <td>N/A</td>
863
- <td>FP8</td>
864
- <td>FP8+FP16</td>
865
- </tr>
866
- <tr class="odd">
867
- <td>MS-AMP O3</td>
868
- <td>FP8</td>
869
- <td>FP8</td>
870
- <td>FP8</td>
871
- <td>FP16</td>
872
- <td>FP8</td>
873
- <td>FP8+FP16</td>
874
- </tr>
875
- </tbody>
876
- </table>
877
- </div>
878
- <aside class="notes">
879
- <p>What is actually happening:</p><ul><li>Linear Layers and other certain compatible layers are wrapped in a special version that allows for FP8 computation</li><li>The general forward pass is wrapped around BF16</li><li>This means that the most memory saved is done during the gradients of the model, <em>not</em> the model itself.</li><li>With tools like <code>MS-AMP</code> we can convert more chunks into lower precision, but again like before stable training occurs when the model’s weights are in full precision and the backprop happens in full precision too.</li></ul>
880
- <style type="text/css">
881
- span.MJX_Assistive_MathML {
882
- position:absolute!important;
883
- clip: rect(1px, 1px, 1px, 1px);
884
- padding: 1px 0 0 0!important;
885
- border: 0!important;
886
- height: 1px!important;
887
- width: 1px!important;
888
- overflow: hidden!important;
889
- display:block!important;
890
- }</style></aside>
891
- </section>
892
- <section id="deepspeed-vs-fully-sharded-data-parallelism" class="slide level2">
893
- <h2>DeepSpeed vs Fully Sharded Data Parallelism</h2>
894
- <ul>
895
- <li>Extremely similar; however, they mostly use different naming conventions for items and have slight tweaks in the implementation</li>
896
- </ul>
897
- <div style="font-size: 50%;">
898
- <table style="width:100%;">
899
- <colgroup>
900
- <col style="width: 16%">
901
- <col style="width: 16%">
902
- <col style="width: 16%">
903
- <col style="width: 16%">
904
- <col style="width: 16%">
905
- <col style="width: 16%">
906
- </colgroup>
907
- <thead>
908
- <tr class="header">
909
- <th>Framework</th>
910
- <th>Model Loading (<code>torch_dtype</code>)</th>
911
- <th>Mixed Precision</th>
912
- <th>Preparation (Local)</th>
913
- <th>Training</th>
914
- <th>Optimizer (Local)</th>
915
- </tr>
916
- </thead>
917
- <tbody>
918
- <tr class="odd">
919
- <td>FSDP</td>
920
- <td>bf16</td>
921
- <td>default (none)</td>
922
- <td>bf16</td>
923
- <td>bf16</td>
924
- <td>bf16</td>
925
- </tr>
926
- <tr class="even">
927
- <td>FSDP</td>
928
- <td>bf16</td>
929
- <td>bf16</td>
930
- <td>fp32</td>
931
- <td>bf16</td>
932
- <td>fp32</td>
933
- </tr>
934
- <tr class="odd">
935
- <td>DeepSpeed</td>
936
- <td>bf16</td>
937
- <td>bf16</td>
938
- <td>fp32</td>
939
- <td>bf16</td>
940
- <td>fp32</td>
941
- </tr>
942
- </tbody>
943
- </table>
944
- </div>
945
- <p>To learn more, check out the <a href="https://huggingface.co/docs/accelerate/concept_guides/fsdp_and_deepspeed">documentation</a> or join my office hours</p>
946
- </section>
947
- <section id="key-takeaways" class="slide level2">
948
- <h2>Key Takeaways:</h2>
949
- <ul>
950
- <li>You can scale out training with <code>accelerate</code>, FSDP, and DeepSpeed across multiple GPUs to train bigger models</li>
951
- <li>Techniques like <code>FP8</code> can help speed up training some and reduce computational overhead</li>
952
- <li>Comes at a cost of end-precision and locking model weights for further fine-tunes if not careful</li>
953
- </ul>
954
- </section>
955
- <section id="some-handy-resources" class="slide level2">
956
- <h2>Some Handy Resources</h2>
957
- <ul>
958
- <li><a href="https://hf.co/docs/accelerate">🤗 Accelerate documentation</a></li>
959
- <li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch">Launching distributed code</a></li>
960
- <li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/notebook">Distributed code and Jupyter Notebooks</a></li>
961
- <li><a href="https://huggingface.co/docs/accelerate/basic_tutorials/migration">Migrating to 🤗 Accelerate easily</a></li>
962
- <li><a href="https://huggingface.co/docs/accelerate/usage_guides/big_modeling">Big Model Inference tutorial</a></li>
963
- <li><a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed">DeepSpeed and 🤗 Accelerate</a></li>
964
- <li><a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp">Fully Sharded Data Parallelism and 🤗 Accelerate</a></li>
965
- <li><a href="https://huggingface.co/docs/accelerate/concept_guides/fsdp_and_deepspeed">FSDP vs DeepSpeed In-Depth</a></li>
966
- </ul>
967
- <div class="footer footer-default">
968
-
969
- </div>
970
- </section></section>
971
- </div>
972
- </div>
973
-
974
- <script>window.backupDefine = window.define; window.define = undefined;</script>
975
- <script src="llm_conf_files/libs/revealjs/dist/reveal.js"></script>
976
- <!-- reveal.js plugins -->
977
- <script src="llm_conf_files/libs/revealjs/plugin/quarto-line-highlight/line-highlight.js"></script>
978
- <script src="llm_conf_files/libs/revealjs/plugin/pdf-export/pdfexport.js"></script>
979
- <script src="llm_conf_files/libs/revealjs/plugin/reveal-menu/menu.js"></script>
980
- <script src="llm_conf_files/libs/revealjs/plugin/reveal-menu/quarto-menu.js"></script>
981
- <script src="llm_conf_files/libs/revealjs/plugin/quarto-support/support.js"></script>
982
-
983
-
984
- <script src="llm_conf_files/libs/revealjs/plugin/notes/notes.js"></script>
985
- <script src="llm_conf_files/libs/revealjs/plugin/search/search.js"></script>
986
- <script src="llm_conf_files/libs/revealjs/plugin/zoom/zoom.js"></script>
987
- <script src="llm_conf_files/libs/revealjs/plugin/math/math.js"></script>
988
- <script>window.define = window.backupDefine; window.backupDefine = undefined;</script>
989
-
990
- <script>
991
-
992
- // Full list of configuration options available at:
993
- // https://revealjs.com/config/
994
- Reveal.initialize({
995
- 'controlsAuto': true,
996
- 'previewLinksAuto': false,
997
- 'pdfSeparateFragments': false,
998
- 'autoAnimateEasing': "ease",
999
- 'autoAnimateDuration': 1,
1000
- 'autoAnimateUnmatched': true,
1001
- 'menu': {"side":"left","useTextContentForMissingTitles":true,"markers":false,"loadIcons":false,"custom":[{"title":"Tools","icon":"<i class=\"fas fa-gear\"></i>","content":"<ul class=\"slide-menu-items\">\n<li class=\"slide-tool-item active\" data-item=\"0\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.fullscreen(event)\"><kbd>f</kbd> Fullscreen</a></li>\n<li class=\"slide-tool-item\" data-item=\"1\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.speakerMode(event)\"><kbd>s</kbd> Speaker View</a></li>\n<li class=\"slide-tool-item\" data-item=\"2\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.overview(event)\"><kbd>o</kbd> Slide Overview</a></li>\n<li class=\"slide-tool-item\" data-item=\"3\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.togglePdfExport(event)\"><kbd>e</kbd> PDF Export Mode</a></li>\n<li class=\"slide-tool-item\" data-item=\"4\"><a href=\"#\" onclick=\"RevealMenuToolHandlers.keyboardHelp(event)\"><kbd>?</kbd> Keyboard Help</a></li>\n</ul>"}],"openButton":true},
1002
- 'smaller': false,
1003
-
1004
- // Display controls in the bottom right corner
1005
- controls: false,
1006
-
1007
- // Help the user learn the controls by providing hints, for example by
1008
- // bouncing the down arrow when they first encounter a vertical slide
1009
- controlsTutorial: false,
1010
-
1011
- // Determines where controls appear, "edges" or "bottom-right"
1012
- controlsLayout: 'edges',
1013
-
1014
- // Visibility rule for backwards navigation arrows; "faded", "hidden"
1015
- // or "visible"
1016
- controlsBackArrows: 'faded',
1017
-
1018
- // Display a presentation progress bar
1019
- progress: true,
1020
-
1021
- // Display the page number of the current slide
1022
- slideNumber: false,
1023
-
1024
- // 'all', 'print', or 'speaker'
1025
- showSlideNumber: 'all',
1026
-
1027
- // Add the current slide number to the URL hash so that reloading the
1028
- // page/copying the URL will return you to the same slide
1029
- hash: true,
1030
-
1031
- // Start with 1 for the hash rather than 0
1032
- hashOneBasedIndex: false,
1033
-
1034
- // Flags if we should monitor the hash and change slides accordingly
1035
- respondToHashChanges: true,
1036
-
1037
- // Push each slide change to the browser history
1038
- history: true,
1039
-
1040
- // Enable keyboard shortcuts for navigation
1041
- keyboard: true,
1042
-
1043
- // Enable the slide overview mode
1044
- overview: true,
1045
-
1046
- // Disables the default reveal.js slide layout (scaling and centering)
1047
- // so that you can use custom CSS layout
1048
- disableLayout: false,
1049
-
1050
- // Vertical centering of slides
1051
- center: false,
1052
-
1053
- // Enables touch navigation on devices with touch input
1054
- touch: true,
1055
-
1056
- // Loop the presentation
1057
- loop: false,
1058
-
1059
- // Change the presentation direction to be RTL
1060
- rtl: false,
1061
-
1062
- // see https://revealjs.com/vertical-slides/#navigation-mode
1063
- navigationMode: 'linear',
1064
-
1065
- // Randomizes the order of slides each time the presentation loads
1066
- shuffle: false,
1067
-
1068
- // Turns fragments on and off globally
1069
- fragments: true,
1070
-
1071
- // Flags whether to include the current fragment in the URL,
1072
- // so that reloading brings you to the same fragment position
1073
- fragmentInURL: false,
1074
-
1075
- // Flags if the presentation is running in an embedded mode,
1076
- // i.e. contained within a limited portion of the screen
1077
- embedded: false,
1078
-
1079
- // Flags if we should show a help overlay when the questionmark
1080
- // key is pressed
1081
- help: true,
1082
-
1083
- // Flags if it should be possible to pause the presentation (blackout)
1084
- pause: true,
1085
-
1086
- // Flags if speaker notes should be visible to all viewers
1087
- showNotes: false,
1088
-
1089
- // Global override for autoplaying embedded media (null/true/false)
1090
- autoPlayMedia: null,
1091
-
1092
- // Global override for preloading lazy-loaded iframes (null/true/false)
1093
- preloadIframes: null,
1094
-
1095
- // Number of milliseconds between automatically proceeding to the
1096
- // next slide, disabled when set to 0, this value can be overwritten
1097
- // by using a data-autoslide attribute on your slides
1098
- autoSlide: 0,
1099
-
1100
- // Stop auto-sliding after user input
1101
- autoSlideStoppable: true,
1102
-
1103
- // Use this method for navigation when auto-sliding
1104
- autoSlideMethod: null,
1105
-
1106
- // Specify the average time in seconds that you think you will spend
1107
- // presenting each slide. This is used to show a pacing timer in the
1108
- // speaker view
1109
- defaultTiming: null,
1110
-
1111
- // Enable slide navigation via mouse wheel
1112
- mouseWheel: false,
1113
-
1114
- // The display mode that will be used to show slides
1115
- display: 'block',
1116
-
1117
- // Hide cursor if inactive
1118
- hideInactiveCursor: true,
1119
-
1120
- // Time before the cursor is hidden (in ms)
1121
- hideCursorTime: 5000,
1122
-
1123
- // Opens links in an iframe preview overlay
1124
- previewLinks: false,
1125
-
1126
- // Transition style (none/fade/slide/convex/concave/zoom)
1127
- transition: 'none',
1128
-
1129
- // Transition speed (default/fast/slow)
1130
- transitionSpeed: 'default',
1131
-
1132
- // Transition style for full page slide backgrounds
1133
- // (none/fade/slide/convex/concave/zoom)
1134
- backgroundTransition: 'none',
1135
-
1136
- // Number of slides away from the current that are visible
1137
- viewDistance: 3,
1138
-
1139
- // Number of slides away from the current that are visible on mobile
1140
- // devices. It is advisable to set this to a lower number than
1141
- // viewDistance in order to save resources.
1142
- mobileViewDistance: 2,
1143
-
1144
- // The "normal" size of the presentation, aspect ratio will be preserved
1145
- // when the presentation is scaled to fit different resolutions. Can be
1146
- // specified using percentage units.
1147
- width: 1050,
1148
-
1149
- height: 700,
1150
-
1151
- // Factor of the display size that should remain empty around the content
1152
- margin: 0.1,
1153
-
1154
- math: {
1155
- mathjax: 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js',
1156
- config: 'TeX-AMS_HTML-full',
1157
- tex2jax: {
1158
- inlineMath: [['\\(','\\)']],
1159
- displayMath: [['\\[','\\]']],
1160
- balanceBraces: true,
1161
- processEscapes: false,
1162
- processRefs: true,
1163
- processEnvironments: true,
1164
- preview: 'TeX',
1165
- skipTags: ['script','noscript','style','textarea','pre','code'],
1166
- ignoreClass: 'tex2jax_ignore',
1167
- processClass: 'tex2jax_process'
1168
- },
1169
- },
1170
-
1171
- // reveal.js plugins
1172
- plugins: [QuartoLineHighlight, PdfExport, RevealMenu, QuartoSupport,
1173
-
1174
- RevealMath,
1175
- RevealNotes,
1176
- RevealSearch,
1177
- RevealZoom
1178
- ]
1179
- });
1180
- </script>
1181
- <script id="quarto-html-after-body" type="application/javascript">
1182
- window.document.addEventListener("DOMContentLoaded", function (event) {
1183
/**
 * Mirror a stylesheet element's color mode onto <body>.
 *
 * Reads the element's `data-mode` attribute. When it is exactly "dark",
 * <body> gains `quarto-dark` and loses `quarto-light`; for any other value
 * (including a missing attribute) the two classes are swapped the other way.
 *
 * @param {Element} bsSheetEl - element carrying the `data-mode` attribute.
 */
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const { classList } = window.document.querySelector("body");
  // Add the active class first, then drop the opposite one.
  classList.add(isDark ? "quarto-dark" : "quarto-light");
  classList.remove(isDark ? "quarto-light" : "quarto-dark");
};
1194
/**
 * Apply the color mode of the primary stylesheet (`link#quarto-bootstrap`)
 * to <body>. Does nothing when that <link> element is not in the document.
 */
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
};
1200
// Sync the <body> color-mode classes with the primary stylesheet.
toggleBodyColorPrimary();

// Create a Tabby instance for every tabset panel, addressed by its id.
const tabsets = window.document.querySelectorAll(".panel-tabset-tabby");
tabsets.forEach((tabset) => {
  // A tabset without an id would yield the bare selector "#", which is not a
  // valid CSS selector; skip it so one bad panel cannot abort the rest of
  // this handler. NOTE(review): assumes Tabby resolves its argument as a
  // selector string - confirm against the bundled tabby.min.js.
  if (!tabset.id) {
    return;
  }
  // The instance is not needed afterwards; construction alone wires the tabs.
  new Tabby(`#${tabset.id}`);
});
1205
/**
 * Tell whether an element is a code-annotation marker.
 *
 * @param {Element} el - element whose class list is inspected.
 * @returns {boolean} true when at least one class name starts with
 *   `code-annotation-`, false otherwise.
 */
const isCodeAnnotation = (el) =>
  Array.from(el.classList).some((className) => className.startsWith('code-annotation-'));
1213
// Copy-to-clipboard for code blocks: each `.code-copy-button` copies the text
// of the element immediately preceding it, minus any code-annotation markers.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  text: function(trigger) {
    // Work on a detached deep copy so the rendered code block is untouched.
    const codeEl = trigger.previousElementSibling.cloneNode(true);
    // `children` is a live collection: removing an element while iterating it
    // directly shifts the remaining items and skips the next sibling, so two
    // adjacent annotations would leave the second one in the copied text.
    // Iterate over a static snapshot instead.
    for (const childEl of Array.from(codeEl.children)) {
      if (isCodeAnnotation(childEl)) {
        childEl.remove();
      }
    }
    return codeEl.innerText;
  }
});
1224
// Per-button bookkeeping while the "Copied!" feedback is showing: the title
// to restore and how many reset timers are still outstanding. Without it, a
// second copy within the one-second window would capture "Copied!" as the
// title to restore and leave the button stuck on it.
const copiedState = new WeakMap();

// After a successful copy, flash a "Copied!" state on the button for one
// second (class, title and, when Bootstrap is loaded, a tooltip), then
// restore the button and clear the text selection.
clipboard.on('success', function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // flash "checked"
  button.classList.add('code-copy-button-checked');

  // Remember the real title only on the first copy of an overlapping burst.
  let state = copiedState.get(button);
  if (!state) {
    state = { title: button.getAttribute("title"), pending: 0 };
    copiedState.set(button, state);
  }
  state.pending += 1;
  button.setAttribute("title", "Copied!");

  let tooltip;
  if (window.bootstrap) {
    button.setAttribute("data-bs-toggle", "tooltip");
    button.setAttribute("data-bs-placement", "left");
    button.setAttribute("data-bs-title", "Copied!");
    tooltip = new bootstrap.Tooltip(button,
      { trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]});
    tooltip.show();
  }

  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    // setAttribute stringifies its value, so restoring a missing title with
    // setAttribute would leave the literal title "null"; remove it instead.
    if (state.title === null) {
      button.removeAttribute("title");
    } else {
      button.setAttribute("title", state.title);
    }
    button.classList.remove('code-copy-button-checked');
    state.pending -= 1;
    if (state.pending === 0) {
      copiedState.delete(button);
    }
  }, 1000);

  // clear code selection
  e.clearSelection();
});
1257
/**
 * Attach a hover tooltip to `el` through `window.tippy`.
 *
 * The tooltip allows HTML, is interactive, uses the `light-border` theme, is
 * placed `bottom-start` with a zero offset and a max width of 700, and is
 * appended to the closest `section.slide` ancestor of the reference element
 * (or to its parent element when there is none).
 *
 * @param {Element} el - element that triggers the tooltip.
 * @param {Function} [contentFn] - passed as tippy's `content` when truthy.
 * @param {Function} [onTriggerFn] - passed as tippy's `onTrigger` when truthy.
 * @param {Function} [onUntriggerFn] - passed as tippy's `onUntrigger` when truthy.
 */
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const options = {
    allowHTML: true,
    maxWidth: 700,
    delay: 100,
    arrow: false,
    appendTo: (reference) => reference.closest('section.slide') || reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'light-border',
    placement: 'bottom-start',
  };

  // Only forward the hooks the caller actually supplied.
  const optionalHooks = {
    content: contentFn,
    onTrigger: onTriggerFn,
    onUntrigger: onUntriggerFn,
  };
  for (const [key, hook] of Object.entries(optionalHooks)) {
    if (hook) {
      options[key] = hook;
    }
  }

  options.offset = [0, 0];
  window.tippy(el, options);
}
1284
// Footnote references: hovering a `doc-noteref` link shows the HTML of the
// footnote it points to.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, function() {
    // use id or data attribute instead here
    // Fall back to '' so a link with neither attribute cannot crash below.
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href') || '';
    // Absolute URLs are reduced to their hash; `new URL` throws on a bare
    // fragment such as "#fn1", in which case the href is used as-is.
    try { href = new URL(href).hash; } catch {}
    // Drop the leading "#" or "#/" to get the target element id.
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // getElementById returns null for an unknown id; show an empty tooltip
    // rather than throwing a TypeError from inside the content callback.
    return note ? note.innerHTML : "";
  });
}
1296
/**
 * Walk up from `el` until an ancestor with a non-empty `data-cites`
 * attribute is found.
 *
 * @param {Element} el - starting element.
 * @returns {{el: Element, cites: string[]}|undefined} the child of the
 *   ancestor that carries `data-cites` (on the path from `el`), together with
 *   the attribute value split on single spaces; `undefined` when no ancestor
 *   has it.
 */
const findCites = (el) => {
  let node = el;
  while (node.parentElement) {
    const cites = node.parentElement.dataset.cites;
    if (cites) {
      return { el: node, cites: cites.split(' ') };
    }
    node = node.parentElement;
  }
  return undefined;
};
1312
// Citation references: hovering a `doc-biblioref` link shows the bibliography
// entries (`#ref-<cite>`) for every key listed in the nearest ancestor's
// `data-cites` attribute.
const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
bibliorefs.forEach((ref) => {
  const citeInfo = findCites(ref);
  if (!citeInfo) {
    return;
  }
  tippyHover(citeInfo.el, () => {
    // Assemble the entries in a detached container and return its markup.
    const popup = window.document.createElement('div');
    for (const cite of citeInfo.cites) {
      const entry = window.document.createElement('div');
      entry.classList.add('hanging-indent', 'csl-entry');
      const source = window.document.getElementById('ref-' + cite);
      // An unknown key still gets an (empty) entry div.
      if (source) {
        entry.innerHTML = source.innerHTML;
      }
      popup.appendChild(entry);
    }
    return popup.innerHTML;
  });
});
1333
- });
1334
- </script>
1335
-
1336
-
1337
- </body></html>