File size: 14,143 Bytes
63cf97c
e05e505
 
 
63cf97c
 
 
 
 
 
d5ec1fb
 
63cf97c
 
 
 
b04b6bd
1d81d95
 
 
 
 
 
 
 
 
 
 
 
e05e505
1d81d95
e05e505
1d81d95
e05e505
 
1d81d95
e05e505
1d81d95
e05e505
 
1d81d95
e05e505
1d81d95
e05e505
 
1d81d95
e05e505
1d81d95
e05e505
 
1d81d95
e05e505
1d81d95
e05e505
 
1d81d95
e05e505
1d81d95
e05e505
bdda27d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34f042c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63cf97c
 
 
 
25f8f89
 
 
 
ba3e462
454168b
ba3e462
 
09901c4
cd493ca
63cf97c
205ea6f
 
 
 
454168b
205ea6f
ba3e462
205ea6f
454168b
 
 
 
 
 
ba3e462
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454168b
 
 
 
205ea6f
819b3be
63cf97c
 
 
454168b
63cf97c
 
 
25f8f89
63cf97c
25f8f89
63cf97c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454168b
1bb84f7
1217a66
ba3e462
1bb84f7
 
63cf97c
 
 
1bb84f7
1217a66
ba3e462
25f8f89
6d4035b
1217a66
6d4035b
 
25f8f89
6d4035b
25f8f89
6d4035b
25f8f89
 
6d4035b
25f8f89
1217a66
 
 
25f8f89
 
 
1217a66
 
25f8f89
 
 
1217a66
 
 
6d4035b
1217a66
ba3e462
1217a66
454168b
63cf97c
819b3be
 
 
63cf97c
ba3e462
09901c4
ba3e462
454168b
ba3e462
63cf97c
819b3be
63cf97c
454168b
09901c4
ba3e462
 
 
63cf97c
819b3be
63cf97c
 
 
ba3e462
63cf97c
454168b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34f99d2
 
454168b
63cf97c
 
 
 
 
 
 
984a13e
 
 
 
ba3e462
 
 
984a13e
adc4997
 
 
 
ba3e462
0e751b5
ba3e462
0e751b5
 
 
 
 
 
 
 
 
 
3845156
 
 
0e751b5
ba3e462
 
3845156
0e751b5
 
ba3e462
0e751b5
ba3e462
0e751b5
454168b
63cf97c
 
 
 
 
ba3e462
63cf97c
 
 
 
454168b
 
 
 
 
 
 
 
 
 
 
 
 
63cf97c
4c06e3c
63cf97c
 
 
454168b
 
 
 
63cf97c
454168b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
---
license:
- apache-2.0
- bsd-3-clause
tags:
- summarization
- summary
- booksum
- long-document
- long-form
- tglobal-xl
- XL
datasets:
- kmfoda/booksum
metrics:
- rouge
inference: false
model-index:
- name: pszemraj/long-t5-tglobal-xl-16384-book-summary
  results:
  - task:
      type: summarization
      name: Summarization
    dataset:
      name: multi_news
      type: multi_news
      config: default
      split: test
    metrics:
    - type: rouge
      value: 36.2043
      name: ROUGE-1
      verified: true
      verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiYzRmMmUyOTVjMmJmZTRiZDcyYzY3MTQ1MmUyNDA5NjVhYzEzYzBiNzcxYTRhMDQ3OTlhMGZjYmJlNDM1M2NjYyIsInZlcnNpb24iOjF9._uArOQ1_0znXDPXMq7unA1OHB-XbgqzzKRbFRcVUzTUJdWk26LiSa2pEEVNNmJPg6Uo7CAvONmhpEswLvl9TAg
    - type: rouge
      value: 8.424
      name: ROUGE-2
      verified: true
      verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiNzg0MzljYjVjYWQ3MmRkZDBlOGI5M2RiMGU0M2UwZGUzMDg2NTU0NjcwMTNiN2ZmODEzNTQ0MmEwNDA3NDA5MSIsInZlcnNpb24iOjF9.Dzj85ld6TjosQ8KyUdoadzicMLedEFrICC6Q-08O3qx28d9B9Uke1zw-VWabiuesPEDTRGbWuBgPA5vxYWUZAw
    - type: rouge
      value: 17.3721
      name: ROUGE-L
      verified: true
      verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiNDA3ZjZmODAwMTNlM2RlZmJlMDI5MGVkMGRkMTBjMTYzNDk5ZjFiNTY5MWE1MDUwNWI2MDE4ZDA2YWMwMmI2NCIsInZlcnNpb24iOjF9.MOV_nId0XAK1eMQssG5GN9DsitZaTrxl4jdCJnOg9EZ0-vAw227ln599YV5YfZ1OPJnWwek6rneqqyONiHn9AQ
    - type: rouge
      value: 32.3994
      name: ROUGE-LSUM
      verified: true
      verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiZmY3MDMwOTZjNWI0YTk1MDgwMzJkYTFiN2U5YWU0Mzc0MWRiMzc1NzZlMDhjMWUwMmY2ODI2MjI5ODBkYWUxOSIsInZlcnNpb24iOjF9._BwGIZbcA4pUBkEAL0cW-JPPta0KSoGug4Z7vogHacUz-AEhIOI5ICUldZh0pt9OK67MpUSzpShJOu3rSt5YDQ
    - type: loss
      value: 2.0843334197998047
      name: loss
      verified: true
      verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiOWFhMmE5ZjA3ODM4YmVjMDMyMjk5YjNlMjA1MGMzOWY0NTRlYzk1YjZiMzQxMDMxOTMwMjFkNTdmNjM1NDcyMyIsInZlcnNpb24iOjF9.3wbXV4CIIgnfXAnnRztdOR12PwsWsEfiglQQ09K-C1EgW4gai4x9l-wTE2OZ7CTWkuk_tr4tL_uqOCXLZRMtCQ
    - type: gen_len
      value: 248.3572
      name: gen_len
      verified: true
      verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiMWZhOGMwMDJjNGU2MzA2YzI1OWU1ZDY5N2NjZmM1YTA5NDg1MzUwNmU1YTBhNjQyNWYwYzA3OGNmODFjMmE2NSIsInZlcnNpb24iOjF9.Rc9u89zCdbFnjsnmq65l_JvCtUwOX_ZWapKJpTZ-rC8HxcUVfi2Ash2QfvvvxHH_YWhwklxxdnNa0HCm46qLAA
  - task:
      type: summarization
      name: Summarization
    dataset:
      name: billsum
      type: billsum
      config: default
      split: test
    metrics:
    - name: ROUGE-1
      type: rouge
      value: 41.3645
      verified: true
    - name: ROUGE-2
      type: rouge
      value: 16.144
      verified: true
    - name: ROUGE-L
      type: rouge
      value: 24.2981
      verified: true
    - name: ROUGE-LSUM
      type: rouge
      value: 35.3234
      verified: true
    - name: loss
      type: loss
      value: 1.282260775566101
      verified: true
    - name: gen_len
      type: gen_len
      value: 291.8158
      verified: true
  - task:
      type: summarization
      name: Summarization
    dataset:
      name: ccdv/arxiv-summarization
      type: ccdv/arxiv-summarization
      config: document
      split: test
    metrics:
    - name: ROUGE-1
      type: rouge
      value: 36.3225
      verified: true
    - name: ROUGE-2
      type: rouge
      value: 9.3743
      verified: true
    - name: ROUGE-L
      type: rouge
      value: 19.8396
      verified: true
    - name: ROUGE-LSUM
      type: rouge
      value: 32.2532
      verified: true
    - name: loss
      type: loss
      value: 2.146871566772461
      verified: true
    - name: gen_len
      type: gen_len
      value: 186.2966
      verified: true
---

# long-t5-tglobal-xl + BookSum

<a href="https://colab.research.google.com/gist/pszemraj/c19e32baf876deb866c31cd46c86e893/long-t5-xl-accelerate-test.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Summarize long text and get a SparkNotes-like summary of any topic!

- Generalizes reasonably well to academic & narrative text.
- This is the XL checkpoint, which **produces even better summaries [from a human evaluation perspective](https://long-t5-xl-book-summary-examples.netlify.app/)**.

A simple example/use case with [the base model](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) on ASR is [here](https://longt5-booksum-example.netlify.app/).

## Cheeky Proof-of-Concept

A summary of the [infamous navy seals copypasta](https://knowyourmeme.com/memes/navy-seal-copypasta):

> In this chapter, the monster explains how he intends to exact revenge on "the little b\*\*\*\*" who insulted him. He tells the kiddo that he is a highly trained and experienced killer who will use his arsenal of weapons--including his access to the internet--to exact justice on the little brat.

While this is a crude example, try running this copypasta through other summarization models to see the difference in comprehension (_even though it's not even a "long" text!_).

* * *

**Contents**

<!-- TOC -->

- [Description](#description)
- [How-To in Python](#how-to-in-python)
    - [Beyond the basics](#beyond-the-basics)
        - [Adjusting parameters](#adjusting-parameters)
        - [LLM.int8 Quantization](#llmint8-quantization)
- [About](#about)
    - [Intended uses & limitations](#intended-uses--limitations)
    - [Training and evaluation data](#training-and-evaluation-data)
    - [Eval results](#eval-results)
- [FAQ](#faq)
    - [How can I run inference with this on CPU?](#how-can-i-run-inference-with-this-on-cpu)
    - [How to run inference over a very long (30k+ tokens) document in batches?](#how-to-run-inference-over-a-very-long-30k-tokens-document-in-batches)
    - [How to fine-tune further?](#how-to-fine-tune-further)
    - [Are there simpler ways to run this?](#are-there-simpler-ways-to-run-this)
- [Training procedure](#training-procedure)
    - [Updates](#updates)
    - [Training hyperparameters](#training-hyperparameters)
    - [Framework versions](#framework-versions)

<!-- /TOC -->

* * *

## Description

A fine-tuned version of [google/long-t5-tglobal-xl](https://huggingface.co/google/long-t5-tglobal-xl) on the `kmfoda/booksum` dataset.

Read the paper by Guo et al. here: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/pdf/2112.07916.pdf)

## How-To in Python

Install or update `transformers` with `pip install -U transformers`.

Then summarize text with a `pipeline`:

```python
import torch
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    "pszemraj/long-t5-tglobal-xl-16384-book-summary",
    device=0 if torch.cuda.is_available() else -1,
)
long_text = "Here is a lot of text I don't want to read. Replace me"

result = summarizer(long_text)
print(result[0]["summary_text"])
```

### Beyond the basics

There are two additional points to consider beyond simple inference: adjusting decoding parameters for improved performance, and quantization for reduced memory consumption.

#### Adjusting parameters

Pass [other parameters related to beam search textgen](https://huggingface.co/blog/how-to-generate) when calling `summarizer` to get even higher quality results.

#### LLM.int8 Quantization

> alternative section title: how to get this monster to run inference on free colab runtimes

Via [this PR](https://github.com/huggingface/transformers/pull/20341) LLM.int8 is now supported for `long-t5` models. 

- per **initial tests** the summarization quality seems to hold while using _significantly_ less memory! \*
- a version of this model quantized to int8 is [already on the hub here](https://huggingface.co/pszemraj/long-t5-tglobal-xl-16384-book-summary-8bit), so if you're using the 8-bit version anyway, you can start there and download only 3.5 GB!

First, make sure you have the latest versions of the relevant packages:
```bash
pip install -U transformers bitsandbytes accelerate
```

load in 8-bit (_magic completed by `bitsandbytes` behind the scenes_)

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(
    "pszemraj/long-t5-tglobal-xl-16384-book-summary"
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    "pszemraj/long-t5-tglobal-xl-16384-book-summary",
    load_in_8bit=True,
    device_map="auto",
)
```

The above is already present in the Colab demo linked at the top of the model card.

\* More rigorous metrics-based research comparing beam-search summarization with and without LLM.int8 will take place over time.

* * *

## About

### Intended uses & limitations

While this model seems to improve factual consistency, **don't take summaries as foolproof and check things that seem odd**.

Specifically, watch for errors in negation statements (e.g., the model says _this thing does not have [ATTRIBUTE]_ when it should have said _this thing has lots of [ATTRIBUTE]_).

- I'm sure someone will write a paper on this eventually (if there isn't one already), but you can usually check this by comparing a particular statement with what the surrounding sentences imply.

### Training and evaluation data

`kmfoda/booksum` dataset on HuggingFace - read [the original paper here](https://arxiv.org/abs/2105.08209).

- For **initial fine-tuning**, only input text with 12288 input tokens or less and 1024 output tokens or less was used (_i.e. lines longer than that were dropped before training_) for memory reasons. After a quick analysis, examples with inputs in the 12288-16384 token range are a **small** minority in this dataset.
    - In addition, this initial training combined the training and validation sets and trained on them in aggregate to increase the functional dataset size. **Therefore, take the validation set results with a grain of salt; primary metrics should (always) be the test set.**
- The **final stages of fine-tuning** used the standard 16384 input/1024 output conventions, preserving the standard in/out lengths (_and truncating longer sequences_). This did not seem to change the loss/performance much.

### Eval results

Official results with the [model evaluator](https://huggingface.co/spaces/autoevaluate/model-evaluator) will be computed and posted here.

**Please read the note above: due to the training methods, performance on the validation set looks better than performance on the test set will be**. The model achieves the following results on the evaluation (validation) set:

-   eval_loss: 1.2756
-   eval_rouge1: 41.8013
-   eval_rouge2: 12.0895
-   eval_rougeL: 21.6007
-   eval_rougeLsum: 39.5382
-   eval_gen_len: 387.2945
-   eval_runtime: 13908.4995
-   eval_samples_per_second: 0.107
-   eval_steps_per_second: 0.027


    ***** predict/test metrics (initial) *****
      predict_gen_len            =   506.4368
      predict_loss               =      2.028
      predict_rouge1             =    36.8815
      predict_rouge2             =     8.0625
      predict_rougeL             =    17.6161
      predict_rougeLsum          =    34.9068
      predict_runtime            = 2:04:14.37
      predict_samples            =       1431
      predict_samples_per_second =      0.192
      predict_steps_per_second   =      0.048

\* Evaluating a model this large is not as easy as it seems; further investigation is in progress.

* * *

## FAQ

### How can I run inference with this on CPU?

lol

### How to run inference over a very long (30k+ tokens) document in batches?

See `summarize.py` in [the code for my hf space Document Summarization](https://huggingface.co/spaces/pszemraj/document-summarization/blob/main/summarize.py) :)

You can also use the same code to split a document into batches of 4096, etc., and iterate over them with the model. This is useful in situations where CUDA memory is limited.

**Update:** see the section on the `textsum` package below.

### How to fine-tune further?

See [train with a script](https://huggingface.co/docs/transformers/run_scripts) and [the summarization scripts](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization)

### Are there simpler ways to run this?

Yes. To make running this model simpler, I created a Python package utility. It's called [textsum](https://github.com/pszemraj/textsum), and you can use it to load models and summarize things in a few lines of code.

```sh
pip install textsum
```

Use `textsum` in python with this model:

```python
from textsum.summarize import Summarizer

summarizer = Summarizer(
    model_name_or_path="pszemraj/long-t5-tglobal-xl-16384-book-summary"
)

long_string = "This is a long string of text that will be summarized."
out_str = summarizer.summarize_string(long_string)
print(f"summary: {out_str}")
```

This package provides easy-to-use interfaces for applying summarization models to text documents of arbitrary length. Currently implemented interfaces include a Python API, a CLI, and a shareable demo application.

For details, explanations, and documentation, see the README (_linked above_) or the [wiki](https://github.com/pszemraj/textsum/wiki).

* * *

## Training procedure

### Updates

Updates to this model/model card will be posted here when relevant. The model seems to be fairly converged; if updates/improvements are possible using the `BookSum` dataset, this repo will be updated.

### Training hyperparameters

The following hyperparameters were used during training:

-   learning_rate: 0.0006
-   train_batch_size: 1
-   eval_batch_size: 1
-   seed: 10350
-   distributed_type: multi-GPU
-   num_devices: 4
-   gradient_accumulation_steps: 32
-   total_train_batch_size: 128
-   total_eval_batch_size: 4
-   optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
-   lr_scheduler_type: constant
-   num_epochs: 1.0

\*_Prior training sessions used roughly similar parameters (learning rates were higher); multiple sessions were required as this takes eons to train._

### Framework versions

-   Transformers 4.25.0.dev0
-   Pytorch 1.13.0+cu117
-   Datasets 2.6.1
-   Tokenizers 0.13.1

* * *