File size: 14,143 Bytes
63cf97c e05e505 63cf97c d5ec1fb 63cf97c b04b6bd 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 1d81d95 e05e505 bdda27d 34f042c 63cf97c 25f8f89 ba3e462 454168b ba3e462 09901c4 cd493ca 63cf97c 205ea6f 454168b 205ea6f ba3e462 205ea6f 454168b ba3e462 454168b 205ea6f 819b3be 63cf97c 454168b 63cf97c 25f8f89 63cf97c 25f8f89 63cf97c 454168b 1bb84f7 1217a66 ba3e462 1bb84f7 63cf97c 1bb84f7 1217a66 ba3e462 25f8f89 6d4035b 1217a66 6d4035b 25f8f89 6d4035b 25f8f89 6d4035b 25f8f89 6d4035b 25f8f89 1217a66 25f8f89 1217a66 25f8f89 1217a66 6d4035b 1217a66 ba3e462 1217a66 454168b 63cf97c 819b3be 63cf97c ba3e462 09901c4 ba3e462 454168b ba3e462 63cf97c 819b3be 63cf97c 454168b 09901c4 ba3e462 63cf97c 819b3be 63cf97c ba3e462 63cf97c 454168b 34f99d2 454168b 63cf97c 984a13e ba3e462 984a13e adc4997 ba3e462 0e751b5 ba3e462 0e751b5 3845156 0e751b5 ba3e462 3845156 0e751b5 ba3e462 0e751b5 ba3e462 0e751b5 454168b 63cf97c ba3e462 63cf97c 454168b 63cf97c 4c06e3c 63cf97c 454168b 63cf97c 454168b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
---
license:
- apache-2.0
- bsd-3-clause
tags:
- summarization
- summary
- booksum
- long-document
- long-form
- tglobal-xl
- XL
datasets:
- kmfoda/booksum
metrics:
- rouge
inference: false
model-index:
- name: pszemraj/long-t5-tglobal-xl-16384-book-summary
results:
- task:
type: summarization
name: Summarization
dataset:
name: multi_news
type: multi_news
config: default
split: test
metrics:
- type: rouge
value: 36.2043
name: ROUGE-1
verified: true
verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiYzRmMmUyOTVjMmJmZTRiZDcyYzY3MTQ1MmUyNDA5NjVhYzEzYzBiNzcxYTRhMDQ3OTlhMGZjYmJlNDM1M2NjYyIsInZlcnNpb24iOjF9._uArOQ1_0znXDPXMq7unA1OHB-XbgqzzKRbFRcVUzTUJdWk26LiSa2pEEVNNmJPg6Uo7CAvONmhpEswLvl9TAg
- type: rouge
value: 8.424
name: ROUGE-2
verified: true
verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiNzg0MzljYjVjYWQ3MmRkZDBlOGI5M2RiMGU0M2UwZGUzMDg2NTU0NjcwMTNiN2ZmODEzNTQ0MmEwNDA3NDA5MSIsInZlcnNpb24iOjF9.Dzj85ld6TjosQ8KyUdoadzicMLedEFrICC6Q-08O3qx28d9B9Uke1zw-VWabiuesPEDTRGbWuBgPA5vxYWUZAw
- type: rouge
value: 17.3721
name: ROUGE-L
verified: true
verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiNDA3ZjZmODAwMTNlM2RlZmJlMDI5MGVkMGRkMTBjMTYzNDk5ZjFiNTY5MWE1MDUwNWI2MDE4ZDA2YWMwMmI2NCIsInZlcnNpb24iOjF9.MOV_nId0XAK1eMQssG5GN9DsitZaTrxl4jdCJnOg9EZ0-vAw227ln599YV5YfZ1OPJnWwek6rneqqyONiHn9AQ
- type: rouge
value: 32.3994
name: ROUGE-LSUM
verified: true
verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiZmY3MDMwOTZjNWI0YTk1MDgwMzJkYTFiN2U5YWU0Mzc0MWRiMzc1NzZlMDhjMWUwMmY2ODI2MjI5ODBkYWUxOSIsInZlcnNpb24iOjF9._BwGIZbcA4pUBkEAL0cW-JPPta0KSoGug4Z7vogHacUz-AEhIOI5ICUldZh0pt9OK67MpUSzpShJOu3rSt5YDQ
- type: loss
value: 2.0843334197998047
name: loss
verified: true
verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiOWFhMmE5ZjA3ODM4YmVjMDMyMjk5YjNlMjA1MGMzOWY0NTRlYzk1YjZiMzQxMDMxOTMwMjFkNTdmNjM1NDcyMyIsInZlcnNpb24iOjF9.3wbXV4CIIgnfXAnnRztdOR12PwsWsEfiglQQ09K-C1EgW4gai4x9l-wTE2OZ7CTWkuk_tr4tL_uqOCXLZRMtCQ
- type: gen_len
value: 248.3572
name: gen_len
verified: true
verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiMWZhOGMwMDJjNGU2MzA2YzI1OWU1ZDY5N2NjZmM1YTA5NDg1MzUwNmU1YTBhNjQyNWYwYzA3OGNmODFjMmE2NSIsInZlcnNpb24iOjF9.Rc9u89zCdbFnjsnmq65l_JvCtUwOX_ZWapKJpTZ-rC8HxcUVfi2Ash2QfvvvxHH_YWhwklxxdnNa0HCm46qLAA
- task:
type: summarization
name: Summarization
dataset:
name: billsum
type: billsum
config: default
split: test
metrics:
- name: ROUGE-1
type: rouge
value: 41.3645
verified: true
- name: ROUGE-2
type: rouge
value: 16.144
verified: true
- name: ROUGE-L
type: rouge
value: 24.2981
verified: true
- name: ROUGE-LSUM
type: rouge
value: 35.3234
verified: true
- name: loss
type: loss
value: 1.282260775566101
verified: true
- name: gen_len
type: gen_len
value: 291.8158
verified: true
- task:
type: summarization
name: Summarization
dataset:
name: ccdv/arxiv-summarization
type: ccdv/arxiv-summarization
config: document
split: test
metrics:
- name: ROUGE-1
type: rouge
value: 36.3225
verified: true
- name: ROUGE-2
type: rouge
value: 9.3743
verified: true
- name: ROUGE-L
type: rouge
value: 19.8396
verified: true
- name: ROUGE-LSUM
type: rouge
value: 32.2532
verified: true
- name: loss
type: loss
value: 2.146871566772461
verified: true
- name: gen_len
type: gen_len
value: 186.2966
verified: true
---
# long-t5-tglobal-xl + BookSum
<a href="https://colab.research.google.com/gist/pszemraj/c19e32baf876deb866c31cd46c86e893/long-t5-xl-accelerate-test.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
Summarize long text and get a SparkNotes-like summary of any topic!
- Generalizes reasonably well to academic & narrative text.
- This is the XL checkpoint, which **produces even better summaries [from a human evaluation perspective](https://long-t5-xl-book-summary-examples.netlify.app/)**.
A simple example/use case with [the base model](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) on ASR is [here](https://longt5-booksum-example.netlify.app/).
## Cheeky Proof-of-Concept
A summary of the [infamous navy seals copypasta](https://knowyourmeme.com/memes/navy-seal-copypasta):
> In this chapter, the monster explains how he intends to exact revenge on "the little b\*\*\*\*" who insulted him. He tells the kiddo that he is a highly trained and experienced killer who will use his arsenal of weapons--including his access to the internet--to exact justice on the little brat.
While this is a crude example, try running this copypasta through other summarization models to see the difference in comprehension (_even though it's not even a "long" text!_).
* * *
**Contents**
<!-- TOC -->
- [Description](#description)
- [How-To in Python](#how-to-in-python)
- [Beyond the basics](#beyond-the-basics)
- [Adjusting parameters](#adjusting-parameters)
- [LLM.int8 Quantization](#llmint8-quantization)
- [About](#about)
- [Intended uses & limitations](#intended-uses--limitations)
- [Training and evaluation data](#training-and-evaluation-data)
- [Eval results](#eval-results)
- [FAQ](#faq)
- [How can I run inference with this on CPU?](#how-can-i-run-inference-with-this-on-cpu)
- [How to run inference over a very long (30k+ tokens) document in batches?](#how-to-run-inference-over-a-very-long-30k-tokens-document-in-batches)
- [How to fine-tune further?](#how-to-fine-tune-further)
- [Are there simpler ways to run this?](#are-there-simpler-ways-to-run-this)
- [Training procedure](#training-procedure)
- [Updates](#updates)
- [Training hyperparameters](#training-hyperparameters)
- [Framework versions](#framework-versions)
<!-- /TOC -->
* * *
## Description
A fine-tuned version of [google/long-t5-tglobal-xl](https://huggingface.co/google/long-t5-tglobal-xl) on the `kmfoda/booksum` dataset.
Read the paper by Guo et al. here: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/pdf/2112.07916.pdf)
## How-To in Python
Install or update transformers: `pip install -U transformers`
Then summarize text with a pipeline:
```python
import torch
from transformers import pipeline
summarizer = pipeline(
"summarization",
"pszemraj/long-t5-tglobal-xl-16384-book-summary",
device=0 if torch.cuda.is_available() else -1,
)
long_text = "Here is a lot of text I don't want to read. Replace me"
result = summarizer(long_text)
print(result[0]["summary_text"])
```
### Beyond the basics
There are two additional points to consider beyond simple inference: adjusting decoding parameters for improved performance, and quantization for reduced memory consumption.
#### Adjusting parameters
Pass [other parameters related to beam search textgen](https://huggingface.co/blog/how-to-generate) when calling `summarizer` to get even higher quality results.
#### LLM.int8 Quantization
> alternative section title: how to get this monster to run inference on free colab runtimes
Via [this PR](https://github.com/huggingface/transformers/pull/20341) LLM.int8 is now supported for `long-t5` models.
- per **initial tests** the summarization quality seems to hold while using _significantly_ less memory! \*
- a version of this model quantized to int8 is [already on the hub here](https://huggingface.co/pszemraj/long-t5-tglobal-xl-16384-book-summary-8bit), so if you're using the 8-bit version anyway, you can start there for a download of only 3.5 GB!
First, make sure you have the latest versions of the relevant packages:
```bash
pip install -U transformers bitsandbytes accelerate
```
load in 8-bit (_magic completed by `bitsandbytes` behind the scenes_)
```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained(
"pszemraj/long-t5-tglobal-xl-16384-book-summary"
)
model = AutoModelForSeq2SeqLM.from_pretrained(
"pszemraj/long-t5-tglobal-xl-16384-book-summary",
load_in_8bit=True,
device_map="auto",
)
```
The above is already present in the Colab demo linked at the top of the model card.
\* More rigorous metrics-based research comparing beam-search summarization with and without LLM.int8 will take place over time.
* * *
## About
### Intended uses & limitations
While this model seems to improve factual consistency, **don't take summaries as foolproof and check things that seem odd**.
Specifically: negation statements (i.e., the model says: _this thing does not have [ATTRIBUTE]_, when instead it should have said _this thing has lots of [ATTRIBUTE]_).
- I'm sure someone will write a paper on this eventually (if there isn't one already), but you can usually check this by comparing a particular statement with what the surrounding sentences imply.
### Training and evaluation data
`kmfoda/booksum` dataset on HuggingFace - read [the original paper here](https://arxiv.org/abs/2105.08209).
- For **initial fine-tuning**, only input text with 12288 input tokens or less and 1024 output tokens or less was used (_i.e. lines longer than that were dropped before training_) for memory reasons. After a quick analysis, examples with inputs in the 12288-16384 token range are a **small** minority of this dataset.
- In addition, this initial training combined the training and validation sets and trained on them in aggregate to increase the functional dataset size. **Therefore, take the validation set results with a grain of salt; primary metrics should (always) be the test set.**
- The **final stages of fine-tuning** used the standard 16384 input/1024 output conventions, preserving the standard in/out lengths (_and truncating longer sequences_). This did not seem to change the loss/performance much.
### Eval results
Official results with the [model evaluator](https://huggingface.co/spaces/autoevaluate/model-evaluator) will be computed and posted here.
**Please read the note above: because of the training method, performance on the validation set looks better than performance on the test set will be.** The model achieves the following results on the evaluation set:
- eval_loss: 1.2756
- eval_rouge1: 41.8013
- eval_rouge2: 12.0895
- eval_rougeL: 21.6007
- eval_rougeLsum: 39.5382
- eval_gen_len: 387.2945
- eval_runtime: 13908.4995
- eval_samples_per_second: 0.107
- eval_steps_per_second: 0.027
***** predict/test metrics (initial) *****
predict_gen_len = 506.4368
predict_loss = 2.028
predict_rouge1 = 36.8815
predict_rouge2 = 8.0625
predict_rougeL = 17.6161
predict_rougeLsum = 34.9068
predict_runtime = 2:04:14.37
predict_samples = 1431
predict_samples_per_second = 0.192
predict_steps_per_second = 0.048
\* Evaluating a big model is not as easy as it seems; further investigation is ongoing.
* * *
## FAQ
### How can I run inference with this on CPU?
lol
### How to run inference over a very long (30k+ tokens) document in batches?
See `summarize.py` in [the code for my hf space Document Summarization](https://huggingface.co/spaces/pszemraj/document-summarization/blob/main/summarize.py) :)
You can also use the same code to split a document into batches of 4096, etc., and iterate over them with the model. This is useful in situations where CUDA memory is limited.
**Update:** see the section on the `textsum` package below.
### How to fine-tune further?
See [train with a script](https://huggingface.co/docs/transformers/run_scripts) and [the summarization scripts](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization)
### Are there simpler ways to run this?
Yes. To make this easier, I created a Python package utility called [textsum](https://github.com/pszemraj/textsum); you can use it to load models and summarize things in a few lines of code.
```sh
pip install textsum
```
Use `textsum` in python with this model:
```python
from textsum.summarize import Summarizer
summarizer = Summarizer(
model_name_or_path="pszemraj/long-t5-tglobal-xl-16384-book-summary"
)
long_string = "This is a long string of text that will be summarized."
out_str = summarizer.summarize_string(long_string)
print(f"summary: {out_str}")
```
This package provides easy-to-use interfaces for applying summarization models to text documents of arbitrary length. Currently implemented interfaces include a Python API, a CLI, and a shareable demo application.
For details, explanations, and documentation, see the README (_linked above_) or the [wiki](https://github.com/pszemraj/textsum/wiki).
* * *
## Training procedure
### Updates
Updates to this model/model card will be posted here when relevant. The model seems to be fairly converged; if updates/improvements are possible using the `BookSum` dataset, this repo will be updated.
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 0.0006
- train_batch_size: 1
- eval_batch_size: 1
- seed: 10350
- distributed_type: multi-GPU
- num_devices: 4
- gradient_accumulation_steps: 32
- total_train_batch_size: 128
- total_eval_batch_size: 4
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: constant
- num_epochs: 1.0
\*_Prior training sessions used roughly similar parameters (learning rates were higher); multiple sessions were required as this takes eons to train._
### Framework versions
- Transformers 4.25.0.dev0
- Pytorch 1.13.0+cu117
- Datasets 2.6.1
- Tokenizers 0.13.1
* * *
|