# Summarizer / app.py
# Author: eevaw — "Update app.py", commit 7dde707 (verified), 9.18 kB
# Fine-tunes google/mt5-small for scientific-paper summarization and serves
# a PDF summarization UI via Gradio.
# Standard library
import re

# Third-party
import numpy as np
import torch
import datasets
import evaluate
import sentencepiece
import transformers
from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
# Load dataset: the test split of a scientific-papers archive from the HF Hub.
# NOTE: network side effect at import time.
ds = load_dataset("scillm/scientific_papers-archive", split="test")
# Select the first 1000 examples to keep experimentation fast.
small_ds = ds.select(range(1000))
# Text cleanup applied to both articles and reference summaries.
def preprocess_text(text):
    """Strip citation markers like '@xcite' and collapse runs of whitespace.

    Returns the cleaned string with leading/trailing whitespace removed.
    """
    without_refs = re.sub(r'@\w+', '', text)  # drop anything starting with @
    return re.sub(r'\s+', ' ', without_refs).strip()
# Batched preprocessing function for `Dataset.map`.
def preprocess(examples):
    """Clean, prefix, and tokenize a batch of article/summary pairs.

    Expects the batch dict to carry "input" (articles) and "output"
    (reference summaries). Relies on the module-level `tokenizer` and
    `preprocess_text`. Returns model inputs with a "labels" key suitable
    for seq2seq training.
    """
    # Clean citation markers and whitespace on both sides of the pair.
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]
    # T5-style task prefix.
    inputs = ["summarize: " + article for article in articles]
    # Tokenize articles.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    # Tokenize summaries.
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
    # BUG FIX: with padding="max_length" the label ids are mostly pad tokens,
    # and nothing downstream replaces them — so the loss was trained to emit
    # <pad> for most of every 128-token target. Mask pad positions with -100
    # (the ignore index of the cross-entropy loss).
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs
# Load mT5 model and tokenizer (other mT5 sizes would also work here).
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Tokenize the 1000-example subset selected above.
tokenized_small_ds = small_ds.map(preprocess, batched=True)
# Verify that the dataset is correctly tokenized.
print(tokenized_small_ds[0])
# Split the data into train and test set.
# NOTE(review): this rebinds `small_ds` to an 80/20 split of the FULL `ds`,
# not of the 1000-example subset — the subset chosen above is discarded.
# Looks unintentional; confirm which dataset the split was meant to use.
small_ds = ds.train_test_split(test_size=0.2)
# Inspect one training example and the dataset schema.
small_ds["train"][0]
print(small_ds['train'].features)
print(small_ds.column_names)
# Re-load the tokenizer (redundant with the load above; kept for parity with
# the original notebook-cell flow).
from transformers import T5Tokenizer
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Apply the preprocessing function to the split dataset.
tokenized_ds = small_ds.map(preprocess, batched=True)

from transformers import DataCollatorForSeq2Seq

# BUG FIX: `model=` expects the model *instance*, not its name string — the
# collator calls model.prepare_decoder_input_ids_from_labels(), which a str
# does not have. `model` was loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

import torch
# Release any cached GPU memory before training.
torch.cuda.empty_cache()

# (Install with: pip install wandb)
import os
import wandb

# Read the Weights & Biases key from the environment (HF Space secret).
api_key = os.getenv("API_KEY")
# Authenticate with WandB.
wandb.login(key=api_key)
from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Load a fresh copy of the base model for fine-tuning.
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Train on CPU (no GPU allocated for this Space).
device = torch.device("cpu")
model.to(device)

# Safetensors serialization requires contiguous tensors; some mT5 parameters
# are not contiguous after loading.
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()  # make the tensor contiguous
        print(f"Made {name} contiguous.")

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,  # small batch size to fit in memory
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)

# Create the trainer instance.
# BUG FIX: the original drew train (range(80)) and eval (range(20, 100))
# from two *independent* shuffles of the same dataset, so evaluation
# randomly leaked training examples. Shuffle once with a fixed seed and use
# disjoint index ranges. Also pass the seq2seq data collator (built above)
# so batches get proper decoder_input_ids instead of the default collation.
shuffled_ds = tokenized_small_ds.shuffle(seed=42)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_ds.select(range(80)),      # 80 training examples
    eval_dataset=shuffled_ds.select(range(80, 100)),  # 20 held-out examples
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Train the model.
trainer.train()
# (Install with: pip install rouge_score)
import evaluate

# ROUGE metric for summarization quality.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L between generated and reference summaries.

    `eval_pred` is (predictions, labels) as numpy arrays of token ids;
    label positions to ignore are encoded as -100. Returns a dict of
    the three ROUGE scores.
    """
    predictions, labels = eval_pred
    # With predict_with_generate, predictions may arrive as a tuple whose
    # first element holds the generated token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Decode predictions (strip special tokens).
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace the -100 ignore index with the pad token id before decoding.
    # BUG FIX: done functionally with np.where instead of mutating the
    # caller's `labels` array in place.
    labels = np.where(np.asarray(labels) != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE scores using the `evaluate` library.
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }

# Attach the custom metrics to the already-constructed trainer.
trainer.compute_metrics = compute_metrics
# Evaluate the model (uses the compute_metrics attached above).
eval_result = trainer.evaluate()
print(eval_result)
# Save the fine-tuned model and its tokenizer into the same directory so
# they can be re-loaded together with from_pretrained("fine-tuned-mt5").
trainer.save_model("fine-tuned-mt5")
tokenizer.save_pretrained("fine-tuned-mt5")
# Load required libraries.
from transformers import T5Tokenizer, MT5ForConditionalGeneration

# Load the fine-tuned tokenizer and model from the directory saved above.
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

from transformers import pipeline
import torch

# Sample input used as a smoke test of the fine-tuned model: a restructured
# prompt about psoriasis.
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)

# Pick the device (GPU index 0 if available, otherwise CPU); pipeline()
# expects an int here, not a torch.device.
device = 0 if torch.cuda.is_available() else -1

# Build the summarization pipeline from the fine-tuned model.
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

# Summarize the text with beam search and repetition penalties.
summary = summarizer(text,
                     max_length=120,
                     min_length=30,
                     do_sample=False,
                     num_beams=10,
                     repetition_penalty=5.0,
                     no_repeat_ngram_size=2,
                     length_penalty=1.0)[0]["summary_text"]

# Clean the summary by removing sentinel tokens such as <extra_id_0>.
import re

# Regular expression to match both <extra_id_X> and <id_XX>.
pattern = r"<(extra_id_\d+|id_\d+)>"
# Replace all matches with a space, then trim the ends.
cleaned_summary = re.sub(pattern, " ", summary).strip()
print(cleaned_summary)
# Niina's code.
# (Install with: pip install gradio PyMuPDF)
import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model (again — already loaded above;
# kept for parity with the original notebook cells).
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Extract the raw text of a PDF with PyMuPDF.
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in `pdf_file`."""
    with fitz.open(pdf_file) as doc:
        page_texts = [page.get_text() for page in doc]
    return "".join(page_texts)
# Summarization function backing the Gradio UI.
def summarize_pdf(pdf_file, max_summary_length):
    """Summarize the text of an uploaded PDF.

    Args:
        pdf_file: path of the uploaded PDF (as provided by gr.File).
        max_summary_length: maximum summary length in tokens (slider value).

    Returns:
        The cleaned summary string, a fallback message when nothing valid
        is produced, or the exception text on failure (deliberate
        best-effort so errors surface in the UI instead of crashing it).
    """
    # Extract text from the PDF.
    input_text = extract_text_from_pdf(pdf_file)
    # BUG FIX: the original tokenized with no length limit, so long PDFs
    # produced inputs far beyond the 1024-token budget the model was
    # fine-tuned with; truncate to that same budget.
    tokenized_input = new_tokenizer.encode(
        input_text, return_tensors='pt', max_length=1024, truncation=True
    )
    try:
        # Generate the summary with beam search and repetition penalties.
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_summary_length,
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )
        # Decode the generated summary.
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        # Drop stray sentinel tokens (<extra_id_N>) the model sometimes emits.
        cleaned_summary = ' '.join(
            token for token in summary.split() if not token.startswith('<extra_id_')
        ).strip()
        # Trim a dangling partial sentence after the last period, if any.
        if cleaned_summary:
            last_period_index = cleaned_summary.rfind('.')
            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
                cleaned_summary = cleaned_summary[:last_period_index + 1]
        return cleaned_summary if cleaned_summary else "No valid summary generated."
    except Exception as e:
        return str(e)  # return the error message for debugging
# Build the Gradio UI: a PDF upload widget plus a slider controlling the
# maximum summary length, wired to summarize_pdf.
pdf_input = gr.File(label="Upload PDF")
length_slider = gr.Slider(50, 300, step=10, label="Max summary length")

interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[pdf_input, length_slider],
    outputs="textbox",  # plain textbox for the generated summary
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)

# Launch the interface with debug mode enabled.
interface.launch(debug=True)