import transformers
import datasets
import torch
import sentencepiece
import evaluate
from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import re
# Load dataset
ds = load_dataset("scillm/scientific_papers-archive", split="test")
# Select the first 1000 examples
small_ds = ds.select(range(1000))
# Preprocessing function to remove unwanted references
def preprocess_text(text):
    # Remove unwanted references like @xcite
    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text
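# Quick sanity check of preprocess_text (illustrative example, not from the
# original notebook): "@xcite" citation markers are stripped and whitespace
# is collapsed. Expected output: "the model was proposed in ref."
print(preprocess_text("the model  was proposed @xcite in ref."))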
# Preprocessing function
def preprocess(examples):
    # Preprocess articles and summaries
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]
    # Add prefix to the articles
    inputs = ["summarize: " + article for article in articles]
    # Tokenize articles
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    # Tokenize summaries
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Load mT5 model and tokenizer
model_name = "google/mt5-small"  # You can also use other mT5 models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Tokenize the smaller dataset
tokenized_small_ds = small_ds.map(preprocess, batched=True)
# Verify that the dataset is correctly tokenized
print(tokenized_small_ds[0])
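# The printed example should contain input_ids, attention_mask and labels
# alongside the dataset's original columns.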
# Split the smaller dataset into train and test sets
# (splitting small_ds, not the full ds, so the 1000-example subset is used)
small_ds = small_ds.train_test_split(test_size=0.2)
small_ds["train"][0]
print(small_ds['train'].features)
print(small_ds.column_names)
from transformers import T5Tokenizer
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Apply preprocessing function to dataset
tokenized_ds = small_ds.map(preprocess, batched=True)
from transformers import DataCollatorForSeq2Seq
# Pass the model instance (not the name string) so the collator can prepare decoder input ids
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
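# Note: DataCollatorForSeq2Seq pads each batch dynamically and pads labels with
# -100 (its default label_pad_token_id), so padded label positions are ignored
# by the loss.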
import torch
torch.cuda.empty_cache()
#nvidia-smi
#pip install wandb
import os
import wandb
api_key = os.getenv("API_KEY")
# Authenticate with WandB
wandb.login(key=api_key)
#print(os.getenv('API_KEY'))
#os.environ["API_KEY"]
from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
# Load the model
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Set the device
device = torch.device("cpu")
model.to(device)
# Ensure model parameters are contiguous
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()  # Make the tensor contiguous
        print(f"Made {name} contiguous.")
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Reduce the batch size
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)
# Create trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_small_ds.shuffle().select(range(80)),  # Use 80 examples for training
    eval_dataset=tokenized_small_ds.shuffle().select(range(20, 100)),  # Use 80 examples for evaluation
    data_collator=data_collator,  # Use the seq2seq collator defined above
)
# Train the model
trainer.train()
#pip install rouge_score
import evaluate
rouge = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode predictions and labels (remove special tokens)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in labels (ignore index) with the padding token id
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }
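# Illustrative standalone check of the ROUGE metric (example values, not from
# the original run): identical prediction and reference strings score 1.0.
example_scores = rouge.compute(predictions=["the cat sat on the mat"],
                               references=["the cat sat on the mat"])
print(example_scores)  # e.g. {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}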
# Update trainer to include custom metrics
trainer.compute_metrics = compute_metrics
# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)
# Save the fine-tuned model
trainer.save_model("fine-tuned-mt5")
tokenizer.save_pretrained("fine-tuned-mt5")
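# Both calls write to the "fine-tuned-mt5" directory, so the model and
# tokenizer can be reloaded later with from_pretrained("fine-tuned-mt5"),
# as done below.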
# Load required libraries
from transformers import T5Tokenizer, MT5ForConditionalGeneration
# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
from transformers import pipeline
import torch
# Restructured input text
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)
# Set the device (GPU if available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1
# Load the summarization pipeline
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)
# Summarize the text
summary = summarizer(text,
                     max_length=120,
                     min_length=30,
                     do_sample=False,
                     num_beams=10,
                     repetition_penalty=5.0,
                     no_repeat_ngram_size=2,
                     length_penalty=1.0)[0]["summary_text"]
# Clean the summary by removing leftover sentinel tokens such as <extra_id_0>
import re
# Regular expression to match both <extra_id_X> and <id_XX>
pattern = r"<(extra_id_\d+|id_\d+)>"
# Replace all matches with a space
cleaned_summary = re.sub(pattern, " ", summary).strip()
print(cleaned_summary)
# Niina's code
#pip install gradio PyMuPDF
import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF
# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    text = ""
    # Open the PDF file
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text()  # Extract text from each page
    return text
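# Example usage (the filename "paper.pdf" is hypothetical; assumes such a file
# exists locally):
# print(extract_text_from_pdf("paper.pdf")[:500])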
# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    # Extract text from the PDF
    input_text = extract_text_from_pdf(pdf_file)
    # Tokenize the input, truncating to the same 1024-token limit used during fine-tuning
    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt', truncation=True, max_length=1024)
    try:
        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_summary_length,
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )
        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        # Clean up the summary to remove unwanted tokens
        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('<extra_id_')]).strip()
        # Ensure the summary ends with a complete sentence
        if cleaned_summary:
            last_period_index = cleaned_summary.rfind('.')
            if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
                cleaned_summary = cleaned_summary[:last_period_index + 1]
        else:
            cleaned_summary = cleaned_summary.strip()
        return cleaned_summary if cleaned_summary else "No valid summary generated."
    except Exception as e:
        return str(e)  # Return the error message for debugging
# Define the Gradio interface
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(50, 300, step=10, label="Max summary length")
    ],
    outputs="textbox",  # A textbox for the output summary
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)
# Launch the interface with debug mode enabled
interface.launch(debug=True)