import transformers
import datasets
import torch
import sentencepiece
import evaluate
from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import re

# Load dataset
ds = load_dataset("scillm/scientific_papers-archive", split="test")

# Select the first 1000 examples
small_ds = ds.select(range(1000))

# Preprocessing function to remove unwanted references
def preprocess_text(text):
    # Remove unwanted references like @xcite
    text = re.sub(r'@\w+', '', text)  # Remove anything that starts with @
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Preprocessing function
def preprocess(examples):
    # Preprocess articles and summaries
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]

    # Add prefix to the articles
    inputs = ["summarize: " + article for article in articles]

    # Tokenize articles
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    # Tokenize summaries
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Load mT5 model and tokenizer
model_name = "google/mt5-small"  # You can also use other mT5 models
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the smaller dataset
tokenized_small_ds = small_ds.map(preprocess, batched=True)

# Verify that the dataset is correctly tokenized
print(tokenized_small_ds[0])

# Split the small dataset into train and test sets
small_ds = small_ds.train_test_split(test_size=0.2)
print(small_ds["train"][0])
print(small_ds["train"].features)
print(small_ds.column_names)

# Apply preprocessing function to the split dataset
tokenized_ds = small_ds.map(preprocess, batched=True)

from transformers import DataCollatorForSeq2Seq

# The collator needs the model itself (not the model name string) so it can
# prepare decoder inputs
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

torch.cuda.empty_cache()

# pip install wandb
import os
import wandb

api_key = os.getenv("API_KEY")

# Authenticate with WandB
wandb.login(key=api_key)
# print(os.getenv('API_KEY'))
# os.environ["API_KEY"]

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Set the device
device = torch.device("cpu")
model.to(device)

# Ensure model parameters are contiguous
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()  # Make the tensor contiguous
        print(f"Made {name} contiguous.")

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    predict_with_generate=True
)

# Create trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_small_ds.shuffle().select(range(80)),      # Use 80 examples for training
    eval_dataset=tokenized_small_ds.shuffle().select(range(20, 100)),  # Use 80 examples for evaluation
    data_collator=data_collator,
)

# Train the model
trainer.train()

# pip install rouge_score
import evaluate

rouge = evaluate.load("rouge")
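# Illustrative sanity check (not part of the original script): with the
# `evaluate` library, rouge.compute() returns a dict keyed by "rouge1",
# "rouge2", "rougeL" (and "rougeLsum"), which is the shape compute_metrics
# below relies on. The toy strings here are made-up examples.
example_scores = rouge.compute(
    predictions=["the model summarizes the paper"],
    references=["the model produces a summary of the paper"],
)
print(example_scores)  # e.g. {'rouge1': ..., 'rouge2': ..., 'rougeL': ..., 'rougeLsum': ...}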
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels (remove special tokens)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in labels (ignore index) with the padding token id
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }

# Update trainer to include custom metrics
trainer.compute_metrics = compute_metrics

# Evaluate the model
eval_result = trainer.evaluate()
print(eval_result)

# Save the fine-tuned model
trainer.save_model("fine-tuned-mt5")
tokenizer.save_pretrained("fine-tuned-mt5")

# Load required libraries
from transformers import T5Tokenizer, MT5ForConditionalGeneration

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)
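# Optional smoke test (illustrative sketch, not from the original script):
# run one short, made-up input through the reloaded model with a plain
# generate() call before wiring it into a pipeline.
sample = "summarize: Psoriasis is an autoimmune condition that causes skin inflammation."
sample_ids = new_tokenizer(sample, return_tensors="pt").input_ids
generated = new_model.generate(sample_ids, max_length=60, num_beams=4)
print(new_tokenizer.decode(generated[0], skip_special_tokens=True))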
from transformers import pipeline
import torch

# Restructured input
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)

# Define the device (GPU or CPU)
device = 0 if torch.cuda.is_available() else -1

# Load the pipeline
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

# Summarize the text
summary = summarizer(
    text,
    max_length=120,
    min_length=30,
    do_sample=False,
    num_beams=10,
    repetition_penalty=5.0,
    no_repeat_ngram_size=2,
    length_penalty=1.0,
)[0]["summary_text"]

# Clean the summary by removing leftover sentinel tokens
import re

# Regular expression to match both <extra_id_N> and <id_N> tokens
pattern = r"<(extra_id_\d+|id_\d+)>"

# Replace all matches with a space
cleaned_summary = re.sub(pattern, " ", summary).strip()
print(cleaned_summary)

import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    text = ""
    # Open the PDF file
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text()  # Extract text from each page
    return text

# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    # Extract text from the PDF
    input_text = extract_text_from_pdf(pdf_file)

    # Tokenize the input to check length
    tokenized_input = new_tokenizer.encode(input_text, return_tensors='pt')

    try:
        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=max_summary_length,
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Clean up the summary to remove unwanted tokens
        cleaned_summary = ' '.join([token for token in summary.split() if not token.startswith('