# -*- coding: utf-8 -*-
"""FinalProject_TextClassificationFineTuning.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1fCS36Rnww__14QDdcsjjG5hfFL83gzpU

Summarises legal / terms-of-service text with the pretrained
``nsi319/legal-pegasus`` model, exposes the summariser through a Gradio
interface and extracts key phrases from a summary with RAKE.
"""

import subprocess
import sys


def _pip_install(*args: str) -> None:
    """Run ``pip install`` with the given arguments for the running interpreter.

    Stands in for the notebook's ``!pip install ...`` cell magics so that the
    file is valid Python. Like the magics, a failed install does not raise
    (``check=False``); a missing package surfaces at the import below instead.
    """
    subprocess.run([sys.executable, "-m", "pip", "install", *args], check=False)


_pip_install("opendatasets")
_pip_install("gradio", "--quiet")
# The extra is spelled "sentencepiece"; the original "sentencepeice" typo does
# not match any extra, so the tokenizer dependency was not requested.
_pip_install(
    "transformers[sentencepiece]",
    "datasets",
    "sacrebleu",
    "rouge_score",
    "py7zr",
    "-q",
)
_pip_install("rake-nltk")  # used to determine the key phrases in the text
_pip_install("kaggle")

# TODO: once done, delete all the models that are not needed.

# Imports follow the installs on purpose: the packages may not exist before.
import gradio as gr
import matplotlib.pyplot as plt
import nltk
import numpy as np
import opendatasets as od
import pandas as pd
import torch
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from rake_nltk import Rake
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

try:
    from datasets import load_metric
except ImportError:
    # `load_metric` was deprecated in favour of the `evaluate` package and is
    # missing from newer `datasets` releases. It is not used below, so keep
    # the name defined instead of crashing at import time.
    load_metric = None

nltk.download("punkt")
nltk.download("stopwords")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Pretrained model: PEGASUS, retrieved from
# https://huggingface.co/nsi319/legal-pegasus
MODEL_NAME = "nsi319/legal-pegasus"

# Longest input (in tokens) passed to the model; longer text is truncated.
MAX_INPUT_TOKENS = 1024

# Beam-search settings shared by the sample run and the Gradio callback.
GENERATION_KWARGS = {
    "num_beams": 9,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "min_length": 150,
    "max_length": 250,
    "early_stopping": True,
}

# Load the tokenizer and model once and reuse them everywhere below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# NOTE(review): the sample text is blank in this export; the sample output
# quoted below was presumably produced from the notebook's original input.
text = """ """

input_tokenized = tokenizer.encode(
    text,
    return_tensors='pt',
    max_length=MAX_INPUT_TOKENS,
    truncation=True,
).to(model.device)
summary_ids = model.generate(input_tokenized, **GENERATION_KWARGS)
summary = [
    tokenizer.decode(
        g,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    for g in summary_ids
][0]

### Summary Output
# The Securities and Exchange Commission today charged AT&T, Inc. and three of
# its Investor Relations executives with aiding and abetting the company's
# violations of the antifraud provisions of Section 10(b) of the Securities
# Exchange Act of 1934 and Rule 10b-5 thereunder. According to the SEC's
# complaint, the company learned in March 2016 that a steeper-than-expected
# decline in its first quarter smartphone sales would cause its revenue to
# fall short of analysts' estimates for the quarter. The complaint alleges
# that to avoid falling short of the consensus revenue estimate for the third
# consecutive quarter, the executives made private, one-on-one phone calls to
# analysts at approximately 20 separate firms. On these calls, the SEC alleges
# that Christopher Womack, Michael Black, and Kent Evans allegedly disclosed
# internal smartphone sales data and the impact of that data on internal
# revenue metrics. The SEC further alleges that as a result of what they were
# told, the analysts substantially reduced their revenue forecasts, leading to
# the overall consensus Revenue Estimate falling to just below the level that
# AT&t ultimately reported to the public on April 26, 2016. The SEC is seeking
# permanent injunctive relief and civil monetary penalties against each
# defendant.
print(summary)

# Here we load the ToS dataset for additional fine-tuning. This step is
# optional and doing so only improves our model.
# The only issue with this is that it requires a GPU; the runtime disconnects
# and crashes since I don't have access to the GPU or compute power it needs.

# Loading the dataset
# Kaggle dataset URL
DATASET_URL = 'https://www.kaggle.com/datasets/simple11/tos-summaries'
# Use opendatasets to download the dataset.
od.download(DATASET_URL)
# NOTE(review): absolute Colab path; assumes the download above ran with
# /content as the working directory - confirm when running elsewhere.
dataset = pd.read_json('/content/tos-summaries/dataset.json', lines=True)
print(dataset)
#print(dataset.head(6))

print(f"Summary: \n{summary}")


# Summarization Gradio app for my program
def summarize_text(text: str) -> str:
    """Summarise *text* with the legal-pegasus model.

    Args:
        text: Document to summarise. Input beyond ``MAX_INPUT_TOKENS`` tokens
            is truncated before generation.

    Returns:
        The decoded summary, or an empty string when *text* is empty or
        contains only whitespace.
    """
    # With min_length=150 the model would emit a long made-up summary for an
    # empty prompt, so answer blank input directly.
    if not text or not text.strip():
        return ""

    inputs = tokenizer.encode(
        text,
        return_tensors='pt',
        max_length=MAX_INPUT_TOKENS,
        truncation=True,
    ).to(model.device)
    # Generate from the caller's text. The original passed the module-level
    # `input_tokenized` here, so every request summarised the sample text.
    generated_ids = model.generate(inputs, **GENERATION_KWARGS)
    # Decode and return the summary.
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


interface = gr.Interface(
    fn=summarize_text,
    inputs=gr.Textbox(
        lines=10,
        placeholder='Enter Text Here...',
        label='Input text',
    ),
    outputs=gr.Textbox(label='Summarized Text'),
    title='Terms and Conditions Text Summarizer',
)

########################################################################

# Uses stopwords for English from NLTK, and all punctuation characters by
# default.
r = Rake()

# Extraction given the text.
r.extract_keywords_from_text(summary)

# Keyword phrases ranked from highest to lowest.
print(r.get_ranked_phrases())

# Keyword phrases ranked highest to lowest, with scores.
print(r.get_ranked_phrases_with_scores())

# Launch last: outside a notebook `launch()` blocks the main thread while the
# server runs, so nothing placed after it would execute until it stops.
interface.launch()