detectROSE commited on
Commit
55750e2
·
verified ·
1 Parent(s): 122fd99

first_push

Browse files
Files changed (1) hide show
  1. app.py +131 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """FinalProject_TextClassificationFineTuning.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1fCS36Rnww__14QDdcsjjG5hfFL83gzpU
8
+ """
9
+
10
+ !pip install opendatasets
11
+ !pip install gradio --quiet
12
+ !pip install transformers[sentencepeice] datasets sacrebleu rouge_score py7zr -q
13
+ !!pip install rake-nltk # used to determine the key phrases in the text
14
+ ! pip install kaggle
15
+
16
+ # When done, delete all the models that are not needed
17
+ import opendatasets as od
18
+ import numpy as np
19
+ import pandas as pd
20
+ import matplotlib.pyplot as plt
21
+ from sklearn.model_selection import train_test_split
22
+ import opendatasets as od
23
+ import gradio as gr
24
+ from transformers import pipeline
25
+ import matplotlib.pyplot as plt
26
+ from datasets import load_dataset, load_metric
27
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
28
+
29
+ import nltk
30
+ from nltk.tokenize import sent_tokenize
31
+
32
+ nltk.download("punkt")
33
+
34
+ from datasets import load_dataset
35
+ from transformers import pipeline
36
+
37
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
38
+ import torch
39
+
40
+ device = "cuda" if torch.cuda.is_available() else "cpu"
41
+ device
42
+
43
+ # The pretrained model is the legal PEGASUS model, retrieved from https://huggingface.co/nsi319/legal-pegasus
44
+
45
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
46
+
47
+ tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
48
+ model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
49
+
50
+ text = """ """
51
+
52
+ input_tokenized = tokenizer.encode(text, return_tensors='pt',max_length=1024,truncation=True)
53
+ summary_ids = model.generate(input_tokenized,
54
+ num_beams=9,
55
+ no_repeat_ngram_size=3,
56
+ length_penalty=2.0,
57
+ min_length=150,
58
+ max_length=250,
59
+ early_stopping=True)
60
+ summary = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]
61
+ ### Summary Output
62
+
63
+ # The Securities and Exchange Commission today charged AT&T, Inc. and three of its Investor Relations executives with aiding and abetting the company's violations of the antifraud provisions of Section 10(b) of the Securities Exchange Act of 1934 and Rule 10b-5 thereunder. According to the SEC's complaint, the company learned in March 2016 that a steeper-than-expected decline in its first quarter smartphone sales would cause its revenue to fall short of analysts' estimates for the quarter. The complaint alleges that to avoid falling short of the consensus revenue estimate for the third consecutive quarter, the executives made private, one-on-one phone calls to analysts at approximately 20 separate firms. On these calls, the SEC alleges that Christopher Womack, Michael Black, and Kent Evans allegedly disclosed internal smartphone sales data and the impact of that data on internal revenue metrics. The SEC further alleges that as a result of what they were told, the analysts substantially reduced their revenue forecasts, leading to the overall consensus Revenue Estimate falling to just below the level that AT&t ultimately reported to the public on April 26, 2016. The SEC is seeking permanent injunctive relief and civil monetary penalties against each defendant.
64
+
65
+ summary
66
+
67
+ # Here we load the ToS dataset for additional fine-tuning. This step is optional; doing so only improves our model.
68
+ # The only issue is that this requires a GPU; the runtime disconnects and crashes because I don't have access to the GPU or the compute power it needs.
69
+
70
+ #Loading the Dataset
71
+ # Assign the Kaggle dataset URL to a variable
72
+ dataset = 'https://www.kaggle.com/datasets/simple11/tos-summaries'
73
+ # Use opendatasets to download the dataset
74
+ od.download(dataset)
75
+
76
+ dataset = pd.read_json('/content/tos-summaries/dataset.json', lines = True)
77
+ dataset
78
+
79
+ #print(dataset.head(6))
80
+
81
+ print(f"Summary: \n{summary}")
82
+
83
+ 'summarization Gradio for my program'
84
+
85
def summarize_text(text):
    """Summarize ``text`` with the module-level tokenizer and model.

    Args:
        text: Raw input text to summarize. It is tokenized with truncation
            at ``max_length=1024`` tokens before generation.

    Returns:
        The decoded summary string, or an empty string when ``text`` is
        empty or contains only whitespace.
    """
    # Nothing to summarize: skip generation, which ``min_length=150`` would
    # otherwise force to emit 150 tokens from no content.
    if not text or not text.strip():
        return ""

    inputs = tokenizer.encode(
        text, return_tensors='pt', max_length=1024, truncation=True
    )

    # Generate from this call's ``inputs``. Passing the module-level
    # ``input_tokenized`` here (as the code previously did) summarizes the
    # placeholder text instead of the caller's input.
    summary_ids = model.generate(
        inputs,
        num_beams=9,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        min_length=150,
        max_length=250,
        early_stopping=True,
    )

    # Decode and return the first (only) generated sequence.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
100
+
101
+ model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
102
+ tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
103
+
104
+ interface = gr.Interface(
105
+ fn=summarize_text,
106
+ inputs=gr.Textbox(lines=10, placeholder='Enter Text Here...', label='Input text'),
107
+ outputs=gr.Textbox(label='Summarized Text'),
108
+ title='Terms and Conditions Text Summarizer'
109
+ )
110
+ interface.launch()
111
+
112
+ ########################################################################################################
113
+
114
+ import nltk
115
+ from rake_nltk import Rake
116
+ nltk.download('stopwords')
117
+ nltk.download('punkt')
118
+
119
+ # Uses stopwords for English from NLTK, and all punctuation characters by
120
+ # default
121
+ r = Rake()
122
+
123
+ # Extraction given the text.
124
+ r.extract_keywords_from_text(summary)
125
+
126
+ # Obtain keyword phrases ranked from highest to lowest.
127
+ r.get_ranked_phrases()
128
+
129
+ # To get keyword phrases ranked highest to lowest with scores.
130
+ r.get_ranked_phrases_with_scores()
131
+