nickmuchi committed on
Commit
8499c35
·
1 Parent(s): cdcccbe

Create new file

Browse files
Files changed (1) hide show
  1. functions.py +266 -0
functions.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import base64
import os
import re
import time
from urllib.parse import urlparse

import en_core_web_lg
import nltk
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly_express as px
import streamlit as st
import whisper
from optimum.onnxruntime import ORTModelForSequenceClassification
from pytube import YouTube
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification

nltk.download('punkt')

from nltk import sent_tokenize
17
+
18
@st.experimental_singleton(suppress_st_warning=True)
def load_models():
    """Instantiate every model/pipeline used by this module.

    Returns:
        tuple: ``(asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder)``
        in that order:

        * ``asr_model`` - Whisper "small" speech-to-text model.
        * ``sent_pipe`` - text-classification pipeline over the quantized
          finbert-tone checkpoint.
        * ``sum_pipe`` - summarization pipeline (``facebook/bart-large-cnn``).
        * ``ner_pipe`` - NER pipeline with grouped entities.
        * ``sbert`` - ``all-mpnet-base-v2`` sentence encoder.
        * ``cross_encoder`` - ms-marco MiniLM cross-encoder.
    """
    sentiment_checkpoint = "nickmuchi/quantized-optimum-finbert-tone"
    ner_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
    summary_checkpoint = "facebook/bart-large-cnn"

    asr_model = whisper.load_model("small")

    q_model = ORTModelForSequenceClassification.from_pretrained(sentiment_checkpoint)
    q_tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
    sent_pipe = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)

    sum_pipe = pipeline("summarization", model=summary_checkpoint, tokenizer=summary_checkpoint)

    ner_model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)
    ner_tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
    # The pipeline must be built from the NER model/tokenizer loaded above and
    # bound to the same name that is returned below.
    ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)

    sbert = SentenceTransformer("all-mpnet-base-v2")
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

    return asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder
32
+
33
@st.experimental_singleton(suppress_st_warning=True)
def get_spacy():
    """Load the ``en_core_web_lg`` spaCy pipeline and return it."""
    return en_core_web_lg.load()
37
+
38
@st.experimental_memo(suppress_st_warning=True)
def inference(link, upload):
    '''Convert Youtube video or Audio upload to text

    Args:
        link: candidate video URL. It is used only when it parses as an
            ``http``/``https`` URL with a host part.
        upload: audio handed directly to the ASR model's ``transcribe`` when
            ``link`` is not a usable URL.

    Returns:
        ``(results, title)`` where ``results`` is the value returned by the
        ASR model's ``transcribe`` and ``title`` is the video title or the
        fixed string "Transcribed Earnings Audio". ``None`` is returned when
        neither a usable link nor an upload was supplied.
    '''
    # The ASR model is the first element of the tuple built by load_models().
    asr_model = load_models()[0]

    try:
        parsed = urlparse(link) if isinstance(link, str) else None
    except ValueError:
        # urlparse rejects some malformed inputs; treat them as "not a URL".
        parsed = None

    if parsed is not None and parsed.scheme in ("http", "https") and parsed.netloc:
        yt = YouTube(link)
        title = yt.title
        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
        results = asr_model.transcribe(path)

        return results, title

    if upload:
        results = asr_model.transcribe(upload)

        return results, "Transcribed Earnings Audio"

    # Nothing to transcribe; callers must handle the missing result.
    return None
56
+
57
@st.experimental_memo(suppress_st_warning=True)
def sentiment_pipe(earnings_text):
    '''Determine the sentiment of the text

    Args:
        earnings_text: text to split into sentences and classify.

    Returns:
        tuple: ``(earnings_sentiment, earnings_sentences)`` - the classifier
        output for the sentences and the sentences themselves.
    '''
    earnings_sentences = sent_tokenize(earnings_text)
    if not earnings_sentences:
        # Nothing to classify; skip the model call entirely.
        return [], earnings_sentences

    # The sentiment pipeline is the second element returned by load_models().
    sent_pipe = load_models()[1]
    earnings_sentiment = sent_pipe(earnings_sentences)

    return earnings_sentiment, earnings_sentences
65
+
66
@st.experimental_memo(suppress_st_warning=True)
def preprocess_plain_text(text,window_size=3):
    '''Preprocess text for semantic search

    The text is stripped of non-ASCII characters, URLs, @mentions and
    #hashtags, its whitespace is normalised, and the result is split into
    sentences that are grouped into passages of up to ``window_size``
    consecutive sentences.

    Args:
        text: raw input text.
        window_size: number of sentences per passage. Smaller values may lose
            context from neighbouring sentences; larger values keep more
            context but produce longer passages.

    Returns:
        list[str]: the passages, in document order.

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    '''
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    text = text.encode("ascii", "ignore").decode()  # unicode
    text = re.sub(r"https*\S+", " ", text)  # url
    text = re.sub(r"@\S+", " ", text)  # mentions
    text = re.sub(r"#\S+", " ", text)  # hashtags
    text = re.sub(r"\s{2,}", " ", text)  # over spaces

    # Strip every line, break it on spaces, drop the empty pieces and join
    # everything back with single spaces. Line breaks do not survive this
    # step, so the cleaned text is treated as one paragraph.
    pieces = (
        phrase.strip()
        for line in text.splitlines()
        for phrase in line.strip().split(" ")
    )
    text = " ".join(piece for piece in pieces if piece)

    sentences = sent_tokenize(text) if text else []

    # Combine up to `window_size` consecutive sentences into one passage.
    passages = [
        " ".join(sentences[start_idx:start_idx + window_size])
        for start_idx in range(0, len(sentences), window_size)
    ]

    print(f"Sentences: {len(sentences)}")
    print(f"Passages: {len(passages)}")

    return passages
106
+
107
@st.experimental_memo(suppress_st_warning=True)
def chunk_clean_text(text, max_words=500):
    """Chunk text into pieces of at most ``max_words`` space-separated words.

    Sentences (as segmented by the spaCy pipeline) are never split: whole
    sentences are appended to the current chunk until the next one would push
    it past ``max_words``, at which point a new chunk is started. A single
    sentence longer than ``max_words`` therefore becomes its own oversized
    chunk.

    Args:
        text: text to chunk.
        max_words: upper bound on words per chunk (default 500).

    Returns:
        list[str]: the chunks, each re-joined with single spaces.
    """
    nlp = get_spacy()
    article = nlp(text)

    chunks = []
    for sentence in article.sents:
        words = sentence.text.split(" ")
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    return [" ".join(chunk) for chunk in chunks]
132
+
133
def summary_downloader(raw_text):
    """Render a Streamlit download link for ``raw_text`` as a text file.

    The text is embedded in the link as a base64 ``data:`` URI and the
    suggested file name carries the current local timestamp.

    Args:
        raw_text: the text to offer for download.
    """
    b64 = base64.b64encode(raw_text.encode()).decode()
    # NOTE(review): the timestamp format is an assumption - the name
    # `time_str` was referenced here but not defined anywhere in this file.
    time_str = time.strftime("%Y%m%d-%H%M%S")
    new_filename = "new_text_file_{}_.txt".format(time_str)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(href,unsafe_allow_html=True)
140
+
141
def get_all_entities_per_sentence(text):
    """Collect named entities for every sentence of ``text``.

    Each sentence is tagged twice: once with the spaCy pipeline and once with
    the transformer NER pipeline; both result sets are kept (duplicates are
    not removed here).

    Args:
        text: a string, or an iterable of strings that is concatenated
            without separators before processing.

    Returns:
        list[list[str]]: one list of entity strings per sentence.
    """
    nlp = get_spacy()
    # The NER pipeline is the fourth element returned by load_models().
    ner_pipe = load_models()[3]

    doc = nlp(''.join(text))

    entities_all_sentences = []
    for sentence in doc.sents:
        # spaCy entities
        entities_this_sentence = [str(entity) for entity in sentence.ents]

        # XLM entities
        entities_this_sentence.extend(
            str(entity["word"]) for entity in ner_pipe(str(sentence))
        )

        entities_all_sentences.append(entities_this_sentence)

    return entities_all_sentences
168
+
169
def get_all_entities(text):
    """Return all entities found in ``text`` as one flat list.

    This flattens the per-sentence lists produced by
    ``get_all_entities_per_sentence`` while keeping their order.
    """
    all_entities_per_sentence = get_all_entities_per_sentence(text)
    return [
        entity
        for sentence_entities in all_entities_per_sentence
        for entity in sentence_entities
    ]
172
+
173
def _drop_contained_entities(entities):
    """Deduplicate ``entities`` and drop those contained in another entity.

    Containment is a case-insensitive substring test. When two entries differ
    only by case, the first one is kept (instead of both being discarded).
    """
    unique = list(dict.fromkeys(entities))
    lowered = [entity.lower() for entity in unique]

    kept = []
    for idx, needle in enumerate(lowered):
        contained = any(
            needle in other and (needle != other or other_idx < idx)
            for other_idx, other in enumerate(lowered)
            if other_idx != idx
        )
        if not contained:
            kept.append(unique[idx])
    return kept


def get_and_compare_entities(article_content,summary_output):
    """Split the summary's entities into those supported by the article and those not.

    A summary entity counts as matched when it is a case-insensitive
    substring of some article entity, or when the inner product of its
    sentence embedding with that of some article entity exceeds 0.9.

    Args:
        article_content: source text (see ``get_all_entities``).
        summary_output: summary text whose entities are checked.

    Returns:
        tuple[list[str], list[str]]: ``(matched_entities, unmatched_entities)``,
        each deduplicated and with entities contained in a longer entity of
        the same list removed.
    """
    # The sentence encoder is the fifth element returned by load_models().
    sentence_embedding_model = load_models()[4]

    entities_article = get_all_entities(article_content)
    entities_summary = get_all_entities(summary_output)

    article_entities_lower = [entity.lower() for entity in entities_article]

    # Embeddings are computed lazily and at most once per distinct entity.
    embedding_cache = {}

    def embed(entity):
        if entity not in embedding_cache:
            embedding_cache[entity] = sentence_embedding_model.encode(
                entity, show_progress_bar=False
            )
        return embedding_cache[entity]

    matched_entities = []
    unmatched_entities = []
    for entity in entities_summary:
        needle = entity.lower()
        if any(needle in candidate for candidate in article_entities_lower):
            matched_entities.append(entity)
        elif any(
            np.inner(embed(entity), embed(art_entity)) > 0.9
            for art_entity in entities_article
        ):
            matched_entities.append(entity)
        else:
            unmatched_entities.append(entity)

    return (
        _drop_contained_entities(matched_entities),
        _drop_contained_entities(unmatched_entities),
    )
219
+
220
def highlight_entities(article_content,summary_output):
    """Wrap the summary's entities in coloured ``<mark>`` tags.

    Entities matched against the article are highlighted green, unmatched
    ones red (see ``get_and_compare_entities``).

    Args:
        article_content: source text the summary was produced from.
        summary_output: summary text to annotate.

    Returns:
        ``HTML_WRAPPER`` formatted with the parsed, annotated summary.
    """
    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)

    print(summary_output)

    highlight_plan = (
        (matched_entities, markdown_start_green),
        (unmatched_entities, markdown_start_red),
    )
    for entities, markdown_start in highlight_plan:
        for entity in entities:
            if not entity:
                # An empty pattern would match at every position.
                continue
            # Escape the entity so characters such as "." or "(" are matched
            # literally. The lookahead skips text sitting inside an already
            # inserted "rgb(...)" style value.
            pattern = rf'({re.escape(entity)})(?![^rgb\(]*\))'
            replacement = markdown_start + entity + markdown_end
            # A callable replacement keeps backslashes in the entity from
            # being interpreted as regex template escapes.
            summary_output = re.sub(pattern, lambda _match, text=replacement: text, summary_output)

    print("")
    print(summary_output)

    # NOTE(review): BeautifulSoup and HTML_WRAPPER are not defined or imported
    # in this file - confirm where they are expected to come from.
    soup = BeautifulSoup(summary_output, features="html.parser")

    return HTML_WRAPPER.format(soup)
245
+
246
+
247
def display_df_as_table(model,top_k,score='score',passages=None):
    '''Display the df with text and scores as a table

    Args:
        model: sequence of hit dicts; each must provide ``score`` (see below)
            and ``'corpus_id'`` keys.
        top_k: number of leading hits to include.
        score: key under which each hit stores its score.
        passages: sequence indexed by ``hit['corpus_id']``. When omitted, a
            module-level ``passages`` variable is used, as before.

    Returns:
        pandas.DataFrame: columns ``Score`` (rounded to 2 decimals) and ``Text``.

    Raises:
        NameError: if ``passages`` is not given and no module-level
            ``passages`` exists.
    '''
    if passages is None:
        # Backward compatibility with callers that relied on a global.
        try:
            passages = globals()["passages"]
        except KeyError:
            raise NameError("name 'passages' is not defined; pass it explicitly") from None

    rows = [(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]]
    df = pd.DataFrame(rows, columns=['Score', 'Text'])
    df['Score'] = df['Score'].round(2)

    return df
254
+
255
def make_spans(text,results):
    """Pair each sentence of ``text`` with the label of its classifier result.

    Args:
        text: text that is split into sentences with ``sent_tokenize``.
        results: sequence of dicts, each with a ``'label'`` key, positionally
            aligned with the sentences.

    Returns:
        list[tuple[str, str]]: ``(sentence, label)`` pairs; truncated to the
        shorter of the two sequences.
    """
    labels = [result['label'] for result in results]
    return list(zip(sent_tokenize(text), labels))
262
+
263
## Fiscal Sentiment by Sentence
def fin_ext(text):
    """Classify every sentence of ``text`` and return ``(sentence, label)`` pairs.

    Args:
        text: text to split into sentences and classify.

    Returns:
        The list produced by ``make_spans`` for ``text`` and the classifier output.
    """
    # NOTE(review): `remote_clx` is not defined or imported in this file -
    # confirm which classifier is meant to be called here.
    results = remote_clx(sent_tokenize(text))
    return make_spans(text,results)