Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Files Community

nickmuchi committed on Oct 2, 2022

Commit

d1dcb4e

1 Parent(s): 8a619b7

Update app.py

Browse files

Files changed (1) hide show

app.py +155 -3

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from optimum.onnxruntime import ORTModelForSequenceClassification
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
 import streamlit as st
 nltk.download('punkt')
@@ -50,18 +51,28 @@ auth_token = os.environ.get("auth_token")
 progress_bar = st.sidebar.progress(0)
-@st.experimental_singleton()
 def load_models():
     asr_model = whisper.load_model("small")
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
     sum_pipe = pipeline("summarization",model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
-    return asr_model, sent_pipe, sum_pipe, cross_encoder
-asr_model, sent_pipe, sum_pipe, cross_encoder  = load_models()
 @st.experimental_memo(suppress_st_warning=True)
 def inference(link, upload):
@@ -131,6 +142,147 @@ def preprocess_plain_text(text,window_size=3):
     print(f"Passages: {len(passages)}")
     return passages
 def display_df_as_table(model,top_k,score='score'):
     '''Display the df with text and scores as a table'''

 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
 import streamlit as st
+import en_core_web_lg
 nltk.download('punkt')
 progress_bar = st.sidebar.progress(0)
+@st.experimental_singleton(suppress_st_warning=True)
 def load_models():
+    """Load every model used by the app and return them as one tuple.
+
+    Loaded here:
+      * the Whisper "small" speech-recognition model,
+      * a text-classification pipeline over the quantized finbert-tone ONNX model,
+      * a summarization pipeline over facebook/bart-large-cnn,
+      * a grouped-entity NER pipeline over xlm-roberta-large-finetuned-conll03-english,
+      * the all-mpnet-base-v2 sentence-embedding model,
+      * the ms-marco MiniLM cross-encoder.
+
+    Returns:
+        tuple: ``(asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder)``.
+    """
+    # The module-level ``from transformers import ...`` line visible in this diff
+    # does not include the token-classification class, so bring it in locally.
+    from transformers import AutoModelForTokenClassification
     asr_model = whisper.load_model("small")
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+    ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
+    ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
     sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
     sum_pipe = pipeline("summarization",model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")
+    # Build the NER pipeline from the model/tokenizer loaded above and bind it to
+    # ``ner_pipe`` -- the name the return statement below hands back to callers.
+    ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
+    sbert = SentenceTransformer("all-mpnet-base-v2")
     cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+    return asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder
+@st.experimental_singleton(suppress_st_warning=True)
+def get_spacy():
+    """Return the loaded ``en_core_web_lg`` spaCy pipeline."""
+    return en_core_web_lg.load()
+# Module-level handles shared by the helpers below: the spaCy pipeline and the
+# six objects returned by load_models(), unpacked in the order it returns them.
+nlp = get_spacy()
+asr_model, sent_pipe, sum_pipe, ner_pipe, sbert, cross_encoder  = load_models()
 @st.experimental_memo(suppress_st_warning=True)
 def inference(link, upload):
     print(f"Passages: {len(passages)}")
     return passages
+@st.experimental_memo(suppress_st_warning=True)
+def chunk_clean_text(text):
+    """Group the sentences of *text* into chunks of at most 500 words.
+
+    Sentences come from the module-level spaCy pipeline ``nlp`` and words are
+    counted by splitting on single spaces. A sentence is never split, so a
+    single sentence longer than 500 words ends up as its own oversized chunk.
+
+    Returns:
+        list[str]: the chunks, each re-joined with single spaces.
+    """
+    word_groups = []
+    for sent in nlp(text).sents:
+        words = sent.text.split(" ")
+        # Grow the latest group while it stays within the limit; otherwise
+        # (or for the very first sentence) open a new group.
+        if word_groups and len(word_groups[-1]) + len(words) <= 500:
+            word_groups[-1].extend(words)
+        else:
+            word_groups.append(words)
+    return [" ".join(group) for group in word_groups]
+def summary_downloader(raw_text):
+    """Render a heading and an HTML download link for *raw_text* in the Streamlit page.
+
+    The text is embedded in the link itself as a base64 ``data:`` URI.
+    NOTE(review): ``time_str`` (used in the file name) and ``base64`` are not
+    defined in the part of the file visible here -- presumably module-level;
+    confirm.
+    """
+    encoded = base64.b64encode(raw_text.encode()).decode()
+    download_name = "new_text_file_{}_.txt".format(time_str)
+    st.markdown("#### Download Summary as a File ###")
+    st.markdown(
+        f'<a href="data:file/txt;base64,{encoded}" download="{download_name}">Click to Download!!</a>',
+        unsafe_allow_html=True,
+    )
+def get_all_entities_per_sentence(text):
+    """Collect the named entities found in each sentence of *text*.
+
+    *text* is passed through ``''.join`` first, so it may be a string or an
+    iterable of string pieces. Sentences are split with the module-level
+    spaCy pipeline ``nlp``. For every sentence the result holds the spaCy
+    entities followed by the ``"word"`` of each entity returned by the
+    module-level NER pipeline ``ner_pipe``.
+
+    Returns:
+        list[list[str]]: one list of entity strings per sentence, in order.
+    """
+    doc = nlp(''.join(text))
+    entities_all_sentences = []
+    for sentence in doc.sents:
+        entities_this_sentence = [str(entity) for entity in sentence.ents]
+        # Use the NER *pipeline* (``ner_pipe``): the raw ``ner_model`` only
+        # exists inside load_models() and cannot be called on a plain string.
+        entities_this_sentence.extend(
+            str(entity["word"]) for entity in ner_pipe(str(sentence))
+        )
+        entities_all_sentences.append(entities_this_sentence)
+    return entities_all_sentences
+def get_all_entities(text):
+    """Return every entity of *text* as one flat list, in sentence order."""
+    return [
+        entity
+        for sentence_entities in get_all_entities_per_sentence(text)
+        for entity in sentence_entities
+    ]
+def _drop_contained_entities(entities):
+    """Return *entities* minus any entry that is a case-insensitive substring of a different entry."""
+    return [
+        entity for entity in entities
+        if not any(
+            entity != other and entity.lower() in other.lower()
+            for other in entities
+        )
+    ]
+def get_and_compare_entities(article_content,summary_output):
+    """Split the summary's entities into those backed by the article and those that are not.
+
+    A summary entity counts as matched when it is a case-insensitive
+    substring of some article entity, or when the inner product of its
+    ``sbert`` embedding with an article entity's embedding exceeds 0.9.
+    Both result lists are de-duplicated (first occurrence wins), and entries
+    contained in another entry of the same list are dropped.
+
+    Args:
+        article_content: source text, passed to get_all_entities_per_sentence.
+        summary_output: summary text, passed to get_all_entities_per_sentence.
+
+    Returns:
+        tuple[list[str], list[str]]: ``(matched_entities, unmatched_entities)``.
+    """
+    entities_article = list(itertools.chain.from_iterable(get_all_entities_per_sentence(article_content)))
+    entities_summary = list(itertools.chain.from_iterable(get_all_entities_per_sentence(summary_output)))
+    article_entities_lower = [art_entity.lower() for art_entity in entities_article]
+    # Filled on first use: the article embeddings are only needed when a
+    # summary entity has no substring match, and then only once for all of them.
+    article_embeddings = None
+    matched_entities = []
+    unmatched_entities = []
+    for entity in entities_summary:
+        entity_lower = entity.lower()
+        if any(entity_lower in art_lower for art_lower in article_entities_lower):
+            matched_entities.append(entity)
+            continue
+        is_similar = False
+        if entities_article:
+            if article_embeddings is None:
+                # ``sbert`` is the module-level SentenceTransformer returned by load_models().
+                article_embeddings = sbert.encode(entities_article, show_progress_bar=False)
+            entity_embedding = sbert.encode(entity, show_progress_bar=False)
+            # One inner product per article entity; any score above 0.9 is a match.
+            is_similar = bool((np.inner(article_embeddings, entity_embedding) > 0.9).any())
+        if is_similar:
+            matched_entities.append(entity)
+        else:
+            unmatched_entities.append(entity)
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    matched_entities = _drop_contained_entities(list(dict.fromkeys(matched_entities)))
+    unmatched_entities = _drop_contained_entities(list(dict.fromkeys(unmatched_entities)))
+    return matched_entities, unmatched_entities
+def highlight_entities(article_content,summary_output):
+    """Wrap the summary's entities in coloured ``<mark>`` tags and return the HTML.
+
+    Entities that get_and_compare_entities reports as matched against the
+    article are highlighted green, the unmatched ones red. The marked-up
+    summary is parsed with BeautifulSoup and inserted into ``HTML_WRAPPER``.
+    NOTE(review): ``HTML_WRAPPER`` is not defined in the part of the file
+    visible here -- presumably a module-level format string; confirm.
+
+    Args:
+        article_content: source text the summary was produced from.
+        summary_output: summary text to annotate.
+
+    Returns:
+        The result of ``HTML_WRAPPER.format(...)`` on the parsed summary.
+    """
+    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
+    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
+    markdown_end = "</mark>"
+    def _mark(text, entities, opening_tag):
+        """Wrap every occurrence of each entity in *text* with *opening_tag* ... ``</mark>``."""
+        for entity in entities:
+            if not entity:
+                # An empty pattern would match at every position.
+                continue
+            # re.escape: entity text is literal, not a regex ("U.S.", "C++", ...).
+            # The negative lookahead skips text inside the ``rgb(...)`` value of
+            # a <mark> tag inserted by an earlier replacement.
+            pattern = re.escape(entity) + r'(?![^rgb\(]*\))'
+            # A callable replacement keeps backslashes in the entity from being
+            # read as replacement-template escapes.
+            text = re.sub(pattern, lambda _match: opening_tag + entity + markdown_end, text)
+        return text
+    matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)
+    print(summary_output)
+    summary_output = _mark(summary_output, matched_entities, markdown_start_green)
+    summary_output = _mark(summary_output, unmatched_entities, markdown_start_red)
+    print("")
+    print(summary_output)
+    soup = BeautifulSoup(summary_output, features="html.parser")
+    return HTML_WRAPPER.format(soup)
 def display_df_as_table(model,top_k,score='score'):
     '''Display the df with text and scores as a table'''