Spaces:
Sleeping
Sleeping
miesnerjacob
commited on
Commit
·
4b75840
1
Parent(s):
ef5720a
Add application files
Browse files- .DS_Store +0 -0
- emotion_detection.py +40 -0
- keyword_extraction.py +84 -0
- named_entity_recognition.py +34 -0
- part_of_speech_tagging.py +14 -0
- requirements.txt +15 -0
- sentiment_analysis.py +50 -0
- streamlit_app.py +249 -0
- text_annotation.py +51 -0
- text_annotation_utils.py +127 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
emotion_detection.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
2 |
+
from transformers_interpret import SequenceClassificationExplainer
|
3 |
+
import torch
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
class EmotionDetection():
    """Emotion classifier with word-level prediction justification.

    Loads the ``cardiffnlp/twitter-roberta-base-emotion`` tokenizer and
    sequence-classification model, and wraps them in a
    ``SequenceClassificationExplainer`` used to justify predictions.
    """

    def __init__(self, chunksize=512):
        """Load the tokenizer, model and explainer.

        Parameters
        ----------
        chunksize : int
            Maximum number of tokens (special tokens included) passed to the
            model by ``classify``; longer inputs are truncated to this length.
        """
        hub_location = 'cardiffnlp/twitter-roberta-base-emotion'
        self.chunksize = chunksize
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)

    def justify(self, text):
        """Return the explainer's visualization object for ``text``.

        The explainer is first called on ``text`` and then asked to visualize
        the result.
        """
        # The attributions returned here are not needed directly; the call is
        # made so that the subsequent visualize() has something to render.
        self.explainer(text)
        # NOTE(review): passing a filename presumably also saves the
        # visualization to "example.html" in the working directory - confirm
        # whether that file is wanted.
        html = self.explainer.visualize("example.html")

        return html

    def classify(self, text):
        """Return the predicted probability of each emotion for ``text``.

        Returns
        -------
        pandas.Series
            Probabilities indexed by the model's label names, named
            'Predicted Probability'.
        """
        # Special tokens are left enabled (tokenizer default): the sequence
        # classifier is expected to read the leading <s> token, so stripping
        # it would feed the head an ordinary word instead.
        # Truncation keeps over-long inputs within the model's input limit.
        tokens = self.tokenizer(
            text,
            truncation=True,
            max_length=self.chunksize,
            return_tensors='pt',
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**tokens)
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        # Collapse the leading (batch) dimension into one probability vector.
        probs = probs.mean(dim=0).numpy()
        labels = list(self.model.config.id2label.values())
        preds = pd.Series(probs, index=labels, name='Predicted Probability')

        return preds

    def run(self, text):
        """Classify ``text`` and build its justification.

        Returns
        -------
        tuple
            ``(preds, html)`` as returned by ``classify`` and ``justify``.
        """
        preds = self.classify(text)
        html = self.justify(text)

        return preds, html
|
keyword_extraction.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import spacy
|
2 |
+
import pytextrank
|
3 |
+
import re
|
4 |
+
from operator import itemgetter
|
5 |
+
|
6 |
+
|
7 |
+
class KeywordExtractor:
    """Extract key phrases from text and mark where they occur.

    Uses a spaCy ``en_core_web_sm`` pipeline with the ``textrank`` component
    added, and produces an annotation list (plain strings interleaved with
    ``(keyword, "KEY", color)`` tuples) for display.
    """

    def __init__(self):
        """Load the spaCy pipeline and add the textrank component."""
        self.nlp = spacy.load("en_core_web_sm")
        self.nlp.add_pipe("textrank")

    def get_keywords(self, text, max_keywords):
        """Return the text of the first ``max_keywords`` phrases of ``text``."""
        doc = self.nlp(text)

        return [phrase.text for phrase in doc._.phrases[:max_keywords]]

    def get_keyword_indicies(self, string_list, text):
        """Return ``[start, end]`` for every literal occurrence of each string.

        ``end`` is exclusive (it is the regex match end). Empty strings are
        skipped because an empty pattern matches at every position.
        """
        out = []
        for s in string_list:
            if not s:
                continue
            out.extend([m.start(), m.end()] for m in re.finditer(re.escape(s), text))

        return out

    def merge_overlapping_indicies(self, indicies):
        """Merge intervals that overlap, touch, or are one character apart.

        Parameters
        ----------
        indicies : list of [start, end]
            Intervals in any order. The input is not modified.

        Returns
        -------
        list of [start, end]
            New merged intervals sorted by start; ``[]`` for empty input.
        """
        merged = []
        for start, end in sorted(indicies):
            # After sorting, an interval belongs to the previous one when it
            # starts no later than one position past the previous end.
            if merged and start <= merged[-1][1] + 1:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])

        return merged

    def merge_until_finished(self, indicies):
        """Return the fully merged intervals, sorted by start.

        A single sorted sweep already leaves nothing further to merge, so one
        call to ``merge_overlapping_indicies`` is sufficient.
        """
        return self.merge_overlapping_indicies(indicies)

    def get_annotation(self, text, indicies, kws):
        """Split ``text`` into plain segments and highlighted keyword tuples.

        Parameters
        ----------
        text : str
            The original text.
        indicies : list of [start, end]
            Non-overlapping keyword spans (``end`` exclusive).
        kws : list of str
            Unused; kept for interface compatibility.

        Returns
        -------
        list
            Alternating plain strings and ``(keyword, "KEY", "#26aaef")``
            tuples, always starting and ending with a (possibly empty) string.
        """
        # Slice the text directly instead of splicing marker strings into it,
        # so text that happens to contain a marker is left intact.
        final_annotation = []
        cursor = 0
        for start, end in sorted(indicies):
            final_annotation.append(text[cursor:start])
            final_annotation.append((text[start:end], "KEY", "#26aaef"))
            cursor = end
        final_annotation.append(text[cursor:])

        return final_annotation

    def generate(self, text, max_keywords):
        """Extract keywords from ``text`` and build the annotation.

        Returns
        -------
        tuple
            ``(annotation, kws)`` where ``annotation`` is ``None`` when no
            keyword occurrence was found in ``text``.
        """
        kws = self.get_keywords(text, max_keywords)

        indicies = self.get_keyword_indicies(kws, text)
        if indicies:
            indicies_merged = self.merge_until_finished(indicies)
            annotation = self.get_annotation(text, indicies_merged, kws)
        else:
            annotation = None

        return annotation, kws
|
84 |
+
|
named_entity_recognition.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
2 |
+
from transformers import pipeline
|
3 |
+
|
4 |
+
|
5 |
+
class NamedEntityRecognition():
    """Named-entity recognizer producing predictions and a display annotation.

    Wraps a ``transformers`` "ner" pipeline built from the
    ``xlm-roberta-large-finetuned-conll03-english`` tokenizer and model.
    """

    # Predictions whose text is one of these are rendered as plain text
    # rather than as an entity.
    _EXCLUDED_SPANS = frozenset(['', '.', '. ', ' '])

    def __init__(self):
        """Load the tokenizer and model and build the grouped-entity pipeline."""
        model_name = "xlm-roberta-large-finetuned-conll03-english"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.nlp = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)

    def get_annotation(self, preds, text):
        """Split ``text`` at the predicted spans and label the entity parts.

        Parameters
        ----------
        preds : list of dict
            Predictions with 'start', 'end', 'word' and 'entity_group' keys.
        text : str
            The text the predictions refer to.

        Returns
        -------
        list
            Plain strings for non-entity text and
            ``(span_text, entity_group, "")`` tuples for entities, in text
            order. Always starts and ends with a (possibly empty) string.
        """
        final_annotation = []
        cursor = 0
        for pred in sorted(preds, key=lambda p: p['start']):
            start, end = pred['start'], pred['end']
            final_annotation.append(text[cursor:start])

            # Label each span from its own prediction instead of looking the
            # text up by 'word': repeated mentions keep their own entity
            # group, and a 'word' that differs from the raw slice still
            # gets labelled.
            span_text = text[start:end]
            if span_text in self._EXCLUDED_SPANS or pred.get('word') in self._EXCLUDED_SPANS:
                final_annotation.append(span_text)
            else:
                final_annotation.append((span_text, pred['entity_group'], ""))
            cursor = end
        final_annotation.append(text[cursor:])

        return final_annotation

    def classify(self, text):
        """Run the pipeline on ``text``.

        Returns
        -------
        tuple
            ``(preds, ner_annotation)``: the raw pipeline output and the
            annotation list built by ``get_annotation``.
        """
        preds = self.nlp(text)
        ner_annotation = self.get_annotation(preds, text)
        return preds, ner_annotation
|
part_of_speech_tagging.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
from nltk.tokenize import word_tokenize
|
3 |
+
nltk.download('punkt')
|
4 |
+
nltk.download('averaged_perceptron_tagger')
|
5 |
+
|
6 |
+
|
7 |
+
class POSTagging():
    """Part-of-speech tagger built on NLTK's tokenizer and tagger."""

    def __init__(self):
        # Nothing to set up: the tagger keeps no per-instance state.
        pass

    def classify(self, text):
        """Tokenize ``text`` into words and return NLTK's POS tags for them."""
        return nltk.pos_tag(word_tokenize(text))
|
requirements.txt
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
news-please~=1.5.20
|
2 |
+
sklearn~=0.0
|
3 |
+
keybert~=0.5.1
|
4 |
+
tensorflow
|
5 |
+
tensorflow-hub~=0.12.0
|
6 |
+
nltk~=3.5
|
7 |
+
gradio~=3.0
|
8 |
+
typing-extensions==3.10.0.2
|
9 |
+
yake~=0.4.8
|
10 |
+
streamlit-option-menu~=0.3.2
|
11 |
+
streamlit-option-menu~=0.3.2
|
12 |
+
st-annotated-text~=3.0.0
|
13 |
+
transformers-interpret~=0.7.2
|
14 |
+
htbuilder==0.6.0
|
15 |
+
pytextrank
|
sentiment_analysis.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
2 |
+
from transformers_interpret import SequenceClassificationExplainer
|
3 |
+
import torch
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
class SentimentAnalysis():
    """Sentiment classifier with word-level prediction justification.

    Loads the ``cardiffnlp/twitter-roberta-base-sentiment`` tokenizer and
    sequence-classification model, renames the model's labels to
    Negative / Neutral / Positive, and wraps both in a
    ``SequenceClassificationExplainer``.
    """

    # LABELS[i] is the human-readable name given to model output class i.
    LABELS = ("Negative", "Neutral", "Positive")

    # Upper bound on tokens (special tokens included) passed to the model.
    # NOTE(review): 512 is presumably this checkpoint's input limit - confirm
    # against the model config.
    MAX_TOKENS = 512

    def __init__(self):
        """Load the tokenizer and model, relabel the config, build the explainer."""
        # Load Tokenizer & Model
        hub_location = 'cardiffnlp/twitter-roberta-base-sentiment'
        self.tokenizer = AutoTokenizer.from_pretrained(hub_location)
        self.model = AutoModelForSequenceClassification.from_pretrained(hub_location)

        # Change model labels in config. Both mappings are rebuilt outright
        # so this does not depend on the config still carrying the generic
        # "LABEL_n" keys.
        self.model.config.id2label = dict(enumerate(self.LABELS))
        self.model.config.label2id = {label: idx for idx, label in enumerate(self.LABELS)}

        # Instantiate explainer
        self.explainer = SequenceClassificationExplainer(self.model, self.tokenizer)

    def justify(self, text):
        """Return the explainer's visualization object for ``text``.

        The explainer is first called on ``text`` and then asked to visualize
        the result.
        """
        # The attributions returned here are not needed directly; the call is
        # made so that the subsequent visualize() has something to render.
        self.explainer(text)
        # NOTE(review): passing a filename presumably also saves the
        # visualization to "example.html" in the working directory - confirm
        # whether that file is wanted.
        html = self.explainer.visualize("example.html")

        return html

    def classify(self, text):
        """Return the predicted probability of each sentiment for ``text``.

        Returns
        -------
        pandas.Series
            Probabilities indexed by "Negative", "Neutral", "Positive",
            named 'Predicted Probability'.
        """
        # Special tokens are left enabled (tokenizer default): the sequence
        # classifier is expected to read the leading <s> token, so stripping
        # it would feed the head an ordinary word instead.
        # Truncation keeps over-long inputs within the model's input limit.
        tokens = self.tokenizer(
            text,
            truncation=True,
            max_length=self.MAX_TOKENS,
            return_tensors='pt',
        )
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**tokens)
        probs = torch.nn.functional.softmax(outputs[0], dim=-1)
        # Collapse the leading (batch) dimension into one probability vector.
        probs = probs.mean(dim=0).numpy()
        preds = pd.Series(probs, index=list(self.LABELS), name='Predicted Probability')

        return preds

    def run(self, text):
        """Classify ``text`` and build its justification.

        Returns
        -------
        tuple
            ``(preds, html)`` as returned by ``classify`` and ``justify``.
        """
        preds = self.classify(text)
        html = self.justify(text)

        return preds, html
|
streamlit_app.py
ADDED
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
from text_annotation import annotated_text
|
4 |
+
from streamlit_option_menu import option_menu
|
5 |
+
from sentiment_analysis import SentimentAnalysis
|
6 |
+
from keyword_extraction import KeywordExtractor
|
7 |
+
from part_of_speech_tagging import POSTagging
|
8 |
+
from emotion_detection import EmotionDetection
|
9 |
+
from named_entity_recognition import NamedEntityRecognition
|
10 |
+
|
11 |
+
hide_streamlit_style = """
|
12 |
+
<style>
|
13 |
+
#MainMenu {visibility: hidden;}
|
14 |
+
footer {visibility: hidden;}
|
15 |
+
</style>
|
16 |
+
"""
|
17 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
18 |
+
|
19 |
+
|
20 |
+
@st.cache(allow_output_mutation=True)
def load_sentiment_model():
    """Create the sentiment analyzer; the result is cached by Streamlit."""
    analyzer = SentimentAnalysis()
    return analyzer
|
23 |
+
|
24 |
+
@st.cache(allow_output_mutation=True)
def load_keyword_model():
    """Create the keyword extractor; the result is cached by Streamlit."""
    extractor = KeywordExtractor()
    return extractor
|
27 |
+
|
28 |
+
@st.cache(allow_output_mutation=True)
def load_pos_model():
    """Create the part-of-speech tagger; the result is cached by Streamlit."""
    tagger = POSTagging()
    return tagger
|
31 |
+
|
32 |
+
@st.cache(allow_output_mutation=True)
def load_emotion_model():
    """Create the emotion detector; the result is cached by Streamlit."""
    detector = EmotionDetection()
    return detector
|
35 |
+
|
36 |
+
@st.cache(allow_output_mutation=True)
def load_ner_model():
    """Create the named-entity recognizer; the result is cached by Streamlit."""
    recognizer = NamedEntityRecognition()
    return recognizer
|
39 |
+
|
40 |
+
|
41 |
+
sentiment_analyzer = load_sentiment_model()
|
42 |
+
keyword_extractor = load_keyword_model()
|
43 |
+
pos_tagger = load_pos_model()
|
44 |
+
emotion_detector = load_emotion_model()
|
45 |
+
ner = load_ner_model()
|
46 |
+
|
47 |
+
with st.sidebar:
|
48 |
+
page = option_menu(menu_title='Menu',
|
49 |
+
menu_icon="robot",
|
50 |
+
options=["Welcome!",
|
51 |
+
"Sentiment Analysis",
|
52 |
+
"Keyword Extraction",
|
53 |
+
"Part of Speech Tagging",
|
54 |
+
"Emotion Detection",
|
55 |
+
"Named Entity Recognition"],
|
56 |
+
icons=["house-door",
|
57 |
+
"emoji-heart-eyes",
|
58 |
+
"key",
|
59 |
+
"chat-dots",
|
60 |
+
"emoji-heart-eyes",
|
61 |
+
"building"],
|
62 |
+
default_index=0
|
63 |
+
)
|
64 |
+
|
65 |
+
st.title('Open-source NLP')
|
66 |
+
|
67 |
+
if page == "Welcome!":
|
68 |
+
st.header('Welcome!')
|
69 |
+
st.write(
|
70 |
+
"""
|
71 |
+
Supercharge your workflow with this platform built using 100% open-source resources!
|
72 |
+
"""
|
73 |
+
)
|
74 |
+
|
75 |
+
st.markdown("![Alt Text](https://media.giphy.com/media/2fEvoZ9tajMxq/giphy.gif)")
|
76 |
+
st.write(
|
77 |
+
"""
|
78 |
+
|
79 |
+
|
80 |
+
"""
|
81 |
+
)
|
82 |
+
st.subheader("Introduction")
|
83 |
+
st.write("""
|
84 |
+
Welcome! This application is a celebration of open-source and the power that programmers have been granted today
|
85 |
+
by those who give back to the community. This tool was constructed using Streamlit, Huggingface Transformers,
|
86 |
+
Transformers-Interpret, NLTK, Spacy, amongst other open-source Python libraries and models.
|
87 |
+
|
88 |
+
Utilizing this tool you will be able to perform a multitude of Natural Language Processing Tasks on a range of
|
89 |
+
different tasks. All you need to do is paste your input, select your task, and hit the start button!
|
90 |
+
|
91 |
+
* This application currently supports:
|
92 |
+
* Sentiment Analysis
|
93 |
+
* Keyword Extraction
|
94 |
+
* Part of Speech Tagging
|
95 |
+
* Emotion Detection
|
96 |
+
* Named Entity Recognition
|
97 |
+
|
98 |
+
More features may be added in the future, depending on community feedback. Please reach out to me at
|
99 |
+
[email protected] or at my Linkedin page listed below if you have ideas or suggestions for improvement.
|
100 |
+
|
101 |
+
If you would like to contribute yourself, feel free to fork the Github repository listed below and submit a merge request.
|
102 |
+
"""
|
103 |
+
)
|
104 |
+
st.subheader("Notes")
|
105 |
+
st.write(
|
106 |
+
"""
|
107 |
+
* This dashboard was constructed by Jacob Miesner, but every resource used is open-source! If you are interested
|
108 |
+
in his other works you can view them here:
|
109 |
+
|
110 |
+
[Project Github](https://github.com/MiesnerJacob/nlp-dashboard)
|
111 |
+
|
112 |
+
[Jacob Miesner's Github](https://github.com/MiesnerJacob)
|
113 |
+
|
114 |
+
[Jacob Miesner's Linkedin](https://www.linkedin.com/in/jacob-miesner-885050125/)
|
115 |
+
|
116 |
+
[Jacob Miesner's Website](https://www.jacobmiesner.com)
|
117 |
+
|
118 |
+
* The prediction justification for some of the tasks are printed as the model views them. For this reason the text
|
119 |
+
may contain special tokens like [CLS] or [SEP] or even hashtags splitting words. If you are knowledgeable about
|
120 |
+
language models and how they work these will be familiar, if you do not have prior experience with language models
|
121 |
+
you can ignore these characters.
|
122 |
+
"""
|
123 |
+
)
|
124 |
+
|
125 |
+
elif page == "Sentiment Analysis":
|
126 |
+
st.header('Sentiment Analysis')
|
127 |
+
st.markdown("![Alt Text](https://media.giphy.com/media/XIqCQx02E1U9W/giphy.gif)")
|
128 |
+
st.write(
|
129 |
+
"""
|
130 |
+
|
131 |
+
|
132 |
+
"""
|
133 |
+
)
|
134 |
+
|
135 |
+
text = st.text_area("Paste text here", value="")
|
136 |
+
|
137 |
+
if st.button('Start!'):
|
138 |
+
with st.spinner("Loading..."):
|
139 |
+
preds, html = sentiment_analyzer.run(text)
|
140 |
+
st.success('All done!')
|
141 |
+
st.write("")
|
142 |
+
st.subheader("Sentiment Predictions")
|
143 |
+
st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
|
144 |
+
st.write("")
|
145 |
+
st.subheader("Sentiment Justification")
|
146 |
+
raw_html = html._repr_html_()
|
147 |
+
st.components.v1.html(raw_html)
|
148 |
+
|
149 |
+
elif page == "Keyword Extraction":
|
150 |
+
st.header('Keyword Extraction')
|
151 |
+
st.markdown("![Alt Text](https://media.giphy.com/media/xT9C25UNTwfZuk85WP/giphy-downsized-large.gif)")
|
152 |
+
st.write(
|
153 |
+
"""
|
154 |
+
|
155 |
+
|
156 |
+
"""
|
157 |
+
)
|
158 |
+
|
159 |
+
text = st.text_area("Paste text here", value="")
|
160 |
+
|
161 |
+
max_keywords = st.slider('# of Keywords Max Limit', min_value=1, max_value=10, value=5, step=1)
|
162 |
+
|
163 |
+
if st.button('Start!'):
|
164 |
+
with st.spinner("Loading..."):
|
165 |
+
annotation, keywords = keyword_extractor.generate(text, max_keywords)
|
166 |
+
st.success('All done!')
|
167 |
+
|
168 |
+
if annotation:
|
169 |
+
st.subheader("Keyword Annotation")
|
170 |
+
st.write("")
|
171 |
+
annotated_text(*annotation)
|
172 |
+
st.text("")
|
173 |
+
|
174 |
+
st.subheader("Extracted Keywords")
|
175 |
+
st.write("")
|
176 |
+
df = pd.DataFrame(keywords, columns=['Extracted Keywords'])
|
177 |
+
csv = df.to_csv(index=False).encode('utf-8')
|
178 |
+
st.download_button('Download Keywords to CSV', csv, file_name='news_intelligence_keywords.csv')
|
179 |
+
|
180 |
+
data_table = st.table(df)
|
181 |
+
|
182 |
+
elif page == "Part of Speech Tagging":
|
183 |
+
st.header('Part of Speech Tagging')
|
184 |
+
st.markdown("![Alt Text](https://media.giphy.com/media/WoWm8YzFQJg5i/giphy.gif)")
|
185 |
+
st.write(
|
186 |
+
"""
|
187 |
+
|
188 |
+
|
189 |
+
"""
|
190 |
+
)
|
191 |
+
|
192 |
+
text = st.text_area("Paste text here", value="")
|
193 |
+
|
194 |
+
if st.button('Start!'):
|
195 |
+
with st.spinner("Loading..."):
|
196 |
+
preds = pos_tagger.classify(text)
|
197 |
+
st.success('All done!')
|
198 |
+
st.write("")
|
199 |
+
st.subheader("Part of Speech tags")
|
200 |
+
annotated_text(*preds)
|
201 |
+
st.write("")
|
202 |
+
st.components.v1.iframe('https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html', height=1000)
|
203 |
+
|
204 |
+
elif page == "Emotion Detection":
|
205 |
+
st.header('Emotion Detection')
|
206 |
+
st.markdown("![Alt Text](https://media.giphy.com/media/fU8X6ozSszyEw/giphy.gif)")
|
207 |
+
st.write(
|
208 |
+
"""
|
209 |
+
|
210 |
+
|
211 |
+
"""
|
212 |
+
)
|
213 |
+
|
214 |
+
text = st.text_area("Paste text here", value="")
|
215 |
+
|
216 |
+
if st.button('Start!'):
|
217 |
+
with st.spinner("Loading..."):
|
218 |
+
preds, html = emotion_detector.run(text)
|
219 |
+
st.success('All done!')
|
220 |
+
st.write("")
|
221 |
+
st.subheader("Emotion Predictions")
|
222 |
+
st.bar_chart(data=preds, width=0, height=0, use_container_width=True)
|
223 |
+
raw_html = html._repr_html_()
|
224 |
+
st.write("")
|
225 |
+
st.subheader("Emotion Justification")
|
226 |
+
st.components.v1.html(raw_html, height=500)
|
227 |
+
|
228 |
+
elif page == "Named Entity Recognition":
|
229 |
+
st.header('Named Entity Recognition')
|
230 |
+
st.markdown("![Alt Text](https://media.giphy.com/media/lxO8wdWdu4tig/giphy.gif)")
|
231 |
+
st.write(
|
232 |
+
"""
|
233 |
+
|
234 |
+
|
235 |
+
"""
|
236 |
+
)
|
237 |
+
|
238 |
+
text = st.text_area("Paste text here", value="")
|
239 |
+
|
240 |
+
if st.button('Start!'):
|
241 |
+
with st.spinner("Loading..."):
|
242 |
+
preds, ner_annotation = ner.classify(text)
|
243 |
+
st.success('All done!')
|
244 |
+
st.write("")
|
245 |
+
st.subheader("NER Predictions")
|
246 |
+
annotated_text(*ner_annotation)
|
247 |
+
st.write("")
|
248 |
+
st.subheader("NER Prediction Metadata")
|
249 |
+
st.write(preds)
|
text_annotation.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from text_annotation_utils import *
|
3 |
+
|
4 |
+
def annotated_text(*args, type=None):
    """Writes text with annotations into your Streamlit app.
    Parameters
    ----------
    *args : str, tuple or htbuilder.HtmlElement
        Arguments can be:
        - strings, to draw the string as-is on the screen.
        - tuples of the form (main_text, annotation_text, background, color) where
          background and foreground colors are optional and should be a CSS-valid string such as
          "#aabbcc" or "rgb(10, 20, 30)"
        - HtmlElement objects in case you want to customize the annotations further. In particular,
          you can import the `annotation()` function from this module to easily produce annotations
          whose CSS you can customize via keyword arguments.
    type : str or None
        'title' wraps the output in ``<p class="big-font">``, 'description' wraps it in
        ``<p class="medium-font">``; any other value writes the annotated HTML unwrapped.
    Examples
    --------
    # >>> annotated_text(
    # ...     "This ",
    # ...     ("is", "verb", "#8ef"),
    # ...     " some ",
    # ...     ("annotated", "adj", "#faa"),
    # ...     ("text", "noun", "#afa"),
    # ...     " for those of ",
    # ...     ("you", "pronoun", "#fea"),
    # ...     " who ",
    # ...     ("like", "verb", "#8ef"),
    # ...     " this sort of ",
    # ...     ("thing", "noun", "#afa"),
    # ... )
    # >>> annotated_text(
    # ...     "Hello ",
    # ...     annotation("world!", "noun", color="#8ef", border="1px dashed red"),
    # ... )
    """
    if type == 'title':
        st.markdown(
            '<p class="big-font">' + get_annotated_html(*args)+ '</p>',
            unsafe_allow_html=True,
        )
    # elif, not if: otherwise a 'title' call would also fall into the final
    # else branch and the text would be written a second time.
    elif type == 'description':
        st.markdown(
            '<p class="medium-font">' + get_annotated_html(*args) + '</p>',
            unsafe_allow_html=True,
        )
    else:
        st.markdown(
            get_annotated_html(*args),
            unsafe_allow_html=True,
        )
|
text_annotation_utils.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import html
|
2 |
+
from htbuilder import H, HtmlElement, styles
|
3 |
+
from htbuilder.units import unit
|
4 |
+
|
5 |
+
# Only works in 3.7+: from htbuilder import div, span
|
6 |
+
div = H.div
|
7 |
+
span = H.span
|
8 |
+
|
9 |
+
# Only works in 3.7+: from htbuilder.units import px, rem, em
|
10 |
+
px = unit.px
|
11 |
+
rem = unit.rem
|
12 |
+
em = unit.em
|
13 |
+
|
14 |
+
# Colors from the Streamlit palette.
|
15 |
+
# These are red-70, orange-70, ..., violet-70, gray-70.
|
16 |
+
PALETTE = [
|
17 |
+
"#ff4b4b",
|
18 |
+
"#ffa421",
|
19 |
+
"#ffe312",
|
20 |
+
"#21c354",
|
21 |
+
"#00d4b1",
|
22 |
+
"#00c0f2",
|
23 |
+
"#1c83e1",
|
24 |
+
"#803df5",
|
25 |
+
"#808495",
|
26 |
+
]
|
27 |
+
|
28 |
+
OPACITIES = [
|
29 |
+
"33", "66",
|
30 |
+
]
|
31 |
+
|
32 |
+
def annotation(body, label="", background=None, color=None, **style):
    """Build an HtmlElement span object with the given body and annotation label.
    The end result will look something like this:
        [body | label]
    Parameters
    ----------
    body : string
        The string to put in the "body" part of the annotation.
    label : string
        The string to put in the "label" part of the annotation.
    background : string or None
        The color to use for the background "chip" containing this annotation.
        If None (or empty), a color is derived from the label's characters, so
        the same label always gets the same color.
    color : string or None
        The color to use for the body and label text.
        If None, no text color is set on the chip.
    style : dict
        Any extra CSS you want to apply to the containing "chip", passed as
        keyword arguments (for example ``border="1px dashed red"``).
    Examples
    --------
    Produce a simple annotation with default colors:
    # >>> annotation("apple", "fruit")
    Produce an annotation with custom colors:
    # >>> annotation("apple", "fruit", background="#FF0", color="black")
    Produce an annotation with crazy CSS:
    # >>> annotation("apple", "fruit", background="#FF0", border="1px dashed red")
    """

    # Only set a text color when the caller asked for one.
    text_color = {'color': color} if color else {}

    if not background:
        # Sum of the label's code points picks both the palette entry and
        # the opacity suffix.
        seed = sum(map(ord, label))
        background = PALETTE[seed % len(PALETTE)] + OPACITIES[seed % len(OPACITIES)]

    # Outer chip holding the body text and the label.
    chip = span(
        style=styles(
            background=background,
            border_radius=rem(0.33),
            padding=(rem(0.125), rem(0.5)),
            overflow="hidden",
            **text_color,
            **style,
        ))
    escaped_body = html.escape(body)

    # Uppercased wrapper around the smaller, dimmed label text.
    label_wrapper = span(
        style=styles(
            padding_left=rem(0.5),
            text_transform="uppercase",
        ))
    label_text = span(
        style=styles(
            font_size=em(0.67),
            opacity=0.5,
        ))(html.escape(label))

    return chip(escaped_body, label_wrapper(label_text))
|
99 |
+
|
100 |
+
|
101 |
+
def get_annotated_html(*args):
    """Writes text with annotations into an HTML string.
    Parameters
    ----------
    *args : see annotated_text()
    Returns
    -------
    str
        An HTML string.
    Raises
    ------
    TypeError
        If an argument is not a str, an HtmlElement or a tuple.
    """

    out = div()

    for arg in args:
        if isinstance(arg, str):
            # Plain text is escaped so it cannot inject markup.
            out(html.escape(arg))

        elif isinstance(arg, HtmlElement):
            out(arg)

        elif isinstance(arg, tuple):
            # Tuples are expanded into annotation(body, label, ...).
            out(annotation(*arg))

        else:
            # TypeError is still an Exception, so callers that catch
            # Exception keep working.
            raise TypeError(
                "Bad input: expected str, tuple or HtmlElement, got "
                + arg.__class__.__name__
            )

    return str(out)
|