new UI

Files changed:
- demo.py +30 -73
- pipeline/__pycache__/post_processors.cpython-39.pyc +0 -0
- pipeline/custom_functions.py +190 -0
- pipeline/post_processors.py +51 -0
- utils/__pycache__/util.cpython-39.pyc +0 -0
- utils/__pycache__/visualize.cpython-39.pyc +0 -0
- utils/util.py +63 -0
- utils/visualize.py +128 -0
demo.py
CHANGED

@@ -4,10 +4,13 @@ from collections import Counter
 
 import spacy
 from spacy.tokens import Doc
-from spacy_streamlit import visualize_spans
+# from spacy_streamlit import visualize_spans
 
 import streamlit as st
 
+from utils.util import delete_overlapping_span
+from utils.visualize import visualize_spans
+
 # nlp = spacy.load(
 #     "packages/en_engagement_RoBERTa-0.0.2/en_engagement_RoBERTa/en_engagement_RoBERTa-0.0.2"
 # )

@@ -27,13 +30,13 @@ st.set_page_config(page_title="ENGAGEMENT analyzer (beta ver 0.2)",
 
 
 @st.cache(allow_output_mutation=True)
-def load_model(
+def load_model():
     # nlp = spacy.load("en_engagement_RoBERTa_context_flz")
     nlp = spacy.load("en_engagement_spl_RoBERTa_acad")
     return (nlp)
 
 
-nlp = load_model(
+nlp = load_model()
 
 doc = nlp(
     'Welcome! Probably this is one of the few attempts to teach a machine how to read the discourse...! Although it is not perfect, you should be able to get a good place to start for your stance-taking analyses. The result will be presented here.'

 [the two removed lines in the hunk above appear truncated in the rendered diff view]

@@ -140,54 +143,6 @@ def delete_span(span_sc: dict):
         del span_sc[idx]
 
 
-def delete_overlapping_span(span_sc: dict):
-    start_token_list = [spn.start for spn in span_sc]
-    dict_ = Counter(start_token_list)
-    overlap = {k: v for k, v in dict_.items() if v > 1}
-
-    id_del = []
-    id_comp = {}
-
-    info = {}
-    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
-                                     start=1):
-        res = {
-            'score': score,
-            'spn': spn,
-            'label': spn.label_,
-            'start': spn.start,
-            'compare': spn.start in overlap,
-            "sents": len(list(spn.sents))
-        }
-        # print(res)
-        info[n] = res
-
-        if res['compare']:
-            if spn.start not in id_comp:
-                id_comp[spn.start] = n
-            else:
-                update = res['score'] > info[id_comp[spn.start]]['score']
-                if update:
-                    id_del.append(id_comp[spn.start])
-                    id_comp[spn.start] = n
-                else:
-                    id_del.append(n)
-                print(update)
-
-        # delete span beyond sentences
-        if len(list(spn.sents)) > 1:
-            id_del.append(n)
-
-    # print(id_comp)
-
-    for n, idx in enumerate(id_del):
-        # print(idx)
-        try:
-            del span_sc[idx - n]
-        except IndexError:
-            continue
-
-
 # st.markdown('''
 # <style>
 # .sidebar .sidebar-content {{

@@ -308,28 +263,30 @@ with st.form("my_form"):
 
     delete_overlapping_span(doc.spans['sc'])
 
-    visualize_spans(
     [the remaining 21 removed lines of the old visualize_spans(...) call are not rendered in the diff view]
+    visualize_spans(
+        doc,
+        spans_key="sc",
+        displacy_options={
+            'template': {
+                "span": TPL_SPAN,
+                'slice': TPL_SPAN_SLICE,
+                'start': TPL_SPAN_START,
+            },
+            "colors": {
+                "ENTERTAIN": "#73C6B6",
+                "DENY": '#CD6155',
+                "COUNTER": "#D35400",
+                "PRONOUNCE": "#2ECC71",
+                "ENDORSE": "#A569BD",
+                "CONCUR": "#F39C12",
+                "CITATION": "#F8C471",
+                "SOURCES": "#F7DC6F",
+                "MONOGLOSS": "#85929E",
+                "ATTRIBUTE": "#85C1E9",
+                "JUSTIFYING": "#2ECC71",
+            },
+        },
+    )
 
     st.subheader("Bibliography")
     st.markdown("""
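Note: the new visualize_spans(...) call references TPL_SPAN, TPL_SPAN_SLICE and TPL_SPAN_START, which are not defined in the hunks shown; they are presumably imported or defined elsewhere in demo.py. As a point of reference only (an assumption, not part of this commit), spaCy 3.3+ ships default span templates under these exact names, so a minimal way to obtain working values would be:

# Not part of this commit: one plausible source for the span templates used above.
from spacy.displacy.templates import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START

print(TPL_SPAN[:60])  # the templates are plain HTML format strings filled in by displaCy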
pipeline/__pycache__/post_processors.cpython-39.pyc
ADDED
Binary file (2.15 kB)
pipeline/custom_functions.py
ADDED

@@ -0,0 +1,190 @@
from functools import partial
from pathlib import Path
from typing import Iterable, Callable
import spacy
from spacy.training import Example
from spacy.tokens import DocBin, Doc

# make the factory work
# from scripts.rel_pipe import make_relation_extractor

# make the config work
# from scripts.rel_model import create_relation_model, create_classification_layer, create_instances, create_tensors
# from scripts.custom_comps.SpanCat_extention import build_mean_max_reducer1, build_mean_max_reducer2, build_mean_max_reducer3, build_mean_max_reducer4

from typing import List, Tuple, cast
from thinc.api import Model, with_getitem, chain, list2ragged, Logistic
from thinc.api import Maxout, Linear, concatenate, glorot_uniform_init, PyTorchLSTM
from thinc.api import reduce_mean, reduce_max, reduce_first, reduce_last
from thinc.types import Ragged, Floats2d

from spacy.util import registry
from spacy.tokens import Doc
from spacy.ml.extract_spans import extract_spans

# @registry.layers("spacy.LinearLogistic.v1")
# def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
#     """An output layer for multi-label classification. It uses a linear layer
#     followed by a logistic activation.
#     """
#     return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())


@registry.layers("mean_max_reducer.v1.5")
def build_mean_max_reducer1(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ),
        Maxout(nO=hidden_size, normalize=True, dropout=dropout),
    )


@registry.layers("mean_max_reducer.v2")
def build_mean_max_reducer2(hidden_size: int,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ), Maxout(nO=hidden_size, normalize=True, dropout=dropout),
        Maxout(nO=hidden_size, normalize=True, dropout=dropout))


# @registry.layers("mean_max_reducer.v2")
# def build_mean_max_reducer2(hidden_size: int,
#                             depth: int) -> Model[Ragged, Floats2d]:
#     """Reduce sequences by concatenating their mean and max pooled vectors,
#     and then combine the concatenated vectors with a hidden layer.
#     """
#     return chain(
#         concatenate(
#             cast(Model[Ragged, Floats2d], reduce_last()),
#             cast(Model[Ragged, Floats2d], reduce_first()),
#             reduce_mean(),
#             reduce_max(),
#         ), Maxout(nO=hidden_size, normalize=True, dropout=0.0),
#         PyTorchLSTM(nO=64, nI=hidden_size, bi=True, depth=depth, dropout=0.2))


@registry.layers("mean_max_reducer.v3")
def build_mean_max_reducer3(hidden_size: int,
                            maxout_pieces: int = 3,
                            dropout: float = 0.0) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    hidden_size2 = int(hidden_size / 2)
    hidden_size3 = int(hidden_size / 2)
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ),
        Maxout(nO=hidden_size,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
        Maxout(nO=hidden_size2,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout),
        Maxout(nO=hidden_size3,
               nP=maxout_pieces,
               normalize=True,
               dropout=dropout))


@registry.layers("mean_max_reducer.v3.3")
def build_mean_max_reducer4(hidden_size: int,
                            depth: int) -> Model[Ragged, Floats2d]:
    """Reduce sequences by concatenating their mean and max pooled vectors,
    and then combine the concatenated vectors with a hidden layer.
    """
    hidden_size2 = int(hidden_size / 2)
    hidden_size3 = int(hidden_size / 2)
    return chain(
        concatenate(
            cast(Model[Ragged, Floats2d], reduce_last()),
            cast(Model[Ragged, Floats2d], reduce_first()),
            reduce_mean(),
            reduce_max(),
        ), Maxout(nO=hidden_size, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=hidden_size2, nP=3, normalize=True, dropout=0.0),
        Maxout(nO=hidden_size3, nP=3, normalize=True, dropout=0.0))


@registry.architectures("CustomSpanCategorizer.v2")
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Build a span categorizer model, given a token-to-vector model, a
    reducer model to map the sequence of vectors for each span down to a single
    vector, and a scorer model to map the vectors to probabilities.
    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    """
    model = chain(
        cast(
            Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
            with_getitem(
                0,
                chain(tok2vec,
                      cast(Model[List[Floats2d], Ragged], list2ragged()))),
        ),
        extract_spans(),
        reducer,
        scorer,
    )
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("reducer", reducer)
    model.set_ref("scorer", scorer)
    return model


# @registry.architectures("spacy.SpanCategorizer.v1")
# def build_spancat_model(
#     tok2vec: Model[List[Doc], List[Floats2d]],
#     reducer: Model[Ragged, Floats2d],
#     scorer: Model[Floats2d, Floats2d],
# ) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
#     """Build a span categorizer model, given a token-to-vector model, a
#     reducer model to map the sequence of vectors for each span down to a single
#     vector, and a scorer model to map the vectors to probabilities.
#     tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
#     reducer (Model[Ragged, Floats2d]): The reducer model.
#     scorer (Model[Floats2d, Floats2d]): The scorer model.
#     """
#     model = chain(
#         cast(
#             Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
#             with_getitem(
#                 0,
#                 chain(tok2vec,
#                       cast(Model[List[Floats2d], Ragged], list2ragged()))),
#         ),
#         extract_spans(),
#         reducer,
#         scorer,
#     )
#     model.set_ref("tok2vec", tok2vec)
#     model.set_ref("reducer", reducer)
#     model.set_ref("scorer", scorer)
#     return model
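Note: importing this module is what makes the registered names available to a spaCy config (e.g. a spancat reducer block pointing at "mean_max_reducer.v1.5"). A minimal sketch, not part of the commit, of how one of the registered layers could be resolved and built directly; hidden_size and dropout values here are illustrative assumptions:

# Sketch only: resolve the custom reducer registered above and build the Thinc layer.
import pipeline.custom_functions  # noqa: F401  (importing the module runs the @registry decorators)
from spacy.util import registry

make_reducer = registry.layers.get("mean_max_reducer.v1.5")
reducer = make_reducer(hidden_size=128, dropout=0.1)  # illustrative sizes
print(reducer.name)  # a concatenate(...) >> Maxout chain over last/first/mean/max pooling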
pipeline/post_processors.py
ADDED

@@ -0,0 +1,51 @@

from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import pandas as pd
import spacy
from spacy.language import Language

SPAN_ATTRS = ["text", "label_", "start", "end"]


def simple_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                 spans_key: str = "sc",
                 attrs: List[str] = SPAN_ATTRS):
    columns = attrs + ["Conf. score"]
    data = [
        [str(getattr(span, attr))
         for attr in attrs] + [score]  # [f'{score:.5f}']
        for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
    ]
    return data, columns


def const_table(doc: Union[spacy.tokens.Doc, Dict[str, str]],
                spans_key: str = "sc",
                attrs: List[str] = SPAN_ATTRS):
    columns = attrs + ["Conf. score", 'span dep',
                       "POS", "POS sequence", "head"]
    data = []

    for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores']):

        span_info = []
        span_info.extend([str(getattr(span, attr)) for attr in attrs])

        span_info.append(score)
        span_info.append(span.root.dep_)
        span_info.append(span.root.tag_)
        span_info.append("_".join([t.tag_ for t in span]))
        span_info.append(span.root.head.norm_)
        # span_info.append(span.root.head.dep_ == "ROOT")
        data.append(span_info)

    return data, columns


def ngrammar(seq: list, n=2):
    result = []
    n_item = len(seq)
    for idx, item in enumerate(seq):
        if idx + n <= n_item:
            result.append(seq[idx: idx + n])
    return result
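A short usage sketch for these helpers (not part of the commit; it assumes the en_engagement_spl_RoBERTa_acad package from demo.py is installed and stores per-span scores under doc.spans["sc"].attrs["scores"]):

# Sketch only: turn span predictions into a DataFrame and inspect label bigrams.
import pandas as pd
import spacy
from pipeline.post_processors import const_table, ngrammar

nlp = spacy.load("en_engagement_spl_RoBERTa_acad")  # assumed to be installed
doc = nlp("This could arguably be read as a hedged claim.")

data, columns = const_table(doc, spans_key="sc")
print(pd.DataFrame(data, columns=columns))

# bigrams over the predicted span labels
print(ngrammar([span.label_ for span in doc.spans["sc"]], n=2))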
utils/__pycache__/util.cpython-39.pyc
ADDED
Binary file (1.68 kB)

utils/__pycache__/visualize.cpython-39.pyc
ADDED
Binary file (3.45 kB)
utils/util.py
ADDED

@@ -0,0 +1,63 @@
import re
from collections import Counter


def preprocess(text):
    text = re.sub("\n\n", ' &&&&&&&&#&#&#&#&', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\s+', " ", text)
    text = re.sub('&&&&&&&&#&#&#&#&', '\n\n', text)
    return text


def delete_overlapping_span(span_sc: dict):
    # print(span_sc)
    start_token_list = [spn.start for spn in span_sc]
    dict_ = Counter(start_token_list)
    overlap = {k: v for k, v in dict_.items() if v > 1}

    id_del = []
    id_comp = {}

    info = {}
    for n, (spn, score) in enumerate(zip(span_sc, span_sc.attrs['scores']),
                                     start=1):
        res = {
            'score': score,
            'spn': spn,
            'label': spn.label_,
            'start': spn.start,
            'end': spn.end,
            'compare': spn.start in overlap,
            "sents": len(list(spn.sents))
        }
        # print(res)
        info[n] = res

        if res['compare']:
            if spn.start not in id_comp:
                id_comp[spn.start] = n
            else:
                same_lbl = res['label'] == info[id_comp[spn.start]]['label']
                update = res['score'] > info[id_comp[spn.start]]['score']
                if update and same_lbl:
                    print(res['label'], info[id_comp[spn.start]]['label'])
                    print(same_lbl)
                    id_del.append(id_comp[spn.start])
                    id_comp[spn.start] = n
                else:
                    id_del.append(n)
                # print(update)

        # delete span beyond sentences
        if len(list(spn.sents)) > 1:
            id_del.append(n)

    # print(id_comp)

    for n, idx in enumerate(id_del):
        # print(idx)
        try:
            del span_sc[idx - n]
        except IndexError:
            continue
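A sketch of the intended call order for these utilities (not part of the commit; the model name is taken from demo.py, and whether overlapping spans actually occur depends on the spancat suggester used by that pipeline):

# Sketch only: normalize whitespace, run the pipeline, then prune overlapping spans in place.
import spacy
from utils.util import preprocess, delete_overlapping_span

nlp = spacy.load("en_engagement_spl_RoBERTa_acad")  # assumed to be installed
text = preprocess("A first paragraph.\n\nA second\nparagraph with stray line breaks.")
doc = nlp(text)

# When two spans share a start token, the lower-scoring one (with the same label)
# is dropped; spans crossing a sentence boundary are dropped as well.
delete_overlapping_span(doc.spans["sc"])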
utils/visualize.py
ADDED

@@ -0,0 +1,128 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#
# This code is adapted from spacy-streamlit package by explosion
# https://github.com/explosion/spacy-streamlit/blob/master/spacy_streamlit/__init__.py
#

from typing import List, Sequence, Tuple, Optional, Dict, Union, Callable
import streamlit as st
import spacy
from spacy.language import Language
from spacy import displacy
import pandas as pd

import streamlit as st
from spacy_streamlit import visualize_spans
from spacy_streamlit.util import load_model, process_text, get_svg, get_html, LOGO

from pipeline.post_processors import simple_table, const_table, ngrammar
# from skbio import diversity as dv

SPACY_VERSION = tuple(map(int, spacy.__version__.split(".")))

# fmt: off
# SPAN_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"]
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]


def visualize_spans(
    doc: Union[spacy.tokens.Doc, Dict[str, str]],
    *,
    spans_key: str = "sc",
    attrs: List[str] = SPAN_ATTRS,
    show_table: bool = True,
    title: Optional[str] = "Spans",
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
    simple: bool = True,
):
    """
    Visualizer for spans.
    doc (Doc, Dict): The document to visualize.
    spans_key (str): Which spans key to render spans from. Default is "sc".
    attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table
    argument is True.
    show_table (bool): Flag signifying whether to show a table with accompanying span attributes.
    title (str): The title displayed at the top of the Spans visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing span information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
    See https://spacy.io/api/top-level#displacy_options-span
    """
    if SPACY_VERSION < (3, 3, 0):
        raise ValueError(
            f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}"
        )
    if not displacy_options:
        displacy_options = dict()
    displacy_options["spans_key"] = spans_key

    if title:
        st.header(title)

    if manual:
        if show_table:
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False."
            )
        if not isinstance(doc, dict):
            st.warning(
                "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'."
            )
    html = displacy.render(
        doc,
        style="span",
        options=displacy_options,
        manual=manual,
    )
    st.write(f"{get_html(html)}", unsafe_allow_html=True)

    if show_table:
        # data = [
        #     [str(getattr(span, attr)) for attr in attrs] + [str(score)]
        #     for span, score in zip(doc.spans[spans_key], doc.spans[spans_key].attrs['scores'])
        # ]
        if simple:
            data, cols = simple_table(doc, spans_key='sc', attrs=attrs)
        else:
            data, cols = const_table(doc, spans_key='sc', attrs=attrs)

        seq = [s for s in doc.spans[spans_key]]

        span_ngrams = ngrammar(seq=seq, n=3)
        # st.code(span_ngrams)

        if data:
            df = pd.DataFrame(data, columns=cols)

            st.subheader("Span information")
            st.dataframe(
                df.style.highlight_between(subset='Conf. score', right=.7))

            st.subheader("Label counts & Diagnostic confidence score summary")
            counts = df['label_'].value_counts()
            label_counts = df.groupby('label_').agg({
                "label_":
                'count',
                "Conf. score": ['median', 'min', 'max']
            }).round(4)

            st.dataframe(label_counts)

            # st.subheader("Engagement label by grammatical function")
            # label_dep = pd.crosstab(df['span dep'], df['label_'])
            # st.dataframe(label_dep)

            # st.subheader('Quantitative results')
            # st.markdown(
            #     f"Shannon's index: {dv.alpha.shannon(counts, base=2): .3f}")
            # st.markdown(
            #     f"Simpson's e index: {dv.alpha.simpson_e(counts): .3f}")
            # st.markdown(str(dv.alpha_diversity(metric = "shannon", counts=counts, ids = ['ENTERTAIN', 'ATTRIBUTE', 'CITATION', 'COUNTER', 'DENY', 'ENDORSE', 'PRONOUNCE', 'CONCUR', 'MONOGLOSS', 'SOURCES', 'JUSTIFYING'])))
            # print(dv.get_alpha_diversity_metrics())