Spaces:

spark-nlp
/

sparknlp-grammar-analysis-and-dependency-parsing

Sleeping

App Files Files Community

abdullahmubeen10 committed on Sep 4, 2024

Commit

6520bbf

verified ·

1 Parent(s): 03f408e

Upload 6 files

Browse files

Files changed (6) hide show

.streamlit/config.toml +3 -0
Demo.py +254 -0
Dockerfile +72 -0
images/DependencyParserVisualizer.png +0 -0
pages/Workflow & Model Overview.py +250 -0
requirements.txt +8 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+[theme]
+base="light"
+primaryColor="#29B4E8"

Demo.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import html
+
+import streamlit as st
+import sparknlp
+from johnsnowlabs import nlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+import pyspark.sql.functions as F
+import pandas as pd
+# Page Configuration
+st.set_page_config(
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# CSS Styling
+st.markdown("""
+    <style>
+        .main-title {
+            font-size: 36px;
+            color: #4A90E2;
+            font-weight: bold;
+            text-align: center;
+        }
+        .section {
+            background-color: #f9f9f9;
+            padding: 10px;
+            border-radius: 10px;
+            margin-top: 10px;
+        }
+        .section p, .section ul {
+            color: #666666;
+        }
+        .table {
+            width: 100%;
+            border-collapse: collapse;
+            margin-top: 20px;
+        }
+        .table th, .table td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            text-align: left;
+        }
+        .table th {
+            background-color: #4A90E2;
+            color: white;
+        }
+        .table td {
+            background-color: #f2f2f2;
+        }
+    </style>
+""", unsafe_allow_html=True)
+# Initialize Spark
+@st.cache_resource
+def init_spark():
+    return sparknlp.start()
+# Create NLP Pipeline
+@st.cache_resource
+def create_pipeline():
+    document_assembler = DocumentAssembler() \
+        .setInputCol("text") \
+        .setOutputCol("document")
+    tokenizer = Tokenizer() \
+        .setInputCols(["document"]) \
+        .setOutputCol("token")
+    pos_tagger = PerceptronModel.pretrained("pos_anc", 'en') \
+        .setInputCols("document", "token") \
+        .setOutputCol("pos")
+    dep_parser = DependencyParserModel.pretrained('dependency_conllu') \
+        .setInputCols(["document", "pos", "token"]) \
+        .setOutputCol("dependency")
+    typed_dep_parser = TypedDependencyParserModel.pretrained('dependency_typed_conllu') \
+        .setInputCols(["token", "pos", "dependency"]) \
+        .setOutputCol("dependency_type")
+    pipeline = Pipeline(stages=[
+        document_assembler,
+        tokenizer,
+        pos_tagger,
+        dep_parser,
+        typed_dep_parser
+    ])
+    return pipeline
+# Fit Data to Pipeline
+def fit_data(pipeline, text):
+    df = spark.createDataFrame([[text]]).toDF("text")
+    result = pipeline.fit(df).transform(df)
+    return result
+# Render DataFrame as HTML Table
+def render_table(df, sidebar=False):
+    html = df.to_html(classes="table", index=False, escape=False)
+    if sidebar:
+      st.sidebar.markdown(html, unsafe_allow_html=True)
+    else:
+      st.markdown(html, unsafe_allow_html=True)
+def explain_tags(tag_type, tags, tag_dict):
+    explanations = [(tag, tag_dict[tag]) for tag in tags if tag in tag_dict]
+    if explanations:
+        df = pd.DataFrame(explanations, columns=[f"{tag_type} Tag", f"{tag_type} Meaning"])
+        df.index = [''] * len(df)  # Hide the index
+        render_table(df, sidebar=True)
+# Page Title and Subtitle
+title = "Grammar Analysis & Dependency Parsing"
+sub_title = "Visualize the syntactic structure of a sentence as a directed labeled graph."
+st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
+st.markdown(f'<div style="text-align: center; color: #666666;">{sub_title}</div>', unsafe_allow_html=True)
+# Example Sentences
+examples = [
+    "John Snow is a good man. He knows a lot about science.",
+    "In what country is the WTO headquartered?",
+    "I was wearing my dark blue shirt and tie.",
+    "The Geneva Motor Show is the most popular car show of the year.",
+    "Bill Gates and Steve Jobs had periods of civility."
+]
+# Text Selection
+selected_text = st.selectbox("Select an example", examples)
+custom_input = st.text_input("Try it with your own sentence!")
+text_to_analyze = custom_input if custom_input else selected_text
+st.write('Text to analyze:')
+HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto;
+                   border: 1px solid #e6e9ef; border-radius: 0.25rem;
+                   padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
+st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
+# Initialize Spark and Pipeline
+spark = init_spark()
+pipeline = create_pipeline()
+output = fit_data(pipeline, text_to_analyze)
+# Display Dependency Tree
+st.write("Dependency Tree:")
+nlp.load('dep.typed').viz_streamlit_dep_tree(
+    text=text_to_analyze,
+    title='',
+    sub_title='',
+    set_wide_layout_CSS=False,
+    generate_code_sample=False,
+    key="NLU_streamlit",
+    show_infos=False,
+    show_logo=False,
+    show_text_input=False,
+)
+# Display Raw Result
+st.write("Raw Result:")
+df = output.select(F.explode(F.arrays_zip(
+    output.token.result,
+    output.token.begin,
+    output.token.end,
+    output.pos.result,
+    output.dependency.result,
+    output.dependency_type.result
+)).alias("cols")) \
+    .select(F.expr("cols['0']").alias("chunk"),
+            F.expr("cols['1']").alias("begin"),
+            F.expr("cols['2']").alias("end"),
+            F.expr("cols['3']").alias("pos"),
+            F.expr("cols['4']").alias("dependency"),
+            F.expr("cols['5']").alias("dependency_type")).toPandas()
+render_table(df)
+# Sidebar Content
+# POS and Dependency dictionaries
+pos_dict = {
+    "CC": "Coordinating conjunction", "CD": "Cardinal number", "DT": "Determiner",
+    "EX": "Existential there", "FW": "Foreign word", "IN": "Preposition or subordinating conjunction",
+    "JJ": "Adjective", "JJR": "Adjective, comparative", "JJS": "Adjective, superlative",
+    "LS": "List item marker", "MD": "Modal", "NN": "Noun, singular or mass",
+    "NNS": "Noun, plural", "NNP": "Proper noun, singular", "NNPS": "Proper noun, plural",
+    "PDT": "Predeterminer", "POS": "Possessive ending", "PRP": "Personal pronoun",
+    "PRP$": "Possessive pronoun", "RB": "Adverb", "RBR": "Adverb, comparative",
+    "RBS": "Adverb, superlative", "RP": "Particle", "SYM": "Symbol", "TO": "to",
+    "UH": "Interjection", "VB": "Verb, base form", "VBD": "Verb, past tense",
+    "VBG": "Verb, gerund or present participle", "VBN": "Verb, past participle",
+    "VBP": "Verb, non-3rd person singular present", "VBZ": "Verb, 3rd person singular present",
+    "WDT": "Wh-determiner", "WP": "Wh-pronoun", "WP$": "Possessive wh-pronoun",
+    "WRB": "Wh-adverb"
+}
+dependency_dict = {
+    "acl": "clausal modifier of noun (adjectival clause)",
+    "advcl": "adverbial clause modifier",
+    "advmod": "adverbial modifier",
+    "amod": "adjectival modifier",
+    "appos": "appositional modifier",
+    "aux": "auxiliary",
+    "case": "case marking",
+    "cc": "coordinating conjunction",
+    "ccomp": "clausal complement",
+    "clf": "classifier",
+    "compound": "compound",
+    "conj": "conjunct",
+    "cop": "copula",
+    "csubj": "clausal subject",
+    "dep": "unspecified dependency",
+    "det": "determiner",
+    "discourse": "discourse element",
+    "dislocated": "dislocated elements",
+    "expl": "expletive",
+    "fixed": "fixed multiword expression",
+    "flat": "flat multiword expression",
+    "goeswith": "goes with",
+    "iobj": "indirect object",
+    "list": "list",
+    "mark": "marker",
+    "nmod": "nominal modifier",
+    "nsubj": "nominal subject",
+    "nummod": "numeric modifier",
+    "obj": "object",
+    "obl": "oblique nominal",
+    "orphan": "orphan",
+    "parataxis": "parataxis",
+    "punct": "punctuation",
+    "reparandum": "overridden disfluency",
+    "root": "root",
+    "vocative": "vocative",
+    "xcomp": "open clausal complement"
+}
+# Get unique POS and dependency tags
+unique_pos = df['pos'].unique()
+unique_dep = df['dependency_type'].unique()
+# Sidebar options for explanations
+if st.sidebar.checkbox("Explain POS Tags"):
+    explain_tags("POS", unique_pos, pos_dict)
+if st.sidebar.checkbox("Explain Dependency Types"):
+    explain_tags("Dependency", unique_dep, dependency_dict)
+# Sidebar with Reference Notebook Link
+colab_link = """
+<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb">
+    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
+</a>
+"""
+st.sidebar.markdown('Reference Notebook:')
+st.sidebar.markdown(colab_link, unsafe_allow_html=True)

Dockerfile ADDED Viewed

	@@ -0,0 +1,72 @@

+# Download base image ubuntu 18.04
+FROM ubuntu:18.04
+# Set environment variables
+ENV NB_USER jovyan
+ENV NB_UID 1000
+ENV HOME /home/${NB_USER}
+ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
+# Install required packages
+RUN apt-get update && apt-get install -y \
+    tar \
+    wget \
+    bash \
+    rsync \
+    gcc \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libzmq3-dev \
+    python3 \
+    python3-dev \
+    python3-pip \
+    unzip \
+    pkg-config \
+    software-properties-common \
+    graphviz \
+    openjdk-8-jdk \
+    ant \
+    ca-certificates-java \
+    && apt-get clean \
+    && update-ca-certificates -f
+# Install Python 3.8 and pip
+RUN add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.8 python3-pip \
+    && apt-get clean
+# Set up JAVA_HOME
+RUN echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> /etc/profile \
+    && echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> /etc/profile
+# Create a new user named "jovyan" with user ID 1000
+RUN useradd -m -u ${NB_UID} ${NB_USER}
+# Switch to the "jovyan" user
+USER ${NB_USER}
+# Set home and path variables for the user
+ENV HOME=/home/${NB_USER} \
+    PATH=/home/${NB_USER}/.local/bin:$PATH
+# Set up PySpark to use Python 3.8 for both driver and workers
+ENV PYSPARK_PYTHON=/usr/bin/python3.8
+ENV PYSPARK_DRIVER_PYTHON=/usr/bin/python3.8
+# Set the working directory to the user's home directory
+WORKDIR ${HOME}
+# Upgrade pip and install Python dependencies
+RUN python3.8 -m pip install --upgrade pip
+COPY requirements.txt /tmp/requirements.txt
+RUN python3.8 -m pip install -r /tmp/requirements.txt
+# Copy the application code into the container at /home/jovyan
+COPY --chown=${NB_USER}:${NB_USER} . ${HOME}
+# Expose port for Streamlit
+EXPOSE 7860
+# Define the entry point for the container
+ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]

images/DependencyParserVisualizer.png ADDED Viewed

pages/Workflow & Model Overview.py ADDED Viewed

	@@ -0,0 +1,250 @@

+import streamlit as st
+# Custom CSS for better styling
+st.markdown("""
+    <style>
+        .main-title {
+            font-size: 36px;
+            color: #4A90E2;
+            font-weight: bold;
+            text-align: center;
+        }
+        .sub-title {
+            font-size: 24px;
+            color: #4A90E2;
+            margin-top: 20px;
+        }
+        .section {
+            background-color: #f9f9f9;
+            padding: 15px;
+            border-radius: 10px;
+            margin-top: 20px;
+        }
+        .section h2 {
+            font-size: 22px;
+            color: #4A90E2;
+        }
+        .section p, .section ul {
+            color: #666666;
+        }
+        .link {
+            color: #4A90E2;
+            text-decoration: none;
+        }
+    </style>
+""", unsafe_allow_html=True)
+# Title
+st.markdown('<div class="main-title">Grammar Analysis & Dependency Parsing</div>', unsafe_allow_html=True)
+# Introduction Section
+st.markdown("""
+<div class="section">
+    <p>Understanding the grammatical structure of sentences is crucial in Natural Language Processing (NLP) for various applications such as translation, text summarization, and information extraction. This page focuses on Grammar Analysis and Dependency Parsing, which help in identifying the grammatical roles of words in a sentence and the relationships between them.</p>
+    <p>We utilize Spark NLP, a robust library for NLP tasks, to perform Part-of-Speech (POS) tagging and Dependency Parsing, enabling us to analyze sentences at scale with high accuracy.</p>
+</div>
+""", unsafe_allow_html=True)
+# Understanding Dependency Parsing
+st.markdown('<div class="sub-title">Understanding Dependency Parsing</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>Dependency Parsing is a technique used to understand the grammatical structure of a sentence by identifying the dependencies between words. It maps out relationships such as subject-verb, adjective-noun, etc., which are essential for understanding the sentence's meaning.</p>
+    <p>In Dependency Parsing, each word in a sentence is linked to another word, creating a tree-like structure called a dependency tree. This structure helps in various NLP tasks, including information retrieval, question answering, and machine translation.</p>
+</div>
+""", unsafe_allow_html=True)
+# Implementation Section
+st.markdown('<div class="sub-title">Implementing Grammar Analysis & Dependency Parsing</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>The following example demonstrates how to implement a grammar analysis pipeline using Spark NLP. The pipeline includes stages for tokenization, POS tagging, and dependency parsing, extracting the grammatical relationships between words in a sentence.</p>
+</div>
+""", unsafe_allow_html=True)
+st.code('''
+import sparknlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+import pyspark.sql.functions as F
+# Initialize Spark NLP
+spark = sparknlp.start()
+# Stage 1: Document Assembler
+document_assembler = DocumentAssembler()\\
+    .setInputCol("text")\\
+    .setOutputCol("document")
+# Stage 2: Tokenizer
+tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
+# Stage 3: POS Tagger
+postagger = PerceptronModel.pretrained("pos_anc", "en")\\
+    .setInputCols(["document", "token"])\\
+    .setOutputCol("pos")
+# Stage 4: Dependency Parsing
+dependency_parser = DependencyParserModel.pretrained("dependency_conllu")\\
+    .setInputCols(["document", "pos", "token"])\\
+    .setOutputCol("dependency")
+# Stage 5: Typed Dependency Parsing
+typed_dependency_parser = TypedDependencyParserModel.pretrained("dependency_typed_conllu")\\
+    .setInputCols(["token", "pos", "dependency"])\\
+    .setOutputCol("dependency_type")
+# Define the pipeline
+pipeline = Pipeline(stages=[
+    document_assembler,
+    tokenizer,
+    postagger,
+    dependency_parser,
+    typed_dependency_parser
+])
+# Example sentence
+example = spark.createDataFrame([
+    ["Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul."]
+]).toDF("text")
+# Apply the pipeline
+result = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")).transform(example)
+# Display the results
+result.select(
+    F.explode(
+        F.arrays_zip(
+            result.token.result,
+            result.pos.result,
+            result.dependency.result,
+            result.dependency_type.result
+        )
+    ).alias("cols")
+).select(
+    F.expr("cols['0']").alias("token"),
+    F.expr("cols['1']").alias("pos"),
+    F.expr("cols['2']").alias("dependency"),
+    F.expr("cols['3']").alias("dependency_type")
+).show(truncate=False)
+''', language='python')
+# Example Output
+st.text("""
++------------+---+------------+---------------+
+|token       |pos|dependency  |dependency_type|
++------------+---+------------+---------------+
+|Unions      |NNP|ROOT        |root           |
+|representing|VBG|workers     |amod           |
+|workers     |NNS|Unions      |flat           |
+|at          |IN |Turner      |case           |
+|Turner      |NNP|workers     |flat           |
+|Newall      |NNP|say         |nsubj          |
+|say         |VBP|Unions      |parataxis      |
+|they        |PRP|disappointed|nsubj          |
+|are         |VBP|disappointed|nsubj          |
+|'           |POS|disappointed|case           |
+|disappointed|JJ |say         |nsubj          |
+|'           |POS|disappointed|case           |
+|after       |IN |talks       |case           |
+|talks       |NNS|disappointed|nsubj          |
+|with        |IN |stricken    |det            |
+|stricken    |NN |talks       |amod           |
+|parent      |NN |Mogul       |flat           |
+|firm        |NN |Mogul       |flat           |
+|Federal     |NNP|Mogul       |flat           |
+|Mogul       |NNP|stricken    |flat           |
++------------+---+------------+---------------+
+""")
+# Visualizing the Dependencies
+st.markdown('<div class="sub-title">Visualizing the Dependencies</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>For a visual representation of the dependencies, you can use the <b>spark-nlp-display</b> module, an open-source tool that makes visualizing dependencies straightforward and easy to integrate into your workflow.</p>
+    <p>First, install the module with pip:</p>
+    <code>pip install spark-nlp-display</code>
+    <p>Then, you can use the <code>DependencyParserVisualizer</code> class to create a visualization of the dependency tree:</p>
+</div>
+""", unsafe_allow_html=True)
+st.code('''
+from sparknlp_display import DependencyParserVisualizer
+# Initialize the visualizer
+dependency_vis = DependencyParserVisualizer()
+# Display the dependency tree
+dependency_vis.display(
+    result.collect()[0],  # single example result
+    pos_col="pos",
+    dependency_col="dependency",
+    dependency_type_col="dependency_type",
+)
+''', language='python')
+st.image('images\DependencyParserVisualizer.png', caption='The visualization of dependencies')
+st.markdown("""
+<div class="section">
+    <p>This code snippet will generate a visual dependency tree like shown above for the given sentence, clearly showing the grammatical relationships between words. The <code>spark-nlp-display</code> module provides an intuitive way to visualize complex dependency structures, aiding in the analysis and understanding of sentence grammar.</p>
+</div>
+""", unsafe_allow_html=True)
+# Model Info Section
+st.markdown('<div class="sub-title">Choosing the Right Model for Dependency Parsing</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <p>For dependency parsing, the models <b>"dependency_conllu"</b> and <b>"dependency_typed_conllu"</b> are used. These models are trained on a large corpus and are effective for extracting grammatical relations between words in English sentences.</p>
+    <p>To explore more models tailored for different NLP tasks, visit the <a class="link" href="https://sparknlp.org/models?annotator=DependencyParserModel" target="_blank">Spark NLP Models Hub</a>.</p>
+</div>
+""", unsafe_allow_html=True)
+# References Section
+st.markdown('<div class="sub-title">References</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <ul>
+        <li><a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators" target="_blank" rel="noopener">Spark NLP documentation page</a> for all available annotators</li>
+        <li>Python API documentation for <a class="link" href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/pos/perceptron/index.html#sparknlp.annotator.pos.perceptron.PerceptronModel" target="_blank" rel="noopener">PerceptronModel</a> and <a href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/dependency/dependency_parser/index.html#sparknlp.annotator.dependency.dependency_parser.DependencyParserModel" target="_blank" rel="noopener">Dependency Parser</a></li>
+        <li>Scala API documentation for <a class="link" href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronModel.html" target="_blank" rel="noopener">PerceptronModel</a> and <a href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModel.html" target="_blank" rel="noopener">DependencyParserModel</a></li>
+        <li>For extended examples of usage of Spark NLP annotators, check the <a class="link" href="https://github.com/JohnSnowLabs/spark-nlp-workshop" target="_blank" rel="noopener">Spark NLP Workshop repository</a>.</li>
+        <li>Minsky, M.L. and Papert, S.A. (1969) Perceptrons. MIT Press, Cambridge.</li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
+# Community & Support Section
+st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <ul>
+        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
+        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
+        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
+        <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
+        <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)
+# Quick Links Section
+st.markdown('<div class="sub-title">Quick Links</div>', unsafe_allow_html=True)
+st.markdown("""
+<div class="section">
+    <ul>
+        <li><a class="link" href="https://sparknlp.org/docs/en/quickstart" target="_blank">Getting Started</a></li>
+        <li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank">Pretrained Models</a></li>
+        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english" target="_blank">Example Notebooks</a></li>
+        <li><a class="link" href="https://sparknlp.org/docs/en/install" target="_blank">Installation Guide</a></li>
+    </ul>
+</div>
+""", unsafe_allow_html=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+st-annotated-text
+streamlit-tags
+pandas
+numpy
+spark-nlp
+pyspark
+johnsnowlabs