NusaBERT

Sleeping

App Files Files Community

StevenLimcorn committed on Feb 19, 2024

Commit

47a6c20

1 Parent(s): 5596de2

Reformat code to be generic, adding new models in model.py

Browse files

Files changed (7) hide show

__pycache__/model.cpython-311.pyc +0 -0
__pycache__/script.cpython-311.pyc +0 -0
__pycache__/utils.cpython-311.pyc +0 -0
__pycache__/utils.cpython-39.pyc +0 -0
app.py +10 -122
model.py +91 -0
utils.py +105 -12

__pycache__/model.cpython-311.pyc ADDED Viewed

Binary file (3.43 kB). View file

__pycache__/script.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/script.cpython-311.pyc and b/__pycache__/script.cpython-311.pyc differ

__pycache__/utils.cpython-311.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-311.pyc and b/__pycache__/utils.cpython-311.pyc differ

__pycache__/utils.cpython-39.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-39.pyc and b/__pycache__/utils.cpython-39.pyc differ

app.py CHANGED Viewed

@@ -1,125 +1,13 @@
-from utils import (
-    SentenceSimilarity,
-    pos_tagging,
-    text_analysis,
-    text_interface,
-    sentence_similarity,
-)
-from script import details
-from transformers import pipeline
 import gradio as gr
-from functools import partial
-pipes = {
-    "Sentiment Analysis": pipeline(
-        "text-classification",
-        model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
-        tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
-    ),
-    "Emotion Classifier": pipeline(
-        "text-classification",
-        model="w11wo/indonesian-roberta-base-sentiment-classifier",
-        tokenizer="w11wo/indonesian-roberta-base-sentiment-classifier",
-    ),
-    "summarization": pipeline(
-        "summarization",
-        model="LazarusNLP/IndoNanoT5-base-IndoSum",
-        tokenizer="LazarusNLP/IndoNanoT5-base-IndoSum",
-    ),
-    "sentence-similarity": SentenceSimilarity(model="LazarusNLP/all-indobert-base-v2"),
-    "POS Tagging": pipeline(model="w11wo/indonesian-roberta-base-posp-tagger"),
-}
 if __name__ == "__main__":
-    # list of collections of all demos
-    classifiers = ["Sentiment Analysis", "Emotion Classifier"]
-    # Summary
-    summary_interface = gr.Interface.from_pipeline(
-        pipes["summarization"],
-        title="Summarization",
-        examples=details["summarization"]["examples"],
-        description=details["summarization"]["description"],
-        allow_flagging="never",
-    )
-    # Pos Tagging
-    pos_interface = gr.Interface(
-        fn=partial(pos_tagging, pipe=pipes["POS Tagging"]),
-        inputs=[
-            gr.Textbox(placeholder="Masukan kalimat di sini...", label="Input Text"),
-        ],
-        outputs=[gr.HighlightedText()],
-        title="POS Tagging",
-        examples=details["POS Tagging"]["examples"],
-        description=details["POS Tagging"]["description"],
-        allow_flagging="never",
-    )
-    # Text Analysis
-    with gr.Blocks() as text_analysis_interface:
-        gr.Markdown("# Text Analysis")
-        gr.Markdown(details["Text Analysis"]["description"])
-        input_text = gr.Textbox(lines=5, label="Input Text")
-        with gr.Row():
-            smsa = gr.Label(label="Sentiment Analysis")
-            emot = gr.Label(label="Emotion Classification")
-            pos = gr.HighlightedText(label="POS Tagging")
-        btn = gr.Button("Analyze")
-        btn.click(
-            fn=partial(text_analysis, pipes=pipes),
-            inputs=[input_text],
-            outputs=[smsa, emot, pos],
-        )
-        gr.Examples(
-            details["Text Analysis"]["examples"],
-            inputs=input_text,
-            outputs=[smsa, emot, pos],
-        )
-    with gr.Blocks() as sentence_similarity_interface:
-        gr.Markdown("# Document Search 🔍")
-        gr.Markdown(details["sentence-similarity"]["description"])
-        with gr.Row():
-            with gr.Column():
-                input_text = gr.Textbox(lines=5, label="Query")
-                file_input = gr.File(
-                    label="Documents", file_types=[".txt"], file_count="multiple"
-                )
-                button = gr.Button("Search...")
-            output = gr.Label()
-        button.click(
-            fn=partial(sentence_similarity, pipe=pipes["sentence-similarity"]),
-            inputs=[input_text, file_input],
-            outputs=[output],
-        )
-    demo_interface = {
-        "demo": [
-            text_interface(
-                pipes[name],
-                details[name]["examples"],
-                name,
-                name,
-                details[name]["description"],
-            )
-            for name in classifiers
-        ]
-        + [
-            sentence_similarity_interface,
-            summary_interface,
-            pos_interface,
-            text_analysis_interface,
-        ],
-        "titles": classifiers
-        + ["Document Search", "Summarization", "POS Tagging", "Text Analysis"],
-    }
-    # with gr.Blocks() as demo:
-    #     with gr.Column():
-    #         gr.Markdown("# Title")
-    #         gr.TabbedInterface(
-    #             demo_interface["demo"], demo_interface["titles"], theme="soft"
-    #         )
-    demo = gr.TabbedInterface(
-        demo_interface["demo"], demo_interface["titles"], theme="soft"
-    )
-    demo.launch()

+from model import models
 import gradio as gr
 if __name__ == "__main__":
+    # Each entry of `models` holds an "interface" factory plus the keyword
+    # arguments that factory is called with; one tab is built per entry.
+    exclude_keys, interfaces, titles = ["interface"], [], []
+    for model, args in models.items():
+        build_interface = args["interface"]
+        # Forward everything except the factory itself as keyword arguments.
+        interface_kwargs = {k: v for k, v in args.items() if k not in exclude_keys}
+        interfaces.append(build_interface(**interface_kwargs))
+        titles.append(model)
+    demo = gr.TabbedInterface(interfaces, titles, theme="soft")
+    demo.launch(debug=True)

model.py ADDED Viewed

	@@ -0,0 +1,91 @@

+from utils import (
+    text_analysis_interface,
+    token_classification_interface,
+    search_interface,
+    text_interface,
+    SentenceSimilarity,
+)
+from transformers import pipeline
+# Build each pipeline once and share it between tabs: the "Text Analysis" tab
+# uses the same three models as the single-task tabs, so constructing them
+# again would load every model twice at import time.
+_sentiment_pipe = pipeline(
+    "text-classification",
+    model="w11wo/indonesian-roberta-base-sentiment-classifier",
+    tokenizer="w11wo/indonesian-roberta-base-sentiment-classifier",
+)
+_emotion_pipe = pipeline(
+    "text-classification",
+    model="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
+    tokenizer="StevenLimcorn/indonesian-roberta-base-emotion-classifier",
+)
+_pos_pipe = pipeline(model="w11wo/indonesian-roberta-base-posp-tagger")
+
+# Registry of demo tabs, keyed by tab title. Every entry provides:
+#   "interface"    - factory from utils that builds the Gradio UI for the tab
+#   "pipe"         - model object (or list of them) handed to that factory
+#   "title", "desc", "examples", "output_label" - remaining factory arguments
+# app.py calls entry["interface"](**all other keys), so the keys other than
+# "interface" must match the factory's parameter names.
+models = {
+    "Text Analysis": {
+        "title": "# Text Analysis",
+        "examples": [
+            "Siapa sih di dunia yg ngga punya hater? Rasul yg mulia aja punya. Budha aja punya. Nabi Isa aja punya. Nah apalagi eloh ama gueh .... ya kaaan",
+            "saya ganteng, kalau tidak-suka mati saja kamu",
+            "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
+        ],
+        # Labels are paired positionally with the entries of "pipe" below.
+        "output_label": ["Sentiment Analysis", "Emotion Classifier", "POS Tagging"],
+        "desc": "A tool to showcase the full capabilities of text analysis LazarusNLP has to offer.",
+        "interface": text_analysis_interface,
+        "pipe": [_sentiment_pipe, _emotion_pipe, _pos_pipe],
+    },
+    "Sentiment Analysis": {
+        "title": "Sentiment Analysis",
+        "examples": [
+            "saya kecewa karena pengeditan biodata penumpang dilakukan by sistem tanpa konfirmasi dan solusi permasalahan nya pun dianggap sepele karena dibiarkan begitu saja sedang pelayanan pelanggan yang sudah berkali-berkali dihubungi pun hanya seperti mengulur waktu.",
+            "saya sudah transfer ratusan ribu dan sesuai nominal transfer. tapi tiket belum muncul juga. harus diwaspadai ini aplikasi ini.",
+            "keren sekali aplikasi ini bisa menunjukan data diri secara detail, sangat di rekomendasikan untuk di pakai.",
+        ],
+        "output_label": "Sentiment Analysis",
+        "desc": "A sentiment-text-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's SmSA dataset consisting of Indonesian comments and reviews.",
+        "interface": text_interface,
+        "pipe": _sentiment_pipe,
+    },
+    "Emotion Detection": {
+        "title": "Emotion Classifier",
+        "examples": [
+            "iya semoga itu karya terbaik mu adalah skripsi mu dan lucua2n mu tapi harapan aku dari kamu adalah kesembuhanmu nold",
+            "saya ganteng, kalau tidak-suka mati saja kamu",
+            "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
+        ],
+        "output_label": "Emotion Classifier",
+        "desc": "An emotion classifier based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's EmoT dataset",
+        "interface": text_interface,
+        "pipe": _emotion_pipe,
+    },
+    # "summarization": {
+    #     "examples": [],
+    #     "desc": "This model is a fine-tuned version of LazarusNLP/IndoNanoT5-base on the indonlg dataset.",
+    # },
+    "POS Tagging": {
+        "title": "POS Tagging",
+        "examples": [
+            "iya semoga itu karya terbaik mu adalah skripsi mu dan lucua2n mu tapi harapan aku dari kamu adalah kesembuhanmu nold",
+            "saya ganteng, kalau tidak-suka mati saja kamu",
+            "Bahaha.. dia ke kasir after me. Sambil ngangkat keresek belanjaanku, masih sempet liat mas nya nyodorin barang belanjaannya",
+        ],
+        "output_label": "POS Tagging",
+        "desc": "A part-of-speech token-classification model based on the RoBERTa model. The model was originally the pre-trained Indonesian RoBERTa Base model, which is then fine-tuned on indonlu's POSP dataset consisting of tag-labelled news.",
+        "interface": token_classification_interface,
+        "pipe": _pos_pipe,
+    },
+    "Document Search": {
+        "title": "# Document Search 🔍",
+        "examples": [],
+        "output_label": "Top 5 related documents",
+        "desc": "A semantic search tool to get the most related documents 📖 based on user's query.",
+        "interface": search_interface,
+        "pipe": SentenceSimilarity(model="LazarusNLP/all-indobert-base-v2"),
+    },
+}

utils.py CHANGED Viewed

@@ -1,11 +1,14 @@
 import gradio as gr
 from functools import partial
-from transformers import pipeline
 from sentence_transformers import SentenceTransformer, util
 from scipy.special import softmax
 import os
 class SentenceSimilarity:
     def __init__(self, model: str):
@@ -31,11 +34,31 @@ def sentence_similarity(text: str, documents: list[str], pipe: SentenceSimilarit
 # Text Analysis
-def cls_inference(input: list[str], pipe: pipeline) -> str:
     results = pipe(input, top_k=None)
     return {x["label"]: x["score"] for x in results}
 def text_interface(
     pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
 ):
@@ -52,15 +75,85 @@ def text_interface(
     )
-# POSP
-def pos_tagging(text: str, pipe: pipeline):
-    output = pipe(text)
-    return {"text": text, "entities": output}
-# Text Analysis
-def text_analysis(text, pipes: dict):
-    sa = cls_inference(text, pipes["Sentiment Analysis"])
-    emot = cls_inference(text, pipes["Emotion Classifier"])
-    pos = pos_tagging(text, pipes["POS Tagging"])
-    return (sa, emot, pos)

 import gradio as gr
 from functools import partial
+from transformers import pipeline, pipelines
 from sentence_transformers import SentenceTransformer, util
 from scipy.special import softmax
 import os
+######################
+##### INFERENCE ######
+######################
 class SentenceSimilarity:
     def __init__(self, model: str):
 # Text Analysis
+def cls_inference(input: list[str], pipe: pipeline) -> dict:
+    """Run `pipe` on `input` and return a mapping of label to score.
+
+    The pipeline is called with `top_k=None` (presumably so that scores for
+    every label are returned - confirm against the transformers docs). Each
+    item of the result is read through its "label" and "score" keys.
+    """
      results = pipe(input, top_k=None)
      return {x["label"]: x["score"] for x in results}
+# POSP
+def tagging(text: str, pipe: pipeline):
+    """Run `pipe` on `text` and pair the input text with the pipeline output.
+
+    Returns a dict with the original string under "text" and whatever the
+    pipeline produced under "entities".
+    """
+    entities = pipe(text)
+    return {"text": text, "entities": entities}
+# Text Analysis
+def text_analysis(text, pipes: list[pipeline]):
+    """Run every pipeline in `pipes` on `text` and collect the results in order.
+
+    Token-classification pipelines go through `tagging`; every other pipeline
+    goes through `cls_inference`. The returned list has one entry per pipeline.
+    """
+    tagger_cls = pipelines.token_classification.TokenClassificationPipeline
+    return [
+        tagging(text, p) if isinstance(p, tagger_cls) else cls_inference(text, p)
+        for p in pipes
+    ]
+######################
+##### INTERFACE ######
+######################
 def text_interface(
     pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
 ):
     )
+def search_interface(
+    pipe: SentenceSimilarity,
+    examples: list[str],
+    output_label: str,
+    title: str,
+    desc: str,
+):
+    """Build the document-search tab and return it as a `gr.Blocks`.
+
+    Args:
+        pipe: Similarity model bound to `sentence_similarity` for the search.
+        examples: Accepted so all interface builders share one signature;
+            not used by this layout.
+        output_label: Caption shown on the result component.
+        title: Markdown rendered as the tab heading.
+        desc: Markdown rendered below the heading.
+
+    Returns:
+        The assembled `gr.Blocks` with a query box, a multi-file `.txt`
+        upload, a search button and a label component for the results.
+    """
+    with gr.Blocks() as sentence_similarity_interface:
+        gr.Markdown(title)
+        gr.Markdown(desc)
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(lines=5, label="Query")
+                file_input = gr.File(
+                    label="Documents", file_types=[".txt"], file_count="multiple"
+                )
+                button = gr.Button("Search...")
+            # Pass the caption as `label=`; the first positional argument of
+            # gr.Label is the initial value, which would show the caption as
+            # if it were a prediction.
+            output = gr.Label(label=output_label)
+        button.click(
+            fn=partial(sentence_similarity, pipe=pipe),
+            inputs=[input_text, file_input],
+            outputs=[output],
+        )
+    return sentence_similarity_interface
+def token_classification_interface(
+    pipe: pipeline, examples: list[str], output_label: str, title: str, desc: str
+):
+    """Build a `gr.Interface` that runs `tagging` with `pipe` on a text box.
+
+    The single output is a `gr.HighlightedText` captioned with `output_label`;
+    `title`, `examples` and `desc` are passed through to the interface.
+    """
+    run_tagger = partial(tagging, pipe=pipe)
+    text_input = gr.Textbox(
+        placeholder="Masukan kalimat di sini...", label="Input Text"
+    )
+    highlighted = gr.HighlightedText(label=output_label)
+    return gr.Interface(
+        fn=run_tagger,
+        inputs=[text_input],
+        outputs=[highlighted],
+        title=title,
+        examples=examples,
+        description=desc,
+        allow_flagging="never",
+    )
+def text_analysis_interface(
+    pipe: list, examples: list[str], output_label: list[str], title: str, desc: str
+):
+    """Build the multi-model text-analysis tab and return it as a `gr.Blocks`.
+
+    Args:
+        pipe: Pipelines to run on the input text, in display order.
+        examples: Example inputs offered below the form.
+        output_label: One caption per pipeline, paired positionally with
+            `pipe`; pairs beyond the shorter of the two are dropped by `zip`.
+        title: Markdown rendered as the tab heading.
+        desc: Markdown rendered below the heading.
+
+    Returns:
+        The assembled `gr.Blocks`. Token-classification pipelines get a
+        `gr.HighlightedText` output, every other pipeline a `gr.Label`.
+    """
+    tagger_cls = pipelines.token_classification.TokenClassificationPipeline
+    # Named differently from the function so the local does not shadow it.
+    with gr.Blocks() as analysis_blocks:
+        gr.Markdown(title)
+        gr.Markdown(desc)
+        input_text = gr.Textbox(lines=5, label="Input Text")
+        with gr.Row():
+            outputs = [
+                (
+                    gr.HighlightedText(label=label)
+                    if isinstance(p, tagger_cls)
+                    else gr.Label(label=label)
+                )
+                for label, p in zip(output_label, pipe)
+            ]
+        btn = gr.Button("Analyze")
+        btn.click(
+            fn=partial(text_analysis, pipes=pipe),
+            inputs=[input_text],
+            outputs=outputs,
+        )
+        gr.Examples(
+            examples=examples,
+            inputs=input_text,
+            outputs=outputs,
+        )
+    return analysis_blocks
+# Summary
+# summary_interface = gr.Interface.from_pipeline(
+#     pipes["summarization"],
+#     title="Summarization",
+#     examples=details["summarization"]["examples"],
+#     description=details["summarization"]["description"],
+#     allow_flagging="never",
+# )