File size: 7,058 Bytes
3a52501
 
 
 
 
4829b64
 
 
 
 
 
258c872
3a52501
 
 
d1d936a
a09bb13
d1d936a
 
3a52501
53b0cab
 
 
 
 
 
 
 
 
 
 
258c872
4829b64
 
 
258c872
3a52501
638d345
 
258c872
 
 
4829b64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258c872
4829b64
 
258c872
4829b64
 
 
 
819654b
4829b64
3a52501
4829b64
 
 
 
 
 
 
 
 
 
 
 
 
3a52501
 
 
4829b64
 
3a52501
4829b64
 
 
3a52501
4829b64
3a52501
 
4829b64
 
 
3a52501
 
 
 
4829b64
3a52501
 
4829b64
 
 
 
 
 
819654b
258c872
 
 
4829b64
 
258c872
 
d1d936a
 
 
4829b64
 
 
258c872
4829b64
8aa124c
4829b64
 
258c872
 
819654b
 
 
 
 
5f2ec77
a09bb13
 
 
 
 
 
 
258c872
 
819654b
258c872
 
 
 
 
 
d1d936a
6286028
d1d936a
258c872
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53b0cab
 
 
258c872
 
53b0cab
258c872
 
53b0cab
 
258c872
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python
# coding: utf-8

import gradio as gr
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline,
    pipeline,
)
from sklearn import preprocessing
from langdetect import detect
from matplotlib import pyplot as plt
import imageio
import logging
import warnings

# Emit INFO-level records from the root logger (requests and predictions are logged below).
logging.getLogger().setLevel(logging.INFO)

# Bilingual (German / French) disclaimer rendered below the page titles.
DESCRIPTION = """Diese Anwendung teilt Vorstösse an das federführende Departement zu und
    macht einen Vorschlag für das zuständige Amt. Der Vorschlag der Anwendung ist nicht
    100% richtig. Der Zuteilungsvorschlag muss von einer Fachperson geprüft und die
    effektive Zuteilung muss nach eigenem Ermessen erfolgen. \n\n
    Cette application attribue les interventions au département chef de file et fait une
    proposition à l'office compétent. La proposition de l'application n'est pas correcte
    à 100%. La proposition d'attribution doit être vérifiée par un spécialiste et l'attribution
    effective doit être faite à la discrétion de l'utilisateur."""
# Page headings, one per language.
TITLE_DE = (
    "Automatische Zuteilung von Vorstössen an das federführende Departement bzw. Amt"
)
TITLE_FR = "Où aller ? Classification des départements & bureaux"
# Returned as the department text when the detected language is neither "fr" nor "de".
UNKNOWN_LANG_TEXT = (
    "The language is not recognized, it must be either in German or in French."
)
# Placeholder shown in the empty input textbox.
PLACEHOLDER_TEXT = "Geben Sie bitte den Titel und den 'Submitted Text' des Vorstoss ein.\nVeuillez entrer le titre et le 'Submitted Text' de la requête."

# Prefix of the output text when the top probability is below ML_MODEL_SURE.
UNSURE_DE_TEXT = "Das ML-Modell ist nicht sicher. Die Zuteilung könnte sein: \n\n"
UNSURE_FR_TEXT = "Le modèle ML n'est pas sûr. L'allocation pourrait être: \n\n"

# Minimum top probability for the prediction to be shown as a single, confident result.
ML_MODEL_SURE = 0.6

# Bar labels for the department chart; the French tuple is used when the input
# was detected as French, the German one otherwise.
# NOTE(review): the order presumably matches the department classifier's output
# index order, and the two tuples look like position-wise translations of each
# other - confirm against the training label encoding.
BARS_DEP_FR = (
    "DDPS",
    "DFI",
    "AS-MPC",
    "DFJP",
    "DEFR",
    "DETEC",
    "DFAE",
    "Parl",
    "ChF",
    "DFF",
    "AF",
    "TF",
)
BARS_DEP_DE = (
    "VBS",
    "EDI",
    "AB-BA",
    "EJPD",
    "WBF",
    "UVEK",
    "EDA",
    "Parl",
    "BK",
    "EFD",
    "BV",
    "BGer",
)


def load_model(modelFolder):
    """Build a text-classification pipeline from a saved model directory.

    ``modelFolder`` is passed to both ``from_pretrained`` calls, once for the
    sequence-classification model and once for its tokenizer. The two are
    wrapped in a ``TextClassificationPipeline``, which is returned.
    """
    return TextClassificationPipeline(
        model=AutoModelForSequenceClassification.from_pretrained(modelFolder),
        tokenizer=AutoTokenizer.from_pretrained(modelFolder),
    )


# Lazily created French->German translation pipeline, shared by all calls.
_FR_DE_TRANSLATOR = None


def _get_fr_de_translator():
    """Return the shared French->German translation pipeline, creating it on first use."""
    global _FR_DE_TRANSLATOR
    if _FR_DE_TRANSLATOR is None:
        # Building the pipeline loads the translation model, so do it only once
        # instead of on every request.
        _FR_DE_TRANSLATOR = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-de")
    return _FR_DE_TRANSLATOR


def translate_to_de(SubmittedText):
    """Translate French user input to German for better classification results.

    Only the first 1000 characters of ``SubmittedText`` are translated.

    Returns:
        The ``translation_text`` of the first result produced by the
        ``Helsinki-NLP/opus-mt-fr-de`` translation pipeline.
    """
    translated = _get_fr_de_translator()(SubmittedText[:1000])
    return translated[0]["translation_text"]


def create_bar_plot(rates, barnames):
    """Draw a horizontal bar chart of ``rates`` labelled with ``barnames``.

    The chart is drawn on the current pyplot figure, written to ``rates.png``
    and read back with imageio.

    Returns:
        A ``(image, barnames)`` tuple: the image as loaded from the PNG file
        and the ``barnames`` argument, passed through unchanged.
    """
    positions = np.arange(len(barnames))
    plt.barh(positions, rates)
    plt.yticks(positions, barnames)

    # Round-trip through a PNG file and return the loaded image (enables better display)
    png_path = "rates.png"
    plt.savefig(png_path)
    return imageio.v2.imread(png_path), barnames


def _format_percent(rate):
    """Return the probability ``rate`` as a truncated whole-number percent string."""
    # Computed arithmetically: slicing the float's string form drops trailing
    # zeros (0.5 -> "5") and breaks on 1.0 or scientific notation.
    # Rounding to 6 decimals first absorbs binary float noise
    # (0.29 * 100 == 28.999999999999996) before truncating.
    return str(int(round(float(rate) * 100, 6)))


def show_chosen_category(barnames, rates, language):
    """Create the output text for one prediction.

    Args:
        barnames: Category labels, indexable by position.
        rates: Probabilities, one per entry of ``barnames``. Not modified.
        language: Detected input language; ``"fr"`` selects the French
            disclaimer, anything else the German one.

    Returns:
        If the highest probability is at least ``ML_MODEL_SURE``: a single
        line ``"<percent>%<tabs><category>"``. Otherwise: the "model is not
        sure" disclaimer followed by one line per category whose probability
        is at least 10%, ordered from most to least probable.

    Raises:
        ValueError: If ``rates`` is empty.
    """
    # Work on a copy so the caller's list is left untouched.
    scores = np.asarray(rates, dtype=float)
    best = int(np.argmax(scores))

    distance = "\t\t\t\t\t"

    # ML model pretty sure, show only one department
    if scores[best] >= ML_MODEL_SURE:
        return _format_percent(scores[best]) + "%" + distance + barnames[best]

    # ML model not sure: list every category with a probability >= 10%
    header = UNSURE_FR_TEXT if language == "fr" else UNSURE_DE_TEXT
    # Stable descending order keeps ties in their original (lowest index first) order.
    ranking = np.argsort(-scores, kind="stable")
    lines = [
        "\t" + _format_percent(scores[idx]) + "%" + distance + barnames[idx] + "\n"
        for idx in ranking
        if scores[idx] >= 0.1
    ]
    return header + "".join(lines)


# Load both classifiers once at import time: first the department model,
# then the office model.
pipeDep, pipeOffice = (
    load_model(folder) for folder in ("saved_model_dep", "saved_model_office")
)

# Label encoder for the office model; its classes are restored from disk
# and later used as the bar labels of the office chart.
labelencoderOffice = preprocessing.LabelEncoder()
labelencoderOffice.classes_ = np.load("classes_office.npy")


def textclassification(SubmittedText):
    """Predict the responsible department and office for a submitted text.

    The language of ``SubmittedText`` is detected first. French input is
    translated to German, German input is used as is, and anything else is
    rejected. Both classifiers then run on the first 1000 characters.

    Args:
        SubmittedText: The text entered by the user.

    Returns:
        A 4-tuple ``(department_text, department_image, office_text,
        office_image)``. When the language is neither German nor French, or
        cannot be detected at all (e.g. empty input), the tuple is
        ``(UNKNOWN_LANG_TEXT, None, None, None)``.
    """
    # Function-scope import: the file-level import only pulls in `detect`.
    from langdetect import LangDetectException

    # langdetect raises on input without usable features (empty textbox,
    # whitespace, digits only). Treat that as an unknown language instead of
    # letting the UI callback crash.
    try:
        language = detect(SubmittedText) if SubmittedText else None
    except LangDetectException:
        language = None
    logging.info(
        f"SubmittedText received. Detected language: {language}. SubmittedText: {SubmittedText}"
    )

    # Translate the input to german if necessary; reject other languages
    if language == "fr":
        SubmittedText = translate_to_de(SubmittedText)
    elif language != "de":
        return UNKNOWN_LANG_TEXT, None, None, None

    # Department labels follow the input language; office labels come from the encoder
    labelsDep = BARS_DEP_FR if language == "fr" else BARS_DEP_DE
    labelsOffice = labelencoderOffice.classes_

    images = []
    chosenCategoryTexts = []

    for pipe, barnames in zip((pipeDep, pipeOffice), (labelsDep, labelsOffice)):
        # Start from an empty figure so bars of the previous chart do not leak in
        plt.clf()

        # catch deprecation warning, as new functionality following the deprecated way
        # sorts results the wrong way and cannot be easily fixed
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            # Make the prediction with the 1000 first characters
            prediction = pipe(SubmittedText[0:1000], return_all_scores=True)
        rates = [row["score"] for row in prediction[0]]

        # Create barplot & output text
        im, barnames = create_bar_plot(rates, barnames)
        images.append(im)
        chosenCategoryTexts.append(show_chosen_category(barnames, rates, language))

    # return chosenCategoryText & image for both predictions
    logging.info(
        f"Prediction Department: {chosenCategoryTexts[0]}\n\nPrediction Amt: {chosenCategoryTexts[1]}"
    )
    return chosenCategoryTexts[0], images[0], chosenCategoryTexts[1], images[1]


# Build and launch the UI

# Monochrome theme with red accents, matching BK CH
bk_theme = gr.themes.Monochrome(
    primary_hue="red",
    secondary_hue="red",
    font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"],
)

with gr.Blocks(bk_theme) as demo:
    gr.Markdown(f"# {TITLE_DE}\n # {TITLE_FR}\n\n {DESCRIPTION}")

    # Three equally weighted columns: input, department prediction, office prediction
    with gr.Row():
        with gr.Column(scale=2):
            name = gr.Textbox(
                label="Vorstosstext:", lines=28, placeholder=PLACEHOLDER_TEXT
            )
            predict_btn = gr.Button("Submit | Soumettre")
        with gr.Column(scale=2):
            output_text_dep = gr.Textbox(label="Vorschlag Departement:")
            output_image_dep = gr.Image(label="Departement")
        with gr.Column(scale=2):
            output_text_office = gr.Textbox(label="Vorschlag Amt:")
            output_image_office = gr.Image(label="Amt")

    # Same order as the 4-tuple returned by textclassification
    prediction_outputs = [
        output_text_dep,
        output_image_dep,
        output_text_office,
        output_image_office,
    ]
    predict_btn.click(
        fn=textclassification,
        inputs=name,
        outputs=prediction_outputs,
        api_name="predict",
    )

demo.launch()