dreji18 committed on
Commit
c818471
·
1 Parent(s): b13ebd6
Files changed (1) hide show
  1. app.py +240 -0
app.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Tue Jan 12 08:28:35 2021
4
+
5
+ @author: rejid4996
6
+ """
7
+
8
+ # packages
9
+ import os
10
+ import re
11
+ import time
12
+ import base64
13
+ import pickle
14
+ import numpy as np
15
+ import pandas as pd
16
+ import streamlit as st
17
+ from io import BytesIO
18
+ import preprocessor as p
19
+ from textblob.classifiers import NaiveBayesClassifier
20
+
21
# Custom function to clean the dataset (combining tweet-preprocessor and
# regular expressions).

# Punctuation characters that are deleted outright.  Raw string: the original
# non-raw literal relied on invalid escape sequences such as "\." and "\;".
_REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"|()\[\]%$><{}]""")

# Separators that are replaced by a single space.  The pattern text is kept
# as-is; because it runs after _REPLACE_NO_SPACE has already removed "<", ">"
# and ":", only the "-" and "/" alternatives can still match.
_REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")


def clean_tweets(df):
    """Clean every text item in *df* and return the results as a list.

    Each item is passed through ``p.clean`` (the ``preprocessor`` module
    imported at file level), lower-cased, stripped of the punctuation
    characters in ``_REPLACE_NO_SPACE`` and finally has the separators in
    ``_REPLACE_WITH_SPACE`` turned into spaces.

    Parameters
    ----------
    df : iterable
        The texts to clean; the caller in this file passes one DataFrame
        column.  Items are handed to ``p.clean`` unchanged.

    Returns
    -------
    list
        One cleaned string per input item, in input order.  An empty input
        yields an empty list.
    """
    cleaned = []
    for line in df:
        # tweet-level cleaning first, then convert to lower case
        text = p.clean(line).lower()
        # remove punctuation
        text = _REPLACE_NO_SPACE.sub("", text)
        cleaned.append(_REPLACE_WITH_SPACE.sub(" ", text))
    return cleaned
35
+
36
def to_excel(df):
    """Serialize *df* to an in-memory ``.xlsx`` workbook.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to export; it is written to a sheet named ``Sheet1``.

    Returns
    -------
    bytes
        The raw content of the generated workbook.
    """
    output = BytesIO()
    # Leaving the ``with`` block saves and closes the workbook.  The explicit
    # ``writer.save()`` used previously was deprecated in pandas 1.5 and
    # removed in pandas 2.0.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    return output.getvalue()
43
+
44
def get_table_download_link(df):
    """Build an HTML download link for the given pandas dataframe.

    in:  dataframe
    out: href string -- an ``<a>`` tag whose data URI embeds the workbook
         produced by ``to_excel`` as base64 text
    """
    workbook_bytes = to_excel(df)
    # b64encode returns bytes (b'...'); decode to plain text for the URI
    payload = base64.b64encode(workbook_bytes).decode()
    return f'<a href="data:application/octet-stream;base64,{payload}" download="classified_data.xlsx">Download file</a>'
52
+
53
def download_model(model):
    """Render a link that downloads *model* as a pickle file.

    The model is pickled, base64-encoded and embedded in the data URI of an
    ``<a>`` tag, which is written to the page with ``st.markdown``.
    """
    encoded = base64.b64encode(pickle.dumps(model)).decode()
    link = f'<a href="data:file/output_model;base64,{encoded}" download="myClassifier.pkl">Download Model .pkl File</a>'
    st.markdown(link, unsafe_allow_html=True)
58
+
59
def _render_training_page():
    """Train a Naive Bayes classifier from an uploaded Excel sheet.

    The user picks the text and label columns, names the labels that stand
    for the values 0 and 1, chooses a training size and starts the training.
    The trained classifier is then offered as a pickle download.
    """
    uploaded_file = st.file_uploader("*Upload your file, make sure you have a column for text that has to be classified and the label", type="xlsx")
    if not uploaded_file:
        return

    df = pd.read_excel(uploaded_file)
    columns = tuple(df.columns.to_list())

    text_column = st.sidebar.selectbox('Select the text column', columns)
    label_column = st.sidebar.selectbox('Select the label column', columns)

    # clean training data
    df[text_column] = clean_tweets(df[text_column])

    # Enter the label names
    label_for_zero = st.sidebar.text_input("Enter the label for '0' value")
    label_for_one = st.sidebar.text_input("Enter the label for '1' value")

    # replace the 0/1 values with the chosen label names
    df[label_column] = df[label_column].map({0: label_for_zero, 1: label_for_one})

    training_size = st.sidebar.slider(label="choose the training size, longer the size longer the training time",
                                      min_value=100,
                                      max_value=10000,
                                      step=10)

    # subsetting based on classes: at most half of the requested size each
    per_class = training_size // 2
    labels = df[label_column]
    balanced = pd.concat([
        df[labels == label_for_zero].iloc[:per_class],
        df[labels == label_for_one].iloc[:per_class],
    ]).reset_index(drop=True)

    # convert to the (text, label) pairs the classifier is constructed from
    training_list = list(zip(balanced[text_column], balanced[label_column]))

    # run classification
    if not st.sidebar.button(label='Start Training'):
        return

    if not training_list:
        # Happens when the label column holds no 0/1 values: nothing to fit
        # and nothing to compute an accuracy on.
        st.error("No training rows found, make sure the label column contains 0 and 1 values")
        return

    # Train using Naive Bayes
    start = time.time()  # start time
    classifier = NaiveBayesClassifier(training_list)

    # NOTE: the accuracy is measured on the same rows used for training.
    accuracy = classifier.accuracy(training_list)
    st.success(f"Congratulations!!! Model trained successfully with an accuracy of {accuracy * 100}%")
    st.write(f"Total Time taken for Training :{(time.time() - start) / 60} minutes")

    # download the model
    download_model(classifier)


def _render_testing_page():
    """Try an uploaded model on one sentence and optionally correct it.

    After a prediction the user can mark it as incorrect, supply the actual
    label (passed to ``model.update``) and download the updated model.  The
    model is unpickled from the upload each time this function runs.
    """
    uploaded_file = st.file_uploader("*Upload your model file, make sure its in the right format (currently pickle file)", type="pkl")
    if not uploaded_file:
        # Without an upload there is no model to test.
        return

    # SECURITY NOTE(review): pickle.load executes arbitrary code contained in
    # the uploaded file; only upload model files from a trusted source.
    model = pickle.load(uploaded_file)
    st.success("Congratulations!!! Model upload successfull")
    if not model:
        return

    test_sentence = st.text_input("Enter the testing sentence")
    if not test_sentence:
        return

    # f-strings keep the message working for labels that are not strings
    st.info(f"Model Prediction is : {model.classify(test_sentence)}")

    # Explicit calls instead of bare "\n" statements (Streamlit magic).
    st.write("\n")
    st.write("### 🎲 Help me train the model better. How is the prediction?")
    st.write("\n")
    correct = st.checkbox("Correct")
    wrong = st.checkbox("Incorrect")

    if correct:
        st.success("Great!!! I am happy for you")
        st.write("If you would like please try out for more examples")

    if not wrong:
        return

    st.write("### 🎲 Dont worry!!! Lets add this new data to the model and retrain. ")
    label = st.text_input("Could you write the actual label, please note the label name should be the same while you trained")
    if not label:
        return

    model.update([(test_sentence, label)])

    st.write("### 🎲 Lets classify and see whether model had learned from this example ")

    st.write("Sentence : " + test_sentence)
    st.info(f"New Model Prediction is : {model.classify(test_sentence)}")

    now_correct = st.checkbox("It's Correct")
    still_wrong = st.checkbox("Still Incorrect")
    will_fix_data = st.checkbox("I will go ahead and change the data in excel and retrain the model")

    if still_wrong:
        st.write("### 🎲 Lets try training with some sentences of this sort")
        new_sentence = st.text_input("Enter the training sentence")
        new_label = st.text_input("Enter the training label")

        st.write("Lets try one last time ")
        if st.button(label='Retrain again!'):
            model.update([(new_sentence, new_label)])

            st.write("Sentence : " + new_sentence)
            st.info(f"New Model Prediction is : {model.classify(new_sentence)}")

            # download the model
            # NOTE(review): offered only after the retrain click -- confirm
            # this matches the intended flow.
            download_model(model)

    if will_fix_data:
        st.info("Great!!! Fingers Crossed")
        st.write("### 🎲 Please return to your excel file and add more sentences and Train the model again")

    if now_correct:
        st.info("Wow!!! Awesome")
        st.write("Now lets download the updated model")
        # download the model
        download_model(model)


def _render_prediction_page():
    """Classify every row of an uploaded Excel sheet with an uploaded model.

    The predictions are stored in a new ``predicted`` column and the result
    is offered as an Excel download.
    """
    model = None
    model_file = st.file_uploader("*Upload your model file, make sure its in the right format (currently pickle file)", type="pkl")
    if model_file:
        # SECURITY NOTE(review): pickle.load executes arbitrary code contained
        # in the uploaded file; only upload model files from a trusted source.
        model = pickle.load(model_file)
        st.success("Congratulations!!! Model uploaded successfully")

    data_file = st.file_uploader("*Upload your new data which you have to predict", type="xlsx")
    if not data_file:
        return
    st.success("Congratulations!!! Data uploaded successfully")

    df_valid = pd.read_excel(data_file)

    text_column = st.selectbox(
        'Select the text column which needs to be predicted',
        tuple(df_valid.columns.to_list()))

    if not st.button(label='Predict for new data'):
        return

    if model is None:
        # Report the missing model instead of failing on an undefined name.
        st.error("Please upload a model file before predicting")
        return

    start = time.time()  # start time
    # NOTE(review): unlike the training data, these texts are classified
    # without going through clean_tweets -- confirm that this is intended.
    df_valid['predicted'] = df_valid[text_column].apply(model.classify)

    st.write("### 🎲 Prediction Successfull !!!")

    st.write(f"Total No. of sentences: {len(df_valid)}")
    st.write(f"Total Time taken for Prediction :{(time.time() - start) / 60} minutes")

    st.markdown(get_table_download_link(df_valid), unsafe_allow_html=True)


def main():
    """NLP App with Streamlit.

    Draws the sidebar, the banner image and the mode selector, then renders
    the page for the selected mode: training a model, testing a model on a
    single sentence, or predicting labels for a new Excel sheet.
    """
    from PIL import Image

    # ``resize`` returns a new image, so the file can be closed right away.
    with Image.open('D 4 Data.jpg') as source_image:
        wallpaper = source_image.resize((700, 350))

    st.sidebar.title("Text Classification App 1.0")
    st.sidebar.success("Please reach out to https://www.linkedin.com/in/deepak-john-reji/ for more queries")
    st.sidebar.subheader("Classifier using Textblob ")

    st.info("For more contents subscribe to my Youtube Channel https://www.youtube.com/channel/UCgOwsx5injeaB_TKGsVD5GQ")
    st.image(wallpaper)

    options = ("Train the model", "Test the model", "Predict for a new data")
    placeholder = st.sidebar.empty()
    mode = placeholder.radio("what do you wanna do", options, 0)

    if mode == "Train the model":
        _render_training_page()
    elif mode == "Test the model":
        _render_testing_page()
    elif mode == "Predict for a new data":
        _render_prediction_page()
238
+
239
# Entry point: build the app only when this module is executed as a script.
if __name__ == "__main__":
    main()