fiona10 committed on
Commit
f1cd7ff
·
verified ·
1 Parent(s): bcee71a

Upload recommendation_model.py

Browse files
Files changed (1) hide show
  1. recommendation_model.py +219 -0
recommendation_model.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import speech_recognition as sr
2
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
3
+ import spacy, os
4
+ import pandas as pd
5
+ import numpy as np
6
+ import nltk
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk.corpus import stopwords
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from autocorrect import Speller
11
+ from datetime import datetime
12
+ from transformers import pipeline
13
+ from translate import Translator
14
+ from nltk.stem import WordNetLemmatizer
15
+ from nltk.stem import PorterStemmer
16
+ from nltk.corpus import wordnet
17
+ from googletrans import Translator
18
+ import pickle
19
+
20
class recommendationModel:
    """Interactive article recommender driven by text or speech input.

    The model translates the user's message to English, spell-corrects it,
    strips stop words and scores every article in ``data1`` by how many of
    the remaining keywords occur in the article description.  The message is
    also labelled with a zero-shot sentiment class, and every request is
    appended to a CSV log.

    ``data1`` must be assigned by the caller before ``main`` or
    ``recommendArticle`` is used.  The code reads the columns
    ``description``, ``title``, ``keywords``, ``class`` and ``url`` from it
    and writes a ``score`` column.
    """

    # Hugging Face checkpoint used for zero-shot sentiment labelling.
    _ZERO_SHOT_MODEL = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"

    # Attributes dropped by __getstate__ and rebuilt by __setstate__.
    _UNPICKLED_ATTRS = ('translator', 'zero_shot_classifier', 'nlp')

    def __init__(self):
        self._load_runtime_components()
        self.spell_checker = Speller(lang='en')
        self.porter = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.class_names = ["positive :)", "neutral :|", "negative :("]
        self.data1 = None

    def _load_runtime_components(self):
        """Create the attributes that are excluded from pickling."""
        self.translator = Translator()
        self.zero_shot_classifier = pipeline(
            'zero-shot-classification', model=self._ZERO_SHOT_MODEL)
        self.nlp = spacy.load("en_core_web_sm")

    def _require_articles(self):
        """Raise a clear error when no article table has been assigned."""
        if getattr(self, 'data1', None) is None:
            raise ValueError(
                "No article data loaded: assign a DataFrame to `data1` "
                "before requesting recommendations.")

    def detect_language(self, user_input):
        """Return ``user_input`` in English, translating it when necessary.

        Blank input is returned unchanged without calling the translator.
        """
        if not user_input or not user_input.strip():
            return user_input
        detection = self.translator.detect(user_input)
        if detection.lang == 'en':
            return user_input
        translation = self.translator.translate(user_input, 'en')
        print("\nTranslation:", translation.text)
        return translation.text

    def remove_stopwords(self, tags):
        """Return ``tags`` with English stop words removed (case-sensitive)."""
        stop_words = set(stopwords.words('english'))
        kept = [word for word in word_tokenize(tags) if word not in stop_words]
        return " ".join(kept)

    def correct_spelling(self, word):
        """Return the spell-corrected form of a single word."""
        return self.spell_checker(word)

    def porterStemmer(self, text):
        """Return ``text`` with every token replaced by its Porter stem."""
        return ' '.join(self.porter.stem(word) for word in word_tokenize(text))

    def correct_spellings_in_text(self, text):
        """Return ``text`` with every token spell-corrected."""
        return " ".join(
            self.correct_spelling(word) for word in nltk.word_tokenize(text))

    def preprocess_input(self, userInput):
        """Normalise a user message into keywords.

        Returns:
            A ``(keywords, sentence)`` tuple: the list of lower-cased,
            spell-corrected, stop-word-free tokens and the same tokens
            joined by single spaces.
        """
        corrected_text = self.correct_spellings_in_text(userInput)
        lowered = " ".join(nltk.word_tokenize(corrected_text.lower()))
        sentence = self.remove_stopwords(lowered)
        keywords = nltk.word_tokenize(sentence.lower())
        return keywords, sentence

    def calculate_score(self, about, keywords):
        """Count how many ``keywords`` occur as substrings of ``about``.

        Matching is case-insensitive on the ``about`` side only, so callers
        should pass lower-cased keywords.  A non-string description (for
        example a missing value) scores 0 instead of raising.
        """
        if not isinstance(about, str):
            return 0
        haystack = about.lower()  # lower-case once, not once per keyword
        return sum(1 for keyword in keywords if keyword in haystack)

    def zero_shot_classifier_sent(self, userInput):
        """Classify ``userInput`` against ``class_names``.

        Returns:
            The ``(labels, scores)`` lists produced by the zero-shot pipeline.
        """
        zsc_output = self.zero_shot_classifier(userInput, self.class_names)
        return zsc_output['labels'], zsc_output['scores']

    def recommendArticle(self, userInput, tfidf_scores, output_csv):
        """Print the ten best-matching articles and log the request.

        Args:
            userInput: The (possibly non-English) user message.
            tfidf_scores: Accepted for backward compatibility; not used.
            output_csv: Path of the CSV log the request is appended to.  The
                header row is written only when the file does not exist yet.

        Raises:
            ValueError: If ``data1`` has not been assigned.
        """
        # Fail before the expensive classifier call if there is nothing to rank.
        self._require_articles()

        zsc_labels, zsc_scores = self.zero_shot_classifier_sent(userInput)
        max_label, max_score = max(
            zip(zsc_labels, zsc_scores), key=lambda pair: pair[1])

        userInput = self.detect_language(userInput)  # change to english
        keywords, _sentence = self.preprocess_input(userInput)
        self.data1['score'] = self.data1['description'].apply(
            lambda description: self.calculate_score(description, keywords))

        # Sort articles based on score
        recommended_articles = self.data1.sort_values(by='score', ascending=False)

        print("\n*****************\nRecommended Articles:")
        for _index, row in recommended_articles.head(10).iterrows():
            print(f"\nTitle: {row['title']}")
            print(f"Keywords: {row['keywords']}")
            print(f"Class: {row['class']}")
            print(f"URL: {row['url']}")

        # Prepare data to append to CSV
        output_data = {
            'Timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'User Input': userInput,
            'Emotion': max_label,
            'Sentiment Score': max_score,
            'Keywords': ", ".join(keywords),
        }

        # Append output data to CSV
        output_df = pd.DataFrame(output_data, index=[0])
        output_df.to_csv(
            output_csv, mode='a',
            header=not os.path.exists(output_csv), index=False)

    def convert_audio_to_text(self, recognizer, source, duration):
        """Listen on ``source`` and return the recognised text.

        ``duration`` is used both as the wait timeout and as the phrase time
        limit.  Returns an empty string when nothing could be recognised.
        """
        print("Listening for audio...")
        try:
            # listen() is the call that raises WaitTimeoutError, so it has to
            # sit inside the try block for the handler below to be reachable.
            audio_data = recognizer.listen(
                source, timeout=duration, phrase_time_limit=duration)
            return recognizer.recognize_google(audio_data)
        except sr.WaitTimeoutError:
            print("Listening timed out. No speech detected.")
        except sr.UnknownValueError:
            print("Oops, it seems we're having trouble understanding the audio. Let's try again with clearer sound.")
        except sr.RequestError as e:
            print(f"Could not request results; {e}")
        return ""

    def extract_keywords_tfidf(self, article_descriptions):
        """Fit TF-IDF on the descriptions and return term scores.

        Returns:
            A dict mapping every vocabulary term to its TF-IDF weight in the
            first description only.
        """
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(article_descriptions)
        feature_names = tfidf_vectorizer.get_feature_names_out()
        first_article_scores = tfidf_matrix[0].toarray().flatten()
        return dict(zip(feature_names, first_article_scores))

    def _transcribe_microphone(self):
        """Record up to 15 seconds from the microphone and return the text."""
        recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            recognizer.adjust_for_ambient_noise(source)  # Adjust for ambient noise
            return self.convert_audio_to_text(recognizer, source, 15)

    def _transcribe_audio_file(self, filename):
        """Return the text of an audio file, or None if it cannot be opened."""
        recognizer = sr.Recognizer()
        try:
            with sr.AudioFile(filename) as source:
                # NOTE(review): calibrating on a file reads from the start of
                # the recording, so leading speech may be skipped - confirm
                # whether this call is wanted for files.
                recognizer.adjust_for_ambient_noise(source)
                return self.convert_audio_to_text(recognizer, source, 1000)
        except (OSError, ValueError):
            # Missing path or an unsupported/corrupt audio format.
            print("Oops, it seems we're having trouble finding the file. Let's try again with the correct path.")
            return None

    def _recommend_for_message(self, message, inputs, output_csv):
        """Add ``message`` to the conversation and recommend for all of it."""
        inputs.append(self.detect_language(message))
        conversation = ' '.join(inputs)
        print(conversation)
        print("\nProcessing....")
        tfidf_scores = self.extract_keywords_tfidf(self.data1['description'])
        self.recommendArticle(conversation, tfidf_scores, output_csv)

    def main(self, inputs):
        """Prompt for one message (typed, spoken or from a file) and recommend.

        Args:
            inputs: List of earlier messages.  The new message, translated to
                English, is appended in place and the whole conversation is
                used for the recommendation.

        Raises:
            ValueError: If ``data1`` has not been assigned.
        """
        self._require_articles()
        output_csv = "Output2.csv"  # Specify the output CSV file
        audio_trouble = "Oops, it seems we're having trouble understanding the audio. Let's try again with clearer sound."

        print("Choose input method:\n1. Text\n2. Voice\n3. Audio File")
        while True:
            choice = input("\nEnter your choice (1 or 2 or 3): ").strip()

            if choice == '1':
                message = input("Enter your message: ")
            elif choice == '2':
                message = self._transcribe_microphone()
                if not message:
                    print(audio_trouble)
                    continue
            elif choice == '3':
                filename = input("Enter the path to the audio file: ")
                message = self._transcribe_audio_file(filename)
                if message is None:
                    continue  # unreadable file, already reported
                if not message:
                    print(audio_trouble)
                    continue
            else:
                print("Invalid choice. Please enter 1 or 2 or 3.")
                continue

            self._recommend_for_message(message, inputs, output_csv)
            break

    # PROPER PICKLING AND UNPICKLING ATTRIBUTES
    def __getstate__(self):
        """Return the instance state without the excluded attributes."""
        return {
            name: value
            for name, value in self.__dict__.items()
            if name not in self._UNPICKLED_ATTRS
        }

    def __setstate__(self, state):
        """Restore pickled state and recreate the excluded attributes."""
        self.__dict__.update(state)
        self._load_runtime_components()
214
+
215
+
216
# Build the recommender once and persist it; attributes excluded by
# __getstate__ are recreated by __setstate__ on load.
model = recommendationModel()

with open('model2.pkl', 'wb') as f:
    f.write(pickle.dumps(model))