Rehman1603 committed on
Commit
fb67966
·
verified ·
1 Parent(s): 94c9ce2

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +49 -0
  2. demo.PNG +0 -0
  3. encoding.py +37 -0
  4. main.py +104 -0
  5. mcq.py +305 -0
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import easyocr
3
+ import gradio as gr
4
+ from main import predict_mcq
5
+
6
# Module-level EasyOCR reader, created once at import and reused by every OCR call; 'th' and 'en' are the language codes handed to EasyOCR.
+ reader = easyocr.Reader(['th','en'])
7
def ocr_with_easy(image):
    """Run EasyOCR on an image and return the recognised text as one string.

    Args:
        image: Image array as delivered by the Gradio ``image`` input
            (assumed to be a NumPy array that ``cv2.imwrite`` accepts --
            confirm against the Gradio version in use). ``None`` is treated
            as "no image".

    Returns:
        The recognised text fragments joined by single spaces, or ``""`` when
        there is no image or nothing was detected.
    """
    if image is None:
        return ""
    # The image is still round-tripped through a PNG on disk, as before, so
    # the array handed to EasyOCR is the one ``cv2.imread`` produces.
    cv2.imwrite('image.png', image)
    image = cv2.imread('image.png')
    # ``paragraph`` has to be a real boolean: the former string "False" is
    # truthy and silently switched paragraph grouping on.
    bounds = reader.readtext(image, paragraph=False, detail=0)
    # Join with a space so words from neighbouring text boxes do not fuse.
    return ' '.join(bounds)
15
+
16
def put_in_single_list(data):
    """Convert MCQ dictionaries into ``[question, answer, options]`` rows.

    Args:
        data: Iterable of mappings that may carry the keys
            ``"question_statement"``, ``"answer"`` and ``"options"``.

    Returns:
        A list with one independent three-element list per input item.
        Missing keys yield ``None`` in the corresponding position.
    """
    # A fresh row is built per item; the previous version appended to one
    # shared list, so every row aliased the same ever-growing list.
    return [
        [item.get("question_statement"), item.get("answer"), item.get("options")]
        for item in data
    ]
25
+
26
def MCQGenerator(image):
    """OCR an image and turn the recognised text into MCQ output strings.

    Args:
        image: Image passed in by the Gradio interface.

    Returns:
        A ``(statement, answer, options)`` tuple of strings. Each string is
        the concatenation of its items, every item followed by a comma (the
        same format as before). ``("Null", "Null", "Null")`` is returned when
        the prediction result has no ``'questions'`` entry.
    """
    extracted_text = ocr_with_easy(image)
    mcqs = predict_mcq({"input_text": extracted_text})
    data = mcqs.get('questions')
    print(data)
    if data is None:
        return "Null", "Null", "Null"

    statements = []
    answers = []
    options = []
    for mcq in data:
        statements.append(mcq.get('question_statement'))
        answers.append(mcq.get('answer'))
        # A question can carry fewer than three distractors; slicing avoids
        # the IndexError that hard-coded [0], [1], [2] lookups raised.
        options.extend((mcq.get('options') or [])[:3])

    statement = "".join(item + ',' for item in statements)
    answer = "".join(item + ',' for item in answers)
    option_text = "".join(item + ',' for item in options)
    return statement, answer, option_text
46
+
47
# Gradio front end: one image input, three text outputs matching the
# (statement, answer, options) tuple returned by MCQGenerator.
output_boxes = [
    gr.components.Textbox(label=caption)
    for caption in ("Question", "Answer", "Options")
]
iface = gr.Interface(
    fn=MCQGenerator,
    inputs='image',
    outputs=output_boxes,
    examples=[['demo.PNG']],
)
iface.launch(debug=True)
demo.PNG ADDED
encoding.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
3
+
4
+
5
def greedy_decoding(inp_ids, attn_mask, model, tokenizer):
    """Generate a single question with the model's default (greedy) decoding.

    Args:
        inp_ids: Encoded input ids forwarded to ``model.generate``.
        attn_mask: Attention mask forwarded to ``model.generate``.
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            max_length=...)``.
        tokenizer: Object exposing ``decode``.

    Returns:
        The first decoded sequence, stripped, with its first character
        upper-cased.
    """
    greedy_output = model.generate(input_ids=inp_ids, attention_mask=attn_mask, max_length=256)
    question = tokenizer.decode(
        greedy_output[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    ).strip()
    # Upper-case only the first character: ``str.capitalize()`` also
    # lower-cases the rest, which mangled proper nouns and acronyms.
    return question[:1].upper() + question[1:]
9
+
10
+
11
def beam_search_decoding(inp_ids, attn_mask, model, tokenizer):
    """Generate three candidate questions using beam search.

    Args:
        inp_ids: Encoded input ids forwarded to ``model.generate``.
        attn_mask: Attention mask forwarded to ``model.generate``.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``decode``.

    Returns:
        One string per returned sequence, each stripped and with its first
        character upper-cased.
    """
    beam_output = model.generate(
        input_ids=inp_ids,
        attention_mask=attn_mask,
        max_length=256,
        num_beams=10,
        num_return_sequences=3,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    questions = []
    for sequence in beam_output:
        text = tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        ).strip()
        # Upper-case only the first character: ``str.capitalize()`` also
        # lower-cases the rest, which mangled proper nouns and acronyms.
        questions.append(text[:1].upper() + text[1:])
    return questions
23
+
24
+
25
def topkp_decoding(inp_ids, attn_mask, model, tokenizer):
    """Generate three candidate questions using top-k / top-p sampling.

    Args:
        inp_ids: Encoded input ids forwarded to ``model.generate``.
        attn_mask: Attention mask forwarded to ``model.generate``.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``decode``.

    Returns:
        One string per returned sequence, each stripped and with its first
        character upper-cased.
    """
    topkp_output = model.generate(
        input_ids=inp_ids,
        attention_mask=attn_mask,
        max_length=256,
        do_sample=True,
        top_k=40,
        top_p=0.80,
        num_return_sequences=3,
        no_repeat_ngram_size=2,
        early_stopping=True,
    )
    questions = []
    for sequence in topkp_output:
        text = tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        ).strip()
        # Upper-case only the first character: ``str.capitalize()`` also
        # lower-cases the rest, which mangled proper nouns and acronyms.
        questions.append(text[:1].upper() + text[1:])
    return questions
main.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np # linear algebra
2
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
3
+ import time
4
+ import torch
5
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
6
+ import random
7
+ import spacy
8
+ import zipfile
9
+ import os
10
import subprocess
import sys


def _run_setup_step(command):
    """Run one best-effort provisioning command given as an argument list.

    No shell is involved. Returns the command's exit status, or -1 when the
    executable cannot be started, so a missing tool does not abort the import
    (the earlier ``os.system`` calls were equally non-fatal).
    """
    try:
        return subprocess.run(command, check=False).returncode
    except OSError as exc:
        print("Setup command could not be started:", command, exc)
        return -1


# Import-time provisioning of third-party packages and model data.
# ``sys.executable -m ...`` targets the interpreter that is actually running
# this module rather than whichever ``pip``/``python`` happens to be on PATH.
_run_setup_step([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/boudinfl/pke.git'])
_run_setup_step([sys.executable, '-m', 'nltk.downloader', 'universal_tagset'])
_run_setup_step([sys.executable, '-m', 'spacy', 'download', 'en'])
# NOTE(review): the archive presumably unpacks to the 's2v_old' directory that
# is loaded further down -- confirm. If it is already present, skip the large
# download instead of repeating it on every start.
if not os.path.isdir('s2v_old'):
    _run_setup_step(['wget', 'https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz'])
    _run_setup_step(['tar', '-xvf', 's2v_reddit_2015_md.tar.gz'])
_run_setup_step([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'])
16
+ import git
17
+ import json
18
+ from sense2vec import Sense2Vec
19
+ import requests
20
+ from collections import OrderedDict
21
+ import string
22
+ import pke
23
+ import nltk
24
+ import numpy
25
+ import en_core_web_sm
26
+ from nltk import FreqDist
27
# Fetch the NLTK resources this module uses; each one is force-downloaded
# quietly at import time.
for _nltk_resource in ('brown', 'stopwords', 'popular'):
    nltk.download(_nltk_resource, quiet=True, force=True)
30
+ from nltk.corpus import stopwords
31
+ from nltk.corpus import brown
32
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
33
+ from nltk.tokenize import sent_tokenize
34
+ from flashtext import KeywordProcessor
35
+ from encoding import beam_search_decoding
36
+ from mcq import tokenize_sentences
37
+ from mcq import get_keywords
38
+ from mcq import get_sentences_for_keyword
39
+ from mcq import generate_questions_mcq
40
+ from mcq import generate_normal_questions
41
+ import time
42
# Shared, module-level resources used by predict_mcq().
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# No explicit model.eval() call is made here (it was commented out in the
# original); the no-op ``device = device`` / ``model = model`` lines were dropped.
nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')
# Word frequencies over the Brown corpus, used to order keyword candidates.
fdist = FreqDist(brown.words())
normalized_levenshtein = NormalizedLevenshtein()
53
def set_seed(seed):
    """Seed every random number generator this project relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU, plus all CUDA
    devices when CUDA is available).

    Args:
        seed: Integer seed value.
    """
    # ``random`` was previously left unseeded although the project imports
    # and uses it.
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)
59
+
60
+
61
+
62
def predict_mcq(payload):
    """Generate multiple-choice questions for the text contained in ``payload``.

    Args:
        payload: Mapping with the key ``"input_text"`` and, optionally,
            ``"max_questions"`` (defaults to 10).

    Returns:
        An empty dict when the text yields no usable sentences or keywords, or
        when question generation fails. Otherwise a dict with the keys
        ``"statement"`` (the joined sentences), ``"questions"`` and
        ``"time_taken"`` (seconds).
    """
    start = time.time()
    text = payload.get("input_text") or ""
    max_questions = payload.get("max_questions", 10)
    final_output = {}

    sentences = tokenize_sentences(text)
    if not sentences:
        # Nothing to build questions from (missing, empty or too-short text).
        return final_output
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp, modified_text, max_questions, s2v, fdist,
        normalized_levenshtein, len(sentences),
    )

    # Use at most the first three sentences of each keyword as its context.
    keyword_sentence_mapping = {
        keyword: " ".join(found[:3])
        for keyword, found in get_sentences_for_keyword(keywords, sentences).items()
    }
    if not keyword_sentence_mapping:
        return final_output

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping, device, tokenizer, model, s2v,
            normalized_levenshtein,
        )
    except Exception as exc:
        # Still best-effort, but no longer swallows KeyboardInterrupt or
        # SystemExit, and the cause is reported instead of vanishing.
        print("Question generation failed:", exc)
        return final_output
    end = time.time()

    final_output["statement"] = modified_text
    final_output["questions"] = generated_questions["questions"]
    final_output["time_taken"] = end - start

    # ``torch.device == 'cuda'`` compared the class itself with a string and
    # was always False; check the device that is actually in use.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return final_output
mcq.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np # linear algebra
2
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
3
+ import time
4
+ import torch
5
+ from transformers import T5ForConditionalGeneration,T5Tokenizer
6
+ import random
7
+ import spacy
8
+ import zipfile
9
+ import os
10
+ import json
11
+ from sense2vec import Sense2Vec
12
+ import requests
13
+ from collections import OrderedDict
14
+ import string
15
+ import pke
16
+ import nltk
17
+ from nltk import FreqDist
18
# Download the NLTK resources this module uses at import time.
for _nltk_resource in ('brown', 'stopwords', 'popular'):
    nltk.download(_nltk_resource)
21
+ from nltk.corpus import stopwords
22
+ from nltk.corpus import brown
23
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
24
+ from nltk.tokenize import sent_tokenize
25
+ from flashtext import KeywordProcessor
26
+
27
def MCQs_available(word, s2v):
    """Tell whether ``s2v`` can resolve a best sense for ``word``.

    Spaces in ``word`` are replaced by underscores before the lookup.

    Returns:
        ``True`` when ``s2v.get_best_sense`` returns something other than
        ``None``, otherwise ``False``.
    """
    lookup_key = word.replace(" ", "_")
    return s2v.get_best_sense(lookup_key) is not None
34
+
35
+
36
def edits(word):
    """Return the set of strings exactly one edit away from ``word``.

    An edit is a single deletion, an adjacent transposition, a replacement or
    an insertion; replacements and insertions draw from lower-case ASCII
    letters, the space character and ``string.punctuation``.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz ' + string.punctuation
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # Drop the character at the cut.
            candidates.add(head + tail[1:])
            if len(tail) > 1:
                # Swap the two characters following the cut.
                candidates.add(head + tail[1] + tail[0] + tail[2:])
            # Replace the character at the cut.
            candidates.update(head + char + tail[1:] for char in alphabet)
        # Insert a character at the cut.
        candidates.update(head + char + tail for char in alphabet)
    return candidates
45
+
46
+
47
def sense2vec_get_words(word, s2v):
    """Collect distractor words that sense2vec rates as similar to ``word``.

    Args:
        word: The answer word or phrase.
        s2v: Object exposing ``get_best_sense`` and ``most_similar``.

    Returns:
        Title-cased similar words in the order ``most_similar`` returns them,
        omitting anything that (ignoring case and punctuation) repeats an
        earlier entry, contains the original word, or is one edit away from
        it. An empty list when no sense is found for ``word``.
    """
    sense = s2v.get_best_sense(word.replace(" ", "_"))
    if sense is None:
        # Nothing to compare against; previously ``None`` was handed straight
        # to ``most_similar``.
        return []

    strip_punctuation = str.maketrans("", "", string.punctuation)
    word_preprocessed = word.translate(strip_punctuation).lower()
    word_edits = edits(word_preprocessed)

    output = []
    compare_list = [word_preprocessed]
    for each_word in s2v.most_similar(sense, n=15):
        # Entries look like "some_phrase|TAG": keep the phrase part only.
        append_word = each_word[0].split("|")[0].replace("_", " ").strip()
        append_word_processed = append_word.lower().translate(strip_punctuation)
        if (
            append_word_processed not in compare_list
            and word_preprocessed not in append_word_processed
            and append_word_processed not in word_edits
        ):
            output.append(append_word.title())
            compare_list.append(append_word_processed)

    # Remove exact duplicates while keeping first-seen order.
    return list(OrderedDict.fromkeys(output))
74
+
75
def get_options(answer, s2v):
    """Look up distractor options for ``answer`` via sense2vec.

    Args:
        answer: The answer word or phrase.
        s2v: sense2vec object forwarded to ``sense2vec_get_words``.

    Returns:
        ``(distractors, "sense2vec")`` when at least one distractor is found,
        otherwise ``(distractors, "None")`` with an empty list. A failing
        lookup is reported on stdout and also yields ``([], "None")``.
    """
    try:
        distractors = sense2vec_get_words(answer, s2v)
    except Exception:
        # Best-effort lookup: ordinary errors are reported and ignored, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print (" Sense2vec_distractors failed for word : ",answer)
        return [], "None"

    if len(distractors) > 0:
        print(" Sense2vec_distractors successful for word : ", answer)
        return distractors, "sense2vec"
    return distractors, "None"
88
+
89
def tokenize_sentences(text):
    """Split ``text`` into sentences and keep only the longer ones.

    Sentences of 20 characters or fewer (measured before stripping) are
    dropped; the remaining ones are returned stripped of surrounding
    whitespace, in their original order.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if len(sentence) > 20:
            kept.append(sentence.strip())
    return kept
95
+
96
+
97
def get_sentences_for_keyword(keywords, sentences):
    """Map each keyword to the sentences in which it is found.

    Args:
        keywords: Iterable of keyword strings (stripped before use).
        sentences: Iterable of sentence strings to search.

    Returns:
        A dict from keyword to its matching sentences, longest sentence
        first. Keywords without any matching sentence are left out.
    """
    processor = KeywordProcessor()
    matches = {}
    for raw_keyword in keywords:
        keyword = raw_keyword.strip()
        matches[keyword] = []
        processor.add_keyword(keyword)

    for sentence in sentences:
        for found in processor.extract_keywords(sentence):
            matches[found].append(sentence)

    return {
        keyword: sorted(hits, key=len, reverse=True)
        for keyword, hits in matches.items()
        if len(hits) != 0
    }
122
+
123
+
124
def is_far(words_list, currentword, thresh, normalized_levenshtein):
    """Check that ``currentword`` is dissimilar to every word in ``words_list``.

    Args:
        words_list: Words already accepted.
        currentword: Candidate word.
        thresh: Minimum distance required to every accepted word.
        normalized_levenshtein: Object exposing ``distance(a, b)``.

    Returns:
        ``True`` when the case-insensitive distance to every word in
        ``words_list`` is at least ``thresh``. An empty ``words_list`` yields
        ``True`` (previously ``min()`` raised ``ValueError`` on it).
    """
    candidate = currentword.lower()
    return all(
        normalized_levenshtein.distance(word.lower(), candidate) >= thresh
        for word in words_list
    )
133
+
134
def filter_phrases(phrase_keys, max, normalized_levenshtein):
    """Keep up to ``max`` phrases that are mutually dissimilar.

    Phrases are considered in order. The first one is always accepted (as
    long as ``max`` allows it); a later phrase is accepted only when
    ``is_far`` reports a distance of at least 0.7 to every accepted phrase.

    Args:
        phrase_keys: Candidate phrases in priority order.
        max: Maximum number of phrases to return.
        normalized_levenshtein: Distance object forwarded to ``is_far``.

    Returns:
        The accepted phrases, never more than ``max`` of them.
    """
    filtered_phrases = []
    for phrase in phrase_keys:
        # Check the limit before accepting: the earlier version tested it only
        # after appending, so a limit of 1 (or 0) could be exceeded.
        if len(filtered_phrases) >= max:
            break
        if not filtered_phrases or is_far(filtered_phrases, phrase, 0.7, normalized_levenshtein):
            filtered_phrases.append(phrase)
    return filtered_phrases
144
+
145
+
146
def get_nouns_multipartite(text):
    """Extract up to ten noun / proper-noun keyphrases with pke's MultipartiteRank.

    Args:
        text: Input text handed to the pke extractor.

    Returns:
        The first element of each of the ten best candidates, in ranking
        order; an empty list when candidate weighting fails.
    """
    extractor = pke.unsupervised.MultipartiteRank()
    extractor.load_document(input=text, language='en')
    # Candidates are restricted to nouns and proper nouns. The stoplist that
    # was built here before was never passed to pke, so it has been removed.
    extractor.candidate_selection(pos={'PROPN', 'NOUN'})
    # Build the Multipartite graph and rank candidates using random walk;
    # alpha controls the weight adjustment mechanism, see TopicRank for the
    # threshold/method parameters.
    try:
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
    except Exception:
        # Best-effort: weighting can fail on some inputs; report no keywords
        # rather than crash (without swallowing KeyboardInterrupt/SystemExit).
        return []

    return [candidate[0] for candidate in extractor.get_n_best(n=10)]
171
+
172
+
173
def get_phrases(doc):
    """Return the multi-word noun chunks of ``doc``, longest first.

    Each distinct chunk text with more than one whitespace-separated word is
    kept once. The result is ordered by character length (descending, ties in
    first-seen order) and capped at 50 entries.
    """
    occurrences = {}
    for chunk in doc.noun_chunks:
        chunk_text = chunk.text
        if len(chunk_text.split()) > 1:
            occurrences[chunk_text] = occurrences.get(chunk_text, 0) + 1

    longest_first = sorted(occurrences, key=len, reverse=True)
    return longest_first[:50]
188
+
189
+
190
+
191
def get_keywords(nlp, text, max_keywords, s2v, fdist, normalized_levenshtein, no_of_sentences):
    """Pick answer keywords for ``text``.

    Noun keyphrases (ordered by ascending ``fdist`` frequency) and multi-word
    noun chunks are each filtered for mutual dissimilarity, concatenated,
    filtered again with a limit of ``min(max_keywords, 2 * no_of_sentences)``,
    and finally reduced to the distinct candidates for which
    ``MCQs_available`` is true.

    Returns:
        At most ``max_keywords`` keyword strings.
    """
    doc = nlp(text)
    limit = int(max_keywords)

    ranked_nouns = sorted(get_nouns_multipartite(text), key=lambda token: fdist[token])
    noun_keywords = filter_phrases(ranked_nouns, limit, normalized_levenshtein)

    noun_phrases = filter_phrases(get_phrases(doc), limit, normalized_levenshtein)

    candidates = filter_phrases(
        noun_keywords + noun_phrases,
        min(limit, 2 * no_of_sentences),
        normalized_levenshtein,
    )

    answers = []
    for candidate in candidates:
        if candidate in answers:
            continue
        if MCQs_available(candidate, s2v):
            answers.append(candidate)

    return answers[:limit]
214
+
215
+
216
def generate_questions_mcq(keyword_sent_mapping, device, tokenizer, model, sense2vec, normalized_levenshtein):
    """Generate one multiple-choice question per answer keyword.

    Args:
        keyword_sent_mapping: Mapping from answer keyword to its context text.
        device: Device the encoded batch is moved to.
        tokenizer: Tokenizer exposing ``batch_encode_plus`` and ``decode``.
        model: Model exposing ``generate``.
        sense2vec: sense2vec object used to find distractor options.
        normalized_levenshtein: Distance object used to filter similar options.

    Returns:
        ``{"questions": [...]}``. Each entry has the keys
        ``question_statement``, ``question_type``, ``answer``, ``id``,
        ``options`` (up to three), ``options_algorithm``, ``extra_options``
        and ``context``. Questions without any option are omitted. An empty
        mapping yields ``{"questions": []}`` without calling the model.
    """
    output_array = {"questions": []}
    answers = list(keyword_sent_mapping)
    if not answers:
        return output_array

    batch_text = [
        "context: " + keyword_sent_mapping[answer] + " " + "answer: " + answer + " </s>"
        for answer in answers
    ]

    # ``padding=True`` pads to the longest sequence in the batch; it replaces
    # the deprecated ``pad_to_max_length=True`` flag.
    encoding = tokenizer.batch_encode_plus(batch_text, padding=True, return_tensors="pt")

    print ("Running model for generation")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outs = model.generate(input_ids=input_ids,
                              attention_mask=attention_masks,
                              max_length=150)

    max_options = 3
    for position, answer in enumerate(answers):
        decoded = tokenizer.decode(outs[position, :], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        question = decoded.replace("question:", "").strip()

        options, options_algorithm = get_options(answer, sense2vec)
        options = filter_phrases(options, 10, normalized_levenshtein)
        if len(options) == 0:
            # A multiple-choice question needs at least one distractor.
            continue

        output_array["questions"].append({
            "question_statement": question,
            "question_type": "MCQ",
            "answer": answer,
            "id": position + 1,
            "options": options[:max_options],
            "options_algorithm": options_algorithm,
            "extra_options": options[max_options:],
            "context": keyword_sent_mapping[answer],
        })

    return output_array
262
+
263
def generate_normal_questions(keyword_sent_mapping, device, tokenizer, model):  # for normal one word questions
    """Generate one plain (non-MCQ) question per answer keyword.

    Args:
        keyword_sent_mapping: Mapping from answer keyword to its context text.
        device: Device the encoded batch is moved to.
        tokenizer: Tokenizer exposing ``batch_encode_plus`` and ``decode``.
        model: Model exposing ``generate``.

    Returns:
        ``{"questions": [...]}`` where each entry has the keys ``Question``,
        ``Answer``, ``id`` (1-based position) and ``context``. An empty
        mapping yields ``{"questions": []}`` without calling the model.
    """
    output_array = {"questions": []}
    answers = list(keyword_sent_mapping)
    if not answers:
        return output_array

    batch_text = [
        "context: " + keyword_sent_mapping[answer] + " " + "answer: " + answer + " </s>"
        for answer in answers
    ]

    # ``padding=True`` pads to the longest sequence in the batch; it replaces
    # the deprecated ``pad_to_max_length=True`` flag.
    encoding = tokenizer.batch_encode_plus(batch_text, padding=True, return_tensors="pt")

    print ("Running model for generation")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    with torch.no_grad():
        outs = model.generate(input_ids=input_ids,
                              attention_mask=attention_masks,
                              max_length=150)

    for position, answer in enumerate(answers):
        decoded = tokenizer.decode(outs[position, :], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        output_array["questions"].append({
            'Question': decoded.replace('question:', '').strip(),
            'Answer': answer,
            "id": position + 1,
            "context": keyword_sent_mapping[answer],
        })

    return output_array
302
+
303
def random_choice():
    """Return ``True`` or ``False``, picked via ``random.choice([0, 1])``."""
    return random.choice([0, 1]) == 1