DpShirazi commited on
Commit
8c07c55
·
1 Parent(s): 1dd973d

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +123 -0
  2. questiongenerator.py +429 -0
  3. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Dec 25 18:18:27 2023
4
+
5
+ @author: alish
6
+ """
7
+
8
+ import gradio as gr
9
+ import fitz # PyMuPDF
10
+ import questiongenerator as qs
11
+ import random
12
+
13
+ from questiongenerator import QuestionGenerator
14
+ qg = QuestionGenerator()
15
+
16
+
17
+
18
def Extract_QA(qlist):
    """Format the first generated multiple-choice question for display.

    Args:
        qlist: List of QA dicts, each with a ``'question'`` string and an
            ``'answer'`` list of ``{'answer': ..., 'correct': ...}`` dicts.
            Only ``qlist[0]`` is used.

    Returns:
        A ``(Q, A)`` tuple of strings: the question followed by its lettered
        choices, and a sentence naming the letter of the correct choice.

    Raises:
        ValueError: If ``qlist`` is empty or no choice is flagged as correct.
    """
    if not qlist:
        raise ValueError("No question could be generated from the selected text.")

    first = qlist[0]
    choices = first['answer']
    # Label however many choices were produced instead of assuming exactly
    # four; a hard-coded range(4) raises IndexError on shorter choice lists.
    letters = [chr(ord('A') + idx) for idx in range(len(choices))]

    lines = [f"Q: {first['question']}"]
    lines.extend(
        f"{letter}. {choice['answer']}" for letter, choice in zip(letters, choices)
    )
    Q = "\n" + "\n".join(lines) + "\n"

    correct = [letter for letter, choice in zip(letters, choices) if choice['correct']]
    if not correct:
        raise ValueError("None of the generated choices is marked as correct.")

    A = f"\nThe right answer is: {correct[0]}\n"
    return (Q, A)
40
+
41
+
42
+
43
+
44
+
45
+
46
def extract_text_from_pdf(pdf_file_path):
    """Read a PDF, split its text into segments and keep them for later use.

    The text of every page is joined with newlines, split with
    ``get_sub_text`` and stored in the module-level ``extracted_text``.

    Args:
        pdf_file_path: Location of the PDF as accepted by ``fitz.open``, or
            ``None`` when no file has been selected yet.

    Returns:
        A status message for the "File Status" textbox.
    """
    global extracted_text

    # The upload button's value is passed straight in; guard against the
    # handler being triggered before any file was chosen.
    if pdf_file_path is None:
        return "No pdf file was selected. Please select a file first."

    with fitz.open(pdf_file_path) as doc:
        pages = [page.get_text() for page in doc]
    extracted_text = get_sub_text('\n'.join(pages))

    return ("The pdf is uploaded Successfully from:"+ str(pdf_file_path))
57
+
58
# The module-level `qg` created right after the imports is reused here;
# constructing a second QuestionGenerator would load the pretrained
# question-generation and evaluator models a second time for no benefit.
59
+
60
def get_sub_text(TXT):
    """Split ``TXT`` into segments with the shared question generator.

    Always returns a list: a non-list result from the splitter is wrapped in
    a single-element list.
    """
    segments = qg._split_into_segments(TXT)
    return segments if isinstance(segments, list) else [segments]
66
+
67
def pick_One_txt(sub_texts):
    """Pick one text segment at random and remember it.

    The choice is also stored in the module-level ``selected_extracted_text``
    so that later steps can read it.

    Args:
        sub_texts: Non-empty sequence of text segments.

    Returns:
        The selected segment.

    Raises:
        ValueError: If ``sub_texts`` is empty.
    """
    global selected_extracted_text
    if not sub_texts:
        raise ValueError("There are no text segments to choose from.")

    # random.choice always yields a valid element; int(random.uniform(0, N))
    # could evaluate to N itself because uniform() may include its endpoint.
    selected_extracted_text = random.choice(sub_texts)
    return selected_extracted_text
80
+
81
+
82
def pipeline():
    """Generate one multiple-choice question from the currently selected text.

    Reads the module-level ``selected_extracted_text``, asks the shared
    generator for a single multiple-choice question, formats it with
    ``Extract_QA`` and publishes the result in the globals ``Q`` and ``A``.
    The source text is appended to the answer on a new line.

    Returns:
        The ``(Q, A)`` tuple that was stored.
    """
    global Q, A
    context = selected_extracted_text
    generated = qg.generate(context, num_questions=1, answer_style="multiple_choice")
    question_text, answer_text = Extract_QA(generated)
    Q = question_text
    A = answer_text + '\n' + context
    return (Q, A)
89
+
90
def ReurnAnswer():
    """Return the answer text belonging to the most recently generated question.

    Returns:
        The module-level ``A`` if it has been set, otherwise a short hint.
    """
    # The global `A` only exists once a question has been generated; looking
    # it up through globals() avoids a NameError when this runs first.
    return globals().get("A", "No question has been generated yet.")
92
+
93
def GetQuestion():
    """Pick a random segment of the uploaded text and generate a question from it.

    Returns:
        The formatted question, or a hint when no extracted text is available.
    """
    # The global `extracted_text` is only created by a successful upload, and
    # it can be an empty list when nothing could be extracted; both cases
    # would otherwise crash (NameError / IndexError).
    segments = globals().get("extracted_text")
    if not segments:
        return "No text is available. Please upload a pdf file with extractable text first."

    pick_One_txt(segments)
    question_text, _answer_text = pipeline()
    return question_text
97
+
98
# Gradio user interface: a file picker, three action buttons and three
# read-only text boxes wired to the handler functions defined above.
with gr.Blocks() as demo:

    with gr.Row():
        #input_file=gr.File(type="filepath", label="Upload PDF Document")
        # File picker restricted to PDF files; its value is the input of the upload handler.
        input_file=gr.UploadButton(label='Select a file!', file_types=[".pdf"])
        #upload_btn = gr.Button(value="Upload File")
        #txt= extract_text_from_pdf(input_file)
    with gr.Row():
        # Left column: the buttons that trigger each step.
        with gr.Column():
            upload_btn = gr.Button(value="Upload the pdf File.")
            Gen_Question = gr.Button(value="Show the Question")
            Gen_Answer = gr.Button(value="Show the Answer")

        # Right column: where the result of each step is displayed.
        with gr.Column():
            file_stat= gr.Textbox(label="File Status")
            question = gr.Textbox(label="Question(s)")
            Answer = gr.Textbox(label="Answer(s)")


    # NOTE(review): all three handlers are registered under the same api_name;
    # presumably Gradio de-duplicates the endpoint names itself - confirm.
    upload_btn.click(extract_text_from_pdf, inputs=input_file, outputs=file_stat, api_name="QuestioGenerator")
    Gen_Question.click(GetQuestion, inputs=None, outputs=question, api_name="QuestioGenerator")
    Gen_Answer.click(ReurnAnswer, inputs=None, outputs=Answer, api_name="QuestioGenerator")
    #examples = gr.Examples(examples=["I went to the supermarket yesterday.", "Helen is a good swimmer."],
    #                       inputs=[english])

# Start the web server as soon as the module is executed.
demo.launch()
questiongenerator.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import en_core_web_sm
2
+ import json
3
+ import numpy as np
4
+ import random
5
+ import re
6
+ import torch
7
+ from transformers import (
8
+ AutoTokenizer,
9
+ AutoModelForSeq2SeqLM,
10
+ AutoModelForSequenceClassification,
11
+ )
12
+ from typing import Any, List, Mapping, Tuple
13
+
14
+
15
class QuestionGenerator:
    """A transformer-based NLP system for generating reading comprehension-style questions from
    texts. It can generate full sentence questions, multiple choice questions, or a mix of the
    two styles.

    To filter out low quality questions, questions are assigned a score and ranked once they have
    been generated. Only the top k questions will be returned. This behaviour can be turned off
    by setting use_evaluator=False.
    """

    def __init__(self) -> None:
        """Loads the question-generation tokenizer and model and creates the QA evaluator."""

        QG_PRETRAINED = "iarfmoose/t5-base-question-generator"
        self.ANSWER_TOKEN = "<answer>"
        self.CONTEXT_TOKEN = "<context>"
        self.SEQ_LENGTH = 512

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.qg_tokenizer = AutoTokenizer.from_pretrained(
            QG_PRETRAINED, use_fast=False)
        self.qg_model = AutoModelForSeq2SeqLM.from_pretrained(QG_PRETRAINED)
        self.qg_model.to(self.device)
        self.qg_model.eval()

        self.qa_evaluator = QAEvaluator()

        # spaCy pipeline used for multiple-choice questions; loaded on first use.
        self._spacy_nlp = None

    def generate(
        self,
        article: str,
        use_evaluator: bool = True,
        num_questions: int = None,
        answer_style: str = "all"
    ) -> List:
        """Takes an article and generates a set of question and answer pairs. If use_evaluator
        is True then QA pairs will be ranked and filtered based on their quality. answer_style
        should be selected from ["all", "sentences", "multiple_choice"].

        num_questions limits how many ranked pairs are returned; when it is None (or 0) the
        default limit of _get_ranked_qa_pairs applies. It is ignored when use_evaluator is
        False. Returns a list of {"question": ..., "answer": ...} dicts.
        """

        print("Generating questions...\n")

        qg_inputs, qg_answers = self.generate_qg_inputs(article, answer_style)
        generated_questions = self.generate_questions_from_inputs(qg_inputs)

        message = "{} questions doesn't match {} answers".format(
            len(generated_questions), len(qg_answers)
        )
        # internal invariant: exactly one question is generated per prepared answer
        assert len(generated_questions) == len(qg_answers), message

        if use_evaluator:
            print("Evaluating QA pairs...\n")
            encoded_qa_pairs = self.qa_evaluator.encode_qa_pairs(
                generated_questions, qg_answers
            )
            scores = self.qa_evaluator.get_scores(encoded_qa_pairs)

            if num_questions:
                qa_list = self._get_ranked_qa_pairs(
                    generated_questions, qg_answers, scores, num_questions
                )
            else:
                qa_list = self._get_ranked_qa_pairs(
                    generated_questions, qg_answers, scores
                )

        else:
            print("Skipping evaluation step.\n")
            qa_list = self._get_all_qa_pairs(generated_questions, qg_answers)

        return qa_list

    def generate_qg_inputs(self, text: str, answer_style: str) -> Tuple[List[str], List[str]]:
        """Given a text, returns a list of model inputs and a list of corresponding answers.
        Model inputs take the form "answer_token <answer text> context_token <context text>" where
        the answer is a string extracted from the text, and the context is the wider text surrounding
        the answer.

        Raises ValueError if answer_style is not one of "all", "sentences" or "multiple_choice".
        """

        VALID_ANSWER_STYLES = ["all", "sentences", "multiple_choice"]

        if answer_style not in VALID_ANSWER_STYLES:
            raise ValueError(
                "Invalid answer style {}. Please choose from {}".format(
                    answer_style, VALID_ANSWER_STYLES
                )
            )

        inputs = []
        answers = []

        if answer_style == "sentences" or answer_style == "all":
            segments = self._split_into_segments(text)

            for segment in segments:
                sentences = self._split_text(segment)
                prepped_inputs, prepped_answers = self._prepare_qg_inputs(
                    sentences, segment
                )
                inputs.extend(prepped_inputs)
                answers.extend(prepped_answers)

        if answer_style == "multiple_choice" or answer_style == "all":
            sentences = self._split_text(text)
            prepped_inputs, prepped_answers = self._prepare_qg_inputs_MC(
                sentences
            )
            inputs.extend(prepped_inputs)
            answers.extend(prepped_answers)

        return inputs, answers

    def generate_questions_from_inputs(self, qg_inputs: List) -> List[str]:
        """Given a list of concatenated answers and contexts, with the form:
        "answer_token <answer text> context_token <context text>", generates a list of
        questions (one per input, in the same order).
        """
        return [self._generate_question(qg_input) for qg_input in qg_inputs]

    def _split_text(self, text: str) -> List[str]:
        """Splits the text into sentences, and attempts to split long sentences into fragments.

        Sentences longer than MAX_SENTENCE_LEN characters are additionally split on
        ",", ";", ":" and ")"; fragments of more than five space-separated pieces are
        returned alongside the full sentences. Duplicates are removed and the order of
        first appearance is kept.
        """
        MAX_SENTENCE_LEN = 128
        sentences = re.findall(r".*?[.!?]", text)
        cut_sentences = []

        for sentence in sentences:
            if len(sentence) > MAX_SENTENCE_LEN:
                cut_sentences.extend(re.split("[,;:)]", sentence))

        # remove useless post-quote sentence fragments; the filter has to run on the
        # fragments themselves, otherwise they are thrown away and never used
        cut_sentences = [s for s in cut_sentences if len(s.split(" ")) > 5]
        sentences = sentences + cut_sentences

        # dict.fromkeys de-duplicates like set() but gives a reproducible order
        return list(dict.fromkeys(s.strip(" ") for s in sentences))

    def _split_into_segments(self, text: str) -> List[str]:
        """Splits a long text into segments short enough to be input into the transformer network.
        Segments are used as context for question generation.

        The text is split on newlines, every non-empty line is tokenized, and lines are
        appended to the current segment until it holds at least MAX_TOKENS token ids.
        Each segment is decoded back to a string without special tokens.
        """
        MAX_TOKENS = 490
        paragraphs = text.split("\n")
        tokenized_paragraphs = [
            self.qg_tokenizer(p)["input_ids"] for p in paragraphs if len(p) > 0
        ]
        segments = []

        while len(tokenized_paragraphs) > 0:
            segment = []

            while len(segment) < MAX_TOKENS and len(tokenized_paragraphs) > 0:
                paragraph = tokenized_paragraphs.pop(0)
                segment.extend(paragraph)
            segments.append(segment)

        return [self.qg_tokenizer.decode(s, skip_special_tokens=True) for s in segments]

    def _prepare_qg_inputs(
        self,
        sentences: List[str],
        text: str
    ) -> Tuple[List[str], List[str]]:
        """Uses sentences as answers and the text as context. Returns a tuple of (model inputs, answers).
        Model inputs are "answer_token <answer text> context_token <context text>"
        """
        inputs = []
        answers = []

        for sentence in sentences:
            qg_input = f"{self.ANSWER_TOKEN} {sentence} {self.CONTEXT_TOKEN} {text}"
            inputs.append(qg_input)
            answers.append(sentence)

        return inputs, answers

    def _prepare_qg_inputs_MC(self, sentences: List[str]) -> Tuple[List[str], List[str]]:
        """Performs NER on the text, and uses extracted entities as candidate answers for multiple-choice
        questions. Sentences are used as context, and entities as answers. Returns a tuple of (model inputs, answers).
        Model inputs are "answer_token <answer text> context_token <context text>"
        """
        # load the spaCy model once per instance instead of on every call
        if getattr(self, "_spacy_nlp", None) is None:
            self._spacy_nlp = en_core_web_sm.load()

        docs = list(self._spacy_nlp.pipe(sentences, disable=["parser"]))
        inputs_from_text = []
        answers_from_text = []

        for doc, sentence in zip(docs, sentences):
            for entity in doc.ents:
                qg_input = f"{self.ANSWER_TOKEN} {entity} {self.CONTEXT_TOKEN} {sentence}"
                answers = self._get_MC_answers(entity, docs)
                inputs_from_text.append(qg_input)
                answers_from_text.append(answers)

        return inputs_from_text, answers_from_text

    def _get_MC_answers(self, correct_answer: Any, docs: Any) -> List[Mapping[str, Any]]:
        """Finds a set of alternative answers for a multiple-choice question. Will attempt to find
        alternatives of the same entity type as correct_answer if possible.

        correct_answer must expose .text and .label_; every item of docs must expose .ents,
        an iterable of such entities. Returns a shuffled list of at most four
        {"answer": str, "correct": bool} dicts, exactly one of which is correct.
        """
        correct_label = correct_answer.label_

        # Collect distinct alternative texts (first label seen wins). Entities whose text
        # equals the correct answer are skipped so that the same text is never offered
        # twice as a choice.
        distractors = {}
        for doc in docs:
            for entity in doc.ents:
                if entity.text != correct_answer.text:
                    distractors.setdefault(entity.text, entity.label_)

        # at most three alternatives, because the correct answer takes one slot
        num_choices = min(3, len(distractors))

        same_label = [text for text, label in distractors.items() if label == correct_label]
        other_label = [text for text, label in distractors.items() if label != correct_label]

        # random.sample needs a sequence here: sampling from a set is rejected by
        # Python 3.11 and later
        if len(same_label) >= num_choices:
            chosen = random.sample(same_label, num_choices)
        else:
            # not enough alternatives of the same entity type: fill up with others
            chosen = same_label + random.sample(
                other_label, num_choices - len(same_label)
            )

        final_choices = [{"answer": correct_answer.text, "correct": True}]
        for text in chosen:
            final_choices.append({"answer": text, "correct": False})

        random.shuffle(final_choices)
        return final_choices

    @torch.no_grad()
    def _generate_question(self, qg_input: str) -> str:
        """Takes qg_input which is the concatenated answer and context, and uses it to generate
        a question sentence. The generated question is decoded and then returned.
        """
        encoded_input = self._encode_qg_input(qg_input)
        output = self.qg_model.generate(input_ids=encoded_input["input_ids"])
        question = self.qg_tokenizer.decode(
            output[0],
            skip_special_tokens=True
        )
        return question

    def _encode_qg_input(self, qg_input: str) -> Any:
        """Tokenizes a string, padded/truncated to SEQ_LENGTH, and returns the tokenizer output
        (indexable by "input_ids") moved to the model's device.
        """
        return self.qg_tokenizer(
            qg_input,
            padding='max_length',
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

    def _get_ranked_qa_pairs(
        self, generated_questions: List[str], qg_answers: List[str], scores, num_questions: int = 10
    ) -> List[Mapping[str, str]]:
        """Ranks generated questions according to scores, and returns the top num_questions examples.

        scores is a list of indices into generated_questions / qg_answers, best first.
        """
        if num_questions > len(scores):
            num_questions = len(scores)
            # one message string; passing a tuple to print() would show its repr
            print(
                f"\nWas only able to generate {num_questions} questions. "
                "For more questions, please input a longer text."
            )

        qa_list = []

        for i in range(num_questions):
            index = scores[i]
            qa = {
                # keep only the text up to the first question mark
                "question": generated_questions[index].split("?")[0] + "?",
                "answer": qg_answers[index]
            }
            qa_list.append(qa)

        return qa_list

    def _get_all_qa_pairs(self, generated_questions: List[str], qg_answers: List[str]):
        """Formats question and answer pairs without ranking or filtering."""
        qa_list = []

        for question, answer in zip(generated_questions, qg_answers):
            qa = {
                "question": question.split("?")[0] + "?",
                "answer": answer
            }
            qa_list.append(qa)

        return qa_list
323
+
324
+
325
class QAEvaluator:
    """Wrapper for a transformer model which evaluates the quality of question-answer pairs.
    Given a QA pair, the model will generate a score. Scores can be used to rank and filter
    QA pairs.
    """

    def __init__(self) -> None:
        """Loads the evaluator tokenizer and sequence-classification model."""

        QAE_PRETRAINED = "iarfmoose/bert-base-cased-qa-evaluator"
        self.SEQ_LENGTH = 512

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.qae_tokenizer = AutoTokenizer.from_pretrained(QAE_PRETRAINED)
        self.qae_model = AutoModelForSequenceClassification.from_pretrained(
            QAE_PRETRAINED
        )
        self.qae_model.to(self.device)
        self.qae_model.eval()

    def encode_qa_pairs(self, questions: List[str], answers: List[str]) -> List[Any]:
        """Takes a list of questions and a list of answers and encodes them pairwise.
        Returns one tokenizer output per pair, moved to the model's device.
        """
        encoded_pairs = []

        for question, answer in zip(questions, answers):
            encoded_qa = self._encode_qa(question, answer)
            encoded_pairs.append(encoded_qa.to(self.device))

        return encoded_pairs

    def get_scores(self, encoded_qa_pairs: List[Any]) -> List[int]:
        """Scores a list of encoded QA pairs and returns their indices ordered from the
        highest-scoring pair to the lowest (ties keep their original order).
        """
        scores = [self._evaluate_qa(pair) for pair in encoded_qa_pairs]
        return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    def _encode_qa(self, question: str, answer: str) -> Any:
        """Concatenates a question and answer, and then tokenizes them. Returns the tokenizer
        output padded/truncated to SEQ_LENGTH.

        answer is either the answer text or a list of multiple-choice dicts
        ({"answer": ..., "correct": ...}), in which case the choice flagged as correct is used.
        Raises ValueError if such a list contains no correct choice.
        """
        if isinstance(answer, list):
            correct_answer = None
            for a in answer:
                if a["correct"]:
                    correct_answer = a["answer"]
            # without this check a list with no correct choice would fail later with an
            # unrelated UnboundLocalError
            if correct_answer is None:
                raise ValueError("No multiple-choice answer is marked as correct.")
        else:
            correct_answer = answer

        return self.qae_tokenizer(
            text=question,
            text_pair=correct_answer,
            padding="max_length",
            max_length=self.SEQ_LENGTH,
            truncation=True,
            return_tensors="pt",
        )

    @torch.no_grad()
    def _evaluate_qa(self, encoded_qa_pair: Any) -> float:
        """Takes an encoded QA pair and returns a score as a plain Python number."""
        output = self.qae_model(**encoded_qa_pair)
        # element [0][0][1] of the model output is used as the score; .item() turns the
        # one-element tensor into the float promised by the signature
        return output[0][0][1].item()
392
+
393
+
394
def print_qa(qa_list: List[Mapping[str, str]], show_answers: bool = True) -> None:
    """Prints every question of qa_list with a running number, followed by its answer.

    Multiple-choice answers (lists of {"answer", "correct"} dicts) are always listed as
    numbered choices; the "(correct)" marker is only added when show_answers is True.
    Full sentence answers are printed only when show_answers is True.
    """
    for number, qa in enumerate(qa_list, start=1):
        # two digit question numbers need one more column of padding
        space = " " * (3 if number < 10 else 4)

        print(f"{number}) Q: {qa['question']}")

        answer = qa["answer"]

        if type(answer) is not list:
            # full sentence answer
            if show_answers:
                print(f"{space}A: {answer}\n")
            continue

        # multiple choice answers; indexing the first choice directly means an empty
        # list still fails with IndexError
        first_choice = answer[0]
        for position, choice in enumerate([first_choice] + answer[1:], start=1):
            lead = f"{space}A: " if position == 1 else space + ' '
            line = f"{lead}{position}. {choice['answer']}"

            if show_answers:
                if position == 1:
                    flagged = choice['correct']
                else:
                    flagged = choice['correct'] == True
                marker = '(correct)' if flagged else ''
                line = f"{line} {marker}"

            print(line)

        print("")
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets==1.16.1
2
+ en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
3
+ numpy==1.22.0
4
+ sentencepiece==0.1.96
5
+ spacy
6
+ tokenizers==0.10.3
7
+ torch==1.7.1
8
+ transformers==4.12.5
9
+ gradio
10
+ pymupdf