Spaces:
Sleeping
Sleeping
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import time | |
import torch | |
from transformers import T5ForConditionalGeneration,T5Tokenizer | |
import random | |
import spacy | |
import zipfile | |
import os | |
# Runtime environment bootstrap: each shell command below is executed in
# order at import time, before the imports that follow it in this file.
# Exit codes are ignored, so a failing command does not stop the script.
# NOTE(review): the 'en' shortcut for `spacy download` may no longer be
# accepted by recent spaCy releases — confirm whether that step is needed,
# given that 'en_core_web_sm' is downloaded explicitly as the last step.
_SETUP_COMMANDS = (
    'pip install git+https://github.com/boudinfl/pke.git',
    'python -m nltk.downloader universal_tagset',
    'python -m spacy download en',
    'wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz',
    'tar -xvf s2v_reddit_2015_md.tar.gz',
    'python -m spacy download en_core_web_sm',
)
for _setup_command in _SETUP_COMMANDS:
    os.system(_setup_command)
import git | |
import json | |
from sense2vec import Sense2Vec | |
import requests | |
from collections import OrderedDict | |
import string | |
import pke | |
import nltk | |
import numpy | |
import en_core_web_sm | |
from nltk import FreqDist | |
# Fetch the NLTK resources used below; force=True re-downloads them even
# when a local copy already exists, quiet=True suppresses progress output.
for _nltk_resource in ('brown', 'stopwords', 'popular'):
    nltk.download(_nltk_resource, quiet=True, force=True)
from nltk.corpus import stopwords | |
from nltk.corpus import brown | |
from similarity.normalized_levenshtein import NormalizedLevenshtein | |
from nltk.tokenize import sent_tokenize | |
from flashtext import KeywordProcessor | |
from encoding import beam_search_decoding | |
from mcq import tokenize_sentences | |
from mcq import get_keywords | |
from mcq import get_sentences_for_keyword | |
from mcq import generate_questions_mcq | |
from mcq import generate_normal_questions | |
import time | |
# Module-level singletons shared by every prediction call.

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer comes from the stock 't5-large' checkpoint, the weights from the
# 'Parth/result' checkpoint; the model is moved to the selected device.
# No explicit model.eval() call is made here (per the transformers docs,
# from_pretrained already returns the model in evaluation mode).
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('Parth/result')
model.to(device)

# spaCy pipeline, sense2vec vectors loaded from the local 's2v_old' directory,
# Brown-corpus word frequencies, and a normalized Levenshtein scorer.
nlp = spacy.load('en_core_web_sm')
s2v = Sense2Vec().from_disk('s2v_old')
fdist = FreqDist(brown.words())
normalized_levenshtein = NormalizedLevenshtein()
def set_seed(seed):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to ``numpy.random.seed``, ``torch.manual_seed``
            and, when CUDA is available, ``torch.cuda.manual_seed_all``.
    """
    torch.manual_seed(seed)
    numpy.random.seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


# Seed the generators once, at import time.
set_seed(42)
def predict_mcq(payload):
    """Generate multiple-choice questions for the text in *payload*.

    Args:
        payload: Mapping read with ``.get``. Recognised keys:
            ``"input_text"`` - the source text to build questions from;
            ``"max_questions"`` - forwarded to ``get_keywords`` (default 10).

    Returns:
        A dict with the keys ``"statement"`` (the tokenized sentences joined
        with single spaces), ``"questions"`` (the ``"questions"`` entry of
        the ``generate_questions_mcq`` result) and ``"time_taken"`` (seconds
        elapsed, as a float). An empty dict is returned when the input text
        is missing or blank, when no keyword maps to any sentence, or when
        question generation raises.
    """
    start = time.time()
    text = payload.get("input_text")
    max_questions = payload.get("max_questions", 10)

    # Nothing to work with: answer with the same empty result the function
    # uses for every other "no questions" outcome instead of failing inside
    # the tokenizer.
    if not isinstance(text, str) or not text.strip():
        return {}

    sentences = tokenize_sentences(text)
    modified_text = " ".join(sentences)

    keywords = get_keywords(
        nlp,
        modified_text,
        max_questions,
        s2v,
        fdist,
        normalized_levenshtein,
        len(sentences),
    )

    # Collapse each keyword's sentence list into one snippet made of at most
    # its first three sentences.
    keyword_sentence_mapping = {
        keyword: " ".join(matched[:3])
        for keyword, matched in get_sentences_for_keyword(keywords, sentences).items()
    }
    if not keyword_sentence_mapping:
        return {}

    try:
        generated_questions = generate_questions_mcq(
            keyword_sentence_mapping,
            device,
            tokenizer,
            model,
            s2v,
            normalized_levenshtein,
        )
    except Exception:
        # Best effort: a generation failure yields an empty result, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return {}
    else:
        elapsed = time.time() - start
    finally:
        # Release cached GPU memory whether or not generation succeeded.
        # The check must look at the selected device instance; comparing the
        # torch.device class itself to 'cuda' is never true.
        if device.type == "cuda":
            torch.cuda.empty_cache()

    return {
        "statement": modified_text,
        "questions": generated_questions["questions"],
        "time_taken": elapsed,
    }