Spaces:

VGG11
/

armenian_chatbot_bert_multilingual

Runtime error

App Files Files Community

armenian_chatbot_bert_multilingual / app.py

Mary12

Update app.py

622bf47 over 1 year ago

raw

history blame

4.28 kB

	import gradio as gr
	import re
	from pypdf import PdfReader
	from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline


	def remove_references(text):
	    """Strip bracketed citations and links from *text* and tidy its spacing.

	    Removed, in this order: numeric markers such as ``[12]``, bracketed
	    links followed by a label (``[http://... label]``), and bare bracketed
	    links (``[http://...]``). Afterwards every run of whitespace is
	    collapsed to a single space and the result is trimmed at both ends.
	    HTML entities are not unescaped.
	    """
	    bracket_patterns = (
	        r'\[\d+\]',                          # [ref]
	        r'\[https?://[^\[\]]+\s[^\[\]]+\]',  # hyperlink with text
	        r'\[https?://[^\[\]]+\]',            # just the hyperlink
	    )
	    cleaned = text
	    for pattern in bracket_patterns:
	        cleaned = re.sub(pattern, '', cleaned)
	    # Removing the brackets can leave doubled spaces behind; squash them.
	    return re.sub(r'\s+', ' ', cleaned).strip()


	# def extract_text_from_pdf(file_path):
	# text = ""
	# pdf_reader = PdfReader(file_path)
	# for page in pdf_reader.pages:
	# text += page.extract_text() + "\n"
	# return text

	import fitz # PyMuPDF

	def extract_text_from_pdf(file_path):
	    """Return the plain text of every page of the PDF at *file_path*.

	    Pages are read in document order with PyMuPDF's ``get_text("text")``
	    and each page's text is followed by a newline.

	    Args:
	        file_path: Path of the PDF file to open.

	    Returns:
	        The concatenated text of all pages (an empty string for a
	        document without pages).
	    """
	    # The context manager closes the document even when a page fails to
	    # extract, which the explicit close() at the end could not guarantee.
	    with fitz.open(file_path) as pdf_document:
	        return "".join(page.get_text("text") + "\n" for page in pdf_document)

	def extract_text_from_txt(file_path):
	    """Read the UTF-8 text file at *file_path* and return its whole content."""
	    with open(file_path, mode="r", encoding='utf-8') as handle:
	        contents = handle.read()
	    return contents

	def extract_text_from_doc(file_path):
	    """Return the paragraph text of the Word document at *file_path*.

	    Every paragraph's text is followed by a newline.

	    Args:
	        file_path: Path of the document to open with ``docx.Document``.

	    Returns:
	        The concatenated paragraph text as one string.

	    NOTE(review): python-docx targets the ``.docx`` format; a legacy
	    binary ``.doc`` file will presumably be rejected by ``docx.Document``
	    -- confirm before advertising ``.doc`` support to users.
	    """
	    # Imported locally: this function is the only user of python-docx and
	    # the name ``docx`` was previously referenced without ever being imported.
	    import docx

	    document = docx.Document(file_path)
	    # Return the accumulated string (the earlier code returned the last
	    # paragraph object and never initialised the accumulator).
	    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)



	def model(model_name):
	    """Build a question-answering pipeline for the checkpoint *model_name*.

	    Args:
	        model_name: Identifier handed to ``from_pretrained`` for both the
	            tokenizer and the question-answering model.

	    Returns:
	        The object produced by ``pipeline("question-answering", ...)``,
	        wired to the loaded model and tokenizer.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    # Named ``qa_model`` so the local does not shadow this function's own name.
	    qa_model = AutoModelForQuestionAnswering.from_pretrained(
	        model_name,
	        return_dict=False,
	    )
	    return pipeline(
	        "question-answering",
	        model=qa_model,
	        tokenizer=tokenizer,
	    )

	# Checkpoint identifier that model() forwards to from_pretrained().
	model_name = "timpal0l/mdeberta-v3-base-squad2"
	# Built once at import time; qa_result() binds this object as the default
	# value of its `pipe` parameter.
	pipe = model(model_name)

	def qa_result(context, question, file, pipe=pipe):
	    """Answer *question* from an uploaded file or from the typed *context*.

	    When *file* is given, its text replaces *context*; the extractor is
	    chosen by the file extension. Missing or unusable input yields an
	    explanatory message instead of a model call.

	    Args:
	        context: Text to search for the answer; ignored when *file* is given.
	        question: The question to answer.
	        file: Uploaded file (an object exposing its path as ``.name``, or a
	            plain path string), or ``None`` when nothing was uploaded.
	        pipe: Callable invoked as ``pipe(question=..., context=...)`` whose
	            result is indexed with ``'answer'``.

	    Returns:
	        The cleaned answer, or a user-facing message, passed through
	        ``str.capitalize()``.
	    """
	    unsupported_file_msg = "Խնդրում եմ ներբեռնել .pdf, .txt, կամ .doc տիպի ֆայլեր։"
	    nothing_given_msg = "Որպեսզի ես կարողանամ քեզ օգնել, ինձ պիտի տրամադրես համապատասխան տեքստն կամ ֆայլն ու հարցերը։"
	    missing_context_msg = "Ես չեմ կարողանա քեզ օգնել եթե ինձ չտրամադրես տեքստը"
	    missing_question_msg = "Ես չեմ կարողանա քեզ օգնել եթե ինձ չտաս հարցդ"

	    if file is not None:
	        extractors = {
	            ".pdf": extract_text_from_pdf,
	            ".txt": extract_text_from_txt,
	            ".doc": extract_text_from_doc,
	            # Same extractor; accepted in addition to the original types.
	            ".docx": extract_text_from_doc,
	        }
	        # Accept both a file-like upload (path in ``.name``) and a bare path.
	        file_path = file if isinstance(file, str) else file.name
	        extension = "." + file_path.split(".")[-1].lower()
	        # Dictionary lookup compares by equality. The previous ``is`` checks
	        # compared identity, never matched a freshly built string, and so
	        # sent every upload to the Word extractor.
	        extractor = extractors.get(extension)
	        if extractor is None:
	            # Stop here: there is no context to run the model on.
	            return unsupported_file_msg.capitalize()
	        context = extractor(file_path)
	    elif not context and not question:
	        return nothing_given_msg.capitalize()

	    # ``not x`` covers both "" and None, unlike ``len(x) == 0``.
	    if not context:
	        return missing_context_msg.capitalize()
	    if not question:
	        # Previously unreachable: the branch re-tested the context length.
	        return missing_question_msg.capitalize()

	    answer = remove_references(pipe(question=question, context=context)['answer'])
	    # Light cleanup of the extracted span: drop the first "(" and the commas.
	    # The count cap of len - 1 is kept as written; it only matters for a
	    # string consisting solely of commas, where one comma survives.
	    answer = answer.replace('(', '', 1)
	    answer = answer.replace(',', '', len(answer) - 1)
	    # NOTE: capitalize() also lower-cases everything after the first character.
	    return answer.capitalize()

	# UI theme: Gradio's Soft theme with the page background fill and the
	# subdued body-text colors (light and dark variants) overridden.
	# NOTE(review): the '*name' values presumably reference other theme
	# variables -- confirm against the Gradio theming guide.
	theme = gr.themes.Soft().set(
	body_background_fill='*background_fill_secondary',
	body_text_color_subdued='*body_text_color',
	body_text_color_subdued_dark='*chatbot_code_background_color'
	)


	# Web UI: two text inputs plus a file upload feed qa_result, whose string
	# result is shown in a single text box.
	app = gr.Interface(
	    fn=qa_result,
	    # Input order matches qa_result(context, question, file).
	    # gr.File() replaces the deprecated gr.inputs.File(); the former
	    # btn=gr.UploadButton(...) argument was dropped because `btn` is not a
	    # documented gr.Interface parameter and the file input already
	    # provides the upload control.
	    inputs=['textbox', 'text', gr.File()],
	    outputs='textbox',
	    title='Ողջու՛յն։ Ես քո արհեստական բանականությամբ օգնականն եմ',
	    theme=theme,
	    description='Տու՛ր ինձ տեքստ, ու տեքստին վերաբերող հարցեր, ու ես կօգնեմ քեզ պատասխանել հարցերին',
	)
	# Start serving; inline=False is passed through to launch() unchanged.
	app.launch(inline=False)