import gradio as gr
import re
from pypdf import PdfReader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline


def remove_references(text):
    """Strip bracketed references from ``text`` and normalise its whitespace.

    Three kinds of bracketed spans are deleted, in this sequence: numeric
    markers such as ``[12]``, ``[http(s)://... label]`` links that carry
    label text, and bare ``[http(s)://...]`` links. Every run of whitespace
    is then collapsed to a single space and the ends are trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    bracket_patterns = (
        r'\[\d+\]',                          # numeric marker, e.g. [3]
        r'\[https?://[^\[\]]+\s[^\[\]]+\]',  # hyperlink followed by text
        r'\[https?://[^\[\]]+\]',            # hyperlink on its own
    )
    for pattern in bracket_patterns:
        text = re.sub(pattern, '', text)

    # NOTE: HTML entities are left untouched; no unescaping step is applied.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

    
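# Earlier pypdf-based implementation of extract_text_from_pdf, kept for
# reference only; the active version below uses PyMuPDF (fitz) instead.
# def extract_text_from_pdf(file_path):
#     text = ""
#     pdf_reader = PdfReader(file_path)
#     for page in pdf_reader.pages:
#         text += page.extract_text() + "\n"
#     return text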
  
import fitz  # PyMuPDF

def extract_text_from_pdf(file_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path of the PDF file, handed to ``fitz.open``.

    Returns:
        The text of all pages in document order, with a newline appended
        after each page.
    """
    # The context manager closes the document even when extracting a page
    # raises, so the file handle is never leaked.
    with fitz.open(file_path) as pdf_document:
        return "".join(page.get_text("text") + "\n" for page in pdf_document)

def extract_text_from_txt(file_path):
    """Return the entire contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return handle.read()
    
def extract_text_from_doc(file_path):
    """Extract the text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the document, handed to ``docx.Document``.

    Returns:
        The text of all paragraphs in order, with a newline appended after
        each paragraph.

    NOTE(review): python-docx reads the .docx format; legacy binary .doc
    files are expected to fail to open - confirm against the callers.
    """
    # Imported locally because the module has no file-level import of
    # python-docx; this also keeps the dependency optional for other formats.
    import docx

    document = docx.Document(file_path)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    

def model(model_name):
    """Build a question-answering pipeline for a pretrained checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForQuestionAnswering.from_pretrained``.

    Returns:
        A ``transformers`` "question-answering" pipeline wrapping the loaded
        model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Named qa_model so the local does not shadow this function's own name.
    qa_model = AutoModelForQuestionAnswering.from_pretrained(
        model_name, return_dict=False
    )
    return pipeline(
        "question-answering",
        model=qa_model,
        tokenizer=tokenizer,
    )
    
# Checkpoint used for question answering. The pipeline is built once, when
# this module is executed, and is bound as the default `pipe` argument of
# qa_result below.
model_name = "timpal0l/mdeberta-v3-base-squad2"
pipe = model(model_name)

def _clean_answer(answer):
    """Tidy a raw model answer before it is shown to the user."""
    cleaned = remove_references(answer)
    cleaned = cleaned.replace('(', '', 1)  # drop only the first "("
    # Removes commas; the count is capped at len(cleaned) - 1.
    cleaned = cleaned.replace(',', '', len(cleaned) - 1)
    return cleaned.capitalize()


def qa_result(context, question, file, pipe=pipe):
    """Answer ``question`` from an uploaded file or from pasted text.

    When ``file`` is given, its extracted text replaces ``context``.
    Guidance messages for missing or unsupported input are returned
    verbatim; only model answers go through the clean-up step.

    Args:
        context: Text to search for the answer; ignored when ``file`` is
            given.
        question: The question to answer.
        file: Uploaded file (an object with a ``name`` path attribute, or a
            plain path string), or ``None`` when nothing was uploaded.
        pipe: Question-answering callable invoked as
            ``pipe(question=..., context=...)`` and returning a mapping with
            an ``'answer'`` entry.

    Returns:
        The cleaned, capitalised answer, or a guidance message for the user.
    """
    # Extension -> extractor. Compared by value: the extension is built at
    # run time, so an identity (`is`) check would never match.
    extractors = {
        ".pdf": extract_text_from_pdf,
        ".txt": extract_text_from_txt,
        ".doc": extract_text_from_doc,
        # python-docx reads .docx, so route it to the same extractor.
        ".docx": extract_text_from_doc,
    }

    if file is not None:
        # Accept either an object exposing `.name` or a bare path string.
        file_path = getattr(file, "name", file)
        extension = "." + file_path.split(".")[-1].lower()
        extractor = extractors.get(extension)
        if extractor is None:
            return "Խնդրում եմ ներբեռնել .pdf, .txt, կամ .doc տիպի ֆայլեր։"
        # Check the question before doing any extraction work; the pipeline
        # cannot answer an empty question.
        if not question:
            return "Ես չեմ կարողանա քեզ օգնել եթե ինձ չտաս հարցդ"
        context = extractor(file_path)
        if not context:
            return "Ես չեմ կարողանա քեզ օգնել եթե ինձ չտրամադրես տեքստը"
    elif not context and not question:
        return "Որպեսզի ես կարողանամ քեզ օգնել, ինձ պիտի տրամադրես համապատասխան տեքստն կամ ֆայլն ու հարցերը։"
    elif not context:
        return "Ես չեմ կարողանա քեզ օգնել եթե ինձ չտրամադրես տեքստը"
    elif not question:
        return "Ես չեմ կարողանա քեզ օգնել եթե ինձ չտաս հարցդ"

    result = pipe(question=question, context=context)
    return _clean_answer(result['answer'])

# Soft theme with a few colour variables overridden.
theme = gr.themes.Soft().set(
    body_background_fill='*background_fill_secondary',
    body_text_color_subdued='*body_text_color',
    body_text_color_subdued_dark='*chatbot_code_background_color'
)


# The three inputs map positionally onto qa_result(context, question, file).
# gr.Interface takes no `btn` argument, so the upload control is the File
# input itself; gr.File() replaces the deprecated gr.inputs.File().
app = gr.Interface(
    fn=qa_result,
    inputs=['textbox', 'text', gr.File()],
    outputs='textbox',
    title='Ողջու՛յն։ Ես քո արհեստական բանականությամբ օգնականն եմ',
    theme=theme,
    description='Տու՛ր ինձ տեքստ, ու տեքստին վերաբերող հարցեր, ու ես կօգնեմ քեզ պատասխանել հարցերին'
)
app.launch(inline=False)