YU-XI committed on
Commit
b6eba06
·
verified ·
1 Parent(s): 4def369

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -14
app.py CHANGED
@@ -7,6 +7,8 @@ import google.generativeai as genai
7
  from langchain.chains.question_answering import load_qa_chain
8
  import torch
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
 
10
 
11
  # Configure Gemini API
12
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
@@ -18,7 +20,19 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
18
  dtype = torch.bfloat16
19
  mistral_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
20
 
21
- def initialize(file_path, question):
 
 
 
 
 
 
 
 
 
 
 
 
22
  try:
23
  model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
24
  prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
@@ -29,12 +43,20 @@ def initialize(file_path, question):
29
  """
30
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
31
 
32
- if os.path.exists(file_path):
 
 
33
  pdf_loader = PyPDFLoader(file_path)
34
  pages = pdf_loader.load_and_split()
35
- context = "\n".join(str(page.page_content) for page in pages[:30])
 
 
 
 
 
 
36
  stuff_chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
37
- stuff_answer = stuff_chain({"input_documents": pages, "question": question, "context": context}, return_only_outputs=True)
38
  gemini_answer = stuff_answer['output_text']
39
 
40
  # Use Mistral model for additional text generation
@@ -47,25 +69,27 @@ def initialize(file_path, question):
47
  combined_output = f"Gemini Answer: {gemini_answer}\n\nMistral Follow-up: {mistral_output}"
48
  return combined_output
49
  else:
50
- return "Error: Unable to process the document. Please ensure the PDF file is valid."
51
  except Exception as e:
52
  return f"An error occurred: {str(e)}"
53
 
54
  # Define Gradio Interface
55
  input_file = gr.File(label="Upload PDF File")
 
56
  input_question = gr.Textbox(label="Ask about the document")
57
  output_text = gr.Textbox(label="Answer - Combined Gemini and Mistral")
58
 
59
- def pdf_qa(file, question):
60
- if file is None:
61
- return "Please upload a PDF file first."
62
- return initialize(file.name, question)
 
63
 
64
  # Create Gradio Interface
65
  gr.Interface(
66
- fn=pdf_qa,
67
- inputs=[input_file, input_question],
68
  outputs=output_text,
69
- title="RAG Knowledge Retrieval using Gemini API and Mistral Model",
70
- description="Upload a PDF file and ask questions about the content."
71
- ).launch()
 
7
  from langchain.chains.question_answering import load_qa_chain
8
  import torch
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from transformers import BlipProcessor, BlipForConditionalGeneration
11
+ from PIL import Image
12
 
13
  # Configure Gemini API
14
  genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
 
20
  dtype = torch.bfloat16
21
  mistral_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
22
 
23
+ # Load BLIP model for image processing
24
+ blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
25
+ blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
26
+
27
+ def process_image(image):
28
+ # Convert PIL Image to tensor
29
+ inputs = blip_processor(images=image, return_tensors="pt").to(device)
30
+ # Generate caption from image
31
+ caption_ids = blip_model.generate(**inputs)
32
+ caption = blip_processor.decode(caption_ids[0], skip_special_tokens=True)
33
+ return caption
34
+
35
+ def initialize(file_path, image, question):
36
  try:
37
  model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
38
  prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
 
43
  """
44
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
45
 
46
+ context = ""
47
+
48
+ if file_path and os.path.exists(file_path):
49
  pdf_loader = PyPDFLoader(file_path)
50
  pages = pdf_loader.load_and_split()
51
+ context += "\n".join(str(page.page_content) for page in pages[:30])
52
+
53
+ if image:
54
+ image_context = process_image(image)
55
+ context += f"\nImage Context: {image_context}"
56
+
57
+ if context:
58
  stuff_chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
59
+ stuff_answer = stuff_chain({"input_documents": [], "question": question, "context": context}, return_only_outputs=True)
60
  gemini_answer = stuff_answer['output_text']
61
 
62
  # Use Mistral model for additional text generation
 
69
  combined_output = f"Gemini Answer: {gemini_answer}\n\nMistral Follow-up: {mistral_output}"
70
  return combined_output
71
  else:
72
+ return "Error: No valid context provided. Please upload a valid PDF or image."
73
  except Exception as e:
74
  return f"An error occurred: {str(e)}"
75
 
76
  # Define Gradio Interface
77
  input_file = gr.File(label="Upload PDF File")
78
+ input_image = gr.Image(type="pil", label="Upload Image")
79
  input_question = gr.Textbox(label="Ask about the document")
80
  output_text = gr.Textbox(label="Answer - Combined Gemini and Mistral")
81
 
82
+ def multimodal_qa(file, image, question):
83
+ if file is None and image is None:
84
+ return "Please upload a PDF file or an image first."
85
+ file_path = file.name if file else None
86
+ return initialize(file_path, image, question)
87
 
88
  # Create Gradio Interface
89
  gr.Interface(
90
+ fn=multimodal_qa,
91
+ inputs=[input_file, input_image, input_question],
92
  outputs=output_text,
93
+ title="Multi-modal RAG with Gemini API and Mistral Model",
94
+ description="Upload a PDF or an image and ask questions about the content."
95
+ ).launch()