AlirezaF138 committed on
Commit
75183d4
·
verified ·
1 Parent(s): c189041

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -35
app.py CHANGED
@@ -4,10 +4,9 @@ from pdf2image import convert_from_path
4
  from PIL import Image
5
  import os
6
 
7
- # Function to perform OCR and search for a keyword
8
- def ocr_and_search(input_file, keyword, lang='fas'): # 'fas': Persian language (Farsi)
9
  extracted_text = ""
10
- keyword_found = False
11
 
12
  # Check if the input file is a PDF or an image
13
  if isinstance(input_file, str) and input_file.endswith('.pdf'): # Check if the file is a PDF
@@ -19,52 +18,53 @@ def ocr_and_search(input_file, keyword, lang='fas'): # 'fas': Persian language
19
  text = pytesseract.image_to_string(image, lang=lang)
20
  extracted_text += text
21
 
22
- # Check if the keyword is in the extracted text
23
- if keyword.lower() in text.lower():
24
- keyword_found = True
25
-
26
  elif isinstance(input_file, Image.Image): # If the input is an image
27
  text = pytesseract.image_to_string(input_file, lang=lang)
28
  extracted_text = text
29
-
30
- # Check if the keyword is in the extracted text
31
- if keyword.lower() in text.lower():
32
- keyword_found = True
33
-
34
- if not keyword_found:
35
- result_message = f"Keyword '{keyword}' not found in the document."
36
- else:
37
- result_message = f"Keyword '{keyword}' found in the document."
38
 
39
- return extracted_text, result_message
40
 
41
- # Create Gradio interface
42
  def gradio_interface():
43
  # Define Gradio inputs and outputs
44
- input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF") # Option to choose file type
45
  file_input = gr.File(label="Upload PDF/Image")
46
- keyword_input = gr.Textbox(label="Enter Keyword", value="فلسفه") # Default keyword is 'فلسفه'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  output_text = gr.Textbox(label="Extracted Text", interactive=False)
48
- output_message = gr.Textbox(label="Keyword Search Result", interactive=False)
49
 
50
  # Function to process the inputs and return the outputs
51
- def process(input_type, file, keyword):
52
- # Handle PDF and image accordingly
53
  if input_type == "PDF":
54
- extracted_text, result_message = ocr_and_search(file.name, keyword)
55
- else: # Handle image input
56
- image = Image.open(file.name) # Open image file
57
- extracted_text, result_message = ocr_and_search(image, keyword)
58
-
59
- return extracted_text, result_message
60
 
61
  # Create and launch Gradio interface
62
- gr.Interface(fn=process,
63
- inputs=[input_type, file_input, keyword_input],
64
- outputs=[output_text, output_message],
65
- title="OCR Keyword Search (PDF/Image)",
66
- description="Upload a PDF or Image, enter a keyword, and see the OCR results along with a search for the keyword."
67
- ).launch()
 
 
68
 
69
  # Call the function to create the interface
70
  gradio_interface()
 
4
  from PIL import Image
5
  import os
6
 
7
+ # Function to perform OCR
8
+ def ocr(input_file, lang='fas'): # 'fas': Persian language (Farsi)
9
  extracted_text = ""
 
10
 
11
  # Check if the input file is a PDF or an image
12
  if isinstance(input_file, str) and input_file.endswith('.pdf'): # Check if the file is a PDF
 
18
  text = pytesseract.image_to_string(image, lang=lang)
19
  extracted_text += text
20
 
 
 
 
 
21
  elif isinstance(input_file, Image.Image): # If the input is an image
22
  text = pytesseract.image_to_string(input_file, lang=lang)
23
  extracted_text = text
 
 
 
 
 
 
 
 
 
24
 
25
+ return extracted_text
26
 
 
27
  def gradio_interface():
28
  # Define Gradio inputs and outputs
29
+ input_type = gr.Radio(["PDF", "Image"], label="Choose Input Type", value="PDF")
30
  file_input = gr.File(label="Upload PDF/Image")
31
+ language_input = gr.Dropdown(
32
+ label="Select OCR Language",
33
+ choices=[
34
+ ("English", "eng"),
35
+ ("Mandarin Chinese", "chi_sim"),
36
+ ("Hindi", "hin"),
37
+ ("Spanish", "spa"),
38
+ ("French", "fra"),
39
+ ("Standard Arabic", "ara"),
40
+ ("Bengali", "ben"),
41
+ ("Portuguese", "por"),
42
+ ("Russian", "rus"),
43
+ ("Urdu", "urd"),
44
+ ("Persian (Farsi)", "fas")
45
+ ],
46
+ value="fas" # Default to Persian
47
+ )
48
  output_text = gr.Textbox(label="Extracted Text", interactive=False)
 
49
 
50
  # Function to process the inputs and return the outputs
51
+ def process(input_type, file, lang):
 
52
  if input_type == "PDF":
53
+ extracted_text = ocr(file.name, lang)
54
+ else:
55
+ image = Image.open(file.name)
56
+ extracted_text = ocr(image, lang)
57
+ return extracted_text
 
58
 
59
  # Create and launch Gradio interface
60
+ gr.Interface(
61
+ fn=process,
62
+ inputs=[input_type, file_input, language_input],
63
+ outputs=[output_text],
64
+ title="OCR (PDF/Image)",
65
+ description="Upload a PDF or Image, select the OCR language, and extract the text."
66
+ ).launch()
67
+
68
 
69
  # Call the function to create the interface
70
  gradio_interface()