Spaces:

miguelhomepage
/

gradio

Configuration error

App Files Files Community

miguelhomepage committed on Oct 6, 2024

Commit

2027574

verified ·

1 Parent(s): 36f7ac0

Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

.DS_Store +0 -0
README.md +3 -9
go.py +181 -0
teste.py +18 -0
uploads/.DS_Store +0 -0
uploads/compMorada-1.png +0 -0
uploads/compMorada.jpg +0 -0
uploads/compMorada.pdf +0 -0

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: Gradio
-emoji: 🐠
-colorFrom: blue
-colorTo: red
 sdk: gradio
-sdk_version: 4.44.1
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: gradio
+app_file: go.py
 sdk: gradio
+sdk_version: 3.41.2
 ---

go.py ADDED Viewed

	@@ -0,0 +1,181 @@

+#! apt install tesseract-ocr
+#! apt install libtesseract-dev
+#! pip install pytesseract
+#! pip install Pillow
+#!sudo apt-get install tesseract-ocr-por
+#!pip install openai
+#!sudo apt install poppler-utils
+import cv2
+import pytesseract
+import urllib
+import numpy as np
+import re
+import imutils
+from PIL import Image
+import string
+import glob
+import os
+from openai import OpenAI
+import subprocess
+import gradio as gr
+import shutil
def _strip_code_fence(text):
    """Remove a surrounding Markdown code fence (``` or ```json) from *text*."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def askLLM(text, fileType):
    """Ask the OpenAI chat model to extract structured data from OCR text.

    Args:
        text: OCR output (Portuguese text) to analyse.
        fileType: "permanente" to extract company-certificate fields, or
            "morada" to extract an individual's name and address.

    Returns:
        The model's answer as a string with any Markdown code fence removed.

    Raises:
        ValueError: if *fileType* has no associated prompt.
    """
    prompts = {
        "permanente": """this is a  text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:""",
        "morada": """this is a  text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:""",
    }
    if fileType not in prompts:
        # Previously this fell through to an UnboundLocalError on `prompt`.
        raise ValueError(f"Unsupported fileType for askLLM: {fileType!r}")
    prompt = prompts[fileType] + text

    # SECURITY: the API key must never be committed to source control; it is
    # read from the environment (e.g. a Space secret) instead.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    message_content = completion.choices[0].message.content or ""
    # str.strip("```json") strips a *set of characters*, not a prefix, so it
    # could eat real leading/trailing characters; remove the fence explicitly.
    return _strip_code_fence(message_content)
def ocr(image_path, fileType, rotation):
    """Run Tesseract OCR on a document and post-process the recognised text.

    Args:
        image_path: Path to the image. For "permanente" this is the first
            page produced from a PDF (``<base>-1.png``); every ``<base>*.png``
            page is read and concatenated.
        fileType: "permanente", "morada" or "iban".
        rotation: Extra rotation in degrees applied after the automatic
            orientation correction. Ignored for "permanente".

    Returns:
        The string produced by askLLM() (for "permanente") or by process().

    Raises:
        ValueError: if a single-image input cannot be decoded.
    """
    if fileType == 'permanente':
        first_page_suffix = "-1.png"
        # Strip the suffix only at the end of the path; str.replace would also
        # remove it from the middle of a directory or file name.
        if image_path.endswith(first_page_suffix):
            base = image_path[:-len(first_page_suffix)]
        else:
            base = image_path
        # Escape the base so characters such as '[' in the path are literal.
        page_files = sorted(glob.glob(glob.escape(base) + '*.png'))
        if not page_files and os.path.isfile(image_path):
            # A directly uploaded image has no "<base>*.png" pages; OCR it.
            page_files = [image_path]
        page_texts = []
        for page_file in page_files:
            page = cv2.imread(page_file)
            page_texts.append(
                pytesseract.image_to_string(page, lang='por', config="--psm 6") + " "
            )
        return askLLM("".join(page_texts), fileType)

    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")

    # Ask Tesseract for the page orientation and undo it before recognition.
    rot_data = pytesseract.image_to_osd(image)
    rot_match = re.search(r'(?<=Rotate: )\d+', rot_data)
    # If the OSD report has no "Rotate:" entry, assume the page is upright.
    angle = float(rot_match.group(0)) if rot_match else 0.0
    rotated = imutils.rotate_bound(image, angle)
    if rotation != 0:
        rotated = imutils.rotate(rotated, rotation)

    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
    return process(out, fileType)
def process(out, fileType):
    """Turn raw OCR text into the final answer for the given document type.

    Args:
        out: Text recognised by OCR.
        fileType: "morada" (delegates to askLLM) or "iban" (local parsing).

    Returns:
        For "morada": the askLLM() answer.
        For "iban": "NIB" or "PT50" followed by up to 21 visible characters
        found after the marker, or "" when no marker is present.
        For any other type: "".
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        # Previously `result` was unbound here and the return raised.
        return ""

    text = out.upper()
    # Markers are tried in this order and the last one found wins. The
    # "PT5O"/"PTS0"/"PTSO" variants are normalised to "PT50" below.
    prefix = None
    idx = -1
    for candidate in ("NIB", "PT50", "PT5O", "PTS0", "PTSO"):
        position = text.find(candidate)
        if position != -1:
            prefix = candidate
            idx = position
    if prefix is None:
        # Test "found" explicitly: a marker at offset 0 is a valid match
        # (the old `idx != 0` check discarded it).
        return ""

    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
    remaining_string = text[idx + len(prefix):]
    # Keep the first 21 non-whitespace characters after the marker.
    number = ''.join([char for char in remaining_string if char in visible_chars][:21])
    if prefix == "NIB":
        return "NIB" + number
    return "PT50" + number
def process_file(file_path, option):
    """Extract data from an uploaded file, converting PDFs to PNG first.

    Args:
        file_path: Path of the uploaded PDF or image.
        option: Document type passed on to go().

    Returns:
        The result of go(), or the message "Error converting PDF to PNG"
        when the PDF could not be converted.
    """
    if not file_path.lower().endswith('.pdf'):
        return go(file_path, option)

    output_file_base = os.path.splitext(file_path)[0]
    try:
        # pdftoppm writes one "<base>-<page>.png" file per page.
        subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
    except (subprocess.CalledProcessError, OSError):
        # Report the failure to the caller instead of handing the error text
        # to go() as if it were an image path. OSError covers a missing
        # pdftoppm executable.
        return "Error converting PDF to PNG"
    return go(output_file_base + "-1.png", option)
def gradio_process_file(file, option):
    """Gradio callback: store the uploaded file under uploads/ and process it.

    Args:
        file: Upload object supplied by the gr.File component (its ``name``
            attribute is read as the path of the uploaded file), or None.
        option: Document type selected in the dropdown.

    Returns:
        The extraction result from process_file(), or "No file uploaded."
        when nothing was provided.
    """
    if file is None:
        return "No file uploaded."
    upload_dir = "uploads/"
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(upload_dir, exist_ok=True)
    original_file_name = os.path.basename(file.name)
    destination_path = os.path.join(upload_dir, original_file_name)
    shutil.copy(file.name, destination_path)
    return process_file(destination_path, option)
def go(img, fileType):
    """Run ocr() on *img*, retrying with extra rotations while nothing is found.

    Args:
        img: Path of the image to analyse.
        fileType: Document type passed on to ocr().

    Returns:
        The first non-empty result, or the result of the last attempt
        (270 degrees) when every orientation came back empty.
    """
    # Both spellings of "nothing found": process() returns "" for a missed
    # IBAN, while the LLM is prompted to answer with an empty string, which
    # arrives as the literal text '""'.
    empty_results = ("", '""')
    r = ""
    for rotation in (0, 90, 180, 270):
        r = ocr(img, fileType, rotation)
        if r not in empty_results:
            break
    return r
# Gradio UI: one file upload, a document-type selector and a text output.
with gr.Blocks() as app:
    gr.Markdown("# Extração de Docs #")
    # Plain HTML button that reloads the page to start a new extraction.
    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')
    file_input = gr.File(label="Upload de PDF ou imagem")
    select_option = gr.Dropdown(
        label="Escolha",
        choices=["iban", "morada", "permanente"],
        value="iban",
        interactive=True,
    )
    output_box = gr.Textbox(label="Output", interactive=False)
    process_button = gr.Button("Processar")
    # Wire the button to the processing callback.
    process_button.click(
        gradio_process_file,
        inputs=[file_input, select_option],
        outputs=[output_box],
    )

# Start the web server and request a public share link.
app.launch(share=True)

teste.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import glob
+import os
def ocr(image_path, fileType, rotation):
    """Debug stand-in for OCR: list the page files that would be processed.

    Args:
        image_path: Base path of the page images, optionally ending in
            "-1.png" (the first page).
        fileType: Only "permanente" is handled.
        rotation: Unused; kept so the signature matches the real function.

    Returns:
        For "permanente": the base names of all "<base>*.png" files in sorted
        order, each followed by a space. For any other type: None.
    """
    if fileType != 'permanente':
        return None
    first_page_suffix = "-1.png"
    # Strip the suffix only at the end of the path; str.replace would also
    # remove it from the middle of a directory or file name.
    if image_path.endswith(first_page_suffix):
        base = image_path[:-len(first_page_suffix)]
    else:
        base = image_path
    # Escape the base so characters such as '[' in the path are literal.
    files_sorted = sorted(glob.glob(glob.escape(base) + '*.png'))
    return "".join(os.path.basename(file) + " " for file in files_sorted)
# Manual smoke test: print the page files found for the sample certificate.
x = ocr(
    "uploads/CertidaoPermanente",
    "permanente",
    0,
)
print(x)

uploads/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

uploads/compMorada-1.png ADDED Viewed

uploads/compMorada.jpg ADDED Viewed

uploads/compMorada.pdf ADDED Viewed

Binary file (462 kB). View file