Spaces:
Configuration error
Configuration error
miguelhomepage
committed on
Upload folder using huggingface_hub
Browse files- .DS_Store +0 -0
- README.md +3 -9
- go.py +181 -0
- teste.py +18 -0
- uploads/.DS_Store +0 -0
- uploads/compMorada-1.png +0 -0
- uploads/compMorada.jpg +0 -0
- uploads/compMorada.pdf +0 -0
.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: blue
|
5 |
-
colorTo: red
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: gradio
|
3 |
+
app_file: go.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
+
sdk_version: 3.41.2
|
|
|
|
|
6 |
---
|
|
|
|
go.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#! apt install tesseract-ocr
|
2 |
+
#! apt install libtesseract-dev
|
3 |
+
#! pip install pytesseract
|
4 |
+
#! pip install Pillow
|
5 |
+
#!sudo apt-get install tesseract-ocr-por
|
6 |
+
#!pip install openai
|
7 |
+
#!sudo apt install poppler-utils
|
8 |
+
import cv2
|
9 |
+
import pytesseract
|
10 |
+
import urllib
|
11 |
+
import numpy as np
|
12 |
+
import re
|
13 |
+
import imutils
|
14 |
+
from PIL import Image
|
15 |
+
import string
|
16 |
+
import glob
|
17 |
+
import os
|
18 |
+
from openai import OpenAI
|
19 |
+
import subprocess
|
20 |
+
import gradio as gr
|
21 |
+
import shutil
|
22 |
+
|
23 |
+
def askLLM(text, fileType):
    """Extract structured fields from OCR text with an OpenAI chat model.

    Args:
        text: OCR output (Portuguese) appended to the prompt.
        fileType: ``"permanente"`` asks for the certidao permanente fields
            (access_code, firmName, firmTaxNo, address, titulares);
            ``"morada"`` asks for an individual's name and address.

    Returns:
        The model's reply as a string, with one surrounding Markdown code
        fence (```` ``` ```` or ```` ```json ````) and outer whitespace removed.

    Raises:
        ValueError: If ``fileType`` has no prompt defined.
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # Validate first so an unsupported type fails before any network setup.
    if fileType == "permanente":
        prompt = """this is a text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:""" + text
    elif fileType == "morada":
        prompt = """this is a text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:""" + text
    else:
        raise ValueError(f"Unsupported fileType for askLLM: {fileType!r}")

    # SECURITY: the API key is read from the environment and must never be
    # committed. Any key that was previously hard-coded here should be
    # treated as compromised and revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY environment variable is not set")

    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    # The content field may be None; normalise to a string.
    message_content = completion.choices[0].message.content or ""
    return _strip_code_fence(message_content)


def _strip_code_fence(reply):
    """Remove one surrounding Markdown code fence from ``reply``."""
    # str.strip("```json") would strip a *set of characters* (and so eat the
    # leading "n" of a bare ``null``); remove the exact fence markers instead.
    cleaned = reply.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned.startswith("json"):
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()
|
33 |
+
|
34 |
+
|
35 |
+
def ocr(image_path, fileType, rotation):
    """Run Tesseract OCR on a document and post-process the recognised text.

    Args:
        image_path: Path of the image to read. For ``"permanente"`` this is
            the first page PNG; the ``"-1.png"`` part is removed and every
            ``<prefix>*.png`` file is read in sorted order.
        fileType: ``"permanente"``, ``"morada"`` or ``"iban"``.
        rotation: Extra rotation in degrees applied on top of the
            orientation reported by Tesseract OSD; ``0`` applies none.
            Not used for ``"permanente"``.

    Returns:
        For ``"permanente"``, the answer of ``askLLM`` on the text of all
        pages; otherwise the result of ``process`` on the page text.

    Raises:
        ValueError: If the image bytes cannot be decoded.
    """
    if fileType == 'permanente':
        page_prefix = image_path.replace("-1.png", "")
        page_files = sorted(glob.glob(page_prefix + '*.png'))
        # Each page's text is followed by a single space separator.
        out = "".join(
            pytesseract.image_to_string(
                cv2.imread(page), lang='por', config="--psm 6"
            ) + " "
            for page in page_files
        )
        return askLLM(out, fileType)

    # Decode from raw bytes rather than cv2.imread, as the original did.
    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        raise ValueError(f"Could not decode image: {image_path}")

    rotated = imutils.rotate_bound(image, _osd_rotation(image))
    if rotation != 0:
        rotated = imutils.rotate(rotated, rotation)

    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
    return process(out, fileType)


def _osd_rotation(image):
    """Return the angle on the "Rotate:" line of Tesseract OSD, or 0.0."""
    rot_data = pytesseract.image_to_osd(image)
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    match = re.search(r'(?<=Rotate: )\d+', rot_data)
    return float(match.group(0)) if match else 0.0
|
68 |
+
|
69 |
+
def process(out, fileType):
    """Post-process OCR text according to the document type.

    Args:
        out: Text produced by OCR.
        fileType: ``"morada"`` forwards the text to ``askLLM``; ``"iban"``
            extracts a bank account identifier locally.

    Returns:
        For ``"morada"``: the ``askLLM`` answer.
        For ``"iban"``: ``"NIB"`` or ``"PT50"`` followed by up to 21
        non-whitespace ASCII characters that come after the marker, or
        ``""`` when no marker is present.
        For any other ``fileType``: ``""``.
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        return ""

    text = out.upper()

    # Checked in this order; a later marker found in the text overrides an
    # earlier one. "PT5O", "PTS0" and "PTSO" are normalised to "PT50" in the
    # output (presumably OCR confusions of 5/S and 0/O).
    markers = ("NIB", "PT50", "PT5O", "PTS0", "PTSO")
    prefix = None
    idx = -1
    for marker in markers:
        position = text.find(marker)
        if position != -1:
            prefix, idx = marker, position

    # Use "no marker" rather than "idx == 0" as the not-found test, so a
    # marker at the very start of the text is still honoured.
    if prefix is None:
        return ""

    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
    remaining_string = text[idx + len(prefix):]
    account = ''.join([char for char in remaining_string if char in visible_chars][:21])
    return ("NIB" if prefix == "NIB" else "PT50") + account
|
121 |
+
|
122 |
+
def process_file(file_path, option):
    """Convert a PDF upload to PNG when needed and run the extraction.

    Args:
        file_path: Path of the uploaded file. A ``.pdf`` (any letter case)
            is rendered to PNG with ``pdftoppm`` first; any other file is
            used as the image directly.
        option: Document type forwarded to ``go``.

    Returns:
        The result of ``go`` on the image, or the string
        ``"Error converting PDF to PNG"`` when the conversion fails.
    """
    if file_path.lower().endswith('.pdf'):
        output_file_base = os.path.splitext(file_path)[0]
        try:
            subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
        except (subprocess.CalledProcessError, OSError):
            # OSError covers a missing pdftoppm binary. Report the failure
            # instead of handing the message to go() as if it were a path.
            return "Error converting PDF to PNG"
        # NOTE(review): pdftoppm appears to zero-pad page numbers for PDFs
        # with 10+ pages ("-01.png"); confirm before relying on "-1.png".
        image_path = output_file_base + "-1.png"
    else:
        image_path = file_path
    return go(image_path, option)
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
def gradio_process_file(file, option):
    """Gradio click handler: store the upload and run the extraction.

    Args:
        file: Value of the ``gr.File`` input; ``None`` when nothing was
            uploaded, otherwise an object whose ``name`` is the path of the
            uploaded file.
        option: Document type chosen in the dropdown.

    Returns:
        ``"No file uploaded."`` when ``file`` is ``None``; otherwise the
        result of ``process_file`` on the copy placed under ``uploads/``.
    """
    if file is None:
        return "No file uploaded."

    upload_dir = "uploads/"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(upload_dir, exist_ok=True)

    # Keep only the file name so the copy always lands inside upload_dir.
    original_file_name = os.path.basename(file.name)
    destination_path = os.path.join(upload_dir, original_file_name)
    shutil.copy(file.name, destination_path)
    return process_file(destination_path, option)
|
149 |
+
|
150 |
+
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
|
155 |
+
def go(img, fileType):
    """Run ``ocr`` on ``img``, retrying with extra rotations until it yields data.

    Args:
        img: Path of the image to read.
        fileType: Document type forwarded to ``ocr``.

    Returns:
        The first non-empty ``ocr`` result among the rotations 0, 90, 180
        and 270 degrees, or the result of the last attempt when every
        attempt came back empty.
    """
    # '""' is the literal the LLM path returns for "nothing found"; "" is
    # what the local IBAN extraction returns. Both mean "try the next angle".
    empty_results = ("", '""')
    r = ""
    for rotation in (0, 90, 180, 270):
        r = ocr(img, fileType, rotation)
        if r not in empty_results:
            break
    return r
|
164 |
+
|
165 |
+
# Gradio front end: one file upload, a document-type selector and a read-only
# text box that shows whatever gradio_process_file returns.
with gr.Blocks() as app:
    gr.Markdown("# Extração de Docs #")
    # Plain HTML button that reloads the page to start a new extraction.
    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')

    file_input = gr.File(label="Upload de PDF ou imagem")
    select_option = gr.Dropdown(
        label="Escolha",
        choices=["iban", "morada", "permanente"],
        value="iban",
        interactive=True,
    )
    output_box = gr.Textbox(label="Output", interactive=False)
    process_button = gr.Button("Processar")
    process_button.click(
        gradio_process_file,
        inputs=[file_input, select_option],
        outputs=[output_box],
    )

app.launch(share=True)
|
teste.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import os
|
3 |
+
|
4 |
+
def ocr(image_path, fileType, rotation):
    """List the page PNGs that would be read for a "permanente" document.

    Debug stand-in: no OCR or LLM call is made; only the file discovery
    step runs.

    Args:
        image_path: Path of the first page PNG or the bare page prefix; a
            ``"-1.png"`` part is removed before globbing ``<prefix>*.png``.
        fileType: Only ``"permanente"`` is handled.
        rotation: Unused.

    Returns:
        The base names of the matching PNG files in sorted order, each
        followed by one space; ``""`` when nothing matches or when
        ``fileType`` is not ``"permanente"``.
    """
    if fileType != 'permanente':
        return ""
    page_prefix = image_path.replace("-1.png", "")
    page_files = sorted(glob.glob(page_prefix + '*.png'))
    return "".join(os.path.basename(page) + " " for page in page_files)
|
16 |
+
|
17 |
+
# Manual check: show which page files are picked up for the sample
# "CertidaoPermanente" upload.
x = ocr("uploads/CertidaoPermanente", "permanente", 0)
print(x)
|
uploads/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
uploads/compMorada-1.png
ADDED
uploads/compMorada.jpg
ADDED
uploads/compMorada.pdf
ADDED
Binary file (462 kB). View file
|
|