miguelhomepage committed on
Commit
2027574
·
verified ·
1 Parent(s): 36f7ac0

Upload folder using huggingface_hub

Browse files
.DS_Store ADDED
Binary file (8.2 kB). View file
 
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Gradio
3
- emoji: 🐠
4
- colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.44.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: gradio
3
+ app_file: go.py
 
 
4
  sdk: gradio
5
+ sdk_version: 3.41.2
 
 
6
  ---
 
 
go.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! apt install tesseract-ocr
2
+ #! apt install libtesseract-dev
3
+ #! pip install pytesseract
4
+ #! pip install Pillow
5
+ #!sudo apt-get install tesseract-ocr-por
6
+ #!pip install openai
7
+ #!sudo apt install poppler-utils
8
+ import cv2
9
+ import pytesseract
10
+ import urllib
11
+ import numpy as np
12
+ import re
13
+ import imutils
14
+ from PIL import Image
15
+ import string
16
+ import glob
17
+ import os
18
+ from openai import OpenAI
19
+ import subprocess
20
+ import gradio as gr
21
+ import shutil
22
+
23
def askLLM(text,fileType):
    """Ask the OpenAI chat model to extract structured fields from OCR text.

    Args:
        text: OCR output (Portuguese) appended verbatim to the prompt.
        fileType: "permanente" (company registry certificate) or "morada"
            (proof of address); selects which extraction prompt is sent.

    Returns:
        The model's reply with any surrounding Markdown code fence removed.

    Raises:
        ValueError: if ``fileType`` has no associated prompt.
    """
    prompts = {
        "permanente": """this is a text written in portuguese.Please extract the access_code,firmName,firmTaxNo,address and titulares.'titulares' are the name of the partners.If there are multiple partners do not create an array, concatenate it with a comma.Give me just the certidao permanente information with nothing else in json format.If there is a field that is not found the value should be null. text:""",
        "morada": """this is a text written in portuguese.Please extract name and address from the text but only when they refer to an individual, not an organization.give the result as a json with the keys 'name' and 'address' and nothing else. if you cant find anything with meaning return an empty string and nothing else. text:""",
    }
    if fileType not in prompts:
        # Previously this fell through and failed on an unbound `prompt`.
        raise ValueError(f"Unsupported fileType for askLLM: {fileType!r}")
    prompt = prompts[fileType] + text

    # SECURITY: the API key must never be committed to source control.
    # It is read from the OPENAI_API_KEY environment variable instead.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    message_content = completion.choices[0].message.content or ""
    return _strip_code_fence(message_content)


def _strip_code_fence(reply):
    """Return ``reply`` without a wrapping ``` or ```json fence, trimmed."""
    cleaned = reply.strip()
    # str.strip("```json") would remove any of the characters ` j s o n from
    # both ends (e.g. turning "null" into "ull"), so match the fence itself.
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", cleaned, re.DOTALL)
    if fenced:
        return fenced.group(1).strip()
    return cleaned
33
+
34
+
35
def ocr(image_path,fileType,rotation):
    """Run Tesseract OCR on an uploaded document and post-process the text.

    Args:
        image_path: path of the image to read. For "permanente" it is the
            first page image ("<base>-1.png"); every PNG sharing that base
            name is read as a page of the same document.
        fileType: "permanente", "morada" or "iban".
        rotation: extra rotation in degrees applied after the automatic
            orientation correction (ignored for "permanente").

    Returns:
        For "permanente", the result of ``askLLM`` on the text of all pages;
        otherwise the result of ``process`` on the text of the single image.

    Raises:
        ValueError: if the file at ``image_path`` cannot be decoded as an image.
    """
    if fileType == 'permanente':
        # Drop the first-page suffix so the glob picks up every page image.
        base_path = image_path.replace("-1.png", "")
        page_files = sorted(glob.glob(base_path + '*.png'))
        out = ""
        for page_file in page_files:
            page = cv2.imread(page_file)
            out = out + pytesseract.image_to_string(page, lang='por', config="--psm 6") + " "
        return askLLM(out, fileType)

    with open(image_path, 'rb') as image_file:
        image_data = image_file.read()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals undecodable data by returning None.
        raise ValueError(f"Could not decode image: {image_path}")

    # Use Tesseract's orientation detection to find the upright rotation.
    rot_data = pytesseract.image_to_osd(image)
    rotate_match = re.search(r'(?<=Rotate: )\d+', rot_data)
    # Fall back to no rotation when the OSD report has no "Rotate:" entry.
    angle = float(rotate_match.group(0)) if rotate_match else 0.0
    rotated = imutils.rotate_bound(image, angle)
    if rotation != 0:
        rotated = imutils.rotate(rotated, rotation)

    out = pytesseract.image_to_string(rotated, lang='por', config="--psm 6")
    return process(out, fileType)
68
+
69
def process(out,fileType):
    """Turn raw OCR text into the value shown to the user.

    Args:
        out: text returned by Tesseract.
        fileType: "morada" delegates to ``askLLM``; "iban" extracts the
            account number locally.

    Returns:
        For "morada", the ``askLLM`` reply. For "iban", "PT50" or "NIB"
        followed by the next 21 visible characters after the marker, or ""
        when no marker is present. Any other ``fileType`` yields "".
    """
    if fileType == "morada":
        return askLLM(out, fileType)
    if fileType != "iban":
        return ""

    text = out.upper()
    # "PT5O", "PTS0" and "PTSO" are common OCR misreadings of "PT50".
    # Markers later in this tuple take precedence when several are present.
    markers = ("NIB", "PT50", "PT5O", "PTS0", "PTSO")
    prefix = None
    start = -1
    for marker in markers:
        position = text.find(marker)
        if position != -1:
            prefix, start = marker, position

    # Track "found" explicitly: a marker at offset 0 is a valid match and
    # must not be confused with "nothing found".
    if prefix is None:
        return ""

    visible_chars = set(string.ascii_letters + string.digits + string.punctuation)
    tail = text[start + len(prefix):]
    # Drop whitespace/non-printable characters and keep the 21 that follow.
    account = ''.join([char for char in tail if char in visible_chars][:21])
    label = "NIB" if prefix == "NIB" else "PT50"
    return label + account
121
+
122
def process_file(file_path,option):
    """Prepare an uploaded file for OCR and return the extraction result.

    PDFs are first rendered to PNG pages with the ``pdftoppm`` command-line
    tool; other files are passed to ``go`` unchanged.

    Args:
        file_path: path of the uploaded PDF or image.
        option: document type forwarded to ``go`` ("iban", "morada" or
            "permanente").

    Returns:
        The value returned by ``go``, or the string
        "Error converting PDF to PNG" when the PDF could not be rendered.
    """
    if not file_path.lower().endswith('.pdf'):
        return go(file_path, option)

    output_file_base = os.path.splitext(file_path)[0]
    try:
        subprocess.run(['pdftoppm', '-png', file_path, output_file_base], check=True)
    except (subprocess.CalledProcessError, OSError):
        # Report the failure instead of handing the message to go() as if it
        # were an image path. OSError covers a missing pdftoppm binary.
        return "Error converting PDF to PNG"

    # NOTE(review): assumes pdftoppm names the first page "<base>-1.png";
    # confirm the naming for documents with ten or more pages.
    first_page = output_file_base + "-1.png"
    return go(first_page, option)
136
+
137
+
138
+
139
def gradio_process_file(file,option):
    """Gradio click handler: store the upload locally and process it.

    Args:
        file: uploaded file object from the Gradio File component (its
            ``name`` attribute is used as the source path), or None.
        option: document type selected in the dropdown.

    Returns:
        "No file uploaded." when nothing was uploaded, otherwise the result
        of ``process_file`` on the stored copy.
    """
    if file is None:
        return "No file uploaded."

    upload_dir = "uploads/"
    if not os.path.exists(upload_dir):
        os.makedirs(upload_dir)

    # Keep only the file name so the copy always lands inside upload_dir.
    stored_path = os.path.join(upload_dir, os.path.basename(file.name))
    shutil.copy(file.name, stored_path)
    return process_file(stored_path, option)
149
+
150
+
151
+
152
+
153
+
154
+
155
def go(img,fileType):
    """Run OCR, retrying with extra rotations while the result is '""'.

    Args:
        img: path of the image to process.
        fileType: document type forwarded to ``ocr``.

    Returns:
        The first ``ocr`` result that is not the two-character string '""',
        or the result of the final attempt at 270 degrees.
    """
    result = ocr(img, fileType, 0)
    # NOTE(review): the retry sentinel is the literal '""', not an empty
    # string; confirm this is the intended "nothing found" marker.
    for extra_rotation in (90, 180, 270):
        if result != '""':
            break
        result = ocr(img, fileType, extra_rotation)
    return result
164
+
165
# Gradio front end: upload a PDF/image, choose the document type, and show
# the extracted text. Component creation order defines the page layout.
with gr.Blocks() as app:
    gr.Markdown("# Extração de Docs #")
    # Plain HTML button that reloads the page to start a new extraction.
    gr.HTML('<button onclick="window.location.reload()">Nova extração</button>')

    file_input = gr.File(label="Upload de PDF ou imagem")
    select_option = gr.Dropdown(label="Escolha", choices=["iban", "morada", "permanente"], value="iban", interactive=True)
    output_box = gr.Textbox(label="Output", interactive=False)
    process_button = gr.Button("Processar")
    process_button.click(gradio_process_file, inputs=[file_input, select_option], outputs=[output_box])

app.launch(share=True)
teste.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+
4
def ocr(image_path,fileType,rotation):
    """List the page images that the "permanente" OCR path would read.

    Debug stand-in for the real OCR routine: instead of recognising text it
    returns the base names of the matching PNG files.

    Args:
        image_path: first-page path or base path of the document images.
        fileType: only 'permanente' produces a result.
        rotation: unused here.

    Returns:
        The sorted file base names, each followed by a single space.
    """
    if fileType == 'permanente':
        prefix = image_path.replace("-1.png", "")
        matches = glob.glob(os.path.join(prefix + '*.png'))
        matches.sort()
        return "".join(os.path.basename(match) + " " for match in matches)
16
+
17
# Manual check: print the page files found for the sample certificate.
x = ocr(
    "uploads/CertidaoPermanente",
    "permanente",
    0,
)
print(x)
uploads/.DS_Store ADDED
Binary file (6.15 kB). View file
 
uploads/compMorada-1.png ADDED
uploads/compMorada.jpg ADDED
uploads/compMorada.pdf ADDED
Binary file (462 kB). View file