Alealejandrooo dompal1 committed on
Commit
1d47317
·
0 Parent(s):

Duplicate from LumeraDS/deathCertReader

Browse files

Co-authored-by: D P <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: DeathCertifReader
3
+ emoji: 🔥
4
+ colorFrom: pink
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 3.28.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: LumeraDS/deathCertReader
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from alessandro
2
+ import re
3
+ import cv2
4
+ import numpy as np
5
+ from paddleocr import PaddleOCR
6
+ from PIL import Image
7
+ import matplotlib.pyplot as plt
8
+ import pandas as pd
9
+ import matplotlib.pyplot as plt
10
+
11
# Shared PaddleOCR engine, created once at import time and reused by
# extract_detected_entries_pdl. 'sl' is the language code handed to PaddleOCR
# (presumably Slovenian, matching the Priimek/Ime/Datum smrti fields - confirm).
ocr = PaddleOCR(lang='sl')
12
+
13
+ # def convert_to_image(document):
14
+ # '''
15
+ # Function: converts the pdf to image
16
+ # Input: pdf document
17
+ # Output: image
18
+ # '''
19
+
20
+ # # reads PDFs
21
+ # # reads only first page of PDF documents
22
+
23
+ # # os.path.join(document.name, 'sample.pdf')
24
+ # pdf_document = load_from_file(document)
25
+ # page_1 = pdf_document.create_page(0)
26
+ # images = renderer.render_page(page_1)
27
+ # image_data = image.data
28
+ # # convert the image to numpy array
29
+ # image = np.array(images)
30
+ # # handles non-PDF formats (e.g., .tif)
31
+ # # else:
32
+ # # images = Image.open(document)
33
+ # # # convert the image to RGB
34
+ # # image = images.convert('RGB')
35
+ # # # convert the image to numpy array
36
+ # # image = np.array(image)
37
+ # # # TODO: change to dynamic scaling
38
+ # # # downscale the image
39
+ # # scale = 1.494
40
+ # # width = int(image.shape[1] / scale)
41
+ # # height = int(image.shape[0] / scale)
42
+ # # dim = (width, height)
43
+ # # image = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
44
+ # # fig, ax = plt.subplots(figsize=(15, 10))
45
+ # # ax.imshow(image, cmap = 'gray')
46
+ # return image
47
+
48
+
49
def deskew(image, model):
    '''
    Function: estimate the skew of a page with a classifier and rotate the
              page back upright
    Input: image as a 3-channel array (it is converted with COLOR_BGR2GRAY)
           and an ONNX inference session exposing a 'conv2d_input' input
    Output: tuple (deskewed image, predicted angle in degrees,
            confidence of the prediction in percent)
    '''

    # class index -> skew angle in degrees; the labels are in lexicographic
    # order ('-1', '-10', '-11', ...), which explains the unusual sequence
    class_to_angle = {0: '-1', 1: '-10', 2: '-11', 3: '-12', 4: '-13',
                      5: '-14', 6: '-15', 7: '-2', 8: '-3', 9: '-4',
                      10: '-5', 11: '-6', 12: '-7', 13: '-8', 14: '-9',
                      15: '0', 16: '1', 17: '10', 18: '11', 19: '12',
                      20: '13', 21: '14', 22: '15', 23: '180', 24: '2',
                      25: '270', 26: '3', 27: '4', 28: '5', 29: '6',
                      30: '7', 31: '8', 32: '9', 33: '90'}

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # shrink to 20% with INTER_AREA first, then to the 200x200 model input;
    # max(1, ...) keeps very small pages from yielding a zero-sized target
    small_size = (max(1, int(gray.shape[1] * 0.2)),
                  max(1, int(gray.shape[0] * 0.2)))
    small = cv2.resize(gray, small_size, interpolation=cv2.INTER_AREA)
    model_input = cv2.resize(small, (200, 200))
    # add batch and channel dimensions, then scale the pixels by 1/255
    model_input = model_input.astype('float32').reshape(1, 200, 200, 1)
    model_input = model_input / 255

    predictions = model.run(None, {'conv2d_input': model_input})
    # index of the highest-scoring class and its score as a percentage
    pred = int(predictions[0].argmax())
    angle = int(class_to_angle[pred])
    skew_confidence = predictions[0][0][pred] * 100

    # quarter turns are lossless rotations that also swap width and height
    if angle == 90:
        deskewed_image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
        return deskewed_image, angle, skew_confidence
    if angle == 270:
        deskewed_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
        return deskewed_image, angle, skew_confidence

    # every other angle: rotate about the centre by the opposite angle,
    # keeping the canvas size and replicating the border pixels
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, -angle, 1.0)
    deskewed_image = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
                                    borderMode=cv2.BORDER_REPLICATE)
    return deskewed_image, angle, skew_confidence
97
+
98
+
99
def prepare_image_to_autoencoder(image):
    '''
    Function: crop, resize and normalise a page for the autoencoder
    Input: image (_type_): deskewed image
    Output: array of shape (600, 600, 1) holding the grayscale crop
            divided by 255
    '''

    side = 600
    rows, cols = image.shape[:2]
    # fixed fractional window of the page that is fed to the autoencoder
    top, bottom = int(rows / 3.6), int(rows / 1.87)
    left, right = int(cols / 3.67), int(cols / 1.575)
    region = image[top:bottom, left:right]
    # bring the crop to the fixed square size before dropping the colour
    region = cv2.resize(region, (side, side))
    gray = cv2.cvtColor(region, cv2.COLOR_BGR2GRAY)
    # normalise and append the single channel axis
    return (gray / 255.0).reshape(side, side, 1)
118
+
119
+
120
def autoencode_ONNX(image, model):
    '''
    Function: run the denoising autoencoder on a prepared image
    Input: image reshapeable to (1, 600, 600, 1) and an ONNX inference
           session exposing an 'input_2' input
    Output: first model output, squeezed, multiplied by 255 and cast to uint8
    '''

    batch = image.astype(np.float32).reshape(1, 600, 600, 1)
    outputs = model.run(None, {'input_2': batch})
    # drop the singleton axes of the first output
    denoised = outputs[0].squeeze()
    return (denoised * 255).astype('uint8')
136
+
137
+
138
def detect_entries_ONNX(denoised, model):
    '''
    Function: detect boxes Priimek, Ime and Datum boxes
              Priimek: lastname
              Ime: firstname
              Datum smrti: date of death
    Input: denoised grayscale image and an ONNX inference session exposing
           an 'input_tensor' input
    Output: PIL image of the plotted page with the detected boxes drawn on
            it, and the list of box confidence scores (boxes sorted by ymin)
    Side effect: writes the plot to "test.jpg" in the working directory
    '''

    # the object detection model requires a tensor(1, h, w, 3)
    autoencoded_RGB = cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
    autoencoded_expanded = np.expand_dims(autoencoded_RGB, axis=0)
    detections = model.run(None, {'input_tensor': autoencoded_expanded})
    # NOTE(review): outputs 1 and 4 are taken as boxes and scores, and exactly
    # five detections are assumed by the reshapes - confirm against the model
    boxes = np.array(detections[1][0])
    confidence = np.array(detections[4]).reshape(5, 1)
    boxes_and_confidence = np.append(boxes, confidence, axis=1).reshape(5, 5)
    # sort the rows by their first column (ymin)
    boxes_and_confidence = \
        boxes_and_confidence[boxes_and_confidence[:, 0].argsort()]
    # boxes come as image fractions; scale to pixels of the 600x600 input
    # (ymin, xmin, ymax, xmax)
    boxes = boxes_and_confidence[:, :-1] * 600
    confidence_boxes = boxes_and_confidence[:, -1].tolist()

    for box in boxes:
        ymin, xmin, ymax, xmax = (int(v) for v in box)
        cv2.rectangle(autoencoded_RGB, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

    fig = plt.figure()
    try:
        plt.imshow(cv2.cvtColor(autoencoded_RGB, cv2.COLOR_BGR2RGB))
        plt.title("Detected Boxes")
        plt.savefig("test.jpg")
    finally:
        # release the figure so repeated requests do not accumulate figures
        plt.close(fig)
    # cv2.imread yields BGR; PIL expects RGB, so swap the channels first
    img = cv2.cvtColor(cv2.imread("test.jpg"), cv2.COLOR_BGR2RGB)
    return Image.fromarray(img), confidence_boxes
179
+
180
def extract_detected_entries_pdl(image):
    '''
    Function: run the shared PaddleOCR engine on an image and tabulate the
              recognised lines
    Input: image to read
    Output: DataFrame with columns "Text", "Score" and "Boundary Box", one
            row per recognised line (no rows when nothing is recognised)
    '''

    result = ocr.ocr(image, cls=False)
    # result[0] holds the lines of the image; treat a missing/None page as empty
    lines = result[0] if result and result[0] else []

    txt = []
    scores = []
    boxes = []
    for line in lines:
        # each line is (box, (text, score))
        txt.append(cleanString_basic(line[-1][0]))
        scores.append(line[-1][1])
        boxes.append(line[0])

    # build the frame column by column: stacking strings, floats and nested
    # box lists into one NumPy array is ragged and rejected by recent NumPy
    return pd.DataFrame({"Text": txt, "Score": scores, "Boundary Box": boxes},
                        columns=["Text", "Score", "Boundary Box"])
197
+
198
def cleanString_basic(word):
    '''
    Function: replace every "$" in a word with "s"
    Input: word (string)
    Output: word with the replacements applied
    '''
    return word.replace("$", "s")
201
+
202
def clean_string_start(string: 'str'):
    '''
    Function: strip one unwanted character from the start of a string
    Input: string
    Output: tuple (string without that character, the removed character,
            or "√" when nothing was removed)
    '''
    unwanted = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    if not string.startswith(unwanted):
        return string, "√"
    return string[1:], string[0]
210
+
211
def clean_string_end(string: 'str'):
    '''
    Function: strip one unwanted character from the end of a string
    Input: string
    Output: tuple (string without that character, the removed character,
            or "√" when nothing was removed)
    '''
    unwanted = ('!', "'", '[', ']', '*', '|', '.', ':', '\\', '/')
    if not string.endswith(unwanted):
        return string, "√"
    return string[:-1], string[-1]
219
+
220
def clean_dates(date: 'str'):
    '''
    Function: cleans the field "datum smrti" and reports the chars removed
    Input: date (string format)
    Output: tuple (date without ASCII letters, "!", "[" and "|",
            list of the removed chars, or "Y" when none were found)
    '''
    unwanted = re.compile(r'[a-zA-Z!\[\|]')
    found = unwanted.findall(date)
    cleaned = unwanted.sub('', date)
    return cleaned, (found if found else "Y")
235
+
236
def regex_string(string):
    '''
    Function: swaps the characters with the "hat" (caron) with the regular ones
    Input: string
    Output: string with Č, č, Š, š, Ž, ž replaced by C, c, S, s, Z, z
    '''
    # one translation pass instead of a replace() per matching character;
    # the table name also no longer shadows the builtin map()
    caron_to_plain = str.maketrans({'Č': 'C',
                                    'č': 'c',
                                    'Š': 'S',
                                    'š': 's',
                                    'Ž': 'Z',
                                    'ž': 'z'})
    return string.translate(caron_to_plain)
252
+
253
+ import onnxruntime
254
+
255
def pdf_deskew_gr(document):
    '''
    Function: convert a document to an image and deskew it
    Input: document passed to convert_to_image
    Output: tuple (deskewed image, angle, skew confidence) from deskew
    '''
    # NOTE(review): convert_to_image only appears as commented-out code in
    # the visible source - confirm it is defined before this is called.
    page = convert_to_image(document)
    session = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    return deskew(page, session)
260
+
261
def pdf_clean_gr(document):
    '''
    Function: convert a document to an image, deskew it and denoise it
    Input: document passed to convert_to_image
    Output: denoised image returned by autoencode_ONNX
    '''
    # NOTE(review): convert_to_image only appears as commented-out code in
    # the visible source - confirm it is defined before this is called.
    img = convert_to_image(document)
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(img, model)
    # feed the deskewed page (not the raw one) to the autoencoder, as
    # pdf_extract_gr does; otherwise the deskew step has no effect
    img = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(img, model)
    return img
269
+
270
def pdf_resnet_gr(document):
    '''
    Function: convert a document to an image, deskew and denoise it, then
              run the box detector on the result
    Input: document passed to convert_to_image
    Output: tuple returned by detect_entries_ONNX (image with the detected
            boxes drawn, list of box confidences)
    '''
    # NOTE(review): convert_to_image only appears as commented-out code in
    # the visible source - confirm it is defined before this is called.
    img = convert_to_image(document)
    # load the models from the repository's ./models folder, like the
    # sibling functions, instead of a machine-specific Colab Drive path
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(img, model)
    # feed the deskewed page (not the raw one) to the autoencoder
    img = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(img, model)
    model = onnxruntime.InferenceSession("./models/ResNet_od_v0.0.2.onnx")
    boxes, confidence_boxes = detect_entries_ONNX(img, model)
    return boxes, confidence_boxes
280
+
281
def _text_and_confidence(df, position):
    '''Return (text, "NN.NNN%") for the OCR row at position, or ("", "") when that row is missing.'''
    if position >= len(df):
        return "", ""
    row = df.iloc[position]
    confidence = round(float(row.iloc[1]) * 100, 3)
    return row.iloc[0], f"{confidence}%"


def pdf_extract_gr(extractimg):
    '''
    Function: full extraction pipeline behind the "Extract" button:
              deskew -> crop/normalise -> denoise -> OCR
    Input: uploaded image (anything np.array can turn into an image array)
    Output: tuple (OCR DataFrame, deskewed image, skew angle, skew
            confidence, denoised image, first name, first name confidence,
            surname, surname confidence, date of death, date of death
            confidence); text fields are "" when OCR found too few lines
    Raises: ValueError when no image was provided
    '''
    if extractimg is None:
        raise ValueError("No image provided: upload a death certificate image first.")

    extractimg = np.array(extractimg)
    model = onnxruntime.InferenceSession("./models/CNN_deskew_v0.0.2.onnx")
    deskewed_image, angle, skew_confidence = deskew(extractimg, model)
    cleanimg = prepare_image_to_autoencoder(deskewed_image)
    model = onnxruntime.InferenceSession("./models/autoencoder_denoise_v0.0.2.onnx")
    img = autoencode_ONNX(cleanimg, model)

    df = extract_detected_entries_pdl(img)

    # NOTE(review): rows 0, 1 and 2 are taken as first name, surname and
    # date of death - assumes OCR returns the lines in that order; confirm
    firstname, firstnameconfidence = _text_and_confidence(df, 0)
    surname, surnameconfidence = _text_and_confidence(df, 1)
    dodname, dodconfidence = _text_and_confidence(df, 2)

    return (df, deskewed_image, angle, skew_confidence, img,
            firstname, firstnameconfidence,
            surname, surnameconfidence,
            dodname, dodconfidence)
311
+
312
# Custom stylesheet handed to gr.Blocks(css=...); the selectors match the
# elem_classes values assigned to the components of the UI.
css = """
.run_container {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 10px;
}

.run_btn {
  margin: auto;
  width: 50%;
  display: flex;
}
.upload_cell {
  margin: auto;
  display: flex;
}

.results_container {
  display: flex;
  justify-content: space-evenly;
}

.results_cell {

}

"""
340
+
341
+ import gradio as gr
342
+
343
# Gradio UI: one image input, an "Extract" button, and result widgets that
# receive the eleven values returned by pdf_extract_gr (in the same order).
# NOTE(review): the nesting of the layout containers below was inferred -
# confirm it against the deployed app.
with gr.Blocks(css = css) as demo:
    gr.Markdown("""
    # Death Certificate Extraction
    """, elem_classes = "h1")
    # NOTE(review): the input below is gr.Image, so a PDF cannot actually
    # be uploaded - confirm whether this caption should say "image".
    gr.Markdown("Upload a PDF, extract data")
    with gr.Box(elem_classes = "run_container"):
        # gr.Button takes its visible caption from `value`; with `label`
        # the button keeps the default caption
        ExtractButton = gr.Button(value = "Extract", elem_classes="run_btn")
    with gr.Row(elem_id = "hide"):
        with gr.Column():
            ExtractInput = gr.Image()
        with gr.Column():
            with gr.Row(elem_classes = "results_container"):
                FirstNameBox = gr.Textbox(label = "First Name", elem_classes = "results_cell")
                FirstNameConfidenceBox = gr.Textbox(label = "First Name Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                SurnameNameBox = gr.Textbox(label = "Surname", elem_classes = "results_cell")
                SurnameNameConfidenceBox = gr.Textbox(label = "Surname Confidence", elem_classes = "results_cell")
            with gr.Row(elem_classes = "results_container"):
                DODBox = gr.Textbox(label = "Date of Death", elem_classes = "results_cell")
                DODConfidenceBox = gr.Textbox(label = "Date of Death Confidence", elem_classes = "results_cell")

    with gr.Accordion("Full Results", open = False):
        ExtractDF = gr.Dataframe(label = "Results")

    with gr.Accordion("Clean Image", open = False):
        CleanOutput = gr.Image()

    with gr.Accordion("Deskew", open = False):
        DeskewOutput = gr.Image()
        with gr.Column():
            DeskewAngle = gr.Number(label = "Angle")
        with gr.Column():
            DeskewConfidence = gr.Number(label = "Confidence")

    # the outputs list must stay in the order of pdf_extract_gr's return tuple
    ExtractButton.click(fn=pdf_extract_gr,
                        inputs = ExtractInput,
                        outputs = [ExtractDF, DeskewOutput, DeskewAngle,
                                   DeskewConfidence, CleanOutput, FirstNameBox,
                                   FirstNameConfidenceBox, SurnameNameBox,
                                   SurnameNameConfidenceBox, DODBox, DODConfidenceBox])

demo.launch(show_api=True, share=False, debug=True)
models/CNN_deskew_v0.0.2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cb73b87df7c3aff0b1a8237e8d839fbb7d1ba80c6ea95f6b21782bf7ba02eb0
3
+ size 444268
models/ResNet_od_v0.0.2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:120ba5866d26033b936a7f814f3c96ad62fa1a8cadbeb6a11bb5401d41966161
3
+ size 204978340
models/autoencoder_denoise_v0.0.2.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b6a595e3ca6c0bb6fcda28022c436bd92579779f3ab5af58f1eb0bc904df44
3
+ size 607567
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ onnxruntime==1.12.1
2
+ opencv-contrib-python==4.6.0.66
3
+ opencv-python==4.6.0.66
4
+ paddle-bfloat==0.1.7
5
+ paddleocr==2.6.1.3
6
+ paddlepaddle==2.4.2
7
+ pandas==1.3.5
8
+ pdf2image==1.16.2
9
+ Pillow==9.3.0