Update pdf_processing.py
Browse files- pdf_processing.py +14 -2
pdf_processing.py
CHANGED
@@ -19,6 +19,8 @@ def process_pdf(pdf_path, output_folder):
|
|
19 |
if not os.path.exists(output_folder):
|
20 |
os.makedirs(output_folder)
|
21 |
|
|
|
|
|
22 |
for page_num, image in enumerate(images):
|
23 |
print(f"Processing page {page_num + 1}/{len(images)}")
|
24 |
|
@@ -41,7 +43,10 @@ def process_pdf(pdf_path, output_folder):
|
|
41 |
|
42 |
# Perform OCR on the remaining area of the image
|
43 |
remaining_text = apply_ocr_remaining_area(image_without_tables)
|
44 |
-
|
|
|
|
|
|
|
45 |
|
46 |
# Process each cropped table and save data as CSV
|
47 |
table_data_list = []
|
@@ -51,6 +56,13 @@ def process_pdf(pdf_path, output_folder):
|
|
51 |
table_data = process_cropped_table(cropped_table, structure_model, device, page_num, idx, output_folder)
|
52 |
table_data_list.append(table_data)
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
# Function to process each cropped table and save to CSV
|
55 |
def process_cropped_table(cropped_table, structure_model, device, page_num, table_index, output_folder):
|
56 |
structure_transform = transforms.Compose([
|
@@ -104,4 +116,4 @@ def get_cell_coordinates_by_row(table_data):
|
|
104 |
row_cells.append({'bbox': cell_bbox})
|
105 |
cell_coordinates.append(row_cells)
|
106 |
|
107 |
-
return cell_coordinates
|
|
|
19 |
if not os.path.exists(output_folder):
|
20 |
os.makedirs(output_folder)
|
21 |
|
22 |
+
output_files = [] # Holds the list of result files
|
23 |
+
|
24 |
for page_num, image in enumerate(images):
|
25 |
print(f"Processing page {page_num + 1}/{len(images)}")
|
26 |
|
|
|
43 |
|
44 |
# Perform OCR on the remaining area of the image
|
45 |
remaining_text = apply_ocr_remaining_area(image_without_tables)
|
46 |
+
|
47 |
+
# Save the remaining text to a file and add it to the output_files list
|
48 |
+
txt_file = save_remaining_text_to_txt(remaining_text, output_folder, page_num)
|
49 |
+
output_files.append(txt_file)
|
50 |
|
51 |
# Process each cropped table and save data as CSV
|
52 |
table_data_list = []
|
|
|
56 |
table_data = process_cropped_table(cropped_table, structure_model, device, page_num, idx, output_folder)
|
57 |
table_data_list.append(table_data)
|
58 |
|
59 |
+
# Save each table's data to CSV and add its path to the output_files list
|
60 |
+
csv_filename = os.path.join(output_folder, f'page_{page_num + 1}_table_{idx + 1}.csv')
|
61 |
+
output_files.append(csv_filename)
|
62 |
+
|
63 |
+
# Return the list of output files for Gradio
|
64 |
+
return output_files
|
65 |
+
|
66 |
# Function to process each cropped table and save to CSV
|
67 |
def process_cropped_table(cropped_table, structure_model, device, page_num, table_index, output_folder):
|
68 |
structure_transform = transforms.Compose([
|
|
|
116 |
row_cells.append({'bbox': cell_bbox})
|
117 |
cell_coordinates.append(row_cells)
|
118 |
|
119 |
+
return cell_coordinates
|