bacngv committed on
Commit
504dbd0
·
verified ·
1 Parent(s): bfece13

Update pdf_processing.py

Browse files
Files changed (1) hide show
  1. pdf_processing.py +14 -2
pdf_processing.py CHANGED
@@ -19,6 +19,8 @@ def process_pdf(pdf_path, output_folder):
19
  if not os.path.exists(output_folder):
20
  os.makedirs(output_folder)
21
 
 
 
22
  for page_num, image in enumerate(images):
23
  print(f"Processing page {page_num + 1}/{len(images)}")
24
 
@@ -41,7 +43,10 @@ def process_pdf(pdf_path, output_folder):
41
 
42
  # Perform OCR on the remaining area of the image
43
  remaining_text = apply_ocr_remaining_area(image_without_tables)
44
- save_remaining_text_to_txt(remaining_text, output_folder, page_num)
 
 
 
45
 
46
  # Process each cropped table and save data as CSV
47
  table_data_list = []
@@ -51,6 +56,13 @@ def process_pdf(pdf_path, output_folder):
51
  table_data = process_cropped_table(cropped_table, structure_model, device, page_num, idx, output_folder)
52
  table_data_list.append(table_data)
53
 
 
 
 
 
 
 
 
54
  # Function to process each cropped table and save to CSV
55
  def process_cropped_table(cropped_table, structure_model, device, page_num, table_index, output_folder):
56
  structure_transform = transforms.Compose([
@@ -104,4 +116,4 @@ def get_cell_coordinates_by_row(table_data):
104
  row_cells.append({'bbox': cell_bbox})
105
  cell_coordinates.append(row_cells)
106
 
107
- return cell_coordinates
 
19
  if not os.path.exists(output_folder):
20
  os.makedirs(output_folder)
21
 
22
+ output_files = [] # Dùng để lưu danh sách file kết quả
23
+
24
  for page_num, image in enumerate(images):
25
  print(f"Processing page {page_num + 1}/{len(images)}")
26
 
 
43
 
44
  # Perform OCR on the remaining area of the image
45
  remaining_text = apply_ocr_remaining_area(image_without_tables)
46
+
47
+ # Save the remaining text to a file and thêm vào danh sách output_files
48
+ txt_file = save_remaining_text_to_txt(remaining_text, output_folder, page_num)
49
+ output_files.append(txt_file)
50
 
51
  # Process each cropped table and save data as CSV
52
  table_data_list = []
 
56
  table_data = process_cropped_table(cropped_table, structure_model, device, page_num, idx, output_folder)
57
  table_data_list.append(table_data)
58
 
59
+ # Save each table data to CSV and thêm vào danh sách output_files
60
+ csv_filename = os.path.join(output_folder, f'page_{page_num + 1}_table_{idx + 1}.csv')
61
+ output_files.append(csv_filename)
62
+
63
+ # Trả về danh sách các file output cho Gradio
64
+ return output_files
65
+
66
  # Function to process each cropped table and save to CSV
67
  def process_cropped_table(cropped_table, structure_model, device, page_num, table_index, output_folder):
68
  structure_transform = transforms.Compose([
 
116
  row_cells.append({'bbox': cell_bbox})
117
  cell_coordinates.append(row_cells)
118
 
119
+ return cell_coordinates