|
import json |
|
import csv |
|
import os |
|
from tqdm import tqdm |
|
import re |
|
from utils import * |
|
import traceback |
|
|
|
|
|
def _collect_text_blocks(json_data):
    """Gather word/box pairs and target sentences from every text block.

    Only top-level values whose ``"attribute"`` equals ``'text_block'`` are
    considered; each entry of their ``'text'`` list contributes its
    ``'src_words'`` paired with ``'src_word_bboxes'`` and its
    ``'tgt_text.zh-CN'`` string.

    Args:
        json_data: Parsed annotation JSON (a dict of dicts).

    Returns:
        A ``(words_boxes, translations)`` tuple: a list of
        ``(word, bbox)`` tuples and a list of target-language strings, both
        in the iteration order of ``json_data``.

    Raises:
        KeyError: If a text entry lacks one of the required keys.
        ValueError: If a text entry has fewer boxes than words.
    """
    words_boxes = []
    translations = []
    for value in json_data.values():
        if value.get("attribute") != 'text_block':
            continue
        for text_ in value.get('text', []):
            src_words = text_['src_words']
            src_word_bboxes = text_['src_word_bboxes']
            # zip() would silently drop words without a box; fail loudly
            # instead so the offending file is reported and skipped.
            if len(src_word_bboxes) < len(src_words):
                raise ValueError(
                    f"{len(src_words)} src_words but only "
                    f"{len(src_word_bboxes)} src_word_bboxes")
            words_boxes.extend(zip(src_words, src_word_bboxes))
            translations.append(text_['tgt_text.zh-CN'])
    return words_boxes, translations


def _build_record(json_path, img_path):
    """Build one output record from an annotation JSON and its image.

    NOTE(review): ``cv2``, ``resize_box`` and ``tblr_reading_order_detector``
    are not imported by name in this file; presumably they are provided by
    ``from utils import *`` -- confirm.

    Args:
        json_path: Path of the annotation JSON file.
        img_path: Path of the image the annotation belongs to.

    Returns:
        A dict with the keys ``img_path``, ``text_src``, ``layout_src`` and
        ``tgt_sen_trans``.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
        json.JSONDecodeError: If ``json_path`` is not valid JSON.
        KeyError: If a text entry lacks a required key.
        ValueError: If the image cannot be read or words/boxes mismatch.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # cv2.imread signals failure by returning None rather than raising.
    image = cv2.imread(img_path)
    if image is None:
        raise ValueError(f"Could not read image: {img_path}")
    shape = image.shape

    words_boxes, translations = _collect_text_blocks(json_data)

    # Each tuple keeps the word, its original box, and the box as returned
    # by resize_box for this image's shape.
    processed_list = [
        (src_w, src_w_boxes, resize_box(src_w_boxes, shape))
        for (src_w, src_w_boxes) in words_boxes
    ]
    sorted_tuple_list = tblr_reading_order_detector(processed_list)

    return {
        "img_path": img_path,
        "text_src": ' '.join(atuple[0] for atuple in sorted_tuple_list),
        "layout_src": [atuple[2] for atuple in sorted_tuple_list],
        "tgt_sen_trans": ''.join(translations),
    }


def process_json_files(csv_path, output_dir):
    """Convert the annotation files listed in a CSV into one JSONL file.

    The CSV's first row is treated as a header and skipped. In every other
    row, column 0 is the annotation JSON path and column 1 the image path.
    One JSON line per successfully processed row is written to
    ``<output_dir>/output1.jsonl`` (the file is truncated on every call).
    Per-file failures are printed and the row is skipped; a missing or
    unreadable CSV is printed as well. Nothing is raised for those cases.

    Args:
        csv_path: Path of the CSV listing ``json_path, img_path`` pairs.
        output_dir: Directory for ``output1.jsonl``; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'output1.jsonl')

    # The context manager guarantees the output is flushed and closed even
    # when processing stops early.
    with open(output_path, 'w', encoding='utf-8') as json_file:
        try:
            # newline='' is what the csv module expects so that quoted
            # fields containing line breaks are parsed correctly.
            with open(csv_path, 'r', encoding='utf-8',
                      newline='') as csv_file:
                csv_reader = csv.reader(csv_file)
                # Skip the header; the default keeps an empty CSV from
                # raising StopIteration.
                next(csv_reader, None)

                for row in tqdm(csv_reader,
                                desc="Processing JSON files",
                                unit="file"):
                    # Blank or short rows must not abort the whole run.
                    if len(row) < 2:
                        print(f"Skipping malformed CSV row: {row}")
                        continue
                    json_path, img_path = row[0], row[1]

                    try:
                        data_dict = _build_record(json_path, img_path)
                        json_line = json.dumps(data_dict,
                                               ensure_ascii=False)
                        json_file.write(json_line + '\n')
                    except FileNotFoundError:
                        print(f"File not found: {json_path}")
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON in file: {json_path}")
                    except KeyError as e:
                        print(f"Missing key {e} in file: {json_path}")
                    except Exception as e:
                        # Best-effort batch job: report and move on.
                        print(f"Unexpected error processing {json_path}: "
                              f"{str(e)}")
                        traceback.print_exc()

        except FileNotFoundError:
            print(f"CSV file not found: {csv_path}")
        except Exception as e:
            print(f"Error reading CSV file: {str(e)}")

    print("Processing completed!")
|
|
|
|
|
|
|
# Script driver: runs the conversion on the fixed dataset location every
# time this module is executed or imported.
output_dir = '/home/zychen/hwproject/my_modeling_phase_1/dataset'
csv_path = '/home/zychen/hwproject/my_modeling_phase_1/dataset/output.csv'

process_json_files(csv_path=csv_path, output_dir=output_dir)
|
|