File size: 4,377 Bytes
41404b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import json
import csv
import os
from tqdm import tqdm  # 导入tqdm用于显示进度条
import re
from utils import *
import traceback


def process_json_files(csv_path, output_dir):
    # 创建输出目录
    os.makedirs(output_dir, exist_ok=True)
    json_file = open(os.path.join(output_dir, 'output1.jsonl'),
                     'w',
                     encoding='utf-8')
    try:
        # 读取CSV文件
        with open(csv_path, 'r', encoding='utf-8') as csv_file:
            csv_reader = csv.reader(csv_file)
            next(csv_reader)  # 跳过标题行

            # 使用tqdm包装csv_reader以显示进度条
            for row in tqdm(csv_reader,
                            desc="Processing JSON files",
                            unit="file"):
                json_path = row[0]  # 获取JSON文件路径
                # print('row', row)
                try:
                    # 读取JSON文件
                    with open(json_path, 'r', encoding='utf-8') as f:
                        json_data = json.load(f)
                    img_path = row[1]
                    shape = cv2.imread(img_path).shape
                    #element -> tuple: (word_text, word_bbox, normed_word_bbox)
                    #  resize_box(text_['src_word_bboxes'][i],shape)
                    doc_triplet = []
                    doc_tgt_sen_trans = []
                    doc_words_boxes_list = []
                    # 处理JSON数据
                    for key, value in json_data.items():
                        if value.get("attribute") == 'text_block':
                            for text_ in value.get('text', []):
                                combined_list = [(
                                    text_['src_words'][i],
                                    text_['src_word_bboxes'][i],
                                ) for i in range(len(text_['src_words']))]
                                doc_words_boxes_list.extend(combined_list)
                                # print(f'combined_list:{combined_list}')
                                doc_tgt_sen_trans.append(
                                    text_['tgt_text.zh-CN'])
                    processed_list = [
                        (src_w, src_w_boxes, resize_box(src_w_boxes, shape))
                        for (src_w, src_w_boxes) in doc_words_boxes_list
                    ]
                    # print(f'processed:{processed_list}')
                    sorted_tuple_list = tblr_reading_order_detector(
                        processed_list)

                    text_src_list = [atuple[0] for atuple in sorted_tuple_list]
                    layout_src_list = [
                        atuple[2] for atuple in sorted_tuple_list
                    ]
                    text_src = ' '.join(text_src_list)
                    tgt_sen_trans = ''.join(doc_tgt_sen_trans)
                    # print('text_src', text_src)
                    data_dict = {
                        "img_path": img_path,
                        "text_src": text_src,
                        "layout_src": layout_src_list,
                        "tgt_sen_trans": tgt_sen_trans
                    }
                    # print(data_dict)
                    json_line = json.dumps(data_dict, ensure_ascii=False)
                    json_file.write(json_line + '\n')

                except FileNotFoundError:
                    print(f"File not found: {json_path}")
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {json_path}")
                except KeyError as e:
                    print(f"Missing key {e} in file: {json_path}")
                except Exception as e:
                    print(f"Unexpected error processing {json_path}: {str(e)}")
                    traceback.print_exc()

    except FileNotFoundError:
        print(f"CSV file not found: {csv_path}")
    except Exception as e:
        print(f"Error reading CSV file: {str(e)}")

    print("Processing completed!")


# csv_path = '/home/zychen/hwproject/my_modeling_phase_1/dataset/output_part2.csv'  # 替换为你的CSV文件路径
csv_path = '/home/zychen/hwproject/my_modeling_phase_1/dataset/output.csv'  # 替换为你的CSV文件路径
output_dir = '/home/zychen/hwproject/my_modeling_phase_1/dataset'  # 输出目录名

process_json_files(csv_path, output_dir)