test_upload / make_jsonl.py
Chen42's picture
Upload folder using huggingface_hub
41404b8 verified
raw
history blame
4.38 kB
import json
import csv
import os
from tqdm import tqdm # 导入tqdm用于显示进度条
import re
from utils import *
import traceback
def process_json_files(csv_path, output_dir):
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
json_file = open(os.path.join(output_dir, 'output1.jsonl'),
'w',
encoding='utf-8')
try:
# 读取CSV文件
with open(csv_path, 'r', encoding='utf-8') as csv_file:
csv_reader = csv.reader(csv_file)
next(csv_reader) # 跳过标题行
# 使用tqdm包装csv_reader以显示进度条
for row in tqdm(csv_reader,
desc="Processing JSON files",
unit="file"):
json_path = row[0] # 获取JSON文件路径
# print('row', row)
try:
# 读取JSON文件
with open(json_path, 'r', encoding='utf-8') as f:
json_data = json.load(f)
img_path = row[1]
shape = cv2.imread(img_path).shape
#element -> tuple: (word_text, word_bbox, normed_word_bbox)
# resize_box(text_['src_word_bboxes'][i],shape)
doc_triplet = []
doc_tgt_sen_trans = []
doc_words_boxes_list = []
# 处理JSON数据
for key, value in json_data.items():
if value.get("attribute") == 'text_block':
for text_ in value.get('text', []):
combined_list = [(
text_['src_words'][i],
text_['src_word_bboxes'][i],
) for i in range(len(text_['src_words']))]
doc_words_boxes_list.extend(combined_list)
# print(f'combined_list:{combined_list}')
doc_tgt_sen_trans.append(
text_['tgt_text.zh-CN'])
processed_list = [
(src_w, src_w_boxes, resize_box(src_w_boxes, shape))
for (src_w, src_w_boxes) in doc_words_boxes_list
]
# print(f'processed:{processed_list}')
sorted_tuple_list = tblr_reading_order_detector(
processed_list)
text_src_list = [atuple[0] for atuple in sorted_tuple_list]
layout_src_list = [
atuple[2] for atuple in sorted_tuple_list
]
text_src = ' '.join(text_src_list)
tgt_sen_trans = ''.join(doc_tgt_sen_trans)
# print('text_src', text_src)
data_dict = {
"img_path": img_path,
"text_src": text_src,
"layout_src": layout_src_list,
"tgt_sen_trans": tgt_sen_trans
}
# print(data_dict)
json_line = json.dumps(data_dict, ensure_ascii=False)
json_file.write(json_line + '\n')
except FileNotFoundError:
print(f"File not found: {json_path}")
except json.JSONDecodeError:
print(f"Error decoding JSON in file: {json_path}")
except KeyError as e:
print(f"Missing key {e} in file: {json_path}")
except Exception as e:
print(f"Unexpected error processing {json_path}: {str(e)}")
traceback.print_exc()
except FileNotFoundError:
print(f"CSV file not found: {csv_path}")
except Exception as e:
print(f"Error reading CSV file: {str(e)}")
print("Processing completed!")
# csv_path = '/home/zychen/hwproject/my_modeling_phase_1/dataset/output_part2.csv' # 替换为你的CSV文件路径
csv_path = '/home/zychen/hwproject/my_modeling_phase_1/dataset/output.csv' # 替换为你的CSV文件路径
output_dir = '/home/zychen/hwproject/my_modeling_phase_1/dataset' # 输出目录名
process_json_files(csv_path, output_dir)