File size: 7,053 Bytes
93cc042 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import os
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import random
import concurrent.futures
from moviepy.editor import *
import os
import cv2
import concurrent.futures
def extract_frame(video_path):
    """Save one of the final frames of a video as a JPEG beside the video.

    The last frame is tried first. If it cannot be read, the second-to-last
    frame is tried instead. The first frame that decodes is handed to
    ``save_frame``; nothing is saved when neither frame can be read.

    Args:
        video_path: Path of the video file to read.
    """
    # Open the video file.
    video_capture = cv2.VideoCapture(video_path)
    try:
        frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Offset 1 is the last frame, offset 2 the second-to-last frame.
        for offset in (1, 2):
            frame_index = frame_count - offset
            if frame_index < 0:
                # The video has fewer frames than the requested offset.
                break
            # Seek to the candidate frame and decode it.
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            success, frame = video_capture.read()
            if success:
                save_frame(video_path, frame)
                break
    finally:
        # Always release the video file, even if decoding or saving fails.
        video_capture.release()
def save_frame(video_path, frame):
    """Write ``frame`` as ``<video name>.jpg`` in the video's directory.

    An existing image is never overwritten. The success message is printed
    only when OpenCV reports that the file was actually written.

    Args:
        video_path: Path of the source video; determines the image path.
        frame: Image array to encode (as returned by ``VideoCapture.read``).
    """
    # Video file name without its extension.
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    # The image is stored next to the video.
    image_path = os.path.join(os.path.dirname(video_path), f"{video_name}.jpg")
    # Keep an image that already exists.
    if os.path.exists(image_path):
        return
    # cv2.imwrite signals failure through its return value, not an exception.
    if cv2.imwrite(image_path, frame):
        print(f"保存最后一帧为 {image_path}")
    else:
        print(f"Failed to write frame to {image_path}")
def process_video(file_path):
    """Extract the audio track of a video into a file beside the video.

    The output shares the video's base name and gets a randomly chosen
    ``.mp3`` or ``.wav`` extension. Nothing is written when the video has no
    audio track or when either audio file already exists. Any error is
    reported on stdout and swallowed, so one bad file does not stop a batch.

    Args:
        file_path: Path of the video file to process.
    """
    try:
        # Load the video file.
        clip = VideoFileClip(file_path)
        try:
            # Check whether the video contains audio at all.
            audio = clip.audio
            if audio is None:
                return
            # Save in a randomly chosen format, same name and folder as the video.
            base_path = os.path.splitext(file_path)[0]
            audio_format = random.choice(["mp3", "wav"])
            audio_file_path = f"{base_path}.{audio_format}"
            audio_file_wav = base_path + '.wav'
            audio_file_mp3 = base_path + '.mp3'
            if not os.path.exists(audio_file_wav) and not os.path.exists(audio_file_mp3):
                audio.write_audiofile(audio_file_path)
            else:
                print(f"file {audio_file_path} exit.")
        finally:
            # Release the readers held by the clip on every path.
            clip.close()
    except Exception as e:
        # Best-effort batch step: report the failure once and carry on.
        # NOTE(review): "Resource temporarily unavailable" errors used to be
        # tested for separately but produced this same message.
        print(f"An error occurred while processing the file {file_path}: {e}")
# Paths
# Location of the input dataset JSON, the folders searched for each media
# type, and the destination of the filtered dataset JSON.
data_json_path = '/mnt/bn/tns-algo-video-public-my2/'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/'
video_folder = '/mnt/bn/tns-algo-video-public-my2/'
image_folder = '/mnt/bn/tns-algo-video-public-my2/'
new_json_path = '/mnt/bn/tns-algo-video-public-my2/'
# Load JSON data
# Read as UTF-8 explicitly so the input is decoded the same way the output
# file is encoded, independent of the platform default.
with open(data_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
# Function to check if a file exists in a folder
def file_exists(folder, filename):
    """Return True when ``filename`` exists inside ``folder``.

    Args:
        folder: Directory the file name is resolved against.
        filename: File name or relative path; may be empty or None.

    Returns:
        True if the joined path exists, otherwise False. An empty or None
        ``filename`` always yields False, because joining it would resolve
        to the folder itself (or raise ``TypeError`` for None).
    """
    if not filename:
        return False
    return os.path.exists(os.path.join(folder, filename))
# Initialize counters for missing and total files by type
# Each entry tracks how many items referenced that media type ("total") and
# how many of those references could not be resolved to a file ("missing").
file_counts = {
    "video": {"total": 0, "missing": 0},
    "audio_asr": {"total": 0, "missing": 0},
    "audio_caption": {"total": 0, "missing": 0},
    "image": {"total": 0, "missing": 0},
    "unknown": {"total": 0, "missing": 0},  # For items missing all types of files
}
# Helper function to process each item in the dataset
import threading

# process_item runs on many worker threads at once; this lock keeps the
# read-modify-write updates of the shared file_counts dict consistent.
_FILE_COUNTS_LOCK = threading.Lock()


def process_item(item):
    """Check that every media file referenced by a dataset item exists.

    For each of the keys ``video``, ``audio_asr``, ``audio_caption`` and
    ``image`` present in ``item``, the referenced file is looked up in the
    matching folder and the shared ``file_counts`` totals are updated.

    Args:
        item: One dataset record (a mapping that may contain the keys above).

    Returns:
        A dict with the keys ``item`` (the record itself), ``valid`` (False
        if any referenced file is missing or no media key is present) and
        ``missing`` (a list of human-readable messages, one per missing file).
    """
    # (item key, folder to search, label used in the message)
    media_checks = (
        ("video", video_folder, "Video file"),
        ("audio_asr", audio_asr_folder, "Audio ASR file"),
        ("audio_caption", audio_caption_folder, "Audio caption file"),
        ("image", image_folder, "Image file"),
    )
    result = {"item": item, "valid": True, "missing": []}
    found = False
    for key, folder, label in media_checks:
        if key not in item:
            continue
        found = True
        filename = item[key]
        # Do the file-system lookup outside the lock so threads only
        # serialize on the cheap counter update.
        is_missing = not filename or not file_exists(folder, filename)
        with _FILE_COUNTS_LOCK:
            file_counts[key]["total"] += 1
            if is_missing:
                file_counts[key]["missing"] += 1
        if is_missing:
            result['missing'].append(f"{label} missing or not found: {filename}")
            result['valid'] = False
    if not found:
        result['valid'] = False
        with _FILE_COUNTS_LOCK:
            file_counts["unknown"]["total"] += 1
            file_counts["unknown"]["missing"] += 1  # Count as unknown if no valid key is found
    return result
# List to store results
new_items = []  # items whose referenced files were all found
texts = []  # items rejected because a file is missing or no media key exists
# Use ThreadPoolExecutor for multithreaded processing
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    # executor.map yields results in input order, so the saved dataset keeps
    # the ordering of the source JSON from run to run.
    for result in tqdm(executor.map(process_item, data), total=len(data)):
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                # tqdm.write prints without corrupting the progress bar.
                tqdm.write(missing)
# Save new_items to a JSON file
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)
# Print the summary of missing and total files by type
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print("Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")