"""Video utilities: extract the last frame and the audio track of a video.

Used by the dataset-validation pass below; each helper writes its output
next to the source video file.
"""

import concurrent.futures
import json
import os
import random
import time  # required by process_video's retry/back-off path (was missing)
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
from moviepy.editor import VideoFileClip
from tqdm import tqdm


def extract_frame(video_path):
    """Save the last readable frame of *video_path* as a JPEG next to it.

    Falls back to the second-to-last frame when the final frame cannot be
    decoded (common with truncated/partially-written containers). Logs and
    gives up if neither frame is readable.
    """
    video_capture = cv2.VideoCapture(video_path)
    # Seek to the last frame.
    video_capture.set(cv2.CAP_PROP_POS_FRAMES,
                      video_capture.get(cv2.CAP_PROP_FRAME_COUNT) - 1)
    success, frame = video_capture.read()
    if success:
        save_frame(video_path, frame)
    else:
        # Last frame unreadable — retry with the second-to-last frame.
        video_capture.set(cv2.CAP_PROP_POS_FRAMES,
                          video_capture.get(cv2.CAP_PROP_FRAME_COUNT) - 2)
        success, frame = video_capture.read()
        if success:
            save_frame(video_path, frame)
        else:
            print(f"无法读取最后一帧和倒数第二帧:{video_path}")
    # Always release the capture handle.
    video_capture.release()


def save_frame(video_path, frame):
    """Write *frame* as ``<video basename>.jpg`` beside the video.

    Skips the write if the image already exists, so repeated runs are
    idempotent.
    """
    # Video file name without extension.
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    # Image is written into the same directory as the video.
    image_path = os.path.join(os.path.dirname(video_path), f"{video_name}.jpg")
    if not os.path.exists(image_path):
        cv2.imwrite(image_path, frame)
        print(f"保存最后一帧为 {image_path}")
    else:
        print(f"图像已存在:{video_name}")


def process_video(file_path):
    """Extract the audio track of *file_path* as a .mp3 or .wav (random pick).

    The audio file keeps the video's basename and directory. If either a
    .wav or .mp3 sibling already exists, nothing is written. Errors are
    logged rather than raised; "Resource temporarily unavailable" errors
    additionally back off for 20 seconds.
    """
    try:
        clip = VideoFileClip(file_path)
        try:
            if clip.audio is not None:
                audio = clip.audio
                # Randomly choose an output format; skip if either format
                # was already produced by a previous run.
                audio_format = random.choice(["mp3", "wav"])
                audio_file_path = os.path.splitext(file_path)[0] + f'.{audio_format}'
                audio_file_wav = os.path.splitext(file_path)[0] + '.wav'
                audio_file_mp3 = os.path.splitext(file_path)[0] + '.mp3'
                if not os.path.exists(audio_file_wav) and not os.path.exists(audio_file_mp3):
                    audio.write_audiofile(audio_file_path)
                else:
                    print(f"file {audio_file_path} exists.")
                audio.close()
        finally:
            # Close the clip even when the video has no audio track or the
            # audio write fails (the original leaked the handle here).
            clip.close()
    except Exception as e:
        if "Resource temporarily unavailable" in str(e):
            print(f"An error occurred while processing the file {file_path}: {e}")
            # Transient FD/resource exhaustion — back off before the pool
            # schedules more work.
            time.sleep(20)
        else:
            print(f"An error occurred while processing the file {file_path}: {e}")
import threading

# --- Paths --------------------------------------------------------------
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/hf/stage4_next_json/video_stag4_0116_next.json'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/stage4_0119.json'

# Load JSON data describing the dataset items to validate.
with open(data_json_path, 'r') as f:
    data = json.load(f)


def file_exists(folder, filename):
    """Return True if *filename* exists under *folder*."""
    return os.path.exists(os.path.join(folder, filename))


# Totals and missing counts per modality. "unknown" counts items that carry
# none of the recognized keys.
file_counts = {
    "video": {"total": 0, "missing": 0},
    "audio_asr": {"total": 0, "missing": 0},
    "audio_caption": {"total": 0, "missing": 0},
    "image": {"total": 0, "missing": 0},
    "unknown": {"total": 0, "missing": 0}
}

# `+= 1` on a dict entry is a read-modify-write, which is NOT atomic across
# the worker threads below — unsynchronized increments lose counts under
# contention, so every update goes through this lock.
_counts_lock = threading.Lock()


def _bump(file_type, key):
    """Thread-safely increment ``file_counts[file_type][key]``."""
    with _counts_lock:
        file_counts[file_type][key] += 1


def process_item(item):
    """Validate one dataset item.

    For each modality key present ('video', 'audio_asr', 'audio_caption',
    'image'), check that the referenced file exists under its folder; video
    items additionally get their audio track and last frame extracted as a
    side effect. Returns a dict with the item, a ``valid`` flag, and a list
    of human-readable ``missing`` messages.
    """
    result = {"item": item, "valid": True, "missing": []}
    found = False

    if 'video' in item:
        video_file = item['video']
        _bump("video", "total")
        found = True
        video_path = os.path.join(video_folder, video_file)
        # Side effects: extract audio track and last frame next to the video.
        process_video(video_path)
        extract_frame(video_path)
        if not video_file or not file_exists(video_folder, video_file):
            result['missing'].append(f"Video file missing or not found: {video_file}")
            result['valid'] = False
            _bump("video", "missing")

    if 'audio_asr' in item:
        audio_asr_file = item['audio_asr']
        _bump("audio_asr", "total")
        found = True
        if not audio_asr_file or not file_exists(audio_asr_folder, audio_asr_file):
            result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
            result['valid'] = False
            _bump("audio_asr", "missing")

    if 'audio_caption' in item:
        audio_caption_file = item['audio_caption']
        _bump("audio_caption", "total")
        found = True
        if not audio_caption_file or not file_exists(audio_caption_folder, audio_caption_file):
            result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
            result['valid'] = False
            _bump("audio_caption", "missing")

    if 'image' in item:
        image_file = item['image']
        _bump("image", "total")
        found = True
        if not image_file or not file_exists(image_folder, image_file):
            result['missing'].append(f"Image file missing or not found: {image_file}")
            result['valid'] = False
            _bump("image", "missing")

    if not found:
        # No recognized modality key — count as unknown and reject.
        result['valid'] = False
        _bump("unknown", "total")
        _bump("unknown", "missing")

    return result


# Valid items (written out) and invalid items (kept for inspection).
new_items = []
texts = []

# Fan the per-item validation out over a thread pool; the work is dominated
# by filesystem checks and video I/O, so threads overlap the waits.
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    futures = {executor.submit(process_item, item): item for item in data}
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)

# Save the surviving items to a JSON file.
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)

# Print the summary of missing and total files by type.
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print(f"Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")