# stage4/handle_stage4.py
# (Uploaded via huggingface_hub, commit 93cc042.)
import concurrent.futures
import json
import os
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
from moviepy.editor import *
from tqdm import tqdm
def extract_frame(video_path):
    """Extract the last readable frame of *video_path* and save it as a JPEG.

    Tries the final frame first; if that cannot be decoded (common with
    truncated encodings) falls back to the second-to-last frame. The frame
    is written next to the video by save_frame().
    """
    video_capture = cv2.VideoCapture(video_path)
    try:
        # Bail out early if the container cannot be opened at all.
        if not video_capture.isOpened():
            print(f"无法读取最后一帧和倒数第二帧:{video_path}")
            return
        frame_count = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
        # Try the last frame, then the second-to-last as a fallback,
        # skipping negative indices for empty/one-frame videos.
        for offset in (1, 2):
            target = frame_count - offset
            if target < 0:
                continue
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, target)
            success, frame = video_capture.read()
            if success:
                save_frame(video_path, frame)
                return
        print(f"无法读取最后一帧和倒数第二帧:{video_path}")
    finally:
        # Always release the capture handle, even on early return.
        video_capture.release()
def save_frame(video_path, frame):
    """Write *frame* as <video_name>.jpg alongside the source video.

    The write is skipped when the target image already exists.
    """
    # Derive the image path: same directory, same basename, .jpg suffix.
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    image_path = os.path.join(os.path.dirname(video_path), f"{video_name}.jpg")
    if os.path.exists(image_path):
        print(f"图像已存在:{video_name}")
    else:
        cv2.imwrite(image_path, frame)
        print(f"保存最后一帧为 {image_path}")
def process_video(file_path):
    """Extract the audio track of *file_path*, if any, next to the video.

    The audio is written with the same basename in a randomly chosen
    format (mp3 or wav) unless an audio file in either format already
    exists. Errors are logged rather than raised; "Resource temporarily
    unavailable" errors additionally back off for 20 seconds.
    """
    clip = None
    try:
        clip = VideoFileClip(file_path)
        # Only videos that actually carry an audio stream are processed.
        if clip.audio is not None:
            audio = clip.audio
            # Pick mp3 or wav at random; skip when either variant exists.
            audio_format = random.choice(["mp3", "wav"])
            base = os.path.splitext(file_path)[0]
            audio_file_path = f"{base}.{audio_format}"
            if not os.path.exists(f"{base}.wav") and not os.path.exists(f"{base}.mp3"):
                audio.write_audiofile(audio_file_path)
            else:
                print(f"file {audio_file_path} exists.")
            audio.close()
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {e}")
        if "Resource temporarily unavailable" in str(e):
            # Likely transient resource exhaustion; back off before the
            # caller moves on to other files.
            time.sleep(20)
    finally:
        # Release the clip handle even when audio extraction failed,
        # otherwise ffmpeg reader processes leak.
        if clip is not None:
            clip.close()
# Paths
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/hf/stage4_next_json/video_stag4_0116_next.json'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/stage4_0119.json'

# Load JSON data (utf-8, matching how the output JSON is written below).
with open(data_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
# Function to check if a file exists in a folder
def file_exists(folder, filename):
    """Return True when *filename* exists under *folder*."""
    candidate = os.path.join(folder, filename)
    return os.path.exists(candidate)
# Initialize counters for missing and total files by type.
# "unknown" collects items that carry none of the recognized media keys.
file_counts = {
    key: {"total": 0, "missing": 0}
    for key in ("video", "audio_asr", "audio_caption", "image", "unknown")
}
# Guards the shared file_counts tallies: process_item runs on many threads
# and `+=` on a shared dict entry is not atomic, so unsynchronized updates
# would be lost.
_file_counts_lock = threading.Lock()

# Helper function to process each item in the dataset
def process_item(item):
    """Validate one dataset item and trigger media preprocessing.

    Checks every media key present on the item (video / audio_asr /
    audio_caption / image) against its folder, updating the shared
    file_counts tallies. Videos that exist on disk additionally get their
    audio track extracted and their last frame saved.

    Returns a dict: {"item": original item, "valid": bool,
    "missing": list of human-readable messages}.
    """
    result = {"item": item, "valid": True, "missing": []}
    found = False

    def _check(kind, folder, filename):
        # Record one sighting of `kind`; return True iff the file exists.
        with _file_counts_lock:
            file_counts[kind]["total"] += 1
        if filename and file_exists(folder, filename):
            return True
        with _file_counts_lock:
            file_counts[kind]["missing"] += 1
        return False

    if 'video' in item:
        found = True
        video_file = item['video']
        if _check("video", video_folder, video_file):
            # Only preprocess videos that actually exist on disk.
            video_path = os.path.join(video_folder, video_file)
            process_video(video_path)
            extract_frame(video_path)
        else:
            result['missing'].append(f"Video file missing or not found: {video_file}")
            result['valid'] = False
    if 'audio_asr' in item:
        found = True
        audio_asr_file = item['audio_asr']
        if not _check("audio_asr", audio_asr_folder, audio_asr_file):
            result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
            result['valid'] = False
    if 'audio_caption' in item:
        found = True
        audio_caption_file = item['audio_caption']
        if not _check("audio_caption", audio_caption_folder, audio_caption_file):
            result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
            result['valid'] = False
    if 'image' in item:
        found = True
        image_file = item['image']
        if not _check("image", image_folder, image_file):
            result['missing'].append(f"Image file missing or not found: {image_file}")
            result['valid'] = False
    if not found:
        # No recognized media key at all: count once under "unknown".
        result['valid'] = False
        with _file_counts_lock:
            file_counts["unknown"]["total"] += 1
            file_counts["unknown"]["missing"] += 1
    return result
# List to store results
new_items = []   # items whose referenced files all exist
texts = []       # invalid items, kept for optional inspection

# Fan out validation/preprocessing across threads: the work is dominated
# by file-existence checks and media decoding, i.e. I/O-bound.
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    futures = {executor.submit(process_item, item): item for item in data}
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)

# Save new_items to a JSON file
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)

# Print the summary of missing and total files by type
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print("Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")