|
import concurrent.futures
import json
import os
import random
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
from moviepy.editor import *
from tqdm import tqdm
|
|
|
def extract_frame(video_path):
    """Extract the last readable frame of *video_path* and save it as a JPEG.

    Tries the final frame first; if it cannot be decoded (common with
    truncated encodes), falls back to the second-to-last frame. The frame is
    written next to the video via save_frame(). Prints a diagnostic when no
    frame can be read.

    Fixes vs. original: checks that the capture opened, releases it in a
    ``finally`` (the original leaked the handle if save_frame raised), and
    never seeks to a negative index when the reported frame count is < 2.
    """
    video_capture = cv2.VideoCapture(video_path)
    try:
        if not video_capture.isOpened():
            print(f"无法读取最后一帧和倒数第二帧:{video_path}")
            return
        frame_count = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
        # Try the last frame, then the second-to-last as a fallback.
        for offset in (1, 2):
            target = frame_count - offset
            if target < 0:
                # Frame count too small (or unreported) for this offset.
                continue
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, target)
            success, frame = video_capture.read()
            if success:
                save_frame(video_path, frame)
                return
        print(f"无法读取最后一帧和倒数第二帧:{video_path}")
    finally:
        video_capture.release()
|
|
|
def save_frame(video_path, frame): |
|
|
|
video_name = os.path.splitext(os.path.basename(video_path))[0] |
|
|
|
image_path = os.path.join(os.path.dirname(video_path), f"{video_name}.jpg") |
|
|
|
if not os.path.exists(image_path): |
|
cv2.imwrite(image_path, frame) |
|
print(f"保存最后一帧为 {image_path}") |
|
else: |
|
print(f"图像已存在:{video_name}") |
|
|
|
|
|
def process_video(file_path):
    """Extract the audio track of *file_path* into a sibling .mp3 or .wav.

    The output format is chosen at random per video; nothing is written when
    either a .wav or an .mp3 for this video already exists. Errors are
    printed rather than raised so one bad file does not abort the batch; a
    transient "Resource temporarily unavailable" error sleeps 20s to let
    file-handle/IO pressure clear.

    Fixes vs. original: ``time`` was used without being imported (NameError
    in the error path), the clip was not closed when extraction raised, and
    the skip message read "exit." instead of "exists.".
    """
    try:
        clip = VideoFileClip(file_path)
        try:
            if clip.audio is not None:
                audio = clip.audio
                base = os.path.splitext(file_path)[0]
                # Randomly alternate the container format across the dataset.
                audio_format = random.choice(["mp3", "wav"])
                audio_file_path = f'{base}.{audio_format}'
                # Skip if an extraction in either format is already on disk.
                if not os.path.exists(f'{base}.wav') and not os.path.exists(f'{base}.mp3'):
                    audio.write_audiofile(audio_file_path)
                else:
                    print(f"file {audio_file_path} exists.")
                audio.close()
        finally:
            # Always release the reader, even if audio extraction failed.
            clip.close()
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {e}")
        if "Resource temporarily unavailable" in str(e):
            # Back off so transient resource exhaustion can clear before the
            # thread pool hands this worker its next file.
            time.sleep(20)
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Input/output locations (absolute paths on the shared data volume).
# ---------------------------------------------------------------------------
# Source annotation JSON: a list of dataset items to validate.
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/hf/stage4_next_json/video_stag4_0116_next.json'
# Root folders that items' relative media paths are resolved against.
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
# Destination for the filtered (all-files-present) item list.
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/stage4_0119.json'


# Read explicitly as UTF-8 for consistency with the UTF-8 write of
# new_json_path below — the annotations contain non-ASCII text, and the
# original relied on the platform default encoding here.
with open(data_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
|
|
|
|
|
def file_exists(folder, filename): |
|
return os.path.exists(os.path.join(folder, filename)) |
|
|
|
|
|
# Per-media-type tallies filled in by process_item while items are checked:
# "total" counts items carrying that key, "missing" counts referenced files
# that were absent. "unknown" counts items with none of the recognized keys.
# NOTE(review): these counters are incremented from multiple worker threads
# without a lock — increments may be lost under contention; verify/guard.
file_counts = {

    "video": {"total": 0, "missing": 0},

    "audio_asr": {"total": 0, "missing": 0},

    "audio_caption": {"total": 0, "missing": 0},

    "image": {"total": 0, "missing": 0},

    "unknown": {"total": 0, "missing": 0}

}
|
|
|
|
|
# Guards updates to the shared file_counts dict: `+=` on a dict entry is a
# read-modify-write and is NOT atomic across the 96 ThreadPoolExecutor
# workers, so unsynchronized increments could be lost.
_file_counts_lock = threading.Lock()


def _bump(kind, field):
    """Thread-safely increment file_counts[kind][field]."""
    with _file_counts_lock:
        file_counts[kind][field] += 1


def process_item(item):
    """Validate one dataset item and trigger media extraction for videos.

    For each media key the item carries ('video', 'audio_asr',
    'audio_caption', 'image'), checks the referenced file against its root
    folder and updates the global file_counts tallies. Video items also get
    their audio track and last frame extracted as a side effect (both are
    no-ops when the outputs already exist).

    Returns a dict {"item": item, "valid": bool, "missing": [messages]};
    valid is False when any referenced file is absent or the item has none
    of the recognized media keys.
    """
    result = {"item": item, "valid": True, "missing": []}
    found = False

    if 'video' in item:
        video_file = item['video']
        _bump("video", "total")
        found = True
        video_path = os.path.join(video_folder, video_file)
        # Side effects: extract audio + last frame next to the video.
        process_video(video_path)
        extract_frame(video_path)

        if not video_file or not file_exists(video_folder, video_file):
            result['missing'].append(f"Video file missing or not found: {video_file}")
            result['valid'] = False
            _bump("video", "missing")

    if 'audio_asr' in item:
        audio_asr_file = item['audio_asr']
        _bump("audio_asr", "total")
        found = True
        if not audio_asr_file or not file_exists(audio_asr_folder, audio_asr_file):
            result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
            result['valid'] = False
            _bump("audio_asr", "missing")

    if 'audio_caption' in item:
        audio_caption_file = item['audio_caption']
        _bump("audio_caption", "total")
        found = True
        if not audio_caption_file or not file_exists(audio_caption_folder, audio_caption_file):
            result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
            result['valid'] = False
            _bump("audio_caption", "missing")

    if 'image' in item:
        image_file = item['image']
        _bump("image", "total")
        found = True
        if not image_file or not file_exists(image_folder, image_file):
            result['missing'].append(f"Image file missing or not found: {image_file}")
            result['valid'] = False
            _bump("image", "missing")

    if not found:
        # Item carried none of the recognized media keys.
        result['valid'] = False
        _bump("unknown", "total")
        _bump("unknown", "missing")

    return result
|
|
|
|
|
# Items whose referenced media files all exist.
new_items = []

# Items that failed validation (collected for inspection; never written out).
texts = []




# Fan the per-item validation out across worker threads: the work is
# I/O-bound (stat calls, audio extraction, frame decoding), so threads
# overlap the waits.
with ThreadPoolExecutor(max_workers=96) as executor:

    futures = {executor.submit(process_item, item): item for item in data}



    # as_completed yields futures as they finish; tqdm tracks overall progress.
    for future in tqdm(as_completed(futures), total=len(futures)):

        result = future.result()

        if result['valid']:

            new_items.append(result['item'])

        else:

            texts.append(result['item'])

            for missing in result['missing']:

                print(missing)
|
|
|
|
|
# Persist only the items whose media files were all found; keep non-ASCII
# text readable in the output (ensure_ascii=False).
with open(new_json_path, 'w', encoding='utf-8') as f:

    json.dump(new_items, f, ensure_ascii=False, indent=4)




# Summary: per-media-type totals and how many referenced files were missing.
print(f"Saved {len(new_items)} valid items to {new_json_path}")

print(f"Total and missing files by type:")

for file_type, counts in file_counts.items():

    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")
|
|