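"""Validate a multimodal dataset manifest against files on disk.

For every item in the input JSON this script:
  * extracts the audio track of each referenced video (saved as .mp3 or .wav)
    and saves the video's last frame as a .jpg next to the video,
  * checks that the referenced video / audio_asr / audio_caption / image files exist,
  * writes the items whose files are all present to a new JSON file and
    prints per-type totals and missing-file counts.
"""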
import os
import json
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

import cv2
from tqdm import tqdm
from moviepy.editor import VideoFileClip

def extract_frame(video_path):
    # Open the video file
    video_capture = cv2.VideoCapture(video_path)
    # Seek to the last frame
    video_capture.set(cv2.CAP_PROP_POS_FRAMES, video_capture.get(cv2.CAP_PROP_FRAME_COUNT) - 1)
    # Read the last frame
    success, frame = video_capture.read()
    if success:
        # Save the frame
        save_frame(video_path, frame)
    else:
        # Fall back to the second-to-last frame
        video_capture.set(cv2.CAP_PROP_POS_FRAMES, video_capture.get(cv2.CAP_PROP_FRAME_COUNT) - 2)
        success, frame = video_capture.read()
        if success:
            save_frame(video_path, frame)
        else:
            print(f"Could not read the last or second-to-last frame: {video_path}")
    # Release the video file
    video_capture.release()

def save_frame(video_path, frame):
    # Video file name without the extension
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    # Save the image next to the video with the same base name
    image_path = os.path.join(os.path.dirname(video_path), f"{video_name}.jpg")
    # Skip writing if the image already exists
    if not os.path.exists(image_path):
        cv2.imwrite(image_path, frame)
        print(f"Saved last frame to {image_path}")
    else:
        print(f"Image already exists: {video_name}")
        

def process_video(file_path):
    try:
        # Load the video file
        clip = VideoFileClip(file_path)
        # Check whether the video has an audio track
        if clip.audio is not None:
            audio = clip.audio
            # Save the audio in a randomly chosen format, using the video's
            # base name and directory
            audio_format = random.choice(["mp3", "wav"])
            audio_file_path = os.path.splitext(file_path)[0] + f'.{audio_format}'
            audio_file_wav = os.path.splitext(file_path)[0] + '.wav'
            audio_file_mp3 = os.path.splitext(file_path)[0] + '.mp3'
            # Only extract if neither a .wav nor an .mp3 already exists
            if not os.path.exists(audio_file_wav) and not os.path.exists(audio_file_mp3):
                audio.write_audiofile(audio_file_path)
            else:
                print(f"Audio already extracted for {file_path}, skipping.")
            # Close the audio and clip objects
            audio.close()
        clip.close()
    except Exception as e:
        print(f"An error occurred while processing the file {file_path}: {e}")
        if "Resource temporarily unavailable" in str(e):
            # Back off briefly when the system runs out of resources
            time.sleep(20)


# Paths
data_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/hf/stage4_next_json/video_stag4_0116_next.json'
audio_asr_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data'
audio_caption_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/caption_data/0818'
video_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video'
image_folder = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/video/Video-LLaVA'
new_json_path = '/mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/multidataset/stage4_0119.json'
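
# Note: the paths above are specific to the original environment; point them
# at your own manifest and media folders before running.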

# Load JSON data
with open(data_json_path, 'r') as f:
    data = json.load(f)

# Function to check if a file exists in a folder
def file_exists(folder, filename):
    return os.path.exists(os.path.join(folder, filename))

# Initialize counters for missing and total files by type
file_counts = {
    "video": {"total": 0, "missing": 0},
    "audio_asr": {"total": 0, "missing": 0},
    "audio_caption": {"total": 0, "missing": 0},
    "image": {"total": 0, "missing": 0},
    "unknown": {"total": 0, "missing": 0}  # For items missing all types of files
}
# Guards the counters above, which are updated from multiple worker threads
counts_lock = Lock()

# Helper function to process each item in the dataset
def process_item(item):
    result = {"item": item, "valid": True, "missing": []}
    found = False

    if 'video' in item:
        video_file = item['video']
        with counts_lock:
            file_counts["video"]["total"] += 1
        found = True
        video_path = os.path.join(video_folder, video_file)
        process_video(video_path)
        extract_frame(video_path)

        if not video_file or not file_exists(video_folder, video_file):
            result['missing'].append(f"Video file missing or not found: {video_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["video"]["missing"] += 1

    if 'audio_asr' in item:
        audio_asr_file = item['audio_asr']
        with counts_lock:
            file_counts["audio_asr"]["total"] += 1
        found = True
        if not audio_asr_file or not file_exists(audio_asr_folder, audio_asr_file):
            result['missing'].append(f"Audio ASR file missing or not found: {audio_asr_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["audio_asr"]["missing"] += 1

    if 'audio_caption' in item:
        audio_caption_file = item['audio_caption']
        with counts_lock:
            file_counts["audio_caption"]["total"] += 1
        found = True
        if not audio_caption_file or not file_exists(audio_caption_folder, audio_caption_file):
            result['missing'].append(f"Audio caption file missing or not found: {audio_caption_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["audio_caption"]["missing"] += 1

    if 'image' in item:
        image_file = item['image']
        with counts_lock:
            file_counts["image"]["total"] += 1
        found = True
        if not image_file or not file_exists(image_folder, image_file):
            result['missing'].append(f"Image file missing or not found: {image_file}")
            result['valid'] = False
            with counts_lock:
                file_counts["image"]["missing"] += 1

    if not found:
        result['valid'] = False
        # Count as unknown when the item has none of the expected keys
        with counts_lock:
            file_counts["unknown"]["total"] += 1
            file_counts["unknown"]["missing"] += 1

    return result

# List to store results
new_items = []
texts = []

# Use ThreadPoolExecutor for multithreaded processing
with ThreadPoolExecutor(max_workers=96) as executor:  # Adjust `max_workers` based on your system
    futures = {executor.submit(process_item, item): item for item in data}

    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        if result['valid']:
            new_items.append(result['item'])
        else:
            texts.append(result['item'])  # Collect invalid items if needed
            for missing in result['missing']:
                print(missing)

# Save new_items to a JSON file
with open(new_json_path, 'w', encoding='utf-8') as f:
    json.dump(new_items, f, ensure_ascii=False, indent=4)

# Print the summary of missing and total files by type
print(f"Saved {len(new_items)} valid items to {new_json_path}")
print(f"Total and missing files by type:")
for file_type, counts in file_counts.items():
    print(f"{file_type}: Total = {counts['total']}, Missing = {counts['missing']}")