Spaces:

alibabasglab
/

ClearVoice

Running on Zero

File size: 16,322 Bytes

8e8cd3e

import torch

import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, pdb, math, python_speech_features
import numpy as np
from scipy import signal
from shutil import rmtree
from scipy.io import wavfile
from scipy.interpolate import interp1d
from sklearn.metrics import accuracy_score, f1_score
import soundfile as sf

from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.frame_timecode import FrameTimecode
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector

from models.av_mossformer2_tse.faceDetector.s3fd import S3FD

from .decode import decode_one_audio_AV_MossFormer2_TSE_16K



def process_tse(args, model, device, data_reader, output_wave_dir):
	video_args = args_param()
	video_args.model = model
	video_args.device = device
	video_args.sampling_rate = args.sampling_rate
	args.device = device
	assert args.sampling_rate == 16000
	with torch.no_grad():
		for videoPath in data_reader:  # Loop over all video samples
			savFolder = videoPath.split('/')[-1]
			video_args.savePath = f'{output_wave_dir}/{savFolder[:-4]}/'
			video_args.videoPath = videoPath
			main(video_args, args)



def args_param():
    warnings.filterwarnings("ignore")
    parser = argparse.ArgumentParser()
    parser.add_argument('--nDataLoaderThread',     type=int,   default=10,   help='Number of workers')
    parser.add_argument('--facedetScale',          type=float, default=0.25, help='Scale factor for face detection, the frames will be scale to 0.25 orig')
    parser.add_argument('--minTrack',              type=int,   default=50,   help='Number of min frames for each shot')
    parser.add_argument('--numFailedDet',          type=int,   default=10,   help='Number of missed detections allowed before tracking is stopped')
    parser.add_argument('--minFaceSize',           type=int,   default=1,    help='Minimum face size in pixels')
    parser.add_argument('--cropScale',             type=float, default=0.40, help='Scale bounding box')
    parser.add_argument('--start',                 type=int, default=0,   help='The start time of the video')
    parser.add_argument('--duration',              type=int, default=0,  help='The duration of the video, when set as 0, will extract the whole video')
    video_args = parser.parse_args()
    return video_args


# Main function
def main(video_args, args):
    # Initialization 
    video_args.pyaviPath = os.path.join(video_args.savePath, 'py_video')
    video_args.pyframesPath = os.path.join(video_args.savePath, 'pyframes')
    video_args.pyworkPath = os.path.join(video_args.savePath, 'pywork')
    video_args.pycropPath = os.path.join(video_args.savePath, 'py_faceTracks')
    if os.path.exists(video_args.savePath):
        rmtree(video_args.savePath)
    os.makedirs(video_args.pyaviPath, exist_ok = True) # The path for the input video, input audio, output video
    os.makedirs(video_args.pyframesPath, exist_ok = True) # Save all the video frames
    os.makedirs(video_args.pyworkPath, exist_ok = True) # Save the results in this process by the pckl method
    os.makedirs(video_args.pycropPath, exist_ok = True) # Save the detected face clips (audio+video) in this process

    # Extract video
    video_args.videoFilePath = os.path.join(video_args.pyaviPath, 'video.avi')
    # If duration did not set, extract the whole video, otherwise extract the video from 'video_args.start' to 'video_args.start + video_args.duration'
    if video_args.duration == 0:
        command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic" % \
            (video_args.videoPath, video_args.nDataLoaderThread, video_args.videoFilePath))
    else:
        command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic" % \
            (video_args.videoPath, video_args.nDataLoaderThread, video_args.start, video_args.start + video_args.duration, video_args.videoFilePath))
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the video and save in %s \r\n" %(video_args.videoFilePath))

    # Extract audio
    video_args.audioFilePath = os.path.join(video_args.pyaviPath, 'audio.wav')
    command = ("ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic" % \
        (video_args.videoFilePath, video_args.nDataLoaderThread, video_args.audioFilePath))
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the audio and save in %s \r\n" %(video_args.audioFilePath))

    # Extract the video frames
    command = ("ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % \
        (video_args.videoFilePath, video_args.nDataLoaderThread, os.path.join(video_args.pyframesPath, '%06d.jpg'))) 
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Extract the frames and save in %s \r\n" %(video_args.pyframesPath))

    # Scene detection for the video frames
    scene = scene_detect(video_args)
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Scene detection and save in %s \r\n" %(video_args.pyworkPath))	

    # Face detection for the video frames
    faces = inference_video(video_args)
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face detection and save in %s \r\n" %(video_args.pyworkPath))

    # Face tracking
    allTracks, vidTracks = [], []
    for shot in scene:
        if shot[1].frame_num - shot[0].frame_num >= video_args.minTrack: # Discard the shot frames less than minTrack frames
            allTracks.extend(track_shot(video_args, faces[shot[0].frame_num:shot[1].frame_num])) # 'frames' to present this tracks' timestep, 'bbox' presents the location of the faces
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face track and detected %d tracks \r\n" %len(allTracks))

    # Face clips cropping
    for ii, track in tqdm.tqdm(enumerate(allTracks), total = len(allTracks)):
        vidTracks.append(crop_video(video_args, track, os.path.join(video_args.pycropPath, '%05d'%ii)))
    savePath = os.path.join(video_args.pyworkPath, 'tracks.pckl')
    with open(savePath, 'wb') as fil:
        pickle.dump(vidTracks, fil)
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + " Face Crop and saved in %s tracks \r\n" %video_args.pycropPath)
    fil = open(savePath, 'rb')
    vidTracks = pickle.load(fil)
    fil.close()

    # AVSE
    files = glob.glob("%s/*.avi"%video_args.pycropPath)
    files.sort()

    est_sources = evaluate_network(files, video_args, args)

    visualization(vidTracks, est_sources, video_args)	

    # combine files in pycrop
    for idx, file in enumerate(files):
        print(file)
        command = f"ffmpeg -i {file} {file[:-9]}orig_{idx}.mp4 ;"
        command += f"rm {file} ;"
        command += f"rm {file.replace('.avi', '.wav')} ;"

        command += f"ffmpeg -i {file[:-9]}orig_{idx}.mp4 -i {file[:-9]}est_{idx}.wav -c:v copy -map 0:v:0 -map 1:a:0 -shortest {file[:-9]}est_{idx}.mp4 ;"
        # command += f"rm {file[:-9]}est_{idx}.wav ;"

        output = subprocess.call(command, shell=True, stdout=None)

    rmtree(video_args.pyworkPath)
    rmtree(video_args.pyframesPath)




def scene_detect(video_args):
	# CPU: Scene detection, output is the list of each shot's time duration
	videoManager = VideoManager([video_args.videoFilePath])
	statsManager = StatsManager()
	sceneManager = SceneManager(statsManager)
	sceneManager.add_detector(ContentDetector())
	baseTimecode = videoManager.get_base_timecode()
	videoManager.set_downscale_factor()
	videoManager.start()
	sceneManager.detect_scenes(frame_source = videoManager)
	sceneList = sceneManager.get_scene_list(baseTimecode)
	savePath = os.path.join(video_args.pyworkPath, 'scene.pckl')
	if sceneList == []:
		sceneList = [(videoManager.get_base_timecode(),videoManager.get_current_timecode())]
	with open(savePath, 'wb') as fil:
		pickle.dump(sceneList, fil)
		sys.stderr.write('%s - scenes detected %d\n'%(video_args.videoFilePath, len(sceneList)))
	return sceneList

def inference_video(video_args):
	# GPU: Face detection, output is the list contains the face location and score in this frame
	DET = S3FD(device=video_args.device)
	flist = glob.glob(os.path.join(video_args.pyframesPath, '*.jpg'))
	flist.sort()
	dets = []
	for fidx, fname in enumerate(flist):
		image = cv2.imread(fname)
		imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
		bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[video_args.facedetScale])
		dets.append([])
		for bbox in bboxes:
		  dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) # dets has the frames info, bbox info, conf info
		sys.stderr.write('%s-%05d; %d dets\r' % (video_args.videoFilePath, fidx, len(dets[-1])))
	savePath = os.path.join(video_args.pyworkPath,'faces.pckl')
	with open(savePath, 'wb') as fil:
		pickle.dump(dets, fil)
	return dets

def bb_intersection_over_union(boxA, boxB, evalCol = False):
	# CPU: IOU Function to calculate overlap between two image
	xA = max(boxA[0], boxB[0])
	yA = max(boxA[1], boxB[1])
	xB = min(boxA[2], boxB[2])
	yB = min(boxA[3], boxB[3])
	interArea = max(0, xB - xA) * max(0, yB - yA)
	boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
	boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
	if evalCol == True:
		iou = interArea / float(boxAArea)
	else:
		iou = interArea / float(boxAArea + boxBArea - interArea)
	return iou

def track_shot(video_args, sceneFaces):
	# CPU: Face tracking
	iouThres  = 0.5     # Minimum IOU between consecutive face detections
	tracks    = []
	while True:
		track     = []
		for frameFaces in sceneFaces:
			for face in frameFaces:
				if track == []:
					track.append(face)
					frameFaces.remove(face)
				elif face['frame'] - track[-1]['frame'] <= video_args.numFailedDet:
					iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox'])
					if iou > iouThres:
						track.append(face)
						frameFaces.remove(face)
						continue
				else:
					break
		if track == []:
			break
		elif len(track) > video_args.minTrack:
			frameNum    = np.array([ f['frame'] for f in track ])
			bboxes      = np.array([np.array(f['bbox']) for f in track])
			frameI      = np.arange(frameNum[0],frameNum[-1]+1)
			bboxesI    = []
			for ij in range(0,4):
				interpfn  = interp1d(frameNum, bboxes[:,ij])
				bboxesI.append(interpfn(frameI))
			bboxesI  = np.stack(bboxesI, axis=1)
			if max(np.mean(bboxesI[:,2]-bboxesI[:,0]), np.mean(bboxesI[:,3]-bboxesI[:,1])) > video_args.minFaceSize:
				tracks.append({'frame':frameI,'bbox':bboxesI})
	return tracks

def crop_video(video_args, track, cropFile):
	# CPU: crop the face clips
	flist = glob.glob(os.path.join(video_args.pyframesPath, '*.jpg')) # Read the frames
	flist.sort()
	vOut = cv2.VideoWriter(cropFile + 't.avi', cv2.VideoWriter_fourcc(*'XVID'), 25, (224,224))# Write video
	dets = {'x':[], 'y':[], 's':[]}
	for det in track['bbox']: # Read the tracks
		dets['s'].append(max((det[3]-det[1]), (det[2]-det[0]))/2) 
		dets['y'].append((det[1]+det[3])/2) # crop center x 
		dets['x'].append((det[0]+det[2])/2) # crop center y
	dets['s'] = signal.medfilt(dets['s'], kernel_size=13)  # Smooth detections 
	dets['x'] = signal.medfilt(dets['x'], kernel_size=13)
	dets['y'] = signal.medfilt(dets['y'], kernel_size=13)
	for fidx, frame in enumerate(track['frame']):
		cs  = video_args.cropScale
		bs  = dets['s'][fidx]   # Detection box size
		bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount 
		image = cv2.imread(flist[frame])
		frame = np.pad(image, ((bsi,bsi), (bsi,bsi), (0, 0)), 'constant', constant_values=(110, 110))
		my  = dets['y'][fidx] + bsi  # BBox center Y
		mx  = dets['x'][fidx] + bsi  # BBox center X
		face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))]
		vOut.write(cv2.resize(face, (224, 224)))
	audioTmp    = cropFile + '.wav'
	audioStart  = (track['frame'][0]) / 25
	audioEnd    = (track['frame'][-1]+1) / 25
	vOut.release()
	command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic" % \
		      (video_args.audioFilePath, video_args.nDataLoaderThread, audioStart, audioEnd, audioTmp)) 
	output = subprocess.call(command, shell=True, stdout=None) # Crop audio file
	_, audio = wavfile.read(audioTmp)
	command = ("ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic" % \
			  (cropFile, audioTmp, video_args.nDataLoaderThread, cropFile)) # Combine audio and video file
	output = subprocess.call(command, shell=True, stdout=None)
	os.remove(cropFile + 't.avi')
	return {'track':track, 'proc_track':dets}


def evaluate_network(files, video_args, args):

	est_sources = []
	for file in tqdm.tqdm(files, total = len(files)):

		fileName = os.path.splitext(file.split('/')[-1])[0] # Load audio and video
		audio, _ = sf.read(os.path.join(video_args.pycropPath, fileName + '.wav'), dtype='float32')

		video = cv2.VideoCapture(os.path.join(video_args.pycropPath, fileName + '.avi'))
		videoFeature = []
		while video.isOpened():
			ret, frames = video.read()
			if ret == True:
				face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
				face = cv2.resize(face, (224,224))
				face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))]
				videoFeature.append(face)
			else:
				break

		video.release()
		visual = np.array(videoFeature)/255.0
		visual = (visual - 0.4161)/0.1688

		length = int(audio.shape[0]/16000*25)
		if visual.shape[0] < length:
			visual = np.pad(visual, ((0,int(length - visual.shape[0])),(0,0),(0,0)), mode = 'edge')

		audio = np.expand_dims(audio, axis=0)
		visual = np.expand_dims(visual, axis=0)

		inputs = (audio, visual)
		est_source = decode_one_audio_AV_MossFormer2_TSE_16K(video_args.model, inputs, args)

		est_sources.append(est_source)

	return est_sources

def visualization(tracks, est_sources, video_args):
	# CPU: visulize the result for video format
	flist = glob.glob(os.path.join(video_args.pyframesPath, '*.jpg'))
	flist.sort()
	

	for idx, audio in enumerate(est_sources):
		max_value = np.max(np.abs(audio))
		if max_value >1:
			audio /= max_value
		sf.write(video_args.pycropPath +'/est_%s.wav' %idx, audio, 16000)

	for tidx, track in enumerate(tracks):
		faces = [[] for i in range(len(flist))]
		for fidx, frame in enumerate(track['track']['frame'].tolist()):
			faces[frame].append({'track':tidx, 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]})
	
		firstImage = cv2.imread(flist[0])
		fw = firstImage.shape[1]
		fh = firstImage.shape[0]
		vOut = cv2.VideoWriter(os.path.join(video_args.pyaviPath, 'video_only.avi'), cv2.VideoWriter_fourcc(*'XVID'), 25, (fw,fh))
		for fidx, fname in tqdm.tqdm(enumerate(flist), total = len(flist)):
			image = cv2.imread(fname)
			for face in faces[fidx]:
				cv2.rectangle(image, (int(face['x']-face['s']), int(face['y']-face['s'])), (int(face['x']+face['s']), int(face['y']+face['s'])),(0,255,0),10)
			vOut.write(image)
		vOut.release()

		command = ("ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic" % \
			(os.path.join(video_args.pyaviPath, 'video_only.avi'), (video_args.pycropPath +'/est_%s.wav' %tidx), \
			video_args.nDataLoaderThread, os.path.join(video_args.pyaviPath,'video_out_%s.avi'%tidx))) 
		output = subprocess.call(command, shell=True, stdout=None)




		command = "ffmpeg -i %s %s ;" % (
					os.path.join(video_args.pyaviPath, 'video_out_%s.avi' % tidx),
					os.path.join(video_args.pyaviPath, 'video_est_%s.mp4' % tidx)
				)
		command += f"rm {os.path.join(video_args.pyaviPath, 'video_out_%s.avi' % tidx)}"
		output = subprocess.call(command, shell=True, stdout=None)


	command = "ffmpeg -i %s %s ;" % (
				os.path.join(video_args.pyaviPath, 'video.avi'),
				os.path.join(video_args.pyaviPath, 'video_orig.mp4')
			)
	command += f"rm {os.path.join(video_args.pyaviPath, 'video_only.avi')} ;"
	command += f"rm {os.path.join(video_args.pyaviPath, 'video.avi')} ;"
	command += f"rm {os.path.join(video_args.pyaviPath, 'audio.wav')} ;"
	output = subprocess.call(command, shell=True, stdout=None)