# -*- coding: utf-8 -*-
# Author: Gaojian Wang@ZJUICSR
# --------------------------------------------------------
# This source code is licensed under the Attribution-NonCommercial 4.0 International License.
# You can find the license in the LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# pip uninstall nvidia_cublas_cu11
import sys
sys.path.append('..')
import os
os.system('pip install dlib')  # install dlib at runtime if it is not already available
import torch
import numpy as np
from PIL import Image
from torch.nn import functional as F
import gradio as gr
import models_vit
from util.datasets import build_dataset
import argparse
from engine_finetune import test_all
import cv2
import dlib
from huggingface_hub import hf_hub_download
# Directories (next to this script) for sampled frames and downloaded checkpoints
P = os.path.dirname(os.path.abspath(__file__))
FRAME_SAVE_PATH = os.path.join(P, 'frame')
CKPT_SAVE_PATH = os.path.join(P, 'checkpoints')
CKPT_LIST = ['DfD-Checkpoint_Fine-tuned_on_FF++',
'FAS-Checkpoint_Fine-tuned_on_MCIO']
CKPT_NAME = {'DfD-Checkpoint_Fine-tuned_on_FF++': 'finetuned_models/FF++_c23_32frames/checkpoint-min_val_loss.pth',
'FAS-Checkpoint_Fine-tuned_on_MCIO': 'finetuned_models/MCIO_protocol/Both_MCIO/checkpoint-min_val_loss.pth'}
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)
os.makedirs(CKPT_SAVE_PATH, exist_ok=True)
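# Argument parser kept from the MAE-style fine-tuning script; only the
# evaluation-related options (input size, normalization, batch size, num_workers,
# pin_mem, ...) matter for this demo, the rest stay at their training defaults.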
def get_args_parser():
parser = argparse.ArgumentParser('MAE fine-tuning for image classification', add_help=False)
parser.add_argument('--batch_size', default=64, type=int,
help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus)')
parser.add_argument('--epochs', default=50, type=int)
parser.add_argument('--accum_iter', default=1, type=int,
help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')
# Model parameters
parser.add_argument('--model', default='vit_large_patch16', type=str, metavar='MODEL',
help='Name of model to train')
parser.add_argument('--input_size', default=224, type=int,
help='images input size')
parser.add_argument('--normalize_from_IMN', action='store_true',
help='calculate mean and std from ImageNet; otherwise use those of the pre-training datasets')
parser.set_defaults(normalize_from_IMN=True)
parser.add_argument('--apply_simple_augment', action='store_true',
help='apply simple data augment')
parser.add_argument('--drop_path', type=float, default=0.1, metavar='PCT',
help='Drop path rate (default: 0.1)')
# Optimizer parameters
parser.add_argument('--clip_grad', type=float, default=None, metavar='NORM',
help='Clip gradient norm (default: None, no clipping)')
parser.add_argument('--weight_decay', type=float, default=0.05,
help='weight decay (default: 0.05)')
parser.add_argument('--lr', type=float, default=None, metavar='LR',
help='learning rate (absolute lr)')
parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
parser.add_argument('--layer_decay', type=float, default=0.75,
help='layer-wise lr decay from ELECTRA/BEiT')
parser.add_argument('--min_lr', type=float, default=1e-6, metavar='LR',
help='lower lr bound for cyclic schedulers that hit 0')
parser.add_argument('--warmup_epochs', type=int, default=5, metavar='N',
help='epochs to warmup LR')
# Augmentation parameters
parser.add_argument('--color_jitter', type=float, default=None, metavar='PCT',
help='Color jitter factor (enabled only when not using Auto/RandAug)')
parser.add_argument('--aa', type=str, default='rand-m9-mstd0.5-inc1', metavar='NAME',
help='Use AutoAugment policy. "v0" or "original" (default: rand-m9-mstd0.5-inc1)')
parser.add_argument('--smoothing', type=float, default=0.1,
help='Label smoothing (default: 0.1)')
# * Random Erase params
parser.add_argument('--reprob', type=float, default=0.25, metavar='PCT',
help='Random erase prob (default: 0.25)')
parser.add_argument('--remode', type=str, default='pixel',
help='Random erase mode (default: "pixel")')
parser.add_argument('--recount', type=int, default=1,
help='Random erase count (default: 1)')
parser.add_argument('--resplit', action='store_true', default=False,
help='Do not random erase first (clean) augmentation split')
# * Mixup params
parser.add_argument('--mixup', type=float, default=0,
help='mixup alpha, mixup enabled if > 0.')
parser.add_argument('--cutmix', type=float, default=0,
help='cutmix alpha, cutmix enabled if > 0.')
parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)')
parser.add_argument('--mixup_prob', type=float, default=1.0,
help='Probability of performing mixup or cutmix when either/both is enabled')
parser.add_argument('--mixup_switch_prob', type=float, default=0.5,
help='Probability of switching to cutmix when both mixup and cutmix enabled')
parser.add_argument('--mixup_mode', type=str, default='batch',
help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
# * Finetuning params
parser.add_argument('--finetune', default='',
help='finetune from checkpoint')
parser.add_argument('--global_pool', action='store_true')
parser.set_defaults(global_pool=True)
parser.add_argument('--cls_token', action='store_false', dest='global_pool',
help='Use class token instead of global pool for classification')
# Dataset parameters
parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str,
help='dataset path')
parser.add_argument('--nb_classes', default=1000, type=int,
help='number of the classification types')
parser.add_argument('--output_dir', default='',
help='path where to save, empty for no saving')
parser.add_argument('--log_dir', default='',
help='path where to tensorboard log')
parser.add_argument('--device', default='cuda',
help='device to use for training / testing')
parser.add_argument('--seed', default=0, type=int)
parser.add_argument('--resume', default='',
help='resume from checkpoint')
parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
help='start epoch')
parser.add_argument('--eval', action='store_true',
help='Perform evaluation only')
parser.set_defaults(eval=True)
parser.add_argument('--dist_eval', action='store_true', default=False,
help='Enable distributed evaluation (recommended during training for faster monitoring)')
parser.add_argument('--num_workers', default=10, type=int)
parser.add_argument('--pin_mem', action='store_true',
help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
parser.set_defaults(pin_mem=True)
# distributed training parameters
parser.add_argument('--world_size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--local_rank', default=-1, type=int)
parser.add_argument('--dist_on_itp', action='store_true')
parser.add_argument('--dist_url', default='env://',
help='url used to set up distributed training')
return parser
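# Parse the default arguments and adapt them for this demo: binary (real vs. fake)
# classification with a ViT-B/16 backbone (the --model default above is not used here).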
parser = get_args_parser()
args = parser.parse_args()
args.nb_classes = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models_vit.__dict__['vit_base_patch16'](
num_classes=args.nb_classes,
drop_path_rate=args.drop_path,
global_pool=args.global_pool,
)
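# Download (if missing) and load the selected checkpoint from the Hugging Face Hub.
# This assumes the checkpoints are hosted in the 'Wolowolo/fsfm-3c' repo under the
# relative paths listed in CKPT_NAME.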
def load_model(ckpt):
# ignore the placeholder entries in the dropdown
if ckpt in ('choose from here', 'continuously updating...'):
return gr.update()
args.resume = os.path.join(CKPT_SAVE_PATH, CKPT_NAME[ckpt])
if not os.path.isfile(args.resume):
hf_hub_download(repo_id='Wolowolo/fsfm-3c',
filename=CKPT_NAME[ckpt],
local_dir=CKPT_SAVE_PATH)
checkpoint = torch.load(args.resume, map_location='cpu')
model.load_state_dict(checkpoint['model'])
return gr.update()
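# Face detection / cropping helpers: dlib's frontal face detector plus the
# FF++-style enlarged square crop (e.g., a 100x120 px detection becomes a
# 156x156 px crop after the fixed 1.3x enlargement).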
def get_boundingbox(face, width, height, minsize=None):
"""
From FF++:
https://github.com/ondyari/FaceForensics/blob/master/classification/detect_from_video.py
Expects a dlib face to generate a quadratic bounding box.
:param face: dlib face class
:param width: frame width
:param height: frame height
:param minsize: set minimum bounding box size
(the box side length is enlarged by a fixed factor of 1.3 to cover a bigger face region)
:return: x, y, bounding_box_size in opencv form
"""
x1 = face.left()
y1 = face.top()
x2 = face.right()
y2 = face.bottom()
size_bb = int(max(x2 - x1, y2 - y1) * 1.3)
if minsize:
if size_bb < minsize:
size_bb = minsize
center_x, center_y = (x1 + x2) // 2, (y1 + y2) // 2
# Check for out of bounds, x-y top left corner
x1 = max(int(center_x - size_bb // 2), 0)
y1 = max(int(center_y - size_bb // 2), 0)
# Check for too big bb size for given x, y
size_bb = min(width - x1, size_bb)
size_bb = min(height - y1, size_bb)
return x1, y1, size_bb
def extract_face(frame):
face_detector = dlib.get_frontal_face_detector()
image = np.array(frame.convert('RGB'))
faces = face_detector(image, 1)
if len(faces) > 0:
# For now only take the first detected face
face = faces[0]
# Face crop and rescale(follow FF++)
x, y, size = get_boundingbox(face, image.shape[1], image.shape[0])
# Get the landmarks/parts for the face in box d only with the five key points
cropped_face = image[y:y + size, x:x + size]
# cropped_face = cv2.resize(cropped_face, (224, 224), interpolation=cv2.INTER_CUBIC)
return Image.fromarray(cropped_face)
else:
return None
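# Uniformly spaced frame indices over [0, total_frame_num - 1]; e.g., sampling
# 32 frames from a 100-frame video yields indices 0, 3, 6, ..., 99.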
def get_frame_index_uniform_sample(total_frame_num, extract_frame_num):
interval = np.linspace(0, total_frame_num - 1, num=extract_frame_num, dtype=int)
return interval.tolist()
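# Sample frames from the input video, crop a face from each, and save the
# 224x224 crops under <dst_path>/0/ (the single class sub-folder expected by build_dataset).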
def extract_face_from_fixed_num_frames(src_video, dst_path, num_frames=None, device='cpu'):
"""
1) extract specific num of frames from videos in [1st(index 0) frame, last frame] with uniform sample interval
2) extract face from frame with specific enlarge size
"""
video_capture = cv2.VideoCapture(src_video)
total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
# extract from the 1st (index 0) frame
if num_frames is not None:
frame_indices = get_frame_index_uniform_sample(total_frames, num_frames)
else:
frame_indices = range(total_frames)
for frame_index in frame_indices:
video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
ret, frame = video_capture.read()
# skip unreadable frames before touching the frame data
if not ret:
continue
image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
img = extract_face(image)
if img is None:
continue
img = img.resize((224, 224), Image.BICUBIC)
save_img_name = f"frame_{frame_index}.png"
img.save(os.path.join(dst_path, '0', save_img_name))
video_capture.release()
return frame_indices
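# Video detection: uniformly sample up to 32 frames, crop faces, run the selected
# fine-tuned model over the crops, and report frame-level and averaged video-level
# real-face probabilities.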
def FSFM3C_video_detection(video):
model.to(device)
# extract frames
num_frames = 32
files = os.listdir(FRAME_SAVE_PATH)
num_files = len(files)
frame_path = os.path.join(FRAME_SAVE_PATH, str(num_files))
os.makedirs(frame_path, exist_ok=True)
os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
frame_indices = extract_face_from_fixed_num_frames(video, frame_path, num_frames=num_frames, device=device)
args.data_path = frame_path
args.batch_size = 32
dataset_val = build_dataset(is_train=False, args=args)
sampler_val = torch.utils.data.SequentialSampler(dataset_val)
data_loader_val = torch.utils.data.DataLoader(
dataset_val, sampler=sampler_val,
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=args.pin_mem,
drop_last=False
)
frame_preds_list, video_pred_list = test_all(data_loader_val, model, device)
real_prob_frames = [round(1. - fake_score, 2) for fake_score in video_pred_list]
frame_results = {f"frame_{frame}": f"{int(real_prob_frames[i] * 100)}%" for i, frame in enumerate(frame_indices)}
real_prob_video = int(round(1. - (sum(video_pred_list) / len(video_pred_list)), 2) * 100)
if real_prob_video > 50:
result_message = "real"
else:
result_message = "fake"
video_results = (f"The face in this video may be {result_message}, "
f"and the video-level real_face_probability is {real_prob_video}% \n"
f"The frame-level detection results ['sampled_frame_index': 'real_face_probability']: \n"
f"{frame_results} \n")
return video_results
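# Image detection: crop a single face from the uploaded image, run the selected
# fine-tuned model on it, and report the real-face probability.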
def FSFM3C_image_detection(image):
model.to(device)
files = os.listdir(FRAME_SAVE_PATH)
num_files = len(files)
frame_path = os.path.join(FRAME_SAVE_PATH, str(num_files))
os.makedirs(frame_path, exist_ok=True)
os.makedirs(os.path.join(frame_path, '0'), exist_ok=True)
save_img_name = "frame_0.png"
img = extract_face(image)
if img is None:
return 'Invalid Input: no face detected in the image'
img = img.resize((224, 224), Image.BICUBIC)
img.save(os.path.join(frame_path, '0', save_img_name))
args.data_path = frame_path
args.batch_size = 1
dataset_val = build_dataset(is_train=False, args=args)
sampler_val = torch.utils.data.SequentialSampler(dataset_val)
data_loader_val = torch.utils.data.DataLoader(
dataset_val, sampler=sampler_val,
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=args.pin_mem,
drop_last=False
)
frame_preds_list, video_pred_list = test_all(data_loader_val, model, device)
real_prob_image = int(round(1. - (sum(video_pred_list) / len(video_pred_list)), 2) * 100)
if real_prob_image > 50:
result_message = "real"
else:
result_message = "fake"
image_results = (f"The face in this image may be {result_message},"
f"and the real_face_probability is {real_prob_image}%")
return image_results
# WebUI
with gr.Blocks() as demo:
gr.HTML(
"<h1 style='text-align: center;'>🦱 Real Facial Image&Video Detection <br> Against Face Forgery and Spoofing (Deepfake/Diffusion/Presentation-attacks)</h1>")
gr.Markdown("### ---Powered by the fine-tuned model that is pre-trained from [FSFM-3C](https://fsfm-3c.github.io/)")
gr.Markdown("### Release:")
gr.Markdown("- <b>V1.0 [2024-12] (Current):</b> "
"Create this page with basic detectors (simply fine-tuned models) that follow the paper implementation. "
"<b>Notes:</b> Performance is limited because no any optimization of data, models, hyperparameters, etc. is done for downstream tasks. <br> "
"<b>[TODO]: </b> Update practical models, and optimized interfaces, and provide more functions such as visualizations, a unified detector, and multi-modal diagnosis.")
gr.Markdown(
"> Please provide an <b>image</b> or a <b>video (<100s </b>, default to uniform sampling 32 frames)</b> and <b>select the model</b> for detection. <br>"
"- <b>DfD-Checkpoint_Fine-tuned_on_FF++</b> for deepfake detection, FSFM VIT-B fine-tuned on the FF++_c23 dataset (train&val sets of 4 manipulations, 32 frames per video) <br>"
"- <b>FAS-Checkpoint_Fine-tuned_on_MCIO</b> for face anti-spoofing, FSFM VIT-B fine-tuned on the MCIO datasets (2 frames per video) ")
with gr.Column():
ckpt_select_dropdown = gr.Dropdown(
label="Select the Model Checkpoint for Detection (🖱️ below)",
choices=['choose from here'] + CKPT_LIST + ['continuously updating...'],
multiselect=False,
value='choose from here',
interactive=True,
)
with gr.Row(elem_classes="center-align"):
with gr.Column(scale=5):
gr.Markdown(
"## Image Detection"
)
image = gr.Image(label="Upload/Capture/Paste your image", type="pil")
image_submit_btn = gr.Button("Submit")
output_results_image = gr.Textbox(label="Detection Result")
with gr.Column(scale=5):
gr.Markdown(
"## Video Detection"
)
video = gr.Video(label="Upload/Capture your video")
video_submit_btn = gr.Button("Submit")
output_results_video = gr.Textbox(label="Detection Result")
image_submit_btn.click(
fn=FSFM3C_image_detection,
inputs=[image],
outputs=[output_results_image],
)
video_submit_btn.click(
fn=FSFM3C_video_detection,
inputs=[video],
outputs=[output_results_video],
)
ckpt_select_dropdown.change(
fn=load_model,
inputs=[ckpt_select_dropdown],
outputs=[ckpt_select_dropdown],
)
if __name__ == "__main__":
gr.close_all()
demo.queue()
demo.launch()