|
import cv2
import numpy as np
import torch
from skimage import filters
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_accuracy(arr1, arr2):
    """Pixel accuracy: the fraction of pixels that match between two arrays.

    Args:
        arr1 (np.array): first array, e.g. the prediction
        arr2 (np.array): second array, e.g. the target
    """
    return (arr1 == arr2).sum() / arr1.size
|
|
|
|
|
def trimap(pred_im, gt_im, thickness=8):
    """Compute the pixel accuracy restricted to a band of `thickness` pixels
    around the contours of the ground truth, for binary images (0-1 values).

    Args:
        pred_im (Image): Prediction
        gt_im (Image): Target
        thickness (int, optional): Thickness of the contour band, in pixels.
            Defaults to 8.
    """
    W, H = gt_im.size
    contours, hierarchy = cv2.findContours(
        np.array(gt_im), mode=cv2.RETR_TREE, method=cv2.CHAIN_APPROX_SIMPLE
    )
    mask_contour = np.zeros((H, W), dtype=np.int32)
    cv2.drawContours(
        mask_contour, contours, -1, 1, thickness=thickness, hierarchy=hierarchy
    )
    gt_contour = np.array(gt_im)[np.where(mask_contour > 0)]
    pred_contour = np.array(pred_im)[np.where(mask_contour > 0)]
    return get_accuracy(pred_contour, gt_contour)
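
# Example (a sketch, not from the original file): `trimap` expects PIL Images
# holding binary 0/1 masks; "mask.png" and "gt.png" are hypothetical paths.
#
#   from PIL import Image
#   pred_im = Image.open("mask.png").convert("L").point(lambda p: int(p > 127))
#   gt_im = Image.open("gt.png").convert("L").point(lambda p: int(p > 127))
#   print(trimap(pred_im, gt_im, thickness=8))  # accuracy near GT contours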
|
|
|
|
|
def iou(pred_im, gt_im):
    """
    IoU (intersection over union) for binary masks (0-1 values)

    Args:
        pred_im (np.array or Image): Prediction
        gt_im (np.array or Image): Target
    """
    pred = np.array(pred_im)
    gt = np.array(gt_im)
    intersection = (pred * gt).sum()
    union = (pred + gt).sum() - intersection
    return intersection / union
|
|
|
|
|
def f1_score(pred_im, gt_im):
    """
    F1 (Dice) score for binary masks (0-1 values):
    2 * |intersection| / (|pred| + |gt|)
    """
    pred = np.array(pred_im)
    gt = np.array(gt_im)
    intersection = (pred * gt).sum()
    return 2 * intersection / (pred + gt).sum()
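
# Example (sketch): for binary numpy masks, iou and f1_score relate as
# f1 = 2 * iou / (1 + iou); e.g. two half-overlapping masks:
#
#   a = np.array([[1, 1, 0, 0]])
#   b = np.array([[0, 1, 1, 0]])
#   iou(a, b)       # 1 / 3
#   f1_score(a, b)  # 1 / 2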
|
|
|
|
|
def accuracy(pred_im, gt_im):
    pred = np.array(pred_im)
    gt = np.array(gt_im)
    # Squeeze a singleton channel dimension out of an N x 1 x H x W target
    if len(gt.shape) == 4:
        assert gt.shape[1] == 1
        gt = gt[:, 0, :, :]
    # If the prediction still carries a class dimension, take the argmax
    if len(pred.shape) > len(gt.shape):
        pred = np.argmax(pred, axis=1)
    return float((pred == gt).sum()) / gt.size
|
|
|
|
|
def mIOU(pred, label, average="macro"):
    """
    Adapted from:
    https://stackoverflow.com/questions/62461379/multiclass-semantic-segmentation-model-evaluation

    Compute the mean IoU from pred and label tensors.
    pred is an N x C x H x W tensor with logits (argmax is applied over C)
    and label is an N x H x W tensor with integer labels per pixel.

    This does the same as sklearn's jaccard_score function if you choose
    average="macro".

    Args:
        pred (torch.tensor): predicted logits
        label (torch.tensor): labels
        average (str): "macro" or "weighted"

    Returns:
        float: mIoU, can be nan
    """
    num_classes = pred.shape[-3]

    # Argmax over the class dimension yields an N x H x W prediction map
    pred = torch.argmax(pred, dim=1)
    present_iou_list = list()
    pred = pred.view(-1)
    label = label.view(-1)

    # For binary problems, only the positive class present in the label
    # contributes; otherwise all classes are considered
    interesting_classes = (
        [*range(num_classes)] if num_classes > 2 else [int(label.max().item())]
    )
    weights = []

    for sem_class in interesting_classes:
        pred_inds = pred == sem_class
        target_inds = label == sem_class
        if (target_inds.long().sum().item() > 0) or (pred_inds.long().sum().item() > 0):
            intersection_now = (pred_inds[target_inds]).long().sum().item()
            union_now = (
                pred_inds.long().sum().item()
                + target_inds.long().sum().item()
                - intersection_now
            )
            weights.append(pred_inds.long().sum().item())
            iou_now = float(intersection_now) / float(union_now)
            present_iou_list.append(iou_now)
    if not present_iou_list:
        return float("nan")
    elif average == "weighted":
        weighted_avg = np.sum(np.multiply(weights, present_iou_list) / np.sum(weights))
        return weighted_avg
    else:
        return np.mean(present_iou_list)
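
# Example (sketch): random logits for a 3-class problem. Shapes follow the
# docstring above; values are illustrative only.
#
#   logits = torch.randn(2, 3, 8, 8)          # N x C x H x W
#   target = torch.randint(0, 3, (2, 8, 8))   # N x H x W
#   print(mIOU(logits, target, average="macro"))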
|
|
|
|
|
def masker_classification_metrics(
    pred, label, labels_dict={"cannot": 0, "must": 1, "may": 2}
):
    """
    Classification metrics for the masker, and the corresponding maps. If the
    predictions are soft, the errors are weighted accordingly. Metrics computed:

    tpr : float
        True positive rate

    tpt : float
        True positive total (divided by total population)

    tnr : float
        True negative rate

    tnt : float
        True negative total (divided by total population)

    fpr : float
        False positive rate: rate of predicted mask on cannot flood

    fpt : float
        False positive total (divided by total population)

    fnr : float
        False negative rate: rate of missed mask on must flood

    fnt : float
        False negative total (divided by total population)

    mnr : float
        "May" negative rate (labeled as "may", predicted as no-mask)

    mpr : float
        "May" positive rate (labeled as "may", predicted as mask)

    accuracy : float
        Accuracy

    error : float
        Error

    precision : float
        Precision, considering only cannot and must flood labels

    f05 : float
        F0.5 score, considering only cannot and must flood labels

    accuracy_must_may : float
        Accuracy considering only the must and may areas

    Parameters
    ----------
    pred : array-like
        Mask prediction

    label : array-like
        Mask ground truth labels

    labels_dict : dict
        A dictionary with the identifier of each class (cannot, must, may)

    Returns
    -------
    metrics_dict : dict
        A dictionary with metric name and value pairs

    maps_dict : dict
        A dictionary containing the metric maps
    """
    tp_map = pred * np.asarray(label == labels_dict["must"], dtype=int)
    tpr = np.sum(tp_map) / np.sum(label == labels_dict["must"])
    tpt = np.sum(tp_map) / np.prod(label.shape)
    tn_map = (1.0 - pred) * np.asarray(label == labels_dict["cannot"], dtype=int)
    tnr = np.sum(tn_map) / np.sum(label == labels_dict["cannot"])
    tnt = np.sum(tn_map) / np.prod(label.shape)
    fp_map = pred * np.asarray(label == labels_dict["cannot"], dtype=int)
    fpr = np.sum(fp_map) / np.sum(label == labels_dict["cannot"])
    fpt = np.sum(fp_map) / np.prod(label.shape)
    fn_map = (1.0 - pred) * np.asarray(label == labels_dict["must"], dtype=int)
    fnr = np.sum(fn_map) / np.sum(label == labels_dict["must"])
    fnt = np.sum(fn_map) / np.prod(label.shape)
    may_neg_map = (1.0 - pred) * np.asarray(label == labels_dict["may"], dtype=int)
    may_pos_map = pred * np.asarray(label == labels_dict["may"], dtype=int)
    mnr = np.sum(may_neg_map) / np.sum(label == labels_dict["may"])
    mpr = np.sum(may_pos_map) / np.sum(label == labels_dict["may"])
    accuracy = tpt + tnt
    error = fpt + fnt

    # Sanity checks: complementary rates must sum to 1
    assert np.isclose(tpr, 1.0 - fnr), "TPR: {:.4f}, FNR: {:.4f}".format(tpr, fnr)
    assert np.isclose(tnr, 1.0 - fpr), "TNR: {:.4f}, FPR: {:.4f}".format(tnr, fpr)
    assert np.isclose(mpr, 1.0 - mnr), "MPR: {:.4f}, MNR: {:.4f}".format(mpr, mnr)

    precision = np.sum(tp_map) / (np.sum(tp_map) + np.sum(fp_map) + 1e-9)
    beta = 0.5
    f05 = ((1 + beta ** 2) * precision * tpr) / (beta ** 2 * precision + tpr + 1e-9)
    accuracy_must_may = (np.sum(tp_map) + np.sum(may_neg_map)) / (
        np.sum(label == labels_dict["must"]) + np.sum(label == labels_dict["may"])
    )

    metrics_dict = {
        "tpr": tpr,
        "tpt": tpt,
        "tnr": tnr,
        "tnt": tnt,
        "fpr": fpr,
        "fpt": fpt,
        "fnr": fnr,
        "fnt": fnt,
        "mpr": mpr,
        "mnr": mnr,
        "accuracy": accuracy,
        "error": error,
        "precision": precision,
        "f05": f05,
        "accuracy_must_may": accuracy_must_may,
    }
    maps_dict = {
        "tp": tp_map,
        "tn": tn_map,
        "fp": fp_map,
        "fn": fn_map,
        "may_pos": may_pos_map,
        "may_neg": may_neg_map,
    }

    return metrics_dict, maps_dict
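
# Example (sketch): `pred` is a soft mask in [0, 1] and `label` holds the
# class ids from `labels_dict`; shapes and values here are illustrative only.
#
#   pred = np.random.rand(256, 256)
#   label = np.random.randint(0, 3, (256, 256))
#   metrics, maps = masker_classification_metrics(pred, label)
#   print(metrics["f05"], metrics["accuracy_must_may"])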
|
|
|
|
|
def pred_cannot(pred, label, label_cannot=0):
    """
    Metric for the masker: Computes false positive rate and its map. If the
    predictions are soft, the errors are weighted accordingly.

    Parameters
    ----------
    pred : array-like
        Mask prediction

    label : array-like
        Mask ground truth labels

    label_cannot : int
        The label index of "cannot flood"

    Returns
    -------
    fp_map : array-like
        The map of false positives: predicted mask on cannot flood

    fpr : float
        False positive rate: rate of predicted mask on cannot flood
    """
    fp_map = pred * np.asarray(label == label_cannot, dtype=int)
    fpr = np.sum(fp_map) / np.sum(label == label_cannot)
    return fp_map, fpr
|
|
|
|
|
def missed_must(pred, label, label_must=1):
    """
    Metric for the masker: Computes false negative rate and its map. If the
    predictions are soft, the errors are weighted accordingly.

    Parameters
    ----------
    pred : array-like
        Mask prediction

    label : array-like
        Mask ground truth labels

    label_must : int
        The label index of "must flood"

    Returns
    -------
    fn_map : array-like
        The map of false negatives: missed mask on must flood

    fnr : float
        False negative rate: rate of missed mask on must flood
    """
    fn_map = (1.0 - pred) * np.asarray(label == label_must, dtype=int)
    fnr = np.sum(fn_map) / np.sum(label == label_must)
    return fn_map, fnr
|
|
|
|
|
def may_flood(pred, label, label_may=2):
    """
    Metric for the masker: Computes "may" negative and "may" positive rates and
    their maps. If the predictions are soft, the "errors" are weighted
    accordingly.

    Parameters
    ----------
    pred : array-like
        Mask prediction

    label : array-like
        Mask ground truth labels

    label_may : int
        The label index of "may flood"

    Returns
    -------
    may_neg_map : array-like
        The map of "may" negatives

    may_pos_map : array-like
        The map of "may" positives

    mnr : float
        "May" negative rate

    mpr : float
        "May" positive rate
    """
    may_neg_map = (1.0 - pred) * np.asarray(label == label_may, dtype=int)
    may_pos_map = pred * np.asarray(label == label_may, dtype=int)
    mnr = np.sum(may_neg_map) / np.sum(label == label_may)
    mpr = np.sum(may_pos_map) / np.sum(label == label_may)
    return may_neg_map, may_pos_map, mnr, mpr
|
|
|
|
|
def masker_metrics(pred, label, label_cannot=0, label_must=1):
    """
    Computes a set of metrics for the masker

    Parameters
    ----------
    pred : array-like
        Mask prediction

    label : array-like
        Mask ground truth labels

    label_must : int
        The label index of "must flood"

    label_cannot : int
        The label index of "cannot flood"

    Returns
    -------
    tpr : float
        True positive rate

    tnr : float
        True negative rate

    precision : float
        Precision, considering only cannot and must flood labels

    f1 : float
        F1 score, considering only cannot and must flood labels
    """
    tp_map = pred * np.asarray(label == label_must, dtype=int)
    tpr = np.sum(tp_map) / np.sum(label == label_must)
    tn_map = (1.0 - pred) * np.asarray(label == label_cannot, dtype=int)
    tnr = np.sum(tn_map) / np.sum(label == label_cannot)
    fp_map = pred * np.asarray(label == label_cannot, dtype=int)
    fn_map = (1.0 - pred) * np.asarray(label == label_must, dtype=int)
    # Small epsilon guards against division by zero when there are no positive
    # predictions, as in masker_classification_metrics above
    precision = np.sum(tp_map) / (np.sum(tp_map) + np.sum(fp_map) + 1e-9)
    f1 = 2 * (precision * tpr) / (precision + tpr + 1e-9)
    return tpr, tnr, precision, f1
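
# Example (sketch): hard 0/1 predictions against a cannot/must label map;
# shapes and values are illustrative only.
#
#   pred = (np.random.rand(128, 128) > 0.5).astype(float)
#   label = np.random.randint(0, 2, (128, 128))
#   tpr, tnr, precision, f1 = masker_metrics(pred, label)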
|
|
|
|
|
def get_confusion_matrix(tpr, tnr, fpr, fnr, mpr, mnr):
    """
    Constructs the confusion matrix of a masker prediction over a set of samples

    Parameters
    ----------
    tpr : vector-like
        True positive rate

    tnr : vector-like
        True negative rate

    fpr : vector-like
        False positive rate

    fnr : vector-like
        False negative rate

    mpr : vector-like
        "May" positive rate

    mnr : vector-like
        "May" negative rate

    Returns
    -------
    confusion_matrix : 3x3 array
        Confusion matrix: [i, j] = [pred, true]
            | tnr  fnr  mnr |
            | fpr  tpr  mpr |
            | 0.   0.   0.  |

    confusion_matrix_std : 3x3 array
        Standard deviation of the confusion matrix
    """
    tpr_m = np.mean(tpr)
    tpr_s = np.std(tpr)
    tnr_m = np.mean(tnr)
    tnr_s = np.std(tnr)
    fpr_m = np.mean(fpr)
    fpr_s = np.std(fpr)
    fnr_m = np.mean(fnr)
    fnr_s = np.std(fnr)
    mpr_m = np.mean(mpr)
    mpr_s = np.std(mpr)
    mnr_m = np.mean(mnr)
    mnr_s = np.std(mnr)

    # Sanity checks: complementary mean rates must sum to 1
    assert np.isclose(tpr_m, 1.0 - fnr_m), "TPR: {:.4f}, FNR: {:.4f}".format(
        tpr_m, fnr_m
    )
    assert np.isclose(tnr_m, 1.0 - fpr_m), "TNR: {:.4f}, FPR: {:.4f}".format(
        tnr_m, fpr_m
    )
    assert np.isclose(mpr_m, 1.0 - mnr_m), "MPR: {:.4f}, MNR: {:.4f}".format(
        mpr_m, mnr_m
    )

    confusion_matrix = np.zeros((3, 3))
    confusion_matrix[0, 0] = tnr_m
    confusion_matrix[0, 1] = fnr_m
    confusion_matrix[0, 2] = mnr_m
    confusion_matrix[1, 0] = fpr_m
    confusion_matrix[1, 1] = tpr_m
    confusion_matrix[1, 2] = mpr_m
    confusion_matrix[2, 2] = 0.0

    confusion_matrix_std = np.zeros((3, 3))
    confusion_matrix_std[0, 0] = tnr_s
    confusion_matrix_std[0, 1] = fnr_s
    confusion_matrix_std[0, 2] = mnr_s
    confusion_matrix_std[1, 0] = fpr_s
    confusion_matrix_std[1, 1] = tpr_s
    confusion_matrix_std[1, 2] = mpr_s
    confusion_matrix_std[2, 2] = 0.0
    return confusion_matrix, confusion_matrix_std
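
# Example (sketch): visualize the mean confusion matrix with seaborn, given
# per-sample rate vectors (names here are illustrative only).
#
#   cm, cm_std = get_confusion_matrix(tprs, tnrs, fprs, fnrs, mprs, mnrs)
#   sns.heatmap(cm, annot=True, fmt=".2f", cmap="Blues")
#   plt.show()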
|
|
|
|
|
def edges_coherence_std_min(pred, label, label_must=1, bin_th=0.5):
    """
    The standard deviation of the minimum distance between the edge of the
    prediction and the edge of the "must flood" label.

    Parameters
    ----------
    pred : array-like
        Mask prediction

    label : array-like
        Mask ground truth labels

    label_must : int
        The label index of "must flood"

    bin_th : float
        The threshold for the binarization of the prediction

    Returns
    -------
    metric : float
        The value of the metric

    pred_edge : array-like
        The edges image of the prediction, for visualization

    label_edge : array-like
        The edges image of the "must flood" label, for visualization
    """
    # Binarize the label: 1 on "must flood", 0 elsewhere. This works for any
    # label_must value and leaves the caller's array untouched.
    label = np.asarray(deepcopy(label) == label_must, dtype=float)

    # Binarize the prediction
    pred = np.asarray(pred > bin_th, dtype=float)

    # Compute the edge images with a Sobel filter
    pred = filters.sobel(pred)
    label = filters.sobel(label)

    # Coordinates of the edge pixels
    pred_coord = np.argwhere(pred > 0)
    label_coord = np.argwhere(label > 0)

    # Maximum penalty if either image has no edges at all
    if pred_coord.shape[0] == 0 or label_coord.shape[0] == 0:
        return 1.0, pred, label

    # Pairwise distances between edge pixels, normalized by the image height
    dist_mat = np.divide(euclidean_distances(pred_coord, label_coord), pred.shape[0])

    # Std of the per-prediction-pixel minimum distance to a label edge
    edge_coherence = np.std(np.min(dist_mat, axis=1))

    return edge_coherence, pred, label
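
# Example (sketch): soft prediction vs. a cannot/must/may label map; shapes
# and values are illustrative only.
#
#   pred = np.random.rand(64, 64)
#   label = np.random.randint(0, 3, (64, 64))
#   coherence, pred_edge, label_edge = edges_coherence_std_min(pred, label)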
|
|
|
|
|
def boxplot_metric(
    output_filename,
    df,
    metric,
    dict_metrics,
    do_stripplot=False,
    dict_models=None,
    dpi=300,
    **snskwargs
):
    """Saves a boxplot (optionally overlaid with a stripplot) of `metric` per
    model, from a long-format DataFrame with "model" and metric columns."""
    f = plt.figure(dpi=dpi)

    if do_stripplot:
        ax = sns.boxplot(x="model", y=metric, data=df, fliersize=0.0, **snskwargs)
        ax = sns.stripplot(
            x="model", y=metric, data=df, size=2.0, color="gray", **snskwargs
        )
    else:
        ax = sns.boxplot(x="model", y=metric, data=df, **snskwargs)

    # Set axis labels
    ax.set_xlabel("Models", rotation=0, fontsize="medium")
    ax.set_ylabel(dict_metrics[metric], rotation=90, fontsize="medium")

    sns.despine(left=True, bottom=True)

    # Change xticklabels to the display names of the models
    if dict_models:
        xticklabels = [dict_models[t.get_text()] for t in ax.get_xticklabels()]
        ax.set_xticklabels(
            xticklabels,
            rotation=20,
            verticalalignment="top",
            horizontalalignment="right",
            fontsize="xx-small",
        )

    f.savefig(
        output_filename,
        dpi=f.dpi,
        bbox_inches="tight",
        facecolor="white",
        transparent=False,
    )
    f.clear()
    plt.close(f)
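
# Example (sketch): a long-format pandas DataFrame with one row per
# (model, image) pair; column and file names are illustrative only.
#
#   import pandas as pd
#   df = pd.DataFrame(
#       {"model": ["a", "a", "b", "b"], "iou": [0.5, 0.6, 0.7, 0.8]}
#   )
#   boxplot_metric("iou_boxplot.png", df, "iou", {"iou": "IoU"})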
|
|
|
|
|
def clustermap_metric(
    output_filename,
    df,
    metric,
    dict_metrics,
    method="average",
    cluster_metric="euclidean",
    dict_models=None,
    dpi=300,
    **snskwargs
):
    """Saves a seaborn clustermap of `metric` over a models-by-images
    DataFrame, clustering with the given linkage method and distance metric."""
    ax_grid = sns.clustermap(data=df, method=method, metric=cluster_metric, **snskwargs)
    ax_heatmap = ax_grid.ax_heatmap
    ax_cbar = ax_grid.ax_cbar

    # Set axis labels
    ax_heatmap.set_xlabel("Models", rotation=0, fontsize="medium")
    ax_heatmap.set_ylabel("Images", rotation=90, fontsize="medium")

    # Title the colorbar with the display name of the metric
    ax_cbar.set_title(dict_metrics[metric], rotation=0, fontsize="x-large")

    # Change xticklabels to the display names of the models
    if dict_models:
        xticklabels = [dict_models[t.get_text()] for t in ax_heatmap.get_xticklabels()]
        ax_heatmap.set_xticklabels(
            xticklabels,
            rotation=20,
            verticalalignment="top",
            horizontalalignment="right",
            fontsize="small",
        )

    ax_grid.fig.savefig(
        output_filename,
        dpi=dpi,
        bbox_inches="tight",
        facecolor="white",
        transparent=False,
    )
    ax_grid.fig.clear()
    plt.close(ax_grid.fig)
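
# Example (sketch): rows are images, columns are models, cells hold the
# metric value; names are illustrative only.
#
#   import pandas as pd
#   df = pd.DataFrame(
#       np.random.rand(10, 3), columns=["model_a", "model_b", "model_c"]
#   )
#   clustermap_metric("iou_clustermap.png", df, "iou", {"iou": "IoU"})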
|
|