diff --git a/README.md b/README.md
deleted file mode 100644
index ad5cb58c756fd70913b30639641c8159330f2ed1..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
----
-title: CatVTON
-emoji: 馃憖
-colorFrom: gray
-colorTo: blue
-sdk: gradio
-sdk_version: 4.40.0
-app_file: app.py
-pinned: false
-license: cc-by-nc-sa-4.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/__pycache__/utils.cpython-39.pyc b/__pycache__/utils.cpython-39.pyc
index c74e08e5c75101b25900fe823a294cb9569b7643..6e064fcdc30055bbb94cc9af4caf1803b711c741 100644
Binary files a/__pycache__/utils.cpython-39.pyc and b/__pycache__/utils.cpython-39.pyc differ
diff --git a/app.py b/app.py
index fb3ad3a66902a58a9ffc54fac423b635abe089be..eee21897bf47c655c65c95094e673e94da499abc 100644
--- a/app.py
+++ b/app.py
@@ -1,10 +1,7 @@
import argparse
import os
-os.environ['CUDA_HOME'] = '/usr/local/cuda'
-os.environ['PATH'] = os.environ['PATH'] + ':/usr/local/cuda/bin'
-
from datetime import datetime
-import spaces
+
import gradio as gr
import numpy as np
import torch
@@ -12,7 +9,7 @@ from diffusers.image_processor import VaeImageProcessor
from huggingface_hub import snapshot_download
from PIL import Image
-from model.cloth_masker import AutoMaskerSeg, vis_mask
+from model.cloth_masker import AutoMasker, vis_mask
from model.pipeline import CatVTONPipeline
from utils import init_weight_dtype, resize_and_crop, resize_and_padding
@@ -85,12 +82,6 @@ def parse_args():
" flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
),
)
- # parser.add_argument(
- # "--enable_condition_noise",
- # action="store_true",
- # default=True,
- # help="Whether or not to enable condition noise.",
- # )
args = parser.parse_args()
env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -123,13 +114,13 @@ pipeline = CatVTONPipeline(
)
# AutoMasker
mask_processor = VaeImageProcessor(vae_scale_factor=8, do_normalize=False, do_binarize=True, do_convert_grayscale=True)
-automasker = AutoMaskerSeg(
+automasker = AutoMasker(
densepose_ckpt=os.path.join(repo_path, "DensePose"),
- segformer_ckpt="mattmdjaga/segformer_b2_clothes",
+ schp_ckpt=os.path.join(repo_path, "SCHP"),
device='cuda',
)
-@spaces.GPU
+
def submit_function(
person_image,
cloth_image,
@@ -238,12 +229,9 @@ HEADER = """
-
-· Thanks to ZeroGPU for providing A100 for this demo.
-· To adapt to ZeroGPU, we replace SCHP with SegFormer which may result in differences from our own demo.
· This demo and our weights are only open for **Non-commercial Use**.
-· SafetyChecker is set to filter NSFW content, but it may block normal results too. Please adjust the `seed` for normal outcomes.
-
+· SafetyChecker is set to filter NSFW content, but it may block normal results too. Please adjust the `seed` for normal outcomes.
+· Thanks to ZeroGPU for providing the GPU for our HuggingFace Space.
"""
def app_gradio():
diff --git a/densepose/__init__.py b/densepose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fc9e977ed3174e244414378dd85d48ea02e635e
--- /dev/null
+++ b/densepose/__init__.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from .data.datasets import builtin # just to register data
+from .converters import builtin as builtin_converters # register converters
+from .config import (
+ add_densepose_config,
+ add_densepose_head_config,
+ add_hrnet_config,
+ add_dataset_category_config,
+ add_bootstrap_config,
+ load_bootstrap_config,
+)
+from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+from .evaluation import DensePoseCOCOEvaluator
+from .modeling.roi_heads import DensePoseROIHeads
+from .modeling.test_time_augmentation import (
+ DensePoseGeneralizedRCNNWithTTA,
+ DensePoseDatasetMapperTTA,
+)
+from .utils.transform import load_from_cfg
+from .modeling.hrfpn import build_hrfpn_backbone
diff --git a/densepose/__pycache__/__init__.cpython-39.pyc b/densepose/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a15ca8241664ac683ccd91f487b8ecdcf3a3e7bf
Binary files /dev/null and b/densepose/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/__pycache__/config.cpython-39.pyc b/densepose/__pycache__/config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4a11d27b90c0a05aa04569af23e8ce3a0fab50d
Binary files /dev/null and b/densepose/__pycache__/config.cpython-39.pyc differ
diff --git a/densepose/config.py b/densepose/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a06a09c80865ab987773511b2acc71e232b26ac
--- /dev/null
+++ b/densepose/config.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# pyre-ignore-all-errors
+
+from detectron2.config import CfgNode as CN
+
+
+def add_dataset_category_config(cfg: CN) -> None:
+ """
+ Add config for additional category-related dataset options
+ - category whitelisting
+ - category mapping
+ """
+ _C = cfg
+ _C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True)
+ _C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True)
+ # class to mesh mapping
+ _C.DATASETS.CLASS_TO_MESH_NAME_MAPPING = CN(new_allowed=True)
+
+
+def add_evaluation_config(cfg: CN) -> None:
+ _C = cfg
+ _C.DENSEPOSE_EVALUATION = CN()
+ # evaluator type, possible values:
+ # - "iou": evaluator for models that produce iou data
+ # - "cse": evaluator for models that produce cse data
+ _C.DENSEPOSE_EVALUATION.TYPE = "iou"
+ # storage for DensePose results, possible values:
+ # - "none": no explicit storage, all the results are stored in the
+ # dictionary with predictions, memory intensive;
+ # historically the default storage type
+ # - "ram": RAM storage, uses per-process RAM storage, which is
+ # reduced to a single process storage on later stages,
+ # less memory intensive
+ # - "file": file storage, uses per-process file-based storage,
+ # the least memory intensive, but may create bottlenecks
+ # on file system accesses
+ _C.DENSEPOSE_EVALUATION.STORAGE = "none"
+    # minimum threshold for IOU values: the lower its value is,
+ # the more matches are produced (and the higher the AP score)
+ _C.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD = 0.5
+ # Non-distributed inference is slower (at inference time) but can avoid RAM OOM
+ _C.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE = True
+ # evaluate mesh alignment based on vertex embeddings, only makes sense in CSE context
+ _C.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT = False
+ # meshes to compute mesh alignment for
+ _C.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES = []
+
+
+def add_bootstrap_config(cfg: CN) -> None:
+ """ """
+ _C = cfg
+ _C.BOOTSTRAP_DATASETS = []
+ _C.BOOTSTRAP_MODEL = CN()
+ _C.BOOTSTRAP_MODEL.WEIGHTS = ""
+ _C.BOOTSTRAP_MODEL.DEVICE = "cuda"
+
+
+def get_bootstrap_dataset_config() -> CN:
+ _C = CN()
+ _C.DATASET = ""
+ # ratio used to mix data loaders
+ _C.RATIO = 0.1
+ # image loader
+ _C.IMAGE_LOADER = CN(new_allowed=True)
+ _C.IMAGE_LOADER.TYPE = ""
+ _C.IMAGE_LOADER.BATCH_SIZE = 4
+ _C.IMAGE_LOADER.NUM_WORKERS = 4
+ _C.IMAGE_LOADER.CATEGORIES = []
+ _C.IMAGE_LOADER.MAX_COUNT_PER_CATEGORY = 1_000_000
+ _C.IMAGE_LOADER.CATEGORY_TO_CLASS_MAPPING = CN(new_allowed=True)
+ # inference
+ _C.INFERENCE = CN()
+ # batch size for model inputs
+ _C.INFERENCE.INPUT_BATCH_SIZE = 4
+ # batch size to group model outputs
+ _C.INFERENCE.OUTPUT_BATCH_SIZE = 2
+ # sampled data
+ _C.DATA_SAMPLER = CN(new_allowed=True)
+ _C.DATA_SAMPLER.TYPE = ""
+ _C.DATA_SAMPLER.USE_GROUND_TRUTH_CATEGORIES = False
+ # filter
+ _C.FILTER = CN(new_allowed=True)
+ _C.FILTER.TYPE = ""
+ return _C
+
+
+def load_bootstrap_config(cfg: CN) -> None:
+ """
+ Bootstrap datasets are given as a list of `dict` that are not automatically
+ converted into CfgNode. This method processes all bootstrap dataset entries
+ and ensures that they are in CfgNode format and comply with the specification
+ """
+ if not cfg.BOOTSTRAP_DATASETS:
+ return
+
+ bootstrap_datasets_cfgnodes = []
+ for dataset_cfg in cfg.BOOTSTRAP_DATASETS:
+ _C = get_bootstrap_dataset_config().clone()
+ _C.merge_from_other_cfg(CN(dataset_cfg))
+ bootstrap_datasets_cfgnodes.append(_C)
+ cfg.BOOTSTRAP_DATASETS = bootstrap_datasets_cfgnodes
+
+
+def add_densepose_head_cse_config(cfg: CN) -> None:
+ """
+ Add configuration options for Continuous Surface Embeddings (CSE)
+ """
+ _C = cfg
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE = CN()
+ # Dimensionality D of the embedding space
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE = 16
+ # Embedder specifications for various mesh IDs
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS = CN(new_allowed=True)
+ # normalization coefficient for embedding distances
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA = 0.01
+ # normalization coefficient for geodesic distances
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.GEODESIC_DIST_GAUSS_SIGMA = 0.01
+ # embedding loss weight
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_WEIGHT = 0.6
+ # embedding loss name, currently the following options are supported:
+ # - EmbeddingLoss: cross-entropy on vertex labels
+ # - SoftEmbeddingLoss: cross-entropy on vertex label combined with
+ # Gaussian penalty on distance between vertices
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_NAME = "EmbeddingLoss"
+ # optimizer hyperparameters
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR = 1.0
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR = 1.0
+ # Shape to shape cycle consistency loss parameters:
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False})
+ # shape to shape cycle consistency loss weight
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.WEIGHT = 0.025
+ # norm type used for loss computation
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.NORM_P = 2
+ # normalization term for embedding similarity matrices
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.TEMPERATURE = 0.05
+ # maximum number of vertices to include into shape to shape cycle loss
+ # if negative or zero, all vertices are considered
+ # if positive, random subset of vertices of given size is considered
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES = 4936
+ # Pixel to shape cycle consistency loss parameters:
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False})
+ # pixel to shape cycle consistency loss weight
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.WEIGHT = 0.0001
+ # norm type used for loss computation
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NORM_P = 2
+ # map images to all meshes and back (if false, use only gt meshes from the batch)
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.USE_ALL_MESHES_NOT_GT_ONLY = False
+ # Randomly select at most this number of pixels from every instance
+ # if negative or zero, all vertices are considered
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NUM_PIXELS_TO_SAMPLE = 100
+ # normalization factor for pixel to pixel distances (higher value = smoother distribution)
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.PIXEL_SIGMA = 5.0
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_PIXEL_TO_VERTEX = 0.05
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_VERTEX_TO_PIXEL = 0.05
+
+
+def add_densepose_head_config(cfg: CN) -> None:
+ """
+ Add config for densepose head.
+ """
+ _C = cfg
+
+ _C.MODEL.DENSEPOSE_ON = True
+
+ _C.MODEL.ROI_DENSEPOSE_HEAD = CN()
+ _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = ""
+ _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8
+ # Number of parts used for point labels
+ _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512
+ _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3
+ _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2
+ _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112
+ _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2"
+ _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28
+ _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2
+ _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2 # 15 or 2
+ # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+ _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7
+    # Loss weights for annotation masks (14 parts)
+    _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0
+    # Loss weights for surface parts (24 parts)
+ _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0
+ # Loss weights for UV regression.
+ _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01
+ # Coarse segmentation is trained using instance segmentation task data
+ _C.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS = False
+ # For Decoder
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = ""
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4
+ # For DeepLab head
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN()
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN"
+ _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0
+ # Predictor class name, must be registered in DENSEPOSE_PREDICTOR_REGISTRY
+ # Some registered predictors:
+ # "DensePoseChartPredictor": predicts segmentation and UV coordinates for predefined charts
+ # "DensePoseChartWithConfidencePredictor": predicts segmentation, UV coordinates
+ # and associated confidences for predefined charts (default)
+ # "DensePoseEmbeddingWithConfidencePredictor": predicts segmentation, embeddings
+ # and associated confidences for CSE
+ _C.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME = "DensePoseChartWithConfidencePredictor"
+ # Loss class name, must be registered in DENSEPOSE_LOSS_REGISTRY
+ # Some registered losses:
+ # "DensePoseChartLoss": loss for chart-based models that estimate
+ # segmentation and UV coordinates
+ # "DensePoseChartWithConfidenceLoss": loss for chart-based models that estimate
+ # segmentation, UV coordinates and the corresponding confidences (default)
+ _C.MODEL.ROI_DENSEPOSE_HEAD.LOSS_NAME = "DensePoseChartWithConfidenceLoss"
+ # Confidences
+ # Enable learning UV confidences (variances) along with the actual values
+ _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False})
+ # UV confidence lower bound
+ _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01
+ # Enable learning segmentation confidences (variances) along with the actual values
+ _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE = CN({"ENABLED": False})
+ # Segmentation confidence lower bound
+ _C.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON = 0.01
+ # Statistical model type for confidence learning, possible values:
+ # - "iid_iso": statistically independent identically distributed residuals
+ # with isotropic covariance
+ # - "indep_aniso": statistically independent residuals with anisotropic
+ # covariances
+ _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso"
+ # List of angles for rotation in data augmentation during training
+ _C.INPUT.ROTATION_ANGLES = [0]
+ _C.TEST.AUG.ROTATION_ANGLES = () # Rotation TTA
+
+ add_densepose_head_cse_config(cfg)
+
+
+def add_hrnet_config(cfg: CN) -> None:
+ """
+ Add config for HRNet backbone.
+ """
+ _C = cfg
+
+ # For HigherHRNet w32
+ _C.MODEL.HRNET = CN()
+ _C.MODEL.HRNET.STEM_INPLANES = 64
+ _C.MODEL.HRNET.STAGE2 = CN()
+ _C.MODEL.HRNET.STAGE2.NUM_MODULES = 1
+ _C.MODEL.HRNET.STAGE2.NUM_BRANCHES = 2
+ _C.MODEL.HRNET.STAGE2.BLOCK = "BASIC"
+ _C.MODEL.HRNET.STAGE2.NUM_BLOCKS = [4, 4]
+ _C.MODEL.HRNET.STAGE2.NUM_CHANNELS = [32, 64]
+ _C.MODEL.HRNET.STAGE2.FUSE_METHOD = "SUM"
+ _C.MODEL.HRNET.STAGE3 = CN()
+ _C.MODEL.HRNET.STAGE3.NUM_MODULES = 4
+ _C.MODEL.HRNET.STAGE3.NUM_BRANCHES = 3
+ _C.MODEL.HRNET.STAGE3.BLOCK = "BASIC"
+ _C.MODEL.HRNET.STAGE3.NUM_BLOCKS = [4, 4, 4]
+ _C.MODEL.HRNET.STAGE3.NUM_CHANNELS = [32, 64, 128]
+ _C.MODEL.HRNET.STAGE3.FUSE_METHOD = "SUM"
+ _C.MODEL.HRNET.STAGE4 = CN()
+ _C.MODEL.HRNET.STAGE4.NUM_MODULES = 3
+ _C.MODEL.HRNET.STAGE4.NUM_BRANCHES = 4
+ _C.MODEL.HRNET.STAGE4.BLOCK = "BASIC"
+ _C.MODEL.HRNET.STAGE4.NUM_BLOCKS = [4, 4, 4, 4]
+ _C.MODEL.HRNET.STAGE4.NUM_CHANNELS = [32, 64, 128, 256]
+ _C.MODEL.HRNET.STAGE4.FUSE_METHOD = "SUM"
+
+ _C.MODEL.HRNET.HRFPN = CN()
+ _C.MODEL.HRNET.HRFPN.OUT_CHANNELS = 256
+
+
+def add_densepose_config(cfg: CN) -> None:
+ add_densepose_head_config(cfg)
+ add_hrnet_config(cfg)
+ add_bootstrap_config(cfg)
+ add_dataset_category_config(cfg)
+ add_evaluation_config(cfg)
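
A minimal usage sketch of the config helpers added above, assuming detectron2 and the vendored densepose package from this diff are importable; the printed values are just the defaults registered in this file:

# Sketch only: apply the DensePose config extensions to a stock detectron2 config.
from detectron2.config import get_cfg
from densepose.config import add_densepose_config

cfg = get_cfg()            # base detectron2 config
add_densepose_config(cfg)  # chains the head, HRNet, bootstrap, dataset-category and evaluation helpers

# Defaults registered above are now available on the config node:
print(cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE)    # 112
print(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)  # 16
print(cfg.DENSEPOSE_EVALUATION.TYPE)                # "iou"
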
diff --git a/densepose/converters/__init__.py b/densepose/converters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b700f44437bd4e68be358ed5aae62a22df8d88a
--- /dev/null
+++ b/densepose/converters/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .hflip import HFlipConverter
+from .to_mask import ToMaskConverter
+from .to_chart_result import ToChartResultConverter, ToChartResultConverterWithConfidences
+from .segm_to_mask import (
+ predictor_output_with_fine_and_coarse_segm_to_mask,
+ predictor_output_with_coarse_segm_to_mask,
+ resample_fine_and_coarse_segm_to_bbox,
+)
+from .chart_output_to_chart_result import (
+ densepose_chart_predictor_output_to_result,
+ densepose_chart_predictor_output_to_result_with_confidences,
+)
+from .chart_output_hflip import densepose_chart_predictor_output_hflip
diff --git a/densepose/converters/__pycache__/__init__.cpython-39.pyc b/densepose/converters/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f82189374d4076b34e739cb25d2895ce7d3fb7eb
Binary files /dev/null and b/densepose/converters/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/base.cpython-39.pyc b/densepose/converters/__pycache__/base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..755a65623042931dd2e8761050ee09fa1ad5281b
Binary files /dev/null and b/densepose/converters/__pycache__/base.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/builtin.cpython-39.pyc b/densepose/converters/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acd400757cff745028bb009e49bd17d8dd9a260a
Binary files /dev/null and b/densepose/converters/__pycache__/builtin.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/chart_output_hflip.cpython-39.pyc b/densepose/converters/__pycache__/chart_output_hflip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b104343045eee9289ed2dc395b7be55fecf09187
Binary files /dev/null and b/densepose/converters/__pycache__/chart_output_hflip.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-39.pyc b/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e76bffc24ce9be5576b4ac8bd8d1430e7645235
Binary files /dev/null and b/densepose/converters/__pycache__/chart_output_to_chart_result.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/hflip.cpython-39.pyc b/densepose/converters/__pycache__/hflip.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74c7c523c2b4f63298187a67949f2d6b1afbcdf0
Binary files /dev/null and b/densepose/converters/__pycache__/hflip.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/segm_to_mask.cpython-39.pyc b/densepose/converters/__pycache__/segm_to_mask.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c992f1b0dedf2987ae482a6aaf0536923b4e238
Binary files /dev/null and b/densepose/converters/__pycache__/segm_to_mask.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/to_chart_result.cpython-39.pyc b/densepose/converters/__pycache__/to_chart_result.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87ba159e6bf5632712b2d8702b59f5ca2bb9c56f
Binary files /dev/null and b/densepose/converters/__pycache__/to_chart_result.cpython-39.pyc differ
diff --git a/densepose/converters/__pycache__/to_mask.cpython-39.pyc b/densepose/converters/__pycache__/to_mask.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7b79181961e4f9b4960ed9e6a6f1c4258f05304
Binary files /dev/null and b/densepose/converters/__pycache__/to_mask.cpython-39.pyc differ
diff --git a/densepose/converters/base.py b/densepose/converters/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..25e3155a87b819fe526b7b2735e006aeb3a56dda
--- /dev/null
+++ b/densepose/converters/base.py
@@ -0,0 +1,95 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any, Tuple, Type
+import torch
+
+
+class BaseConverter:
+ """
+ Converter base class to be reused by various converters.
+ Converter allows one to convert data from various source types to a particular
+ destination type. Each source type needs to register its converter. The
+ registration for each source type is valid for all descendants of that type.
+ """
+
+ @classmethod
+ def register(cls, from_type: Type, converter: Any = None):
+ """
+ Registers a converter for the specified type.
+ Can be used as a decorator (if converter is None), or called as a method.
+
+ Args:
+ from_type (type): type to register the converter for;
+ all instances of this type will use the same converter
+ converter (callable): converter to be registered for the given
+ type; if None, this method is assumed to be a decorator for the converter
+ """
+
+ if converter is not None:
+ cls._do_register(from_type, converter)
+
+ def wrapper(converter: Any) -> Any:
+ cls._do_register(from_type, converter)
+ return converter
+
+ return wrapper
+
+ @classmethod
+ def _do_register(cls, from_type: Type, converter: Any):
+ cls.registry[from_type] = converter # pyre-ignore[16]
+
+ @classmethod
+ def _lookup_converter(cls, from_type: Type) -> Any:
+ """
+ Perform recursive lookup for the given type
+ to find registered converter. If a converter was found for some base
+ class, it gets registered for this class to save on further lookups.
+
+ Args:
+ from_type: type for which to find a converter
+ Return:
+ callable or None - registered converter or None
+ if no suitable entry was found in the registry
+ """
+ if from_type in cls.registry: # pyre-ignore[16]
+ return cls.registry[from_type]
+ for base in from_type.__bases__:
+ converter = cls._lookup_converter(base)
+ if converter is not None:
+ cls._do_register(from_type, converter)
+ return converter
+ return None
+
+ @classmethod
+ def convert(cls, instance: Any, *args, **kwargs):
+ """
+ Convert an instance to the destination type using some registered
+ converter. Does recursive lookup for base classes, so there's no need
+ for explicit registration for derived classes.
+
+ Args:
+ instance: source instance to convert to the destination type
+ Return:
+ An instance of the destination type obtained from the source instance
+ Raises KeyError, if no suitable converter found
+ """
+ instance_type = type(instance)
+ converter = cls._lookup_converter(instance_type)
+ if converter is None:
+ if cls.dst_type is None: # pyre-ignore[16]
+ output_type_str = "itself"
+ else:
+ output_type_str = cls.dst_type
+ raise KeyError(f"Could not find converter from {instance_type} to {output_type_str}")
+ return converter(instance, *args, **kwargs)
+
+
+IntTupleBox = Tuple[int, int, int, int]
+
+
+def make_int_box(box: torch.Tensor) -> IntTupleBox:
+ int_box = [0, 0, 0, 0]
+ int_box[0], int_box[1], int_box[2], int_box[3] = tuple(box.long().tolist())
+ return int_box[0], int_box[1], int_box[2], int_box[3]
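
BaseConverter implements a small type-keyed registry: each concrete converter class keeps its own `registry` dict, converters are registered per source type (optionally via decorator), and lookup walks base classes recursively. A toy sketch of that pattern, assuming the package above is importable; the Meters/feet names are made up purely for illustration:

# Toy sketch of the BaseConverter registry pattern defined in base.py.
from densepose.converters.base import BaseConverter

class Meters(float):
    """Made-up source type for the example."""

class ToFeetConverter(BaseConverter):
    registry = {}       # each concrete converter keeps its own registry
    dst_type = "feet"   # only used in the KeyError message

@ToFeetConverter.register(Meters)   # decorator form; register(Meters, fn) works too
def meters_to_feet(value, *args, **kwargs):
    return float(value) * 3.28084

print(ToFeetConverter.convert(Meters(2.0)))  # 6.56168; subclasses of Meters resolve via base-class lookup
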
diff --git a/densepose/converters/builtin.py b/densepose/converters/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..5234410307d7bfff932da982ca44926afb729c23
--- /dev/null
+++ b/densepose/converters/builtin.py
@@ -0,0 +1,33 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from ..structures import DensePoseChartPredictorOutput, DensePoseEmbeddingPredictorOutput
+from . import (
+ HFlipConverter,
+ ToChartResultConverter,
+ ToChartResultConverterWithConfidences,
+ ToMaskConverter,
+ densepose_chart_predictor_output_hflip,
+ densepose_chart_predictor_output_to_result,
+ densepose_chart_predictor_output_to_result_with_confidences,
+ predictor_output_with_coarse_segm_to_mask,
+ predictor_output_with_fine_and_coarse_segm_to_mask,
+)
+
+ToMaskConverter.register(
+ DensePoseChartPredictorOutput, predictor_output_with_fine_and_coarse_segm_to_mask
+)
+ToMaskConverter.register(
+ DensePoseEmbeddingPredictorOutput, predictor_output_with_coarse_segm_to_mask
+)
+
+ToChartResultConverter.register(
+ DensePoseChartPredictorOutput, densepose_chart_predictor_output_to_result
+)
+
+ToChartResultConverterWithConfidences.register(
+ DensePoseChartPredictorOutput, densepose_chart_predictor_output_to_result_with_confidences
+)
+
+HFlipConverter.register(DensePoseChartPredictorOutput, densepose_chart_predictor_output_hflip)
diff --git a/densepose/converters/chart_output_hflip.py b/densepose/converters/chart_output_hflip.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7f0061c858c80b083d40807c0bdfb4dfcc5d86b
--- /dev/null
+++ b/densepose/converters/chart_output_hflip.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from dataclasses import fields
+import torch
+
+from densepose.structures import DensePoseChartPredictorOutput, DensePoseTransformData
+
+
+def densepose_chart_predictor_output_hflip(
+ densepose_predictor_output: DensePoseChartPredictorOutput,
+ transform_data: DensePoseTransformData,
+) -> DensePoseChartPredictorOutput:
+ """
+    Adjust a DensePose chart predictor output to account for a horizontal flip.
+ """
+ if len(densepose_predictor_output) > 0:
+
+ PredictorOutput = type(densepose_predictor_output)
+ output_dict = {}
+
+ for field in fields(densepose_predictor_output):
+ field_value = getattr(densepose_predictor_output, field.name)
+ # flip tensors
+ if isinstance(field_value, torch.Tensor):
+ setattr(densepose_predictor_output, field.name, torch.flip(field_value, [3]))
+
+ densepose_predictor_output = _flip_iuv_semantics_tensor(
+ densepose_predictor_output, transform_data
+ )
+ densepose_predictor_output = _flip_segm_semantics_tensor(
+ densepose_predictor_output, transform_data
+ )
+
+ for field in fields(densepose_predictor_output):
+ output_dict[field.name] = getattr(densepose_predictor_output, field.name)
+
+ return PredictorOutput(**output_dict)
+ else:
+ return densepose_predictor_output
+
+
+def _flip_iuv_semantics_tensor(
+ densepose_predictor_output: DensePoseChartPredictorOutput,
+ dp_transform_data: DensePoseTransformData,
+) -> DensePoseChartPredictorOutput:
+ point_label_symmetries = dp_transform_data.point_label_symmetries
+ uv_symmetries = dp_transform_data.uv_symmetries
+
+ N, C, H, W = densepose_predictor_output.u.shape
+ u_loc = (densepose_predictor_output.u[:, 1:, :, :].clamp(0, 1) * 255).long()
+ v_loc = (densepose_predictor_output.v[:, 1:, :, :].clamp(0, 1) * 255).long()
+ Iindex = torch.arange(C - 1, device=densepose_predictor_output.u.device)[
+ None, :, None, None
+ ].expand(N, C - 1, H, W)
+ densepose_predictor_output.u[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc]
+ densepose_predictor_output.v[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc]
+
+ for el in ["fine_segm", "u", "v"]:
+ densepose_predictor_output.__dict__[el] = densepose_predictor_output.__dict__[el][
+ :, point_label_symmetries, :, :
+ ]
+ return densepose_predictor_output
+
+
+def _flip_segm_semantics_tensor(
+ densepose_predictor_output: DensePoseChartPredictorOutput, dp_transform_data
+):
+ if densepose_predictor_output.coarse_segm.shape[1] > 2:
+ densepose_predictor_output.coarse_segm = densepose_predictor_output.coarse_segm[
+ :, dp_transform_data.mask_label_symmetries, :, :
+ ]
+ return densepose_predictor_output
diff --git a/densepose/converters/chart_output_to_chart_result.py b/densepose/converters/chart_output_to_chart_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2e9c2280a60f80d2e32861a392fc78b3148cac8
--- /dev/null
+++ b/densepose/converters/chart_output_to_chart_result.py
@@ -0,0 +1,190 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Dict
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures.boxes import Boxes, BoxMode
+
+from ..structures import (
+ DensePoseChartPredictorOutput,
+ DensePoseChartResult,
+ DensePoseChartResultWithConfidences,
+)
+from . import resample_fine_and_coarse_segm_to_bbox
+from .base import IntTupleBox, make_int_box
+
+
+def resample_uv_tensors_to_bbox(
+ u: torch.Tensor,
+ v: torch.Tensor,
+ labels: torch.Tensor,
+ box_xywh_abs: IntTupleBox,
+) -> torch.Tensor:
+ """
+ Resamples U and V coordinate estimates for the given bounding box
+
+ Args:
+ u (tensor [1, C, H, W] of float): U coordinates
+ v (tensor [1, C, H, W] of float): V coordinates
+ labels (tensor [H, W] of long): labels obtained by resampling segmentation
+ outputs for the given bounding box
+ box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+ Return:
+ Resampled U and V coordinates - a tensor [2, H, W] of float
+ """
+ x, y, w, h = box_xywh_abs
+ w = max(int(w), 1)
+ h = max(int(h), 1)
+ u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False)
+ v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False)
+ uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device)
+ for part_id in range(1, u_bbox.size(1)):
+ uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]
+ uv[1][labels == part_id] = v_bbox[0, part_id][labels == part_id]
+ return uv
+
+
+def resample_uv_to_bbox(
+ predictor_output: DensePoseChartPredictorOutput,
+ labels: torch.Tensor,
+ box_xywh_abs: IntTupleBox,
+) -> torch.Tensor:
+ """
+ Resamples U and V coordinate estimates for the given bounding box
+
+ Args:
+ predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+ output to be resampled
+ labels (tensor [H, W] of long): labels obtained by resampling segmentation
+ outputs for the given bounding box
+ box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+ Return:
+ Resampled U and V coordinates - a tensor [2, H, W] of float
+ """
+ return resample_uv_tensors_to_bbox(
+ predictor_output.u,
+ predictor_output.v,
+ labels,
+ box_xywh_abs,
+ )
+
+
+def densepose_chart_predictor_output_to_result(
+ predictor_output: DensePoseChartPredictorOutput, boxes: Boxes
+) -> DensePoseChartResult:
+ """
+ Convert densepose chart predictor outputs to results
+
+ Args:
+ predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+ output to be converted to results, must contain only 1 output
+ boxes (Boxes): bounding box that corresponds to the predictor output,
+ must contain only 1 bounding box
+ Return:
+ DensePose chart-based result (DensePoseChartResult)
+ """
+ assert len(predictor_output) == 1 and len(boxes) == 1, (
+ f"Predictor output to result conversion can operate only single outputs"
+ f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes"
+ )
+
+ boxes_xyxy_abs = boxes.tensor.clone()
+ boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ box_xywh = make_int_box(boxes_xywh_abs[0])
+
+ labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, box_xywh).squeeze(0)
+ uv = resample_uv_to_bbox(predictor_output, labels, box_xywh)
+ return DensePoseChartResult(labels=labels, uv=uv)
+
+
+def resample_confidences_to_bbox(
+ predictor_output: DensePoseChartPredictorOutput,
+ labels: torch.Tensor,
+ box_xywh_abs: IntTupleBox,
+) -> Dict[str, torch.Tensor]:
+ """
+ Resamples confidences for the given bounding box
+
+ Args:
+ predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+ output to be resampled
+ labels (tensor [H, W] of long): labels obtained by resampling segmentation
+ outputs for the given bounding box
+ box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
+ Return:
+ Resampled confidences - a dict of [H, W] tensors of float
+ """
+
+ x, y, w, h = box_xywh_abs
+ w = max(int(w), 1)
+ h = max(int(h), 1)
+
+ confidence_names = [
+ "sigma_1",
+ "sigma_2",
+ "kappa_u",
+ "kappa_v",
+ "fine_segm_confidence",
+ "coarse_segm_confidence",
+ ]
+ confidence_results = {key: None for key in confidence_names}
+ confidence_names = [
+ key for key in confidence_names if getattr(predictor_output, key) is not None
+ ]
+ confidence_base = torch.zeros([h, w], dtype=torch.float32, device=predictor_output.u.device)
+
+ # assign data from channels that correspond to the labels
+ for key in confidence_names:
+ resampled_confidence = F.interpolate(
+ getattr(predictor_output, key),
+ (h, w),
+ mode="bilinear",
+ align_corners=False,
+ )
+ result = confidence_base.clone()
+ for part_id in range(1, predictor_output.u.size(1)):
+ if resampled_confidence.size(1) != predictor_output.u.size(1):
+ # confidence is not part-based, don't try to fill it part by part
+ continue
+ result[labels == part_id] = resampled_confidence[0, part_id][labels == part_id]
+
+ if resampled_confidence.size(1) != predictor_output.u.size(1):
+ # confidence is not part-based, fill the data with the first channel
+ # (targeted for segmentation confidences that have only 1 channel)
+ result = resampled_confidence[0, 0]
+
+ confidence_results[key] = result
+
+ return confidence_results # pyre-ignore[7]
+
+
+def densepose_chart_predictor_output_to_result_with_confidences(
+ predictor_output: DensePoseChartPredictorOutput, boxes: Boxes
+) -> DensePoseChartResultWithConfidences:
+ """
+ Convert densepose chart predictor outputs to results
+
+ Args:
+ predictor_output (DensePoseChartPredictorOutput): DensePose predictor
+ output with confidences to be converted to results, must contain only 1 output
+ boxes (Boxes): bounding box that corresponds to the predictor output,
+ must contain only 1 bounding box
+ Return:
+ DensePose chart-based result with confidences (DensePoseChartResultWithConfidences)
+ """
+ assert len(predictor_output) == 1 and len(boxes) == 1, (
+ f"Predictor output to result conversion can operate only single outputs"
+ f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes"
+ )
+
+ boxes_xyxy_abs = boxes.tensor.clone()
+ boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ box_xywh = make_int_box(boxes_xywh_abs[0])
+
+ labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, box_xywh).squeeze(0)
+ uv = resample_uv_to_bbox(predictor_output, labels, box_xywh)
+ confidences = resample_confidences_to_bbox(predictor_output, labels, box_xywh)
+ return DensePoseChartResultWithConfidences(labels=labels, uv=uv, **confidences)
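
As a self-contained sketch of the core resampling step above (dummy random tensors; shapes follow the docstrings, with 25 channels = 24 parts plus background and 112 matching the default HEATMAP_SIZE; assumes the densepose package and detectron2 are importable):

# Sketch: resample per-part U/V maps into a bounding box with dummy data.
import torch
from densepose.converters.chart_output_to_chart_result import resample_uv_tensors_to_bbox

C, H_out, W_out = 25, 112, 112
u = torch.rand(1, C, H_out, W_out)          # per-part U estimates
v = torch.rand(1, C, H_out, W_out)          # per-part V estimates

box_xywh = (10, 20, 64, 128)                # x, y, w, h in absolute pixels
labels = torch.randint(0, C, (128, 64))     # per-pixel part labels for the box, shape (h, w)

uv = resample_uv_tensors_to_bbox(u, v, labels, box_xywh)
print(uv.shape)                             # torch.Size([2, 128, 64])
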
diff --git a/densepose/converters/hflip.py b/densepose/converters/hflip.py
new file mode 100644
index 0000000000000000000000000000000000000000..711b73b3701adfd0217132519aea46f30f9ed74a
--- /dev/null
+++ b/densepose/converters/hflip.py
@@ -0,0 +1,36 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any
+
+from .base import BaseConverter
+
+
+class HFlipConverter(BaseConverter):
+ """
+    Applies a horizontal flip to various DensePose predictor outputs.
+    Each DensePose predictor output type has to register its conversion strategy.
+ """
+
+ registry = {}
+ dst_type = None
+
+ @classmethod
+ # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+ # inconsistently.
+ def convert(cls, predictor_outputs: Any, transform_data: Any, *args, **kwargs):
+ """
+        Performs a horizontal flip on DensePose predictor outputs.
+ Does recursive lookup for base classes, so there's no need
+ for explicit registration for derived classes.
+
+ Args:
+            predictor_outputs: DensePose predictor output to be flipped
+ transform_data: Anything useful for the flip
+ Return:
+ An instance of the same type as predictor_outputs
+ """
+ return super(HFlipConverter, cls).convert(
+ predictor_outputs, transform_data, *args, **kwargs
+ )
diff --git a/densepose/converters/segm_to_mask.py b/densepose/converters/segm_to_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5843a2186f441aa9cb48b680fd67051aa1236f6
--- /dev/null
+++ b/densepose/converters/segm_to_mask.py
@@ -0,0 +1,152 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import BitMasks, Boxes, BoxMode
+
+from .base import IntTupleBox, make_int_box
+from .to_mask import ImageSizeType
+
+
+def resample_coarse_segm_tensor_to_bbox(coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox):
+ """
+ Resample coarse segmentation tensor to the given
+ bounding box and derive labels for each pixel of the bounding box
+
+ Args:
+ coarse_segm: float tensor of shape [1, K, Hout, Wout]
+ box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+ corner coordinates, width (W) and height (H)
+ Return:
+ Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+ """
+ x, y, w, h = box_xywh_abs
+ w = max(int(w), 1)
+ h = max(int(h), 1)
+ labels = F.interpolate(coarse_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+ return labels
+
+
+def resample_fine_and_coarse_segm_tensors_to_bbox(
+ fine_segm: torch.Tensor, coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox
+):
+ """
+ Resample fine and coarse segmentation tensors to the given
+ bounding box and derive labels for each pixel of the bounding box
+
+ Args:
+ fine_segm: float tensor of shape [1, C, Hout, Wout]
+ coarse_segm: float tensor of shape [1, K, Hout, Wout]
+ box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+ corner coordinates, width (W) and height (H)
+ Return:
+ Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+ """
+ x, y, w, h = box_xywh_abs
+ w = max(int(w), 1)
+ h = max(int(h), 1)
+ # coarse segmentation
+ coarse_segm_bbox = F.interpolate(
+ coarse_segm,
+ (h, w),
+ mode="bilinear",
+ align_corners=False,
+ ).argmax(dim=1)
+ # combined coarse and fine segmentation
+ labels = (
+ F.interpolate(fine_segm, (h, w), mode="bilinear", align_corners=False).argmax(dim=1)
+ * (coarse_segm_bbox > 0).long()
+ )
+ return labels
+
+
+def resample_fine_and_coarse_segm_to_bbox(predictor_output: Any, box_xywh_abs: IntTupleBox):
+ """
+ Resample fine and coarse segmentation outputs from a predictor to the given
+ bounding box and derive labels for each pixel of the bounding box
+
+ Args:
+ predictor_output: DensePose predictor output that contains segmentation
+ results to be resampled
+ box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
+ corner coordinates, width (W) and height (H)
+ Return:
+ Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
+ """
+ return resample_fine_and_coarse_segm_tensors_to_bbox(
+ predictor_output.fine_segm,
+ predictor_output.coarse_segm,
+ box_xywh_abs,
+ )
+
+
+def predictor_output_with_coarse_segm_to_mask(
+ predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType
+) -> BitMasks:
+ """
+ Convert predictor output with coarse and fine segmentation to a mask.
+ Assumes that predictor output has the following attributes:
+ - coarse_segm (tensor of size [N, D, H, W]): coarse segmentation
+ unnormalized scores for N instances; D is the number of coarse
+ segmentation labels, H and W is the resolution of the estimate
+
+ Args:
+ predictor_output: DensePose predictor output to be converted to mask
+ boxes (Boxes): bounding boxes that correspond to the DensePose
+ predictor outputs
+ image_size_hw (tuple [int, int]): image height Himg and width Wimg
+ Return:
+ BitMasks that contain a bool tensor of size [N, Himg, Wimg] with
+ a mask of the size of the image for each instance
+ """
+ H, W = image_size_hw
+ boxes_xyxy_abs = boxes.tensor.clone()
+ boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ N = len(boxes_xywh_abs)
+ masks = torch.zeros((N, H, W), dtype=torch.bool, device=boxes.tensor.device)
+ for i in range(len(boxes_xywh_abs)):
+ box_xywh = make_int_box(boxes_xywh_abs[i])
+ box_mask = resample_coarse_segm_tensor_to_bbox(predictor_output[i].coarse_segm, box_xywh)
+ x, y, w, h = box_xywh
+ masks[i, y : y + h, x : x + w] = box_mask
+
+ return BitMasks(masks)
+
+
+def predictor_output_with_fine_and_coarse_segm_to_mask(
+ predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType
+) -> BitMasks:
+ """
+ Convert predictor output with coarse and fine segmentation to a mask.
+ Assumes that predictor output has the following attributes:
+ - coarse_segm (tensor of size [N, D, H, W]): coarse segmentation
+ unnormalized scores for N instances; D is the number of coarse
+ segmentation labels, H and W is the resolution of the estimate
+ - fine_segm (tensor of size [N, C, H, W]): fine segmentation
+ unnormalized scores for N instances; C is the number of fine
+ segmentation labels, H and W is the resolution of the estimate
+
+ Args:
+ predictor_output: DensePose predictor output to be converted to mask
+ boxes (Boxes): bounding boxes that correspond to the DensePose
+ predictor outputs
+ image_size_hw (tuple [int, int]): image height Himg and width Wimg
+ Return:
+ BitMasks that contain a bool tensor of size [N, Himg, Wimg] with
+ a mask of the size of the image for each instance
+ """
+ H, W = image_size_hw
+ boxes_xyxy_abs = boxes.tensor.clone()
+ boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ N = len(boxes_xywh_abs)
+ masks = torch.zeros((N, H, W), dtype=torch.bool, device=boxes.tensor.device)
+ for i in range(len(boxes_xywh_abs)):
+ box_xywh = make_int_box(boxes_xywh_abs[i])
+ labels_i = resample_fine_and_coarse_segm_to_bbox(predictor_output[i], box_xywh)
+ x, y, w, h = box_xywh
+ masks[i, y : y + h, x : x + w] = labels_i > 0
+ return BitMasks(masks)
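
And a similar sketch for the segmentation resampling above, with dummy tensors (K = 2 coarse channels and C = 25 fine channels, matching the config defaults registered earlier in this diff; again assumes the densepose package is importable):

# Sketch: derive per-pixel labels inside a box from dummy coarse + fine segmentation scores.
import torch
from densepose.converters.segm_to_mask import resample_fine_and_coarse_segm_tensors_to_bbox

fine_segm = torch.rand(1, 25, 112, 112)     # unnormalized fine-segmentation scores
coarse_segm = torch.rand(1, 2, 112, 112)    # unnormalized foreground/background scores
box_xywh = (0, 0, 50, 80)                   # x, y, w, h

labels = resample_fine_and_coarse_segm_tensors_to_bbox(fine_segm, coarse_segm, box_xywh)
print(labels.shape, labels.dtype)           # torch.Size([1, 80, 50]) torch.int64
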
diff --git a/densepose/converters/to_chart_result.py b/densepose/converters/to_chart_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..82e126a922ff8ac4d8ebc3008f67d3928b982c25
--- /dev/null
+++ b/densepose/converters/to_chart_result.py
@@ -0,0 +1,72 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any
+
+from detectron2.structures import Boxes
+
+from ..structures import DensePoseChartResult, DensePoseChartResultWithConfidences
+from .base import BaseConverter
+
+
+class ToChartResultConverter(BaseConverter):
+ """
+ Converts various DensePose predictor outputs to DensePose results.
+    Each DensePose predictor output type has to register its conversion strategy.
+ """
+
+ registry = {}
+ dst_type = DensePoseChartResult
+
+ @classmethod
+ # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+ # inconsistently.
+ def convert(cls, predictor_outputs: Any, boxes: Boxes, *args, **kwargs) -> DensePoseChartResult:
+ """
+ Convert DensePose predictor outputs to DensePoseResult using some registered
+ converter. Does recursive lookup for base classes, so there's no need
+ for explicit registration for derived classes.
+
+ Args:
+            predictor_outputs: DensePose predictor output to be
+                converted to DensePose chart results
+ boxes (Boxes): bounding boxes that correspond to the DensePose
+ predictor outputs
+ Return:
+ An instance of DensePoseResult. If no suitable converter was found, raises KeyError
+ """
+ return super(ToChartResultConverter, cls).convert(predictor_outputs, boxes, *args, **kwargs)
+
+
+class ToChartResultConverterWithConfidences(BaseConverter):
+ """
+ Converts various DensePose predictor outputs to DensePose results.
+    Each DensePose predictor output type has to register its conversion strategy.
+ """
+
+ registry = {}
+ dst_type = DensePoseChartResultWithConfidences
+
+ @classmethod
+ # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+ # inconsistently.
+ def convert(
+ cls, predictor_outputs: Any, boxes: Boxes, *args, **kwargs
+ ) -> DensePoseChartResultWithConfidences:
+ """
+ Convert DensePose predictor outputs to DensePoseResult with confidences
+ using some registered converter. Does recursive lookup for base classes,
+ so there's no need for explicit registration for derived classes.
+
+ Args:
+            predictor_outputs: DensePose predictor output with confidences
+                to be converted to DensePose chart results
+ boxes (Boxes): bounding boxes that correspond to the DensePose
+ predictor outputs
+ Return:
+ An instance of DensePoseResult. If no suitable converter was found, raises KeyError
+ """
+ return super(ToChartResultConverterWithConfidences, cls).convert(
+ predictor_outputs, boxes, *args, **kwargs
+ )
diff --git a/densepose/converters/to_mask.py b/densepose/converters/to_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a47e2a7d7aa5f0d9c41ab46a4f1806184b7b4ba
--- /dev/null
+++ b/densepose/converters/to_mask.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any, Tuple
+
+from detectron2.structures import BitMasks, Boxes
+
+from .base import BaseConverter
+
+ImageSizeType = Tuple[int, int]
+
+
+class ToMaskConverter(BaseConverter):
+ """
+ Converts various DensePose predictor outputs to masks
+ in bit mask format (see `BitMasks`). Each DensePose predictor output type
+    has to register its conversion strategy.
+ """
+
+ registry = {}
+ dst_type = BitMasks
+
+ @classmethod
+ # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
+ # inconsistently.
+ def convert(
+ cls,
+ densepose_predictor_outputs: Any,
+ boxes: Boxes,
+ image_size_hw: ImageSizeType,
+ *args,
+ **kwargs
+ ) -> BitMasks:
+ """
+ Convert DensePose predictor outputs to BitMasks using some registered
+ converter. Does recursive lookup for base classes, so there's no need
+ for explicit registration for derived classes.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor output to be
+ converted to BitMasks
+ boxes (Boxes): bounding boxes that correspond to the DensePose
+ predictor outputs
+ image_size_hw (tuple [int, int]): image height and width
+ Return:
+ An instance of `BitMasks`. If no suitable converter was found, raises KeyError
+ """
+ return super(ToMaskConverter, cls).convert(
+ densepose_predictor_outputs, boxes, image_size_hw, *args, **kwargs
+ )
diff --git a/densepose/data/__init__.py b/densepose/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5278887bd723f1606debd3de09b7e3e0ff5b3a03
--- /dev/null
+++ b/densepose/data/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .meshes import builtin
+from .build import (
+ build_detection_test_loader,
+ build_detection_train_loader,
+ build_combined_loader,
+ build_frame_selector,
+ build_inference_based_loaders,
+ has_inference_based_loaders,
+ BootstrapDatasetFactoryCatalog,
+)
+from .combined_loader import CombinedDataLoader
+from .dataset_mapper import DatasetMapper
+from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
+from .image_list_dataset import ImageListDataset
+from .utils import is_relative_local_path, maybe_prepend_base_path
+
+# ensure the builtin datasets are registered
+from . import datasets
+
+# ensure the bootstrap datasets builders are registered
+from . import build
+
+__all__ = [k for k in globals().keys() if not k.startswith("_")]
diff --git a/densepose/data/__pycache__/__init__.cpython-39.pyc b/densepose/data/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8685f36c53fcc1a31a2cbf7251fc343032cf7492
Binary files /dev/null and b/densepose/data/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/build.cpython-39.pyc b/densepose/data/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..426b867d001b02f7701d82979f2df747fde869d5
Binary files /dev/null and b/densepose/data/__pycache__/build.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/combined_loader.cpython-39.pyc b/densepose/data/__pycache__/combined_loader.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b31a974d8aee7630c6eb9a24761cc26d08206ff9
Binary files /dev/null and b/densepose/data/__pycache__/combined_loader.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/dataset_mapper.cpython-39.pyc b/densepose/data/__pycache__/dataset_mapper.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c3e18273902fa94162b5660580a09df5205714a
Binary files /dev/null and b/densepose/data/__pycache__/dataset_mapper.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/image_list_dataset.cpython-39.pyc b/densepose/data/__pycache__/image_list_dataset.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9869b599bb055ea352748613007bc6021d79e6aa
Binary files /dev/null and b/densepose/data/__pycache__/image_list_dataset.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/inference_based_loader.cpython-39.pyc b/densepose/data/__pycache__/inference_based_loader.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..735a797223587514aea5d3a8163e5df8e9b14e72
Binary files /dev/null and b/densepose/data/__pycache__/inference_based_loader.cpython-39.pyc differ
diff --git a/densepose/data/__pycache__/utils.cpython-39.pyc b/densepose/data/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed14f9d1c32c4fa5db57c0f8d5b995ce3bea129d
Binary files /dev/null and b/densepose/data/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/data/build.py b/densepose/data/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e8e8f782e75b27b8bb1ec387dd49ccdae8dbb3
--- /dev/null
+++ b/densepose/data/build.py
@@ -0,0 +1,738 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import itertools
+import logging
+import numpy as np
+from collections import UserDict, defaultdict
+from dataclasses import dataclass
+from typing import Any, Callable, Collection, Dict, Iterable, List, Optional, Sequence, Tuple
+import torch
+from torch.utils.data.dataset import Dataset
+
+from detectron2.config import CfgNode
+from detectron2.data.build import build_detection_test_loader as d2_build_detection_test_loader
+from detectron2.data.build import build_detection_train_loader as d2_build_detection_train_loader
+from detectron2.data.build import (
+ load_proposals_into_dataset,
+ print_instances_class_histogram,
+ trivial_batch_collator,
+ worker_init_reset_seed,
+)
+from detectron2.data.catalog import DatasetCatalog, Metadata, MetadataCatalog
+from detectron2.data.samplers import TrainingSampler
+from detectron2.utils.comm import get_world_size
+
+from densepose.config import get_bootstrap_dataset_config
+from densepose.modeling import build_densepose_embedder
+
+from .combined_loader import CombinedDataLoader, Loader
+from .dataset_mapper import DatasetMapper
+from .datasets.coco import DENSEPOSE_CSE_KEYS_WITHOUT_MASK, DENSEPOSE_IUV_KEYS_WITHOUT_MASK
+from .datasets.dataset_type import DatasetType
+from .inference_based_loader import InferenceBasedLoader, ScoreBasedFilter
+from .samplers import (
+ DensePoseConfidenceBasedSampler,
+ DensePoseCSEConfidenceBasedSampler,
+ DensePoseCSEUniformSampler,
+ DensePoseUniformSampler,
+ MaskFromDensePoseSampler,
+ PredictionToGroundTruthSampler,
+)
+from .transform import ImageResizeTransform
+from .utils import get_category_to_class_mapping, get_class_to_mesh_name_mapping
+from .video import (
+ FirstKFramesSelector,
+ FrameSelectionStrategy,
+ LastKFramesSelector,
+ RandomKFramesSelector,
+ VideoKeyframeDataset,
+ video_list_from_file,
+)
+
+__all__ = ["build_detection_train_loader", "build_detection_test_loader"]
+
+
+Instance = Dict[str, Any]
+InstancePredicate = Callable[[Instance], bool]
+
+
+def _compute_num_images_per_worker(cfg: CfgNode) -> int:
+ num_workers = get_world_size()
+ images_per_batch = cfg.SOLVER.IMS_PER_BATCH
+ assert (
+ images_per_batch % num_workers == 0
+ ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format(
+ images_per_batch, num_workers
+ )
+ assert (
+ images_per_batch >= num_workers
+ ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format(
+ images_per_batch, num_workers
+ )
+ images_per_worker = images_per_batch // num_workers
+ return images_per_worker
+
+
+def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]) -> None:
+ meta = MetadataCatalog.get(dataset_name)
+ for dataset_dict in dataset_dicts:
+ for ann in dataset_dict["annotations"]:
+ ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]]
+
+
+@dataclass
+class _DatasetCategory:
+ """
+ Class representing category data in a dataset:
+ - id: category ID, as specified in the dataset annotations file
+ - name: category name, as specified in the dataset annotations file
+ - mapped_id: category ID after applying category maps (DATASETS.CATEGORY_MAPS config option)
+ - mapped_name: category name after applying category maps
+ - dataset_name: dataset in which the category is defined
+
+ For example, when training models in a class-agnostic manner, one could take LVIS 1.0
+ dataset and map the animal categories to the same category as human data from COCO:
+ id = 225
+ name = "cat"
+ mapped_id = 1
+ mapped_name = "person"
+ dataset_name = "lvis_v1_animals_dp_train"
+ """
+
+ id: int
+ name: str
+ mapped_id: int
+ mapped_name: str
+ dataset_name: str
+
+
+_MergedCategoriesT = Dict[int, List[_DatasetCategory]]
+
+
+def _add_category_id_to_contiguous_id_maps_to_metadata(
+ merged_categories: _MergedCategoriesT,
+) -> None:
+ merged_categories_per_dataset = {}
+ for contiguous_cat_id, cat_id in enumerate(sorted(merged_categories.keys())):
+ for cat in merged_categories[cat_id]:
+ if cat.dataset_name not in merged_categories_per_dataset:
+ merged_categories_per_dataset[cat.dataset_name] = defaultdict(list)
+ merged_categories_per_dataset[cat.dataset_name][cat_id].append(
+ (
+ contiguous_cat_id,
+ cat,
+ )
+ )
+
+ logger = logging.getLogger(__name__)
+ for dataset_name, merged_categories in merged_categories_per_dataset.items():
+ meta = MetadataCatalog.get(dataset_name)
+ if not hasattr(meta, "thing_classes"):
+ meta.thing_classes = []
+ meta.thing_dataset_id_to_contiguous_id = {}
+ meta.thing_dataset_id_to_merged_id = {}
+ else:
+ meta.thing_classes.clear()
+ meta.thing_dataset_id_to_contiguous_id.clear()
+ meta.thing_dataset_id_to_merged_id.clear()
+ logger.info(f"Dataset {dataset_name}: category ID to contiguous ID mapping:")
+ for _cat_id, categories in sorted(merged_categories.items()):
+ added_to_thing_classes = False
+ for contiguous_cat_id, cat in categories:
+ if not added_to_thing_classes:
+ meta.thing_classes.append(cat.mapped_name)
+ added_to_thing_classes = True
+ meta.thing_dataset_id_to_contiguous_id[cat.id] = contiguous_cat_id
+ meta.thing_dataset_id_to_merged_id[cat.id] = cat.mapped_id
+ logger.info(f"{cat.id} ({cat.name}) -> {contiguous_cat_id}")
+
+
+def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+ def has_annotations(instance: Instance) -> bool:
+ return "annotations" in instance
+
+    def has_only_crowd_annotations(instance: Instance) -> bool:
+ for ann in instance["annotations"]:
+ if ann.get("is_crowd", 0) == 0:
+ return False
+ return True
+
+ def general_keep_instance_predicate(instance: Instance) -> bool:
+        return has_annotations(instance) and not has_only_crowd_annotations(instance)
+
+ if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS:
+ return None
+ return general_keep_instance_predicate
+
+
+def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+
+ min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
+
+ def has_sufficient_num_keypoints(instance: Instance) -> bool:
+ num_kpts = sum(
+ (np.array(ann["keypoints"][2::3]) > 0).sum()
+ for ann in instance["annotations"]
+ if "keypoints" in ann
+ )
+ return num_kpts >= min_num_keypoints
+
+ if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0):
+ return has_sufficient_num_keypoints
+ return None
+
+
+def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+ if not cfg.MODEL.MASK_ON:
+ return None
+
+ def has_mask_annotations(instance: Instance) -> bool:
+ return any("segmentation" in ann for ann in instance["annotations"])
+
+ return has_mask_annotations
+
+
+def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+ if not cfg.MODEL.DENSEPOSE_ON:
+ return None
+
+ use_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+
+ def has_densepose_annotations(instance: Instance) -> bool:
+ for ann in instance["annotations"]:
+ if all(key in ann for key in DENSEPOSE_IUV_KEYS_WITHOUT_MASK) or all(
+ key in ann for key in DENSEPOSE_CSE_KEYS_WITHOUT_MASK
+ ):
+ return True
+ if use_masks and "segmentation" in ann:
+ return True
+ return False
+
+ return has_densepose_annotations
+
+
+def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]:
+ specific_predicate_creators = [
+ _maybe_create_keypoints_keep_instance_predicate,
+ _maybe_create_mask_keep_instance_predicate,
+ _maybe_create_densepose_keep_instance_predicate,
+ ]
+ predicates = [creator(cfg) for creator in specific_predicate_creators]
+ predicates = [p for p in predicates if p is not None]
+ if not predicates:
+ return None
+
+ def combined_predicate(instance: Instance) -> bool:
+ return any(p(instance) for p in predicates)
+
+ return combined_predicate
+
+
+def _get_train_keep_instance_predicate(cfg: CfgNode):
+ general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
+ combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg)
+
+ def combined_general_specific_keep_predicate(instance: Instance) -> bool:
+ return general_keep_predicate(instance) and combined_specific_keep_predicate(instance)
+
+ if (general_keep_predicate is None) and (combined_specific_keep_predicate is None):
+ return None
+ if general_keep_predicate is None:
+ return combined_specific_keep_predicate
+ if combined_specific_keep_predicate is None:
+ return general_keep_predicate
+ return combined_general_specific_keep_predicate
+
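+# Usage sketch (illustrative only): with a hypothetical cfg where only MODEL.MASK_ON is enabled
+# (and DATALOADER.FILTER_EMPTY_ANNOTATIONS left at its default True), the train predicate keeps
+# an instance only if it has at least one non-crowd annotation AND at least one mask annotation.
+#
+#     keep = _get_train_keep_instance_predicate(cfg)
+#     keep({"annotations": [{"iscrowd": 0, "segmentation": [[0, 0, 10, 0, 10, 10]]}]})   # True
+#     keep({"annotations": [{"iscrowd": 0}]})                                            # False
+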
+
+def _get_test_keep_instance_predicate(cfg: CfgNode):
+ general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg)
+ return general_keep_predicate
+
+
+def _maybe_filter_and_map_categories(
+ dataset_name: str, dataset_dicts: List[Instance]
+) -> List[Instance]:
+ meta = MetadataCatalog.get(dataset_name)
+ category_id_map = meta.thing_dataset_id_to_contiguous_id
+ filtered_dataset_dicts = []
+ for dataset_dict in dataset_dicts:
+ anns = []
+ for ann in dataset_dict["annotations"]:
+ cat_id = ann["category_id"]
+ if cat_id not in category_id_map:
+ continue
+ ann["category_id"] = category_id_map[cat_id]
+ anns.append(ann)
+ dataset_dict["annotations"] = anns
+ filtered_dataset_dicts.append(dataset_dict)
+ return filtered_dataset_dicts
+
+
+def _add_category_whitelists_to_metadata(cfg: CfgNode) -> None:
+ for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items():
+ meta = MetadataCatalog.get(dataset_name)
+ meta.whitelisted_categories = whitelisted_cat_ids
+ logger = logging.getLogger(__name__)
+ logger.info(
+ "Whitelisted categories for dataset {}: {}".format(
+ dataset_name, meta.whitelisted_categories
+ )
+ )
+
+
+def _add_category_maps_to_metadata(cfg: CfgNode) -> None:
+ for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items():
+ category_map = {
+ int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items()
+ }
+ meta = MetadataCatalog.get(dataset_name)
+ meta.category_map = category_map
+ logger = logging.getLogger(__name__)
+ logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map))
+
+
+def _add_category_info_to_bootstrapping_metadata(dataset_name: str, dataset_cfg: CfgNode) -> None:
+ meta = MetadataCatalog.get(dataset_name)
+ meta.category_to_class_mapping = get_category_to_class_mapping(dataset_cfg)
+ meta.categories = dataset_cfg.CATEGORIES
+ meta.max_count_per_category = dataset_cfg.MAX_COUNT_PER_CATEGORY
+ logger = logging.getLogger(__name__)
+ logger.info(
+ "Category to class mapping for dataset {}: {}".format(
+ dataset_name, meta.category_to_class_mapping
+ )
+ )
+
+
+def _maybe_add_class_to_mesh_name_map_to_metadata(dataset_names: List[str], cfg: CfgNode) -> None:
+ for dataset_name in dataset_names:
+ meta = MetadataCatalog.get(dataset_name)
+ if not hasattr(meta, "class_to_mesh_name"):
+ meta.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+
+
+def _merge_categories(dataset_names: Collection[str]) -> _MergedCategoriesT:
+ merged_categories = defaultdict(list)
+ category_names = {}
+ for dataset_name in dataset_names:
+ meta = MetadataCatalog.get(dataset_name)
+ whitelisted_categories = meta.get("whitelisted_categories")
+ category_map = meta.get("category_map", {})
+ cat_ids = (
+ whitelisted_categories if whitelisted_categories is not None else meta.categories.keys()
+ )
+ for cat_id in cat_ids:
+ cat_name = meta.categories[cat_id]
+ cat_id_mapped = category_map.get(cat_id, cat_id)
+ if cat_id_mapped == cat_id or cat_id_mapped in cat_ids:
+ category_names[cat_id] = cat_name
+ else:
+ category_names[cat_id] = str(cat_id_mapped)
+ # assign temporary mapped category name, this name can be changed
+ # during the second pass, since mapped ID can correspond to a category
+ # from a different dataset
+ cat_name_mapped = meta.categories[cat_id_mapped]
+ merged_categories[cat_id_mapped].append(
+ _DatasetCategory(
+ id=cat_id,
+ name=cat_name,
+ mapped_id=cat_id_mapped,
+ mapped_name=cat_name_mapped,
+ dataset_name=dataset_name,
+ )
+ )
+ # second pass to assign proper mapped category names
+ for cat_id, categories in merged_categories.items():
+ for cat in categories:
+ if cat_id in category_names and cat.mapped_name != category_names[cat_id]:
+ cat.mapped_name = category_names[cat_id]
+
+ return merged_categories
+
+
+def _warn_if_merged_different_categories(merged_categories: _MergedCategoriesT) -> None:
+ logger = logging.getLogger(__name__)
+ for cat_id in merged_categories:
+ merged_categories_i = merged_categories[cat_id]
+ first_cat_name = merged_categories_i[0].name
+ if len(merged_categories_i) > 1 and not all(
+ cat.name == first_cat_name for cat in merged_categories_i[1:]
+ ):
+ cat_summary_str = ", ".join(
+ [f"{cat.id} ({cat.name}) from {cat.dataset_name}" for cat in merged_categories_i]
+ )
+ logger.warning(
+ f"Merged category {cat_id} corresponds to the following categories: "
+ f"{cat_summary_str}"
+ )
+
+
+def combine_detection_dataset_dicts(
+ dataset_names: Collection[str],
+ keep_instance_predicate: Optional[InstancePredicate] = None,
+ proposal_files: Optional[Collection[str]] = None,
+) -> List[Instance]:
+ """
+ Load and prepare dataset dicts for training / testing
+
+ Args:
+ dataset_names (Collection[str]): a list of dataset names
+ keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate
+ applied to instance dicts which defines whether to keep the instance
+ proposal_files (Collection[str]): if given, a list of object proposal files
+ that match each dataset in `dataset_names`.
+ """
+ assert len(dataset_names)
+ if proposal_files is None:
+ proposal_files = [None] * len(dataset_names)
+ assert len(dataset_names) == len(proposal_files)
+ # load datasets and metadata
+ dataset_name_to_dicts = {}
+ for dataset_name in dataset_names:
+ dataset_name_to_dicts[dataset_name] = DatasetCatalog.get(dataset_name)
+        assert len(dataset_name_to_dicts[dataset_name]), f"Dataset '{dataset_name}' is empty!"
+ # merge categories, requires category metadata to be loaded
+ # cat_id -> [(orig_cat_id, cat_name, dataset_name)]
+ merged_categories = _merge_categories(dataset_names)
+ _warn_if_merged_different_categories(merged_categories)
+ merged_category_names = [
+ merged_categories[cat_id][0].mapped_name for cat_id in sorted(merged_categories)
+ ]
+ # map to contiguous category IDs
+ _add_category_id_to_contiguous_id_maps_to_metadata(merged_categories)
+ # load annotations and dataset metadata
+ for dataset_name, proposal_file in zip(dataset_names, proposal_files):
+ dataset_dicts = dataset_name_to_dicts[dataset_name]
+ assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!"
+ if proposal_file is not None:
+ dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file)
+ dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts)
+ print_instances_class_histogram(dataset_dicts, merged_category_names)
+ dataset_name_to_dicts[dataset_name] = dataset_dicts
+
+ if keep_instance_predicate is not None:
+ all_datasets_dicts_plain = [
+ d
+ for d in itertools.chain.from_iterable(dataset_name_to_dicts.values())
+ if keep_instance_predicate(d)
+ ]
+ else:
+ all_datasets_dicts_plain = list(
+ itertools.chain.from_iterable(dataset_name_to_dicts.values())
+ )
+ return all_datasets_dicts_plain
+
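+# Usage sketch (illustrative only): flattening two registered datasets into a single list of
+# dicts with contiguous category IDs; `cfg` is assumed to be a DensePose config node.
+#
+#     dataset_dicts = combine_detection_dataset_dicts(
+#         ["densepose_coco_2014_train", "densepose_lvis_v1_ds2_train_v1"],
+#         keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
+#     )
+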
+
+def build_detection_train_loader(cfg: CfgNode, mapper=None):
+ """
+ A data loader is created in a way similar to that of Detectron2.
+    The main difference is:
+    - it allows combining datasets with different but compatible object category sets
+
+ The data loader is created by the following steps:
+ 1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts.
+ 2. Start workers to work on the dicts. Each worker will:
+ * Map each metadata dict into another format to be consumed by the model.
+ * Batch them by simply putting dicts into a list.
+ The batched ``list[mapped_dict]`` is what this dataloader will return.
+
+ Args:
+ cfg (CfgNode): the config
+ mapper (callable): a callable which takes a sample (dict) from dataset and
+ returns the format to be consumed by the model.
+ By default it will be `DatasetMapper(cfg, True)`.
+
+ Returns:
+ an infinite iterator of training data
+ """
+
+ _add_category_whitelists_to_metadata(cfg)
+ _add_category_maps_to_metadata(cfg)
+ _maybe_add_class_to_mesh_name_map_to_metadata(cfg.DATASETS.TRAIN, cfg)
+ dataset_dicts = combine_detection_dataset_dicts(
+ cfg.DATASETS.TRAIN,
+ keep_instance_predicate=_get_train_keep_instance_predicate(cfg),
+ proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
+ )
+ if mapper is None:
+ mapper = DatasetMapper(cfg, True)
+ return d2_build_detection_train_loader(cfg, dataset=dataset_dicts, mapper=mapper)
+
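+# Usage sketch (illustrative only; the config path is hypothetical):
+#
+#     from detectron2.config import get_cfg
+#     from densepose import add_densepose_config
+#
+#     cfg = get_cfg()
+#     add_densepose_config(cfg)   # registers the DensePose-specific config keys
+#     cfg.merge_from_file("configs/densepose_rcnn_R_50_FPN_s1x.yaml")
+#     train_loader = build_detection_train_loader(cfg)   # infinite iterator over list[dict] batches
+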
+
+def build_detection_test_loader(cfg, dataset_name, mapper=None):
+ """
+ Similar to `build_detection_train_loader`.
+ But this function uses the given `dataset_name` argument (instead of the names in cfg),
+ and uses batch size 1.
+
+ Args:
+ cfg: a detectron2 CfgNode
+ dataset_name (str): a name of the dataset that's available in the DatasetCatalog
+ mapper (callable): a callable which takes a sample (dict) from dataset
+ and returns the format to be consumed by the model.
+ By default it will be `DatasetMapper(cfg, False)`.
+
+ Returns:
+ DataLoader: a torch DataLoader, that loads the given detection
+ dataset, with test-time transformation and batching.
+ """
+ _add_category_whitelists_to_metadata(cfg)
+ _add_category_maps_to_metadata(cfg)
+ _maybe_add_class_to_mesh_name_map_to_metadata([dataset_name], cfg)
+ dataset_dicts = combine_detection_dataset_dicts(
+ [dataset_name],
+ keep_instance_predicate=_get_test_keep_instance_predicate(cfg),
+ proposal_files=(
+ [cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)]]
+ if cfg.MODEL.LOAD_PROPOSALS
+ else None
+ ),
+ )
+ sampler = None
+ if not cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE:
+ sampler = torch.utils.data.SequentialSampler(dataset_dicts)
+ if mapper is None:
+ mapper = DatasetMapper(cfg, False)
+ return d2_build_detection_test_loader(
+ dataset_dicts, mapper=mapper, num_workers=cfg.DATALOADER.NUM_WORKERS, sampler=sampler
+ )
+
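+# Usage sketch (illustrative only): evaluation uses one dataset at a time with batch size 1.
+#
+#     test_loader = build_detection_test_loader(cfg, "densepose_coco_2014_minival")
+#     for inputs in test_loader:
+#         outputs = model(inputs)
+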
+
+def build_frame_selector(cfg: CfgNode):
+ strategy = FrameSelectionStrategy(cfg.STRATEGY)
+ if strategy == FrameSelectionStrategy.RANDOM_K:
+ frame_selector = RandomKFramesSelector(cfg.NUM_IMAGES)
+ elif strategy == FrameSelectionStrategy.FIRST_K:
+ frame_selector = FirstKFramesSelector(cfg.NUM_IMAGES)
+ elif strategy == FrameSelectionStrategy.LAST_K:
+ frame_selector = LastKFramesSelector(cfg.NUM_IMAGES)
+ elif strategy == FrameSelectionStrategy.ALL:
+ frame_selector = None
+ # pyre-fixme[61]: `frame_selector` may not be initialized here.
+ return frame_selector
+
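+# Usage sketch (illustrative only; `select_cfg` mirrors the SELECT sub-config of a bootstrap
+# dataset entry, with made-up values):
+#
+#     select_cfg = CfgNode({"STRATEGY": "random_k", "NUM_IMAGES": 4})
+#     selector = build_frame_selector(select_cfg)   # RandomKFramesSelector(4)
+#     selector(list(range(100)))                    # e.g. [3, 17, 42, 81] (4 randomly chosen frames)
+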
+
+def build_transform(cfg: CfgNode, data_type: str):
+ if cfg.TYPE == "resize":
+ if data_type == "image":
+ return ImageResizeTransform(cfg.MIN_SIZE, cfg.MAX_SIZE)
+ raise ValueError(f"Unknown transform {cfg.TYPE} for data type {data_type}")
+
+
+def build_combined_loader(cfg: CfgNode, loaders: Collection[Loader], ratios: Sequence[float]):
+ images_per_worker = _compute_num_images_per_worker(cfg)
+ return CombinedDataLoader(loaders, images_per_worker, ratios)
+
+
+def build_bootstrap_dataset(dataset_name: str, cfg: CfgNode) -> Sequence[torch.Tensor]:
+ """
+ Build dataset that provides data to bootstrap on
+
+ Args:
+ dataset_name (str): Name of the dataset, needs to have associated metadata
+ to load the data
+ cfg (CfgNode): bootstrapping config
+ Returns:
+ Sequence[Tensor] - dataset that provides image batches, Tensors of size
+ [N, C, H, W] of type float32
+ """
+ logger = logging.getLogger(__name__)
+ _add_category_info_to_bootstrapping_metadata(dataset_name, cfg)
+ meta = MetadataCatalog.get(dataset_name)
+ factory = BootstrapDatasetFactoryCatalog.get(meta.dataset_type)
+ dataset = None
+ if factory is not None:
+ dataset = factory(meta, cfg)
+ if dataset is None:
+ logger.warning(f"Failed to create dataset {dataset_name} of type {meta.dataset_type}")
+ return dataset
+
+
+def build_data_sampler(cfg: CfgNode, sampler_cfg: CfgNode, embedder: Optional[torch.nn.Module]):
+ if sampler_cfg.TYPE == "densepose_uniform":
+ data_sampler = PredictionToGroundTruthSampler()
+ # transform densepose pred -> gt
+ data_sampler.register_sampler(
+ "pred_densepose",
+ "gt_densepose",
+ DensePoseUniformSampler(count_per_class=sampler_cfg.COUNT_PER_CLASS),
+ )
+ data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+ return data_sampler
+ elif sampler_cfg.TYPE == "densepose_UV_confidence":
+ data_sampler = PredictionToGroundTruthSampler()
+ # transform densepose pred -> gt
+ data_sampler.register_sampler(
+ "pred_densepose",
+ "gt_densepose",
+ DensePoseConfidenceBasedSampler(
+ confidence_channel="sigma_2",
+ count_per_class=sampler_cfg.COUNT_PER_CLASS,
+ search_proportion=0.5,
+ ),
+ )
+ data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+ return data_sampler
+ elif sampler_cfg.TYPE == "densepose_fine_segm_confidence":
+ data_sampler = PredictionToGroundTruthSampler()
+ # transform densepose pred -> gt
+ data_sampler.register_sampler(
+ "pred_densepose",
+ "gt_densepose",
+ DensePoseConfidenceBasedSampler(
+ confidence_channel="fine_segm_confidence",
+ count_per_class=sampler_cfg.COUNT_PER_CLASS,
+ search_proportion=0.5,
+ ),
+ )
+ data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+ return data_sampler
+ elif sampler_cfg.TYPE == "densepose_coarse_segm_confidence":
+ data_sampler = PredictionToGroundTruthSampler()
+ # transform densepose pred -> gt
+ data_sampler.register_sampler(
+ "pred_densepose",
+ "gt_densepose",
+ DensePoseConfidenceBasedSampler(
+ confidence_channel="coarse_segm_confidence",
+ count_per_class=sampler_cfg.COUNT_PER_CLASS,
+ search_proportion=0.5,
+ ),
+ )
+ data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+ return data_sampler
+ elif sampler_cfg.TYPE == "densepose_cse_uniform":
+ assert embedder is not None
+ data_sampler = PredictionToGroundTruthSampler()
+ # transform densepose pred -> gt
+ data_sampler.register_sampler(
+ "pred_densepose",
+ "gt_densepose",
+ DensePoseCSEUniformSampler(
+ cfg=cfg,
+ use_gt_categories=sampler_cfg.USE_GROUND_TRUTH_CATEGORIES,
+ embedder=embedder,
+ count_per_class=sampler_cfg.COUNT_PER_CLASS,
+ ),
+ )
+ data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+ return data_sampler
+ elif sampler_cfg.TYPE == "densepose_cse_coarse_segm_confidence":
+ assert embedder is not None
+ data_sampler = PredictionToGroundTruthSampler()
+ # transform densepose pred -> gt
+ data_sampler.register_sampler(
+ "pred_densepose",
+ "gt_densepose",
+ DensePoseCSEConfidenceBasedSampler(
+ cfg=cfg,
+ use_gt_categories=sampler_cfg.USE_GROUND_TRUTH_CATEGORIES,
+ embedder=embedder,
+ confidence_channel="coarse_segm_confidence",
+ count_per_class=sampler_cfg.COUNT_PER_CLASS,
+ search_proportion=0.5,
+ ),
+ )
+ data_sampler.register_sampler("pred_densepose", "gt_masks", MaskFromDensePoseSampler())
+ return data_sampler
+
+ raise ValueError(f"Unknown data sampler type {sampler_cfg.TYPE}")
+
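+# Usage sketch (illustrative only; the values are made up): the sampler config corresponds to
+# the DATA_SAMPLER entry of one bootstrap dataset specification.
+#
+#     sampler_cfg = CfgNode({"TYPE": "densepose_uniform", "COUNT_PER_CLASS": 8})
+#     sampler = build_data_sampler(cfg, sampler_cfg, embedder=None)
+#     # sampler(model_output) then fills "gt_densepose" / "gt_masks" from the model predictions
+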
+
+def build_data_filter(cfg: CfgNode):
+ if cfg.TYPE == "detection_score":
+ min_score = cfg.MIN_VALUE
+ return ScoreBasedFilter(min_score=min_score)
+ raise ValueError(f"Unknown data filter type {cfg.TYPE}")
+
+
+def build_inference_based_loader(
+ cfg: CfgNode,
+ dataset_cfg: CfgNode,
+ model: torch.nn.Module,
+ embedder: Optional[torch.nn.Module] = None,
+) -> InferenceBasedLoader:
+ """
+ Constructs data loader based on inference results of a model.
+ """
+ dataset = build_bootstrap_dataset(dataset_cfg.DATASET, dataset_cfg.IMAGE_LOADER)
+ meta = MetadataCatalog.get(dataset_cfg.DATASET)
+ training_sampler = TrainingSampler(len(dataset))
+ data_loader = torch.utils.data.DataLoader(
+ dataset, # pyre-ignore[6]
+ batch_size=dataset_cfg.IMAGE_LOADER.BATCH_SIZE,
+ sampler=training_sampler,
+ num_workers=dataset_cfg.IMAGE_LOADER.NUM_WORKERS,
+ collate_fn=trivial_batch_collator,
+ worker_init_fn=worker_init_reset_seed,
+ )
+ return InferenceBasedLoader(
+ model,
+ data_loader=data_loader,
+ data_sampler=build_data_sampler(cfg, dataset_cfg.DATA_SAMPLER, embedder),
+ data_filter=build_data_filter(dataset_cfg.FILTER),
+ shuffle=True,
+ batch_size=dataset_cfg.INFERENCE.OUTPUT_BATCH_SIZE,
+ inference_batch_size=dataset_cfg.INFERENCE.INPUT_BATCH_SIZE,
+ category_to_class_mapping=meta.category_to_class_mapping,
+ )
+
+
+def has_inference_based_loaders(cfg: CfgNode) -> bool:
+ """
+    Returns True if at least one inference-based loader must
+    be instantiated for training
+ """
+ return len(cfg.BOOTSTRAP_DATASETS) > 0
+
+
+def build_inference_based_loaders(
+ cfg: CfgNode, model: torch.nn.Module
+) -> Tuple[List[InferenceBasedLoader], List[float]]:
+ loaders = []
+ ratios = []
+ embedder = build_densepose_embedder(cfg).to(device=model.device) # pyre-ignore[16]
+ for dataset_spec in cfg.BOOTSTRAP_DATASETS:
+ dataset_cfg = get_bootstrap_dataset_config().clone()
+ dataset_cfg.merge_from_other_cfg(CfgNode(dataset_spec))
+ loader = build_inference_based_loader(cfg, dataset_cfg, model, embedder)
+ loaders.append(loader)
+ ratios.append(dataset_cfg.RATIO)
+ return loaders, ratios
+
+
+def build_video_list_dataset(meta: Metadata, cfg: CfgNode):
+ video_list_fpath = meta.video_list_fpath
+ video_base_path = meta.video_base_path
+ category = meta.category
+ if cfg.TYPE == "video_keyframe":
+ frame_selector = build_frame_selector(cfg.SELECT)
+ transform = build_transform(cfg.TRANSFORM, data_type="image")
+ video_list = video_list_from_file(video_list_fpath, video_base_path)
+ keyframe_helper_fpath = getattr(cfg, "KEYFRAME_HELPER", None)
+ return VideoKeyframeDataset(
+ video_list, category, frame_selector, transform, keyframe_helper_fpath
+ )
+
+
+class _BootstrapDatasetFactoryCatalog(UserDict):
+ """
+    A global registry that maps each DatasetType to a factory function which creates
+    the corresponding bootstrap dataset from dataset metadata and config
+ """
+
+ def register(self, dataset_type: DatasetType, factory: Callable[[Metadata, CfgNode], Dataset]):
+ """
+ Args:
+ dataset_type (DatasetType): a DatasetType e.g. DatasetType.VIDEO_LIST
+ factory (Callable[Metadata, CfgNode]): a callable which takes Metadata and cfg
+ arguments and returns a dataset object.
+ """
+        assert dataset_type not in self, "Dataset type '{}' is already registered!".format(dataset_type)
+ self[dataset_type] = factory
+
+
+BootstrapDatasetFactoryCatalog = _BootstrapDatasetFactoryCatalog()
+BootstrapDatasetFactoryCatalog.register(DatasetType.VIDEO_LIST, build_video_list_dataset)
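+
+
+# Extension sketch (illustrative only): to bootstrap from another data source, one would add a
+# new member to `DatasetType` (e.g. a hypothetical IMAGE_LIST) and register a factory for it:
+#
+#     def build_image_list_dataset(meta: Metadata, cfg: CfgNode):
+#         transform = build_transform(cfg.TRANSFORM, data_type="image")
+#         return ImageListDataset(meta.image_list, meta.category, transform)
+#
+#     BootstrapDatasetFactoryCatalog.register(DatasetType.IMAGE_LIST, build_image_list_dataset)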
diff --git a/densepose/data/combined_loader.py b/densepose/data/combined_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..c038c23a3b436b1cc6c29427c8dbf940f56250c9
--- /dev/null
+++ b/densepose/data/combined_loader.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import random
+from collections import deque
+from typing import Any, Collection, Deque, Iterable, Iterator, List, Sequence
+
+Loader = Iterable[Any]
+
+
+def _pooled_next(iterator: Iterator[Any], pool: Deque[Any]):
+ if not pool:
+ pool.extend(next(iterator))
+ return pool.popleft()
+
+
+class CombinedDataLoader:
+ """
+ Combines data loaders using the provided sampling ratios
+ """
+
+ BATCH_COUNT = 100
+
+ def __init__(self, loaders: Collection[Loader], batch_size: int, ratios: Sequence[float]):
+ self.loaders = loaders
+ self.batch_size = batch_size
+ self.ratios = ratios
+
+ def __iter__(self) -> Iterator[List[Any]]:
+ iters = [iter(loader) for loader in self.loaders]
+ indices = []
+        pool = [deque() for _ in iters]  # independent buffer per loader; a shared deque would mix loaders
+ # infinite iterator, as in D2
+ while True:
+ if not indices:
+ # just a buffer of indices, its size doesn't matter
+ # as long as it's a multiple of batch_size
+ k = self.batch_size * self.BATCH_COUNT
+ indices = random.choices(range(len(self.loaders)), self.ratios, k=k)
+ try:
+ batch = [_pooled_next(iters[i], pool[i]) for i in indices[: self.batch_size]]
+ except StopIteration:
+ break
+ indices = indices[self.batch_size :]
+ yield batch
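+
+
+# Usage sketch (illustrative only): any iterables of batches can be combined; here two toy
+# "loaders" are mixed with a 3:1 sampling ratio.
+#
+#     loader_a = [["a1", "a2"], ["a3", "a4"]] * 100
+#     loader_b = [["b1", "b2"], ["b3", "b4"]] * 100
+#     combined = CombinedDataLoader([loader_a, loader_b], batch_size=4, ratios=[0.75, 0.25])
+#     next(iter(combined))   # e.g. ["a1", "a2", "b1", "a3"]: ~75% of samples come from loader_a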
diff --git a/densepose/data/dataset_mapper.py b/densepose/data/dataset_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..5537a94c0811f7f6849f534612222e8dc154b59d
--- /dev/null
+++ b/densepose/data/dataset_mapper.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import copy
+import logging
+from typing import Any, Dict, List, Tuple
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+from detectron2.layers import ROIAlign
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+from densepose.structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData
+
+
+def build_augmentation(cfg, is_train):
+ logger = logging.getLogger(__name__)
+ result = utils.build_augmentation(cfg, is_train)
+ if is_train:
+ random_rotation = T.RandomRotation(
+ cfg.INPUT.ROTATION_ANGLES, expand=False, sample_style="choice"
+ )
+ result.append(random_rotation)
+ logger.info("DensePose-specific augmentation used in training: " + str(random_rotation))
+ return result
+
+
+class DatasetMapper:
+ """
+ A customized version of `detectron2.data.DatasetMapper`
+ """
+
+ def __init__(self, cfg, is_train=True):
+ self.augmentation = build_augmentation(cfg, is_train)
+
+ # fmt: off
+ self.img_format = cfg.INPUT.FORMAT
+ self.mask_on = (
+ cfg.MODEL.MASK_ON or (
+ cfg.MODEL.DENSEPOSE_ON
+ and cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS)
+ )
+ self.keypoint_on = cfg.MODEL.KEYPOINT_ON
+ self.densepose_on = cfg.MODEL.DENSEPOSE_ON
+ assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet"
+ # fmt: on
+ if self.keypoint_on and is_train:
+ # Flip only makes sense in training
+ self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+ else:
+ self.keypoint_hflip_indices = None
+
+ if self.densepose_on:
+ densepose_transform_srcs = [
+ MetadataCatalog.get(ds).densepose_transform_src
+ for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
+ ]
+ assert len(densepose_transform_srcs) > 0
+ # TODO: check that DensePose transformation data is the same for
+ # all the datasets. Otherwise one would have to pass DB ID with
+ # each entry to select proper transformation data. For now, since
+ # all DensePose annotated data uses the same data semantics, we
+ # omit this check.
+ densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0])
+ self.densepose_transform_data = DensePoseTransformData.load(
+ densepose_transform_data_fpath
+ )
+
+ self.is_train = is_train
+
+ def __call__(self, dataset_dict):
+ """
+ Args:
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+
+ Returns:
+ dict: a format that builtin models in detectron2 accept
+ """
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+ utils.check_image_size(dataset_dict, image)
+
+ image, transforms = T.apply_transform_gens(self.augmentation, image)
+ image_shape = image.shape[:2] # h, w
+ dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+
+ if not self.is_train:
+ dataset_dict.pop("annotations", None)
+ return dataset_dict
+
+ for anno in dataset_dict["annotations"]:
+ if not self.mask_on:
+ anno.pop("segmentation", None)
+ if not self.keypoint_on:
+ anno.pop("keypoints", None)
+
+ # USER: Implement additional transformations if you have other types of data
+ # USER: Don't call transpose_densepose if you don't need
+ annos = [
+ self._transform_densepose(
+ utils.transform_instance_annotations(
+ obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+ ),
+ transforms,
+ )
+ for obj in dataset_dict.pop("annotations")
+ if obj.get("iscrowd", 0) == 0
+ ]
+
+ if self.mask_on:
+ self._add_densepose_masks_as_segmentation(annos, image_shape)
+
+ instances = utils.annotations_to_instances(annos, image_shape, mask_format="bitmask")
+ densepose_annotations = [obj.get("densepose") for obj in annos]
+ if densepose_annotations and not all(v is None for v in densepose_annotations):
+ instances.gt_densepose = DensePoseList(
+ densepose_annotations, instances.gt_boxes, image_shape
+ )
+
+ dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()]
+ return dataset_dict
+
+ def _transform_densepose(self, annotation, transforms):
+ if not self.densepose_on:
+ return annotation
+
+ # Handle densepose annotations
+ is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation)
+ if is_valid:
+ densepose_data = DensePoseDataRelative(annotation, cleanup=True)
+ densepose_data.apply_transform(transforms, self.densepose_transform_data)
+ annotation["densepose"] = densepose_data
+ else:
+ # logger = logging.getLogger(__name__)
+ # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid))
+ DensePoseDataRelative.cleanup_annotation(annotation)
+ # NOTE: annotations for certain instances may be unavailable.
+            # 'None' is accepted by the DensePoseList data structure.
+ annotation["densepose"] = None
+ return annotation
+
+ def _add_densepose_masks_as_segmentation(
+ self, annotations: List[Dict[str, Any]], image_shape_hw: Tuple[int, int]
+ ):
+ for obj in annotations:
+ if ("densepose" not in obj) or ("segmentation" in obj):
+ continue
+ # DP segmentation: torch.Tensor [S, S] of float32, S=256
+ segm_dp = torch.zeros_like(obj["densepose"].segm)
+ segm_dp[obj["densepose"].segm > 0] = 1
+ segm_h, segm_w = segm_dp.shape
+ bbox_segm_dp = torch.tensor((0, 0, segm_h - 1, segm_w - 1), dtype=torch.float32)
+ # image bbox
+ x0, y0, x1, y1 = (
+ v.item() for v in BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS)
+ )
+ segm_aligned = (
+ ROIAlign((y1 - y0, x1 - x0), 1.0, 0, aligned=True)
+ .forward(segm_dp.view(1, 1, *segm_dp.shape), bbox_segm_dp)
+ .squeeze()
+ )
+ image_mask = torch.zeros(*image_shape_hw, dtype=torch.float32)
+ image_mask[y0:y1, x0:x1] = segm_aligned
+ # segmentation for BitMask: np.array [H, W] of bool
+ obj["segmentation"] = image_mask >= 0.5
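+
+
+# Usage sketch (illustrative only): the mapper converts one dataset dict (as produced by the
+# dataset registration code) into the per-image format consumed by the model; it is normally
+# passed to the loader builders in densepose.data.build.
+#
+#     mapper = DatasetMapper(cfg, is_train=True)
+#     model_input = mapper(dataset_dict)   # dict with "image" (CHW float32 tensor) and "instances"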
diff --git a/densepose/data/datasets/__init__.py b/densepose/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccf0cf1c1dd2e21e096bd7c849150d9c261b9b4f
--- /dev/null
+++ b/densepose/data/datasets/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from . import builtin # ensure the builtin datasets are registered
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/densepose/data/datasets/__pycache__/__init__.cpython-39.pyc b/densepose/data/datasets/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61935bbf49810ed1060457b39f303040bedf3995
Binary files /dev/null and b/densepose/data/datasets/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/builtin.cpython-39.pyc b/densepose/data/datasets/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63f05943d99425298d42f2d18e65d6b707edce98
Binary files /dev/null and b/densepose/data/datasets/__pycache__/builtin.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/chimpnsee.cpython-39.pyc b/densepose/data/datasets/__pycache__/chimpnsee.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fa4c5f0e50fd5dc7e21d849d5c115ea94f54394
Binary files /dev/null and b/densepose/data/datasets/__pycache__/chimpnsee.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/coco.cpython-39.pyc b/densepose/data/datasets/__pycache__/coco.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ffa9197dfb155499693b54226ac1f856481c4e8
Binary files /dev/null and b/densepose/data/datasets/__pycache__/coco.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/dataset_type.cpython-39.pyc b/densepose/data/datasets/__pycache__/dataset_type.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..355cf766241d344aaa9d0b0def6bc2a01a2c583b
Binary files /dev/null and b/densepose/data/datasets/__pycache__/dataset_type.cpython-39.pyc differ
diff --git a/densepose/data/datasets/__pycache__/lvis.cpython-39.pyc b/densepose/data/datasets/__pycache__/lvis.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9599f29ee346020a1fab05b3b4d3a4943e447493
Binary files /dev/null and b/densepose/data/datasets/__pycache__/lvis.cpython-39.pyc differ
diff --git a/densepose/data/datasets/builtin.py b/densepose/data/datasets/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..759c295e064b29c7968ec7db5e78d3d4de033578
--- /dev/null
+++ b/densepose/data/datasets/builtin.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from .chimpnsee import register_dataset as register_chimpnsee_dataset
+from .coco import BASE_DATASETS as BASE_COCO_DATASETS
+from .coco import DATASETS as COCO_DATASETS
+from .coco import register_datasets as register_coco_datasets
+from .lvis import DATASETS as LVIS_DATASETS
+from .lvis import register_datasets as register_lvis_datasets
+
+DEFAULT_DATASETS_ROOT = "datasets"
+
+
+register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT)
+register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT)
+register_lvis_datasets(LVIS_DATASETS, DEFAULT_DATASETS_ROOT)
+
+register_chimpnsee_dataset(DEFAULT_DATASETS_ROOT) # pyre-ignore[19]
diff --git a/densepose/data/datasets/chimpnsee.py b/densepose/data/datasets/chimpnsee.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a0ee3768597f730f8230f52807a953148350f16
--- /dev/null
+++ b/densepose/data/datasets/chimpnsee.py
@@ -0,0 +1,31 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Optional
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+
+from ..utils import maybe_prepend_base_path
+from .dataset_type import DatasetType
+
+CHIMPNSEE_DATASET_NAME = "chimpnsee"
+
+
+def register_dataset(datasets_root: Optional[str] = None) -> None:
+ def empty_load_callback():
+ pass
+
+ video_list_fpath = maybe_prepend_base_path(
+ datasets_root,
+ "chimpnsee/cdna.eva.mpg.de/video_list.txt",
+ )
+ video_base_path = maybe_prepend_base_path(datasets_root, "chimpnsee/cdna.eva.mpg.de")
+
+ DatasetCatalog.register(CHIMPNSEE_DATASET_NAME, empty_load_callback)
+ MetadataCatalog.get(CHIMPNSEE_DATASET_NAME).set(
+ dataset_type=DatasetType.VIDEO_LIST,
+ video_list_fpath=video_list_fpath,
+ video_base_path=video_base_path,
+ category="chimpanzee",
+ )
diff --git a/densepose/data/datasets/coco.py b/densepose/data/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..47c9a5e1dc7422a970b7804277f9ba07841bc714
--- /dev/null
+++ b/densepose/data/datasets/coco.py
@@ -0,0 +1,434 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import contextlib
+import io
+import logging
+import os
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, Iterable, List, Optional
+from fvcore.common.timer import Timer
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+from ..utils import maybe_prepend_base_path
+
+DENSEPOSE_MASK_KEY = "dp_masks"
+DENSEPOSE_IUV_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"]
+DENSEPOSE_CSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_vertex", "ref_model"]
+DENSEPOSE_ALL_POSSIBLE_KEYS = set(
+ DENSEPOSE_IUV_KEYS_WITHOUT_MASK + DENSEPOSE_CSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY]
+)
+DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/"
+
+
+@dataclass
+class CocoDatasetInfo:
+ name: str
+ images_root: str
+ annotations_fpath: str
+
+
+DATASETS = [
+ CocoDatasetInfo(
+ name="densepose_coco_2014_train",
+ images_root="coco/train2014",
+ annotations_fpath="coco/annotations/densepose_train2014.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_minival",
+ images_root="coco/val2014",
+ annotations_fpath="coco/annotations/densepose_minival2014.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_minival_100",
+ images_root="coco/val2014",
+ annotations_fpath="coco/annotations/densepose_minival2014_100.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_valminusminival",
+ images_root="coco/val2014",
+ annotations_fpath="coco/annotations/densepose_valminusminival2014.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_train_cse",
+ images_root="coco/train2014",
+ annotations_fpath="coco_cse/densepose_train2014_cse.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_minival_cse",
+ images_root="coco/val2014",
+ annotations_fpath="coco_cse/densepose_minival2014_cse.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_minival_100_cse",
+ images_root="coco/val2014",
+ annotations_fpath="coco_cse/densepose_minival2014_100_cse.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_coco_2014_valminusminival_cse",
+ images_root="coco/val2014",
+ annotations_fpath="coco_cse/densepose_valminusminival2014_cse.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_chimps",
+ images_root="densepose_chimps/images",
+ annotations_fpath="densepose_chimps/densepose_chimps_densepose.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_chimps_cse_train",
+ images_root="densepose_chimps/images",
+ annotations_fpath="densepose_chimps/densepose_chimps_cse_train.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_chimps_cse_val",
+ images_root="densepose_chimps/images",
+ annotations_fpath="densepose_chimps/densepose_chimps_cse_val.json",
+ ),
+ CocoDatasetInfo(
+ name="posetrack2017_train",
+ images_root="posetrack2017/posetrack_data_2017",
+ annotations_fpath="posetrack2017/densepose_posetrack_train2017.json",
+ ),
+ CocoDatasetInfo(
+ name="posetrack2017_val",
+ images_root="posetrack2017/posetrack_data_2017",
+ annotations_fpath="posetrack2017/densepose_posetrack_val2017.json",
+ ),
+ CocoDatasetInfo(
+ name="lvis_v05_train",
+ images_root="coco/train2017",
+ annotations_fpath="lvis/lvis_v0.5_plus_dp_train.json",
+ ),
+ CocoDatasetInfo(
+ name="lvis_v05_val",
+ images_root="coco/val2017",
+ annotations_fpath="lvis/lvis_v0.5_plus_dp_val.json",
+ ),
+]
+
+
+BASE_DATASETS = [
+ CocoDatasetInfo(
+ name="base_coco_2017_train",
+ images_root="coco/train2017",
+ annotations_fpath="coco/annotations/instances_train2017.json",
+ ),
+ CocoDatasetInfo(
+ name="base_coco_2017_val",
+ images_root="coco/val2017",
+ annotations_fpath="coco/annotations/instances_val2017.json",
+ ),
+ CocoDatasetInfo(
+ name="base_coco_2017_val_100",
+ images_root="coco/val2017",
+ annotations_fpath="coco/annotations/instances_val2017_100.json",
+ ),
+]
+
+
+def get_metadata(base_path: Optional[str]) -> Dict[str, Any]:
+ """
+ Returns metadata associated with COCO DensePose datasets
+
+ Args:
+ base_path: Optional[str]
+ Base path used to load metadata from
+
+ Returns:
+ Dict[str, Any]
+ Metadata in the form of a dictionary
+ """
+ meta = {
+ "densepose_transform_src": maybe_prepend_base_path(base_path, "UV_symmetry_transforms.mat"),
+ "densepose_smpl_subdiv": maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"),
+ "densepose_smpl_subdiv_transform": maybe_prepend_base_path(
+ base_path,
+ "SMPL_SUBDIV_TRANSFORM.mat",
+ ),
+ }
+ return meta
+
+
+def _load_coco_annotations(json_file: str):
+ """
+ Load COCO annotations from a JSON file
+
+ Args:
+ json_file: str
+ Path to the file to load annotations from
+ Returns:
+ Instance of `pycocotools.coco.COCO` that provides access to annotations
+ data
+ """
+ from pycocotools.coco import COCO
+
+ logger = logging.getLogger(__name__)
+ timer = Timer()
+ with contextlib.redirect_stdout(io.StringIO()):
+ coco_api = COCO(json_file)
+ if timer.seconds() > 1:
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+ return coco_api
+
+
+def _add_categories_metadata(dataset_name: str, categories: List[Dict[str, Any]]):
+ meta = MetadataCatalog.get(dataset_name)
+ meta.categories = {c["id"]: c["name"] for c in categories}
+ logger = logging.getLogger(__name__)
+ logger.info("Dataset {} categories: {}".format(dataset_name, meta.categories))
+
+
+def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]):
+ if "minival" in json_file:
+ # Skip validation on COCO2014 valminusminival and minival annotations
+ # The ratio of buggy annotations there is tiny and does not affect accuracy
+ # Therefore we explicitly white-list them
+ return
+ ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+ assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+ json_file
+ )
+
+
+def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+ if "bbox" not in ann_dict:
+ return
+ obj["bbox"] = ann_dict["bbox"]
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
+
+
+def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+ if "segmentation" not in ann_dict:
+ return
+ segm = ann_dict["segmentation"]
+ if not isinstance(segm, dict):
+ # filter out invalid polygons (< 3 points)
+ segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+ if len(segm) == 0:
+ return
+ obj["segmentation"] = segm
+
+
+def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+ if "keypoints" not in ann_dict:
+ return
+ keypts = ann_dict["keypoints"] # list[int]
+ for idx, v in enumerate(keypts):
+ if idx % 3 != 2:
+ # COCO's segmentation coordinates are floating points in [0, H or W],
+ # but keypoint coordinates are integers in [0, H-1 or W-1]
+ # Therefore we assume the coordinates are "pixel indices" and
+ # add 0.5 to convert to floating point coordinates.
+ keypts[idx] = v + 0.5
+ obj["keypoints"] = keypts
+
+
+def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]):
+ for key in DENSEPOSE_ALL_POSSIBLE_KEYS:
+ if key in ann_dict:
+ obj[key] = ann_dict[key]
+
+
+def _combine_images_with_annotations(
+ dataset_name: str,
+ image_root: str,
+ img_datas: Iterable[Dict[str, Any]],
+ ann_datas: Iterable[Iterable[Dict[str, Any]]],
+):
+
+ ann_keys = ["iscrowd", "category_id"]
+ dataset_dicts = []
+ contains_video_frame_info = False
+
+ for img_dict, ann_dicts in zip(img_datas, ann_datas):
+ record = {}
+ record["file_name"] = os.path.join(image_root, img_dict["file_name"])
+ record["height"] = img_dict["height"]
+ record["width"] = img_dict["width"]
+ record["image_id"] = img_dict["id"]
+ record["dataset"] = dataset_name
+ if "frame_id" in img_dict:
+ record["frame_id"] = img_dict["frame_id"]
+ record["video_id"] = img_dict.get("vid_id", None)
+ contains_video_frame_info = True
+ objs = []
+ for ann_dict in ann_dicts:
+ assert ann_dict["image_id"] == record["image_id"]
+ assert ann_dict.get("ignore", 0) == 0
+ obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict}
+ _maybe_add_bbox(obj, ann_dict)
+ _maybe_add_segm(obj, ann_dict)
+ _maybe_add_keypoints(obj, ann_dict)
+ _maybe_add_densepose(obj, ann_dict)
+ objs.append(obj)
+ record["annotations"] = objs
+ dataset_dicts.append(record)
+ if contains_video_frame_info:
+ create_video_frame_mapping(dataset_name, dataset_dicts)
+ return dataset_dicts
+
+
+def get_contiguous_id_to_category_id_map(metadata):
+ cat_id_2_cont_id = metadata.thing_dataset_id_to_contiguous_id
+ cont_id_2_cat_id = {}
+ for cat_id, cont_id in cat_id_2_cont_id.items():
+ if cont_id in cont_id_2_cat_id:
+ continue
+ cont_id_2_cat_id[cont_id] = cat_id
+ return cont_id_2_cat_id
+
+
+def maybe_filter_categories_cocoapi(dataset_name, coco_api):
+ meta = MetadataCatalog.get(dataset_name)
+ cont_id_2_cat_id = get_contiguous_id_to_category_id_map(meta)
+ cat_id_2_cont_id = meta.thing_dataset_id_to_contiguous_id
+ # filter categories
+ cats = []
+ for cat in coco_api.dataset["categories"]:
+ cat_id = cat["id"]
+ if cat_id not in cat_id_2_cont_id:
+ continue
+ cont_id = cat_id_2_cont_id[cat_id]
+ if (cont_id in cont_id_2_cat_id) and (cont_id_2_cat_id[cont_id] == cat_id):
+ cats.append(cat)
+ coco_api.dataset["categories"] = cats
+ # filter annotations, if multiple categories are mapped to a single
+ # contiguous ID, use only one category ID and map all annotations to that category ID
+ anns = []
+ for ann in coco_api.dataset["annotations"]:
+ cat_id = ann["category_id"]
+ if cat_id not in cat_id_2_cont_id:
+ continue
+ cont_id = cat_id_2_cont_id[cat_id]
+ ann["category_id"] = cont_id_2_cat_id[cont_id]
+ anns.append(ann)
+ coco_api.dataset["annotations"] = anns
+ # recreate index
+ coco_api.createIndex()
+
+
+def maybe_filter_and_map_categories_cocoapi(dataset_name, coco_api):
+ meta = MetadataCatalog.get(dataset_name)
+ category_id_map = meta.thing_dataset_id_to_contiguous_id
+ # map categories
+ cats = []
+ for cat in coco_api.dataset["categories"]:
+ cat_id = cat["id"]
+ if cat_id not in category_id_map:
+ continue
+ cat["id"] = category_id_map[cat_id]
+ cats.append(cat)
+ coco_api.dataset["categories"] = cats
+ # map annotation categories
+ anns = []
+ for ann in coco_api.dataset["annotations"]:
+ cat_id = ann["category_id"]
+ if cat_id not in category_id_map:
+ continue
+ ann["category_id"] = category_id_map[cat_id]
+ anns.append(ann)
+ coco_api.dataset["annotations"] = anns
+ # recreate index
+ coco_api.createIndex()
+
+
+def create_video_frame_mapping(dataset_name, dataset_dicts):
+ mapping = defaultdict(dict)
+ for d in dataset_dicts:
+ video_id = d.get("video_id")
+ if video_id is None:
+ continue
+ mapping[video_id].update({d["frame_id"]: d["file_name"]})
+ MetadataCatalog.get(dataset_name).set(video_frame_mapping=mapping)
+
+
+def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str):
+ """
+ Loads a JSON file with annotations in COCO instances format.
+ Replaces `detectron2.data.datasets.coco.load_coco_json` to handle metadata
+ in a more flexible way. Postpones category mapping to a later stage to be
+ able to combine several datasets with different (but coherent) sets of
+ categories.
+
+ Args:
+
+ annotations_json_file: str
+ Path to the JSON file with annotations in COCO instances format.
+ image_root: str
+ directory that contains all the images
+ dataset_name: str
+ the name that identifies a dataset, e.g. "densepose_coco_2014_train"
+ """
+ coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file))
+ _add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds()))
+ # sort indices for reproducible results
+ img_ids = sorted(coco_api.imgs.keys())
+ # imgs is a list of dicts, each looks something like:
+ # {'license': 4,
+ # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+ # 'file_name': 'COCO_val2014_000000001268.jpg',
+ # 'height': 427,
+ # 'width': 640,
+ # 'date_captured': '2013-11-17 05:57:24',
+ # 'id': 1268}
+ imgs = coco_api.loadImgs(img_ids)
+ logger = logging.getLogger(__name__)
+ logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file))
+ # anns is a list[list[dict]], where each dict is an annotation
+ # record for an object. The inner list enumerates the objects in an image
+ # and the outer list enumerates over images.
+ anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]
+ _verify_annotations_have_unique_ids(annotations_json_file, anns)
+ dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
+ return dataset_records
+
+
+def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[str] = None):
+ """
+ Registers provided COCO DensePose dataset
+
+ Args:
+ dataset_data: CocoDatasetInfo
+ Dataset data
+ datasets_root: Optional[str]
+ Datasets root folder (default: None)
+ """
+ annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
+ images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)
+
+ def load_annotations():
+ return load_coco_json(
+ annotations_json_file=annotations_fpath,
+ image_root=images_root,
+ dataset_name=dataset_data.name,
+ )
+
+ DatasetCatalog.register(dataset_data.name, load_annotations)
+ MetadataCatalog.get(dataset_data.name).set(
+ json_file=annotations_fpath,
+ image_root=images_root,
+ **get_metadata(DENSEPOSE_METADATA_URL_PREFIX)
+ )
+
+
+def register_datasets(
+ datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[str] = None
+):
+ """
+ Registers provided COCO DensePose datasets
+
+ Args:
+ datasets_data: Iterable[CocoDatasetInfo]
+            An iterable of dataset descriptors
+ datasets_root: Optional[str]
+ Datasets root folder (default: None)
+ """
+ for dataset_data in datasets_data:
+ register_dataset(dataset_data, datasets_root)
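+
+
+# Usage sketch (illustrative only; the dataset name and paths are made up): registering a
+# custom COCO-format DensePose dataset in addition to the builtin ones above.
+#
+#     my_dataset = CocoDatasetInfo(
+#         name="densepose_my_data_train",
+#         images_root="my_data/images",
+#         annotations_fpath="my_data/annotations/densepose_train.json",
+#     )
+#     register_dataset(my_dataset, datasets_root="datasets")
+#     dataset_dicts = DatasetCatalog.get("densepose_my_data_train")   # triggers the lazy load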
diff --git a/densepose/data/datasets/dataset_type.py b/densepose/data/datasets/dataset_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e546f2aa74b4586d97618d41c69432ed01e21e9
--- /dev/null
+++ b/densepose/data/datasets/dataset_type.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from enum import Enum
+
+
+class DatasetType(Enum):
+ """
+ Dataset type, mostly used for datasets that contain data to bootstrap models on
+ """
+
+ VIDEO_LIST = "video_list"
diff --git a/densepose/data/datasets/lvis.py b/densepose/data/datasets/lvis.py
new file mode 100644
index 0000000000000000000000000000000000000000..e90caac4bb429f9500a98998df18d238254a709e
--- /dev/null
+++ b/densepose/data/datasets/lvis.py
@@ -0,0 +1,259 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import logging
+import os
+from typing import Any, Dict, Iterable, List, Optional
+from fvcore.common.timer import Timer
+
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets.lvis import get_lvis_instances_meta
+from detectron2.structures import BoxMode
+from detectron2.utils.file_io import PathManager
+
+from ..utils import maybe_prepend_base_path
+from .coco import (
+ DENSEPOSE_ALL_POSSIBLE_KEYS,
+ DENSEPOSE_METADATA_URL_PREFIX,
+ CocoDatasetInfo,
+ get_metadata,
+)
+
+DATASETS = [
+ CocoDatasetInfo(
+ name="densepose_lvis_v1_ds1_train_v1",
+ images_root="coco_",
+ annotations_fpath="lvis/densepose_lvis_v1_ds1_train_v1.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_lvis_v1_ds1_val_v1",
+ images_root="coco_",
+ annotations_fpath="lvis/densepose_lvis_v1_ds1_val_v1.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_lvis_v1_ds2_train_v1",
+ images_root="coco_",
+ annotations_fpath="lvis/densepose_lvis_v1_ds2_train_v1.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_lvis_v1_ds2_val_v1",
+ images_root="coco_",
+ annotations_fpath="lvis/densepose_lvis_v1_ds2_val_v1.json",
+ ),
+ CocoDatasetInfo(
+ name="densepose_lvis_v1_ds1_val_animals_100",
+ images_root="coco_",
+ annotations_fpath="lvis/densepose_lvis_v1_val_animals_100_v2.json",
+ ),
+]
+
+
+def _load_lvis_annotations(json_file: str):
+ """
+ Load COCO annotations from a JSON file
+
+ Args:
+ json_file: str
+ Path to the file to load annotations from
+ Returns:
+ Instance of `pycocotools.coco.COCO` that provides access to annotations
+ data
+ """
+ from lvis import LVIS
+
+ json_file = PathManager.get_local_path(json_file)
+ logger = logging.getLogger(__name__)
+ timer = Timer()
+ lvis_api = LVIS(json_file)
+ if timer.seconds() > 1:
+ logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))
+ return lvis_api
+
+
+def _add_categories_metadata(dataset_name: str) -> None:
+ metadict = get_lvis_instances_meta(dataset_name)
+ categories = metadict["thing_classes"]
+ metadata = MetadataCatalog.get(dataset_name)
+ metadata.categories = {i + 1: categories[i] for i in range(len(categories))}
+ logger = logging.getLogger(__name__)
+ logger.info(f"Dataset {dataset_name} has {len(categories)} categories")
+
+
+def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]) -> None:
+ ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
+ assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
+ json_file
+ )
+
+
+def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+ if "bbox" not in ann_dict:
+ return
+ obj["bbox"] = ann_dict["bbox"]
+ obj["bbox_mode"] = BoxMode.XYWH_ABS
+
+
+def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+ if "segmentation" not in ann_dict:
+ return
+ segm = ann_dict["segmentation"]
+ if not isinstance(segm, dict):
+ # filter out invalid polygons (< 3 points)
+ segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
+ if len(segm) == 0:
+ return
+ obj["segmentation"] = segm
+
+
+def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+ if "keypoints" not in ann_dict:
+ return
+ keypts = ann_dict["keypoints"] # list[int]
+ for idx, v in enumerate(keypts):
+ if idx % 3 != 2:
+ # COCO's segmentation coordinates are floating points in [0, H or W],
+ # but keypoint coordinates are integers in [0, H-1 or W-1]
+ # Therefore we assume the coordinates are "pixel indices" and
+ # add 0.5 to convert to floating point coordinates.
+ keypts[idx] = v + 0.5
+ obj["keypoints"] = keypts
+
+
+def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]) -> None:
+ for key in DENSEPOSE_ALL_POSSIBLE_KEYS:
+ if key in ann_dict:
+ obj[key] = ann_dict[key]
+
+
+def _combine_images_with_annotations(
+ dataset_name: str,
+ image_root: str,
+ img_datas: Iterable[Dict[str, Any]],
+ ann_datas: Iterable[Iterable[Dict[str, Any]]],
+):
+
+ dataset_dicts = []
+
+ def get_file_name(img_root, img_dict):
+ # Determine the path including the split folder ("train2017", "val2017", "test2017") from
+ # the coco_url field. Example:
+ # 'coco_url': 'http://images.cocodataset.org/train2017/000000155379.jpg'
+ split_folder, file_name = img_dict["coco_url"].split("/")[-2:]
+ return os.path.join(img_root + split_folder, file_name)
+
+ for img_dict, ann_dicts in zip(img_datas, ann_datas):
+ record = {}
+ record["file_name"] = get_file_name(image_root, img_dict)
+ record["height"] = img_dict["height"]
+ record["width"] = img_dict["width"]
+ record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", [])
+ record["neg_category_ids"] = img_dict.get("neg_category_ids", [])
+ record["image_id"] = img_dict["id"]
+ record["dataset"] = dataset_name
+
+ objs = []
+ for ann_dict in ann_dicts:
+ assert ann_dict["image_id"] == record["image_id"]
+ obj = {}
+ _maybe_add_bbox(obj, ann_dict)
+ obj["iscrowd"] = ann_dict.get("iscrowd", 0)
+ obj["category_id"] = ann_dict["category_id"]
+ _maybe_add_segm(obj, ann_dict)
+ _maybe_add_keypoints(obj, ann_dict)
+ _maybe_add_densepose(obj, ann_dict)
+ objs.append(obj)
+ record["annotations"] = objs
+ dataset_dicts.append(record)
+ return dataset_dicts
+
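+# Note (illustrative example, not part of the original code): LVIS reuses the COCO 2017 image
+# folders, so file paths are derived from the `coco_url` field; with images_root "coco_" (as in
+# DATASETS above), a record like
+#
+#     {"coco_url": "http://images.cocodataset.org/train2017/000000155379.jpg", ...}
+#
+# resolves (relative to the datasets root) to file_name == "coco_train2017/000000155379.jpg".
+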
+
+def load_lvis_json(annotations_json_file: str, image_root: str, dataset_name: str):
+ """
+ Loads a JSON file with annotations in LVIS instances format.
+    Replaces `detectron2.data.datasets.lvis.load_lvis_json` to handle metadata
+ in a more flexible way. Postpones category mapping to a later stage to be
+ able to combine several datasets with different (but coherent) sets of
+ categories.
+
+ Args:
+
+        annotations_json_file: str
+            Path to the JSON file with annotations in LVIS instances format.
+        image_root: str
+            directory that contains all the images
+        dataset_name: str
+            the name that identifies a dataset, e.g. "densepose_lvis_v1_ds1_train_v1"
+ """
+ lvis_api = _load_lvis_annotations(PathManager.get_local_path(annotations_json_file))
+
+ _add_categories_metadata(dataset_name)
+
+ # sort indices for reproducible results
+ img_ids = sorted(lvis_api.imgs.keys())
+ # imgs is a list of dicts, each looks something like:
+ # {'license': 4,
+ # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
+ # 'file_name': 'COCO_val2014_000000001268.jpg',
+ # 'height': 427,
+ # 'width': 640,
+ # 'date_captured': '2013-11-17 05:57:24',
+ # 'id': 1268}
+ imgs = lvis_api.load_imgs(img_ids)
+ logger = logging.getLogger(__name__)
+ logger.info("Loaded {} images in LVIS format from {}".format(len(imgs), annotations_json_file))
+ # anns is a list[list[dict]], where each dict is an annotation
+ # record for an object. The inner list enumerates the objects in an image
+ # and the outer list enumerates over images.
+ anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]
+
+ _verify_annotations_have_unique_ids(annotations_json_file, anns)
+ dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns)
+ return dataset_records
+
+
+def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[str] = None) -> None:
+ """
+ Registers provided LVIS DensePose dataset
+
+ Args:
+ dataset_data: CocoDatasetInfo
+ Dataset data
+ datasets_root: Optional[str]
+ Datasets root folder (default: None)
+ """
+ annotations_fpath = maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath)
+ images_root = maybe_prepend_base_path(datasets_root, dataset_data.images_root)
+
+ def load_annotations():
+ return load_lvis_json(
+ annotations_json_file=annotations_fpath,
+ image_root=images_root,
+ dataset_name=dataset_data.name,
+ )
+
+ DatasetCatalog.register(dataset_data.name, load_annotations)
+ MetadataCatalog.get(dataset_data.name).set(
+ json_file=annotations_fpath,
+ image_root=images_root,
+ evaluator_type="lvis",
+ **get_metadata(DENSEPOSE_METADATA_URL_PREFIX),
+ )
+
+
+def register_datasets(
+ datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[str] = None
+) -> None:
+ """
+ Registers provided LVIS DensePose datasets
+
+ Args:
+ datasets_data: Iterable[CocoDatasetInfo]
+            An iterable of dataset descriptors
+ datasets_root: Optional[str]
+ Datasets root folder (default: None)
+ """
+ for dataset_data in datasets_data:
+ register_dataset(dataset_data, datasets_root)
diff --git a/densepose/data/image_list_dataset.py b/densepose/data/image_list_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..503bf647d7810f4b45cb3a442370ddbbf8e7f2a3
--- /dev/null
+++ b/densepose/data/image_list_dataset.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import logging
+import numpy as np
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch
+from torch.utils.data.dataset import Dataset
+
+from detectron2.data.detection_utils import read_image
+
+ImageTransform = Callable[[torch.Tensor], torch.Tensor]
+
+
+class ImageListDataset(Dataset):
+ """
+ Dataset that provides images from a list.
+ """
+
+ _EMPTY_IMAGE = torch.empty((0, 3, 1, 1))
+
+ def __init__(
+ self,
+ image_list: List[str],
+ category_list: Union[str, List[str], None] = None,
+ transform: Optional[ImageTransform] = None,
+ ):
+ """
+ Args:
+ image_list (List[str]): list of paths to image files
+ category_list (Union[str, List[str], None]): list of animal categories for
+ each image. If it is a string, or None, this applies to all images
+ """
+ if type(category_list) is list:
+ self.category_list = category_list
+ else:
+ self.category_list = [category_list] * len(image_list)
+ assert len(image_list) == len(
+ self.category_list
+ ), "length of image and category lists must be equal"
+ self.image_list = image_list
+ self.transform = transform
+
+ def __getitem__(self, idx: int) -> Dict[str, Any]:
+ """
+ Gets selected images from the list
+
+ Args:
+ idx (int): image index in the image list

+ Returns:
+ A dictionary containing two keys:
+ images (torch.Tensor): tensor of size [N, 3, H, W] (N = 1, or 0 for _EMPTY_IMAGE)
+ categories (List[str]): categories of the image
+ """
+ categories = [self.category_list[idx]]
+ fpath = self.image_list[idx]
+ transform = self.transform
+
+ try:
+ image = torch.from_numpy(np.ascontiguousarray(read_image(fpath, format="BGR")))
+ image = image.permute(2, 0, 1).unsqueeze(0).float() # HWC -> NCHW
+ if transform is not None:
+ image = transform(image)
+ return {"images": image, "categories": categories}
+ except (OSError, RuntimeError) as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(f"Error opening image file {fpath}: {e}")
+
+ return {"images": self._EMPTY_IMAGE, "categories": []}
+
+ def __len__(self):
+ return len(self.image_list)
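Editor's note: a small usage sketch. The file paths are hypothetical; the dataset reads each listed image lazily and falls back to the empty tensor on I/O errors.

    from densepose.data.image_list_dataset import ImageListDataset  # assumed module path

    dataset = ImageListDataset(
        image_list=["images/person_0001.jpg", "images/person_0002.jpg"],  # hypothetical paths
        category_list=None,   # one shared (None) category applied to all images
    )
    entry = dataset[0]
    # entry["images"]     -> float tensor of shape [1, 3, H, W] (or [0, 3, 1, 1] on read failure)
    # entry["categories"] -> [None]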
diff --git a/densepose/data/inference_based_loader.py b/densepose/data/inference_based_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..b643935cb7cbcaa06f66ca1c459ef25c5753cffd
--- /dev/null
+++ b/densepose/data/inference_based_loader.py
@@ -0,0 +1,174 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import random
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple
+import torch
+from torch import nn
+
+SampledData = Any
+ModelOutput = Any
+
+
+def _grouper(iterable: Iterable[Any], n: int, fillvalue=None) -> Iterator[Tuple[Any]]:
+ """
+ Group elements of an iterable by chunks of size `n`, e.g.
+ grouper(range(9), 4) ->
+ (0, 1, 2, 3), (4, 5, 6, 7), (8, None, None, None)
+ """
+ it = iter(iterable)
+ while True:
+ values = []
+ for _ in range(n):
+ try:
+ value = next(it)
+ except StopIteration:
+ if values:
+ values.extend([fillvalue] * (n - len(values)))
+ yield tuple(values)
+ return
+ values.append(value)
+ yield tuple(values)
+
+
+class ScoreBasedFilter:
+ """
+ Filters entries in model output based on their scores
+ Discards all entries with score less than the specified minimum
+ """
+
+ def __init__(self, min_score: float = 0.8):
+ self.min_score = min_score
+
+ def __call__(self, model_output: ModelOutput) -> ModelOutput:
+ for model_output_i in model_output:
+ instances = model_output_i["instances"]
+ if not instances.has("scores"):
+ continue
+ instances_filtered = instances[instances.scores >= self.min_score]
+ model_output_i["instances"] = instances_filtered
+ return model_output
+
+
+class InferenceBasedLoader:
+ """
+ Data loader based on results inferred by a model. Consists of:
+ - a data loader that provides batches of images
+ - a model that is used to infer the results
+ - a data sampler that converts inferred results to annotations
+ """
+
+ def __init__(
+ self,
+ model: nn.Module,
+ data_loader: Iterable[List[Dict[str, Any]]],
+ data_sampler: Optional[Callable[[ModelOutput], List[SampledData]]] = None,
+ data_filter: Optional[Callable[[ModelOutput], ModelOutput]] = None,
+ shuffle: bool = True,
+ batch_size: int = 4,
+ inference_batch_size: int = 4,
+ drop_last: bool = False,
+ category_to_class_mapping: Optional[dict] = None,
+ ):
+ """
+ Constructor
+
+ Args:
+ model (torch.nn.Module): model used to produce data
+ data_loader (Iterable[List[Dict[str, Any]]]): iterable that provides
+ dictionaries with "images" and "categories" fields to perform inference on
+ data_sampler (Callable: ModelOutput -> SampledData): functor
+ that produces annotation data from inference results;
+ (optional, default: None)
+ data_filter (Callable: ModelOutput -> ModelOutput): filter
+ that selects model outputs for further processing
+ (optional, default: None)
+ shuffle (bool): if True, the input images get shuffled
+ batch_size (int): batch size for the produced annotation data
+ inference_batch_size (int): batch size for input images
+ drop_last (bool): if True, drop the last batch if it is undersized
+ category_to_class_mapping (dict): category to class mapping
+ """
+ self.model = model
+ self.model.eval()
+ self.data_loader = data_loader
+ self.data_sampler = data_sampler
+ self.data_filter = data_filter
+ self.shuffle = shuffle
+ self.batch_size = batch_size
+ self.inference_batch_size = inference_batch_size
+ self.drop_last = drop_last
+ if category_to_class_mapping is not None:
+ self.category_to_class_mapping = category_to_class_mapping
+ else:
+ self.category_to_class_mapping = {}
+
+ def __iter__(self) -> Iterator[List[SampledData]]:
+ for batch in self.data_loader:
+ # batch : List[Dict[str: Tensor[N, C, H, W], str: Optional[str]]]
+ # images_batch : Tensor[N, C, H, W]
+ # image : Tensor[C, H, W]
+ images_and_categories = [
+ {"image": image, "category": category}
+ for element in batch
+ for image, category in zip(element["images"], element["categories"])
+ ]
+ if not images_and_categories:
+ continue
+ if self.shuffle:
+ random.shuffle(images_and_categories)
+ yield from self._produce_data(images_and_categories) # pyre-ignore[6]
+
+ def _produce_data(
+ self, images_and_categories: List[Tuple[torch.Tensor, Optional[str]]]
+ ) -> Iterator[List[SampledData]]:
+ """
+ Produce batches of data from images
+
+ Args:
+ images_and_categories (List[Tuple[torch.Tensor, Optional[str]]]):
+ list of images and corresponding categories to process
+
+ Returns:
+ Iterator over batches of data sampled from model outputs
+ """
+ data_batches: List[SampledData] = []
+ category_to_class_mapping = self.category_to_class_mapping
+ batched_images_and_categories = _grouper(images_and_categories, self.inference_batch_size)
+ for batch in batched_images_and_categories:
+ batch = [
+ {
+ "image": image_and_category["image"].to(self.model.device),
+ "category": image_and_category["category"],
+ }
+ for image_and_category in batch
+ if image_and_category is not None
+ ]
+ if not batch:
+ continue
+ with torch.no_grad():
+ model_output = self.model(batch)
+ for model_output_i, batch_i in zip(model_output, batch):
+ assert len(batch_i["image"].shape) == 3
+ model_output_i["image"] = batch_i["image"]
+ instance_class = category_to_class_mapping.get(batch_i["category"], 0)
+ model_output_i["instances"].dataset_classes = torch.tensor(
+ [instance_class] * len(model_output_i["instances"])
+ )
+ model_output_filtered = (
+ model_output if self.data_filter is None else self.data_filter(model_output)
+ )
+ data = (
+ model_output_filtered
+ if self.data_sampler is None
+ else self.data_sampler(model_output_filtered)
+ )
+ for data_i in data:
+ if len(data_i["instances"]):
+ data_batches.append(data_i)
+ if len(data_batches) >= self.batch_size:
+ yield data_batches[: self.batch_size]
+ data_batches = data_batches[self.batch_size :]
+ if not self.drop_last and data_batches:
+ yield data_batches
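Editor's note: a sketch of how this loader could be assembled. `model` and `image_dataset` are assumed to exist in the caller's scope (a detectron2 model exposing a `.device` attribute and an image dataset such as `ImageListDataset`).

    from torch.utils.data import DataLoader
    from densepose.data.inference_based_loader import InferenceBasedLoader, ScoreBasedFilter

    image_loader = DataLoader(image_dataset, batch_size=4, collate_fn=lambda batch: batch)
    loader = InferenceBasedLoader(
        model,                                   # frozen model, used only for inference
        data_loader=image_loader,
        data_filter=ScoreBasedFilter(min_score=0.8),
        batch_size=4,
    )
    for sampled_batch in loader:                 # batches of per-image dicts with filtered instances
        pass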
diff --git a/densepose/data/meshes/__init__.py b/densepose/data/meshes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7552c251b2225af62212aae69d4ce273608f7a67
--- /dev/null
+++ b/densepose/data/meshes/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from . import builtin
+
+__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")]
diff --git a/densepose/data/meshes/__pycache__/__init__.cpython-39.pyc b/densepose/data/meshes/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b73877f82c9da09bbc6452e13ef1ad70f0334acc
Binary files /dev/null and b/densepose/data/meshes/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/meshes/__pycache__/builtin.cpython-39.pyc b/densepose/data/meshes/__pycache__/builtin.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d5ea957980c9c3cb9e510ecd14334dfedd26bbc
Binary files /dev/null and b/densepose/data/meshes/__pycache__/builtin.cpython-39.pyc differ
diff --git a/densepose/data/meshes/__pycache__/catalog.cpython-39.pyc b/densepose/data/meshes/__pycache__/catalog.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7d310612b2685e0400e70419d39f3016093880
Binary files /dev/null and b/densepose/data/meshes/__pycache__/catalog.cpython-39.pyc differ
diff --git a/densepose/data/meshes/builtin.py b/densepose/data/meshes/builtin.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc8ec8418852dc344d7c4bd9f6c5fdd049b30a6d
--- /dev/null
+++ b/densepose/data/meshes/builtin.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from .catalog import MeshInfo, register_meshes
+
+DENSEPOSE_MESHES_DIR = "https://dl.fbaipublicfiles.com/densepose/meshes/"
+
+MESHES = [
+ MeshInfo(
+ name="smpl_27554",
+ data="smpl_27554.pkl",
+ geodists="geodists/geodists_smpl_27554.pkl",
+ symmetry="symmetry/symmetry_smpl_27554.pkl",
+ texcoords="texcoords/texcoords_smpl_27554.pkl",
+ ),
+ MeshInfo(
+ name="chimp_5029",
+ data="chimp_5029.pkl",
+ geodists="geodists/geodists_chimp_5029.pkl",
+ symmetry="symmetry/symmetry_chimp_5029.pkl",
+ texcoords="texcoords/texcoords_chimp_5029.pkl",
+ ),
+ MeshInfo(
+ name="cat_5001",
+ data="cat_5001.pkl",
+ geodists="geodists/geodists_cat_5001.pkl",
+ symmetry="symmetry/symmetry_cat_5001.pkl",
+ texcoords="texcoords/texcoords_cat_5001.pkl",
+ ),
+ MeshInfo(
+ name="cat_7466",
+ data="cat_7466.pkl",
+ geodists="geodists/geodists_cat_7466.pkl",
+ symmetry="symmetry/symmetry_cat_7466.pkl",
+ texcoords="texcoords/texcoords_cat_7466.pkl",
+ ),
+ MeshInfo(
+ name="sheep_5004",
+ data="sheep_5004.pkl",
+ geodists="geodists/geodists_sheep_5004.pkl",
+ symmetry="symmetry/symmetry_sheep_5004.pkl",
+ texcoords="texcoords/texcoords_sheep_5004.pkl",
+ ),
+ MeshInfo(
+ name="zebra_5002",
+ data="zebra_5002.pkl",
+ geodists="geodists/geodists_zebra_5002.pkl",
+ symmetry="symmetry/symmetry_zebra_5002.pkl",
+ texcoords="texcoords/texcoords_zebra_5002.pkl",
+ ),
+ MeshInfo(
+ name="horse_5004",
+ data="horse_5004.pkl",
+ geodists="geodists/geodists_horse_5004.pkl",
+ symmetry="symmetry/symmetry_horse_5004.pkl",
+ texcoords="texcoords/texcoords_zebra_5002.pkl",
+ ),
+ MeshInfo(
+ name="giraffe_5002",
+ data="giraffe_5002.pkl",
+ geodists="geodists/geodists_giraffe_5002.pkl",
+ symmetry="symmetry/symmetry_giraffe_5002.pkl",
+ texcoords="texcoords/texcoords_giraffe_5002.pkl",
+ ),
+ MeshInfo(
+ name="elephant_5002",
+ data="elephant_5002.pkl",
+ geodists="geodists/geodists_elephant_5002.pkl",
+ symmetry="symmetry/symmetry_elephant_5002.pkl",
+ texcoords="texcoords/texcoords_elephant_5002.pkl",
+ ),
+ MeshInfo(
+ name="dog_5002",
+ data="dog_5002.pkl",
+ geodists="geodists/geodists_dog_5002.pkl",
+ symmetry="symmetry/symmetry_dog_5002.pkl",
+ texcoords="texcoords/texcoords_dog_5002.pkl",
+ ),
+ MeshInfo(
+ name="dog_7466",
+ data="dog_7466.pkl",
+ geodists="geodists/geodists_dog_7466.pkl",
+ symmetry="symmetry/symmetry_dog_7466.pkl",
+ texcoords="texcoords/texcoords_dog_7466.pkl",
+ ),
+ MeshInfo(
+ name="cow_5002",
+ data="cow_5002.pkl",
+ geodists="geodists/geodists_cow_5002.pkl",
+ symmetry="symmetry/symmetry_cow_5002.pkl",
+ texcoords="texcoords/texcoords_cow_5002.pkl",
+ ),
+ MeshInfo(
+ name="bear_4936",
+ data="bear_4936.pkl",
+ geodists="geodists/geodists_bear_4936.pkl",
+ symmetry="symmetry/symmetry_bear_4936.pkl",
+ texcoords="texcoords/texcoords_bear_4936.pkl",
+ ),
+]
+
+register_meshes(MESHES, DENSEPOSE_MESHES_DIR)
diff --git a/densepose/data/meshes/catalog.py b/densepose/data/meshes/catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae624a8aa21fb11cc3c3f7ee467f28b896959781
--- /dev/null
+++ b/densepose/data/meshes/catalog.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import logging
+from collections import UserDict
+from dataclasses import dataclass
+from typing import Iterable, Optional
+
+from ..utils import maybe_prepend_base_path
+
+
+@dataclass
+class MeshInfo:
+ name: str
+ data: str
+ geodists: Optional[str] = None
+ symmetry: Optional[str] = None
+ texcoords: Optional[str] = None
+
+
+class _MeshCatalog(UserDict):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.mesh_ids = {}
+ self.mesh_names = {}
+ self.max_mesh_id = -1
+
+ def __setitem__(self, key, value):
+ if key in self:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Overwriting mesh catalog entry '{key}': old value {self[key]}"
+ f", new value {value}"
+ )
+ mesh_id = self.mesh_ids[key]
+ else:
+ self.max_mesh_id += 1
+ mesh_id = self.max_mesh_id
+ super().__setitem__(key, value)
+ self.mesh_ids[key] = mesh_id
+ self.mesh_names[mesh_id] = key
+
+ def get_mesh_id(self, shape_name: str) -> int:
+ return self.mesh_ids[shape_name]
+
+ def get_mesh_name(self, mesh_id: int) -> str:
+ return self.mesh_names[mesh_id]
+
+
+MeshCatalog = _MeshCatalog()
+
+
+def register_mesh(mesh_info: MeshInfo, base_path: Optional[str]) -> None:
+ geodists, symmetry, texcoords = mesh_info.geodists, mesh_info.symmetry, mesh_info.texcoords
+ if geodists:
+ geodists = maybe_prepend_base_path(base_path, geodists)
+ if symmetry:
+ symmetry = maybe_prepend_base_path(base_path, symmetry)
+ if texcoords:
+ texcoords = maybe_prepend_base_path(base_path, texcoords)
+ MeshCatalog[mesh_info.name] = MeshInfo(
+ name=mesh_info.name,
+ data=maybe_prepend_base_path(base_path, mesh_info.data),
+ geodists=geodists,
+ symmetry=symmetry,
+ texcoords=texcoords,
+ )
+
+
+def register_meshes(mesh_infos: Iterable[MeshInfo], base_path: Optional[str]) -> None:
+ for mesh_info in mesh_infos:
+ register_mesh(mesh_info, base_path)
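Editor's note: for illustration, registering a hypothetical mesh and querying the catalog (the mesh name and base path below are made up).

    from densepose.data.meshes.catalog import MeshCatalog, MeshInfo, register_mesh

    register_mesh(
        MeshInfo(name="toy_mesh", data="toy_mesh.pkl"),     # hypothetical mesh
        base_path="https://example.com/densepose/meshes/",  # hypothetical location
    )
    mesh_id = MeshCatalog.get_mesh_id("toy_mesh")
    assert MeshCatalog.get_mesh_name(mesh_id) == "toy_mesh"
    print(MeshCatalog["toy_mesh"].data)   # -> https://example.com/densepose/meshes/toy_mesh.pkl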
diff --git a/densepose/data/samplers/__init__.py b/densepose/data/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bf28288d8929c1b250720a2c6decfc9978dd903
--- /dev/null
+++ b/densepose/data/samplers/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .densepose_uniform import DensePoseUniformSampler
+from .densepose_confidence_based import DensePoseConfidenceBasedSampler
+from .densepose_cse_uniform import DensePoseCSEUniformSampler
+from .densepose_cse_confidence_based import DensePoseCSEConfidenceBasedSampler
+from .mask_from_densepose import MaskFromDensePoseSampler
+from .prediction_to_gt import PredictionToGroundTruthSampler
diff --git a/densepose/data/samplers/__pycache__/__init__.cpython-39.pyc b/densepose/data/samplers/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e579fdd88793d74e11c1b7b42d66414a54583596
Binary files /dev/null and b/densepose/data/samplers/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_base.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba5378faf8a96cd7e354c6b01ac2b20218ee1ae9
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_base.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ed3c6dd119f5f5618d46c8dd1c9bf5964e76abd
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_confidence_based.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59fa080b6e017af44015e1465931a67d316a8e5c
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_base.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53e87d53d9e478af918a4f4a6a7545dd86ccb5b4
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_confidence_based.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37adef4fbcb2361b9d6e007e89fad0b2771ade7f
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_cse_uniform.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/densepose_uniform.cpython-39.pyc b/densepose/data/samplers/__pycache__/densepose_uniform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e38dc210286789848ac9d3370014c81b67f60ae
Binary files /dev/null and b/densepose/data/samplers/__pycache__/densepose_uniform.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-39.pyc b/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..390b44b34d532f4110706819d021ec8f3f5640cb
Binary files /dev/null and b/densepose/data/samplers/__pycache__/mask_from_densepose.cpython-39.pyc differ
diff --git a/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-39.pyc b/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e99aa2aae13552c1b2940d07829153b7585c3c7
Binary files /dev/null and b/densepose/data/samplers/__pycache__/prediction_to_gt.cpython-39.pyc differ
diff --git a/densepose/data/samplers/densepose_base.py b/densepose/data/samplers/densepose_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..260413a5b65853d12b4cdb1bcff906f02ed7d63c
--- /dev/null
+++ b/densepose/data/samplers/densepose_base.py
@@ -0,0 +1,205 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any, Dict, List, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import BoxMode, Instances
+
+from densepose.converters import ToChartResultConverter
+from densepose.converters.base import IntTupleBox, make_int_box
+from densepose.structures import DensePoseDataRelative, DensePoseList
+
+
+class DensePoseBaseSampler:
+ """
+ Base DensePose sampler to produce DensePose data from DensePose predictions.
+ Samples for each class are drawn according to some distribution over all pixels estimated
+ to belong to that class.
+ """
+
+ def __init__(self, count_per_class: int = 8):
+ """
+ Constructor
+
+ Args:
+ count_per_class (int): the sampler produces at most `count_per_class`
+ samples for each category
+ """
+ self.count_per_class = count_per_class
+
+ def __call__(self, instances: Instances) -> DensePoseList:
+ """
+ Convert DensePose predictions (an instance of `DensePoseChartPredictorOutput`)
+ into DensePose annotations data (an instance of `DensePoseList`)
+ """
+ boxes_xyxy_abs = instances.pred_boxes.tensor.clone().cpu()
+ boxes_xywh_abs = BoxMode.convert(boxes_xyxy_abs, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
+ dp_datas = []
+ for i in range(len(boxes_xywh_abs)):
+ annotation_i = self._sample(instances[i], make_int_box(boxes_xywh_abs[i]))
+ annotation_i[DensePoseDataRelative.S_KEY] = self._resample_mask( # pyre-ignore[6]
+ instances[i].pred_densepose
+ )
+ dp_datas.append(DensePoseDataRelative(annotation_i))
+ # create densepose annotations on CPU
+ dp_list = DensePoseList(dp_datas, boxes_xyxy_abs, instances.image_size)
+ return dp_list
+
+ def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]:
+ """
+ Sample DensePoseDataRelative from estimation results
+ """
+ labels, dp_result = self._produce_labels_and_results(instance)
+ annotation = {
+ DensePoseDataRelative.X_KEY: [],
+ DensePoseDataRelative.Y_KEY: [],
+ DensePoseDataRelative.U_KEY: [],
+ DensePoseDataRelative.V_KEY: [],
+ DensePoseDataRelative.I_KEY: [],
+ }
+ n, h, w = dp_result.shape
+ for part_id in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
+ # indices - tuple of 3 1D tensors of size k
+ # 0: index along the first dimension N
+ # 1: index along H dimension
+ # 2: index along W dimension
+ indices = torch.nonzero(labels.expand(n, h, w) == part_id, as_tuple=True)
+ # values - an array of size [n, k]
+ # n: number of channels (U, V, confidences)
+ # k: number of points labeled with part_id
+ values = dp_result[indices].view(n, -1)
+ k = values.shape[1]
+ count = min(self.count_per_class, k)
+ if count <= 0:
+ continue
+ index_sample = self._produce_index_sample(values, count)
+ sampled_values = values[:, index_sample]
+ sampled_y = indices[1][index_sample] + 0.5
+ sampled_x = indices[2][index_sample] + 0.5
+ # prepare / normalize data
+ x = (sampled_x / w * 256.0).cpu().tolist()
+ y = (sampled_y / h * 256.0).cpu().tolist()
+ u = sampled_values[0].clamp(0, 1).cpu().tolist()
+ v = sampled_values[1].clamp(0, 1).cpu().tolist()
+ fine_segm_labels = [part_id] * count
+ # extend annotations
+ annotation[DensePoseDataRelative.X_KEY].extend(x)
+ annotation[DensePoseDataRelative.Y_KEY].extend(y)
+ annotation[DensePoseDataRelative.U_KEY].extend(u)
+ annotation[DensePoseDataRelative.V_KEY].extend(v)
+ annotation[DensePoseDataRelative.I_KEY].extend(fine_segm_labels)
+ return annotation
+
+ def _produce_index_sample(self, values: torch.Tensor, count: int):
+ """
+ Abstract method to produce a sample of indices to select data
+ To be implemented in descendants
+
+ Args:
+ values (torch.Tensor): an array of size [n, k] that contains
+ estimated values (U, V, confidences);
+ n: number of channels (U, V, confidences)
+ k: number of points labeled with part_id
+ count (int): number of samples to produce, should be positive and <= k
+
+ Return:
+ list(int): indices of values (along axis 1) selected as a sample
+ """
+ raise NotImplementedError
+
+ def _produce_labels_and_results(self, instance: Instances) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Method to get labels and DensePose results from an instance
+
+ Args:
+ instance (Instances): an instance of `DensePoseChartPredictorOutput`
+
+ Return:
+ labels (torch.Tensor): shape [H, W], DensePose segmentation labels
+ dp_result (torch.Tensor): shape [2, H, W], stacked DensePose results u and v
+ """
+ converter = ToChartResultConverter
+ chart_result = converter.convert(instance.pred_densepose, instance.pred_boxes)
+ labels, dp_result = chart_result.labels.cpu(), chart_result.uv.cpu()
+ return labels, dp_result
+
+ def _resample_mask(self, output: Any) -> torch.Tensor:
+ """
+ Convert DensePose predictor output to segmentation annotation - tensors of size
+ (256, 256) and type `int64`.
+
+ Args:
+ output: DensePose predictor output with the following attributes:
+ - coarse_segm: tensor of size [N, D, H, W] with unnormalized coarse
+ segmentation scores
+ - fine_segm: tensor of size [N, C, H, W] with unnormalized fine
+ segmentation scores
+ Return:
+ Tensor of size (S, S) and type `int64` with coarse segmentation annotations,
+ where S = DensePoseDataRelative.MASK_SIZE
+ """
+ sz = DensePoseDataRelative.MASK_SIZE
+ S = (
+ F.interpolate(output.coarse_segm, (sz, sz), mode="bilinear", align_corners=False)
+ .argmax(dim=1)
+ .long()
+ )
+ I = (
+ (
+ F.interpolate(
+ output.fine_segm,
+ (sz, sz),
+ mode="bilinear",
+ align_corners=False,
+ ).argmax(dim=1)
+ * (S > 0).long()
+ )
+ .squeeze()
+ .cpu()
+ )
+ # Map fine segmentation results to coarse segmentation ground truth
+ # TODO: extract this into separate classes
+ # coarse segmentation: 1 = Torso, 2 = Right Hand, 3 = Left Hand,
+ # 4 = Left Foot, 5 = Right Foot, 6 = Upper Leg Right, 7 = Upper Leg Left,
+ # 8 = Lower Leg Right, 9 = Lower Leg Left, 10 = Upper Arm Left,
+ # 11 = Upper Arm Right, 12 = Lower Arm Left, 13 = Lower Arm Right,
+ # 14 = Head
+ # fine segmentation: 1, 2 = Torso, 3 = Right Hand, 4 = Left Hand,
+ # 5 = Left Foot, 6 = Right Foot, 7, 9 = Upper Leg Right,
+ # 8, 10 = Upper Leg Left, 11, 13 = Lower Leg Right,
+ # 12, 14 = Lower Leg Left, 15, 17 = Upper Arm Left,
+ # 16, 18 = Upper Arm Right, 19, 21 = Lower Arm Left,
+ # 20, 22 = Lower Arm Right, 23, 24 = Head
+ FINE_TO_COARSE_SEGMENTATION = {
+ 1: 1,
+ 2: 1,
+ 3: 2,
+ 4: 3,
+ 5: 4,
+ 6: 5,
+ 7: 6,
+ 8: 7,
+ 9: 6,
+ 10: 7,
+ 11: 8,
+ 12: 9,
+ 13: 8,
+ 14: 9,
+ 15: 10,
+ 16: 11,
+ 17: 10,
+ 18: 11,
+ 19: 12,
+ 20: 13,
+ 21: 12,
+ 22: 13,
+ 23: 14,
+ 24: 14,
+ }
+ mask = torch.zeros((sz, sz), dtype=torch.int64, device=torch.device("cpu"))
+ for i in range(DensePoseDataRelative.N_PART_LABELS):
+ mask[I == i + 1] = FINE_TO_COARSE_SEGMENTATION[i + 1]
+ return mask
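Editor's note: concrete samplers only need to supply `_produce_index_sample`; a toy subclass (not part of this PR) could look like the sketch below.

    import torch
    from densepose.data.samplers.densepose_base import DensePoseBaseSampler  # assumed module path

    class TopUValueSampler(DensePoseBaseSampler):
        """Toy sampler: keep the `count` points with the largest U values."""

        def _produce_index_sample(self, values: torch.Tensor, count: int):
            # values has shape [n, k]; row 0 holds the U channel
            return torch.argsort(values[0], descending=True)[:count].tolist()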
diff --git a/densepose/data/samplers/densepose_confidence_based.py b/densepose/data/samplers/densepose_confidence_based.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a9a637e214cbd584773a9fb6031368b5d32417b
--- /dev/null
+++ b/densepose/data/samplers/densepose_confidence_based.py
@@ -0,0 +1,110 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import random
+from typing import Optional, Tuple
+import torch
+
+from densepose.converters import ToChartResultConverterWithConfidences
+
+from .densepose_base import DensePoseBaseSampler
+
+
+class DensePoseConfidenceBasedSampler(DensePoseBaseSampler):
+ """
+ Samples DensePose data from DensePose predictions.
+ Samples for each class are drawn using confidence value estimates.
+ """
+
+ def __init__(
+ self,
+ confidence_channel: str,
+ count_per_class: int = 8,
+ search_count_multiplier: Optional[float] = None,
+ search_proportion: Optional[float] = None,
+ ):
+ """
+ Constructor
+
+ Args:
+ confidence_channel (str): confidence channel to use for sampling;
+ possible values:
+ "sigma_2": confidences for UV values
+ "fine_segm_confidence": confidences for fine segmentation
+ "coarse_segm_confidence": confidences for coarse segmentation
+ (default: "sigma_2")
+ count_per_class (int): the sampler produces at most `count_per_class`
+ samples for each category (default: 8)
+ search_count_multiplier (float or None): if not None, the total number
+ of the most confident estimates of a given class to consider is
+ defined as `min(search_count_multiplier * count_per_class, N)`,
+ where `N` is the total number of estimates of the class; cannot be
+ specified together with `search_proportion` (default: None)
+ search_proportion (float or None): if not None, the total number of
+ the most confident estimates of a given class to consider is
+ defined as `min(max(search_proportion * N, count_per_class), N)`,
+ where `N` is the total number of estimates of the class; cannot be
+ specified together with `search_count_multiplier` (default: None)
+ """
+ super().__init__(count_per_class)
+ self.confidence_channel = confidence_channel
+ self.search_count_multiplier = search_count_multiplier
+ self.search_proportion = search_proportion
+ assert (search_count_multiplier is None) or (search_proportion is None), (
+ f"Cannot specify both search_count_multiplier (={search_count_multiplier})"
+ f" and search_proportion (={search_proportion})"
+ )
+
+ def _produce_index_sample(self, values: torch.Tensor, count: int):
+ """
+ Produce a sample of indices to select data based on confidences
+
+ Args:
+ values (torch.Tensor): an array of size [n, k] that contains
+ estimated values (U, V, confidences);
+ n: number of channels (U, V, confidences)
+ k: number of points labeled with part_id
+ count (int): number of samples to produce, should be positive and <= k
+
+ Return:
+ list(int): indices of values (along axis 1) selected as a sample
+ """
+ k = values.shape[1]
+ if k == count:
+ index_sample = list(range(k))
+ else:
+ # take the best count * search_count_multiplier pixels,
+ # sample from them uniformly
+ # (here best = smallest variance)
+ _, sorted_confidence_indices = torch.sort(values[2])
+ if self.search_count_multiplier is not None:
+ search_count = min(int(count * self.search_count_multiplier), k)
+ elif self.search_proportion is not None:
+ search_count = min(max(int(k * self.search_proportion), count), k)
+ else:
+ search_count = min(count, k)
+ sample_from_top = random.sample(range(search_count), count)
+ index_sample = sorted_confidence_indices[:search_count][sample_from_top]
+ return index_sample
+
+ def _produce_labels_and_results(self, instance) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Method to get labels and DensePose results from an instance, with confidences
+
+ Args:
+ instance (Instances): an instance of `DensePoseChartPredictorOutputWithConfidences`
+
+ Return:
+ labels (torch.Tensor): shape [H, W], DensePose segmentation labels
+ dp_result (torch.Tensor): shape [3, H, W], DensePose results u and v
+ stacked with the confidence channel
+ """
+ converter = ToChartResultConverterWithConfidences
+ chart_result = converter.convert(instance.pred_densepose, instance.pred_boxes)
+ labels, dp_result = chart_result.labels.cpu(), chart_result.uv.cpu()
+ dp_result = torch.cat(
+ (dp_result, getattr(chart_result, self.confidence_channel)[None].cpu())
+ )
+
+ return labels, dp_result
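Editor's note: typical construction, assuming `instances` is a detectron2 `Instances` object carrying DensePose predictions with confidence estimates.

    from densepose.data.samplers import DensePoseConfidenceBasedSampler

    sampler = DensePoseConfidenceBasedSampler(
        confidence_channel="sigma_2",   # sample according to UV confidences
        count_per_class=8,
        search_proportion=0.5,          # draw from the most confident half of each part
    )
    # densepose_list = sampler(instances)   # instances must expose pred_densepose / pred_boxes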
diff --git a/densepose/data/samplers/densepose_cse_base.py b/densepose/data/samplers/densepose_cse_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..593f1339f29308ff93ba98ed1426ee1dbd47be27
--- /dev/null
+++ b/densepose/data/samplers/densepose_cse_base.py
@@ -0,0 +1,141 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any, Dict, List, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.converters.base import IntTupleBox
+from densepose.data.utils import get_class_to_mesh_name_mapping
+from densepose.modeling.cse.utils import squared_euclidean_distance_matrix
+from densepose.structures import DensePoseDataRelative
+
+from .densepose_base import DensePoseBaseSampler
+
+
+class DensePoseCSEBaseSampler(DensePoseBaseSampler):
+ """
+ Base DensePose sampler to produce DensePose data from DensePose predictions.
+ Samples for each class are drawn according to some distribution over all pixels estimated
+ to belong to that class.
+ """
+
+ def __init__(
+ self,
+ cfg: CfgNode,
+ use_gt_categories: bool,
+ embedder: torch.nn.Module,
+ count_per_class: int = 8,
+ ):
+ """
+ Constructor
+
+ Args:
+ cfg (CfgNode): the config of the model
+ embedder (torch.nn.Module): necessary to compute mesh vertex embeddings
+ count_per_class (int): the sampler produces at most `count_per_class`
+ samples for each category
+ """
+ super().__init__(count_per_class)
+ self.embedder = embedder
+ self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+ self.use_gt_categories = use_gt_categories
+
+ def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]:
+ """
+ Sample DensePoseDataRelative from estimation results
+ """
+ if self.use_gt_categories:
+ instance_class = instance.dataset_classes.tolist()[0]
+ else:
+ instance_class = instance.pred_classes.tolist()[0]
+ mesh_name = self.class_to_mesh_name[instance_class]
+
+ annotation = {
+ DensePoseDataRelative.X_KEY: [],
+ DensePoseDataRelative.Y_KEY: [],
+ DensePoseDataRelative.VERTEX_IDS_KEY: [],
+ DensePoseDataRelative.MESH_NAME_KEY: mesh_name,
+ }
+
+ mask, embeddings, other_values = self._produce_mask_and_results(instance, bbox_xywh)
+ indices = torch.nonzero(mask, as_tuple=True)
+ selected_embeddings = embeddings.permute(1, 2, 0)[indices].cpu()
+ values = other_values[:, indices[0], indices[1]]
+ k = values.shape[1]
+
+ count = min(self.count_per_class, k)
+ if count <= 0:
+ return annotation
+
+ index_sample = self._produce_index_sample(values, count)
+ closest_vertices = squared_euclidean_distance_matrix(
+ selected_embeddings[index_sample], self.embedder(mesh_name)
+ )
+ closest_vertices = torch.argmin(closest_vertices, dim=1)
+
+ sampled_y = indices[0][index_sample] + 0.5
+ sampled_x = indices[1][index_sample] + 0.5
+ # prepare / normalize data
+ _, _, w, h = bbox_xywh
+ x = (sampled_x / w * 256.0).cpu().tolist()
+ y = (sampled_y / h * 256.0).cpu().tolist()
+ # extend annotations
+ annotation[DensePoseDataRelative.X_KEY].extend(x)
+ annotation[DensePoseDataRelative.Y_KEY].extend(y)
+ annotation[DensePoseDataRelative.VERTEX_IDS_KEY].extend(closest_vertices.cpu().tolist())
+ return annotation
+
+ def _produce_mask_and_results(
+ self, instance: Instances, bbox_xywh: IntTupleBox
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Method to get labels and DensePose results from an instance
+
+ Args:
+ instance (Instances): an instance of `DensePoseEmbeddingPredictorOutput`
+ bbox_xywh (IntTupleBox): the corresponding bounding box
+
+ Return:
+ mask (torch.Tensor): shape [H, W], DensePose segmentation mask
+ embeddings (torch.Tensor): a tensor of shape [D, H, W],
+ DensePose CSE Embeddings
+ other_values (torch.Tensor): a tensor of shape [0, H, W],
+ for potential other values
+ """
+ densepose_output = instance.pred_densepose
+ S = densepose_output.coarse_segm
+ E = densepose_output.embedding
+ _, _, w, h = bbox_xywh
+ embeddings = F.interpolate(E, size=(h, w), mode="bilinear")[0]
+ coarse_segm_resized = F.interpolate(S, size=(h, w), mode="bilinear")[0]
+ mask = coarse_segm_resized.argmax(0) > 0
+ other_values = torch.empty((0, h, w), device=E.device)
+ return mask, embeddings, other_values
+
+ def _resample_mask(self, output: Any) -> torch.Tensor:
+ """
+ Convert DensePose predictor output to segmentation annotation - tensors of size
+ (256, 256) and type `int64`.
+
+ Args:
+ output: DensePose predictor output with the following attributes:
+ - coarse_segm: tensor of size [N, D, H, W] with unnormalized coarse
+ segmentation scores
+ Return:
+ Tensor of size (S, S) and type `int64` with coarse segmentation annotations,
+ where S = DensePoseDataRelative.MASK_SIZE
+ """
+ sz = DensePoseDataRelative.MASK_SIZE
+ mask = (
+ F.interpolate(output.coarse_segm, (sz, sz), mode="bilinear", align_corners=False)
+ .argmax(dim=1)
+ .long()
+ .squeeze()
+ .cpu()
+ )
+ return mask
diff --git a/densepose/data/samplers/densepose_cse_confidence_based.py b/densepose/data/samplers/densepose_cse_confidence_based.py
new file mode 100644
index 0000000000000000000000000000000000000000..d656a5ab853152c65d8f4c88fe7210cf68ee8df7
--- /dev/null
+++ b/densepose/data/samplers/densepose_cse_confidence_based.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import random
+from typing import Optional, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.converters.base import IntTupleBox
+
+from .densepose_cse_base import DensePoseCSEBaseSampler
+
+
+class DensePoseCSEConfidenceBasedSampler(DensePoseCSEBaseSampler):
+ """
+ Samples DensePose data from DensePose predictions.
+ Samples for each class are drawn using confidence value estimates.
+ """
+
+ def __init__(
+ self,
+ cfg: CfgNode,
+ use_gt_categories: bool,
+ embedder: torch.nn.Module,
+ confidence_channel: str,
+ count_per_class: int = 8,
+ search_count_multiplier: Optional[float] = None,
+ search_proportion: Optional[float] = None,
+ ):
+ """
+ Constructor
+
+ Args:
+ cfg (CfgNode): the config of the model
+ embedder (torch.nn.Module): necessary to compute mesh vertex embeddings
+ confidence_channel (str): confidence channel to use for sampling;
+ possible values:
+ "coarse_segm_confidence": confidences for coarse segmentation
+ (default: "coarse_segm_confidence")
+ count_per_class (int): the sampler produces at most `count_per_class`
+ samples for each category (default: 8)
+ search_count_multiplier (float or None): if not None, the total number
+ of the most confident estimates of a given class to consider is
+ defined as `min(search_count_multiplier * count_per_class, N)`,
+ where `N` is the total number of estimates of the class; cannot be
+ specified together with `search_proportion` (default: None)
+ search_proportion (float or None): if not None, the total number of
+ the most confident estimates of a given class to consider is
+ defined as `min(max(search_proportion * N, count_per_class), N)`,
+ where `N` is the total number of estimates of the class; cannot be
+ specified together with `search_count_multiplier` (default: None)
+ """
+ super().__init__(cfg, use_gt_categories, embedder, count_per_class)
+ self.confidence_channel = confidence_channel
+ self.search_count_multiplier = search_count_multiplier
+ self.search_proportion = search_proportion
+ assert (search_count_multiplier is None) or (search_proportion is None), (
+ f"Cannot specify both search_count_multiplier (={search_count_multiplier})"
+ f" and search_proportion (={search_proportion})"
+ )
+
+ def _produce_index_sample(self, values: torch.Tensor, count: int):
+ """
+ Produce a sample of indices to select data based on confidences
+
+ Args:
+ values (torch.Tensor): a tensor of size [1, k] that contains confidences
+ k: number of points labeled with part_id
+ count (int): number of samples to produce, should be positive and <= k
+
+ Return:
+ list(int): indices of values (along axis 1) selected as a sample
+ """
+ k = values.shape[1]
+ if k == count:
+ index_sample = list(range(k))
+ else:
+ # take the best count * search_count_multiplier pixels,
+ # sample from them uniformly
+ # (here best = highest confidence)
+ _, sorted_confidence_indices = torch.sort(values[0])
+ if self.search_count_multiplier is not None:
+ search_count = min(int(count * self.search_count_multiplier), k)
+ elif self.search_proportion is not None:
+ search_count = min(max(int(k * self.search_proportion), count), k)
+ else:
+ search_count = min(count, k)
+ sample_from_top = random.sample(range(search_count), count)
+ index_sample = sorted_confidence_indices[-search_count:][sample_from_top]
+ return index_sample
+
+ def _produce_mask_and_results(
+ self, instance: Instances, bbox_xywh: IntTupleBox
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """
+ Method to get labels and DensePose results from an instance
+
+ Args:
+ instance (Instances): an instance of
+ `DensePoseEmbeddingPredictorOutputWithConfidences`
+ bbox_xywh (IntTupleBox): the corresponding bounding box
+
+ Return:
+ mask (torch.Tensor): shape [H, W], DensePose segmentation mask
+ embeddings (torch.Tensor): a tensor of shape [D, H, W]
+ DensePose CSE Embeddings
+ other_values: a tensor of shape [1, H, W], DensePose CSE confidence
+ """
+ _, _, w, h = bbox_xywh
+ densepose_output = instance.pred_densepose
+ mask, embeddings, _ = super()._produce_mask_and_results(instance, bbox_xywh)
+ other_values = F.interpolate(
+ getattr(densepose_output, self.confidence_channel),
+ size=(h, w),
+ mode="bilinear",
+ )[0].cpu()
+ return mask, embeddings, other_values
diff --git a/densepose/data/samplers/densepose_cse_uniform.py b/densepose/data/samplers/densepose_cse_uniform.py
new file mode 100644
index 0000000000000000000000000000000000000000..482c650caf404bfe96dd28c5092d2508b17a1dbf
--- /dev/null
+++ b/densepose/data/samplers/densepose_cse_uniform.py
@@ -0,0 +1,14 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .densepose_cse_base import DensePoseCSEBaseSampler
+from .densepose_uniform import DensePoseUniformSampler
+
+
+class DensePoseCSEUniformSampler(DensePoseCSEBaseSampler, DensePoseUniformSampler):
+ """
+ Uniform Sampler for CSE
+ """
+
+ pass
diff --git a/densepose/data/samplers/densepose_uniform.py b/densepose/data/samplers/densepose_uniform.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0e35b667047674a498433e4c153475a5b5a1fc
--- /dev/null
+++ b/densepose/data/samplers/densepose_uniform.py
@@ -0,0 +1,43 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import random
+import torch
+
+from .densepose_base import DensePoseBaseSampler
+
+
+class DensePoseUniformSampler(DensePoseBaseSampler):
+ """
+ Samples DensePose data from DensePose predictions.
+ Samples for each class are drawn uniformly over all pixels estimated
+ to belong to that class.
+ """
+
+ def __init__(self, count_per_class: int = 8):
+ """
+ Constructor
+
+ Args:
+ count_per_class (int): the sampler produces at most `count_per_class`
+ samples for each category
+ """
+ super().__init__(count_per_class)
+
+ def _produce_index_sample(self, values: torch.Tensor, count: int):
+ """
+ Produce a uniform sample of indices to select data
+
+ Args:
+ values (torch.Tensor): an array of size [n, k] that contains
+ estimated values (U, V, confidences);
+ n: number of channels (U, V, confidences)
+ k: number of points labeled with part_id
+ count (int): number of samples to produce, should be positive and <= k
+
+ Return:
+ list(int): indices of values (along axis 1) selected as a sample
+ """
+ k = values.shape[1]
+ return random.sample(range(k), count)
diff --git a/densepose/data/samplers/mask_from_densepose.py b/densepose/data/samplers/mask_from_densepose.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d631dff320bbec264675e6772c565cd06fc6b9f
--- /dev/null
+++ b/densepose/data/samplers/mask_from_densepose.py
@@ -0,0 +1,30 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from detectron2.structures import BitMasks, Instances
+
+from densepose.converters import ToMaskConverter
+
+
+class MaskFromDensePoseSampler:
+ """
+ Produce mask GT from DensePose predictions
+ This sampler simply converts DensePose predictions to BitMasks
+ that contain a bool tensor of the size of the input image
+ """
+
+ def __call__(self, instances: Instances) -> BitMasks:
+ """
+ Converts predicted data from `instances` into the GT mask data
+
+ Args:
+ instances (Instances): predicted results, expected to have `pred_densepose` field
+
+ Returns:
+ Boolean Tensor of the size of the input image that has non-zero
+ values at pixels that are estimated to belong to the detected object
+ """
+ return ToMaskConverter.convert(
+ instances.pred_densepose, instances.pred_boxes, instances.image_size
+ )
diff --git a/densepose/data/samplers/prediction_to_gt.py b/densepose/data/samplers/prediction_to_gt.py
new file mode 100644
index 0000000000000000000000000000000000000000..42a28ff4f19012e96fdf3fb4923500839429a999
--- /dev/null
+++ b/densepose/data/samplers/prediction_to_gt.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional
+
+from detectron2.structures import Instances
+
+ModelOutput = Dict[str, Any]
+SampledData = Dict[str, Any]
+
+
+@dataclass
+class _Sampler:
+ """
+ Sampler registry entry that contains:
+ - src (str): source field to sample from (deleted after sampling)
+ - dst (Optional[str]): destination field to sample to, if not None
+ - func (Optional[Callable: Any -> Any]): function that performs sampling,
+ if None, reference copy is performed
+ """
+
+ src: str
+ dst: Optional[str]
+ func: Optional[Callable[[Any], Any]]
+
+
+class PredictionToGroundTruthSampler:
+ """
+ Sampler implementation that converts predictions to GT using registered
+ samplers for different fields of `Instances`.
+ """
+
+ def __init__(self, dataset_name: str = ""):
+ self.dataset_name = dataset_name
+ self._samplers = {}
+ self.register_sampler("pred_boxes", "gt_boxes", None)
+ self.register_sampler("pred_classes", "gt_classes", None)
+ # delete scores
+ self.register_sampler("scores")
+
+ def __call__(self, model_output: List[ModelOutput]) -> List[SampledData]:
+ """
+ Transform model output into ground truth data through sampling
+
+ Args:
+ model_output (Dict[str, Any]): model output
+ Returns:
+ Dict[str, Any]: sampled data
+ """
+ for model_output_i in model_output:
+ instances: Instances = model_output_i["instances"]
+ # transform data in each field
+ for _, sampler in self._samplers.items():
+ if not instances.has(sampler.src) or sampler.dst is None:
+ continue
+ if sampler.func is None:
+ instances.set(sampler.dst, instances.get(sampler.src))
+ else:
+ instances.set(sampler.dst, sampler.func(instances))
+ # delete model output data that was transformed
+ for _, sampler in self._samplers.items():
+ if sampler.src != sampler.dst and instances.has(sampler.src):
+ instances.remove(sampler.src)
+ model_output_i["dataset"] = self.dataset_name
+ return model_output
+
+ def register_sampler(
+ self,
+ prediction_attr: str,
+ gt_attr: Optional[str] = None,
+ func: Optional[Callable[[Any], Any]] = None,
+ ):
+ """
+ Register sampler for a field
+
+ Args:
+ prediction_attr (str): field to replace with a sampled value
+ gt_attr (Optional[str]): field to store the sampled value to, if not None
+ func (Optional[Callable: Any -> Any]): sampler function
+ """
+ self._samplers[(prediction_attr, gt_attr)] = _Sampler(
+ src=prediction_attr, dst=gt_attr, func=func
+ )
+
+ def remove_sampler(
+ self,
+ prediction_attr: str,
+ gt_attr: Optional[str] = None,
+ ):
+ """
+ Remove sampler for a field
+
+ Args:
+ prediction_attr (str): field to replace with a sampled value
+ gt_attr (Optional[str]): field to store the sampled value to, if not None
+ """
+ assert (prediction_attr, gt_attr) in self._samplers
+ del self._samplers[(prediction_attr, gt_attr)]
diff --git a/densepose/data/transform/__init__.py b/densepose/data/transform/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..147671e198475ce4a82b17e8f81a688d697207d8
--- /dev/null
+++ b/densepose/data/transform/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .image import ImageResizeTransform
diff --git a/densepose/data/transform/__pycache__/__init__.cpython-39.pyc b/densepose/data/transform/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2c4b18f933a8a7ccf9fd415e5669d6418377aab
Binary files /dev/null and b/densepose/data/transform/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/transform/__pycache__/image.cpython-39.pyc b/densepose/data/transform/__pycache__/image.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ca923dca82946775e637795ac51b8cce341d007
Binary files /dev/null and b/densepose/data/transform/__pycache__/image.cpython-39.pyc differ
diff --git a/densepose/data/transform/image.py b/densepose/data/transform/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f35b3ab1de3b1b58e9d7f9763c73eb1236f67d2
--- /dev/null
+++ b/densepose/data/transform/image.py
@@ -0,0 +1,41 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import torch
+
+
+class ImageResizeTransform:
+ """
+ Transform that resizes images loaded from a dataset
+ (BGR data in NCHW channel order, typically uint8) to a format ready to be
+ consumed by DensePose training (BGR float32 data in NCHW channel order)
+ """
+
+ def __init__(self, min_size: int = 800, max_size: int = 1333):
+ self.min_size = min_size
+ self.max_size = max_size
+
+ def __call__(self, images: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ images (torch.Tensor): tensor of size [N, 3, H, W] that contains
+ BGR data (typically in uint8)
+ Returns:
+ images (torch.Tensor): tensor of size [N, 3, H1, W1] where
+ H1 and W1 are chosen to respect the specified min and max sizes
+ and preserve the original aspect ratio, the data channels
+ follow BGR order and the data type is `torch.float32`
+ """
+ # resize with min size
+ images = images.float()
+ min_size = min(images.shape[-2:])
+ max_size = max(images.shape[-2:])
+ scale = min(self.min_size / min_size, self.max_size / max_size)
+ images = torch.nn.functional.interpolate(
+ images,
+ scale_factor=scale,
+ mode="bilinear",
+ align_corners=False,
+ )
+ return images
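Editor's note: a quick check of the resizing behaviour on a fake batch.

    import torch
    from densepose.data.transform import ImageResizeTransform

    transform = ImageResizeTransform(min_size=800, max_size=1333)
    batch = torch.randint(0, 256, (1, 3, 480, 640), dtype=torch.uint8)  # fake BGR batch
    resized = transform(batch)
    print(resized.dtype, resized.shape)   # torch.float32, scaled so the shorter side is ~800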
diff --git a/densepose/data/utils.py b/densepose/data/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7625f3d5f7894d2d1519e8672d6fb2e6411e07ba
--- /dev/null
+++ b/densepose/data/utils.py
@@ -0,0 +1,40 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import os
+from typing import Dict, Optional
+
+from detectron2.config import CfgNode
+
+
+def is_relative_local_path(path: str) -> bool:
+ path_str = os.fsdecode(path)
+ return ("://" not in path_str) and not os.path.isabs(path)
+
+
+def maybe_prepend_base_path(base_path: Optional[str], path: str):
+ """
+ Prepends the provided path with a base path prefix if:
+ 1) base path is not None;
+ 2) path is a local path
+ """
+ if base_path is None:
+ return path
+ if is_relative_local_path(path):
+ return os.path.join(base_path, path)
+ return path
+
+
+def get_class_to_mesh_name_mapping(cfg: CfgNode) -> Dict[int, str]:
+ return {
+ int(class_id): mesh_name
+ for class_id, mesh_name in cfg.DATASETS.CLASS_TO_MESH_NAME_MAPPING.items()
+ }
+
+
+def get_category_to_class_mapping(dataset_cfg: CfgNode) -> Dict[str, int]:
+ return {
+ category: int(class_id)
+ for category, class_id in dataset_cfg.CATEGORY_TO_CLASS_MAPPING.items()
+ }
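Editor's note: behaviour of the path helper, for reference (paths are made up).

    from densepose.data.utils import maybe_prepend_base_path

    maybe_prepend_base_path("datasets", "coco/train2017")    # -> "datasets/coco/train2017"
    maybe_prepend_base_path("datasets", "/abs/train2017")    # absolute path returned unchanged
    maybe_prepend_base_path(None, "coco/train2017")          # no base path -> returned unchanged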
diff --git a/densepose/data/video/__init__.py b/densepose/data/video/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd83443be8d6fff40b35a13758c31984f3d89be
--- /dev/null
+++ b/densepose/data/video/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .frame_selector import (
+ FrameSelectionStrategy,
+ RandomKFramesSelector,
+ FirstKFramesSelector,
+ LastKFramesSelector,
+ FrameTsList,
+ FrameSelector,
+)
+
+from .video_keyframe_dataset import (
+ VideoKeyframeDataset,
+ video_list_from_file,
+ list_keyframes,
+ read_keyframes,
+)
diff --git a/densepose/data/video/__pycache__/__init__.cpython-39.pyc b/densepose/data/video/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79d16cb81c94ad299c124828b29cb8a0bfa9e358
Binary files /dev/null and b/densepose/data/video/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/data/video/__pycache__/frame_selector.cpython-39.pyc b/densepose/data/video/__pycache__/frame_selector.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3336d45f920a5e106d92a8da89ef0e8373b843da
Binary files /dev/null and b/densepose/data/video/__pycache__/frame_selector.cpython-39.pyc differ
diff --git a/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-39.pyc b/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e46b9af3c5a5a1bb2bdaa5af921fe941ed7a271
Binary files /dev/null and b/densepose/data/video/__pycache__/video_keyframe_dataset.cpython-39.pyc differ
diff --git a/densepose/data/video/frame_selector.py b/densepose/data/video/frame_selector.py
new file mode 100644
index 0000000000000000000000000000000000000000..77a97a82f7c7bb95b2023df946b246f9de71a7d2
--- /dev/null
+++ b/densepose/data/video/frame_selector.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import random
+from collections.abc import Callable
+from enum import Enum
+from typing import Callable as TCallable
+from typing import List
+
+FrameTsList = List[int]
+FrameSelector = TCallable[[FrameTsList], FrameTsList]
+
+
+class FrameSelectionStrategy(Enum):
+ """
+ Frame selection strategy used with videos:
+ - "random_k": select k random frames
+ - "first_k": select k first frames
+ - "last_k": select k last frames
+ - "all": select all frames
+ """
+
+ # fmt: off
+ RANDOM_K = "random_k"
+ FIRST_K = "first_k"
+ LAST_K = "last_k"
+ ALL = "all"
+ # fmt: on
+
+
+class RandomKFramesSelector(Callable): # pyre-ignore[39]
+ """
+ Selector that retains at most `k` random frames
+ """
+
+ def __init__(self, k: int):
+ self.k = k
+
+ def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
+ """
+ Select `k` random frames
+
+ Args:
+ frame_tss (List[int]): timestamps of input frames
+ Returns:
+ List[int]: timestamps of selected frames
+ """
+ return random.sample(frame_tss, min(self.k, len(frame_tss)))
+
+
+class FirstKFramesSelector(Callable): # pyre-ignore[39]
+ """
+ Selector that retains at most `k` first frames
+ """
+
+ def __init__(self, k: int):
+ self.k = k
+
+ def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
+ """
+ Select `k` first frames
+
+ Args:
+ frame_tss (List[int]): timestamps of input frames
+ Returns:
+ List[int]: timestamps of selected frames
+ """
+ return frame_tss[: self.k]
+
+
+class LastKFramesSelector(Callable): # pyre-ignore[39]
+ """
+ Selector that retains at most `k` last frames from video data
+ """
+
+ def __init__(self, k: int):
+ self.k = k
+
+ def __call__(self, frame_tss: FrameTsList) -> FrameTsList:
+ """
+ Select `k` last frames
+
+ Args:
+ frame_tss (List[int]): timestamps of input frames
+ Returns:
+ List[int]: timestamps of selected frames
+ """
+ return frame_tss[-self.k :]
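Editor's note: the selectors are plain callables over keyframe timestamp lists; for example (timestamps below are made up).

    from densepose.data.video import FirstKFramesSelector, LastKFramesSelector, RandomKFramesSelector

    keyframe_ts = [0, 250, 500, 750, 1000]          # hypothetical keyframe timestamps
    FirstKFramesSelector(k=2)(keyframe_ts)          # -> [0, 250]
    LastKFramesSelector(k=2)(keyframe_ts)           # -> [750, 1000]
    len(RandomKFramesSelector(k=3)(keyframe_ts))    # -> 3, chosen at random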
diff --git a/densepose/data/video/video_keyframe_dataset.py b/densepose/data/video/video_keyframe_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d68857f095a1224313b1dfddc9d75981b04ffa34
--- /dev/null
+++ b/densepose/data/video/video_keyframe_dataset.py
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import csv
+import logging
+import numpy as np
+from typing import Any, Callable, Dict, List, Optional, Union
+import av
+import torch
+from torch.utils.data.dataset import Dataset
+
+from detectron2.utils.file_io import PathManager
+
+from ..utils import maybe_prepend_base_path
+from .frame_selector import FrameSelector, FrameTsList
+
+FrameList = List[av.frame.Frame] # pyre-ignore[16]
+FrameTransform = Callable[[torch.Tensor], torch.Tensor]
+
+
+def list_keyframes(video_fpath: str, video_stream_idx: int = 0) -> FrameTsList:
+ """
+ Traverses all keyframes of a video file. Returns a list of keyframe
+ timestamps. Timestamps are counts in timebase units.
+
+ Args:
+ video_fpath (str): Video file path
+ video_stream_idx (int): Video stream index (default: 0)
+ Returns:
+ List[int]: list of keyframe timestamps (timestamp is a count in timebase
+ units)
+ """
+ try:
+ with PathManager.open(video_fpath, "rb") as io:
+ # pyre-fixme[16]: Module `av` has no attribute `open`.
+ container = av.open(io, mode="r")
+ stream = container.streams.video[video_stream_idx]
+ keyframes = []
+ pts = -1
+ # Note: even though we request forward seeks for keyframes, sometimes
+ # a keyframe in backwards direction is returned. We introduce tolerance
+ # as a max count of ignored backward seeks
+ tolerance_backward_seeks = 2
+ while True:
+ try:
+ container.seek(pts + 1, backward=False, any_frame=False, stream=stream)
+ except av.AVError as e:
+ # the exception occurs when the video length is exceeded,
+ # we then return whatever data we've already collected
+ logger = logging.getLogger(__name__)
+ logger.debug(
+ f"List keyframes: Error seeking video file {video_fpath}, "
+ f"video stream {video_stream_idx}, pts {pts + 1}, AV error: {e}"
+ )
+ return keyframes
+ except OSError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"List keyframes: Error seeking video file {video_fpath}, "
+ f"video stream {video_stream_idx}, pts {pts + 1}, OS error: {e}"
+ )
+ return []
+ packet = next(container.demux(video=video_stream_idx))
+ if packet.pts is not None and packet.pts <= pts:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Video file {video_fpath}, stream {video_stream_idx}: "
+ f"bad seek for packet {pts + 1} (got packet {packet.pts}), "
+ f"tolerance {tolerance_backward_seeks}."
+ )
+ tolerance_backward_seeks -= 1
+ if tolerance_backward_seeks == 0:
+ return []
+ pts += 1
+ continue
+ tolerance_backward_seeks = 2
+ pts = packet.pts
+ if pts is None:
+ return keyframes
+ if packet.is_keyframe:
+ keyframes.append(pts)
+ return keyframes
+ except OSError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"List keyframes: Error opening video file container {video_fpath}, " f"OS error: {e}"
+ )
+ except RuntimeError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"List keyframes: Error opening video file container {video_fpath}, "
+ f"Runtime error: {e}"
+ )
+ return []
+
+
+def read_keyframes(
+ video_fpath: str, keyframes: FrameTsList, video_stream_idx: int = 0
+) -> FrameList: # pyre-ignore[11]
+ """
+ Reads keyframe data from a video file.
+
+ Args:
+ video_fpath (str): Video file path
+ keyframes (List[int]): List of keyframe timestamps (as counts in
+ timebase units to be used in container seek operations)
+ video_stream_idx (int): Video stream index (default: 0)
+ Returns:
+ List[Frame]: list of frames that correspond to the specified timestamps
+ """
+ try:
+ with PathManager.open(video_fpath, "rb") as io:
+ # pyre-fixme[16]: Module `av` has no attribute `open`.
+ container = av.open(io)
+ stream = container.streams.video[video_stream_idx]
+ frames = []
+ for pts in keyframes:
+ try:
+ container.seek(pts, any_frame=False, stream=stream)
+ frame = next(container.decode(video=0))
+ frames.append(frame)
+ except av.AVError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Read keyframes: Error seeking video file {video_fpath}, "
+ f"video stream {video_stream_idx}, pts {pts}, AV error: {e}"
+ )
+ container.close()
+ return frames
+ except OSError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Read keyframes: Error seeking video file {video_fpath}, "
+ f"video stream {video_stream_idx}, pts {pts}, OS error: {e}"
+ )
+ container.close()
+ return frames
+ except StopIteration:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Read keyframes: Error decoding frame from {video_fpath}, "
+ f"video stream {video_stream_idx}, pts {pts}"
+ )
+ container.close()
+ return frames
+
+ container.close()
+ return frames
+ except OSError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Read keyframes: Error opening video file container {video_fpath}, OS error: {e}"
+ )
+ except RuntimeError as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Read keyframes: Error opening video file container {video_fpath}, Runtime error: {e}"
+ )
+ return []
+
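+# Illustrative flow (a sketch of intended use, not part of the upstream module):
+# keyframe timestamps are listed once, optionally thinned by a selector from
+# .frame_selector, and only the surviving timestamps are decoded:
+#
+#   tss = list_keyframes("/path/to/video.mp4")              # hypothetical path
+#   tss = LastKFramesSelector(10)(tss)
+#   frames = read_keyframes("/path/to/video.mp4", tss)
+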
+
+def video_list_from_file(video_list_fpath: str, base_path: Optional[str] = None):
+ """
+ Create a list of paths to video files from a text file.
+
+ Args:
+ video_list_fpath (str): path to a plain text file with the list of videos
+ base_path (str): base path for entries from the video list (default: None)
+ """
+ video_list = []
+ with PathManager.open(video_list_fpath, "r") as io:
+ for line in io:
+ video_list.append(maybe_prepend_base_path(base_path, str(line.strip())))
+ return video_list
+
+
+def read_keyframe_helper_data(fpath: str):
+ """
+ Read keyframe data from a file in CSV format: the header should contain
+ "video_id" and "keyframes" fields. Value specifications are:
+ video_id: int
+ keyframes: list(int)
+ Example of contents:
+ video_id,keyframes
+ 2,"[1,11,21,31,41,51,61,71,81]"
+
+ Args:
+ fpath (str): File containing keyframe data
+
+ Return:
+ video_id_to_keyframes (dict: int -> list(int)): for a given video ID it
+ contains a list of keyframes for that video
+ """
+ video_id_to_keyframes = {}
+ try:
+ with PathManager.open(fpath, "r") as io:
+ csv_reader = csv.reader(io)
+ header = next(csv_reader)
+ video_id_idx = header.index("video_id")
+ keyframes_idx = header.index("keyframes")
+ for row in csv_reader:
+ video_id = int(row[video_id_idx])
+ assert (
+ video_id not in video_id_to_keyframes
+ ), f"Duplicate keyframes entry for video {fpath}"
+ video_id_to_keyframes[video_id] = (
+ [int(v) for v in row[keyframes_idx][1:-1].split(",")]
+ if len(row[keyframes_idx]) > 2
+ else []
+ )
+ except Exception as e:
+ logger = logging.getLogger(__name__)
+ logger.warning(f"Error reading keyframe helper data from {fpath}: {e}")
+ return video_id_to_keyframes
+
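+# For the CSV example in the docstring above, and assuming the file is readable,
+# the parsed result is {2: [1, 11, 21, 31, 41, 51, 61, 71, 81]}.
+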
+
+class VideoKeyframeDataset(Dataset):
+ """
+ Dataset that provides keyframes for a set of videos.
+ """
+
+ _EMPTY_FRAMES = torch.empty((0, 3, 1, 1))
+
+ def __init__(
+ self,
+ video_list: List[str],
+ category_list: Union[str, List[str], None] = None,
+ frame_selector: Optional[FrameSelector] = None,
+ transform: Optional[FrameTransform] = None,
+ keyframe_helper_fpath: Optional[str] = None,
+ ):
+ """
+ Dataset constructor
+
+ Args:
+ video_list (List[str]): list of paths to video files
+ category_list (Union[str, List[str], None]): list of animal categories for each
+ video file. If it is a string, or None, this applies to all videos
+ frame_selector (Callable: KeyFrameList -> KeyFrameList):
+ selects keyframes to process, keyframes are given by
+ packet timestamps in timebase counts. If None, all keyframes
+ are selected (default: None)
+ transform (Callable: torch.Tensor -> torch.Tensor):
+ transforms a batch of RGB images (tensors of size [B, 3, H, W]),
+ returns a tensor of the same size. If None, no transform is
+                applied (default: None)
+            keyframe_helper_fpath (str): path to a CSV file with precomputed
+                keyframes (see `read_keyframe_helper_data`); if None, keyframes
+                are extracted from the video files directly (default: None)
+        """
+ if type(category_list) is list:
+ self.category_list = category_list
+ else:
+ self.category_list = [category_list] * len(video_list)
+ assert len(video_list) == len(
+ self.category_list
+ ), "length of video and category lists must be equal"
+ self.video_list = video_list
+ self.frame_selector = frame_selector
+ self.transform = transform
+ self.keyframe_helper_data = (
+ read_keyframe_helper_data(keyframe_helper_fpath)
+ if keyframe_helper_fpath is not None
+ else None
+ )
+
+ def __getitem__(self, idx: int) -> Dict[str, Any]:
+ """
+ Gets selected keyframes from a given video
+
+ Args:
+ idx (int): video index in the video list file
+ Returns:
+            A dictionary containing two keys:
+                images (torch.Tensor): float BGR tensor of size [N, 3, H, W], or of
+                    the size produced by the transform, containing keyframe data
+                categories (List[str]): categories of the frames
+ """
+ categories = [self.category_list[idx]]
+ fpath = self.video_list[idx]
+ keyframes = (
+ list_keyframes(fpath)
+ if self.keyframe_helper_data is None or idx not in self.keyframe_helper_data
+ else self.keyframe_helper_data[idx]
+ )
+ transform = self.transform
+ frame_selector = self.frame_selector
+ if not keyframes:
+ return {"images": self._EMPTY_FRAMES, "categories": []}
+ if frame_selector is not None:
+ keyframes = frame_selector(keyframes)
+ frames = read_keyframes(fpath, keyframes)
+ if not frames:
+ return {"images": self._EMPTY_FRAMES, "categories": []}
+ frames = np.stack([frame.to_rgb().to_ndarray() for frame in frames])
+ frames = torch.as_tensor(frames, device=torch.device("cpu"))
+ frames = frames[..., [2, 1, 0]] # RGB -> BGR
+ frames = frames.permute(0, 3, 1, 2).float() # NHWC -> NCHW
+ if transform is not None:
+ frames = transform(frames)
+ return {"images": frames, "categories": categories}
+
+ def __len__(self):
+ return len(self.video_list)
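+
+
+if __name__ == "__main__":
+    # Smoke-test sketch (not part of the upstream dataset); "video_list.txt" is
+    # an assumed file with one video path per line. Videos that cannot be read
+    # simply yield empty frame tensors.
+    from densepose.data.video.frame_selector import FirstKFramesSelector
+
+    videos = video_list_from_file("video_list.txt")
+    dataset = VideoKeyframeDataset(videos, frame_selector=FirstKFramesSelector(4))
+    if len(dataset) > 0:
+        sample = dataset[0]
+        print(sample["images"].shape, sample["categories"])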
diff --git a/densepose/engine/__init__.py b/densepose/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4709c1b2d87e3c578d98aaa083e41323e4047ac9
--- /dev/null
+++ b/densepose/engine/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .trainer import Trainer
diff --git a/densepose/engine/trainer.py b/densepose/engine/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c9046312244dc2381ea722413986010f4ba75e7
--- /dev/null
+++ b/densepose/engine/trainer.py
@@ -0,0 +1,260 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import logging
+import os
+from collections import OrderedDict
+from typing import List, Optional, Union
+import torch
+from torch import nn
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode
+from detectron2.engine import DefaultTrainer
+from detectron2.evaluation import (
+ DatasetEvaluator,
+ DatasetEvaluators,
+ inference_on_dataset,
+ print_csv_format,
+)
+from detectron2.solver.build import get_default_optimizer_params, maybe_add_gradient_clipping
+from detectron2.utils import comm
+from detectron2.utils.events import EventWriter, get_event_storage
+
+from densepose import DensePoseDatasetMapperTTA, DensePoseGeneralizedRCNNWithTTA, load_from_cfg
+from densepose.data import (
+ DatasetMapper,
+ build_combined_loader,
+ build_detection_test_loader,
+ build_detection_train_loader,
+ build_inference_based_loaders,
+ has_inference_based_loaders,
+)
+from densepose.evaluation.d2_evaluator_adapter import Detectron2COCOEvaluatorAdapter
+from densepose.evaluation.evaluator import DensePoseCOCOEvaluator, build_densepose_evaluator_storage
+from densepose.modeling.cse import Embedder
+
+
+class SampleCountingLoader:
+ def __init__(self, loader):
+ self.loader = loader
+
+ def __iter__(self):
+ it = iter(self.loader)
+ storage = get_event_storage()
+ while True:
+ try:
+ batch = next(it)
+ num_inst_per_dataset = {}
+ for data in batch:
+ dataset_name = data["dataset"]
+ if dataset_name not in num_inst_per_dataset:
+ num_inst_per_dataset[dataset_name] = 0
+ num_inst = len(data["instances"])
+ num_inst_per_dataset[dataset_name] += num_inst
+ for dataset_name in num_inst_per_dataset:
+ storage.put_scalar(f"batch/{dataset_name}", num_inst_per_dataset[dataset_name])
+ yield batch
+ except StopIteration:
+ break
+
+
+class SampleCountMetricPrinter(EventWriter):
+ def __init__(self):
+ self.logger = logging.getLogger(__name__)
+
+ def write(self):
+ storage = get_event_storage()
+ batch_stats_strs = []
+ for key, buf in storage.histories().items():
+ if key.startswith("batch/"):
+ batch_stats_strs.append(f"{key} {buf.avg(20)}")
+ self.logger.info(", ".join(batch_stats_strs))
+
+
+class Trainer(DefaultTrainer):
+ @classmethod
+ def extract_embedder_from_model(cls, model: nn.Module) -> Optional[Embedder]:
+ if isinstance(model, nn.parallel.DistributedDataParallel):
+ model = model.module
+ if hasattr(model, "roi_heads") and hasattr(model.roi_heads, "embedder"):
+ return model.roi_heads.embedder
+ return None
+
+ # TODO: the only reason to copy the base class code here is to pass the embedder from
+ # the model to the evaluator; that should be refactored to avoid unnecessary copy-pasting
+ @classmethod
+ def test(
+ cls,
+ cfg: CfgNode,
+ model: nn.Module,
+ evaluators: Optional[Union[DatasetEvaluator, List[DatasetEvaluator]]] = None,
+ ):
+ """
+ Args:
+ cfg (CfgNode):
+ model (nn.Module):
+ evaluators (DatasetEvaluator, list[DatasetEvaluator] or None): if None, will call
+ :meth:`build_evaluator`. Otherwise, must have the same length as
+ ``cfg.DATASETS.TEST``.
+
+ Returns:
+ dict: a dict of result metrics
+ """
+ logger = logging.getLogger(__name__)
+ if isinstance(evaluators, DatasetEvaluator):
+ evaluators = [evaluators]
+ if evaluators is not None:
+ assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
+ len(cfg.DATASETS.TEST), len(evaluators)
+ )
+
+ results = OrderedDict()
+ for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
+ data_loader = cls.build_test_loader(cfg, dataset_name)
+ # When evaluators are passed in as arguments,
+ # implicitly assume that evaluators can be created before data_loader.
+ if evaluators is not None:
+ evaluator = evaluators[idx]
+ else:
+ try:
+ embedder = cls.extract_embedder_from_model(model)
+ evaluator = cls.build_evaluator(cfg, dataset_name, embedder=embedder)
+ except NotImplementedError:
+                    logger.warning(
+ "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
+ "or implement its `build_evaluator` method."
+ )
+ results[dataset_name] = {}
+ continue
+ if cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE or comm.is_main_process():
+ results_i = inference_on_dataset(model, data_loader, evaluator)
+ else:
+ results_i = {}
+ results[dataset_name] = results_i
+ if comm.is_main_process():
+ assert isinstance(
+ results_i, dict
+ ), "Evaluator must return a dict on the main process. Got {} instead.".format(
+ results_i
+ )
+ logger.info("Evaluation results for {} in csv format:".format(dataset_name))
+ print_csv_format(results_i)
+
+ if len(results) == 1:
+ results = list(results.values())[0]
+ return results
+
+ @classmethod
+ def build_evaluator(
+ cls,
+ cfg: CfgNode,
+ dataset_name: str,
+ output_folder: Optional[str] = None,
+ embedder: Optional[Embedder] = None,
+ ) -> DatasetEvaluators:
+ if output_folder is None:
+ output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+ evaluators = []
+ distributed = cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE
+ # Note: we currently use COCO evaluator for both COCO and LVIS datasets
+ # to have compatible metrics. LVIS bbox evaluator could also be used
+ # with an adapter to properly handle filtered / mapped categories
+ # evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+ # if evaluator_type == "coco":
+ # evaluators.append(COCOEvaluator(dataset_name, output_dir=output_folder))
+ # elif evaluator_type == "lvis":
+ # evaluators.append(LVISEvaluator(dataset_name, output_dir=output_folder))
+ evaluators.append(
+ Detectron2COCOEvaluatorAdapter(
+ dataset_name, output_dir=output_folder, distributed=distributed
+ )
+ )
+ if cfg.MODEL.DENSEPOSE_ON:
+ storage = build_densepose_evaluator_storage(cfg, output_folder)
+ evaluators.append(
+ DensePoseCOCOEvaluator(
+ dataset_name,
+ distributed,
+ output_folder,
+ evaluator_type=cfg.DENSEPOSE_EVALUATION.TYPE,
+ min_iou_threshold=cfg.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD,
+ storage=storage,
+ embedder=embedder,
+ should_evaluate_mesh_alignment=cfg.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT,
+ mesh_alignment_mesh_names=cfg.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES,
+ )
+ )
+ return DatasetEvaluators(evaluators)
+
+ @classmethod
+ def build_optimizer(cls, cfg: CfgNode, model: nn.Module):
+ params = get_default_optimizer_params(
+ model,
+ base_lr=cfg.SOLVER.BASE_LR,
+ weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
+ bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
+ weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
+ overrides={
+ "features": {
+ "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR,
+ },
+ "embeddings": {
+ "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR,
+ },
+ },
+ )
+ optimizer = torch.optim.SGD(
+ params,
+ cfg.SOLVER.BASE_LR,
+ momentum=cfg.SOLVER.MOMENTUM,
+ nesterov=cfg.SOLVER.NESTEROV,
+ weight_decay=cfg.SOLVER.WEIGHT_DECAY,
+ )
+ # pyre-fixme[6]: For 2nd param expected `Type[Optimizer]` but got `SGD`.
+ return maybe_add_gradient_clipping(cfg, optimizer)
+
+ @classmethod
+ def build_test_loader(cls, cfg: CfgNode, dataset_name):
+ return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))
+
+ @classmethod
+ def build_train_loader(cls, cfg: CfgNode):
+ data_loader = build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
+ if not has_inference_based_loaders(cfg):
+ return data_loader
+ model = cls.build_model(cfg)
+ model.to(cfg.BOOTSTRAP_MODEL.DEVICE)
+ DetectionCheckpointer(model).resume_or_load(cfg.BOOTSTRAP_MODEL.WEIGHTS, resume=False)
+ inference_based_loaders, ratios = build_inference_based_loaders(cfg, model)
+ loaders = [data_loader] + inference_based_loaders
+ ratios = [1.0] + ratios
+ combined_data_loader = build_combined_loader(cfg, loaders, ratios)
+ sample_counting_loader = SampleCountingLoader(combined_data_loader)
+ return sample_counting_loader
+
+ def build_writers(self):
+ writers = super().build_writers()
+ writers.append(SampleCountMetricPrinter())
+ return writers
+
+ @classmethod
+ def test_with_TTA(cls, cfg: CfgNode, model):
+ logger = logging.getLogger("detectron2.trainer")
+ # In the end of training, run an evaluation with TTA
+ # Only support some R-CNN models.
+ logger.info("Running inference with test-time augmentation ...")
+ transform_data = load_from_cfg(cfg)
+ model = DensePoseGeneralizedRCNNWithTTA(
+ cfg, model, transform_data, DensePoseDatasetMapperTTA(cfg)
+ )
+ evaluators = [
+ cls.build_evaluator(
+ cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+ )
+ for name in cfg.DATASETS.TEST
+ ]
+ res = cls.test(cfg, model, evaluators) # pyre-ignore[6]
+ res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+ return res
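+
+
+# Typical driver flow for this Trainer (a sketch; `cfg` is assumed to be a
+# detectron2 CfgNode assembled elsewhere, e.g. in a train_net-style script):
+#
+#   trainer = Trainer(cfg)
+#   trainer.resume_or_load(resume=False)
+#   trainer.train()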
diff --git a/densepose/evaluation/__init__.py b/densepose/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffabf0808c913a309b791ba8869c80db52a0ac8
--- /dev/null
+++ b/densepose/evaluation/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .evaluator import DensePoseCOCOEvaluator
diff --git a/densepose/evaluation/__pycache__/__init__.cpython-39.pyc b/densepose/evaluation/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbc342d643af7faa280175d272cef37e20cfdadb
Binary files /dev/null and b/densepose/evaluation/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-39.pyc b/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e2e5dae772b724f21b4bfb17d1c6ce06c3edf62
Binary files /dev/null and b/densepose/evaluation/__pycache__/densepose_coco_evaluation.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/evaluator.cpython-39.pyc b/densepose/evaluation/__pycache__/evaluator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83c5d730167a55cf162a0aadecb4d155f21a5632
Binary files /dev/null and b/densepose/evaluation/__pycache__/evaluator.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-39.pyc b/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..155670b616bf38fd96745c5688279965f1666115
Binary files /dev/null and b/densepose/evaluation/__pycache__/mesh_alignment_evaluator.cpython-39.pyc differ
diff --git a/densepose/evaluation/__pycache__/tensor_storage.cpython-39.pyc b/densepose/evaluation/__pycache__/tensor_storage.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e50821f5e2ab3af814c0abeffdf2667c4609e8d
Binary files /dev/null and b/densepose/evaluation/__pycache__/tensor_storage.cpython-39.pyc differ
diff --git a/densepose/evaluation/d2_evaluator_adapter.py b/densepose/evaluation/d2_evaluator_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7fbb9e34f42bce02c71eab9efad742491c6b4aa
--- /dev/null
+++ b/densepose/evaluation/d2_evaluator_adapter.py
@@ -0,0 +1,52 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from detectron2.data.catalog import Metadata
+from detectron2.evaluation import COCOEvaluator
+
+from densepose.data.datasets.coco import (
+ get_contiguous_id_to_category_id_map,
+ maybe_filter_categories_cocoapi,
+)
+
+
+def _maybe_add_iscrowd_annotations(cocoapi) -> None:
+ for ann in cocoapi.dataset["annotations"]:
+ if "iscrowd" not in ann:
+ ann["iscrowd"] = 0
+
+
+class Detectron2COCOEvaluatorAdapter(COCOEvaluator):
+ def __init__(
+ self,
+ dataset_name,
+ output_dir=None,
+ distributed=True,
+ ):
+ super().__init__(dataset_name, output_dir=output_dir, distributed=distributed)
+ maybe_filter_categories_cocoapi(dataset_name, self._coco_api)
+ _maybe_add_iscrowd_annotations(self._coco_api)
+ # substitute category metadata to account for categories
+ # that are mapped to the same contiguous id
+ if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
+ self._maybe_substitute_metadata()
+
+ def _maybe_substitute_metadata(self):
+ cont_id_2_cat_id = get_contiguous_id_to_category_id_map(self._metadata)
+ cat_id_2_cont_id = self._metadata.thing_dataset_id_to_contiguous_id
+ if len(cont_id_2_cat_id) == len(cat_id_2_cont_id):
+ return
+
+ cat_id_2_cont_id_injective = {}
+ for cat_id, cont_id in cat_id_2_cont_id.items():
+ if (cont_id in cont_id_2_cat_id) and (cont_id_2_cat_id[cont_id] == cat_id):
+ cat_id_2_cont_id_injective[cat_id] = cont_id
+
+ metadata_new = Metadata(name=self._metadata.name)
+ for key, value in self._metadata.__dict__.items():
+ if key == "thing_dataset_id_to_contiguous_id":
+ setattr(metadata_new, key, cat_id_2_cont_id_injective)
+ else:
+ setattr(metadata_new, key, value)
+ self._metadata = metadata_new
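+
+
+# Illustrative example (made-up values): if thing_dataset_id_to_contiguous_id
+# maps two dataset categories onto one contiguous id, e.g. {1: 0, 3: 0}, while
+# the reverse map resolves 0 -> 1, only the consistent pair {1: 0} is kept, so
+# COCO evaluation is reported against dataset category 1.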
diff --git a/densepose/evaluation/densepose_coco_evaluation.py b/densepose/evaluation/densepose_coco_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..16bcec6a08921eb62f22ece337821d7ce9e7e591
--- /dev/null
+++ b/densepose/evaluation/densepose_coco_evaluation.py
@@ -0,0 +1,1305 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# This is a modified version of cocoeval.py where we also have the densepose evaluation.
+
+# pyre-unsafe
+
+__author__ = "tsungyi"
+
+import copy
+import datetime
+import logging
+import numpy as np
+import pickle
+import time
+from collections import defaultdict
+from enum import Enum
+from typing import Any, Dict, Tuple
+import scipy.spatial.distance as ssd
+import torch
+import torch.nn.functional as F
+from pycocotools import mask as maskUtils
+from scipy.io import loadmat
+from scipy.ndimage import zoom as spzoom
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.converters.chart_output_to_chart_result import resample_uv_tensors_to_bbox
+from densepose.converters.segm_to_mask import (
+ resample_coarse_segm_tensor_to_bbox,
+ resample_fine_and_coarse_segm_tensors_to_bbox,
+)
+from densepose.modeling.cse.utils import squared_euclidean_distance_matrix
+from densepose.structures import DensePoseDataRelative
+from densepose.structures.mesh import create_mesh
+
+logger = logging.getLogger(__name__)
+
+
+class DensePoseEvalMode(str, Enum):
+ # use both masks and geodesic distances (GPS * IOU) to compute scores
+ GPSM = "gpsm"
+ # use only geodesic distances (GPS) to compute scores
+ GPS = "gps"
+ # use only masks (IOU) to compute scores
+ IOU = "iou"
+
+
+class DensePoseDataMode(str, Enum):
+ # use estimated IUV data (default mode)
+ IUV_DT = "iuvdt"
+ # use ground truth IUV data
+ IUV_GT = "iuvgt"
+ # use ground truth labels I and set UV to 0
+ I_GT_UV_0 = "igtuv0"
+ # use ground truth labels I and estimated UV coordinates
+ I_GT_UV_DT = "igtuvdt"
+ # use estimated labels I and set UV to 0
+ I_DT_UV_0 = "idtuv0"
+
+
+class DensePoseCocoEval:
+ # Interface for evaluating detection on the Microsoft COCO dataset.
+ #
+ # The usage for CocoEval is as follows:
+ # cocoGt=..., cocoDt=... # load dataset and results
+ # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
+ # E.params.recThrs = ...; # set parameters as desired
+ # E.evaluate(); # run per image evaluation
+ # E.accumulate(); # accumulate per image results
+ # E.summarize(); # display summary metrics of results
+ # For example usage see evalDemo.m and http://mscoco.org/.
+ #
+ # The evaluation parameters are as follows (defaults in brackets):
+ # imgIds - [all] N img ids to use for evaluation
+ # catIds - [all] K cat ids to use for evaluation
+ # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
+ # recThrs - [0:.01:1] R=101 recall thresholds for evaluation
+ # areaRng - [...] A=4 object area ranges for evaluation
+ # maxDets - [1 10 100] M=3 thresholds on max detections per image
+ # iouType - ['segm'] set iouType to 'segm', 'bbox', 'keypoints' or 'densepose'
+ # iouType replaced the now DEPRECATED useSegm parameter.
+ # useCats - [1] if true use category labels for evaluation
+ # Note: if useCats=0 category labels are ignored as in proposal scoring.
+ # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
+ #
+ # evaluate(): evaluates detections on every image and every category and
+ # concats the results into the "evalImgs" with fields:
+ # dtIds - [1xD] id for each of the D detections (dt)
+ # gtIds - [1xG] id for each of the G ground truths (gt)
+ # dtMatches - [TxD] matching gt id at each IoU or 0
+ # gtMatches - [TxG] matching dt id at each IoU or 0
+ # dtScores - [1xD] confidence of each dt
+ # gtIgnore - [1xG] ignore flag for each gt
+ # dtIgnore - [TxD] ignore flag for each dt at each IoU
+ #
+ # accumulate(): accumulates the per-image, per-category evaluation
+ # results in "evalImgs" into the dictionary "eval" with fields:
+ # params - parameters used for evaluation
+ # date - date evaluation was performed
+ # counts - [T,R,K,A,M] parameter dimensions (see above)
+ # precision - [TxRxKxAxM] precision for every evaluation setting
+ # recall - [TxKxAxM] max recall for every evaluation setting
+ # Note: precision and recall==-1 for settings with no gt objects.
+ #
+ # See also coco, mask, pycocoDemo, pycocoEvalDemo
+ #
+ # Microsoft COCO Toolbox. version 2.0
+ # Data, paper, and tutorials available at: http://mscoco.org/
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
+ # Licensed under the Simplified BSD License [see coco/license.txt]
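+    #
+    # A Python sketch of the same flow for this DensePose variant (cocoGt and
+    # cocoDt are assumed to be pycocotools COCO objects carrying DensePose
+    # annotations / results):
+    #   E = DensePoseCocoEval(cocoGt, cocoDt, iouType="densepose")
+    #   E.evaluate()
+    #   E.accumulate()
+    #   E.summarize()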
+ def __init__(
+ self,
+ cocoGt=None,
+ cocoDt=None,
+ iouType: str = "densepose",
+ multi_storage=None,
+ embedder=None,
+ dpEvalMode: DensePoseEvalMode = DensePoseEvalMode.GPS,
+ dpDataMode: DensePoseDataMode = DensePoseDataMode.IUV_DT,
+ ):
+ """
+ Initialize CocoEval using coco APIs for gt and dt
+ :param cocoGt: coco object with ground truth annotations
+ :param cocoDt: coco object with detection results
+ :return: None
+ """
+ self.cocoGt = cocoGt # ground truth COCO API
+ self.cocoDt = cocoDt # detections COCO API
+ self.multi_storage = multi_storage
+ self.embedder = embedder
+ self._dpEvalMode = dpEvalMode
+ self._dpDataMode = dpDataMode
+ self.evalImgs = defaultdict(list) # per-image per-category eval results [KxAxI]
+ self.eval = {} # accumulated evaluation results
+ self._gts = defaultdict(list) # gt for evaluation
+ self._dts = defaultdict(list) # dt for evaluation
+ self.params = Params(iouType=iouType) # parameters
+ self._paramsEval = {} # parameters for evaluation
+ self.stats = [] # result summarization
+ self.ious = {} # ious between all gts and dts
+ if cocoGt is not None:
+ self.params.imgIds = sorted(cocoGt.getImgIds())
+ self.params.catIds = sorted(cocoGt.getCatIds())
+ self.ignoreThrBB = 0.7
+ self.ignoreThrUV = 0.9
+
+ def _loadGEval(self):
+ smpl_subdiv_fpath = PathManager.get_local_path(
+ "https://dl.fbaipublicfiles.com/densepose/data/SMPL_subdiv.mat"
+ )
+ pdist_transform_fpath = PathManager.get_local_path(
+ "https://dl.fbaipublicfiles.com/densepose/data/SMPL_SUBDIV_TRANSFORM.mat"
+ )
+ pdist_matrix_fpath = PathManager.get_local_path(
+ "https://dl.fbaipublicfiles.com/densepose/data/Pdist_matrix.pkl", timeout_sec=120
+ )
+ SMPL_subdiv = loadmat(smpl_subdiv_fpath)
+ self.PDIST_transform = loadmat(pdist_transform_fpath)
+ self.PDIST_transform = self.PDIST_transform["index"].squeeze()
+ UV = np.array([SMPL_subdiv["U_subdiv"], SMPL_subdiv["V_subdiv"]]).squeeze()
+ ClosestVertInds = np.arange(UV.shape[1]) + 1
+ self.Part_UVs = []
+ self.Part_ClosestVertInds = []
+ for i in np.arange(24):
+ self.Part_UVs.append(UV[:, SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)])
+ self.Part_ClosestVertInds.append(
+ ClosestVertInds[SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)]
+ )
+
+ with open(pdist_matrix_fpath, "rb") as hFile:
+ arrays = pickle.load(hFile, encoding="latin1")
+ self.Pdist_matrix = arrays["Pdist_matrix"]
+ self.Part_ids = np.array(SMPL_subdiv["Part_ID_subdiv"].squeeze())
+ # Mean geodesic distances for parts.
+ self.Mean_Distances = np.array([0, 0.351, 0.107, 0.126, 0.237, 0.173, 0.142, 0.128, 0.150])
+ # Coarse Part labels.
+ self.CoarseParts = np.array(
+ [0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8]
+ )
+
+ def _prepare(self):
+ """
+ Prepare ._gts and ._dts for evaluation based on params
+ :return: None
+ """
+
+ def _toMask(anns, coco):
+ # modify ann['segmentation'] by reference
+ for ann in anns:
+ # safeguard for invalid segmentation annotation;
+ # annotations containing empty lists exist in the posetrack
+ # dataset. This is not a correct segmentation annotation
+ # in terms of COCO format; we need to deal with it somehow
+ segm = ann["segmentation"]
+ if type(segm) is list and len(segm) == 0:
+ ann["segmentation"] = None
+ continue
+ rle = coco.annToRLE(ann)
+ ann["segmentation"] = rle
+
+ def _getIgnoreRegion(iid, coco):
+ img = coco.imgs[iid]
+
+ if "ignore_regions_x" not in img.keys():
+ return None
+
+ if len(img["ignore_regions_x"]) == 0:
+ return None
+
+ rgns_merged = [
+ [v for xy in zip(region_x, region_y) for v in xy]
+ for region_x, region_y in zip(img["ignore_regions_x"], img["ignore_regions_y"])
+ ]
+ rles = maskUtils.frPyObjects(rgns_merged, img["height"], img["width"])
+ rle = maskUtils.merge(rles)
+ return maskUtils.decode(rle)
+
+ def _checkIgnore(dt, iregion):
+ if iregion is None:
+ return True
+
+ bb = np.array(dt["bbox"]).astype(int)
+ x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3]
+ x2 = min([x2, iregion.shape[1]])
+ y2 = min([y2, iregion.shape[0]])
+
+ if bb[2] * bb[3] == 0:
+ return False
+
+ crop_iregion = iregion[y1:y2, x1:x2]
+
+ if crop_iregion.sum() == 0:
+ return True
+
+ if "densepose" not in dt.keys(): # filtering boxes
+ return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB
+
+ # filtering UVs
+ ignoremask = np.require(crop_iregion, requirements=["F"])
+ mask = self._extract_mask(dt)
+ uvmask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"])
+ uvmask_ = maskUtils.encode(uvmask)
+ ignoremask_ = maskUtils.encode(ignoremask)
+ uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0]
+ return uviou < self.ignoreThrUV
+
+ p = self.params
+
+ if p.useCats:
+ gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+ dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
+ else:
+ gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
+ dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
+
+ imns = self.cocoGt.loadImgs(p.imgIds)
+ self.size_mapping = {}
+ for im in imns:
+ self.size_mapping[im["id"]] = [im["height"], im["width"]]
+
+        # if iouType == 'densepose', add point gt annotations
+ if p.iouType == "densepose":
+ self._loadGEval()
+
+ # convert ground truth to mask if iouType == 'segm'
+ if p.iouType == "segm":
+ _toMask(gts, self.cocoGt)
+ _toMask(dts, self.cocoDt)
+
+ # set ignore flag
+ for gt in gts:
+ gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
+ gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
+ if p.iouType == "keypoints":
+ gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
+ if p.iouType == "densepose":
+ gt["ignore"] = ("dp_x" in gt) == 0
+ if p.iouType == "segm":
+ gt["ignore"] = gt["segmentation"] is None
+
+ self._gts = defaultdict(list) # gt for evaluation
+ self._dts = defaultdict(list) # dt for evaluation
+ self._igrgns = defaultdict(list)
+
+ for gt in gts:
+ iid = gt["image_id"]
+ if iid not in self._igrgns.keys():
+ self._igrgns[iid] = _getIgnoreRegion(iid, self.cocoGt)
+ if _checkIgnore(gt, self._igrgns[iid]):
+ self._gts[iid, gt["category_id"]].append(gt)
+ for dt in dts:
+ iid = dt["image_id"]
+ if (iid not in self._igrgns) or _checkIgnore(dt, self._igrgns[iid]):
+ self._dts[iid, dt["category_id"]].append(dt)
+
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results
+ self.eval = {} # accumulated evaluation results
+
+ def evaluate(self):
+ """
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
+ :return: None
+ """
+ tic = time.time()
+ logger.info("Running per image DensePose evaluation... {}".format(self.params.iouType))
+ p = self.params
+ # add backward compatibility if useSegm is specified in params
+ if p.useSegm is not None:
+ p.iouType = "segm" if p.useSegm == 1 else "bbox"
+ logger.info("useSegm (deprecated) is not None. Running DensePose evaluation")
+ p.imgIds = list(np.unique(p.imgIds))
+ if p.useCats:
+ p.catIds = list(np.unique(p.catIds))
+ p.maxDets = sorted(p.maxDets)
+ self.params = p
+
+ self._prepare()
+ # loop through images, area range, max detection number
+ catIds = p.catIds if p.useCats else [-1]
+
+ if p.iouType in ["segm", "bbox"]:
+ computeIoU = self.computeIoU
+ elif p.iouType == "keypoints":
+ computeIoU = self.computeOks
+ elif p.iouType == "densepose":
+ computeIoU = self.computeOgps
+ if self._dpEvalMode in {DensePoseEvalMode.GPSM, DensePoseEvalMode.IOU}:
+ self.real_ious = {
+ (imgId, catId): self.computeDPIoU(imgId, catId)
+ for imgId in p.imgIds
+ for catId in catIds
+ }
+
+ self.ious = {
+ (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds
+ }
+
+ evaluateImg = self.evaluateImg
+ maxDet = p.maxDets[-1]
+ self.evalImgs = [
+ evaluateImg(imgId, catId, areaRng, maxDet)
+ for catId in catIds
+ for areaRng in p.areaRng
+ for imgId in p.imgIds
+ ]
+ self._paramsEval = copy.deepcopy(self.params)
+ toc = time.time()
+ logger.info("DensePose evaluation DONE (t={:0.2f}s).".format(toc - tic))
+
+ def getDensePoseMask(self, polys):
+ maskGen = np.zeros([256, 256])
+ stop = min(len(polys) + 1, 15)
+ for i in range(1, stop):
+ if polys[i - 1]:
+ currentMask = maskUtils.decode(polys[i - 1])
+ maskGen[currentMask > 0] = i
+ return maskGen
+
+ def _generate_rlemask_on_image(self, mask, imgId, data):
+ bbox_xywh = np.array(data["bbox"])
+ x, y, w, h = bbox_xywh
+ im_h, im_w = self.size_mapping[imgId]
+ im_mask = np.zeros((im_h, im_w), dtype=np.uint8)
+ if mask is not None:
+ x0 = max(int(x), 0)
+ x1 = min(int(x + w), im_w, int(x) + mask.shape[1])
+ y0 = max(int(y), 0)
+ y1 = min(int(y + h), im_h, int(y) + mask.shape[0])
+ y = int(y)
+ x = int(x)
+ im_mask[y0:y1, x0:x1] = mask[y0 - y : y1 - y, x0 - x : x1 - x]
+ im_mask = np.require(np.asarray(im_mask > 0), dtype=np.uint8, requirements=["F"])
+ rle_mask = maskUtils.encode(np.array(im_mask[:, :, np.newaxis], order="F"))[0]
+ return rle_mask
+
+ def computeDPIoU(self, imgId, catId):
+ p = self.params
+ if p.useCats:
+ gt = self._gts[imgId, catId]
+ dt = self._dts[imgId, catId]
+ else:
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+ if len(gt) == 0 and len(dt) == 0:
+ return []
+ inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+ dt = [dt[i] for i in inds]
+ if len(dt) > p.maxDets[-1]:
+ dt = dt[0 : p.maxDets[-1]]
+
+ gtmasks = []
+ for g in gt:
+ if DensePoseDataRelative.S_KEY in g:
+ # convert DensePose mask to a binary mask
+ mask = np.minimum(self.getDensePoseMask(g[DensePoseDataRelative.S_KEY]), 1.0)
+ _, _, w, h = g["bbox"]
+ scale_x = float(max(w, 1)) / mask.shape[1]
+ scale_y = float(max(h, 1)) / mask.shape[0]
+ mask = spzoom(mask, (scale_y, scale_x), order=1, prefilter=False)
+ mask = np.array(mask > 0.5, dtype=np.uint8)
+ rle_mask = self._generate_rlemask_on_image(mask, imgId, g)
+ elif "segmentation" in g:
+ segmentation = g["segmentation"]
+ if isinstance(segmentation, list) and segmentation:
+ # polygons
+ im_h, im_w = self.size_mapping[imgId]
+ rles = maskUtils.frPyObjects(segmentation, im_h, im_w)
+ rle_mask = maskUtils.merge(rles)
+ elif isinstance(segmentation, dict):
+ if isinstance(segmentation["counts"], list):
+ # uncompressed RLE
+ im_h, im_w = self.size_mapping[imgId]
+ rle_mask = maskUtils.frPyObjects(segmentation, im_h, im_w)
+ else:
+ # compressed RLE
+ rle_mask = segmentation
+ else:
+ rle_mask = self._generate_rlemask_on_image(None, imgId, g)
+ else:
+ rle_mask = self._generate_rlemask_on_image(None, imgId, g)
+ gtmasks.append(rle_mask)
+
+ dtmasks = []
+ for d in dt:
+ mask = self._extract_mask(d)
+ mask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"])
+ rle_mask = self._generate_rlemask_on_image(mask, imgId, d)
+ dtmasks.append(rle_mask)
+
+ # compute iou between each dt and gt region
+ iscrowd = [int(o.get("iscrowd", 0)) for o in gt]
+ iousDP = maskUtils.iou(dtmasks, gtmasks, iscrowd)
+ return iousDP
+
+ def computeIoU(self, imgId, catId):
+ p = self.params
+ if p.useCats:
+ gt = self._gts[imgId, catId]
+ dt = self._dts[imgId, catId]
+ else:
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+ if len(gt) == 0 and len(dt) == 0:
+ return []
+ inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
+ dt = [dt[i] for i in inds]
+ if len(dt) > p.maxDets[-1]:
+ dt = dt[0 : p.maxDets[-1]]
+
+ if p.iouType == "segm":
+ g = [g["segmentation"] for g in gt if g["segmentation"] is not None]
+ d = [d["segmentation"] for d in dt if d["segmentation"] is not None]
+ elif p.iouType == "bbox":
+ g = [g["bbox"] for g in gt]
+ d = [d["bbox"] for d in dt]
+ else:
+ raise Exception("unknown iouType for iou computation")
+
+ # compute iou between each dt and gt region
+ iscrowd = [int(o.get("iscrowd", 0)) for o in gt]
+ ious = maskUtils.iou(d, g, iscrowd)
+ return ious
+
+ def computeOks(self, imgId, catId):
+ p = self.params
+ # dimension here should be Nxm
+ gts = self._gts[imgId, catId]
+ dts = self._dts[imgId, catId]
+ inds = np.argsort([-d["score"] for d in dts], kind="mergesort")
+ dts = [dts[i] for i in inds]
+ if len(dts) > p.maxDets[-1]:
+ dts = dts[0 : p.maxDets[-1]]
+ # if len(gts) == 0 and len(dts) == 0:
+ if len(gts) == 0 or len(dts) == 0:
+ return []
+ ious = np.zeros((len(dts), len(gts)))
+        sigmas = np.array(
+            [0.26, 0.25, 0.25, 0.35, 0.35, 0.79, 0.79, 0.72, 0.72,
+             0.62, 0.62, 1.07, 1.07, 0.87, 0.87, 0.89, 0.89]
+        ) / 10.0
+ vars = (sigmas * 2) ** 2
+ k = len(sigmas)
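+        # OKS for a (dt, gt) pair is the mean over annotated keypoints of
+        # exp(-d_i^2 / (2 * s^2 * kappa_i^2)), with s^2 the gt area and
+        # kappa_i = 2 * sigma_i, hence vars = (sigmas * 2) ** 2 above.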
+ # compute oks between each detection and ground truth object
+ for j, gt in enumerate(gts):
+ # create bounds for ignore regions(double the gt bbox)
+ g = np.array(gt["keypoints"])
+ xg = g[0::3]
+ yg = g[1::3]
+ vg = g[2::3]
+ k1 = np.count_nonzero(vg > 0)
+ bb = gt["bbox"]
+ x0 = bb[0] - bb[2]
+ x1 = bb[0] + bb[2] * 2
+ y0 = bb[1] - bb[3]
+ y1 = bb[1] + bb[3] * 2
+ for i, dt in enumerate(dts):
+ d = np.array(dt["keypoints"])
+ xd = d[0::3]
+ yd = d[1::3]
+ if k1 > 0:
+ # measure the per-keypoint distance if keypoints visible
+ dx = xd - xg
+ dy = yd - yg
+ else:
+ # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
+ z = np.zeros(k)
+ dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0)
+ dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0)
+ e = (dx**2 + dy**2) / vars / (gt["area"] + np.spacing(1)) / 2
+ if k1 > 0:
+ e = e[vg > 0]
+ ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
+ return ious
+
+ def _extract_mask(self, dt: Dict[str, Any]) -> np.ndarray:
+ if "densepose" in dt:
+ densepose_results_quantized = dt["densepose"]
+ return densepose_results_quantized.labels_uv_uint8[0].numpy()
+ elif "cse_mask" in dt:
+ return dt["cse_mask"]
+ elif "coarse_segm" in dt:
+ dy = max(int(dt["bbox"][3]), 1)
+ dx = max(int(dt["bbox"][2]), 1)
+ return (
+ F.interpolate(
+ dt["coarse_segm"].unsqueeze(0),
+ (dy, dx),
+ mode="bilinear",
+ align_corners=False,
+ )
+ .squeeze(0)
+ .argmax(0)
+ .numpy()
+ .astype(np.uint8)
+ )
+ elif "record_id" in dt:
+ assert (
+ self.multi_storage is not None
+ ), f"Storage record id encountered in a detection {dt}, but no storage provided!"
+ record = self.multi_storage.get(dt["rank"], dt["record_id"])
+ coarse_segm = record["coarse_segm"]
+ dy = max(int(dt["bbox"][3]), 1)
+ dx = max(int(dt["bbox"][2]), 1)
+ return (
+ F.interpolate(
+ coarse_segm.unsqueeze(0),
+ (dy, dx),
+ mode="bilinear",
+ align_corners=False,
+ )
+ .squeeze(0)
+ .argmax(0)
+ .numpy()
+ .astype(np.uint8)
+ )
+ else:
+ raise Exception(f"No mask data in the detection: {dt}")
+ raise ValueError('The prediction dict needs to contain either "densepose" or "cse_mask"')
+
+ def _extract_iuv(
+ self, densepose_data: np.ndarray, py: np.ndarray, px: np.ndarray, gt: Dict[str, Any]
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """
+ Extract arrays of I, U and V values at given points as numpy arrays
+ given the data mode stored in self._dpDataMode
+ """
+ if self._dpDataMode == DensePoseDataMode.IUV_DT:
+ # estimated labels and UV (default)
+ ipoints = densepose_data[0, py, px]
+ upoints = densepose_data[1, py, px] / 255.0 # convert from uint8 by /255.
+ vpoints = densepose_data[2, py, px] / 255.0
+ elif self._dpDataMode == DensePoseDataMode.IUV_GT:
+ # ground truth
+ ipoints = np.array(gt["dp_I"])
+ upoints = np.array(gt["dp_U"])
+ vpoints = np.array(gt["dp_V"])
+ elif self._dpDataMode == DensePoseDataMode.I_GT_UV_0:
+ # ground truth labels, UV = 0
+ ipoints = np.array(gt["dp_I"])
+            upoints = np.zeros_like(ipoints, dtype=float)
+            vpoints = np.zeros_like(ipoints, dtype=float)
+ elif self._dpDataMode == DensePoseDataMode.I_GT_UV_DT:
+ # ground truth labels, estimated UV
+ ipoints = np.array(gt["dp_I"])
+ upoints = densepose_data[1, py, px] / 255.0 # convert from uint8 by /255.
+ vpoints = densepose_data[2, py, px] / 255.0
+ elif self._dpDataMode == DensePoseDataMode.I_DT_UV_0:
+ # estimated labels, UV = 0
+ ipoints = densepose_data[0, py, px]
+            upoints = np.zeros_like(ipoints, dtype=float)
+            vpoints = np.zeros_like(ipoints, dtype=float)
+ else:
+ raise ValueError(f"Unknown data mode: {self._dpDataMode}")
+ return ipoints, upoints, vpoints
+
+ def computeOgps_single_pair(self, dt, gt, py, px, pt_mask):
+ if "densepose" in dt:
+ ipoints, upoints, vpoints = self.extract_iuv_from_quantized(dt, gt, py, px, pt_mask)
+ return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints)
+ elif "u" in dt:
+ ipoints, upoints, vpoints = self.extract_iuv_from_raw(dt, gt, py, px, pt_mask)
+ return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints)
+ elif "record_id" in dt:
+ assert (
+ self.multi_storage is not None
+ ), f"Storage record id encountered in detection {dt}, but no storage provided!"
+ record = self.multi_storage.get(dt["rank"], dt["record_id"])
+ record["bbox"] = dt["bbox"]
+ if "u" in record:
+ ipoints, upoints, vpoints = self.extract_iuv_from_raw(record, gt, py, px, pt_mask)
+ return self.computeOgps_single_pair_iuv(dt, gt, ipoints, upoints, vpoints)
+ elif "embedding" in record:
+ return self.computeOgps_single_pair_cse(
+ dt,
+ gt,
+ py,
+ px,
+ pt_mask,
+ record["coarse_segm"],
+ record["embedding"],
+ record["bbox"],
+ )
+ else:
+ raise Exception(f"Unknown record format: {record}")
+ elif "embedding" in dt:
+ return self.computeOgps_single_pair_cse(
+ dt, gt, py, px, pt_mask, dt["coarse_segm"], dt["embedding"], dt["bbox"]
+ )
+ raise Exception(f"Unknown detection format: {dt}")
+
+ def extract_iuv_from_quantized(self, dt, gt, py, px, pt_mask):
+ densepose_results_quantized = dt["densepose"]
+ ipoints, upoints, vpoints = self._extract_iuv(
+ densepose_results_quantized.labels_uv_uint8.numpy(), py, px, gt
+ )
+ ipoints[pt_mask == -1] = 0
+ return ipoints, upoints, vpoints
+
+ def extract_iuv_from_raw(self, dt, gt, py, px, pt_mask):
+ labels_dt = resample_fine_and_coarse_segm_tensors_to_bbox(
+ dt["fine_segm"].unsqueeze(0),
+ dt["coarse_segm"].unsqueeze(0),
+ dt["bbox"],
+ )
+ uv = resample_uv_tensors_to_bbox(
+ dt["u"].unsqueeze(0), dt["v"].unsqueeze(0), labels_dt.squeeze(0), dt["bbox"]
+ )
+ labels_uv_uint8 = torch.cat((labels_dt.byte(), (uv * 255).clamp(0, 255).byte()))
+ ipoints, upoints, vpoints = self._extract_iuv(labels_uv_uint8.numpy(), py, px, gt)
+ ipoints[pt_mask == -1] = 0
+ return ipoints, upoints, vpoints
+
+ def computeOgps_single_pair_iuv(self, dt, gt, ipoints, upoints, vpoints):
+ cVertsGT, ClosestVertsGTTransformed = self.findAllClosestVertsGT(gt)
+ cVerts = self.findAllClosestVertsUV(upoints, vpoints, ipoints)
+ # Get pairwise geodesic distances between gt and estimated mesh points.
+ dist = self.getDistancesUV(ClosestVertsGTTransformed, cVerts)
+ # Compute the Ogps measure.
+ # Find the mean geodesic normalization distance for
+ # each GT point, based on which part it is on.
+ Current_Mean_Distances = self.Mean_Distances[
+ self.CoarseParts[self.Part_ids[cVertsGT[cVertsGT > 0].astype(int) - 1]]
+ ]
+ return dist, Current_Mean_Distances
+
+ def computeOgps_single_pair_cse(
+ self, dt, gt, py, px, pt_mask, coarse_segm, embedding, bbox_xywh_abs
+ ):
+ # 0-based mesh vertex indices
+ cVertsGT = torch.as_tensor(gt["dp_vertex"], dtype=torch.int64)
+ # label for each pixel of the bbox, [H, W] tensor of long
+ labels_dt = resample_coarse_segm_tensor_to_bbox(
+ coarse_segm.unsqueeze(0), bbox_xywh_abs
+ ).squeeze(0)
+ x, y, w, h = bbox_xywh_abs
+ # embedding for each pixel of the bbox, [D, H, W] tensor of float32
+ embedding = F.interpolate(
+ embedding.unsqueeze(0), (int(h), int(w)), mode="bilinear", align_corners=False
+ ).squeeze(0)
+ # valid locations py, px
+ py_pt = torch.from_numpy(py[pt_mask > -1])
+ px_pt = torch.from_numpy(px[pt_mask > -1])
+ cVerts = torch.ones_like(cVertsGT) * -1
+ cVerts[pt_mask > -1] = self.findClosestVertsCse(
+ embedding, py_pt, px_pt, labels_dt, gt["ref_model"]
+ )
+ # Get pairwise geodesic distances between gt and estimated mesh points.
+ dist = self.getDistancesCse(cVertsGT, cVerts, gt["ref_model"])
+ # normalize distances
+ if (gt["ref_model"] == "smpl_27554") and ("dp_I" in gt):
+ Current_Mean_Distances = self.Mean_Distances[
+ self.CoarseParts[np.array(gt["dp_I"], dtype=int)]
+ ]
+ else:
+ Current_Mean_Distances = 0.255
+ return dist, Current_Mean_Distances
+
+ def computeOgps(self, imgId, catId):
+ p = self.params
+ # dimension here should be Nxm
+ g = self._gts[imgId, catId]
+ d = self._dts[imgId, catId]
+ inds = np.argsort([-d_["score"] for d_ in d], kind="mergesort")
+ d = [d[i] for i in inds]
+ if len(d) > p.maxDets[-1]:
+ d = d[0 : p.maxDets[-1]]
+ # if len(gts) == 0 and len(dts) == 0:
+ if len(g) == 0 or len(d) == 0:
+ return []
+ ious = np.zeros((len(d), len(g)))
+        # compute ogps between each detection and ground truth object
+ # sigma = self.sigma #0.255 # dist = 0.3m corresponds to ogps = 0.5
+ # 1 # dist = 0.3m corresponds to ogps = 0.96
+ # 1.45 # dist = 1.7m (person height) corresponds to ogps = 0.5)
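+        # GPS for a (dt, gt) pair is computed below as the mean over annotated
+        # gt points of exp(-g_i^2 / (2 * kappa_i^2)), where g_i is the geodesic
+        # distance between matched mesh vertices and kappa_i the per-part
+        # normalization distance returned by computeOgps_single_pair.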
+ for j, gt in enumerate(g):
+ if not gt["ignore"]:
+ g_ = gt["bbox"]
+ for i, dt in enumerate(d):
+ #
+ dy = int(dt["bbox"][3])
+ dx = int(dt["bbox"][2])
+ dp_x = np.array(gt["dp_x"]) * g_[2] / 255.0
+ dp_y = np.array(gt["dp_y"]) * g_[3] / 255.0
+ py = (dp_y + g_[1] - dt["bbox"][1]).astype(int)
+ px = (dp_x + g_[0] - dt["bbox"][0]).astype(int)
+ #
+ pts = np.zeros(len(px))
+ pts[px >= dx] = -1
+ pts[py >= dy] = -1
+ pts[px < 0] = -1
+ pts[py < 0] = -1
+ if len(pts) < 1:
+ ogps = 0.0
+ elif np.max(pts) == -1:
+ ogps = 0.0
+ else:
+ px[pts == -1] = 0
+ py[pts == -1] = 0
+ dists_between_matches, dist_norm_coeffs = self.computeOgps_single_pair(
+ dt, gt, py, px, pts
+ )
+ # Compute gps
+ ogps_values = np.exp(
+ -(dists_between_matches**2) / (2 * (dist_norm_coeffs**2))
+ )
+ #
+ ogps = np.mean(ogps_values) if len(ogps_values) > 0 else 0.0
+ ious[i, j] = ogps
+
+ gbb = [gt["bbox"] for gt in g]
+ dbb = [dt["bbox"] for dt in d]
+
+ # compute iou between each dt and gt region
+ iscrowd = [int(o.get("iscrowd", 0)) for o in g]
+ ious_bb = maskUtils.iou(dbb, gbb, iscrowd)
+ return ious, ious_bb
+
+ def evaluateImg(self, imgId, catId, aRng, maxDet):
+ """
+ perform evaluation for single category and image
+ :return: dict (single image results)
+ """
+
+ p = self.params
+ if p.useCats:
+ gt = self._gts[imgId, catId]
+ dt = self._dts[imgId, catId]
+ else:
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
+ if len(gt) == 0 and len(dt) == 0:
+ return None
+
+ for g in gt:
+ # g['_ignore'] = g['ignore']
+ if g["ignore"] or (g["area"] < aRng[0] or g["area"] > aRng[1]):
+ g["_ignore"] = True
+ else:
+ g["_ignore"] = False
+
+ # sort dt highest score first, sort gt ignore last
+ gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort")
+ gt = [gt[i] for i in gtind]
+ dtind = np.argsort([-d["score"] for d in dt], kind="mergesort")
+ dt = [dt[i] for i in dtind[0:maxDet]]
+ iscrowd = [int(o.get("iscrowd", 0)) for o in gt]
+ # load computed ious
+ if p.iouType == "densepose":
+ # print('Checking the length', len(self.ious[imgId, catId]))
+ # if len(self.ious[imgId, catId]) == 0:
+ # print(self.ious[imgId, catId])
+ ious = (
+ self.ious[imgId, catId][0][:, gtind]
+ if len(self.ious[imgId, catId]) > 0
+ else self.ious[imgId, catId]
+ )
+ ioubs = (
+ self.ious[imgId, catId][1][:, gtind]
+ if len(self.ious[imgId, catId]) > 0
+ else self.ious[imgId, catId]
+ )
+ if self._dpEvalMode in {DensePoseEvalMode.GPSM, DensePoseEvalMode.IOU}:
+ iousM = (
+ self.real_ious[imgId, catId][:, gtind]
+ if len(self.real_ious[imgId, catId]) > 0
+ else self.real_ious[imgId, catId]
+ )
+ else:
+ ious = (
+ self.ious[imgId, catId][:, gtind]
+ if len(self.ious[imgId, catId]) > 0
+ else self.ious[imgId, catId]
+ )
+
+ T = len(p.iouThrs)
+ G = len(gt)
+ D = len(dt)
+ gtm = np.zeros((T, G))
+ dtm = np.zeros((T, D))
+ gtIg = np.array([g["_ignore"] for g in gt])
+ dtIg = np.zeros((T, D))
+ if np.all(gtIg) and p.iouType == "densepose":
+ dtIg = np.logical_or(dtIg, True)
+
+ if len(ious) > 0: # and not p.iouType == 'densepose':
+ for tind, t in enumerate(p.iouThrs):
+ for dind, d in enumerate(dt):
+ # information about best match so far (m=-1 -> unmatched)
+ iou = min([t, 1 - 1e-10])
+ m = -1
+ for gind, _g in enumerate(gt):
+ # if this gt already matched, and not a crowd, continue
+ if gtm[tind, gind] > 0 and not iscrowd[gind]:
+ continue
+ # if dt matched to reg gt, and on ignore gt, stop
+ if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
+ break
+ if p.iouType == "densepose":
+ if self._dpEvalMode == DensePoseEvalMode.GPSM:
+ new_iou = np.sqrt(iousM[dind, gind] * ious[dind, gind])
+ elif self._dpEvalMode == DensePoseEvalMode.IOU:
+ new_iou = iousM[dind, gind]
+ elif self._dpEvalMode == DensePoseEvalMode.GPS:
+ new_iou = ious[dind, gind]
+ else:
+ new_iou = ious[dind, gind]
+ if new_iou < iou:
+ continue
+ if new_iou == 0.0:
+ continue
+ # if match successful and best so far, store appropriately
+ iou = new_iou
+ m = gind
+ # if match made store id of match for both dt and gt
+ if m == -1:
+ continue
+ dtIg[tind, dind] = gtIg[m]
+ dtm[tind, dind] = gt[m]["id"]
+ gtm[tind, m] = d["id"]
+
+ if p.iouType == "densepose":
+ if not len(ioubs) == 0:
+ for dind, d in enumerate(dt):
+ # information about best match so far (m=-1 -> unmatched)
+ if dtm[tind, dind] == 0:
+ ioub = 0.8
+ m = -1
+ for gind, _g in enumerate(gt):
+ # if this gt already matched, and not a crowd, continue
+ if gtm[tind, gind] > 0 and not iscrowd[gind]:
+ continue
+ # continue to next gt unless better match made
+ if ioubs[dind, gind] < ioub:
+ continue
+ # if match successful and best so far, store appropriately
+ ioub = ioubs[dind, gind]
+ m = gind
+ # if match made store id of match for both dt and gt
+ if m > -1:
+ dtIg[:, dind] = gtIg[m]
+ if gtIg[m]:
+ dtm[tind, dind] = gt[m]["id"]
+ gtm[tind, m] = d["id"]
+ # set unmatched detections outside of area range to ignore
+ a = np.array([d["area"] < aRng[0] or d["area"] > aRng[1] for d in dt]).reshape((1, len(dt)))
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
+ # store results for given image and category
+ # print('Done with the function', len(self.ious[imgId, catId]))
+ return {
+ "image_id": imgId,
+ "category_id": catId,
+ "aRng": aRng,
+ "maxDet": maxDet,
+ "dtIds": [d["id"] for d in dt],
+ "gtIds": [g["id"] for g in gt],
+ "dtMatches": dtm,
+ "gtMatches": gtm,
+ "dtScores": [d["score"] for d in dt],
+ "gtIgnore": gtIg,
+ "dtIgnore": dtIg,
+ }
+
+ def accumulate(self, p=None):
+ """
+ Accumulate per image evaluation results and store the result in self.eval
+ :param p: input params for evaluation
+ :return: None
+ """
+ logger.info("Accumulating evaluation results...")
+ tic = time.time()
+ if not self.evalImgs:
+ logger.info("Please run evaluate() first")
+ # allows input customized parameters
+ if p is None:
+ p = self.params
+ p.catIds = p.catIds if p.useCats == 1 else [-1]
+ T = len(p.iouThrs)
+ R = len(p.recThrs)
+ K = len(p.catIds) if p.useCats else 1
+ A = len(p.areaRng)
+ M = len(p.maxDets)
+ precision = -(np.ones((T, R, K, A, M))) # -1 for the precision of absent categories
+ recall = -(np.ones((T, K, A, M)))
+
+ # create dictionary for future indexing
+ logger.info("Categories: {}".format(p.catIds))
+ _pe = self._paramsEval
+ catIds = _pe.catIds if _pe.useCats else [-1]
+ setK = set(catIds)
+ setA = set(map(tuple, _pe.areaRng))
+ setM = set(_pe.maxDets)
+ setI = set(_pe.imgIds)
+ # get inds to evaluate
+ k_list = [n for n, k in enumerate(p.catIds) if k in setK]
+ m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
+ a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
+ i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
+ I0 = len(_pe.imgIds)
+ A0 = len(_pe.areaRng)
+ # retrieve E at each category, area range, and max number of detections
+ for k, k0 in enumerate(k_list):
+ Nk = k0 * A0 * I0
+ for a, a0 in enumerate(a_list):
+ Na = a0 * I0
+ for m, maxDet in enumerate(m_list):
+ E = [self.evalImgs[Nk + Na + i] for i in i_list]
+ E = [e for e in E if e is not None]
+ if len(E) == 0:
+ continue
+ dtScores = np.concatenate([e["dtScores"][0:maxDet] for e in E])
+
+                    # different sorting methods generate slightly different results;
+                    # mergesort is used to be consistent with the Matlab implementation.
+ inds = np.argsort(-dtScores, kind="mergesort")
+
+ dtm = np.concatenate([e["dtMatches"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+ dtIg = np.concatenate([e["dtIgnore"][:, 0:maxDet] for e in E], axis=1)[:, inds]
+ gtIg = np.concatenate([e["gtIgnore"] for e in E])
+ npig = np.count_nonzero(gtIg == 0)
+ if npig == 0:
+ continue
+ tps = np.logical_and(dtm, np.logical_not(dtIg))
+ fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
+ for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+ tp = np.array(tp)
+ fp = np.array(fp)
+ nd = len(tp)
+ rc = tp / npig
+ pr = tp / (fp + tp + np.spacing(1))
+ q = np.zeros((R,))
+
+ if nd:
+ recall[t, k, a, m] = rc[-1]
+ else:
+ recall[t, k, a, m] = 0
+
+                        # numpy is slow without cython optimization when accessing elements;
+                        # using python lists here gives a significant speed improvement
+ pr = pr.tolist()
+ q = q.tolist()
+
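+                        # precision envelope: make precision non-increasing in
+                        # recall before sampling it at the fixed recall thresholds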
+ for i in range(nd - 1, 0, -1):
+ if pr[i] > pr[i - 1]:
+ pr[i - 1] = pr[i]
+
+ inds = np.searchsorted(rc, p.recThrs, side="left")
+ try:
+ for ri, pi in enumerate(inds):
+ q[ri] = pr[pi]
+ except Exception:
+ pass
+ precision[t, :, k, a, m] = np.array(q)
+ logger.info(
+ "Final: max precision {}, min precision {}".format(np.max(precision), np.min(precision))
+ )
+ self.eval = {
+ "params": p,
+ "counts": [T, R, K, A, M],
+ "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+ "precision": precision,
+ "recall": recall,
+ }
+ toc = time.time()
+ logger.info("DONE (t={:0.2f}s).".format(toc - tic))
+
+ def summarize(self):
+ """
+ Compute and display summary metrics for evaluation results.
+ Note this function can *only* be applied on the default parameter setting
+ """
+
+ def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100):
+ p = self.params
+ iStr = " {:<18} {} @[ {}={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
+ titleStr = "Average Precision" if ap == 1 else "Average Recall"
+ typeStr = "(AP)" if ap == 1 else "(AR)"
+ measure = "IoU"
+ if self.params.iouType == "keypoints":
+ measure = "OKS"
+ elif self.params.iouType == "densepose":
+ measure = "OGPS"
+ iouStr = (
+ "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1])
+ if iouThr is None
+ else "{:0.2f}".format(iouThr)
+ )
+
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+ if ap == 1:
+ # dimension of precision: [TxRxKxAxM]
+ s = self.eval["precision"]
+ # IoU
+ if iouThr is not None:
+ t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0]
+ s = s[t]
+ s = s[:, :, :, aind, mind]
+ else:
+ # dimension of recall: [TxKxAxM]
+ s = self.eval["recall"]
+ if iouThr is not None:
+ t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0]
+ s = s[t]
+ s = s[:, :, aind, mind]
+ if len(s[s > -1]) == 0:
+ mean_s = -1
+ else:
+ mean_s = np.mean(s[s > -1])
+ logger.info(iStr.format(titleStr, typeStr, measure, iouStr, areaRng, maxDets, mean_s))
+ return mean_s
+
+ def _summarizeDets():
+ stats = np.zeros((12,))
+ stats[0] = _summarize(1)
+ stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2])
+ stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2])
+ stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2])
+ stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2])
+ stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2])
+ stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
+ stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
+ stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
+ stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2])
+ stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2])
+ stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2])
+ return stats
+
+ def _summarizeKps():
+ stats = np.zeros((10,))
+ stats[0] = _summarize(1, maxDets=20)
+ stats[1] = _summarize(1, maxDets=20, iouThr=0.5)
+ stats[2] = _summarize(1, maxDets=20, iouThr=0.75)
+ stats[3] = _summarize(1, maxDets=20, areaRng="medium")
+ stats[4] = _summarize(1, maxDets=20, areaRng="large")
+ stats[5] = _summarize(0, maxDets=20)
+ stats[6] = _summarize(0, maxDets=20, iouThr=0.5)
+ stats[7] = _summarize(0, maxDets=20, iouThr=0.75)
+ stats[8] = _summarize(0, maxDets=20, areaRng="medium")
+ stats[9] = _summarize(0, maxDets=20, areaRng="large")
+ return stats
+
+ def _summarizeUvs():
+ stats = [_summarize(1, maxDets=self.params.maxDets[0])]
+ min_threshold = self.params.iouThrs.min()
+ if min_threshold <= 0.201:
+ stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.2)]
+ if min_threshold <= 0.301:
+ stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.3)]
+ if min_threshold <= 0.401:
+ stats += [_summarize(1, maxDets=self.params.maxDets[0], iouThr=0.4)]
+ stats += [
+ _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5),
+ _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75),
+ _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium"),
+ _summarize(1, maxDets=self.params.maxDets[0], areaRng="large"),
+ _summarize(0, maxDets=self.params.maxDets[0]),
+ _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5),
+ _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75),
+ _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium"),
+ _summarize(0, maxDets=self.params.maxDets[0], areaRng="large"),
+ ]
+ return np.array(stats)
+
+ def _summarizeUvsOld():
+ stats = np.zeros((18,))
+ stats[0] = _summarize(1, maxDets=self.params.maxDets[0])
+ stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5)
+ stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.55)
+ stats[3] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.60)
+ stats[4] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.65)
+ stats[5] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.70)
+ stats[6] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75)
+ stats[7] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.80)
+ stats[8] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.85)
+ stats[9] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.90)
+ stats[10] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.95)
+ stats[11] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium")
+ stats[12] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large")
+ stats[13] = _summarize(0, maxDets=self.params.maxDets[0])
+ stats[14] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5)
+ stats[15] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75)
+ stats[16] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium")
+ stats[17] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large")
+ return stats
+
+ if not self.eval:
+ raise Exception("Please run accumulate() first")
+ iouType = self.params.iouType
+ if iouType in ["segm", "bbox"]:
+ summarize = _summarizeDets
+ elif iouType in ["keypoints"]:
+ summarize = _summarizeKps
+ elif iouType in ["densepose"]:
+ summarize = _summarizeUvs
+ self.stats = summarize()
+
+ def __str__(self):
+ # summarize() only logs the metrics; return an empty string so str() remains valid
+ self.summarize()
+ return ""
+
+ # ================ functions for dense pose ==============================
+ def findAllClosestVertsUV(self, U_points, V_points, Index_points):
+ ClosestVerts = np.ones(Index_points.shape) * -1
+ for i in np.arange(24):
+ #
+ if (i + 1) in Index_points:
+ UVs = np.array(
+ [U_points[Index_points == (i + 1)], V_points[Index_points == (i + 1)]]
+ )
+ Current_Part_UVs = self.Part_UVs[i]
+ Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+ D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+ ClosestVerts[Index_points == (i + 1)] = Current_Part_ClosestVertInds[
+ np.argmin(D, axis=0)
+ ]
+ ClosestVertsTransformed = self.PDIST_transform[ClosestVerts.astype(int) - 1]
+ ClosestVertsTransformed[ClosestVerts < 0] = 0
+ return ClosestVertsTransformed
+
+ def findClosestVertsCse(self, embedding, py, px, mask, mesh_name):
+ mesh_vertex_embeddings = self.embedder(mesh_name)
+ pixel_embeddings = embedding[:, py, px].t().to(device="cuda")
+ mask_vals = mask[py, px]
+ edm = squared_euclidean_distance_matrix(pixel_embeddings, mesh_vertex_embeddings)
+ vertex_indices = edm.argmin(dim=1).cpu()
+ vertex_indices[mask_vals <= 0] = -1
+ return vertex_indices
+
+ def findAllClosestVertsGT(self, gt):
+ #
+ I_gt = np.array(gt["dp_I"])
+ U_gt = np.array(gt["dp_U"])
+ V_gt = np.array(gt["dp_V"])
+ ClosestVertsGT = np.ones(I_gt.shape) * -1
+ for i in np.arange(24):
+ if (i + 1) in I_gt:
+ UVs = np.array([U_gt[I_gt == (i + 1)], V_gt[I_gt == (i + 1)]])
+ Current_Part_UVs = self.Part_UVs[i]
+ Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i]
+ D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze()
+ ClosestVertsGT[I_gt == (i + 1)] = Current_Part_ClosestVertInds[np.argmin(D, axis=0)]
+ #
+ ClosestVertsGTTransformed = self.PDIST_transform[ClosestVertsGT.astype(int) - 1]
+ ClosestVertsGTTransformed[ClosestVertsGT < 0] = 0
+ return ClosestVertsGT, ClosestVertsGTTransformed
+
+ def getDistancesCse(self, cVertsGT, cVerts, mesh_name):
+ geodists_vertices = torch.ones_like(cVertsGT) * float("inf")
+ selected = (cVertsGT >= 0) * (cVerts >= 0)
+ mesh = create_mesh(mesh_name, "cpu")
+ geodists_vertices[selected] = mesh.geodists[cVertsGT[selected], cVerts[selected]]
+ return geodists_vertices.numpy()
+
+ def getDistancesUV(self, cVertsGT, cVerts):
+ #
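+ # self.Pdist_matrix stores pairwise geodesic distances between the n mesh vertices
+ # in condensed (upper-triangular) form; the arithmetic below recovers the flat
+ # index k of the (i, j) vertex pair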
+ n = 27554
+ dists = []
+ for d in range(len(cVertsGT)):
+ if cVertsGT[d] > 0:
+ if cVerts[d] > 0:
+ i = cVertsGT[d] - 1
+ j = cVerts[d] - 1
+ if j == i:
+ dists.append(0)
+ elif j > i:
+ ccc = i
+ i = j
+ j = ccc
+ i = n - i - 1
+ j = n - j - 1
+ k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+ k = (n * n - n) / 2 - k - 1
+ dists.append(self.Pdist_matrix[int(k)][0])
+ else:
+ i = n - i - 1
+ j = n - j - 1
+ k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1
+ k = (n * n - n) / 2 - k - 1
+ dists.append(self.Pdist_matrix[int(k)][0])
+ else:
+ dists.append(np.inf)
+ return np.atleast_1d(np.array(dists).squeeze())
+
+
+class Params:
+ """
+ Parameters for the COCO evaluation API
+ """
+
+ def setDetParams(self):
+ self.imgIds = []
+ self.catIds = []
+ # np.arange causes trouble: the values it generates can be slightly larger than the
+ # intended endpoints, so np.linspace is used instead
+ self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+ self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+ self.maxDets = [1, 10, 100]
+ self.areaRng = [
+ [0**2, 1e5**2],
+ [0**2, 32**2],
+ [32**2, 96**2],
+ [96**2, 1e5**2],
+ ]
+ self.areaRngLbl = ["all", "small", "medium", "large"]
+ self.useCats = 1
+
+ def setKpParams(self):
+ self.imgIds = []
+ self.catIds = []
+ # np.arange causes trouble: the values it generates can be slightly larger than the
+ # intended endpoints, so np.linspace is used instead
+ self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+ self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+ self.maxDets = [20]
+ self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
+ self.areaRngLbl = ["all", "medium", "large"]
+ self.useCats = 1
+
+ def setUvParams(self):
+ self.imgIds = []
+ self.catIds = []
+ self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
+ self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True)
+ self.maxDets = [20]
+ self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]]
+ self.areaRngLbl = ["all", "medium", "large"]
+ self.useCats = 1
+
+ def __init__(self, iouType="segm"):
+ if iouType == "segm" or iouType == "bbox":
+ self.setDetParams()
+ elif iouType == "keypoints":
+ self.setKpParams()
+ elif iouType == "densepose":
+ self.setUvParams()
+ else:
+ raise Exception("iouType not supported")
+ self.iouType = iouType
+ # useSegm is deprecated
+ self.useSegm = None
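+
+
+# Typical driver sequence (a sketch, assuming `coco_gt` and `coco_dt` are pycocotools
+# COCO objects holding ground truth and loaded detections, and that the default
+# storage/embedder arguments suffice):
+#
+#     coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose")
+#     coco_eval.evaluate()
+#     coco_eval.accumulate()
+#     coco_eval.summarize()
+#
+# This is the sequence used by _derive_results_from_coco_eval in
+# densepose/evaluation/evaluator.py below.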
diff --git a/densepose/evaluation/evaluator.py b/densepose/evaluation/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..803d3dccbe60a637e349a22e3364f3c0b5f4f1e5
--- /dev/null
+++ b/densepose/evaluation/evaluator.py
@@ -0,0 +1,423 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import contextlib
+import copy
+import io
+import itertools
+import logging
+import numpy as np
+import os
+from collections import OrderedDict
+from typing import Dict, Iterable, List, Optional
+import pycocotools.mask as mask_utils
+import torch
+from pycocotools.coco import COCO
+from tabulate import tabulate
+
+from detectron2.config import CfgNode
+from detectron2.data import MetadataCatalog
+from detectron2.evaluation import DatasetEvaluator
+from detectron2.structures import BoxMode
+from detectron2.utils.comm import gather, get_rank, is_main_process, synchronize
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import create_small_table
+
+from densepose.converters import ToChartResultConverter, ToMaskConverter
+from densepose.data.datasets.coco import maybe_filter_and_map_categories_cocoapi
+from densepose.structures import (
+ DensePoseChartPredictorOutput,
+ DensePoseEmbeddingPredictorOutput,
+ quantize_densepose_chart_result,
+)
+
+from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode
+from .mesh_alignment_evaluator import MeshAlignmentEvaluator
+from .tensor_storage import (
+ SingleProcessFileTensorStorage,
+ SingleProcessRamTensorStorage,
+ SingleProcessTensorStorage,
+ SizeData,
+ storage_gather,
+)
+
+
+class DensePoseCOCOEvaluator(DatasetEvaluator):
+ def __init__(
+ self,
+ dataset_name,
+ distributed,
+ output_dir=None,
+ evaluator_type: str = "iuv",
+ min_iou_threshold: float = 0.5,
+ storage: Optional[SingleProcessTensorStorage] = None,
+ embedder=None,
+ should_evaluate_mesh_alignment: bool = False,
+ mesh_alignment_mesh_names: Optional[List[str]] = None,
+ ):
+ self._embedder = embedder
+ self._distributed = distributed
+ self._output_dir = output_dir
+ self._evaluator_type = evaluator_type
+ self._storage = storage
+ self._should_evaluate_mesh_alignment = should_evaluate_mesh_alignment
+
+ assert not (
+ should_evaluate_mesh_alignment and embedder is None
+ ), "Mesh alignment evaluation is activated, but no vertex embedder provided!"
+ if should_evaluate_mesh_alignment:
+ self._mesh_alignment_evaluator = MeshAlignmentEvaluator(
+ embedder,
+ mesh_alignment_mesh_names,
+ )
+
+ self._cpu_device = torch.device("cpu")
+ self._logger = logging.getLogger(__name__)
+
+ self._metadata = MetadataCatalog.get(dataset_name)
+ self._min_threshold = min_iou_threshold
+ json_file = PathManager.get_local_path(self._metadata.json_file)
+ with contextlib.redirect_stdout(io.StringIO()):
+ self._coco_api = COCO(json_file)
+ maybe_filter_and_map_categories_cocoapi(dataset_name, self._coco_api)
+
+ def reset(self):
+ self._predictions = []
+
+ def process(self, inputs, outputs):
+ """
+ Args:
+ inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
+ It is a list of dict. Each dict corresponds to an image and
+ contains keys like "height", "width", "file_name", "image_id".
+ outputs: the outputs of a COCO model. It is a list of dicts with key
+ "instances" that contains :class:`Instances`.
+ The :class:`Instances` object needs to have `densepose` field.
+ """
+ for input, output in zip(inputs, outputs):
+ instances = output["instances"].to(self._cpu_device)
+ if not instances.has("pred_densepose"):
+ continue
+ prediction_list = prediction_to_dict(
+ instances,
+ input["image_id"],
+ self._embedder,
+ self._metadata.class_to_mesh_name,
+ self._storage is not None,
+ )
+ if self._storage is not None:
+ for prediction_dict in prediction_list:
+ dict_to_store = {}
+ for field_name in self._storage.data_schema:
+ dict_to_store[field_name] = prediction_dict[field_name]
+ record_id = self._storage.put(dict_to_store)
+ prediction_dict["record_id"] = record_id
+ prediction_dict["rank"] = get_rank()
+ for field_name in self._storage.data_schema:
+ del prediction_dict[field_name]
+ self._predictions.extend(prediction_list)
+
+ def evaluate(self, img_ids=None):
+ if self._distributed:
+ synchronize()
+ predictions = gather(self._predictions)
+ predictions = list(itertools.chain(*predictions))
+ else:
+ predictions = self._predictions
+
+ multi_storage = storage_gather(self._storage) if self._storage is not None else None
+
+ if not is_main_process():
+ return
+ return copy.deepcopy(self._eval_predictions(predictions, multi_storage, img_ids))
+
+ def _eval_predictions(self, predictions, multi_storage=None, img_ids=None):
+ """
+ Evaluate predictions on densepose.
+ Return results with the metrics of the tasks.
+ """
+ self._logger.info("Preparing results for COCO format ...")
+
+ if self._output_dir:
+ PathManager.mkdirs(self._output_dir)
+ file_path = os.path.join(self._output_dir, "coco_densepose_predictions.pth")
+ with PathManager.open(file_path, "wb") as f:
+ torch.save(predictions, f)
+
+ self._logger.info("Evaluating predictions ...")
+ res = OrderedDict()
+ results_gps, results_gpsm, results_segm = _evaluate_predictions_on_coco(
+ self._coco_api,
+ predictions,
+ multi_storage,
+ self._embedder,
+ class_names=self._metadata.get("thing_classes"),
+ min_threshold=self._min_threshold,
+ img_ids=img_ids,
+ )
+ res["densepose_gps"] = results_gps
+ res["densepose_gpsm"] = results_gpsm
+ res["densepose_segm"] = results_segm
+ if self._should_evaluate_mesh_alignment:
+ res["densepose_mesh_alignment"] = self._evaluate_mesh_alignment()
+ return res
+
+ def _evaluate_mesh_alignment(self):
+ self._logger.info("Mesh alignment evaluation ...")
+ mean_ge, mean_gps, per_mesh_metrics = self._mesh_alignment_evaluator.evaluate()
+ results = {
+ "GE": mean_ge * 100,
+ "GPS": mean_gps * 100,
+ }
+ mesh_names = set()
+ for metric_name in per_mesh_metrics:
+ for mesh_name, value in per_mesh_metrics[metric_name].items():
+ results[f"{metric_name}-{mesh_name}"] = value * 100
+ mesh_names.add(mesh_name)
+ self._print_mesh_alignment_results(results, mesh_names)
+ return results
+
+ def _print_mesh_alignment_results(self, results: Dict[str, float], mesh_names: Iterable[str]):
+ self._logger.info("Evaluation results for densepose, mesh alignment:")
+ self._logger.info(f'| {"Mesh":13s} | {"GErr":7s} | {"GPS":7s} |')
+ self._logger.info("| :-----------: | :-----: | :-----: |")
+ for mesh_name in mesh_names:
+ ge_key = f"GE-{mesh_name}"
+ ge_str = f"{results[ge_key]:.4f}" if ge_key in results else " "
+ gps_key = f"GPS-{mesh_name}"
+ gps_str = f"{results[gps_key]:.4f}" if gps_key in results else " "
+ self._logger.info(f"| {mesh_name:13s} | {ge_str:7s} | {gps_str:7s} |")
+ self._logger.info("| :-------------------------------: |")
+ ge_key = "GE"
+ ge_str = f"{results[ge_key]:.4f}" if ge_key in results else " "
+ gps_key = "GPS"
+ gps_str = f"{results[gps_key]:.4f}" if gps_key in results else " "
+ self._logger.info(f'| {"MEAN":13s} | {ge_str:7s} | {gps_str:7s} |')
+
+
+def prediction_to_dict(instances, img_id, embedder, class_to_mesh_name, use_storage):
+ """
+ Args:
+ instances (Instances): the output of the model
+ img_id (str): the image id in COCO
+
+ Returns:
+ list[dict]: the results in densepose evaluation format
+ """
+ scores = instances.scores.tolist()
+ classes = instances.pred_classes.tolist()
+ raw_boxes_xywh = BoxMode.convert(
+ instances.pred_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+ )
+
+ if isinstance(instances.pred_densepose, DensePoseEmbeddingPredictorOutput):
+ results_densepose = densepose_cse_predictions_to_dict(
+ instances, embedder, class_to_mesh_name, use_storage
+ )
+ elif isinstance(instances.pred_densepose, DensePoseChartPredictorOutput):
+ if not use_storage:
+ results_densepose = densepose_chart_predictions_to_dict(instances)
+ else:
+ results_densepose = densepose_chart_predictions_to_storage_dict(instances)
+
+ results = []
+ for k in range(len(instances)):
+ result = {
+ "image_id": img_id,
+ "category_id": classes[k],
+ "bbox": raw_boxes_xywh[k].tolist(),
+ "score": scores[k],
+ }
+ results.append({**result, **results_densepose[k]})
+ return results
+
+
+def densepose_chart_predictions_to_dict(instances):
+ segmentations = ToMaskConverter.convert(
+ instances.pred_densepose, instances.pred_boxes, instances.image_size
+ )
+
+ results = []
+ for k in range(len(instances)):
+ densepose_results_quantized = quantize_densepose_chart_result(
+ ToChartResultConverter.convert(instances.pred_densepose[k], instances.pred_boxes[k])
+ )
+ densepose_results_quantized.labels_uv_uint8 = (
+ densepose_results_quantized.labels_uv_uint8.cpu()
+ )
+ segmentation = segmentations.tensor[k]
+ segmentation_encoded = mask_utils.encode(
+ np.require(segmentation.numpy(), dtype=np.uint8, requirements=["F"])
+ )
+ segmentation_encoded["counts"] = segmentation_encoded["counts"].decode("utf-8")
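+ # pycocotools returns the RLE "counts" as bytes; decode to str so the result can be
+ # serialized downstream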
+ result = {
+ "densepose": densepose_results_quantized,
+ "segmentation": segmentation_encoded,
+ }
+ results.append(result)
+ return results
+
+
+def densepose_chart_predictions_to_storage_dict(instances):
+ results = []
+ for k in range(len(instances)):
+ densepose_predictor_output = instances.pred_densepose[k]
+ result = {
+ "coarse_segm": densepose_predictor_output.coarse_segm.squeeze(0).cpu(),
+ "fine_segm": densepose_predictor_output.fine_segm.squeeze(0).cpu(),
+ "u": densepose_predictor_output.u.squeeze(0).cpu(),
+ "v": densepose_predictor_output.v.squeeze(0).cpu(),
+ }
+ results.append(result)
+ return results
+
+
+def densepose_cse_predictions_to_dict(instances, embedder, class_to_mesh_name, use_storage):
+ results = []
+ for k in range(len(instances)):
+ cse = instances.pred_densepose[k]
+ results.append(
+ {
+ "coarse_segm": cse.coarse_segm[0].cpu(),
+ "embedding": cse.embedding[0].cpu(),
+ }
+ )
+ return results
+
+
+def _evaluate_predictions_on_coco(
+ coco_gt,
+ coco_results,
+ multi_storage=None,
+ embedder=None,
+ class_names=None,
+ min_threshold: float = 0.5,
+ img_ids=None,
+):
+ logger = logging.getLogger(__name__)
+
+ densepose_metrics = _get_densepose_metrics(min_threshold)
+ if len(coco_results) == 0: # cocoapi does not handle empty results very well
+ logger.warning("No predictions from the model! Set scores to -1")
+ results_gps = {metric: -1 for metric in densepose_metrics}
+ results_gpsm = {metric: -1 for metric in densepose_metrics}
+ results_segm = {metric: -1 for metric in densepose_metrics}
+ return results_gps, results_gpsm, results_segm
+
+ coco_dt = coco_gt.loadRes(coco_results)
+
+ results = []
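+ # Three evaluation modes are reported, in this order: GPS (geodesic point similarity),
+ # GPSM (GPS combined with the segmentation mask IoU) and IOU (mask IoU only).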
+ for eval_mode_name in ["GPS", "GPSM", "IOU"]:
+ eval_mode = getattr(DensePoseEvalMode, eval_mode_name)
+ coco_eval = DensePoseCocoEval(
+ coco_gt, coco_dt, "densepose", multi_storage, embedder, dpEvalMode=eval_mode
+ )
+ result = _derive_results_from_coco_eval(
+ coco_eval, eval_mode_name, densepose_metrics, class_names, min_threshold, img_ids
+ )
+ results.append(result)
+ return results
+
+
+def _get_densepose_metrics(min_threshold: float = 0.5):
+ metrics = ["AP"]
+ if min_threshold <= 0.201:
+ metrics += ["AP20"]
+ if min_threshold <= 0.301:
+ metrics += ["AP30"]
+ if min_threshold <= 0.401:
+ metrics += ["AP40"]
+ metrics.extend(["AP50", "AP75", "APm", "APl", "AR", "AR50", "AR75", "ARm", "ARl"])
+ return metrics
+
+
+def _derive_results_from_coco_eval(
+ coco_eval, eval_mode_name, metrics, class_names, min_threshold: float, img_ids
+):
+ if img_ids is not None:
+ coco_eval.params.imgIds = img_ids
+ coco_eval.params.iouThrs = np.linspace(
+ min_threshold, 0.95, int(np.round((0.95 - min_threshold) / 0.05)) + 1, endpoint=True
+ )
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
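+ # coco_eval.stats is ordered exactly like the metric names returned by
+ # _get_densepose_metrics for the same min_threshold, so positional indexing is safe here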
+ results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)}
+ logger = logging.getLogger(__name__)
+ logger.info(
+ f"Evaluation results for densepose, {eval_mode_name} metric: \n"
+ + create_small_table(results)
+ )
+ if class_names is None or len(class_names) <= 1:
+ return results
+
+ # Compute per-category AP, the same way as it is done in D2
+ # (see detectron2/evaluation/coco_evaluation.py):
+ precisions = coco_eval.eval["precision"]
+ # precision has dims (iou, recall, cls, area range, max dets)
+ assert len(class_names) == precisions.shape[2]
+
+ results_per_category = []
+ for idx, name in enumerate(class_names):
+ # area range index 0: all area ranges
+ # max dets index -1: typically 100 per image
+ precision = precisions[:, :, idx, 0, -1]
+ precision = precision[precision > -1]
+ ap = np.mean(precision) if precision.size else float("nan")
+ results_per_category.append((f"{name}", float(ap * 100)))
+
+ # tabulate it
+ n_cols = min(6, len(results_per_category) * 2)
+ results_flatten = list(itertools.chain(*results_per_category))
+ results_2d = itertools.zip_longest(*[results_flatten[i::n_cols] for i in range(n_cols)])
+ table = tabulate(
+ results_2d,
+ tablefmt="pipe",
+ floatfmt=".3f",
+ headers=["category", "AP"] * (n_cols // 2),
+ numalign="left",
+ )
+ logger.info(f"Per-category {eval_mode_name} AP: \n" + table)
+
+ results.update({"AP-" + name: ap for name, ap in results_per_category})
+ return results
+
+
+def build_densepose_evaluator_storage(cfg: CfgNode, output_folder: str):
+ storage_spec = cfg.DENSEPOSE_EVALUATION.STORAGE
+ if storage_spec == "none":
+ return None
+ evaluator_type = cfg.DENSEPOSE_EVALUATION.TYPE
+ # common output tensor sizes
+ hout = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+ wout = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+ n_csc = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+ # specific output tensors
+ if evaluator_type == "iuv":
+ n_fsc = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+ schema = {
+ "coarse_segm": SizeData(dtype="float32", shape=(n_csc, hout, wout)),
+ "fine_segm": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
+ "u": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
+ "v": SizeData(dtype="float32", shape=(n_fsc, hout, wout)),
+ }
+ elif evaluator_type == "cse":
+ embed_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+ schema = {
+ "coarse_segm": SizeData(dtype="float32", shape=(n_csc, hout, wout)),
+ "embedding": SizeData(dtype="float32", shape=(embed_size, hout, wout)),
+ }
+ else:
+ raise ValueError(f"Unknown evaluator type: {evaluator_type}")
+ # storage types
+ if storage_spec == "ram":
+ storage = SingleProcessRamTensorStorage(schema, io.BytesIO())
+ elif storage_spec == "file":
+ fpath = os.path.join(output_folder, f"DensePoseEvaluatorStorage.{get_rank()}.bin")
+ PathManager.mkdirs(output_folder)
+ storage = SingleProcessFileTensorStorage(schema, fpath, "wb")
+ else:
+ raise ValueError(f"Unknown storage specification: {storage_spec}")
+ return storage
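+
+
+# Note: the schema shapes above must match the tensors produced by
+# densepose_chart_predictions_to_storage_dict / densepose_cse_predictions_to_dict earlier
+# in this file. As a rough size estimate (assuming the common defaults of a 112x112
+# heatmap, 24 patches and 2 coarse segmentation channels), one "iuv" record is
+# (2 + 3 * 25) * 112 * 112 * 4 bytes, i.e. about 3.7 MiB per detection.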
diff --git a/densepose/evaluation/mesh_alignment_evaluator.py b/densepose/evaluation/mesh_alignment_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6c76f3cf2d54250f7fa1d9a2a3a1d2c60eb0aad
--- /dev/null
+++ b/densepose/evaluation/mesh_alignment_evaluator.py
@@ -0,0 +1,68 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import json
+import logging
+from typing import List, Optional
+import torch
+from torch import nn
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.structures.mesh import create_mesh
+
+
+class MeshAlignmentEvaluator:
+ """
+ Class for evaluation of 3D mesh alignment based on the learned vertex embeddings
+ """
+
+ def __init__(self, embedder: nn.Module, mesh_names: Optional[List[str]]):
+ self.embedder = embedder
+ # use the provided mesh names if not None and not an empty list
+ self.mesh_names = mesh_names if mesh_names else embedder.mesh_names
+ self.logger = logging.getLogger(__name__)
+ with PathManager.open(
+ "https://dl.fbaipublicfiles.com/densepose/data/cse/mesh_keyvertices_v0.json", "r"
+ ) as f:
+ self.mesh_keyvertices = json.load(f)
+
+ def evaluate(self):
+ ge_per_mesh = {}
+ gps_per_mesh = {}
+ for mesh_name_1 in self.mesh_names:
+ avg_errors = []
+ avg_gps = []
+ embeddings_1 = self.embedder(mesh_name_1)
+ keyvertices_1 = self.mesh_keyvertices[mesh_name_1]
+ keyvertex_names_1 = list(keyvertices_1.keys())
+ keyvertex_indices_1 = [keyvertices_1[name] for name in keyvertex_names_1]
+ for mesh_name_2 in self.mesh_names:
+ if mesh_name_1 == mesh_name_2:
+ continue
+ embeddings_2 = self.embedder(mesh_name_2)
+ keyvertices_2 = self.mesh_keyvertices[mesh_name_2]
+ sim_matrix_12 = embeddings_1[keyvertex_indices_1].mm(embeddings_2.T)
+ vertices_2_matching_keyvertices_1 = sim_matrix_12.argmax(axis=1)
+ mesh_2 = create_mesh(mesh_name_2, embeddings_2.device)
+ geodists = mesh_2.geodists[
+ vertices_2_matching_keyvertices_1,
+ [keyvertices_2[name] for name in keyvertex_names_1],
+ ]
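+ # geodesic point similarity (GPS): a Gaussian of the geodesic distances,
+ # normalized by a fixed constant (0.255)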
+ Current_Mean_Distances = 0.255
+ gps = (-(geodists**2) / (2 * (Current_Mean_Distances**2))).exp()
+ avg_errors.append(geodists.mean().item())
+ avg_gps.append(gps.mean().item())
+
+ ge_mean = torch.as_tensor(avg_errors).mean().item()
+ gps_mean = torch.as_tensor(avg_gps).mean().item()
+ ge_per_mesh[mesh_name_1] = ge_mean
+ gps_per_mesh[mesh_name_1] = gps_mean
+ ge_mean_global = torch.as_tensor(list(ge_per_mesh.values())).mean().item()
+ gps_mean_global = torch.as_tensor(list(gps_per_mesh.values())).mean().item()
+ per_mesh_metrics = {
+ "GE": ge_per_mesh,
+ "GPS": gps_per_mesh,
+ }
+ return ge_mean_global, gps_mean_global, per_mesh_metrics
diff --git a/densepose/evaluation/tensor_storage.py b/densepose/evaluation/tensor_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..369a29470807e60be377516f7910a9f95ab0a47d
--- /dev/null
+++ b/densepose/evaluation/tensor_storage.py
@@ -0,0 +1,241 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import io
+import numpy as np
+import os
+from dataclasses import dataclass
+from functools import reduce
+from operator import mul
+from typing import BinaryIO, Dict, Optional, Tuple
+import torch
+
+from detectron2.utils.comm import gather, get_rank
+from detectron2.utils.file_io import PathManager
+
+
+@dataclass
+class SizeData:
+ dtype: str
+ shape: Tuple[int]
+
+
+def _calculate_record_field_size_b(data_schema: Dict[str, SizeData], field_name: str) -> int:
+ schema = data_schema[field_name]
+ element_size_b = np.dtype(schema.dtype).itemsize
+ record_field_size_b = reduce(mul, schema.shape) * element_size_b
+ return record_field_size_b
+
+
+def _calculate_record_size_b(data_schema: Dict[str, SizeData]) -> int:
+ record_size_b = 0
+ for field_name in data_schema:
+ record_field_size_b = _calculate_record_field_size_b(data_schema, field_name)
+ record_size_b += record_field_size_b
+ return record_size_b
+
+
+def _calculate_record_field_sizes_b(data_schema: Dict[str, SizeData]) -> Dict[str, int]:
+ field_sizes_b = {}
+ for field_name in data_schema:
+ field_sizes_b[field_name] = _calculate_record_field_size_b(data_schema, field_name)
+ return field_sizes_b
+
+
+class SingleProcessTensorStorage:
+ """
+ Compact tensor storage to keep tensor data of predefined size and type.
+ """
+
+ def __init__(self, data_schema: Dict[str, SizeData], storage_impl: BinaryIO):
+ """
+ Construct tensor storage based on information on data shape and size.
+ Internally uses numpy to interpret the type specification.
+ The storage must support operations `seek(offset, whence=os.SEEK_SET)` and
+ `read(size)` to be able to perform the `get` operation.
+ The storage must support operation `write(bytes)` to be able to perform
+ the `put` operation.
+
+ Args:
+ data_schema (dict: str -> SizeData): dictionary which maps tensor name
+ to its size data (shape and data type), e.g.
+ ```
+ {
+ "coarse_segm": SizeData(dtype="float32", shape=(112, 112)),
+ "embedding": SizeData(dtype="float32", shape=(16, 112, 112)),
+ }
+ ```
+ storage_impl (BinaryIO): io instance that handles file-like seek, read
+ and write operations, e.g. a file handle or a memory buffer like io.BytesIO
+ """
+ self.data_schema = data_schema
+ self.record_size_b = _calculate_record_size_b(data_schema)
+ self.record_field_sizes_b = _calculate_record_field_sizes_b(data_schema)
+ self.storage_impl = storage_impl
+ self.next_record_id = 0
+
+ def get(self, record_id: int) -> Dict[str, torch.Tensor]:
+ """
+ Load tensors from the storage by record ID
+
+ Args:
+ record_id (int): Record ID, for which to load the data
+
+ Return:
+ dict: str -> tensor: tensor name mapped to tensor data, recorded under the provided ID
+ """
+ self.storage_impl.seek(record_id * self.record_size_b, os.SEEK_SET)
+ data_bytes = self.storage_impl.read(self.record_size_b)
+ assert len(data_bytes) == self.record_size_b, (
+ f"Expected data size {self.record_size_b} B could not be read: "
+ f"got {len(data_bytes)} B"
+ )
+ record = {}
+ cur_idx = 0
+ # it's important to read and write in the same order
+ for field_name in sorted(self.data_schema):
+ schema = self.data_schema[field_name]
+ field_size_b = self.record_field_sizes_b[field_name]
+ chunk = data_bytes[cur_idx : cur_idx + field_size_b]
+ data_np = np.frombuffer(
+ chunk, dtype=schema.dtype, count=reduce(mul, schema.shape)
+ ).reshape(schema.shape)
+ record[field_name] = torch.from_numpy(data_np)
+ cur_idx += field_size_b
+ return record
+
+ def put(self, data: Dict[str, torch.Tensor]) -> int:
+ """
+ Store tensors in the storage
+
+ Args:
+ data (dict: str -> tensor): data to store, a dictionary which maps
+ tensor names into tensors; tensor shapes must match those specified
+ in data schema.
+ Return:
+ int: record ID, under which the data is stored
+ """
+ # it's important to read and write in the same order
+ for field_name in sorted(self.data_schema):
+ assert (
+ field_name in data
+ ), f"Field '{field_name}' not present in data: data keys are {data.keys()}"
+ value = data[field_name]
+ assert value.shape == self.data_schema[field_name].shape, (
+ f"Mismatched tensor shapes for field '{field_name}': "
+ f"expected {self.data_schema[field_name].shape}, got {value.shape}"
+ )
+ data_bytes = value.cpu().numpy().tobytes()
+ assert len(data_bytes) == self.record_field_sizes_b[field_name], (
+ f"Expected field {field_name} to be of size "
+ f"{self.record_field_sizes_b[field_name]} B, got {len(data_bytes)} B"
+ )
+ self.storage_impl.write(data_bytes)
+ record_id = self.next_record_id
+ self.next_record_id += 1
+ return record_id
+
+
+class SingleProcessFileTensorStorage(SingleProcessTensorStorage):
+ """
+ Implementation of a single process tensor storage which stores data in a file
+ """
+
+ def __init__(self, data_schema: Dict[str, SizeData], fpath: str, mode: str):
+ self.fpath = fpath
+ assert "b" in mode, f"Tensor storage should be opened in binary mode, got '{mode}'"
+ if "w" in mode:
+ # pyre-fixme[6]: For 2nd argument expected `Union[typing_extensions.Liter...
+ file_h = PathManager.open(fpath, mode)
+ elif "r" in mode:
+ local_fpath = PathManager.get_local_path(fpath)
+ file_h = open(local_fpath, mode)
+ else:
+ raise ValueError(f"Unsupported file mode {mode}, supported modes: rb, wb")
+ super().__init__(data_schema, file_h) # pyre-ignore[6]
+
+
+class SingleProcessRamTensorStorage(SingleProcessTensorStorage):
+ """
+ Implementation of a single process tensor storage which stores data in RAM
+ """
+
+ def __init__(self, data_schema: Dict[str, SizeData], buf: io.BytesIO):
+ super().__init__(data_schema, buf)
+
+
+class MultiProcessTensorStorage:
+ """
+ Representation of a set of tensor storages created by individual processes,
+ which allows those storages to be accessed from a single owner process. The storages
+ should either be shared with or broadcast to the owner process.
+ The processes are identified by their rank; data is uniquely identified by
+ the rank of the process and the record ID.
+ """
+
+ def __init__(self, rank_to_storage: Dict[int, SingleProcessTensorStorage]):
+ self.rank_to_storage = rank_to_storage
+
+ def get(self, rank: int, record_id: int) -> Dict[str, torch.Tensor]:
+ storage = self.rank_to_storage[rank]
+ return storage.get(record_id)
+
+ def put(self, rank: int, data: Dict[str, torch.Tensor]) -> int:
+ storage = self.rank_to_storage[rank]
+ return storage.put(data)
+
+
+class MultiProcessFileTensorStorage(MultiProcessTensorStorage):
+ def __init__(self, data_schema: Dict[str, SizeData], rank_to_fpath: Dict[int, str], mode: str):
+ rank_to_storage = {
+ rank: SingleProcessFileTensorStorage(data_schema, fpath, mode)
+ for rank, fpath in rank_to_fpath.items()
+ }
+ super().__init__(rank_to_storage) # pyre-ignore[6]
+
+
+class MultiProcessRamTensorStorage(MultiProcessTensorStorage):
+ def __init__(self, data_schema: Dict[str, SizeData], rank_to_buffer: Dict[int, io.BytesIO]):
+ rank_to_storage = {
+ rank: SingleProcessRamTensorStorage(data_schema, buf)
+ for rank, buf in rank_to_buffer.items()
+ }
+ super().__init__(rank_to_storage) # pyre-ignore[6]
+
+
+def _ram_storage_gather(
+ storage: SingleProcessRamTensorStorage, dst_rank: int = 0
+) -> Optional[MultiProcessRamTensorStorage]:
+ storage.storage_impl.seek(0, os.SEEK_SET)
+ # TODO: overhead, pickling a bytes object, can just pass bytes in a tensor directly
+ # see detectron2/utils.comm.py
+ data_list = gather(storage.storage_impl.read(), dst=dst_rank)
+ if get_rank() != dst_rank:
+ return None
+ rank_to_buffer = {i: io.BytesIO(data_list[i]) for i in range(len(data_list))}
+ multiprocess_storage = MultiProcessRamTensorStorage(storage.data_schema, rank_to_buffer)
+ return multiprocess_storage
+
+
+def _file_storage_gather(
+ storage: SingleProcessFileTensorStorage,
+ dst_rank: int = 0,
+ mode: str = "rb",
+) -> Optional[MultiProcessFileTensorStorage]:
+ storage.storage_impl.close()
+ fpath_list = gather(storage.fpath, dst=dst_rank)
+ if get_rank() != dst_rank:
+ return None
+ rank_to_fpath = {i: fpath_list[i] for i in range(len(fpath_list))}
+ return MultiProcessFileTensorStorage(storage.data_schema, rank_to_fpath, mode)
+
+
+def storage_gather(
+ storage: SingleProcessTensorStorage, dst_rank: int = 0
+) -> Optional[MultiProcessTensorStorage]:
+ if isinstance(storage, SingleProcessRamTensorStorage):
+ return _ram_storage_gather(storage, dst_rank)
+ elif isinstance(storage, SingleProcessFileTensorStorage):
+ return _file_storage_gather(storage, dst_rank)
+ raise Exception(f"Unsupported storage for gather operation: {storage}")
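+
+
+# Minimal round-trip sketch (a hypothetical 2x2 float32 field, not part of the API):
+#
+#     schema = {"coarse_segm": SizeData(dtype="float32", shape=(2, 2))}
+#     storage = SingleProcessRamTensorStorage(schema, io.BytesIO())
+#     record_id = storage.put({"coarse_segm": torch.zeros(2, 2)})
+#     restored = storage.get(record_id)["coarse_segm"]  # equal to the stored tensor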
diff --git a/densepose/modeling/__init__.py b/densepose/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c5b48b1fc6100dd531f7b61467876e222e40bdd
--- /dev/null
+++ b/densepose/modeling/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .confidence import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
+from .filter import DensePoseDataFilter
+from .inference import densepose_inference
+from .utils import initialize_module_params
+from .build import (
+ build_densepose_data_filter,
+ build_densepose_embedder,
+ build_densepose_head,
+ build_densepose_losses,
+ build_densepose_predictor,
+)
diff --git a/densepose/modeling/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb514b114a95540c46fffd1c0739939bc23dfafe
Binary files /dev/null and b/densepose/modeling/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/build.cpython-39.pyc b/densepose/modeling/__pycache__/build.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab20b2c6d876ced320e312146146e39f71d98dcc
Binary files /dev/null and b/densepose/modeling/__pycache__/build.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/confidence.cpython-39.pyc b/densepose/modeling/__pycache__/confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2242d1bf50d79da3f554735c111a830accd10d1c
Binary files /dev/null and b/densepose/modeling/__pycache__/confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/filter.cpython-39.pyc b/densepose/modeling/__pycache__/filter.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..471ab7a844279e79902f481460b9ed3fed620ffb
Binary files /dev/null and b/densepose/modeling/__pycache__/filter.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/hrfpn.cpython-39.pyc b/densepose/modeling/__pycache__/hrfpn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff803e930882092b2d2650444f2ab2060c823f09
Binary files /dev/null and b/densepose/modeling/__pycache__/hrfpn.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/hrnet.cpython-39.pyc b/densepose/modeling/__pycache__/hrnet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3b72be0bb4f87b08f7815dd2fbfb7a25ff9ff7c
Binary files /dev/null and b/densepose/modeling/__pycache__/hrnet.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/inference.cpython-39.pyc b/densepose/modeling/__pycache__/inference.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f691af36aa63bd6175423f0e430b988b19c106ef
Binary files /dev/null and b/densepose/modeling/__pycache__/inference.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/test_time_augmentation.cpython-39.pyc b/densepose/modeling/__pycache__/test_time_augmentation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d8c016bbaf816a2b867dd9f87a230883860c101
Binary files /dev/null and b/densepose/modeling/__pycache__/test_time_augmentation.cpython-39.pyc differ
diff --git a/densepose/modeling/__pycache__/utils.cpython-39.pyc b/densepose/modeling/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..161f01ca964f996c8ec4870b4305bae0c14db66b
Binary files /dev/null and b/densepose/modeling/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/modeling/build.py b/densepose/modeling/build.py
new file mode 100644
index 0000000000000000000000000000000000000000..82e40d9284eeb9c90bf5e2ac13a95f587c76a595
--- /dev/null
+++ b/densepose/modeling/build.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Optional
+from torch import nn
+
+from detectron2.config import CfgNode
+
+from .cse.embedder import Embedder
+from .filter import DensePoseDataFilter
+
+
+def build_densepose_predictor(cfg: CfgNode, input_channels: int):
+ """
+ Create an instance of DensePose predictor based on configuration options.
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): input tensor size along the channel dimension
+ Return:
+ An instance of DensePose predictor
+ """
+ from .predictors import DENSEPOSE_PREDICTOR_REGISTRY
+
+ predictor_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME
+ return DENSEPOSE_PREDICTOR_REGISTRY.get(predictor_name)(cfg, input_channels)
+
+
+def build_densepose_data_filter(cfg: CfgNode):
+ """
+ Build DensePose data filter which selects data for training
+
+ Args:
+ cfg (CfgNode): configuration options
+
+ Return:
+ Callable: list(Tensor), list(Instances) -> list(Tensor), list(Instances)
+ An instance of DensePose filter, which takes feature tensors and proposals
+ as an input and returns filtered features and proposals
+ """
+ dp_filter = DensePoseDataFilter(cfg)
+ return dp_filter
+
+
+def build_densepose_head(cfg: CfgNode, input_channels: int):
+ """
+ Build DensePose head based on configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): input tensor size along the channel dimension
+ Return:
+ An instance of DensePose head
+ """
+ from .roi_heads.registry import ROI_DENSEPOSE_HEAD_REGISTRY
+
+ head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME
+ return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels)
+
+
+def build_densepose_losses(cfg: CfgNode):
+ """
+ Build DensePose loss based on configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ Return:
+ An instance of DensePose loss
+ """
+ from .losses import DENSEPOSE_LOSS_REGISTRY
+
+ loss_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.LOSS_NAME
+ return DENSEPOSE_LOSS_REGISTRY.get(loss_name)(cfg)
+
+
+def build_densepose_embedder(cfg: CfgNode) -> Optional[nn.Module]:
+ """
+ Build embedder used to embed mesh vertices into an embedding space.
+ Embedder contains sub-embedders, one for each mesh ID.
+
+ Args:
+ cfg (CfgNode): configuration options
+ Return:
+ Embedding module
+ """
+ if cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS:
+ return Embedder(cfg)
+ return None
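+
+
+# Note: the predictor/head/loss registries are imported inside the build functions above,
+# presumably to avoid circular imports between the modeling submodules.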
diff --git a/densepose/modeling/confidence.py b/densepose/modeling/confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..364e389078e78935da9e432bc04b5530d2d9963f
--- /dev/null
+++ b/densepose/modeling/confidence.py
@@ -0,0 +1,75 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from enum import Enum
+
+from detectron2.config import CfgNode
+
+
+class DensePoseUVConfidenceType(Enum):
+ """
+ Statistical model type for confidence learning, possible values:
+ - "iid_iso": statistically independent identically distributed residuals
+ with isotropic covariance
+ - "indep_aniso": statistically independent residuals with anisotropic
+ covariances
+ For details, see:
+ N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+ Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+ """
+
+ # fmt: off
+ IID_ISO = "iid_iso"
+ INDEP_ANISO = "indep_aniso"
+ # fmt: on
+
+
+@dataclass
+class DensePoseUVConfidenceConfig:
+ """
+ Configuration options for confidence on UV data
+ """
+
+ enabled: bool = False
+ # lower bound on UV confidences
+ epsilon: float = 0.01
+ type: DensePoseUVConfidenceType = DensePoseUVConfidenceType.IID_ISO
+
+
+@dataclass
+class DensePoseSegmConfidenceConfig:
+ """
+ Configuration options for confidence on segmentation
+ """
+
+ enabled: bool = False
+ # lower bound on confidence values
+ epsilon: float = 0.01
+
+
+@dataclass
+class DensePoseConfidenceModelConfig:
+ """
+ Configuration options for confidence models
+ """
+
+ # confidence for U and V values
+ uv_confidence: DensePoseUVConfidenceConfig
+ # segmentation confidence
+ segm_confidence: DensePoseSegmConfidenceConfig
+
+ @staticmethod
+ def from_cfg(cfg: CfgNode) -> "DensePoseConfidenceModelConfig":
+ return DensePoseConfidenceModelConfig(
+ uv_confidence=DensePoseUVConfidenceConfig(
+ enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED,
+ epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON,
+ type=DensePoseUVConfidenceType(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE),
+ ),
+ segm_confidence=DensePoseSegmConfidenceConfig(
+ enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.ENABLED,
+ epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.SEGM_CONFIDENCE.EPSILON,
+ ),
+ )
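+
+
+# Usage sketch (assuming `cfg` is a CfgNode that already carries the
+# MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE / SEGM_CONFIDENCE options):
+#
+#     confidence_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+#     if confidence_cfg.uv_confidence.enabled:
+#         uv_type = confidence_cfg.uv_confidence.type  # a DensePoseUVConfidenceType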
diff --git a/densepose/modeling/cse/__init__.py b/densepose/modeling/cse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..80248c94c5cc23f1503a6338af225f63bc8cec42
--- /dev/null
+++ b/densepose/modeling/cse/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from .vertex_direct_embedder import VertexDirectEmbedder
+from .vertex_feature_embedder import VertexFeatureEmbedder
+from .embedder import Embedder
diff --git a/densepose/modeling/cse/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/cse/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c0f3d5a1eb6db87191fc1ca581c26a56dcb1309
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/embedder.cpython-39.pyc b/densepose/modeling/cse/__pycache__/embedder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ab612169300ac2dcc121fe19e98b48577a6c342
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/embedder.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/utils.cpython-39.pyc b/densepose/modeling/cse/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cd25bc909172cd84687f94dcbea5be158f32530
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-39.pyc b/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0fb28165653bd12500b4bcc430ff3b6d8db5a82
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/vertex_direct_embedder.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-39.pyc b/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6c41c9a1caab804374bf4f8f91817d71f17c6ef
Binary files /dev/null and b/densepose/modeling/cse/__pycache__/vertex_feature_embedder.cpython-39.pyc differ
diff --git a/densepose/modeling/cse/embedder.py b/densepose/modeling/cse/embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..69082294acee57517b4b4ab8c11814b7c99e5232
--- /dev/null
+++ b/densepose/modeling/cse/embedder.py
@@ -0,0 +1,130 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import logging
+import numpy as np
+import pickle
+from enum import Enum
+from typing import Optional
+import torch
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.utils.file_io import PathManager
+
+from .vertex_direct_embedder import VertexDirectEmbedder
+from .vertex_feature_embedder import VertexFeatureEmbedder
+
+
+class EmbedderType(Enum):
+ """
+ Embedder type which defines how vertices are mapped into the embedding space:
+ - "vertex_direct": direct vertex embedding
+ - "vertex_feature": embedding vertex features
+ """
+
+ VERTEX_DIRECT = "vertex_direct"
+ VERTEX_FEATURE = "vertex_feature"
+
+
+def create_embedder(embedder_spec: CfgNode, embedder_dim: int) -> nn.Module:
+ """
+ Create an embedder based on the provided configuration
+
+ Args:
+ embedder_spec (CfgNode): embedder configuration
+ embedder_dim (int): embedding space dimensionality
+ Return:
+ An embedder instance for the specified configuration
+ Raises ValueError, in case of unexpected embedder type
+ """
+ embedder_type = EmbedderType(embedder_spec.TYPE)
+ if embedder_type == EmbedderType.VERTEX_DIRECT:
+ embedder = VertexDirectEmbedder(
+ num_vertices=embedder_spec.NUM_VERTICES,
+ embed_dim=embedder_dim,
+ )
+ if embedder_spec.INIT_FILE != "":
+ embedder.load(embedder_spec.INIT_FILE)
+ elif embedder_type == EmbedderType.VERTEX_FEATURE:
+ embedder = VertexFeatureEmbedder(
+ num_vertices=embedder_spec.NUM_VERTICES,
+ feature_dim=embedder_spec.FEATURE_DIM,
+ embed_dim=embedder_dim,
+ train_features=embedder_spec.FEATURES_TRAINABLE,
+ )
+ if embedder_spec.INIT_FILE != "":
+ embedder.load(embedder_spec.INIT_FILE)
+ else:
+ raise ValueError(f"Unexpected embedder type {embedder_type}")
+
+ if not embedder_spec.IS_TRAINABLE:
+ embedder.requires_grad_(False)
+
+ return embedder
+
+
+class Embedder(nn.Module):
+ """
+ Embedder module that serves as a container for embedders to use with different
+ meshes. Extends Module to automatically save / load state dict.
+ """
+
+ DEFAULT_MODEL_CHECKPOINT_PREFIX = "roi_heads.embedder."
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize mesh embedders. An embedder for mesh `i` is stored in a submodule
+ "embedder_{i}".
+
+ Args:
+ cfg (CfgNode): configuration options
+ """
+ super(Embedder, self).__init__()
+ self.mesh_names = set()
+ embedder_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+ logger = logging.getLogger(__name__)
+ for mesh_name, embedder_spec in cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.items():
+ logger.info(f"Adding embedder embedder_{mesh_name} with spec {embedder_spec}")
+ self.add_module(f"embedder_{mesh_name}", create_embedder(embedder_spec, embedder_dim))
+ self.mesh_names.add(mesh_name)
+ if cfg.MODEL.WEIGHTS != "":
+ self.load_from_model_checkpoint(cfg.MODEL.WEIGHTS)
+
+ def load_from_model_checkpoint(self, fpath: str, prefix: Optional[str] = None):
+ if prefix is None:
+ prefix = Embedder.DEFAULT_MODEL_CHECKPOINT_PREFIX
+ state_dict = None
+ if fpath.endswith(".pkl"):
+ with PathManager.open(fpath, "rb") as hFile:
+ state_dict = pickle.load(hFile, encoding="latin1")
+ else:
+ with PathManager.open(fpath, "rb") as hFile:
+ state_dict = torch.load(hFile, map_location=torch.device("cpu"))
+ if state_dict is not None and "model" in state_dict:
+ state_dict_local = {}
+ for key in state_dict["model"]:
+ if key.startswith(prefix):
+ v_key = state_dict["model"][key]
+ if isinstance(v_key, np.ndarray):
+ v_key = torch.from_numpy(v_key)
+ state_dict_local[key[len(prefix) :]] = v_key
+ # non-strict loading to finetune on different meshes
+ self.load_state_dict(state_dict_local, strict=False)
+
+ def forward(self, mesh_name: str) -> torch.Tensor:
+ """
+ Produce vertex embeddings for the specific mesh; vertex embeddings are
+ a tensor of shape [N, D] where:
+ N = number of vertices
+ D = number of dimensions in the embedding space
+ Args:
+ mesh_name (str): name of a mesh for which to obtain vertex embeddings
+ Return:
+ Vertex embeddings, a tensor of shape [N, D]
+ """
+ return getattr(self, f"embedder_{mesh_name}")()
+
+ def has_embeddings(self, mesh_name: str) -> bool:
+ return hasattr(self, f"embedder_{mesh_name}")
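+
+
+# Usage sketch ("smpl_27554" is an example mesh name; actual names come from
+# cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS):
+#
+#     embedder = build_densepose_embedder(cfg)  # Embedder or None
+#     if embedder is not None and embedder.has_embeddings("smpl_27554"):
+#         vertex_embeddings = embedder("smpl_27554")  # tensor [N, D]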
diff --git a/densepose/modeling/cse/utils.py b/densepose/modeling/cse/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb83b1af580ef76d8eddb03980fa14fe97298965
--- /dev/null
+++ b/densepose/modeling/cse/utils.py
@@ -0,0 +1,83 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import torch
+from torch.nn import functional as F
+
+
+def squared_euclidean_distance_matrix(pts1: torch.Tensor, pts2: torch.Tensor) -> torch.Tensor:
+ """
+ Get squared Euclidean Distance Matrix
+ Computes pairwise squared Euclidean distances between points
+
+ Args:
+ pts1: Tensor [M x D], M is the number of points, D is feature dimensionality
+ pts2: Tensor [N x D], N is the number of points, D is feature dimensionality
+
+ Return:
+ Tensor [M, N]: matrix of squared Euclidean distances; at index (m, n)
+ it contains || pts1[m] - pts2[n] ||^2
+ """
+ edm = torch.mm(-2 * pts1, pts2.t())
+ edm += (pts1 * pts1).sum(1, keepdim=True) + (pts2 * pts2).sum(1, keepdim=True).t()
+ return edm.contiguous()
+
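+# Example (sanity check of the formula above): for pts1 = torch.zeros(2, 3) and
+# pts2 = torch.ones(4, 3) the result is a [2, 4] matrix filled with 3.0, since each
+# of the 3 coordinates differs by 1.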
+
+def normalize_embeddings(embeddings: torch.Tensor, epsilon: float = 1e-6) -> torch.Tensor:
+ """
+ Normalize N D-dimensional embedding vectors arranged in a tensor [N, D]
+
+ Args:
+ embeddings (tensor [N, D]): N D-dimensional embedding vectors
+ epsilon (float): minimum value for a vector norm
+ Return:
+ Normalized embeddings (tensor [N, D]), such that L2 vector norms are all equal to 1.
+ """
+ return embeddings / torch.clamp(embeddings.norm(p=None, dim=1, keepdim=True), min=epsilon)
+
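+# Example: normalize_embeddings(torch.tensor([[3.0, 4.0]])) gives [[0.6, 0.8]];
+# each row is divided by its L2 norm (clamped from below by epsilon).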
+
+def get_closest_vertices_mask_from_ES(
+ E: torch.Tensor,
+ S: torch.Tensor,
+ h: int,
+ w: int,
+ mesh_vertex_embeddings: torch.Tensor,
+ device: torch.device,
+):
+ """
+ Interpolate Embeddings and Segmentations to the size of a given bounding box,
+ and compute closest vertices and the segmentation mask
+
+ Args:
+ E (tensor [1, D, H, W]): D-dimensional embedding vectors for every point of the
+ default-sized box
+ S (tensor [1, 2, H, W]): 2-dimensional segmentation mask for every point of the
+ default-sized box
+ h (int): height of the target bounding box
+ w (int): width of the target bounding box
+ mesh_vertex_embeddings (tensor [N, D]): vertex embeddings for a chosen mesh
+ N is the number of vertices in the mesh, D is feature dimensionality
+ device (torch.device): device to move the tensors to
+ Return:
+ Closest Vertices (tensor [h, w]), int, for every point of the resulting box
+ Segmentation mask (tensor [h, w]), boolean, for every point of the resulting box
+ """
+ embedding_resized = F.interpolate(E, size=(h, w), mode="bilinear")[0].to(device)
+ coarse_segm_resized = F.interpolate(S, size=(h, w), mode="bilinear")[0].to(device)
+ mask = coarse_segm_resized.argmax(0) > 0
+ closest_vertices = torch.zeros(mask.shape, dtype=torch.long, device=device)
+ all_embeddings = embedding_resized[:, mask].t()
+ size_chunk = 10_000 # Chunking to avoid possible OOM
+ edm = []
+ if len(all_embeddings) == 0:
+ return closest_vertices, mask
+ for chunk in range((len(all_embeddings) - 1) // size_chunk + 1):
+ chunk_embeddings = all_embeddings[size_chunk * chunk : size_chunk * (chunk + 1)]
+ edm.append(
+ torch.argmin(
+ squared_euclidean_distance_matrix(chunk_embeddings, mesh_vertex_embeddings), dim=1
+ )
+ )
+ closest_vertices[mask] = torch.cat(edm)
+ return closest_vertices, mask
diff --git a/densepose/modeling/cse/vertex_direct_embedder.py b/densepose/modeling/cse/vertex_direct_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..32d92e7786336da0ed9582793620c33a3853195e
--- /dev/null
+++ b/densepose/modeling/cse/vertex_direct_embedder.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import pickle
+import torch
+from torch import nn
+
+from detectron2.utils.file_io import PathManager
+
+from .utils import normalize_embeddings
+
+
+class VertexDirectEmbedder(nn.Module):
+ """
+ Class responsible for embedding vertices. Vertex embeddings take
+ the form of a tensor of size [N, D], where
+ N = number of vertices
+ D = number of dimensions in the embedding space
+ """
+
+ def __init__(self, num_vertices: int, embed_dim: int):
+ """
+ Initialize embedder; embeddings are zero-initialized
+
+ Args:
+ num_vertices (int): number of vertices to embed
+ embed_dim (int): number of dimensions in the embedding space
+ """
+ super(VertexDirectEmbedder, self).__init__()
+ self.embeddings = nn.Parameter(torch.Tensor(num_vertices, embed_dim))
+ self.reset_parameters()
+
+ @torch.no_grad()
+ def reset_parameters(self):
+ """
+ Reset embeddings to zeros (they are populated later via `load` or training)
+ """
+ self.embeddings.zero_()
+
+ def forward(self) -> torch.Tensor:
+ """
+ Produce vertex embeddings, a tensor of shape [N, D] where:
+ N = number of vertices
+ D = number of dimensions in the embedding space
+
+ Return:
+ Full vertex embeddings, a tensor of shape [N, D]
+ """
+ return normalize_embeddings(self.embeddings)
+
+ @torch.no_grad()
+ def load(self, fpath: str):
+ """
+ Load data from a file
+
+ Args:
+ fpath (str): file path to load data from
+ """
+ with PathManager.open(fpath, "rb") as hFile:
+ data = pickle.load(hFile)
+ for name in ["embeddings"]:
+ if name in data:
+ getattr(self, name).copy_(
+ torch.tensor(data[name]).float().to(device=getattr(self, name).device)
+ )
diff --git a/densepose/modeling/cse/vertex_feature_embedder.py b/densepose/modeling/cse/vertex_feature_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb495f88bc5a205e3639d797910c899d6344cca5
--- /dev/null
+++ b/densepose/modeling/cse/vertex_feature_embedder.py
@@ -0,0 +1,77 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import pickle
+import torch
+from torch import nn
+
+from detectron2.utils.file_io import PathManager
+
+from .utils import normalize_embeddings
+
+
+class VertexFeatureEmbedder(nn.Module):
+ """
+ Class responsible for embedding vertex features. Mapping from
+ feature space to the embedding space is a tensor of size [K, D], where
+ K = number of dimensions in the feature space
+ D = number of dimensions in the embedding space
+ Vertex features are a tensor of size [N, K], where
+ N = number of vertices
+ K = number of dimensions in the feature space
+ Vertex embeddings are computed as F * E = tensor of size [N, D]
+ """
+
+ def __init__(
+ self, num_vertices: int, feature_dim: int, embed_dim: int, train_features: bool = False
+ ):
+ """
+ Initialize embedder, set embeddings to zero
+
+ Args:
+ num_vertices (int): number of vertices to embed
+ feature_dim (int): number of dimensions in the feature space
+ embed_dim (int): number of dimensions in the embedding space
+ train_features (bool): determines whether vertex features should
+ be trained (default: False)
+ """
+ super(VertexFeatureEmbedder, self).__init__()
+ if train_features:
+ self.features = nn.Parameter(torch.Tensor(num_vertices, feature_dim))
+ else:
+ self.register_buffer("features", torch.Tensor(num_vertices, feature_dim))
+ self.embeddings = nn.Parameter(torch.Tensor(feature_dim, embed_dim))
+ self.reset_parameters()
+
+ @torch.no_grad()
+ def reset_parameters(self):
+ self.features.zero_()
+ self.embeddings.zero_()
+
+ def forward(self) -> torch.Tensor:
+ """
+ Produce vertex embeddings, a tensor of shape [N, D] where:
+ N = number of vertices
+ D = number of dimensions in the embedding space
+
+ Return:
+ Full vertex embeddings, a tensor of shape [N, D]
+ """
+ return normalize_embeddings(torch.mm(self.features, self.embeddings))
+
+ @torch.no_grad()
+ def load(self, fpath: str):
+ """
+ Load data from a file
+
+ Args:
+ fpath (str): file path to load data from
+ """
+ with PathManager.open(fpath, "rb") as hFile:
+ data = pickle.load(hFile)
+ for name in ["features", "embeddings"]:
+ if name in data:
+ getattr(self, name).copy_(
+ torch.tensor(data[name]).float().to(device=getattr(self, name).device)
+ )
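+
+
+# Illustrative sketch (assumption): with features F of shape [N, K] and the
+# learned map E of shape [K, D], forward() returns normalize(F @ E), i.e. one
+# L2-normalized D-dimensional embedding per vertex.
+#
+#   embedder = VertexFeatureEmbedder(num_vertices=1000, feature_dim=256, embed_dim=16)
+#   embedder.load("vertex_features.pkl")  # hypothetical pickle with "features"/"embeddings" entries
+#   e = embedder()                        # tensor of shape [1000, 16]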
diff --git a/densepose/modeling/densepose_checkpoint.py b/densepose/modeling/densepose_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..c85711e976efdf56f0c6494fd19636e7411be2b4
--- /dev/null
+++ b/densepose/modeling/densepose_checkpoint.py
@@ -0,0 +1,37 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from collections import OrderedDict
+
+from detectron2.checkpoint import DetectionCheckpointer
+
+
+def _rename_HRNet_weights(weights):
+ # We detect and rename HRNet weights for DensePose. 1956 and 1716 are values that are
+ # common to all HRNet pretrained weights, and should be enough to accurately identify them
+ if (
+ len(weights["model"].keys()) == 1956
+ and len([k for k in weights["model"].keys() if k.startswith("stage")]) == 1716
+ ):
+ hrnet_weights = OrderedDict()
+ for k in weights["model"].keys():
+ hrnet_weights["backbone.bottom_up." + str(k)] = weights["model"][k]
+ return {"model": hrnet_weights}
+ else:
+ return weights
+
+
+class DensePoseCheckpointer(DetectionCheckpointer):
+ """
+ Same as :class:`DetectionCheckpointer`, but is able to handle HRNet weights
+ """
+
+ def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
+ super().__init__(model, save_dir, save_to_disk=save_to_disk, **checkpointables)
+
+ def _load_file(self, filename: str) -> object:
+ """
+ Adding hrnet support
+ """
+ weights = super()._load_file(filename)
+ return _rename_HRNet_weights(weights)
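+
+
+# Illustrative note (assumption about typical HRNet key names): a checkpoint key
+# such as "stage2.0.branches.0.0.conv1.weight" is re-exposed as
+# "backbone.bottom_up.stage2.0.branches.0.0.conv1.weight", matching the module
+# hierarchy used when HRNet serves as the bottom-up network of HRFPN.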
diff --git a/densepose/modeling/filter.py b/densepose/modeling/filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..503321004e39c1bd96be3512a3811e33fed4d008
--- /dev/null
+++ b/densepose/modeling/filter.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import List
+import torch
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+from detectron2.structures.boxes import matched_pairwise_iou
+
+
+class DensePoseDataFilter:
+ def __init__(self, cfg: CfgNode):
+ self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD
+ self.keep_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+
+ @torch.no_grad()
+ def __call__(self, features: List[torch.Tensor], proposals_with_targets: List[Instances]):
+ """
+ Filters proposals with targets to keep only the ones relevant for
+ DensePose training
+
+ Args:
+ features (list[Tensor]): input data as a list of features,
+ each feature is a tensor. Axis 0 represents the number of
+ images `N` in the input data; axes 1-3 are channels,
+ height, and width, which may vary between features
+ (e.g., if a feature pyramid is used).
+ proposals_with_targets (list[Instances]): length `N` list of
+ `Instances`. The i-th `Instances` contains instances
+ (proposals, GT) for the i-th input image.
+ Returns:
+ list[Tensor]: filtered features
+ list[Instances]: filtered proposals
+ """
+ proposals_filtered = []
+ # TODO: the commented out code was supposed to correctly deal with situations
+ # where no valid DensePose GT is available for certain images. The corresponding
+ # image features were sliced and proposals were filtered. This led to performance
+ # deterioration, both in terms of runtime and in terms of evaluation results.
+ #
+ # feature_mask = torch.ones(
+ # len(proposals_with_targets),
+ # dtype=torch.bool,
+ # device=features[0].device if len(features) > 0 else torch.device("cpu"),
+ # )
+ for i, proposals_per_image in enumerate(proposals_with_targets):
+ if not proposals_per_image.has("gt_densepose") and (
+ not proposals_per_image.has("gt_masks") or not self.keep_masks
+ ):
+ # feature_mask[i] = 0
+ continue
+ gt_boxes = proposals_per_image.gt_boxes
+ est_boxes = proposals_per_image.proposal_boxes
+ # apply match threshold for densepose head
+ iou = matched_pairwise_iou(gt_boxes, est_boxes)
+ iou_select = iou > self.iou_threshold
+ proposals_per_image = proposals_per_image[iou_select] # pyre-ignore[6]
+
+ N_gt_boxes = len(proposals_per_image.gt_boxes)
+ assert N_gt_boxes == len(proposals_per_image.proposal_boxes), (
+ f"The number of GT boxes {N_gt_boxes} is different from the "
+ f"number of proposal boxes {len(proposals_per_image.proposal_boxes)}"
+ )
+ # filter out any target without suitable annotation
+ if self.keep_masks:
+ gt_masks = (
+ proposals_per_image.gt_masks
+ if hasattr(proposals_per_image, "gt_masks")
+ else [None] * N_gt_boxes
+ )
+ else:
+ gt_masks = [None] * N_gt_boxes
+ gt_densepose = (
+ proposals_per_image.gt_densepose
+ if hasattr(proposals_per_image, "gt_densepose")
+ else [None] * N_gt_boxes
+ )
+ assert len(gt_masks) == N_gt_boxes
+ assert len(gt_densepose) == N_gt_boxes
+ selected_indices = [
+ i
+ for i, (dp_target, mask_target) in enumerate(zip(gt_densepose, gt_masks))
+ if (dp_target is not None) or (mask_target is not None)
+ ]
+ # if not len(selected_indices):
+ # feature_mask[i] = 0
+ # continue
+ if len(selected_indices) != N_gt_boxes:
+ proposals_per_image = proposals_per_image[selected_indices] # pyre-ignore[6]
+ assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes)
+ proposals_filtered.append(proposals_per_image)
+ # features_filtered = [feature[feature_mask] for feature in features]
+ # return features_filtered, proposals_filtered
+ return features, proposals_filtered
diff --git a/densepose/modeling/hrfpn.py b/densepose/modeling/hrfpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a19c3261198798738130267cb4c35022ddf8a9e6
--- /dev/null
+++ b/densepose/modeling/hrfpn.py
@@ -0,0 +1,184 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+"""
+MIT License
+Copyright (c) 2019 Microsoft
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.backbone import BACKBONE_REGISTRY
+from detectron2.modeling.backbone.backbone import Backbone
+
+from .hrnet import build_pose_hrnet_backbone
+
+
+class HRFPN(Backbone):
+ """HRFPN (High Resolution Feature Pyramids)
+ Transforms outputs of HRNet backbone so they are suitable for the ROI_heads
+ arXiv: https://arxiv.org/abs/1904.04514
+ Adapted from https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/hrfpn.py
+ Args:
+ bottom_up (Backbone): HRNet backbone that provides the multi-resolution features
+ in_features (list): names of the input features (output of HRNet)
+ in_channels (list): number of channels for each branch
+ out_channels (int): output channels of feature pyramids
+ n_out_features (int): number of output stages
+ pooling (str): pooling for generating feature pyramids (from {MAX, AVG})
+ share_conv (bool): if True, share a single output conv across all pyramid levels; otherwise use one conv per level
+ """
+
+ def __init__(
+ self,
+ bottom_up,
+ in_features,
+ n_out_features,
+ in_channels,
+ out_channels,
+ pooling="AVG",
+ share_conv=False,
+ ):
+ super(HRFPN, self).__init__()
+ assert isinstance(in_channels, list)
+ self.bottom_up = bottom_up
+ self.in_features = in_features
+ self.n_out_features = n_out_features
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.num_ins = len(in_channels)
+ self.share_conv = share_conv
+
+ if self.share_conv:
+ self.fpn_conv = nn.Conv2d(
+ in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1
+ )
+ else:
+ self.fpn_conv = nn.ModuleList()
+ for _ in range(self.n_out_features):
+ self.fpn_conv.append(
+ nn.Conv2d(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=3,
+ padding=1,
+ )
+ )
+
+ # Custom change: Replaces a simple bilinear interpolation
+ self.interp_conv = nn.ModuleList()
+ for i in range(len(self.in_features)):
+ self.interp_conv.append(
+ nn.Sequential(
+ nn.ConvTranspose2d(
+ in_channels=in_channels[i],
+ out_channels=in_channels[i],
+ kernel_size=4,
+ stride=2**i,
+ padding=0,
+ output_padding=0,
+ bias=False,
+ ),
+ nn.BatchNorm2d(in_channels[i], momentum=0.1),
+ nn.ReLU(inplace=True),
+ )
+ )
+
+ # Custom change: Replaces the usual (reduction conv + pooling) pair with a single strided conv
+ self.reduction_pooling_conv = nn.ModuleList()
+ for i in range(self.n_out_features):
+ self.reduction_pooling_conv.append(
+ nn.Sequential(
+ nn.Conv2d(sum(in_channels), out_channels, kernel_size=2**i, stride=2**i),
+ nn.BatchNorm2d(out_channels, momentum=0.1),
+ nn.ReLU(inplace=True),
+ )
+ )
+
+ if pooling == "MAX":
+ self.pooling = F.max_pool2d
+ else:
+ self.pooling = F.avg_pool2d
+
+ self._out_features = []
+ self._out_feature_channels = {}
+ self._out_feature_strides = {}
+
+ for i in range(self.n_out_features):
+ self._out_features.append("p%d" % (i + 1))
+ self._out_feature_channels.update({self._out_features[-1]: self.out_channels})
+ self._out_feature_strides.update({self._out_features[-1]: 2 ** (i + 2)})
+
+ # default init_weights for conv(msra) and norm in ConvModule
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, a=1)
+ nn.init.constant_(m.bias, 0)
+
+ def forward(self, inputs):
+ bottom_up_features = self.bottom_up(inputs)
+ assert len(bottom_up_features) == len(self.in_features)
+ inputs = [bottom_up_features[f] for f in self.in_features]
+
+ outs = []
+ for i in range(len(inputs)):
+ outs.append(self.interp_conv[i](inputs[i]))
+ shape_2 = min(o.shape[2] for o in outs)
+ shape_3 = min(o.shape[3] for o in outs)
+ out = torch.cat([o[:, :, :shape_2, :shape_3] for o in outs], dim=1)
+ outs = []
+ for i in range(self.n_out_features):
+ outs.append(self.reduction_pooling_conv[i](out))
+ for i in range(len(outs)): # Make shapes consistent
+ outs[-1 - i] = outs[-1 - i][
+ :, :, : outs[-1].shape[2] * 2**i, : outs[-1].shape[3] * 2**i
+ ]
+ outputs = []
+ for i in range(len(outs)):
+ if self.share_conv:
+ outputs.append(self.fpn_conv(outs[i]))
+ else:
+ outputs.append(self.fpn_conv[i](outs[i]))
+
+ assert len(self._out_features) == len(outputs)
+ return dict(zip(self._out_features, outputs))
+
+
+@BACKBONE_REGISTRY.register()
+def build_hrfpn_backbone(cfg, input_shape: ShapeSpec) -> HRFPN:
+
+ in_channels = cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS
+ in_features = ["p%d" % (i + 1) for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES)]
+ n_out_features = len(cfg.MODEL.ROI_HEADS.IN_FEATURES)
+ out_channels = cfg.MODEL.HRNET.HRFPN.OUT_CHANNELS
+ hrnet = build_pose_hrnet_backbone(cfg, input_shape)
+ hrfpn = HRFPN(
+ hrnet,
+ in_features,
+ n_out_features,
+ in_channels,
+ out_channels,
+ pooling="AVG",
+ share_conv=False,
+ )
+
+ return hrfpn
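+
+
+# Summary of the forward pass above (descriptive comment, no behavior change):
+# each HRNet branch is first upsampled by its interp_conv, the branches are
+# concatenated along channels, one strided conv per pyramid level produces
+# p1..pN at strides 4, 8, 16, ..., and a final 3x3 conv refines each level
+# before it is passed to the ROI heads.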
diff --git a/densepose/modeling/hrnet.py b/densepose/modeling/hrnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8e3cab545c7f999300676bb27fa0461abd143e2
--- /dev/null
+++ b/densepose/modeling/hrnet.py
@@ -0,0 +1,476 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (leoxiaobin@gmail.com)
+# Modified by Bowen Cheng (bcheng9@illinois.edu)
+# Adapted from https://github.com/HRNet/Higher-HRNet-Human-Pose-Estimation/blob/master/lib/models/pose_higher_hrnet.py # noqa
+# ------------------------------------------------------------------------------
+
+# pyre-unsafe
+
+from __future__ import absolute_import, division, print_function
+import logging
+import torch.nn as nn
+
+from detectron2.layers import ShapeSpec
+from detectron2.modeling.backbone import BACKBONE_REGISTRY
+from detectron2.modeling.backbone.backbone import Backbone
+
+BN_MOMENTUM = 0.1
+logger = logging.getLogger(__name__)
+
+__all__ = ["build_pose_hrnet_backbone", "PoseHigherResolutionNet"]
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+ """3x3 convolution with padding"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(BasicBlock, self).__init__()
+ self.conv1 = conv3x3(inplanes, planes, stride)
+ self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion, momentum=BN_MOMENTUM)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class HighResolutionModule(nn.Module):
+ """HighResolutionModule
+ Building block of the PoseHigherResolutionNet (see lower)
+ arXiv: https://arxiv.org/abs/1908.10357
+ Args:
+ num_branches (int): number of branches of the module
+ blocks (type): block class used by the module (BasicBlock or Bottleneck)
+ num_blocks (list): number of blocks per branch
+ num_inchannels (list): number of input channels per branch
+ num_channels (list): number of channels of each branch
+ multi_scale_output (bool): only used by the last module of PoseHigherResolutionNet
+ """
+
+ def __init__(
+ self,
+ num_branches,
+ blocks,
+ num_blocks,
+ num_inchannels,
+ num_channels,
+ multi_scale_output=True,
+ ):
+ super(HighResolutionModule, self).__init__()
+ self._check_branches(num_branches, blocks, num_blocks, num_inchannels, num_channels)
+
+ self.num_inchannels = num_inchannels
+ self.num_branches = num_branches
+
+ self.multi_scale_output = multi_scale_output
+
+ self.branches = self._make_branches(num_branches, blocks, num_blocks, num_channels)
+ self.fuse_layers = self._make_fuse_layers()
+ self.relu = nn.ReLU(True)
+
+ def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
+ if num_branches != len(num_blocks):
+ error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format(num_branches, len(num_blocks))
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ if num_branches != len(num_channels):
+ error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format(
+ num_branches, len(num_channels)
+ )
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ if num_branches != len(num_inchannels):
+ error_msg = "NUM_BRANCHES({}) <> NUM_INCHANNELS({})".format(
+ num_branches, len(num_inchannels)
+ )
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1):
+ downsample = None
+ if (
+ stride != 1
+ or self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion
+ ):
+ downsample = nn.Sequential(
+ nn.Conv2d(
+ self.num_inchannels[branch_index],
+ num_channels[branch_index] * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False,
+ ),
+ nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM),
+ )
+
+ layers = []
+ layers.append(
+ block(self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample)
+ )
+ self.num_inchannels[branch_index] = num_channels[branch_index] * block.expansion
+ for _ in range(1, num_blocks[branch_index]):
+ layers.append(block(self.num_inchannels[branch_index], num_channels[branch_index]))
+
+ return nn.Sequential(*layers)
+
+ def _make_branches(self, num_branches, block, num_blocks, num_channels):
+ branches = []
+
+ for i in range(num_branches):
+ branches.append(self._make_one_branch(i, block, num_blocks, num_channels))
+
+ return nn.ModuleList(branches)
+
+ def _make_fuse_layers(self):
+ if self.num_branches == 1:
+ return None
+
+ num_branches = self.num_branches
+ num_inchannels = self.num_inchannels
+ fuse_layers = []
+ for i in range(num_branches if self.multi_scale_output else 1):
+ fuse_layer = []
+ for j in range(num_branches):
+ if j > i:
+ fuse_layer.append(
+ nn.Sequential(
+ nn.Conv2d(num_inchannels[j], num_inchannels[i], 1, 1, 0, bias=False),
+ nn.BatchNorm2d(num_inchannels[i]),
+ nn.Upsample(scale_factor=2 ** (j - i), mode="nearest"),
+ )
+ )
+ elif j == i:
+ fuse_layer.append(None)
+ else:
+ conv3x3s = []
+ for k in range(i - j):
+ if k == i - j - 1:
+ num_outchannels_conv3x3 = num_inchannels[i]
+ conv3x3s.append(
+ nn.Sequential(
+ nn.Conv2d(
+ num_inchannels[j],
+ num_outchannels_conv3x3,
+ 3,
+ 2,
+ 1,
+ bias=False,
+ ),
+ nn.BatchNorm2d(num_outchannels_conv3x3),
+ )
+ )
+ else:
+ num_outchannels_conv3x3 = num_inchannels[j]
+ conv3x3s.append(
+ nn.Sequential(
+ nn.Conv2d(
+ num_inchannels[j],
+ num_outchannels_conv3x3,
+ 3,
+ 2,
+ 1,
+ bias=False,
+ ),
+ nn.BatchNorm2d(num_outchannels_conv3x3),
+ nn.ReLU(True),
+ )
+ )
+ fuse_layer.append(nn.Sequential(*conv3x3s))
+ fuse_layers.append(nn.ModuleList(fuse_layer))
+
+ return nn.ModuleList(fuse_layers)
+
+ def get_num_inchannels(self):
+ return self.num_inchannels
+
+ def forward(self, x):
+ if self.num_branches == 1:
+ return [self.branches[0](x[0])]
+
+ for i in range(self.num_branches):
+ x[i] = self.branches[i](x[i])
+
+ x_fuse = []
+
+ for i in range(len(self.fuse_layers)):
+ y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
+ for j in range(1, self.num_branches):
+ if i == j:
+ y = y + x[j]
+ else:
+ z = self.fuse_layers[i][j](x[j])[:, :, : y.shape[2], : y.shape[3]]
+ y = y + z
+ x_fuse.append(self.relu(y))
+
+ return x_fuse
+
+
+blocks_dict = {"BASIC": BasicBlock, "BOTTLENECK": Bottleneck}
+
+
+class PoseHigherResolutionNet(Backbone):
+ """PoseHigherResolutionNet
+ Composed of several HighResolutionModule tied together with ConvNets
+ Adapted from the GitHub version to fit with HRFPN and the Detectron2 infrastructure
+ arXiv: https://arxiv.org/abs/1908.10357
+ """
+
+ def __init__(self, cfg, **kwargs):
+ self.inplanes = cfg.MODEL.HRNET.STEM_INPLANES
+ super(PoseHigherResolutionNet, self).__init__()
+
+ # stem net
+ self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+ self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+ self.relu = nn.ReLU(inplace=True)
+ self.layer1 = self._make_layer(Bottleneck, 64, 4)
+
+ self.stage2_cfg = cfg.MODEL.HRNET.STAGE2
+ num_channels = self.stage2_cfg.NUM_CHANNELS
+ block = blocks_dict[self.stage2_cfg.BLOCK]
+ num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+ self.transition1 = self._make_transition_layer([256], num_channels)
+ self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels)
+
+ self.stage3_cfg = cfg.MODEL.HRNET.STAGE3
+ num_channels = self.stage3_cfg.NUM_CHANNELS
+ block = blocks_dict[self.stage3_cfg.BLOCK]
+ num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+ self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels)
+ self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels)
+
+ self.stage4_cfg = cfg.MODEL.HRNET.STAGE4
+ num_channels = self.stage4_cfg.NUM_CHANNELS
+ block = blocks_dict[self.stage4_cfg.BLOCK]
+ num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))]
+ self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels)
+ self.stage4, pre_stage_channels = self._make_stage(
+ self.stage4_cfg, num_channels, multi_scale_output=True
+ )
+
+ self._out_features = []
+ self._out_feature_channels = {}
+ self._out_feature_strides = {}
+
+ for i in range(cfg.MODEL.HRNET.STAGE4.NUM_BRANCHES):
+ self._out_features.append("p%d" % (i + 1))
+ self._out_feature_channels.update(
+ {self._out_features[-1]: cfg.MODEL.HRNET.STAGE4.NUM_CHANNELS[i]}
+ )
+ self._out_feature_strides.update({self._out_features[-1]: 1})
+
+ def _get_deconv_cfg(self, deconv_kernel):
+ if deconv_kernel == 4:
+ padding = 1
+ output_padding = 0
+ elif deconv_kernel == 3:
+ padding = 1
+ output_padding = 1
+ elif deconv_kernel == 2:
+ padding = 0
+ output_padding = 0
+
+ return deconv_kernel, padding, output_padding
+
+ def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer):
+ num_branches_cur = len(num_channels_cur_layer)
+ num_branches_pre = len(num_channels_pre_layer)
+
+ transition_layers = []
+ for i in range(num_branches_cur):
+ if i < num_branches_pre:
+ if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+ transition_layers.append(
+ nn.Sequential(
+ nn.Conv2d(
+ num_channels_pre_layer[i],
+ num_channels_cur_layer[i],
+ 3,
+ 1,
+ 1,
+ bias=False,
+ ),
+ nn.BatchNorm2d(num_channels_cur_layer[i]),
+ nn.ReLU(inplace=True),
+ )
+ )
+ else:
+ transition_layers.append(None)
+ else:
+ conv3x3s = []
+ for j in range(i + 1 - num_branches_pre):
+ inchannels = num_channels_pre_layer[-1]
+ outchannels = (
+ num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels
+ )
+ conv3x3s.append(
+ nn.Sequential(
+ nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False),
+ nn.BatchNorm2d(outchannels),
+ nn.ReLU(inplace=True),
+ )
+ )
+ transition_layers.append(nn.Sequential(*conv3x3s))
+
+ return nn.ModuleList(transition_layers)
+
+ def _make_layer(self, block, planes, blocks, stride=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(
+ self.inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False,
+ ),
+ nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+ )
+
+ layers = []
+ layers.append(block(self.inplanes, planes, stride, downsample))
+ self.inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(block(self.inplanes, planes))
+
+ return nn.Sequential(*layers)
+
+ def _make_stage(self, layer_config, num_inchannels, multi_scale_output=True):
+ num_modules = layer_config["NUM_MODULES"]
+ num_branches = layer_config["NUM_BRANCHES"]
+ num_blocks = layer_config["NUM_BLOCKS"]
+ num_channels = layer_config["NUM_CHANNELS"]
+ block = blocks_dict[layer_config["BLOCK"]]
+
+ modules = []
+ for i in range(num_modules):
+ # multi_scale_output is only used by the last module
+ if not multi_scale_output and i == num_modules - 1:
+ reset_multi_scale_output = False
+ else:
+ reset_multi_scale_output = True
+
+ modules.append(
+ HighResolutionModule(
+ num_branches,
+ block,
+ num_blocks,
+ num_inchannels,
+ num_channels,
+ reset_multi_scale_output,
+ )
+ )
+ num_inchannels = modules[-1].get_num_inchannels()
+
+ return nn.Sequential(*modules), num_inchannels
+
+ def forward(self, x):
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.conv2(x)
+ x = self.bn2(x)
+ x = self.relu(x)
+ x = self.layer1(x)
+
+ x_list = []
+ for i in range(self.stage2_cfg.NUM_BRANCHES):
+ if self.transition1[i] is not None:
+ x_list.append(self.transition1[i](x))
+ else:
+ x_list.append(x)
+ y_list = self.stage2(x_list)
+
+ x_list = []
+ for i in range(self.stage3_cfg.NUM_BRANCHES):
+ if self.transition2[i] is not None:
+ x_list.append(self.transition2[i](y_list[-1]))
+ else:
+ x_list.append(y_list[i])
+ y_list = self.stage3(x_list)
+
+ x_list = []
+ for i in range(self.stage4_cfg.NUM_BRANCHES):
+ if self.transition3[i] is not None:
+ x_list.append(self.transition3[i](y_list[-1]))
+ else:
+ x_list.append(y_list[i])
+ y_list = self.stage4(x_list)
+
+ assert len(self._out_features) == len(y_list)
+ return dict(zip(self._out_features, y_list)) # final_outputs
+
+
+@BACKBONE_REGISTRY.register()
+def build_pose_hrnet_backbone(cfg, input_shape: ShapeSpec):
+ model = PoseHigherResolutionNet(cfg)
+ return model
diff --git a/densepose/modeling/inference.py b/densepose/modeling/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..a797ff9b28e61827f5553045a6147ff3390d9fe3
--- /dev/null
+++ b/densepose/modeling/inference.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from dataclasses import fields
+from typing import Any, List
+import torch
+
+from detectron2.structures import Instances
+
+
+def densepose_inference(densepose_predictor_output: Any, detections: List[Instances]) -> None:
+ """
+ Splits DensePose predictor outputs into chunks, each chunk corresponds to
+ detections on one image. Predictor output chunks are stored in `pred_densepose`
+ attribute of the corresponding `Instances` object.
+
+ Args:
+ densepose_predictor_output: a dataclass instance (can be of different types,
+ depending on predictor used for inference). Each field can be `None`
+ (if the corresponding output was not inferred) or a tensor of size
+ [N, ...], where N = N_1 + N_2 + .. + N_k is a total number of
+ detections on all images, N_1 is the number of detections on image 1,
+ N_2 is the number of detections on image 2, etc.
+ detections: a list of objects of type `Instance`, k-th object corresponds
+ to detections on k-th image.
+ """
+ k = 0
+ for detection_i in detections:
+ if densepose_predictor_output is None:
+ # don't add `pred_densepose` attribute
+ continue
+ n_i = len(detection_i)
+
+ PredictorOutput = type(densepose_predictor_output)
+ output_i_dict = {}
+ # we assume here that `densepose_predictor_output` is a dataclass object
+ for field in fields(densepose_predictor_output):
+ field_value = getattr(densepose_predictor_output, field.name)
+ # slice tensors
+ if isinstance(field_value, torch.Tensor):
+ output_i_dict[field.name] = field_value[k : k + n_i]
+ # leave others as is
+ else:
+ output_i_dict[field.name] = field_value
+ detection_i.pred_densepose = PredictorOutput(**output_i_dict)
+ k += n_i
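+
+
+# Illustrative example (assumption): for a 3-image batch with detections of
+# lengths (3, 1, 2), a predictor output tensor of size [6, ...] is sliced into
+# chunks [0:3], [3:4] and [4:6], and each chunk is attached to the matching
+# Instances object as `pred_densepose`.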
diff --git a/densepose/modeling/losses/__init__.py b/densepose/modeling/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b028a23924b030e0bac4d554b61ed34f3110a798
--- /dev/null
+++ b/densepose/modeling/losses/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .chart import DensePoseChartLoss
+from .chart_with_confidences import DensePoseChartWithConfidenceLoss
+from .cse import DensePoseCseLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+
+
+__all__ = [
+ "DensePoseChartLoss",
+ "DensePoseChartWithConfidenceLoss",
+ "DensePoseCseLoss",
+ "DENSEPOSE_LOSS_REGISTRY",
+]
diff --git a/densepose/modeling/losses/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/losses/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3a3ce2d26b95688e13ea6a6126f2567e1966d87
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/chart.cpython-39.pyc b/densepose/modeling/losses/__pycache__/chart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..029fd17dbf7e3d1f385e13b45fc7527f73e4e463
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/chart.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/chart_with_confidences.cpython-39.pyc b/densepose/modeling/losses/__pycache__/chart_with_confidences.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a88b5814984b5d1f0c9eeb592cb4bc8e1d50de0e
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/chart_with_confidences.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/cse.cpython-39.pyc b/densepose/modeling/losses/__pycache__/cse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7cda1964edaa396b5a94619f342eb6039a70ee80
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/cse.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/cycle_pix2shape.cpython-39.pyc b/densepose/modeling/losses/__pycache__/cycle_pix2shape.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3706fb0ab25036a20301e12d9502af8cf4eddb7d
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/cycle_pix2shape.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/cycle_shape2shape.cpython-39.pyc b/densepose/modeling/losses/__pycache__/cycle_shape2shape.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdd1c216b1509bc4de0b3a6661102b59f5a77cdf
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/cycle_shape2shape.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/embed.cpython-39.pyc b/densepose/modeling/losses/__pycache__/embed.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef99139f9886765ea30c9995c7255bd92fba7d6d
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/embed.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/embed_utils.cpython-39.pyc b/densepose/modeling/losses/__pycache__/embed_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca295b2a5e5f8545c8990e5729e8633aff3d2b84
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/embed_utils.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/mask.cpython-39.pyc b/densepose/modeling/losses/__pycache__/mask.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf588c5d09d0db7b4f486626761f73542edacdea
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/mask.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/mask_or_segm.cpython-39.pyc b/densepose/modeling/losses/__pycache__/mask_or_segm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..890fd6b429bd56cea359d09e37e59cf455f705d5
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/mask_or_segm.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/registry.cpython-39.pyc b/densepose/modeling/losses/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bab89c7a92e32b41e8ac7448492dee61e86bc17
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/registry.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/segm.cpython-39.pyc b/densepose/modeling/losses/__pycache__/segm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..312e345d42fd61314ece091afeca1c5f67e17ab8
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/segm.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/soft_embed.cpython-39.pyc b/densepose/modeling/losses/__pycache__/soft_embed.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f32e803597d181a739ab8ca712483e4e9e0f2af0
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/soft_embed.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/__pycache__/utils.cpython-39.pyc b/densepose/modeling/losses/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a86ee0660ff642cc8b786af4ba756990b9c031f9
Binary files /dev/null and b/densepose/modeling/losses/__pycache__/utils.cpython-39.pyc differ
diff --git a/densepose/modeling/losses/chart.py b/densepose/modeling/losses/chart.py
new file mode 100644
index 0000000000000000000000000000000000000000..770648f3d3fddbfc553c18a3e7f5101396913593
--- /dev/null
+++ b/densepose/modeling/losses/chart.py
@@ -0,0 +1,293 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any, List
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .mask_or_segm import MaskOrSegmentationLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+from .utils import (
+ BilinearInterpolationHelper,
+ ChartBasedAnnotationsAccumulator,
+ LossDict,
+ extract_packed_annotations_from_matches,
+)
+
+
+@DENSEPOSE_LOSS_REGISTRY.register()
+class DensePoseChartLoss:
+ """
+ DensePose loss for chart-based training. A mesh is split into charts,
+ each chart is given a label (I) and parametrized by 2 coordinates referred to
+ as U and V. Ground truth consists of a number of points annotated with
+ I, U and V values and coarse segmentation S defined for all pixels of the
+ object bounding box. In some cases (see `COARSE_SEGM_TRAINED_BY_MASKS`),
+ semantic segmentation annotations can be used as ground truth inputs as well.
+
+ Estimated values are tensors:
+ * U coordinates, tensor of shape [N, C, S, S]
+ * V coordinates, tensor of shape [N, C, S, S]
+ * fine segmentation estimates, tensor of shape [N, C, S, S] with raw unnormalized
+ scores for each fine segmentation label at each location
+ * coarse segmentation estimates, tensor of shape [N, D, S, S] with raw unnormalized
+ scores for each coarse segmentation label at each location
+ where N is the number of detections, C is the number of fine segmentation
+ labels, S is the estimate size ( = width = height) and D is the number of
+ coarse segmentation channels.
+
+ The losses are:
+ * regression (smooth L1) loss for U and V coordinates
+ * cross entropy loss for fine (I) and coarse (S) segmentations
+ Each loss has an associated weight
+ """
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize chart-based loss from configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ """
+ # fmt: off
+ self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+ self.w_points = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
+ self.w_part = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
+ self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
+ self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+ # fmt: on
+ self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+ self.segm_loss = MaskOrSegmentationLoss(cfg)
+
+ def __call__(
+ self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs
+ ) -> LossDict:
+ """
+ Produce chart-based DensePose losses
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated ground truth data
+ densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+ with estimated values; assumed to have the following attributes:
+ * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+ * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+ * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ where N is the number of detections, C is the number of fine segmentation
+ labels, S is the estimate size ( = width = height) and D is the number of
+ coarse segmentation channels.
+
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
+ * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
+ * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
+ segmentation estimates given ground truth labels;
+ * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse
+ segmentation estimates given ground truth labels;
+ """
+ # densepose outputs are computed for all images and all bounding boxes;
+ # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
+ # the outputs will have size(0) == 3+1+2+1 == 7
+
+ if not len(proposals_with_gt):
+ return self.produce_fake_densepose_losses(densepose_predictor_outputs)
+
+ accumulator = ChartBasedAnnotationsAccumulator()
+ packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator)
+
+ # NOTE: we need to keep the same computation graph on all the GPUs to
+ # perform reduction properly. Hence even if we have no data on one
+ # of the GPUs, we still need to generate the computation graph.
+ # Add fake (zero) loss in the form Tensor.sum() * 0
+ if packed_annotations is None:
+ return self.produce_fake_densepose_losses(densepose_predictor_outputs)
+
+ h, w = densepose_predictor_outputs.u.shape[2:]
+ interpolator = BilinearInterpolationHelper.from_matches(
+ packed_annotations,
+ (h, w),
+ )
+
+ j_valid_fg = interpolator.j_valid * ( # pyre-ignore[16]
+ packed_annotations.fine_segm_labels_gt > 0
+ )
+ # pyre-fixme[6]: For 1st param expected `Tensor` but got `int`.
+ if not torch.any(j_valid_fg):
+ return self.produce_fake_densepose_losses(densepose_predictor_outputs)
+
+ losses_uv = self.produce_densepose_losses_uv(
+ proposals_with_gt,
+ densepose_predictor_outputs,
+ packed_annotations,
+ interpolator,
+ j_valid_fg, # pyre-ignore[6]
+ )
+
+ losses_segm = self.produce_densepose_losses_segm(
+ proposals_with_gt,
+ densepose_predictor_outputs,
+ packed_annotations,
+ interpolator,
+ j_valid_fg, # pyre-ignore[6]
+ )
+
+ return {**losses_uv, **losses_segm}
+
+ def produce_fake_densepose_losses(self, densepose_predictor_outputs: Any) -> LossDict:
+ """
+ Fake losses for fine segmentation and U/V coordinates. These are used when
+ no suitable ground truth data was found in a batch. The loss has a value 0
+ and is primarily used to construct the computation graph, so that
+ `DistributedDataParallel` has similar graphs on all GPUs and can perform
+ reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have the following attributes:
+ * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+ * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_U`: has value 0
+ * `loss_densepose_V`: has value 0
+ * `loss_densepose_I`: has value 0
+ * `loss_densepose_S`: has value 0
+ """
+ losses_uv = self.produce_fake_densepose_losses_uv(densepose_predictor_outputs)
+ losses_segm = self.produce_fake_densepose_losses_segm(densepose_predictor_outputs)
+ return {**losses_uv, **losses_segm}
+
+ def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
+ """
+ Fake losses for U/V coordinates. These are used when no suitable ground
+ truth data was found in a batch. The loss has a value 0
+ and is primarily used to construct the computation graph, so that
+ `DistributedDataParallel` has similar graphs on all GPUs and can perform
+ reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have the following attributes:
+ * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_U`: has value 0
+ * `loss_densepose_V`: has value 0
+ """
+ return {
+ "loss_densepose_U": densepose_predictor_outputs.u.sum() * 0,
+ "loss_densepose_V": densepose_predictor_outputs.v.sum() * 0,
+ }
+
+ def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict:
+ """
+ Fake losses for fine / coarse segmentation. These are used when
+ no suitable ground truth data was found in a batch. The loss has a value 0
+ and is primarily used to construct the computation graph, so that
+ `DistributedDataParallel` has similar graphs on all GPUs and can perform
+ reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have the following attributes:
+ * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+ * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_I`: has value 0
+ * `loss_densepose_S`: has value 0, added only if `segm_trained_by_masks` is False
+ """
+ losses = {
+ "loss_densepose_I": densepose_predictor_outputs.fine_segm.sum() * 0,
+ "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
+ }
+ return losses
+
+ def produce_densepose_losses_uv(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: Any,
+ interpolator: BilinearInterpolationHelper,
+ j_valid_fg: torch.Tensor,
+ ) -> LossDict:
+ """
+ Compute losses for U/V coordinates: smooth L1 loss between
+ estimated coordinates and the ground truth.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated ground truth data
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have the following attributes:
+ * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
+ * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
+ """
+ u_gt = packed_annotations.u_gt[j_valid_fg]
+ u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg]
+ v_gt = packed_annotations.v_gt[j_valid_fg]
+ v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg]
+ return {
+ "loss_densepose_U": F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points,
+ "loss_densepose_V": F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points,
+ }
+
+ def produce_densepose_losses_segm(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: Any,
+ interpolator: BilinearInterpolationHelper,
+ j_valid_fg: torch.Tensor,
+ ) -> LossDict:
+ """
+ Losses for fine / coarse segmentation: cross-entropy
+ for segmentation unnormalized scores given ground truth labels at
+ annotated points for fine segmentation and dense mask annotations
+ for coarse segmentation.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated ground truth data
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have the following attributes:
+ * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+ * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
+ segmentation estimates given ground truth labels
+ * `loss_densepose_S`: cross entropy for raw unnormalized scores for coarse
+ segmentation estimates given ground truth labels;
+ may be included if coarse segmentation is only trained
+ using DensePose ground truth; if additional supervision through
+ instance segmentation data is performed (`segm_trained_by_masks` is True),
+ this loss is handled by `produce_mask_losses` instead
+ """
+ fine_segm_gt = packed_annotations.fine_segm_labels_gt[
+ interpolator.j_valid # pyre-ignore[16]
+ ]
+ fine_segm_est = interpolator.extract_at_points(
+ densepose_predictor_outputs.fine_segm,
+ slice_fine_segm=slice(None),
+ w_ylo_xlo=interpolator.w_ylo_xlo[:, None], # pyre-ignore[16]
+ w_ylo_xhi=interpolator.w_ylo_xhi[:, None], # pyre-ignore[16]
+ w_yhi_xlo=interpolator.w_yhi_xlo[:, None], # pyre-ignore[16]
+ w_yhi_xhi=interpolator.w_yhi_xhi[:, None], # pyre-ignore[16]
+ )[interpolator.j_valid, :]
+ return {
+ "loss_densepose_I": F.cross_entropy(fine_segm_est, fine_segm_gt.long()) * self.w_part,
+ "loss_densepose_S": self.segm_loss(
+ proposals_with_gt, densepose_predictor_outputs, packed_annotations
+ )
+ * self.w_segm,
+ }
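+
+
+# Descriptive note (assumption, summarizing the weighting above): the total
+# chart-based loss is
+#   w_points * (smooth_l1(U) + smooth_l1(V)) + w_part * CE(fine_segm) + w_segm * CE(coarse_segm),
+# where the U/V and fine-segmentation terms are evaluated only at annotated
+# points and the coarse term may be delegated to the mask loss when
+# COARSE_SEGM_TRAINED_BY_MASKS is enabled.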
diff --git a/densepose/modeling/losses/chart_with_confidences.py b/densepose/modeling/losses/chart_with_confidences.py
new file mode 100644
index 0000000000000000000000000000000000000000..d061488d7d5fb8fe0e220e7dfe3f03ea2eda7977
--- /dev/null
+++ b/densepose/modeling/losses/chart_with_confidences.py
@@ -0,0 +1,211 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import math
+from typing import Any, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .. import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
+from .chart import DensePoseChartLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+from .utils import BilinearInterpolationHelper, LossDict
+
+
+@DENSEPOSE_LOSS_REGISTRY.register()
+class DensePoseChartWithConfidenceLoss(DensePoseChartLoss):
+ """ """
+
+ def __init__(self, cfg: CfgNode):
+ super().__init__(cfg)
+ self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+ if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+ self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss(
+ self.confidence_model_cfg.uv_confidence.epsilon
+ )
+ elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
+ self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss(
+ self.confidence_model_cfg.uv_confidence.epsilon
+ )
+
+ def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
+ """
+ Overrides fake losses for fine segmentation and U/V coordinates to
+ include computation graphs for additional confidence parameters.
+ These are used when no suitable ground truth data was found in a batch.
+ The loss has a value 0 and is primarily used to construct the computation graph,
+ so that `DistributedDataParallel` has similar graphs on all GPUs and can
+ perform reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have the following attributes:
+ * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
+ * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
+ Return:
+ dict: str -> tensor: dict of losses with the following entries:
+ * `loss_densepose_UV`: has value 0 (when confidence estimation is enabled)
+ * `loss_densepose_U`, `loss_densepose_V`: have value 0 (otherwise)
+ """
+ conf_type = self.confidence_model_cfg.uv_confidence.type
+ if self.confidence_model_cfg.uv_confidence.enabled:
+ loss_uv = (
+ densepose_predictor_outputs.u.sum() + densepose_predictor_outputs.v.sum()
+ ) * 0
+ if conf_type == DensePoseUVConfidenceType.IID_ISO:
+ loss_uv += densepose_predictor_outputs.sigma_2.sum() * 0
+ elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO:
+ loss_uv += (
+ densepose_predictor_outputs.sigma_2.sum()
+ + densepose_predictor_outputs.kappa_u.sum()
+ + densepose_predictor_outputs.kappa_v.sum()
+ ) * 0
+ return {"loss_densepose_UV": loss_uv}
+ else:
+ return super().produce_fake_densepose_losses_uv(densepose_predictor_outputs)
+
+ def produce_densepose_losses_uv(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: Any,
+ interpolator: BilinearInterpolationHelper,
+ j_valid_fg: torch.Tensor,
+ ) -> LossDict:
+ conf_type = self.confidence_model_cfg.uv_confidence.type
+ if self.confidence_model_cfg.uv_confidence.enabled:
+ u_gt = packed_annotations.u_gt[j_valid_fg]
+ u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg]
+ v_gt = packed_annotations.v_gt[j_valid_fg]
+ v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg]
+ sigma_2_est = interpolator.extract_at_points(densepose_predictor_outputs.sigma_2)[
+ j_valid_fg
+ ]
+ if conf_type == DensePoseUVConfidenceType.IID_ISO:
+ return {
+ "loss_densepose_UV": (
+ self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt)
+ * self.w_points
+ )
+ }
+ elif conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]:
+ kappa_u_est = interpolator.extract_at_points(densepose_predictor_outputs.kappa_u)[
+ j_valid_fg
+ ]
+ kappa_v_est = interpolator.extract_at_points(densepose_predictor_outputs.kappa_v)[
+ j_valid_fg
+ ]
+ return {
+ "loss_densepose_UV": (
+ self.uv_loss_with_confidences(
+ u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt
+ )
+ * self.w_points
+ )
+ }
+ return super().produce_densepose_losses_uv(
+ proposals_with_gt,
+ densepose_predictor_outputs,
+ packed_annotations,
+ interpolator,
+ j_valid_fg,
+ )
+
+
+class IIDIsotropicGaussianUVLoss(nn.Module):
+ """
+ Loss for the case of iid residuals with isotropic covariance:
+ $Sigma_i = sigma_i^2 I$
+ The loss (negative log likelihood) is then:
+ $1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$,
+ where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
+ difference between estimated and ground truth UV values
+ For details, see:
+ N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+ Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+ """
+
+ def __init__(self, sigma_lower_bound: float):
+ super(IIDIsotropicGaussianUVLoss, self).__init__()
+ self.sigma_lower_bound = sigma_lower_bound
+ self.log2pi = math.log(2 * math.pi)
+
+ def forward(
+ self,
+ u: torch.Tensor,
+ v: torch.Tensor,
+ sigma_u: torch.Tensor,
+ target_u: torch.Tensor,
+ target_v: torch.Tensor,
+ ):
+ # compute $\sigma_i^2$
+ # use sigma_lower_bound to avoid degenerate solution for variance
+ # (sigma -> 0)
+ sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
+ # compute \|delta_i\|^2
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+ delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2
+ # the total loss from the formula above:
+ loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2)
+ return loss.sum()
+
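+# Note (assumption, descriptive only): softplus(sigma_u) + sigma_lower_bound keeps
+# the predicted variance strictly positive, so the log and the division above are
+# well defined; up to an additive constant, each summand is the negative log
+# likelihood of a 2D isotropic Gaussian with variance sigma2.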
+
+class IndepAnisotropicGaussianUVLoss(nn.Module):
+ """
+ Loss for the case of independent residuals with anisotropic covariances:
+ $Sigma_i = sigma_i^2 I + r_i r_i^T$
+ The loss (negative log likelihood) is then:
+ $1/2 sum_{i=1}^n (log(2 pi)
+ + log sigma_i^2 (sigma_i^2 + ||r_i||^2)
+ + ||delta_i||^2 / sigma_i^2
+ - <delta_i, r_i>^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$,
+ where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates
+ difference between estimated and ground truth UV values
+ For details, see:
+ N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+ Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+ """
+
+ def __init__(self, sigma_lower_bound: float):
+ super(IndepAnisotropicGaussianUVLoss, self).__init__()
+ self.sigma_lower_bound = sigma_lower_bound
+ self.log2pi = math.log(2 * math.pi)
+
+ def forward(
+ self,
+ u: torch.Tensor,
+ v: torch.Tensor,
+ sigma_u: torch.Tensor,
+ kappa_u_est: torch.Tensor,
+ kappa_v_est: torch.Tensor,
+ target_u: torch.Tensor,
+ target_v: torch.Tensor,
+ ):
+ # compute $\sigma_i^2$
+ sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound
+ # compute \|r_i\|^2
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+ r_sqnorm2 = kappa_u_est**2 + kappa_v_est**2
+ delta_u = u - target_u
+ delta_v = v - target_v
+ # compute \|delta_i\|^2
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+ delta_sqnorm = delta_u**2 + delta_v**2
+ delta_u_r_u = delta_u * kappa_u_est
+ delta_v_r_v = delta_v * kappa_v_est
+ # compute the scalar product <delta_i, r_i>
+ delta_r = delta_u_r_u + delta_v_r_v
+ # compute squared scalar product <delta_i, r_i>^2
+ # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`.
+ delta_r_sqnorm = delta_r**2
+ denom2 = sigma2 * (sigma2 + r_sqnorm2)
+ loss = 0.5 * (
+ self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2
+ )
+ return loss.sum()
diff --git a/densepose/modeling/losses/cse.py b/densepose/modeling/losses/cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffe219c5474392da8048bcf409257cbfce817236
--- /dev/null
+++ b/densepose/modeling/losses/cse.py
@@ -0,0 +1,117 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from typing import Any, List
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .cycle_pix2shape import PixToShapeCycleLoss
+from .cycle_shape2shape import ShapeToShapeCycleLoss
+from .embed import EmbeddingLoss
+from .embed_utils import CseAnnotationsAccumulator
+from .mask_or_segm import MaskOrSegmentationLoss
+from .registry import DENSEPOSE_LOSS_REGISTRY
+from .soft_embed import SoftEmbeddingLoss
+from .utils import BilinearInterpolationHelper, LossDict, extract_packed_annotations_from_matches
+
+
+@DENSEPOSE_LOSS_REGISTRY.register()
+class DensePoseCseLoss:
+ """ """
+
+ _EMBED_LOSS_REGISTRY = {
+ EmbeddingLoss.__name__: EmbeddingLoss,
+ SoftEmbeddingLoss.__name__: SoftEmbeddingLoss,
+ }
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize CSE loss from configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ """
+ self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
+ self.w_embed = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_WEIGHT
+ self.segm_loss = MaskOrSegmentationLoss(cfg)
+ self.embed_loss = DensePoseCseLoss.create_embed_loss(cfg)
+ self.do_shape2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.ENABLED
+ if self.do_shape2shape:
+ self.w_shape2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.WEIGHT
+ self.shape2shape_loss = ShapeToShapeCycleLoss(cfg)
+ self.do_pix2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.ENABLED
+ if self.do_pix2shape:
+ self.w_pix2shape = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.WEIGHT
+ self.pix2shape_loss = PixToShapeCycleLoss(cfg)
+
+ @classmethod
+ def create_embed_loss(cls, cfg: CfgNode):
+ # registry not used here, since embedding losses are currently local
+ # and are not used anywhere else
+ return cls._EMBED_LOSS_REGISTRY[cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_LOSS_NAME](cfg)
+
+ def __call__(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ embedder: nn.Module,
+ ) -> LossDict:
+ if not len(proposals_with_gt):
+ return self.produce_fake_losses(densepose_predictor_outputs, embedder)
+ accumulator = CseAnnotationsAccumulator()
+ packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator)
+ if packed_annotations is None:
+ return self.produce_fake_losses(densepose_predictor_outputs, embedder)
+ h, w = densepose_predictor_outputs.embedding.shape[2:]
+ interpolator = BilinearInterpolationHelper.from_matches(
+ packed_annotations,
+ (h, w),
+ )
+ meshid_to_embed_losses = self.embed_loss(
+ proposals_with_gt,
+ densepose_predictor_outputs,
+ packed_annotations,
+ interpolator,
+ embedder,
+ )
+ embed_loss_dict = {
+ f"loss_densepose_E{meshid}": self.w_embed * meshid_to_embed_losses[meshid]
+ for meshid in meshid_to_embed_losses
+ }
+ all_loss_dict = {
+ "loss_densepose_S": self.w_segm
+ * self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations),
+ **embed_loss_dict,
+ }
+ if self.do_shape2shape:
+ all_loss_dict["loss_shape2shape"] = self.w_shape2shape * self.shape2shape_loss(embedder)
+ if self.do_pix2shape:
+ all_loss_dict["loss_pix2shape"] = self.w_pix2shape * self.pix2shape_loss(
+ proposals_with_gt, densepose_predictor_outputs, packed_annotations, embedder
+ )
+ return all_loss_dict
+
+ def produce_fake_losses(
+ self, densepose_predictor_outputs: Any, embedder: nn.Module
+ ) -> LossDict:
+ meshname_to_embed_losses = self.embed_loss.fake_values(
+ densepose_predictor_outputs, embedder=embedder
+ )
+ embed_loss_dict = {
+ f"loss_densepose_E{mesh_name}": meshname_to_embed_losses[mesh_name]
+ for mesh_name in meshname_to_embed_losses
+ }
+ all_loss_dict = {
+ "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
+ **embed_loss_dict,
+ }
+ if self.do_shape2shape:
+ all_loss_dict["loss_shape2shape"] = self.shape2shape_loss.fake_value(embedder)
+ if self.do_pix2shape:
+ all_loss_dict["loss_pix2shape"] = self.pix2shape_loss.fake_value(
+ densepose_predictor_outputs, embedder
+ )
+ return all_loss_dict
diff --git a/densepose/modeling/losses/cycle_pix2shape.py b/densepose/modeling/losses/cycle_pix2shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4e0a94a68370a994179d9d3eb5fb0ed9ed4af39
--- /dev/null
+++ b/densepose/modeling/losses/cycle_pix2shape.py
@@ -0,0 +1,154 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from typing import Any, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix
+
+from .embed_utils import PackedCseAnnotations
+from .mask import extract_data_for_mask_loss_from_matches
+
+
+def _create_pixel_dist_matrix(grid_size: int) -> torch.Tensor:
+ rows = torch.arange(grid_size)
+ cols = torch.arange(grid_size)
+ # at index `i` contains [row, col], where
+ # row = i // grid_size
+ # col = i % grid_size
+ pix_coords = (
+ torch.stack(torch.meshgrid(rows, cols), -1).reshape((grid_size * grid_size, 2)).float()
+ )
+ return squared_euclidean_distance_matrix(pix_coords, pix_coords)
+
+
+def _sample_fg_pixels_randperm(fg_mask: torch.Tensor, sample_size: int) -> torch.Tensor:
+ fg_mask_flattened = fg_mask.reshape((-1,))
+ num_pixels = int(fg_mask_flattened.sum().item())
+ fg_pixel_indices = fg_mask_flattened.nonzero(as_tuple=True)[0]
+ if (sample_size <= 0) or (num_pixels <= sample_size):
+ return fg_pixel_indices
+ sample_indices = torch.randperm(num_pixels, device=fg_mask.device)[:sample_size]
+ return fg_pixel_indices[sample_indices]
+
+
+def _sample_fg_pixels_multinomial(fg_mask: torch.Tensor, sample_size: int) -> torch.Tensor:
+ fg_mask_flattened = fg_mask.reshape((-1,))
+ num_pixels = int(fg_mask_flattened.sum().item())
+ if (sample_size <= 0) or (num_pixels <= sample_size):
+ return fg_mask_flattened.nonzero(as_tuple=True)[0]
+ return fg_mask_flattened.float().multinomial(sample_size, replacement=False)
+
+
+class PixToShapeCycleLoss(nn.Module):
+ """
+ Cycle loss for pixel-vertex correspondence
+ """
+
+ def __init__(self, cfg: CfgNode):
+ super().__init__()
+ self.shape_names = list(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.keys())
+ self.embed_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+ self.norm_p = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NORM_P
+ self.use_all_meshes_not_gt_only = (
+ cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.USE_ALL_MESHES_NOT_GT_ONLY
+ )
+ self.num_pixels_to_sample = (
+ cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.NUM_PIXELS_TO_SAMPLE
+ )
+ self.pix_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.PIXEL_SIGMA
+ self.temperature_pix_to_vertex = (
+ cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_PIXEL_TO_VERTEX
+ )
+ self.temperature_vertex_to_pix = (
+ cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.PIX_TO_SHAPE_CYCLE_LOSS.TEMPERATURE_VERTEX_TO_PIXEL
+ )
+ self.pixel_dists = _create_pixel_dist_matrix(cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE)
+
+ def forward(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: PackedCseAnnotations,
+ embedder: nn.Module,
+ ):
+ """
+ Args:
+ proposals_with_gt (list of Instances): detections with associated
+ ground truth data; each item corresponds to instances detected
+ on 1 image; the number of items corresponds to the number of
+ images in a batch
+ densepose_predictor_outputs: an object of a dataclass that contains predictor
+ outputs with estimated values; assumed to have the following attributes:
+ * embedding - embedding estimates, tensor of shape [N, D, S, S], where
+ N = number of instances (= sum N_i, where N_i is the number of
+ instances on image i)
+ D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
+ S = output size (width and height)
+ packed_annotations (PackedCseAnnotations): contains various data useful
+ for loss computation, each data is packed into a single tensor
+ embedder (nn.Module): module that computes vertex embeddings for different meshes
+ """
+ pix_embeds = densepose_predictor_outputs.embedding
+ if self.pixel_dists.device != pix_embeds.device:
+ # should normally be done only once
+ self.pixel_dists = self.pixel_dists.to(device=pix_embeds.device)
+ with torch.no_grad():
+ mask_loss_data = extract_data_for_mask_loss_from_matches(
+ proposals_with_gt, densepose_predictor_outputs.coarse_segm
+ )
+ # GT masks - tensor of shape [N, S, S] of int64
+ masks_gt = mask_loss_data.masks_gt.long() # pyre-ignore[16]
+ assert len(pix_embeds) == len(masks_gt), (
+ f"Number of instances with embeddings {len(pix_embeds)} != "
+ f"number of instances with GT masks {len(masks_gt)}"
+ )
+ losses = []
+ mesh_names = (
+ self.shape_names
+ if self.use_all_meshes_not_gt_only
+ else [
+ MeshCatalog.get_mesh_name(mesh_id.item())
+ for mesh_id in packed_annotations.vertex_mesh_ids_gt.unique()
+ ]
+ )
+ for pixel_embeddings, mask_gt in zip(pix_embeds, masks_gt):
+ # pixel_embeddings [D, S, S]
+ # mask_gt [S, S]
+ for mesh_name in mesh_names:
+ mesh_vertex_embeddings = embedder(mesh_name)
+ # pixel indices [M]
+ pixel_indices_flattened = _sample_fg_pixels_randperm(
+ mask_gt, self.num_pixels_to_sample
+ )
+ # pixel distances [M, M]
+ pixel_dists = self.pixel_dists.to(pixel_embeddings.device)[
+ torch.meshgrid(pixel_indices_flattened, pixel_indices_flattened)
+ ]
+ # pixel embeddings [M, D]
+ pixel_embeddings_sampled = normalize_embeddings(
+ pixel_embeddings.reshape((self.embed_size, -1))[:, pixel_indices_flattened].T
+ )
+ # pixel-vertex similarity [M, K]
+ sim_matrix = pixel_embeddings_sampled.mm(mesh_vertex_embeddings.T)
+ c_pix_vertex = F.softmax(sim_matrix / self.temperature_pix_to_vertex, dim=1)
+ c_vertex_pix = F.softmax(sim_matrix.T / self.temperature_vertex_to_pix, dim=1)
+ c_cycle = c_pix_vertex.mm(c_vertex_pix)
+ loss_cycle = torch.norm(pixel_dists * c_cycle, p=self.norm_p)
+ losses.append(loss_cycle)
+
+ if len(losses) == 0:
+ return pix_embeds.sum() * 0
+ return torch.stack(losses, dim=0).mean()
+
+ def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module):
+ losses = [embedder(mesh_name).sum() * 0 for mesh_name in embedder.mesh_names]
+ losses.append(densepose_predictor_outputs.embedding.sum() * 0)
+ return torch.mean(torch.stack(losses))
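
A minimal sketch of the cycle term computed inside `PixToShapeCycleLoss.forward`, with random stand-ins for the sampled pixel embeddings, the mesh vertex embeddings, and the pixel distance matrix (shapes, temperatures, and norm order are illustrative, not taken from the config):

```python
import torch
import torch.nn.functional as F

M, K, D = 32, 100, 16          # sampled pixels, mesh vertices, embedding dim
t_pix2vert, t_vert2pix, norm_p = 0.05, 0.05, 2

pix = F.normalize(torch.randn(M, D), dim=1)     # sampled pixel embeddings [M, D]
vert = F.normalize(torch.randn(K, D), dim=1)    # mesh vertex embeddings [K, D]
pts = torch.rand(M, 2)
pixel_dists = torch.cdist(pts, pts) ** 2        # squared pixel distances [M, M]

sim = pix.mm(vert.T)                                   # pixel-vertex similarities [M, K]
c_pix_vert = F.softmax(sim / t_pix2vert, dim=1)        # pixel -> vertex soft assignment
c_vert_pix = F.softmax(sim.T / t_vert2pix, dim=1)      # vertex -> pixel soft assignment
c_cycle = c_pix_vert.mm(c_vert_pix)                    # pixel -> pixel round trip [M, M]
# penalize round trips that land far from where they started
loss_cycle = torch.norm(pixel_dists * c_cycle, p=norm_p)
print(loss_cycle.item())
```
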
diff --git a/densepose/modeling/losses/cycle_shape2shape.py b/densepose/modeling/losses/cycle_shape2shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6e2ca89a39f391eadc915154964f82d6ddecdd4
--- /dev/null
+++ b/densepose/modeling/losses/cycle_shape2shape.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import random
+from typing import Tuple
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+
+from densepose.structures.mesh import create_mesh
+
+from .utils import sample_random_indices
+
+
+class ShapeToShapeCycleLoss(nn.Module):
+ """
+ Cycle Loss for Shapes.
+ Inspired by:
+ "Mapping in a Cycle: Sinkhorn Regularized Unsupervised Learning for Point Cloud Shapes".
+ """
+
+ def __init__(self, cfg: CfgNode):
+ super().__init__()
+ self.shape_names = list(cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS.keys())
+ self.all_shape_pairs = [
+ (x, y) for i, x in enumerate(self.shape_names) for y in self.shape_names[i + 1 :]
+ ]
+ random.shuffle(self.all_shape_pairs)
+ self.cur_pos = 0
+ self.norm_p = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.NORM_P
+ self.temperature = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.TEMPERATURE
+ self.max_num_vertices = (
+ cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES
+ )
+
+ def _sample_random_pair(self) -> Tuple[str, str]:
+ """
+ Produce a random pair of different mesh names
+
+ Return:
+ tuple(str, str): a pair of different mesh names
+ """
+ if self.cur_pos >= len(self.all_shape_pairs):
+ random.shuffle(self.all_shape_pairs)
+ self.cur_pos = 0
+ shape_pair = self.all_shape_pairs[self.cur_pos]
+ self.cur_pos += 1
+ return shape_pair
+
+ def forward(self, embedder: nn.Module):
+ """
+ Do a forward pass with a random (src, dst) pair of shapes
+ Args:
+ embedder (nn.Module): module that computes vertex embeddings for different meshes
+ """
+ src_mesh_name, dst_mesh_name = self._sample_random_pair()
+ return self._forward_one_pair(embedder, src_mesh_name, dst_mesh_name)
+
+ def fake_value(self, embedder: nn.Module):
+ losses = []
+ for mesh_name in embedder.mesh_names:
+ losses.append(embedder(mesh_name).sum() * 0)
+ return torch.mean(torch.stack(losses))
+
+ def _get_embeddings_and_geodists_for_mesh(
+ self, embedder: nn.Module, mesh_name: str
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """
+ Produces embeddings and geodesic distance tensors for a given mesh. May subsample
+ the mesh, if it contains too many vertices (controlled by the
+ SHAPE_TO_SHAPE_CYCLE_LOSS.MAX_NUM_VERTICES config option).
+ Args:
+ embedder (nn.Module): module that computes embeddings for mesh vertices
+ mesh_name (str): mesh name
+ Return:
+ embeddings (torch.Tensor of size [N, D]): embeddings for selected mesh
+ vertices (N = number of selected vertices, D = embedding space dim)
+ geodists (torch.Tensor of size [N, N]): geodesic distances for the selected
+ mesh vertices (N = number of selected vertices)
+ """
+ embeddings = embedder(mesh_name)
+ indices = sample_random_indices(
+ embeddings.shape[0], self.max_num_vertices, embeddings.device
+ )
+ mesh = create_mesh(mesh_name, embeddings.device)
+ geodists = mesh.geodists
+ if indices is not None:
+ embeddings = embeddings[indices]
+ geodists = geodists[torch.meshgrid(indices, indices)]
+ return embeddings, geodists
+
+ def _forward_one_pair(
+ self, embedder: nn.Module, mesh_name_1: str, mesh_name_2: str
+ ) -> torch.Tensor:
+ """
+ Do a forward pass with a selected pair of meshes
+ Args:
+ embedder (nn.Module): module that computes vertex embeddings for different meshes
+ mesh_name_1 (str): first mesh name
+ mesh_name_2 (str): second mesh name
+ Return:
+ Tensor containing the loss value
+ """
+ embeddings_1, geodists_1 = self._get_embeddings_and_geodists_for_mesh(embedder, mesh_name_1)
+ embeddings_2, geodists_2 = self._get_embeddings_and_geodists_for_mesh(embedder, mesh_name_2)
+ sim_matrix_12 = embeddings_1.mm(embeddings_2.T)
+
+ c_12 = F.softmax(sim_matrix_12 / self.temperature, dim=1)
+ c_21 = F.softmax(sim_matrix_12.T / self.temperature, dim=1)
+ c_11 = c_12.mm(c_21)
+ c_22 = c_21.mm(c_12)
+
+ loss_cycle_11 = torch.norm(geodists_1 * c_11, p=self.norm_p)
+ loss_cycle_22 = torch.norm(geodists_2 * c_22, p=self.norm_p)
+
+ return loss_cycle_11 + loss_cycle_22
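
Similarly, a standalone sketch of `_forward_one_pair` with two random "meshes"; the embeddings and geodesic-distance matrices are random stand-ins, and only the cycle bookkeeping mirrors the code above:

```python
import torch
import torch.nn.functional as F

def cycle_pair_loss(emb1, geo1, emb2, geo2, temperature=0.05, norm_p=2):
    # soft assignment mesh1 -> mesh2 and back, as in _forward_one_pair
    sim_12 = emb1.mm(emb2.T)
    c_12 = F.softmax(sim_12 / temperature, dim=1)
    c_21 = F.softmax(sim_12.T / temperature, dim=1)
    c_11 = c_12.mm(c_21)   # mesh1 -> mesh1 round trip
    c_22 = c_21.mm(c_12)   # mesh2 -> mesh2 round trip
    # round trips should stay geodesically close to the starting vertex
    return torch.norm(geo1 * c_11, p=norm_p) + torch.norm(geo2 * c_22, p=norm_p)

N1, N2, D = 50, 80, 16
emb1 = F.normalize(torch.randn(N1, D), dim=1)
emb2 = F.normalize(torch.randn(N2, D), dim=1)
pts1, pts2 = torch.rand(N1, 3), torch.rand(N2, 3)
geo1 = torch.cdist(pts1, pts1)   # stand-in for mesh.geodists of mesh 1
geo2 = torch.cdist(pts2, pts2)   # stand-in for mesh.geodists of mesh 2
print(cycle_pair_loss(emb1, geo1, emb2, geo2).item())
```
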
diff --git a/densepose/modeling/losses/embed.py b/densepose/modeling/losses/embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fc8a16a478649847a6ce9200004eb4da64bb01e
--- /dev/null
+++ b/densepose/modeling/losses/embed.py
@@ -0,0 +1,121 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from typing import Any, Dict, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix
+
+from .embed_utils import PackedCseAnnotations
+from .utils import BilinearInterpolationHelper
+
+
+class EmbeddingLoss:
+ """
+ Computes losses for estimated embeddings given annotated vertices.
+ Instances in a minibatch that correspond to the same mesh are grouped
+ together. For each group, loss is computed as cross-entropy for
+ unnormalized scores given ground truth mesh vertex ids.
+ Scores are based on squared distances between estimated vertex embeddings
+ and mesh vertex embeddings.
+ """
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize embedding loss from config
+ """
+ self.embdist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA
+
+ def __call__(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: PackedCseAnnotations,
+ interpolator: BilinearInterpolationHelper,
+ embedder: nn.Module,
+ ) -> Dict[int, torch.Tensor]:
+ """
+ Produces losses for estimated embeddings given annotated vertices.
+ Embeddings for all the vertices of a mesh are computed by the embedder.
+ Embeddings for observed pixels are estimated by a predictor.
+ Losses are computed as cross-entropy for squared distances between
+ observed vertex embeddings and all mesh vertex embeddings given
+ ground truth vertex IDs.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated
+ ground truth data; each item corresponds to instances detected
+ on 1 image; the number of items corresponds to the number of
+ images in a batch
+ densepose_predictor_outputs: an object of a dataclass that contains predictor
+ outputs with estimated values; assumed to have the following attributes:
+ * embedding - embedding estimates, tensor of shape [N, D, S, S], where
+ N = number of instances (= sum N_i, where N_i is the number of
+ instances on image i)
+ D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
+ S = output size (width and height)
+ packed_annotations (PackedCseAnnotations): contains various data useful
+ for loss computation, each data is packed into a single tensor
+ interpolator (BilinearInterpolationHelper): bilinear interpolation helper
+ embedder (nn.Module): module that computes vertex embeddings for different meshes
+ Return:
+ dict(int -> tensor): losses for different mesh IDs
+ """
+ losses = {}
+ for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique():
+ mesh_id = mesh_id_tensor.item()
+ mesh_name = MeshCatalog.get_mesh_name(mesh_id)
+ # valid points are those that fall into estimated bbox
+ # and correspond to the current mesh
+ j_valid = interpolator.j_valid * ( # pyre-ignore[16]
+ packed_annotations.vertex_mesh_ids_gt == mesh_id
+ )
+ if not torch.any(j_valid):
+ continue
+ # extract estimated embeddings for valid points
+ # -> tensor [J, D]
+ vertex_embeddings_i = normalize_embeddings(
+ interpolator.extract_at_points(
+ densepose_predictor_outputs.embedding,
+ slice_fine_segm=slice(None),
+ w_ylo_xlo=interpolator.w_ylo_xlo[:, None], # pyre-ignore[16]
+ w_ylo_xhi=interpolator.w_ylo_xhi[:, None], # pyre-ignore[16]
+ w_yhi_xlo=interpolator.w_yhi_xlo[:, None], # pyre-ignore[16]
+ w_yhi_xhi=interpolator.w_yhi_xhi[:, None], # pyre-ignore[16]
+ )[j_valid, :]
+ )
+ # extract vertex ids for valid points
+ # -> tensor [J]
+ vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid]
+ # embeddings for all mesh vertices
+ # -> tensor [K, D]
+ mesh_vertex_embeddings = embedder(mesh_name)
+ # unnormalized scores for valid points
+ # -> tensor [J, K]
+ scores = squared_euclidean_distance_matrix(
+ vertex_embeddings_i, mesh_vertex_embeddings
+ ) / (-self.embdist_gauss_sigma)
+ losses[mesh_name] = F.cross_entropy(scores, vertex_indices_i, ignore_index=-1)
+
+ for mesh_name in embedder.mesh_names:
+ if mesh_name not in losses:
+ losses[mesh_name] = self.fake_value(
+ densepose_predictor_outputs, embedder, mesh_name
+ )
+ return losses
+
+ def fake_values(self, densepose_predictor_outputs: Any, embedder: nn.Module):
+ losses = {}
+ for mesh_name in embedder.mesh_names:
+ losses[mesh_name] = self.fake_value(densepose_predictor_outputs, embedder, mesh_name)
+ return losses
+
+ def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module, mesh_name: str):
+ return densepose_predictor_outputs.embedding.sum() * 0 + embedder(mesh_name).sum() * 0
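
A standalone sketch of how `EmbeddingLoss` turns embedding distances into a per-point classification over mesh vertices; the tensors are random stand-ins for the interpolated predictor outputs and `embedder(mesh_name)`, and the sigma value is illustrative:

```python
import torch
import torch.nn.functional as F

J, K, D = 64, 500, 16      # annotated points, mesh vertices, embedding dim
sigma = 0.1                # stand-in for EMBEDDING_DIST_GAUSS_SIGMA

pix_emb = F.normalize(torch.randn(J, D), dim=1)    # embeddings sampled at GT points
vert_emb = F.normalize(torch.randn(K, D), dim=1)   # embedder(mesh_name)
vertex_ids_gt = torch.randint(0, K, (J,))          # annotated vertex IDs

# same quantity as squared_euclidean_distance_matrix: closer vertices get higher scores
sq_dists = torch.cdist(pix_emb, vert_emb) ** 2     # [J, K]
scores = sq_dists / (-sigma)
loss = F.cross_entropy(scores, vertex_ids_gt, ignore_index=-1)
print(loss.item())
```
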
diff --git a/densepose/modeling/losses/embed_utils.py b/densepose/modeling/losses/embed_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..92210f002c0c181c4893a9115e84aaaad512f8e3
--- /dev/null
+++ b/densepose/modeling/losses/embed_utils.py
@@ -0,0 +1,139 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Any, Optional
+import torch
+
+from detectron2.structures import BoxMode, Instances
+
+from .utils import AnnotationsAccumulator
+
+
+@dataclass
+class PackedCseAnnotations:
+ x_gt: torch.Tensor
+ y_gt: torch.Tensor
+ coarse_segm_gt: Optional[torch.Tensor]
+ vertex_mesh_ids_gt: torch.Tensor
+ vertex_ids_gt: torch.Tensor
+ bbox_xywh_gt: torch.Tensor
+ bbox_xywh_est: torch.Tensor
+ point_bbox_with_dp_indices: torch.Tensor
+ point_bbox_indices: torch.Tensor
+ bbox_indices: torch.Tensor
+
+
+class CseAnnotationsAccumulator(AnnotationsAccumulator):
+ """
+ Accumulates annotations by batches that correspond to objects detected on
+ individual images. Can pack them together into single tensors.
+ """
+
+ def __init__(self):
+ self.x_gt = []
+ self.y_gt = []
+ self.s_gt = []
+ self.vertex_mesh_ids_gt = []
+ self.vertex_ids_gt = []
+ self.bbox_xywh_gt = []
+ self.bbox_xywh_est = []
+ self.point_bbox_with_dp_indices = []
+ self.point_bbox_indices = []
+ self.bbox_indices = []
+ self.nxt_bbox_with_dp_index = 0
+ self.nxt_bbox_index = 0
+
+ def accumulate(self, instances_one_image: Instances):
+ """
+ Accumulate instances data for one image
+
+ Args:
+ instances_one_image (Instances): instances data to accumulate
+ """
+ boxes_xywh_est = BoxMode.convert(
+ instances_one_image.proposal_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+ )
+ boxes_xywh_gt = BoxMode.convert(
+ instances_one_image.gt_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+ )
+ n_matches = len(boxes_xywh_gt)
+ assert n_matches == len(
+ boxes_xywh_est
+ ), f"Got {len(boxes_xywh_est)} proposal boxes and {len(boxes_xywh_gt)} GT boxes"
+ if not n_matches:
+ # no matches between detections and GT
+ return
+ if (
+ not hasattr(instances_one_image, "gt_densepose")
+ or instances_one_image.gt_densepose is None
+ ):
+ # no densepose GT for the detections, just increase the bbox index
+ self.nxt_bbox_index += n_matches
+ return
+ for box_xywh_est, box_xywh_gt, dp_gt in zip(
+ boxes_xywh_est, boxes_xywh_gt, instances_one_image.gt_densepose
+ ):
+ if (dp_gt is not None) and (len(dp_gt.x) > 0):
+ # pyre-fixme[6]: For 1st argument expected `Tensor` but got `float`.
+ # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `float`.
+ self._do_accumulate(box_xywh_gt, box_xywh_est, dp_gt)
+ self.nxt_bbox_index += 1
+
+ def _do_accumulate(self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: Any):
+ """
+ Accumulate instances data for one image, given that the data is not empty
+
+ Args:
+ box_xywh_gt (tensor): GT bounding box
+ box_xywh_est (tensor): estimated bounding box
+ dp_gt: GT densepose data with the following attributes:
+ - x: normalized X coordinates
+ - y: normalized Y coordinates
+ - segm: tensor of size [S, S] with coarse segmentation
+ - vertex_ids, mesh_id: GT mesh vertex IDs and the corresponding mesh ID
+ """
+ self.x_gt.append(dp_gt.x)
+ self.y_gt.append(dp_gt.y)
+ if hasattr(dp_gt, "segm"):
+ self.s_gt.append(dp_gt.segm.unsqueeze(0))
+ self.vertex_ids_gt.append(dp_gt.vertex_ids)
+ self.vertex_mesh_ids_gt.append(torch.full_like(dp_gt.vertex_ids, dp_gt.mesh_id))
+ self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4))
+ self.bbox_xywh_est.append(box_xywh_est.view(-1, 4))
+ self.point_bbox_with_dp_indices.append(
+ torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_with_dp_index)
+ )
+ self.point_bbox_indices.append(torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_index))
+ self.bbox_indices.append(self.nxt_bbox_index)
+ self.nxt_bbox_with_dp_index += 1
+
+ def pack(self) -> Optional[PackedCseAnnotations]:
+ """
+ Pack data into tensors
+ """
+ if not len(self.x_gt):
+ # TODO:
+ # returning proper empty annotations would require
+ # creating empty tensors of appropriate shape and
+ # type on an appropriate device;
+ # we return None so far to indicate empty annotations
+ return None
+ return PackedCseAnnotations(
+ x_gt=torch.cat(self.x_gt, 0),
+ y_gt=torch.cat(self.y_gt, 0),
+ vertex_mesh_ids_gt=torch.cat(self.vertex_mesh_ids_gt, 0),
+ vertex_ids_gt=torch.cat(self.vertex_ids_gt, 0),
+ # ignore segmentation annotations, if not all the instances contain those
+ coarse_segm_gt=(
+ torch.cat(self.s_gt, 0) if len(self.s_gt) == len(self.bbox_xywh_gt) else None
+ ),
+ bbox_xywh_gt=torch.cat(self.bbox_xywh_gt, 0),
+ bbox_xywh_est=torch.cat(self.bbox_xywh_est, 0),
+ point_bbox_with_dp_indices=torch.cat(self.point_bbox_with_dp_indices, 0),
+ point_bbox_indices=torch.cat(self.point_bbox_indices, 0),
+ bbox_indices=torch.as_tensor(
+ self.bbox_indices, dtype=torch.long, device=self.x_gt[0].device
+ ),
+ )
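
The accumulator above mostly concatenates per-instance point tensors while recording, for every point, the bounding box it came from. A detectron2-free sketch of that bookkeeping with toy tensors (the variable names are illustrative):

```python
import torch

# per-instance GT points (e.g. 3 points for box 0, 2 points for box 1)
x_per_box = [torch.tensor([0.1, 0.4, 0.9]), torch.tensor([0.2, 0.7])]
vertex_ids_per_box = [torch.tensor([12, 57, 3]), torch.tensor([99, 4])]

x_gt, vertex_ids_gt, point_bbox_indices = [], [], []
for bbox_index, (x, vids) in enumerate(zip(x_per_box, vertex_ids_per_box)):
    x_gt.append(x)
    vertex_ids_gt.append(vids)
    # every point remembers the index of its bounding box, as in the accumulators above
    point_bbox_indices.append(torch.full_like(vids, bbox_index))

packed_x = torch.cat(x_gt)                         # tensor([0.1, 0.4, 0.9, 0.2, 0.7])
packed_vids = torch.cat(vertex_ids_gt)             # tensor([12, 57, 3, 99, 4])
packed_point_bbox = torch.cat(point_bbox_indices)  # tensor([0, 0, 0, 1, 1])
```
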
diff --git a/densepose/modeling/losses/mask.py b/densepose/modeling/losses/mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f8f75a425d288e1167eaf8cb48e4dc0f851ff45
--- /dev/null
+++ b/densepose/modeling/losses/mask.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Optional
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import Instances
+
+
+@dataclass
+class DataForMaskLoss:
+ """
+ Contains mask GT and estimated data for proposals from multiple images:
+ """
+
+ # tensor of size (K, H, W) containing GT labels
+ masks_gt: Optional[torch.Tensor] = None
+ # tensor of size (K, C, H, W) containing estimated scores
+ masks_est: Optional[torch.Tensor] = None
+
+
+def extract_data_for_mask_loss_from_matches(
+ proposals_targets: Iterable[Instances], estimated_segm: torch.Tensor
+) -> DataForMaskLoss:
+ """
+ Extract data for mask loss from instances that contain matched GT and
+ estimated bounding boxes.
+ Args:
+ proposals_targets: Iterable[Instances]
+ matched GT and estimated results, each item in the iterable
+ corresponds to data in 1 image
+ estimated_segm: tensor(K, C, S, S) of float - raw unnormalized
+ segmentation scores, here S is the size to which GT masks are
+ to be resized
+ Return:
+ DataForMaskLoss with:
+ masks_est: tensor(K, C, S, S) of float - class scores
+ masks_gt: tensor(K, S, S) of int64 - labels
+ """
+ data = DataForMaskLoss()
+ masks_gt = []
+ offset = 0
+ assert estimated_segm.shape[2] == estimated_segm.shape[3], (
+ f"Expected estimated segmentation to have a square shape, "
+ f"but the actual shape is {estimated_segm.shape[2:]}"
+ )
+ mask_size = estimated_segm.shape[2]
+ num_proposals = sum(inst.proposal_boxes.tensor.size(0) for inst in proposals_targets)
+ num_estimated = estimated_segm.shape[0]
+ assert (
+ num_proposals == num_estimated
+ ), "The number of proposals {} must be equal to the number of estimates {}".format(
+ num_proposals, num_estimated
+ )
+
+ for proposals_targets_per_image in proposals_targets:
+ n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0)
+ if not n_i:
+ continue
+ gt_masks_per_image = proposals_targets_per_image.gt_masks.crop_and_resize(
+ proposals_targets_per_image.proposal_boxes.tensor, mask_size
+ ).to(device=estimated_segm.device)
+ masks_gt.append(gt_masks_per_image)
+ offset += n_i
+ if masks_gt:
+ data.masks_est = estimated_segm
+ data.masks_gt = torch.cat(masks_gt, dim=0)
+ return data
+
+
+class MaskLoss:
+ """
+ Mask loss as cross-entropy for raw unnormalized scores given ground truth labels.
+ Mask ground truth labels are defined for the whole image and not only the
+ bounding box of interest. They are stored as objects that are assumed to implement
+ the `crop_and_resize` interface (e.g. BitMasks, PolygonMasks).
+ """
+
+ def __call__(
+ self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any
+ ) -> torch.Tensor:
+ """
+ Computes segmentation loss as cross-entropy for raw unnormalized
+ scores given ground truth labels.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated ground truth data
+ densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+ with estimated values; assumed to have the following attribute:
+ * coarse_segm (tensor of shape [N, D, S, S]): coarse segmentation estimates
+ as raw unnormalized scores
+ where N is the number of detections, S is the estimate size ( = width = height)
+ and D is the number of coarse segmentation channels.
+ Return:
+ Cross entropy for raw unnormalized scores for coarse segmentation given
+ ground truth labels from masks
+ """
+ if not len(proposals_with_gt):
+ return self.fake_value(densepose_predictor_outputs)
+ # densepose outputs are computed for all images and all bounding boxes;
+ # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
+ # the outputs will have size(0) == 3+1+2+1 == 7
+ with torch.no_grad():
+ mask_loss_data = extract_data_for_mask_loss_from_matches(
+ proposals_with_gt, densepose_predictor_outputs.coarse_segm
+ )
+ if (mask_loss_data.masks_gt is None) or (mask_loss_data.masks_est is None):
+ return self.fake_value(densepose_predictor_outputs)
+ return F.cross_entropy(mask_loss_data.masks_est, mask_loss_data.masks_gt.long())
+
+ def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
+ """
+ Fake segmentation loss used when no suitable ground truth data
+ was found in a batch. The loss has a value 0 and is primarily used to
+ construct the computation graph, so that `DistributedDataParallel`
+ has similar graphs on all GPUs and can perform reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have `coarse_segm`
+ attribute
+ Return:
+ Zero value loss with proper computation graph
+ """
+ return densepose_predictor_outputs.coarse_segm.sum() * 0
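
The core of `MaskLoss` is a single spatial cross-entropy between per-pixel class scores and integer labels obtained from `crop_and_resize`; a toy sketch with random tensors (shapes illustrative):

```python
import torch
import torch.nn.functional as F

K, C, S = 4, 2, 28                          # instances, coarse segm channels, mask size
masks_est = torch.randn(K, C, S, S)         # raw unnormalized scores
masks_gt = torch.randint(0, C, (K, S, S))   # per-pixel labels from cropped/resized GT masks
loss = F.cross_entropy(masks_est, masks_gt.long())
print(loss.item())
```
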
diff --git a/densepose/modeling/losses/mask_or_segm.py b/densepose/modeling/losses/mask_or_segm.py
new file mode 100644
index 0000000000000000000000000000000000000000..350a2ebf81b13839c3a16545984c05c1aa68f5bf
--- /dev/null
+++ b/densepose/modeling/losses/mask_or_segm.py
@@ -0,0 +1,74 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from typing import Any, List
+import torch
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .mask import MaskLoss
+from .segm import SegmentationLoss
+
+
+class MaskOrSegmentationLoss:
+ """
+ Mask or segmentation loss as cross-entropy for raw unnormalized scores
+ given ground truth labels. Ground truth labels are either defined by coarse
+ segmentation annotation, or by mask annotation, depending on the config
+ value MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+ """
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize segmentation loss from configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ """
+ self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
+ if self.segm_trained_by_masks:
+ self.mask_loss = MaskLoss()
+ self.segm_loss = SegmentationLoss(cfg)
+
+ def __call__(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: Any,
+ ) -> torch.Tensor:
+ """
+ Compute segmentation loss as cross-entropy between aligned unnormalized
+ score estimates and ground truth; with ground truth given
+ either by masks, or by coarse segmentation annotations.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated ground truth data
+ densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+ with estimated values; assumed to have the following attributes:
+ * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+ packed_annotations: packed annotations for efficient loss computation
+ Return:
+ tensor: loss value as cross-entropy for raw unnormalized scores
+ given ground truth labels
+ """
+ if self.segm_trained_by_masks:
+ return self.mask_loss(proposals_with_gt, densepose_predictor_outputs)
+ return self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations)
+
+ def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
+ """
+ Fake segmentation loss used when no suitable ground truth data
+ was found in a batch. The loss has a value 0 and is primarily used to
+ construct the computation graph, so that `DistributedDataParallel`
+ has similar graphs on all GPUs and can perform reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have `coarse_segm`
+ attribute
+ Return:
+ Zero value loss with proper computation graph
+ """
+ return densepose_predictor_outputs.coarse_segm.sum() * 0
diff --git a/densepose/modeling/losses/registry.py b/densepose/modeling/losses/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db8e82343abd352482e3d740a6922a1e12ac5
--- /dev/null
+++ b/densepose/modeling/losses/registry.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from detectron2.utils.registry import Registry
+
+DENSEPOSE_LOSS_REGISTRY = Registry("DENSEPOSE_LOSS")
diff --git a/densepose/modeling/losses/segm.py b/densepose/modeling/losses/segm.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd04d129c1d05ee0f3273bc7256a60cf7cbe64b9
--- /dev/null
+++ b/densepose/modeling/losses/segm.py
@@ -0,0 +1,85 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from typing import Any, List
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from .utils import resample_data
+
+
+class SegmentationLoss:
+ """
+ Segmentation loss as cross-entropy for raw unnormalized scores given ground truth
+ labels. Segmentation ground truth labels are defined for the bounding box of
+ interest at some fixed resolution [S, S], where
+ S = MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE.
+ """
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize segmentation loss from configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ """
+ self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
+ self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+
+ def __call__(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: Any,
+ ) -> torch.Tensor:
+ """
+ Compute segmentation loss as cross-entropy on aligned segmentation
+ ground truth and estimated scores.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated ground truth data
+ densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
+ with estimated values; assumed to have the following attributes:
+ * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
+ packed_annotations: packed annotations for efficient loss computation;
+ the following attributes are used:
+ - coarse_segm_gt
+ - bbox_xywh_gt
+ - bbox_xywh_est
+ """
+ if packed_annotations.coarse_segm_gt is None:
+ return self.fake_value(densepose_predictor_outputs)
+ coarse_segm_est = densepose_predictor_outputs.coarse_segm[packed_annotations.bbox_indices]
+ with torch.no_grad():
+ coarse_segm_gt = resample_data(
+ packed_annotations.coarse_segm_gt.unsqueeze(1),
+ packed_annotations.bbox_xywh_gt,
+ packed_annotations.bbox_xywh_est,
+ self.heatmap_size,
+ self.heatmap_size,
+ mode="nearest",
+ padding_mode="zeros",
+ ).squeeze(1)
+ if self.n_segm_chan == 2:
+ coarse_segm_gt = coarse_segm_gt > 0
+ return F.cross_entropy(coarse_segm_est, coarse_segm_gt.long())
+
+ def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
+ """
+ Fake segmentation loss used when no suitable ground truth data
+ was found in a batch. The loss has a value 0 and is primarily used to
+ construct the computation graph, so that `DistributedDataParallel`
+ has similar graphs on all GPUs and can perform reduction properly.
+
+ Args:
+ densepose_predictor_outputs: DensePose predictor outputs, an object
+ of a dataclass that is assumed to have `coarse_segm`
+ attribute
+ Return:
+ Zero value loss with proper computation graph
+ """
+ return densepose_predictor_outputs.coarse_segm.sum() * 0
diff --git a/densepose/modeling/losses/soft_embed.py b/densepose/modeling/losses/soft_embed.py
new file mode 100644
index 0000000000000000000000000000000000000000..f746d67a75738c9d38f84830f59f72da55a99280
--- /dev/null
+++ b/densepose/modeling/losses/soft_embed.py
@@ -0,0 +1,135 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from typing import Any, Dict, List
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.structures import Instances
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.modeling.cse.utils import normalize_embeddings, squared_euclidean_distance_matrix
+from densepose.structures.mesh import create_mesh
+
+from .embed_utils import PackedCseAnnotations
+from .utils import BilinearInterpolationHelper
+
+
+class SoftEmbeddingLoss:
+ """
+ Computes losses for estimated embeddings given annotated vertices.
+ Instances in a minibatch that correspond to the same mesh are grouped
+ together. For each group, loss is computed as cross-entropy for
+ unnormalized scores given ground truth mesh vertex ids.
+ Scores are based on:
+ 1) squared distances between estimated vertex embeddings
+ and mesh vertex embeddings;
+ 2) geodesic distances between vertices of a mesh
+ """
+
+ def __init__(self, cfg: CfgNode):
+ """
+ Initialize embedding loss from config
+ """
+ self.embdist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_DIST_GAUSS_SIGMA
+ self.geodist_gauss_sigma = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.GEODESIC_DIST_GAUSS_SIGMA
+
+ def __call__(
+ self,
+ proposals_with_gt: List[Instances],
+ densepose_predictor_outputs: Any,
+ packed_annotations: PackedCseAnnotations,
+ interpolator: BilinearInterpolationHelper,
+ embedder: nn.Module,
+ ) -> Dict[int, torch.Tensor]:
+ """
+ Produces losses for estimated embeddings given annotated vertices.
+ Embeddings for all the vertices of a mesh are computed by the embedder.
+ Embeddings for observed pixels are estimated by a predictor.
+ Losses are computed as a soft cross-entropy in which
+ 1) scores are based on squared distances between estimated vertex
+ embeddings and mesh vertex embeddings, and
+ 2) soft target distributions are based on geodesic distances between
+ mesh vertices around the ground truth vertex.
+
+ Args:
+ proposals_with_gt (list of Instances): detections with associated
+ ground truth data; each item corresponds to instances detected
+ on 1 image; the number of items corresponds to the number of
+ images in a batch
+ densepose_predictor_outputs: an object of a dataclass that contains predictor
+ outputs with estimated values; assumed to have the following attributes:
+ * embedding - embedding estimates, tensor of shape [N, D, S, S], where
+ N = number of instances (= sum N_i, where N_i is the number of
+ instances on image i)
+ D = embedding space dimensionality (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
+ S = output size (width and height)
+ packed_annotations (PackedCseAnnotations): contains various data useful
+ for loss computation, each data is packed into a single tensor
+ interpolator (BilinearInterpolationHelper): bilinear interpolation helper
+ embedder (nn.Module): module that computes vertex embeddings for different meshes
+ Return:
+ dict(int -> tensor): losses for different mesh IDs
+ """
+ losses = {}
+ for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique():
+ mesh_id = mesh_id_tensor.item()
+ mesh_name = MeshCatalog.get_mesh_name(mesh_id)
+ # valid points are those that fall into estimated bbox
+ # and correspond to the current mesh
+ j_valid = interpolator.j_valid * ( # pyre-ignore[16]
+ packed_annotations.vertex_mesh_ids_gt == mesh_id
+ )
+ if not torch.any(j_valid):
+ continue
+ # extract estimated embeddings for valid points
+ # -> tensor [J, D]
+ vertex_embeddings_i = normalize_embeddings(
+ interpolator.extract_at_points(
+ densepose_predictor_outputs.embedding,
+ slice_fine_segm=slice(None),
+ w_ylo_xlo=interpolator.w_ylo_xlo[:, None], # pyre-ignore[16]
+ w_ylo_xhi=interpolator.w_ylo_xhi[:, None], # pyre-ignore[16]
+ w_yhi_xlo=interpolator.w_yhi_xlo[:, None], # pyre-ignore[16]
+ w_yhi_xhi=interpolator.w_yhi_xhi[:, None], # pyre-ignore[16]
+ )[j_valid, :]
+ )
+ # extract vertex ids for valid points
+ # -> tensor [J]
+ vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid]
+ # embeddings for all mesh vertices
+ # -> tensor [K, D]
+ mesh_vertex_embeddings = embedder(mesh_name)
+ # softmax values of geodesic distances for GT mesh vertices
+ # -> tensor [J, K]
+ mesh = create_mesh(mesh_name, mesh_vertex_embeddings.device)
+ geodist_softmax_values = F.softmax(
+ mesh.geodists[vertex_indices_i] / (-self.geodist_gauss_sigma), dim=1
+ )
+ # logsoftmax values for valid points
+ # -> tensor [J, K]
+ embdist_logsoftmax_values = F.log_softmax(
+ squared_euclidean_distance_matrix(vertex_embeddings_i, mesh_vertex_embeddings)
+ / (-self.embdist_gauss_sigma),
+ dim=1,
+ )
+ losses[mesh_name] = (-geodist_softmax_values * embdist_logsoftmax_values).sum(1).mean()
+
+ for mesh_name in embedder.mesh_names:
+ if mesh_name not in losses:
+ losses[mesh_name] = self.fake_value(
+ densepose_predictor_outputs, embedder, mesh_name
+ )
+ return losses
+
+ def fake_values(self, densepose_predictor_outputs: Any, embedder: nn.Module):
+ losses = {}
+ for mesh_name in embedder.mesh_names:
+ losses[mesh_name] = self.fake_value(densepose_predictor_outputs, embedder, mesh_name)
+ return losses
+
+ def fake_value(self, densepose_predictor_outputs: Any, embedder: nn.Module, mesh_name: str):
+ return densepose_predictor_outputs.embedding.sum() * 0 + embedder(mesh_name).sum() * 0
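
Compared to `EmbeddingLoss`, the soft variant replaces the one-hot target over vertices with a distribution derived from geodesic distances. A standalone sketch with random stand-ins for the embeddings and for `mesh.geodists` (sigma values illustrative):

```python
import torch
import torch.nn.functional as F

J, K, D = 64, 500, 16
embdist_sigma, geodist_sigma = 0.1, 0.5

pix_emb = F.normalize(torch.randn(J, D), dim=1)    # embeddings at annotated points
vert_emb = F.normalize(torch.randn(K, D), dim=1)   # embedder(mesh_name)
vertex_ids_gt = torch.randint(0, K, (J,))
pts = torch.rand(K, 3)
geodists = torch.cdist(pts, pts)                   # stand-in for mesh.geodists [K, K]

# soft targets: vertices geodesically close to the GT vertex get most of the mass
target = F.softmax(geodists[vertex_ids_gt] / (-geodist_sigma), dim=1)   # [J, K]
# log-probabilities from embedding distances
sq_dists = torch.cdist(pix_emb, vert_emb) ** 2                          # [J, K]
logprob = F.log_softmax(sq_dists / (-embdist_sigma), dim=1)
loss = (-target * logprob).sum(1).mean()
print(loss.item())
```
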
diff --git a/densepose/modeling/losses/utils.py b/densepose/modeling/losses/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f865798760c798c814b4c12eb9c185a13fba7146
--- /dev/null
+++ b/densepose/modeling/losses/utils.py
@@ -0,0 +1,445 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+import torch
+from torch.nn import functional as F
+
+from detectron2.structures import BoxMode, Instances
+
+from densepose import DensePoseDataRelative
+
+LossDict = Dict[str, torch.Tensor]
+
+
+def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z):
+ """
+ Computes utility values for linear interpolation at points v.
+ The points are given as normalized offsets in the source interval
+ (v0_src, v0_src + size_src), more precisely:
+ v = v0_src + v_norm * size_src / 256.0
+ The computed utilities include lower points v_lo, upper points v_hi,
+ interpolation weights v_w and flags j_valid indicating whether the
+ points fall into the destination interval (v0_dst, v0_dst + size_dst).
+
+ Args:
+ v_norm (:obj: `torch.Tensor`): tensor of size N containing
+ normalized point offsets
+ v0_src (:obj: `torch.Tensor`): tensor of size N containing
+ left bounds of source intervals for normalized points
+ size_src (:obj: `torch.Tensor`): tensor of size N containing
+ source interval sizes for normalized points
+ v0_dst (:obj: `torch.Tensor`): tensor of size N containing
+ left bounds of destination intervals
+ size_dst (:obj: `torch.Tensor`): tensor of size N containing
+ destination interval sizes
+ size_z (int): interval size for data to be interpolated
+
+ Returns:
+ v_lo (:obj: `torch.Tensor`): int tensor of size N containing
+ indices of lower values used for interpolation, all values are
+ integers from [0, size_z - 1]
+ v_hi (:obj: `torch.Tensor`): int tensor of size N containing
+ indices of upper values used for interpolation, all values are
+ integers from [0, size_z - 1]
+ v_w (:obj: `torch.Tensor`): float tensor of size N containing
+ interpolation weights
+ j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing
+ 0 for points outside the destination interval
+ (v0_dst, v0_dst + size_dst) and 1 otherwise
+ """
+ v = v0_src + v_norm * size_src / 256.0
+ j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst)
+ v_grid = (v - v0_dst) * size_z / size_dst
+ v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1)
+ v_hi = (v_lo + 1).clamp(max=size_z - 1)
+ v_grid = torch.min(v_hi.float(), v_grid)
+ v_w = v_grid - v_lo.float()
+ return v_lo, v_hi, v_w, j_valid
+
+
+class BilinearInterpolationHelper:
+ """
+ Args:
+ packed_annotations: object that contains packed annotations
+ j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing
+ 0 for points to be discarded and 1 for points to be selected
+ y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values
+ in z_est for each point
+ y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values
+ in z_est for each point
+ x_lo (:obj: `torch.Tensor`): int tensor of indices of left values
+ in z_est for each point
+ x_hi (:obj: `torch.Tensor`): int tensor of indices of right values
+ in z_est for each point
+ w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M;
+ contains upper-left value weight for each point
+ w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M;
+ contains upper-right value weight for each point
+ w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M;
+ contains lower-left value weight for each point
+ w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M;
+ contains lower-right value weight for each point
+ """
+
+ def __init__(
+ self,
+ packed_annotations: Any,
+ j_valid: torch.Tensor,
+ y_lo: torch.Tensor,
+ y_hi: torch.Tensor,
+ x_lo: torch.Tensor,
+ x_hi: torch.Tensor,
+ w_ylo_xlo: torch.Tensor,
+ w_ylo_xhi: torch.Tensor,
+ w_yhi_xlo: torch.Tensor,
+ w_yhi_xhi: torch.Tensor,
+ ):
+ for k, v in locals().items():
+ if k != "self":
+ setattr(self, k, v)
+
+ @staticmethod
+ def from_matches(
+ packed_annotations: Any, densepose_outputs_size_hw: Tuple[int, int]
+ ) -> "BilinearInterpolationHelper":
+ """
+ Args:
+ packed_annotations: annotations packed into tensors, the following
+ attributes are required:
+ - bbox_xywh_gt
+ - bbox_xywh_est
+ - x_gt
+ - y_gt
+ - point_bbox_with_dp_indices
+ - point_bbox_indices
+ densepose_outputs_size_hw (tuple [int, int]): resolution of
+ DensePose predictor outputs (H, W)
+ Return:
+ An instance of `BilinearInterpolationHelper` used to perform
+ interpolation for the given annotation points and output resolution
+ """
+
+ zh, zw = densepose_outputs_size_hw
+ x0_gt, y0_gt, w_gt, h_gt = packed_annotations.bbox_xywh_gt[
+ packed_annotations.point_bbox_with_dp_indices
+ ].unbind(dim=1)
+ x0_est, y0_est, w_est, h_est = packed_annotations.bbox_xywh_est[
+ packed_annotations.point_bbox_with_dp_indices
+ ].unbind(dim=1)
+ x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities(
+ packed_annotations.x_gt, x0_gt, w_gt, x0_est, w_est, zw
+ )
+ y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities(
+ packed_annotations.y_gt, y0_gt, h_gt, y0_est, h_est, zh
+ )
+ j_valid = jx_valid * jy_valid
+
+ w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w)
+ w_ylo_xhi = x_w * (1.0 - y_w)
+ w_yhi_xlo = (1.0 - x_w) * y_w
+ w_yhi_xhi = x_w * y_w
+
+ return BilinearInterpolationHelper(
+ packed_annotations,
+ j_valid,
+ y_lo,
+ y_hi,
+ x_lo,
+ x_hi,
+ w_ylo_xlo, # pyre-ignore[6]
+ w_ylo_xhi,
+ # pyre-fixme[6]: Expected `Tensor` for 9th param but got `float`.
+ w_yhi_xlo,
+ w_yhi_xhi,
+ )
+
+ def extract_at_points(
+ self,
+ z_est,
+ slice_fine_segm=None,
+ w_ylo_xlo=None,
+ w_ylo_xhi=None,
+ w_yhi_xlo=None,
+ w_yhi_xhi=None,
+ ):
+ """
+ Extract ground truth values z_gt for valid point indices and estimated
+ values z_est using bilinear interpolation over top-left (y_lo, x_lo),
+ top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right
+ (y_hi, x_hi) values in z_est with corresponding weights:
+ w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi.
+ Use slice_fine_segm to slice dim=1 in z_est
+ """
+ slice_fine_segm = (
+ self.packed_annotations.fine_segm_labels_gt
+ if slice_fine_segm is None
+ else slice_fine_segm
+ )
+ w_ylo_xlo = self.w_ylo_xlo if w_ylo_xlo is None else w_ylo_xlo
+ w_ylo_xhi = self.w_ylo_xhi if w_ylo_xhi is None else w_ylo_xhi
+ w_yhi_xlo = self.w_yhi_xlo if w_yhi_xlo is None else w_yhi_xlo
+ w_yhi_xhi = self.w_yhi_xhi if w_yhi_xhi is None else w_yhi_xhi
+
+ index_bbox = self.packed_annotations.point_bbox_indices
+ z_est_sampled = (
+ z_est[index_bbox, slice_fine_segm, self.y_lo, self.x_lo] * w_ylo_xlo
+ + z_est[index_bbox, slice_fine_segm, self.y_lo, self.x_hi] * w_ylo_xhi
+ + z_est[index_bbox, slice_fine_segm, self.y_hi, self.x_lo] * w_yhi_xlo
+ + z_est[index_bbox, slice_fine_segm, self.y_hi, self.x_hi] * w_yhi_xhi
+ )
+ return z_est_sampled
+
+
+def resample_data(
+ z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode: str = "nearest", padding_mode: str = "zeros"
+):
+ """
+ Args:
+ z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be
+ resampled
+ bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing
+ source bounding boxes in format XYWH
+ bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing
+ destination bounding boxes in format XYWH
+ Return:
+ zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout)
+ with values of z resampled into the destination bounding boxes
+ """
+ n = bbox_xywh_src.size(0)
+ assert n == bbox_xywh_dst.size(0), (
+ "The number of "
+ "source ROIs for resampling ({}) should be equal to the number "
+ "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0))
+ )
+ x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1)
+ x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1)
+ x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1
+ y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1
+ x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1
+ y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1
+ grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout
+ grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout
+ grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout)
+ grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout)
+ dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout)
+ dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout)
+ x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout)
+ y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout)
+ grid_x = grid_w_expanded * dx_expanded + x0_expanded
+ grid_y = grid_h_expanded * dy_expanded + y0_expanded
+ grid = torch.stack((grid_x, grid_y), dim=3)
+ # resample Z from (N, C, H, W) into (N, C, Hout, Wout)
+ zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True)
+ return zresampled
+
+
+class AnnotationsAccumulator(ABC):
+ """
+ Abstract class for an accumulator for annotations that can produce
+ dense annotations packed into tensors.
+ """
+
+ @abstractmethod
+ def accumulate(self, instances_one_image: Instances):
+ """
+ Accumulate instances data for one image
+
+ Args:
+ instances_one_image (Instances): instances data to accumulate
+ """
+ pass
+
+ @abstractmethod
+ def pack(self) -> Any:
+ """
+ Pack data into tensors
+ """
+ pass
+
+
+@dataclass
+class PackedChartBasedAnnotations:
+ """
+ Packed annotations for chart-based model training. The following attributes
+ are defined:
+ - fine_segm_labels_gt (tensor [K] of `int64`): GT fine segmentation point labels
+ - x_gt (tensor [K] of `float32`): GT normalized X point coordinates
+ - y_gt (tensor [K] of `float32`): GT normalized Y point coordinates
+ - u_gt (tensor [K] of `float32`): GT point U values
+ - v_gt (tensor [K] of `float32`): GT point V values
+ - coarse_segm_gt (tensor [N, S, S] of `float32`): GT segmentation for bounding boxes
+ - bbox_xywh_gt (tensor [N, 4] of `float32`): selected GT bounding boxes in
+ XYWH format
+ - bbox_xywh_est (tensor [N, 4] of `float32`): selected matching estimated
+ bounding boxes in XYWH format
+ - point_bbox_with_dp_indices (tensor [K] of `int64`): indices of bounding boxes
+ with DensePose annotations that correspond to the point data
+ - point_bbox_indices (tensor [K] of `int64`): indices of bounding boxes
+ (not necessarily the selected ones with DensePose data) that correspond
+ to the point data
+ - bbox_indices (tensor [N] of `int64`): global indices of selected bounding
+ boxes with DensePose annotations; these indices could be used to access
+ features that are computed for all bounding boxes, not only the ones with
+ DensePose annotations.
+ Here K is the total number of points and N is the total number of instances
+ with DensePose annotations.
+ """
+
+ fine_segm_labels_gt: torch.Tensor
+ x_gt: torch.Tensor
+ y_gt: torch.Tensor
+ u_gt: torch.Tensor
+ v_gt: torch.Tensor
+ coarse_segm_gt: Optional[torch.Tensor]
+ bbox_xywh_gt: torch.Tensor
+ bbox_xywh_est: torch.Tensor
+ point_bbox_with_dp_indices: torch.Tensor
+ point_bbox_indices: torch.Tensor
+ bbox_indices: torch.Tensor
+
+
+class ChartBasedAnnotationsAccumulator(AnnotationsAccumulator):
+ """
+ Accumulates annotations by batches that correspond to objects detected on
+ individual images. Can pack them together into single tensors.
+ """
+
+ def __init__(self):
+ self.i_gt = []
+ self.x_gt = []
+ self.y_gt = []
+ self.u_gt = []
+ self.v_gt = []
+ self.s_gt = []
+ self.bbox_xywh_gt = []
+ self.bbox_xywh_est = []
+ self.point_bbox_with_dp_indices = []
+ self.point_bbox_indices = []
+ self.bbox_indices = []
+ self.nxt_bbox_with_dp_index = 0
+ self.nxt_bbox_index = 0
+
+ def accumulate(self, instances_one_image: Instances):
+ """
+ Accumulate instances data for one image
+
+ Args:
+ instances_one_image (Instances): instances data to accumulate
+ """
+ boxes_xywh_est = BoxMode.convert(
+ instances_one_image.proposal_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+ )
+ boxes_xywh_gt = BoxMode.convert(
+ instances_one_image.gt_boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS
+ )
+ n_matches = len(boxes_xywh_gt)
+ assert n_matches == len(
+ boxes_xywh_est
+ ), f"Got {len(boxes_xywh_est)} proposal boxes and {len(boxes_xywh_gt)} GT boxes"
+ if not n_matches:
+ # no matches between detections and GT
+ return
+ if (
+ not hasattr(instances_one_image, "gt_densepose")
+ or instances_one_image.gt_densepose is None
+ ):
+ # no densepose GT for the detections, just increase the bbox index
+ self.nxt_bbox_index += n_matches
+ return
+ for box_xywh_est, box_xywh_gt, dp_gt in zip(
+ boxes_xywh_est, boxes_xywh_gt, instances_one_image.gt_densepose
+ ):
+ if (dp_gt is not None) and (len(dp_gt.x) > 0):
+ # pyre-fixme[6]: For 1st argument expected `Tensor` but got `float`.
+ # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `float`.
+ self._do_accumulate(box_xywh_gt, box_xywh_est, dp_gt)
+ self.nxt_bbox_index += 1
+
+ def _do_accumulate(
+ self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: DensePoseDataRelative
+ ):
+ """
+ Accumulate instances data for one image, given that the data is not empty
+
+ Args:
+ box_xywh_gt (tensor): GT bounding box
+ box_xywh_est (tensor): estimated bounding box
+ dp_gt (DensePoseDataRelative): GT densepose data
+ """
+ self.i_gt.append(dp_gt.i)
+ self.x_gt.append(dp_gt.x)
+ self.y_gt.append(dp_gt.y)
+ self.u_gt.append(dp_gt.u)
+ self.v_gt.append(dp_gt.v)
+ if hasattr(dp_gt, "segm"):
+ self.s_gt.append(dp_gt.segm.unsqueeze(0))
+ self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4))
+ self.bbox_xywh_est.append(box_xywh_est.view(-1, 4))
+ self.point_bbox_with_dp_indices.append(
+ torch.full_like(dp_gt.i, self.nxt_bbox_with_dp_index)
+ )
+ self.point_bbox_indices.append(torch.full_like(dp_gt.i, self.nxt_bbox_index))
+ self.bbox_indices.append(self.nxt_bbox_index)
+ self.nxt_bbox_with_dp_index += 1
+
+ def pack(self) -> Optional[PackedChartBasedAnnotations]:
+ """
+ Pack data into tensors
+ """
+ if not len(self.i_gt):
+ # TODO:
+ # returning proper empty annotations would require
+ # creating empty tensors of appropriate shape and
+ # type on an appropriate device;
+ # we return None so far to indicate empty annotations
+ return None
+ return PackedChartBasedAnnotations(
+ fine_segm_labels_gt=torch.cat(self.i_gt, 0).long(),
+ x_gt=torch.cat(self.x_gt, 0),
+ y_gt=torch.cat(self.y_gt, 0),
+ u_gt=torch.cat(self.u_gt, 0),
+ v_gt=torch.cat(self.v_gt, 0),
+ # ignore segmentation annotations, if not all the instances contain those
+ coarse_segm_gt=(
+ torch.cat(self.s_gt, 0) if len(self.s_gt) == len(self.bbox_xywh_gt) else None
+ ),
+ bbox_xywh_gt=torch.cat(self.bbox_xywh_gt, 0),
+ bbox_xywh_est=torch.cat(self.bbox_xywh_est, 0),
+ point_bbox_with_dp_indices=torch.cat(self.point_bbox_with_dp_indices, 0).long(),
+ point_bbox_indices=torch.cat(self.point_bbox_indices, 0).long(),
+ bbox_indices=torch.as_tensor(
+ self.bbox_indices, dtype=torch.long, device=self.x_gt[0].device
+ ).long(),
+ )
+
+
+def extract_packed_annotations_from_matches(
+ proposals_with_targets: List[Instances], accumulator: AnnotationsAccumulator
+) -> Any:
+ for proposals_targets_per_image in proposals_with_targets:
+ accumulator.accumulate(proposals_targets_per_image)
+ return accumulator.pack()
+
+
+def sample_random_indices(
+ n_indices: int, n_samples: int, device: Optional[torch.device] = None
+) -> Optional[torch.Tensor]:
+ """
+ Samples `n_samples` random indices from range `[0..n_indices - 1]`.
+ If `n_samples` is non-positive or `n_indices` does not exceed `n_samples`,
+ returns `None`, meaning that all indices are selected.
+ Args:
+ n_indices (int): total number of indices
+ n_samples (int): number of indices to sample
+ device (torch.device): the desired device of returned tensor
+ Return:
+ Tensor of selected vertex indices, or `None`, if all vertices are selected
+ """
+ if (n_samples <= 0) or (n_indices <= n_samples):
+ return None
+ indices = torch.randperm(n_indices, device=device)[:n_samples]
+ return indices
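+
+# Illustrative usage (editorial note, not part of the upstream file):
+#   sample_random_indices(100, 10) -> tensor of 10 distinct indices in [0, 99]
+#   sample_random_indices(5, 8)    -> None (n_indices <= n_samples: keep all indices)
+#   sample_random_indices(5, 0)    -> None (non-positive sample count: keep all indices)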
diff --git a/densepose/modeling/predictors/__init__.py b/densepose/modeling/predictors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c749ea264690d0b4c85abc520e7476bc4365175d
--- /dev/null
+++ b/densepose/modeling/predictors/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .chart import DensePoseChartPredictor
+from .chart_confidence import DensePoseChartConfidencePredictorMixin
+from .chart_with_confidence import DensePoseChartWithConfidencePredictor
+from .cse import DensePoseEmbeddingPredictor
+from .cse_confidence import DensePoseEmbeddingConfidencePredictorMixin
+from .cse_with_confidence import DensePoseEmbeddingWithConfidencePredictor
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
diff --git a/densepose/modeling/predictors/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49bf3176cb5463f326249cb7bef5a624c878b6f6
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/chart.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/chart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9cf846e5b9f98daafadd44693d1b66516d11e03
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/chart.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/chart_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/chart_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09bafc804749b612f6473339a0689f69ffe605ea
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/chart_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/chart_with_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/chart_with_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0405d8e1e531989ce76f7210f8edf191f3eba2f
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/chart_with_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/cse.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/cse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..276180023c4b103763674faeb0575f1b2fb7e010
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/cse.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/cse_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/cse_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ace37119c22ef8933bd8c42fd62ce5442a9af1e7
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/cse_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/cse_with_confidence.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/cse_with_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa6f8a1d7721b2502fb97fecfec63e3d1d4b4e8b
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/cse_with_confidence.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/__pycache__/registry.cpython-39.pyc b/densepose/modeling/predictors/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b40ea844c3f65ff969c2162fe021d2d9b680fab
Binary files /dev/null and b/densepose/modeling/predictors/__pycache__/registry.cpython-39.pyc differ
diff --git a/densepose/modeling/predictors/chart.py b/densepose/modeling/predictors/chart.py
new file mode 100644
index 0000000000000000000000000000000000000000..67fc401d70fe5e7d7baec3530d435955d4a23f7c
--- /dev/null
+++ b/densepose/modeling/predictors/chart.py
@@ -0,0 +1,96 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import torch
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d, interpolate
+
+from ...structures import DensePoseChartPredictorOutput
+from ..utils import initialize_module_params
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseChartPredictor(nn.Module):
+ """
+ Predictor (last layers of a DensePose model) that takes DensePose head outputs as an input
+ and produces 4 tensors which represent DensePose results for predefined body parts
+ (patches / charts):
+ * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
+ * fine segmentation, a tensor of shape [N, C, Hout, Wout]
+ * U coordinates, a tensor of shape [N, C, Hout, Wout]
+ * V coordinates, a tensor of shape [N, C, Hout, Wout]
+ where
+ - N is the number of instances
+ - K is the number of coarse segmentation channels (
+ 2 = foreground / background,
+ 15 = one of 14 body parts / background)
+ - C is the number of fine segmentation channels (
+ 24 fine body parts / background)
+ - Hout and Wout are height and width of predictions
+ """
+
+ def __init__(self, cfg: CfgNode, input_channels: int):
+ """
+ Initialize predictor using configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): input tensor size along the channel dimension
+ """
+ super().__init__()
+ dim_in = input_channels
+ n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+ dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+ kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+ # coarse segmentation
+ self.ann_index_lowres = ConvTranspose2d(
+ dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ # fine segmentation
+ self.index_uv_lowres = ConvTranspose2d(
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ # U
+ self.u_lowres = ConvTranspose2d(
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ # V
+ self.v_lowres = ConvTranspose2d(
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
+ initialize_module_params(self)
+
+ def interp2d(self, tensor_nchw: torch.Tensor):
+ """
+ Bilinear interpolation method to be used for upscaling
+
+ Args:
+ tensor_nchw (tensor): tensor of shape (N, C, H, W)
+ Return:
+ tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
+ by applying the scale factor to H and W
+ """
+ return interpolate(
+ tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
+ )
+
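+    # Editorial note: with typical config values (DECONV_KERNEL = 4, UP_SCALE = 2) each
+    # lowres head above is upsampled 2x by its stride-2 transposed convolution and a
+    # further 2x by `interp2d`, so `forward` below returns predictions at 4x the spatial
+    # size of `head_outputs`.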
+ def forward(self, head_outputs: torch.Tensor):
+ """
+ Perform forward step on DensePose head outputs
+
+ Args:
+ head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
+ Return:
+ An instance of DensePoseChartPredictorOutput
+ """
+ return DensePoseChartPredictorOutput(
+ coarse_segm=self.interp2d(self.ann_index_lowres(head_outputs)),
+ fine_segm=self.interp2d(self.index_uv_lowres(head_outputs)),
+ u=self.interp2d(self.u_lowres(head_outputs)),
+ v=self.interp2d(self.v_lowres(head_outputs)),
+ )
diff --git a/densepose/modeling/predictors/chart_confidence.py b/densepose/modeling/predictors/chart_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2220efa3a8c48e8f86bb4d1d11b3643c3cd6157
--- /dev/null
+++ b/densepose/modeling/predictors/chart_confidence.py
@@ -0,0 +1,176 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d
+
+from ...structures import decorate_predictor_output_class_with_confidences
+from ..confidence import DensePoseConfidenceModelConfig, DensePoseUVConfidenceType
+from ..utils import initialize_module_params
+
+
+class DensePoseChartConfidencePredictorMixin:
+ """
+ Predictor contains the last layers of a DensePose model that take DensePose head
+ outputs as an input and produce model outputs. Confidence predictor mixin is used
+ to generate confidences for segmentation and UV tensors estimated by some
+ base predictor. Several assumptions need to hold for the base predictor:
+ 1) the `forward` method must return SIUV tuple as the first result (
+ S = coarse segmentation, I = fine segmentation, U and V are intrinsic
+ chart coordinates)
+ 2) `interp2d` method must be defined to perform bilinear interpolation;
+ the same method is typically used for SIUV and confidences
+ Confidence predictor mixin provides confidence estimates, as described in:
+ N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
+ from Noisy Labels, NeurIPS 2019
+ A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+ """
+
+ def __init__(self, cfg: CfgNode, input_channels: int):
+ """
+ Initialize confidence predictor using configuration options.
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): number of input channels
+ """
+ # we rely on base predictor to call nn.Module.__init__
+ super().__init__(cfg, input_channels) # pyre-ignore[19]
+ self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+ self._initialize_confidence_estimation_layers(cfg, input_channels)
+ self._registry = {}
+ initialize_module_params(self) # pyre-ignore[6]
+
+ def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
+ """
+ Initialize confidence estimation layers based on configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ dim_in (int): number of input channels
+ """
+ dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
+ kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+ if self.confidence_model_cfg.uv_confidence.enabled:
+ if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+ self.sigma_2_lowres = ConvTranspose2d( # pyre-ignore[16]
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ elif (
+ self.confidence_model_cfg.uv_confidence.type
+ == DensePoseUVConfidenceType.INDEP_ANISO
+ ):
+ self.sigma_2_lowres = ConvTranspose2d(
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ self.kappa_u_lowres = ConvTranspose2d( # pyre-ignore[16]
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ self.kappa_v_lowres = ConvTranspose2d( # pyre-ignore[16]
+ dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ else:
+ raise ValueError(
+ f"Unknown confidence model type: "
+ f"{self.confidence_model_cfg.confidence_model_type}"
+ )
+ if self.confidence_model_cfg.segm_confidence.enabled:
+ self.fine_segm_confidence_lowres = ConvTranspose2d( # pyre-ignore[16]
+ dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ self.coarse_segm_confidence_lowres = ConvTranspose2d( # pyre-ignore[16]
+ dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+
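+    # Editorial summary: depending on the config, the layers created above are
+    # `sigma_2_lowres` (iid_iso), plus `kappa_u_lowres`/`kappa_v_lowres` (indep_aniso),
+    # and single-channel `fine_segm_confidence_lowres`/`coarse_segm_confidence_lowres`
+    # when segmentation confidence is enabled; `forward` below upsamples them with the
+    # base predictor's `interp2d` and passes the segmentation ones through softplus.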
+ def forward(self, head_outputs: torch.Tensor):
+ """
+ Perform forward operation on head outputs used as inputs for the predictor.
+ Calls forward method from the base predictor and uses its outputs to compute
+ confidences.
+
+ Args:
+ head_outputs (Tensor): head outputs used as predictor inputs
+ Return:
+ An instance of outputs with confidences,
+ see `decorate_predictor_output_class_with_confidences`
+ """
+ # assuming base class returns SIUV estimates in its first result
+ base_predictor_outputs = super().forward(head_outputs) # pyre-ignore[16]
+
+ # create output instance by extending base predictor outputs:
+ output = self._create_output_instance(base_predictor_outputs)
+
+ if self.confidence_model_cfg.uv_confidence.enabled:
+ if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
+ # assuming base class defines interp2d method for bilinear interpolation
+ output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs)) # pyre-ignore[16]
+ elif (
+ self.confidence_model_cfg.uv_confidence.type
+ == DensePoseUVConfidenceType.INDEP_ANISO
+ ):
+ # assuming base class defines interp2d method for bilinear interpolation
+ output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))
+ output.kappa_u = self.interp2d(self.kappa_u_lowres(head_outputs)) # pyre-ignore[16]
+ output.kappa_v = self.interp2d(self.kappa_v_lowres(head_outputs)) # pyre-ignore[16]
+ else:
+ raise ValueError(
+ f"Unknown confidence model type: "
+ f"{self.confidence_model_cfg.confidence_model_type}"
+ )
+ if self.confidence_model_cfg.segm_confidence.enabled:
+ # base predictor outputs are assumed to have `fine_segm` and `coarse_segm` attributes
+ # base predictor is assumed to define `interp2d` method for bilinear interpolation
+ output.fine_segm_confidence = (
+ F.softplus(
+ self.interp2d(self.fine_segm_confidence_lowres(head_outputs)) # pyre-ignore[16]
+ )
+ + self.confidence_model_cfg.segm_confidence.epsilon
+ )
+ output.fine_segm = base_predictor_outputs.fine_segm * torch.repeat_interleave(
+ output.fine_segm_confidence, base_predictor_outputs.fine_segm.shape[1], dim=1
+ )
+ output.coarse_segm_confidence = (
+ F.softplus(
+ self.interp2d(
+ self.coarse_segm_confidence_lowres(head_outputs) # pyre-ignore[16]
+ )
+ )
+ + self.confidence_model_cfg.segm_confidence.epsilon
+ )
+ output.coarse_segm = base_predictor_outputs.coarse_segm * torch.repeat_interleave(
+ output.coarse_segm_confidence, base_predictor_outputs.coarse_segm.shape[1], dim=1
+ )
+
+ return output
+
+ def _create_output_instance(self, base_predictor_outputs: Any):
+ """
+ Create an instance of predictor outputs by copying the outputs from the
+ base predictor and initializing confidence
+
+ Args:
+ base_predictor_outputs: an instance of base predictor outputs
+ (the outputs type is assumed to be a dataclass)
+ Return:
+ An instance of outputs with confidences
+ """
+ PredictorOutput = decorate_predictor_output_class_with_confidences(
+ type(base_predictor_outputs) # pyre-ignore[6]
+ )
+ # base_predictor_outputs is assumed to be a dataclass
+ # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
+ output = PredictorOutput(
+ **base_predictor_outputs.__dict__,
+ coarse_segm_confidence=None,
+ fine_segm_confidence=None,
+ sigma_1=None,
+ sigma_2=None,
+ kappa_u=None,
+ kappa_v=None,
+ )
+ return output
diff --git a/densepose/modeling/predictors/chart_with_confidence.py b/densepose/modeling/predictors/chart_with_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..902032c77c65408e0268077f776bd957e80091a1
--- /dev/null
+++ b/densepose/modeling/predictors/chart_with_confidence.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from . import DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseChartWithConfidencePredictor(
+ DensePoseChartConfidencePredictorMixin, DensePoseChartPredictor
+):
+ """
+ Predictor that combines chart and chart confidence estimation
+ """
+
+ pass
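+    # Editorial note: the mixin is listed first so that, under Python's MRO, its
+    # `__init__` and `forward` wrap those of `DensePoseChartPredictor`, adding
+    # confidence channels on top of the base SIUV outputs.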
diff --git a/densepose/modeling/predictors/cse.py b/densepose/modeling/predictors/cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..8494b7975bab1f64e704c4d7c6bdcca4a43ba817
--- /dev/null
+++ b/densepose/modeling/predictors/cse.py
@@ -0,0 +1,72 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import torch
+from torch import nn
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d, interpolate
+
+from ...structures import DensePoseEmbeddingPredictorOutput
+from ..utils import initialize_module_params
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseEmbeddingPredictor(nn.Module):
+ """
+ Last layers of a DensePose model that take DensePose head outputs as an input
+ and produce model outputs for continuous surface embeddings (CSE).
+ """
+
+ def __init__(self, cfg: CfgNode, input_channels: int):
+ """
+ Initialize predictor using configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): input tensor size along the channel dimension
+ """
+ super().__init__()
+ dim_in = input_channels
+ n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+ embed_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+ kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+ # coarse segmentation
+ self.coarse_segm_lowres = ConvTranspose2d(
+ dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ # embedding
+ self.embed_lowres = ConvTranspose2d(
+ dim_in, embed_size, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+ self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE
+ initialize_module_params(self)
+
+ def interp2d(self, tensor_nchw: torch.Tensor):
+ """
+ Bilinear interpolation method to be used for upscaling
+
+ Args:
+ tensor_nchw (tensor): tensor of shape (N, C, H, W)
+ Return:
+ tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
+ by applying the scale factor to H and W
+ """
+ return interpolate(
+ tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
+ )
+
+ def forward(self, head_outputs):
+ """
+ Perform forward step on DensePose head outputs
+
+ Args:
+ head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
+ """
+ embed_lowres = self.embed_lowres(head_outputs)
+ coarse_segm_lowres = self.coarse_segm_lowres(head_outputs)
+ embed = self.interp2d(embed_lowres)
+ coarse_segm = self.interp2d(coarse_segm_lowres)
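+        # embed: [N, CSE.EMBED_SIZE, Hout, Wout]; coarse_segm: [N, NUM_COARSE_SEGM_CHANNELS,
+        # Hout, Wout] -- both upsampled from the lowres heads by `interp2d`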
+ return DensePoseEmbeddingPredictorOutput(embedding=embed, coarse_segm=coarse_segm)
diff --git a/densepose/modeling/predictors/cse_confidence.py b/densepose/modeling/predictors/cse_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d19b354fa14eb5f79e584c090f2bc0cb4d28c5f
--- /dev/null
+++ b/densepose/modeling/predictors/cse_confidence.py
@@ -0,0 +1,117 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from typing import Any
+import torch
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import ConvTranspose2d
+
+from densepose.modeling.confidence import DensePoseConfidenceModelConfig
+from densepose.modeling.utils import initialize_module_params
+from densepose.structures import decorate_cse_predictor_output_class_with_confidences
+
+
+class DensePoseEmbeddingConfidencePredictorMixin:
+ """
+ Predictor contains the last layers of a DensePose model that take DensePose head
+ outputs as an input and produce model outputs. Confidence predictor mixin is used
+ to generate confidences for coarse segmentation estimated by some
+ base predictor. Several assumptions need to hold for the base predictor:
+ 1) the `forward` method must return CSE DensePose head outputs,
+ tensor of shape [N, D, H, W]
+ 2) `interp2d` method must be defined to perform bilinear interpolation;
+ the same method is typically used for masks and confidences
+ Confidence predictor mixin provides confidence estimates, as described in:
+ N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
+ from Noisy Labels, NeurIPS 2019
+ A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+ """
+
+ def __init__(self, cfg: CfgNode, input_channels: int):
+ """
+ Initialize confidence predictor using configuration options.
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): number of input channels
+ """
+ # we rely on base predictor to call nn.Module.__init__
+ super().__init__(cfg, input_channels) # pyre-ignore[19]
+ self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
+ self._initialize_confidence_estimation_layers(cfg, input_channels)
+ self._registry = {}
+ initialize_module_params(self) # pyre-ignore[6]
+
+ def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
+ """
+ Initialize confidence estimation layers based on configuration options
+
+ Args:
+ cfg (CfgNode): configuration options
+ dim_in (int): number of input channels
+ """
+ kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
+ if self.confidence_model_cfg.segm_confidence.enabled:
+ self.coarse_segm_confidence_lowres = ConvTranspose2d( # pyre-ignore[16]
+ dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
+ )
+
+ def forward(self, head_outputs: torch.Tensor):
+ """
+ Perform forward operation on head outputs used as inputs for the predictor.
+ Calls forward method from the base predictor and uses its outputs to compute
+ confidences.
+
+ Args:
+ head_outputs (Tensor): head outputs used as predictor inputs
+ Return:
+ An instance of outputs with confidences,
+ see `decorate_cse_predictor_output_class_with_confidences`
+ """
+        # assuming base class returns CSE predictor outputs (embeddings + coarse segmentation)
+ base_predictor_outputs = super().forward(head_outputs) # pyre-ignore[16]
+
+ # create output instance by extending base predictor outputs:
+ output = self._create_output_instance(base_predictor_outputs)
+
+ if self.confidence_model_cfg.segm_confidence.enabled:
+ # base predictor outputs are assumed to have `coarse_segm` attribute
+ # base predictor is assumed to define `interp2d` method for bilinear interpolation
+ output.coarse_segm_confidence = (
+ F.softplus(
+ self.interp2d( # pyre-ignore[16]
+ self.coarse_segm_confidence_lowres(head_outputs) # pyre-ignore[16]
+ )
+ )
+ + self.confidence_model_cfg.segm_confidence.epsilon
+ )
+ output.coarse_segm = base_predictor_outputs.coarse_segm * torch.repeat_interleave(
+ output.coarse_segm_confidence, base_predictor_outputs.coarse_segm.shape[1], dim=1
+ )
+
+ return output
+
+ def _create_output_instance(self, base_predictor_outputs: Any):
+ """
+ Create an instance of predictor outputs by copying the outputs from the
+ base predictor and initializing confidence
+
+ Args:
+ base_predictor_outputs: an instance of base predictor outputs
+ (the outputs type is assumed to be a dataclass)
+ Return:
+ An instance of outputs with confidences
+ """
+ PredictorOutput = decorate_cse_predictor_output_class_with_confidences(
+ type(base_predictor_outputs) # pyre-ignore[6]
+ )
+ # base_predictor_outputs is assumed to be a dataclass
+ # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
+ output = PredictorOutput(
+ **base_predictor_outputs.__dict__,
+ coarse_segm_confidence=None,
+ )
+ return output
diff --git a/densepose/modeling/predictors/cse_with_confidence.py b/densepose/modeling/predictors/cse_with_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..02389dbcbe734c89e6eb86757d877c9657fd12b1
--- /dev/null
+++ b/densepose/modeling/predictors/cse_with_confidence.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from . import DensePoseEmbeddingConfidencePredictorMixin, DensePoseEmbeddingPredictor
+from .registry import DENSEPOSE_PREDICTOR_REGISTRY
+
+
+@DENSEPOSE_PREDICTOR_REGISTRY.register()
+class DensePoseEmbeddingWithConfidencePredictor(
+ DensePoseEmbeddingConfidencePredictorMixin, DensePoseEmbeddingPredictor
+):
+ """
+ Predictor that combines CSE and CSE confidence estimation
+ """
+
+ pass
diff --git a/densepose/modeling/predictors/registry.py b/densepose/modeling/predictors/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..c883ba3538e8d8e5b11c68811fdf1990a2964a71
--- /dev/null
+++ b/densepose/modeling/predictors/registry.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from detectron2.utils.registry import Registry
+
+DENSEPOSE_PREDICTOR_REGISTRY = Registry("DENSEPOSE_PREDICTOR")
diff --git a/densepose/modeling/roi_heads/__init__.py b/densepose/modeling/roi_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a055a65454517876107c621ba53e4742fa5eb54
--- /dev/null
+++ b/densepose/modeling/roi_heads/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .v1convx import DensePoseV1ConvXHead
+from .deeplab import DensePoseDeepLabHead
+from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
+from .roi_head import Decoder, DensePoseROIHeads
diff --git a/densepose/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..987d23c7c539e2deb35c7c0c0dc2116151c5856e
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5197471220a5e71b609f73fe171249e91e34201
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/deeplab.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/registry.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc43717ab47c4470f2044a68ade8f1aa4ad265b0
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/registry.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7c9d77481b049d645ec7e4d5998a83a3557a72a
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/roi_head.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-39.pyc b/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..264b30c248d452b1c141da59c32b3b4787428a71
Binary files /dev/null and b/densepose/modeling/roi_heads/__pycache__/v1convx.cpython-39.pyc differ
diff --git a/densepose/modeling/roi_heads/deeplab.py b/densepose/modeling/roi_heads/deeplab.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f42d20681a34b319c15967548839ffffa77c89a
--- /dev/null
+++ b/densepose/modeling/roi_heads/deeplab.py
@@ -0,0 +1,265 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import fvcore.nn.weight_init as weight_init
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import Conv2d
+
+from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseDeepLabHead(nn.Module):
+ """
+ DensePose head using DeepLabV3 model from
+ "Rethinking Atrous Convolution for Semantic Image Segmentation"
+    (https://arxiv.org/abs/1706.05587).
+ """
+
+ def __init__(self, cfg: CfgNode, input_channels: int):
+ super(DensePoseDeepLabHead, self).__init__()
+ # fmt: off
+ hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+ kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+ norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM
+ self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+ self.use_nonlocal = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON
+ # fmt: on
+ pad_size = kernel_size // 2
+ n_channels = input_channels
+
+ self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels) # 6, 12, 56
+ self.add_module("ASPP", self.ASPP)
+
+ if self.use_nonlocal:
+ self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True)
+ self.add_module("NLBlock", self.NLBlock)
+ # weight_init.c2_msra_fill(self.ASPP)
+
+ for i in range(self.n_stacked_convs):
+ norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None
+ layer = Conv2d(
+ n_channels,
+ hidden_dim,
+ kernel_size,
+ stride=1,
+ padding=pad_size,
+ bias=not norm,
+ norm=norm_module,
+ )
+ weight_init.c2_msra_fill(layer)
+ n_channels = hidden_dim
+ layer_name = self._get_layer_name(i)
+ self.add_module(layer_name, layer)
+ self.n_out_channels = hidden_dim
+ # initialize_module_params(self)
+
+ def forward(self, features):
+ x0 = features
+ x = self.ASPP(x0)
+ if self.use_nonlocal:
+ x = self.NLBlock(x)
+ output = x
+ for i in range(self.n_stacked_convs):
+ layer_name = self._get_layer_name(i)
+ x = getattr(self, layer_name)(x)
+ x = F.relu(x)
+ output = x
+ return output
+
+ def _get_layer_name(self, i: int):
+ layer_name = "body_conv_fcn{}".format(i + 1)
+ return layer_name
+
+
+# Copied from
+# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py
+# See https://arxiv.org/pdf/1706.05587.pdf for details
+class ASPPConv(nn.Sequential):
+ def __init__(self, in_channels, out_channels, dilation):
+ modules = [
+ nn.Conv2d(
+ in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False
+ ),
+ nn.GroupNorm(32, out_channels),
+ nn.ReLU(),
+ ]
+ super(ASPPConv, self).__init__(*modules)
+
+
+class ASPPPooling(nn.Sequential):
+ def __init__(self, in_channels, out_channels):
+ super(ASPPPooling, self).__init__(
+ nn.AdaptiveAvgPool2d(1),
+ nn.Conv2d(in_channels, out_channels, 1, bias=False),
+ nn.GroupNorm(32, out_channels),
+ nn.ReLU(),
+ )
+
+ def forward(self, x):
+ size = x.shape[-2:]
+ x = super(ASPPPooling, self).forward(x)
+ return F.interpolate(x, size=size, mode="bilinear", align_corners=False)
+
+
+class ASPP(nn.Module):
+ def __init__(self, in_channels, atrous_rates, out_channels):
+ super(ASPP, self).__init__()
+ modules = []
+ modules.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, out_channels, 1, bias=False),
+ nn.GroupNorm(32, out_channels),
+ nn.ReLU(),
+ )
+ )
+
+ rate1, rate2, rate3 = tuple(atrous_rates)
+ modules.append(ASPPConv(in_channels, out_channels, rate1))
+ modules.append(ASPPConv(in_channels, out_channels, rate2))
+ modules.append(ASPPConv(in_channels, out_channels, rate3))
+ modules.append(ASPPPooling(in_channels, out_channels))
+
+ self.convs = nn.ModuleList(modules)
+
+ self.project = nn.Sequential(
+ nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
+ # nn.BatchNorm2d(out_channels),
+ nn.ReLU(),
+ # nn.Dropout(0.5)
+ )
+
+ def forward(self, x):
+ res = []
+ for conv in self.convs:
+ res.append(conv(x))
+ res = torch.cat(res, dim=1)
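+        # `res` concatenates the five parallel branches (a 1x1 conv, three atrous convs
+        # and global average pooling), hence the 5 * out_channels input to `project`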
+ return self.project(res)
+
+
+# copied from
+# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
+# See https://arxiv.org/abs/1711.07971 for details
+class _NonLocalBlockND(nn.Module):
+ def __init__(
+ self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
+ ):
+ super(_NonLocalBlockND, self).__init__()
+
+ assert dimension in [1, 2, 3]
+
+ self.dimension = dimension
+ self.sub_sample = sub_sample
+
+ self.in_channels = in_channels
+ self.inter_channels = inter_channels
+
+ if self.inter_channels is None:
+ self.inter_channels = in_channels // 2
+ if self.inter_channels == 0:
+ self.inter_channels = 1
+
+ if dimension == 3:
+ conv_nd = nn.Conv3d
+ max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
+            bn = nn.GroupNorm  # used in place of nn.BatchNorm3d; instantiated as bn(32, in_channels) below
+ elif dimension == 2:
+ conv_nd = nn.Conv2d
+ max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
+            bn = nn.GroupNorm  # used in place of nn.BatchNorm2d
+ else:
+ conv_nd = nn.Conv1d
+ max_pool_layer = nn.MaxPool1d(kernel_size=2)
+            bn = nn.GroupNorm  # used in place of nn.BatchNorm1d
+
+ self.g = conv_nd(
+ in_channels=self.in_channels,
+ out_channels=self.inter_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+
+ if bn_layer:
+ self.W = nn.Sequential(
+ conv_nd(
+ in_channels=self.inter_channels,
+ out_channels=self.in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ),
+ bn(32, self.in_channels),
+ )
+ nn.init.constant_(self.W[1].weight, 0)
+ nn.init.constant_(self.W[1].bias, 0)
+ else:
+ self.W = conv_nd(
+ in_channels=self.inter_channels,
+ out_channels=self.in_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+ nn.init.constant_(self.W.weight, 0)
+ nn.init.constant_(self.W.bias, 0)
+
+ self.theta = conv_nd(
+ in_channels=self.in_channels,
+ out_channels=self.inter_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+ self.phi = conv_nd(
+ in_channels=self.in_channels,
+ out_channels=self.inter_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+
+ if sub_sample:
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+
+ def forward(self, x):
+ """
+ :param x: (b, c, t, h, w)
+ :return:
+ """
+
+ batch_size = x.size(0)
+
+ g_x = self.g(x).view(batch_size, self.inter_channels, -1)
+ g_x = g_x.permute(0, 2, 1)
+
+ theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
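+        # theta_x: (b, THW, inter_channels), phi_x: (b, inter_channels, T'H'W'), where the
+        # primed size reflects the optional max-pool sub-sampling of `phi`/`g`; the matmul
+        # below gives pairwise embedded-Gaussian affinities, normalized by the softmax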
+ f = torch.matmul(theta_x, phi_x)
+ f_div_C = F.softmax(f, dim=-1)
+
+ y = torch.matmul(f_div_C, g_x)
+ y = y.permute(0, 2, 1).contiguous()
+ y = y.view(batch_size, self.inter_channels, *x.size()[2:])
+ W_y = self.W(y)
+ z = W_y + x
+
+ return z
+
+
+class NONLocalBlock2D(_NonLocalBlockND):
+ def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
+ super(NONLocalBlock2D, self).__init__(
+ in_channels,
+ inter_channels=inter_channels,
+ dimension=2,
+ sub_sample=sub_sample,
+ bn_layer=bn_layer,
+ )
diff --git a/densepose/modeling/roi_heads/registry.py b/densepose/modeling/roi_heads/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..89514279ffba6a65fc499e03bc0177ed8039482f
--- /dev/null
+++ b/densepose/modeling/roi_heads/registry.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from detectron2.utils.registry import Registry
+
+ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD")
diff --git a/densepose/modeling/roi_heads/roi_head.py b/densepose/modeling/roi_heads/roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b8f869f84aa59a09286c421123b31c9db436ae6
--- /dev/null
+++ b/densepose/modeling/roi_heads/roi_head.py
@@ -0,0 +1,220 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import numpy as np
+from typing import Dict, List, Optional
+import fvcore.nn.weight_init as weight_init
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, ShapeSpec, get_norm
+from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads
+from detectron2.modeling.poolers import ROIPooler
+from detectron2.modeling.roi_heads import select_foreground_proposals
+from detectron2.structures import ImageList, Instances
+
+from .. import (
+ build_densepose_data_filter,
+ build_densepose_embedder,
+ build_densepose_head,
+ build_densepose_losses,
+ build_densepose_predictor,
+ densepose_inference,
+)
+
+
+class Decoder(nn.Module):
+ """
+ A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
+ (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
+    all levels of the FPN into a single output.
+ """
+
+ def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
+ super(Decoder, self).__init__()
+
+ # fmt: off
+ self.in_features = in_features
+ feature_strides = {k: v.stride for k, v in input_shape.items()}
+ feature_channels = {k: v.channels for k, v in input_shape.items()}
+ num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES
+ conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
+ self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE
+ norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM
+ # fmt: on
+
+ self.scale_heads = []
+ for in_feature in self.in_features:
+ head_ops = []
+ head_length = max(
+ 1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride))
+ )
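+            # head_length = number of conv (+ 2x upsample) stages needed to bring this
+            # feature level to the common stride, e.g. stride 32 -> stride 4 takes 3 stages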
+ for k in range(head_length):
+ conv = Conv2d(
+ feature_channels[in_feature] if k == 0 else conv_dims,
+ conv_dims,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=not norm,
+ norm=get_norm(norm, conv_dims),
+ activation=F.relu,
+ )
+ weight_init.c2_msra_fill(conv)
+ head_ops.append(conv)
+ if feature_strides[in_feature] != self.common_stride:
+ head_ops.append(
+ nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+ )
+ self.scale_heads.append(nn.Sequential(*head_ops))
+ self.add_module(in_feature, self.scale_heads[-1])
+ self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
+ weight_init.c2_msra_fill(self.predictor)
+
+ def forward(self, features: List[torch.Tensor]):
+ for i, _ in enumerate(self.in_features):
+ if i == 0:
+ x = self.scale_heads[i](features[i])
+ else:
+ x = x + self.scale_heads[i](features[i])
+ x = self.predictor(x)
+ return x
+
+
+@ROI_HEADS_REGISTRY.register()
+class DensePoseROIHeads(StandardROIHeads):
+ """
+ A Standard ROIHeads which contains an addition of DensePose head.
+ """
+
+ def __init__(self, cfg, input_shape):
+ super().__init__(cfg, input_shape)
+ self._init_densepose_head(cfg, input_shape)
+
+ def _init_densepose_head(self, cfg, input_shape):
+ # fmt: off
+ self.densepose_on = cfg.MODEL.DENSEPOSE_ON
+ if not self.densepose_on:
+ return
+ self.densepose_data_filter = build_densepose_data_filter(cfg)
+ dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION
+ dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO
+ dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE
+ self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
+ # fmt: on
+ if self.use_decoder:
+ dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,)
+ else:
+ dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features)
+ in_channels = [input_shape[f].channels for f in self.in_features][0]
+
+ if self.use_decoder:
+ self.decoder = Decoder(cfg, input_shape, self.in_features)
+
+ self.densepose_pooler = ROIPooler(
+ output_size=dp_pooler_resolution,
+ scales=dp_pooler_scales,
+ sampling_ratio=dp_pooler_sampling_ratio,
+ pooler_type=dp_pooler_type,
+ )
+ self.densepose_head = build_densepose_head(cfg, in_channels)
+ self.densepose_predictor = build_densepose_predictor(
+ cfg, self.densepose_head.n_out_channels
+ )
+ self.densepose_losses = build_densepose_losses(cfg)
+ self.embedder = build_densepose_embedder(cfg)
+
+ def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
+ """
+ Forward logic of the densepose prediction branch.
+
+ Args:
+ features (dict[str, Tensor]): input data as a mapping from feature
+ map name to tensor. Axis 0 represents the number of images `N` in
+ the input data; axes 1-3 are channels, height, and width, which may
+ vary between feature maps (e.g., if a feature pyramid is used).
+ instances (list[Instances]): length `N` list of `Instances`. The i-th
+                `Instances` contains instances for the i-th input image.
+ In training, they can be the proposals.
+ In inference, they can be the predicted boxes.
+
+ Returns:
+ In training, a dict of losses.
+ In inference, update `instances` with new fields "densepose" and return it.
+ """
+ if not self.densepose_on:
+ return {} if self.training else instances
+
+ features_list = [features[f] for f in self.in_features]
+ if self.training:
+ proposals, _ = select_foreground_proposals(instances, self.num_classes)
+ features_list, proposals = self.densepose_data_filter(features_list, proposals)
+ if len(proposals) > 0:
+ proposal_boxes = [x.proposal_boxes for x in proposals]
+
+ if self.use_decoder:
+ features_list = [self.decoder(features_list)]
+
+ features_dp = self.densepose_pooler(features_list, proposal_boxes)
+ densepose_head_outputs = self.densepose_head(features_dp)
+ densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
+ densepose_loss_dict = self.densepose_losses(
+ proposals, densepose_predictor_outputs, embedder=self.embedder
+ )
+ return densepose_loss_dict
+ else:
+ pred_boxes = [x.pred_boxes for x in instances]
+
+ if self.use_decoder:
+ features_list = [self.decoder(features_list)]
+
+ features_dp = self.densepose_pooler(features_list, pred_boxes)
+ if len(features_dp) > 0:
+ densepose_head_outputs = self.densepose_head(features_dp)
+ densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
+ else:
+ densepose_predictor_outputs = None
+
+ densepose_inference(densepose_predictor_outputs, instances)
+ return instances
+
+ def forward(
+ self,
+ images: ImageList,
+ features: Dict[str, torch.Tensor],
+ proposals: List[Instances],
+ targets: Optional[List[Instances]] = None,
+ ):
+ instances, losses = super().forward(images, features, proposals, targets)
+ del targets, images
+
+ if self.training:
+ losses.update(self._forward_densepose(features, instances))
+ return instances, losses
+
+ def forward_with_given_boxes(
+ self, features: Dict[str, torch.Tensor], instances: List[Instances]
+ ):
+ """
+ Use the given boxes in `instances` to produce other (non-box) per-ROI outputs.
+
+        This is useful for downstream tasks where a box is known, but other attributes
+        (outputs of other heads) need to be obtained.
+ Test-time augmentation also uses this.
+
+ Args:
+ features: same as in `forward()`
+ instances (list[Instances]): instances to predict other outputs. Expect the keys
+ "pred_boxes" and "pred_classes" to exist.
+
+ Returns:
+ instances (list[Instances]):
+ the same `Instances` objects, with extra
+ fields such as `pred_masks` or `pred_keypoints`.
+ """
+
+ instances = super().forward_with_given_boxes(features, instances)
+ instances = self._forward_densepose(features, instances)
+ return instances
diff --git a/densepose/modeling/roi_heads/v1convx.py b/densepose/modeling/roi_heads/v1convx.py
new file mode 100644
index 0000000000000000000000000000000000000000..d81c375c5a488af4cb9ab41676d5e6688f740e61
--- /dev/null
+++ b/densepose/modeling/roi_heads/v1convx.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.config import CfgNode
+from detectron2.layers import Conv2d
+
+from ..utils import initialize_module_params
+from .registry import ROI_DENSEPOSE_HEAD_REGISTRY
+
+
+@ROI_DENSEPOSE_HEAD_REGISTRY.register()
+class DensePoseV1ConvXHead(nn.Module):
+ """
+ Fully convolutional DensePose head.
+ """
+
+ def __init__(self, cfg: CfgNode, input_channels: int):
+ """
+ Initialize DensePose fully convolutional head
+
+ Args:
+ cfg (CfgNode): configuration options
+ input_channels (int): number of input channels
+ """
+ super(DensePoseV1ConvXHead, self).__init__()
+ # fmt: off
+ hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM
+ kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL
+ self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS
+ # fmt: on
+ pad_size = kernel_size // 2
+ n_channels = input_channels
+ for i in range(self.n_stacked_convs):
+ layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size)
+ layer_name = self._get_layer_name(i)
+ self.add_module(layer_name, layer)
+ n_channels = hidden_dim
+ self.n_out_channels = n_channels
+ initialize_module_params(self)
+
+ def forward(self, features: torch.Tensor):
+ """
+ Apply DensePose fully convolutional head to the input features
+
+ Args:
+ features (tensor): input features
+ Result:
+ A tensor of DensePose head outputs
+ """
+ x = features
+ output = x
+ for i in range(self.n_stacked_convs):
+ layer_name = self._get_layer_name(i)
+ x = getattr(self, layer_name)(x)
+ x = F.relu(x)
+ output = x
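+            # `output` is overwritten on every iteration, so the head returns the
+            # activation of the last stacked conv (with `self.n_out_channels` channels)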
+ return output
+
+ def _get_layer_name(self, i: int):
+ layer_name = "body_conv_fcn{}".format(i + 1)
+ return layer_name
diff --git a/densepose/modeling/test_time_augmentation.py b/densepose/modeling/test_time_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e4cfa038f2fe3072a2520978ff4408df9bca5b3
--- /dev/null
+++ b/densepose/modeling/test_time_augmentation.py
@@ -0,0 +1,209 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import copy
+import numpy as np
+import torch
+from fvcore.transforms import HFlipTransform, TransformList
+from torch.nn import functional as F
+
+from detectron2.data.transforms import RandomRotation, RotationTransform, apply_transform_gens
+from detectron2.modeling.postprocessing import detector_postprocess
+from detectron2.modeling.test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
+
+from ..converters import HFlipConverter
+
+
+class DensePoseDatasetMapperTTA(DatasetMapperTTA):
+ def __init__(self, cfg):
+ super().__init__(cfg=cfg)
+ self.angles = cfg.TEST.AUG.ROTATION_ANGLES
+
+ def __call__(self, dataset_dict):
+ ret = super().__call__(dataset_dict=dataset_dict)
+ numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy()
+ for angle in self.angles:
+ rotate = RandomRotation(angle=angle, expand=True)
+ new_numpy_image, tfms = apply_transform_gens([rotate], np.copy(numpy_image))
+ torch_image = torch.from_numpy(np.ascontiguousarray(new_numpy_image.transpose(2, 0, 1)))
+ dic = copy.deepcopy(dataset_dict)
+ # In DatasetMapperTTA, there is a pre_tfm transform (resize or no-op) that is
+ # added at the beginning of each TransformList. That's '.transforms[0]'.
+ dic["transforms"] = TransformList(
+ [ret[-1]["transforms"].transforms[0]] + tfms.transforms
+ )
+ dic["image"] = torch_image
+ ret.append(dic)
+ return ret
+
+
+class DensePoseGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA):
+ def __init__(self, cfg, model, transform_data, tta_mapper=None, batch_size=1):
+ """
+ Args:
+ cfg (CfgNode):
+ model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
+ transform_data (DensePoseTransformData): contains symmetry label
+ transforms used for horizontal flip
+ tta_mapper (callable): takes a dataset dict and returns a list of
+ augmented versions of the dataset dict. Defaults to
+ `DatasetMapperTTA(cfg)`.
+ batch_size (int): batch the augmented images into this batch size for inference.
+ """
+ self._transform_data = transform_data.to(model.device)
+ super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size)
+
+ # the implementation follows closely the one from detectron2/modeling
+ def _inference_one_image(self, input):
+ """
+ Args:
+ input (dict): one dataset dict with "image" field being a CHW tensor
+
+ Returns:
+ dict: one output dict
+ """
+ orig_shape = (input["height"], input["width"])
+ # For some reason, resize with uint8 slightly increases box AP but decreases densepose AP
+ input["image"] = input["image"].to(torch.uint8)
+ augmented_inputs, tfms = self._get_augmented_inputs(input)
+ # Detect boxes from all augmented versions
+ with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]):
+ # temporarily disable roi heads
+ all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
+ merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)
+
+ if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON:
+ # Use the detected boxes to obtain new fields
+ augmented_instances = self._rescale_detected_boxes(
+ augmented_inputs, merged_instances, tfms
+ )
+ # run forward on the detected boxes
+ outputs = self._batch_inference(augmented_inputs, augmented_instances)
+ # Delete now useless variables to avoid being out of memory
+ del augmented_inputs, augmented_instances
+ # average the predictions
+ if self.cfg.MODEL.MASK_ON:
+ merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
+ if self.cfg.MODEL.DENSEPOSE_ON:
+ merged_instances.pred_densepose = self._reduce_pred_densepose(outputs, tfms)
+ # postprocess
+ merged_instances = detector_postprocess(merged_instances, *orig_shape)
+ return {"instances": merged_instances}
+ else:
+ return {"instances": merged_instances}
+
+ def _get_augmented_boxes(self, augmented_inputs, tfms):
+ # Heavily based on detectron2/modeling/test_time_augmentation.py
+ # Only difference is that RotationTransform is excluded from bbox computation
+ # 1: forward with all augmented images
+ outputs = self._batch_inference(augmented_inputs)
+ # 2: union the results
+ all_boxes = []
+ all_scores = []
+ all_classes = []
+ for output, tfm in zip(outputs, tfms):
+ # Need to inverse the transforms on boxes, to obtain results on original image
+ if not any(isinstance(t, RotationTransform) for t in tfm.transforms):
+ # Some transforms can't compute bbox correctly
+ pred_boxes = output.pred_boxes.tensor
+ original_pred_boxes = tfm.inverse().apply_box(pred_boxes.cpu().numpy())
+ all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))
+ all_scores.extend(output.scores)
+ all_classes.extend(output.pred_classes)
+ all_boxes = torch.cat(all_boxes, dim=0)
+ return all_boxes, all_scores, all_classes
+
+ def _reduce_pred_densepose(self, outputs, tfms):
+ # Should apply inverse transforms on densepose preds.
+        # We assume only rotation, resize & flip are used. The DensePose predictions are
+        # a scale-invariant (per-box) representation, so resize needs no special handling;
+        # rotation and flip are inverted below.
+ for idx, (output, tfm) in enumerate(zip(outputs, tfms)):
+ for t in tfm.transforms:
+ for attr in ["coarse_segm", "fine_segm", "u", "v"]:
+ setattr(
+ output.pred_densepose,
+ attr,
+ _inverse_rotation(
+ getattr(output.pred_densepose, attr), output.pred_boxes.tensor, t
+ ),
+ )
+ if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
+ output.pred_densepose = HFlipConverter.convert(
+ output.pred_densepose, self._transform_data
+ )
+ self._incremental_avg_dp(outputs[0].pred_densepose, output.pred_densepose, idx)
+ return outputs[0].pred_densepose
+
+ # incrementally computed average: u_(n + 1) = u_n + (x_(n+1) - u_n) / (n + 1).
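+    # Worked example (editorial): idx = 0 gives avg = x_1; idx = 1 gives (x_1 + x_2) / 2;
+    # idx = 2 gives (mean(x_1, x_2) * 2 + x_3) / 3 -- i.e. the running mean of all
+    # predictions seen so far.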
+ def _incremental_avg_dp(self, avg, new_el, idx):
+ for attr in ["coarse_segm", "fine_segm", "u", "v"]:
+ setattr(avg, attr, (getattr(avg, attr) * idx + getattr(new_el, attr)) / (idx + 1))
+ if idx:
+ # Deletion of the > 0 index intermediary values to prevent GPU OOM
+ setattr(new_el, attr, None)
+ return avg
+
+
+def _inverse_rotation(densepose_attrs, boxes, transform):
+ # resample outputs to image size and rotate back the densepose preds
+ # on the rotated images to the space of the original image
+ if len(boxes) == 0 or not isinstance(transform, RotationTransform):
+ return densepose_attrs
+ boxes = boxes.int().cpu().numpy()
+ wh_boxes = boxes[:, 2:] - boxes[:, :2] # bboxes in the rotated space
+ inv_boxes = rotate_box_inverse(transform, boxes).astype(int) # bboxes in original image
+ wh_diff = (inv_boxes[:, 2:] - inv_boxes[:, :2] - wh_boxes) // 2 # diff between new/old bboxes
+ rotation_matrix = torch.tensor([transform.rm_image]).to(device=densepose_attrs.device).float()
+ rotation_matrix[:, :, -1] = 0
+ # To apply grid_sample for rotation, we need to have enough space to fit the original and
+ # rotated bboxes. l_bds and r_bds are the left/right bounds that will be used to
+ # crop the difference once the rotation is done
+ l_bds = np.maximum(0, -wh_diff)
+ for i in range(len(densepose_attrs)):
+ if min(wh_boxes[i]) <= 0:
+ continue
+ densepose_attr = densepose_attrs[[i]].clone()
+ # 1. Interpolate densepose attribute to size of the rotated bbox
+ densepose_attr = F.interpolate(densepose_attr, wh_boxes[i].tolist()[::-1], mode="bilinear")
+ # 2. Pad the interpolated attribute so it has room for the original + rotated bbox
+ densepose_attr = F.pad(densepose_attr, tuple(np.repeat(np.maximum(0, wh_diff[i]), 2)))
+ # 3. Compute rotation grid and transform
+ grid = F.affine_grid(rotation_matrix, size=densepose_attr.shape)
+ densepose_attr = F.grid_sample(densepose_attr, grid)
+ # 4. Compute right bounds and crop the densepose_attr to the size of the original bbox
+ r_bds = densepose_attr.shape[2:][::-1] - l_bds[i]
+ densepose_attr = densepose_attr[:, :, l_bds[i][1] : r_bds[1], l_bds[i][0] : r_bds[0]]
+ if min(densepose_attr.shape) > 0:
+ # Interpolate back to the original size of the densepose attribute
+ densepose_attr = F.interpolate(
+ densepose_attr, densepose_attrs.shape[-2:], mode="bilinear"
+ )
+ # Adding a very small probability to the background class to fill padded zones
+ densepose_attr[:, 0] += 1e-10
+ densepose_attrs[i] = densepose_attr
+ return densepose_attrs
+
+
+def rotate_box_inverse(rot_tfm, rotated_box):
+ """
+    `rotated_box` is an N x 4 array of [x0, y0, x1, y1] boxes.
+    When a bbox is rotated, it grows, because the new axis-aligned box must surround the
+    tilted bbox; so a bbox that is rotated and then inverse-rotated ends up much bigger
+    than the original. This function inverts the rotation on the box and also resizes it
+    to its original size.
+ """
+    # 1. Compute the inverse rotation of the rotated bboxes (bigger than the originals)
+ invrot_box = rot_tfm.inverse().apply_box(rotated_box)
+ h, w = rotated_box[:, 3] - rotated_box[:, 1], rotated_box[:, 2] - rotated_box[:, 0]
+ ih, iw = invrot_box[:, 3] - invrot_box[:, 1], invrot_box[:, 2] - invrot_box[:, 0]
+ assert 2 * rot_tfm.abs_sin**2 != 1, "45 degrees angle can't be inverted"
+ # 2. Inverse the corresponding computation in the rotation transform
+ # to get the original height/width of the rotated boxes
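+    # (editorial derivation) if (H, W) is the original box size, the rotated axis-aligned
+    # box measures h = H * abs_cos + W * abs_sin and w = W * abs_cos + H * abs_sin;
+    # solving this 2x2 system gives the formulas below, whose denominator
+    # 1 - 2 * abs_sin**2 = abs_cos**2 - abs_sin**2 vanishes at 45 degrees (the assert above)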
+ orig_h = (h * rot_tfm.abs_cos - w * rot_tfm.abs_sin) / (1 - 2 * rot_tfm.abs_sin**2)
+ orig_w = (w * rot_tfm.abs_cos - h * rot_tfm.abs_sin) / (1 - 2 * rot_tfm.abs_sin**2)
+ # 3. Resize the inverse-rotated bboxes to their original size
+ invrot_box[:, 0] += (iw - orig_w) / 2
+ invrot_box[:, 1] += (ih - orig_h) / 2
+ invrot_box[:, 2] -= (iw - orig_w) / 2
+ invrot_box[:, 3] -= (ih - orig_h) / 2
+
+ return invrot_box
diff --git a/densepose/modeling/utils.py b/densepose/modeling/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..64f53369b5ae3bc69f064c590e0837583ebc213e
--- /dev/null
+++ b/densepose/modeling/utils.py
@@ -0,0 +1,13 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from torch import nn
+
+
+def initialize_module_params(module: nn.Module) -> None:
+ for name, param in module.named_parameters():
+ if "bias" in name:
+ nn.init.constant_(param, 0)
+ elif "weight" in name:
+ nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
diff --git a/densepose/structures/__init__.py b/densepose/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ee84836219994a54bb1249c90a7d0d6f8b72e8b
--- /dev/null
+++ b/densepose/structures/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from .chart import DensePoseChartPredictorOutput
+from .chart_confidence import decorate_predictor_output_class_with_confidences
+from .cse_confidence import decorate_cse_predictor_output_class_with_confidences
+from .chart_result import (
+ DensePoseChartResult,
+ DensePoseChartResultWithConfidences,
+ quantize_densepose_chart_result,
+ compress_quantized_densepose_chart_result,
+ decompress_compressed_densepose_chart_result,
+)
+from .cse import DensePoseEmbeddingPredictorOutput
+from .data_relative import DensePoseDataRelative
+from .list import DensePoseList
+from .mesh import Mesh, create_mesh
+from .transform_data import DensePoseTransformData, normalized_coords_transform
diff --git a/densepose/structures/__pycache__/__init__.cpython-39.pyc b/densepose/structures/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4014f415bc4f953968367d17acd1e2057668f6e4
Binary files /dev/null and b/densepose/structures/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/chart.cpython-39.pyc b/densepose/structures/__pycache__/chart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10743b5e6630c2a321422b93ecfc45c2d2826267
Binary files /dev/null and b/densepose/structures/__pycache__/chart.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/chart_confidence.cpython-39.pyc b/densepose/structures/__pycache__/chart_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f94586ecddcfb840662e7f20219eff31575ed704
Binary files /dev/null and b/densepose/structures/__pycache__/chart_confidence.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/chart_result.cpython-39.pyc b/densepose/structures/__pycache__/chart_result.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a17b1df1d6266225326e715425e1ed2d0e866974
Binary files /dev/null and b/densepose/structures/__pycache__/chart_result.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/cse.cpython-39.pyc b/densepose/structures/__pycache__/cse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb47655ba11bd3caf3dfd39948fda288a560e121
Binary files /dev/null and b/densepose/structures/__pycache__/cse.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/cse_confidence.cpython-39.pyc b/densepose/structures/__pycache__/cse_confidence.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a270e69545008b6a113dc1e60bce28e2f04cded3
Binary files /dev/null and b/densepose/structures/__pycache__/cse_confidence.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/data_relative.cpython-39.pyc b/densepose/structures/__pycache__/data_relative.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97064160c973fc0cca20a6668bb97c1b60f63132
Binary files /dev/null and b/densepose/structures/__pycache__/data_relative.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/list.cpython-39.pyc b/densepose/structures/__pycache__/list.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41381ad6470111845b23978de8c37c9fc018e6d2
Binary files /dev/null and b/densepose/structures/__pycache__/list.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/mesh.cpython-39.pyc b/densepose/structures/__pycache__/mesh.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1acd657eedcb1e8b6c5f8c96943a27c1343c6d9
Binary files /dev/null and b/densepose/structures/__pycache__/mesh.cpython-39.pyc differ
diff --git a/densepose/structures/__pycache__/transform_data.cpython-39.pyc b/densepose/structures/__pycache__/transform_data.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb7e717c72243246caecb18cb0e917d414ea14b0
Binary files /dev/null and b/densepose/structures/__pycache__/transform_data.cpython-39.pyc differ
diff --git a/densepose/structures/chart.py b/densepose/structures/chart.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f8640ef3dc9ca7e66e1a639e2e23211300dbbac
--- /dev/null
+++ b/densepose/structures/chart.py
@@ -0,0 +1,72 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Union
+import torch
+
+
+@dataclass
+class DensePoseChartPredictorOutput:
+ """
+ Predictor output that contains segmentation and inner coordinates predictions for predefined
+ body parts:
+ * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
+ * fine segmentation, a tensor of shape [N, C, Hout, Wout]
+ * U coordinates, a tensor of shape [N, C, Hout, Wout]
+ * V coordinates, a tensor of shape [N, C, Hout, Wout]
+ where
+ - N is the number of instances
+ - K is the number of coarse segmentation channels (
+ 2 = foreground / background,
+ 15 = one of 14 body parts / background)
+ - C is the number of fine segmentation channels (
+ 24 fine body parts / background)
+ - Hout and Wout are height and width of predictions
+ """
+
+ coarse_segm: torch.Tensor
+ fine_segm: torch.Tensor
+ u: torch.Tensor
+ v: torch.Tensor
+
+ def __len__(self):
+ """
+ Number of instances (N) in the output
+ """
+ return self.coarse_segm.size(0)
+
+ def __getitem__(
+ self, item: Union[int, slice, torch.BoolTensor]
+ ) -> "DensePoseChartPredictorOutput":
+ """
+ Get outputs for the selected instance(s)
+
+ Args:
+ item (int or slice or tensor): selected items
+ """
+ if isinstance(item, int):
+ return DensePoseChartPredictorOutput(
+ coarse_segm=self.coarse_segm[item].unsqueeze(0),
+ fine_segm=self.fine_segm[item].unsqueeze(0),
+ u=self.u[item].unsqueeze(0),
+ v=self.v[item].unsqueeze(0),
+ )
+ else:
+ return DensePoseChartPredictorOutput(
+ coarse_segm=self.coarse_segm[item],
+ fine_segm=self.fine_segm[item],
+ u=self.u[item],
+ v=self.v[item],
+ )
+
+ def to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device
+ """
+ coarse_segm = self.coarse_segm.to(device)
+ fine_segm = self.fine_segm.to(device)
+ u = self.u.to(device)
+ v = self.v.to(device)
+ return DensePoseChartPredictorOutput(coarse_segm=coarse_segm, fine_segm=fine_segm, u=u, v=v)
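
A small sketch of how `DensePoseChartPredictorOutput` is constructed and indexed (tensor sizes are illustrative, not prescribed by the class):

```python
import torch
from densepose.structures import DensePoseChartPredictorOutput

N, K, C, H, W = 4, 2, 25, 112, 112   # instances, coarse channels, fine channels, output size
output = DensePoseChartPredictorOutput(
    coarse_segm=torch.rand(N, K, H, W),
    fine_segm=torch.rand(N, C, H, W),
    u=torch.rand(N, C, H, W),
    v=torch.rand(N, C, H, W),
)
assert len(output) == N
single = output[0]                     # int indexing keeps a leading batch dim of 1
assert single.coarse_segm.shape == (1, K, H, W)
subset = output[torch.tensor([True, False, True, False])]   # boolean mask selection
assert len(subset) == 2
```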
diff --git a/densepose/structures/chart_confidence.py b/densepose/structures/chart_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..faec3a0f161939591a8424058871d50198327b08
--- /dev/null
+++ b/densepose/structures/chart_confidence.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from dataclasses import make_dataclass
+from functools import lru_cache
+from typing import Any, Optional
+import torch
+
+
+@lru_cache(maxsize=None)
+def decorate_predictor_output_class_with_confidences(BasePredictorOutput: type) -> type:
+ """
+ Create a new output class from an existing one by adding new attributes
+ related to confidence estimation:
+ - sigma_1 (tensor)
+ - sigma_2 (tensor)
+ - kappa_u (tensor)
+ - kappa_v (tensor)
+ - fine_segm_confidence (tensor)
+ - coarse_segm_confidence (tensor)
+
+ Details on confidence estimation parameters can be found in:
+ N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+ Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+ A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+
+    The new class inherits from the provided `BasePredictorOutput` class;
+    its name is composed of the name of the provided class and the
+    "WithConfidences" suffix.
+
+ Args:
+ BasePredictorOutput (type): output type to which confidence data
+ is to be added, assumed to be a dataclass
+ Return:
+ New dataclass derived from the provided one that has attributes
+ for confidence estimation
+ """
+
+ PredictorOutput = make_dataclass(
+ BasePredictorOutput.__name__ + "WithConfidences",
+ fields=[
+ ("sigma_1", Optional[torch.Tensor], None),
+ ("sigma_2", Optional[torch.Tensor], None),
+ ("kappa_u", Optional[torch.Tensor], None),
+ ("kappa_v", Optional[torch.Tensor], None),
+ ("fine_segm_confidence", Optional[torch.Tensor], None),
+ ("coarse_segm_confidence", Optional[torch.Tensor], None),
+ ],
+ bases=(BasePredictorOutput,),
+ )
+
+ # add possibility to index PredictorOutput
+
+ def slice_if_not_none(data, item):
+ if data is None:
+ return None
+ if isinstance(item, int):
+ return data[item].unsqueeze(0)
+ return data[item]
+
+ def PredictorOutput_getitem(self, item):
+ PredictorOutput = type(self)
+ base_predictor_output_sliced = super(PredictorOutput, self).__getitem__(item)
+ return PredictorOutput(
+ **base_predictor_output_sliced.__dict__,
+ coarse_segm_confidence=slice_if_not_none(self.coarse_segm_confidence, item),
+ fine_segm_confidence=slice_if_not_none(self.fine_segm_confidence, item),
+ sigma_1=slice_if_not_none(self.sigma_1, item),
+ sigma_2=slice_if_not_none(self.sigma_2, item),
+ kappa_u=slice_if_not_none(self.kappa_u, item),
+ kappa_v=slice_if_not_none(self.kappa_v, item),
+ )
+
+ PredictorOutput.__getitem__ = PredictorOutput_getitem
+
+ def PredictorOutput_to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device
+ """
+ PredictorOutput = type(self)
+ base_predictor_output_to = super(PredictorOutput, self).to(device) # pyre-ignore[16]
+
+ def to_device_if_tensor(var: Any):
+ if isinstance(var, torch.Tensor):
+ return var.to(device)
+ return var
+
+ return PredictorOutput(
+ **base_predictor_output_to.__dict__,
+ sigma_1=to_device_if_tensor(self.sigma_1),
+ sigma_2=to_device_if_tensor(self.sigma_2),
+ kappa_u=to_device_if_tensor(self.kappa_u),
+ kappa_v=to_device_if_tensor(self.kappa_v),
+ fine_segm_confidence=to_device_if_tensor(self.fine_segm_confidence),
+ coarse_segm_confidence=to_device_if_tensor(self.coarse_segm_confidence),
+ )
+
+ PredictorOutput.to = PredictorOutput_to
+ return PredictorOutput
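
A sketch of how the decorator is intended to be applied to a chart predictor output (shapes are illustrative; confidence fields that are not supplied default to `None`):

```python
import torch
from densepose.structures import (
    DensePoseChartPredictorOutput,
    decorate_predictor_output_class_with_confidences,
)

OutputWithConfidences = decorate_predictor_output_class_with_confidences(
    DensePoseChartPredictorOutput
)
N, C, H, W = 2, 25, 56, 56
out = OutputWithConfidences(
    coarse_segm=torch.rand(N, 2, H, W),
    fine_segm=torch.rand(N, C, H, W),
    u=torch.rand(N, C, H, W),
    v=torch.rand(N, C, H, W),
    sigma_2=torch.rand(N, C, H, W),   # remaining confidence fields stay None
)
first = out[0]                        # slicing also slices the confidence tensors
assert first.sigma_2.shape[0] == 1 and first.kappa_u is None
assert type(out).__name__ == "DensePoseChartPredictorOutputWithConfidences"
```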
diff --git a/densepose/structures/chart_result.py b/densepose/structures/chart_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a9e56dee9fb81fd6a6596c524dcd9f2e471af19
--- /dev/null
+++ b/densepose/structures/chart_result.py
@@ -0,0 +1,185 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Any, Optional, Tuple
+import torch
+
+
+@dataclass
+class DensePoseChartResult:
+ """
+ DensePose results for chart-based methods represented by labels and inner
+ coordinates (U, V) of individual charts. Each chart is a 2D manifold
+ that has an associated label and is parameterized by two coordinates U and V.
+ Both U and V take values in [0, 1].
+ Thus the results are represented by two tensors:
+ - labels (tensor [H, W] of long): contains estimated label for each pixel of
+ the detection bounding box of size (H, W)
+ - uv (tensor [2, H, W] of float): contains estimated U and V coordinates
+ for each pixel of the detection bounding box of size (H, W)
+ """
+
+ labels: torch.Tensor
+ uv: torch.Tensor
+
+ def to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device
+ """
+ labels = self.labels.to(device)
+ uv = self.uv.to(device)
+ return DensePoseChartResult(labels=labels, uv=uv)
+
+
+@dataclass
+class DensePoseChartResultWithConfidences:
+ """
+ We add confidence values to DensePoseChartResult
+ Thus the results are represented by two tensors:
+ - labels (tensor [H, W] of long): contains estimated label for each pixel of
+ the detection bounding box of size (H, W)
+ - uv (tensor [2, H, W] of float): contains estimated U and V coordinates
+ for each pixel of the detection bounding box of size (H, W)
+ Plus one [H, W] tensor of float for each confidence type
+ """
+
+ labels: torch.Tensor
+ uv: torch.Tensor
+ sigma_1: Optional[torch.Tensor] = None
+ sigma_2: Optional[torch.Tensor] = None
+ kappa_u: Optional[torch.Tensor] = None
+ kappa_v: Optional[torch.Tensor] = None
+ fine_segm_confidence: Optional[torch.Tensor] = None
+ coarse_segm_confidence: Optional[torch.Tensor] = None
+
+ def to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device, except if their value is None
+ """
+
+ def to_device_if_tensor(var: Any):
+ if isinstance(var, torch.Tensor):
+ return var.to(device)
+ return var
+
+ return DensePoseChartResultWithConfidences(
+ labels=self.labels.to(device),
+ uv=self.uv.to(device),
+ sigma_1=to_device_if_tensor(self.sigma_1),
+ sigma_2=to_device_if_tensor(self.sigma_2),
+ kappa_u=to_device_if_tensor(self.kappa_u),
+ kappa_v=to_device_if_tensor(self.kappa_v),
+ fine_segm_confidence=to_device_if_tensor(self.fine_segm_confidence),
+ coarse_segm_confidence=to_device_if_tensor(self.coarse_segm_confidence),
+ )
+
+
+@dataclass
+class DensePoseChartResultQuantized:
+ """
+ DensePose results for chart-based methods represented by labels and quantized
+ inner coordinates (U, V) of individual charts. Each chart is a 2D manifold
+ that has an associated label and is parameterized by two coordinates U and V.
+ Both U and V take values in [0, 1].
+ Quantized coordinates Uq and Vq have uint8 values which are obtained as:
+ Uq = U * 255 (hence 0 <= Uq <= 255)
+ Vq = V * 255 (hence 0 <= Vq <= 255)
+ Thus the results are represented by one tensor:
+ - labels_uv_uint8 (tensor [3, H, W] of uint8): contains estimated label
+ and quantized coordinates Uq and Vq for each pixel of the detection
+ bounding box of size (H, W)
+ """
+
+ labels_uv_uint8: torch.Tensor
+
+ def to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device
+ """
+ labels_uv_uint8 = self.labels_uv_uint8.to(device)
+ return DensePoseChartResultQuantized(labels_uv_uint8=labels_uv_uint8)
+
+
+@dataclass
+class DensePoseChartResultCompressed:
+ """
+ DensePose results for chart-based methods represented by a PNG-encoded string.
+ The tensor of quantized DensePose results of size [3, H, W] is considered
+ as an image with 3 color channels. PNG compression is applied and the result
+ is stored as a Base64-encoded string. The following attributes are defined:
+ - shape_chw (tuple of 3 int): contains shape of the result tensor
+ (number of channels, height, width)
+ - labels_uv_str (str): contains Base64-encoded results tensor of size
+ [3, H, W] compressed with PNG compression methods
+ """
+
+ shape_chw: Tuple[int, int, int]
+ labels_uv_str: str
+
+
+def quantize_densepose_chart_result(result: DensePoseChartResult) -> DensePoseChartResultQuantized:
+ """
+ Applies quantization to DensePose chart-based result.
+
+ Args:
+ result (DensePoseChartResult): DensePose chart-based result
+ Return:
+ Quantized DensePose chart-based result (DensePoseChartResultQuantized)
+ """
+ h, w = result.labels.shape
+ labels_uv_uint8 = torch.zeros([3, h, w], dtype=torch.uint8, device=result.labels.device)
+ labels_uv_uint8[0] = result.labels
+ labels_uv_uint8[1:] = (result.uv * 255).clamp(0, 255).byte()
+ return DensePoseChartResultQuantized(labels_uv_uint8=labels_uv_uint8)
+
+
+def compress_quantized_densepose_chart_result(
+ result: DensePoseChartResultQuantized,
+) -> DensePoseChartResultCompressed:
+ """
+ Compresses quantized DensePose chart-based result
+
+ Args:
+ result (DensePoseChartResultQuantized): quantized DensePose chart-based result
+ Return:
+ Compressed DensePose chart-based result (DensePoseChartResultCompressed)
+ """
+ import base64
+ import numpy as np
+ from io import BytesIO
+ from PIL import Image
+
+ labels_uv_uint8_np_chw = result.labels_uv_uint8.cpu().numpy()
+ labels_uv_uint8_np_hwc = np.moveaxis(labels_uv_uint8_np_chw, 0, -1)
+ im = Image.fromarray(labels_uv_uint8_np_hwc)
+ fstream = BytesIO()
+ im.save(fstream, format="png", optimize=True)
+ labels_uv_str = base64.encodebytes(fstream.getvalue()).decode()
+ shape_chw = labels_uv_uint8_np_chw.shape
+ return DensePoseChartResultCompressed(labels_uv_str=labels_uv_str, shape_chw=shape_chw)
+
+
+def decompress_compressed_densepose_chart_result(
+ result: DensePoseChartResultCompressed,
+) -> DensePoseChartResultQuantized:
+ """
+ Decompresses DensePose chart-based result encoded into a base64 string
+
+ Args:
+ result (DensePoseChartResultCompressed): compressed DensePose chart result
+ Return:
+ Quantized DensePose chart-based result (DensePoseChartResultQuantized)
+ """
+ import base64
+ import numpy as np
+ from io import BytesIO
+ from PIL import Image
+
+ fstream = BytesIO(base64.decodebytes(result.labels_uv_str.encode()))
+ im = Image.open(fstream)
+ labels_uv_uint8_np_chw = np.moveaxis(np.array(im, dtype=np.uint8), -1, 0)
+ return DensePoseChartResultQuantized(
+ labels_uv_uint8=torch.from_numpy(labels_uv_uint8_np_chw.reshape(result.shape_chw))
+ )
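
The three helpers above form a lossless round trip once a result has been quantized to `uint8`; a sketch with a fake result (sizes are arbitrary):

```python
import torch
from densepose.structures import (
    DensePoseChartResult,
    quantize_densepose_chart_result,
    compress_quantized_densepose_chart_result,
    decompress_compressed_densepose_chart_result,
)

H, W = 64, 48
result = DensePoseChartResult(
    labels=torch.randint(0, 25, (H, W), dtype=torch.long),
    uv=torch.rand(2, H, W),
)
quantized = quantize_densepose_chart_result(result)                 # uint8 [3, H, W]
compressed = compress_quantized_densepose_chart_result(quantized)   # base64-encoded PNG
restored = decompress_compressed_densepose_chart_result(compressed)
# PNG compression is lossless, so the quantized tensor is recovered exactly.
assert torch.equal(restored.labels_uv_uint8, quantized.labels_uv_uint8)
```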
diff --git a/densepose/structures/cse.py b/densepose/structures/cse.py
new file mode 100644
index 0000000000000000000000000000000000000000..381f1384a8d4d42f81cda8ff1558002149bdea74
--- /dev/null
+++ b/densepose/structures/cse.py
@@ -0,0 +1,54 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import Union
+import torch
+
+
+@dataclass
+class DensePoseEmbeddingPredictorOutput:
+ """
+ Predictor output that contains embedding and coarse segmentation data:
+ * embedding: float tensor of size [N, D, H, W], contains estimated embeddings
+ * coarse_segm: float tensor of size [N, K, H, W]
+ Here D = MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE
+ K = MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
+ """
+
+ embedding: torch.Tensor
+ coarse_segm: torch.Tensor
+
+ def __len__(self):
+ """
+ Number of instances (N) in the output
+ """
+ return self.coarse_segm.size(0)
+
+ def __getitem__(
+ self, item: Union[int, slice, torch.BoolTensor]
+ ) -> "DensePoseEmbeddingPredictorOutput":
+ """
+ Get outputs for the selected instance(s)
+
+ Args:
+ item (int or slice or tensor): selected items
+ """
+ if isinstance(item, int):
+ return DensePoseEmbeddingPredictorOutput(
+ coarse_segm=self.coarse_segm[item].unsqueeze(0),
+ embedding=self.embedding[item].unsqueeze(0),
+ )
+ else:
+ return DensePoseEmbeddingPredictorOutput(
+ coarse_segm=self.coarse_segm[item], embedding=self.embedding[item]
+ )
+
+ def to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device
+ """
+ coarse_segm = self.coarse_segm.to(device)
+ embedding = self.embedding.to(device)
+ return DensePoseEmbeddingPredictorOutput(coarse_segm=coarse_segm, embedding=embedding)
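
Usage mirrors the chart-based predictor output; a minimal sketch with illustrative sizes:

```python
import torch
from densepose.structures import DensePoseEmbeddingPredictorOutput

N, D, K, H, W = 3, 16, 2, 56, 56   # D = embedding size, K = coarse segm channels
cse_out = DensePoseEmbeddingPredictorOutput(
    embedding=torch.rand(N, D, H, W),
    coarse_segm=torch.rand(N, K, H, W),
)
assert len(cse_out) == N
assert len(cse_out[1:]) == N - 1   # slicing returns another predictor output
```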
diff --git a/densepose/structures/cse_confidence.py b/densepose/structures/cse_confidence.py
new file mode 100644
index 0000000000000000000000000000000000000000..251a7e823e38931fb1b86b017417538af5350944
--- /dev/null
+++ b/densepose/structures/cse_confidence.py
@@ -0,0 +1,80 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+
+from dataclasses import make_dataclass
+from functools import lru_cache
+from typing import Any, Optional
+import torch
+
+
+@lru_cache(maxsize=None)
+def decorate_cse_predictor_output_class_with_confidences(BasePredictorOutput: type) -> type:
+ """
+ Create a new output class from an existing one by adding new attributes
+ related to confidence estimation:
+ - coarse_segm_confidence (tensor)
+
+ Details on confidence estimation parameters can be found in:
+ N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning
+ Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019
+ A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
+
+    The new class inherits from the provided `BasePredictorOutput` class;
+    its name is composed of the name of the provided class and the
+    "WithConfidences" suffix.
+
+ Args:
+ BasePredictorOutput (type): output type to which confidence data
+ is to be added, assumed to be a dataclass
+ Return:
+ New dataclass derived from the provided one that has attributes
+ for confidence estimation
+ """
+
+ PredictorOutput = make_dataclass(
+ BasePredictorOutput.__name__ + "WithConfidences",
+ fields=[
+ ("coarse_segm_confidence", Optional[torch.Tensor], None),
+ ],
+ bases=(BasePredictorOutput,),
+ )
+
+ # add possibility to index PredictorOutput
+
+ def slice_if_not_none(data, item):
+ if data is None:
+ return None
+ if isinstance(item, int):
+ return data[item].unsqueeze(0)
+ return data[item]
+
+ def PredictorOutput_getitem(self, item):
+ PredictorOutput = type(self)
+ base_predictor_output_sliced = super(PredictorOutput, self).__getitem__(item)
+ return PredictorOutput(
+ **base_predictor_output_sliced.__dict__,
+ coarse_segm_confidence=slice_if_not_none(self.coarse_segm_confidence, item),
+ )
+
+ PredictorOutput.__getitem__ = PredictorOutput_getitem
+
+ def PredictorOutput_to(self, device: torch.device):
+ """
+ Transfers all tensors to the given device
+ """
+ PredictorOutput = type(self)
+ base_predictor_output_to = super(PredictorOutput, self).to(device) # pyre-ignore[16]
+
+ def to_device_if_tensor(var: Any):
+ if isinstance(var, torch.Tensor):
+ return var.to(device)
+ return var
+
+ return PredictorOutput(
+ **base_predictor_output_to.__dict__,
+ coarse_segm_confidence=to_device_if_tensor(self.coarse_segm_confidence),
+ )
+
+ PredictorOutput.to = PredictorOutput_to
+ return PredictorOutput
diff --git a/densepose/structures/data_relative.py b/densepose/structures/data_relative.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcf27ef9bb69f5d9f74d6499e55408e8d4ec5803
--- /dev/null
+++ b/densepose/structures/data_relative.py
@@ -0,0 +1,245 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+from densepose.data.meshes.catalog import MeshCatalog
+from densepose.structures.mesh import load_mesh_symmetry
+from densepose.structures.transform_data import DensePoseTransformData
+
+
+class DensePoseDataRelative:
+ """
+ Dense pose relative annotations that can be applied to any bounding box:
+ x - normalized X coordinates [0, 255] of annotated points
+ y - normalized Y coordinates [0, 255] of annotated points
+ i - body part labels 0,...,24 for annotated points
+ u - body part U coordinates [0, 1] for annotated points
+ v - body part V coordinates [0, 1] for annotated points
+ segm - 256x256 segmentation mask with values 0,...,14
+ To obtain absolute x and y data wrt some bounding box one needs to first
+ divide the data by 256, multiply by the respective bounding box size
+ and add bounding box offset:
+ x_img = x0 + x_norm * w / 256.0
+ y_img = y0 + y_norm * h / 256.0
+ Segmentation masks are typically sampled to get image-based masks.
+ """
+
+ # Key for normalized X coordinates in annotation dict
+ X_KEY = "dp_x"
+ # Key for normalized Y coordinates in annotation dict
+ Y_KEY = "dp_y"
+ # Key for U part coordinates in annotation dict (used in chart-based annotations)
+ U_KEY = "dp_U"
+ # Key for V part coordinates in annotation dict (used in chart-based annotations)
+ V_KEY = "dp_V"
+ # Key for I point labels in annotation dict (used in chart-based annotations)
+ I_KEY = "dp_I"
+ # Key for segmentation mask in annotation dict
+ S_KEY = "dp_masks"
+ # Key for vertex ids (used in continuous surface embeddings annotations)
+ VERTEX_IDS_KEY = "dp_vertex"
+ # Key for mesh id (used in continuous surface embeddings annotations)
+ MESH_NAME_KEY = "ref_model"
+ # Number of body parts in segmentation masks
+ N_BODY_PARTS = 14
+ # Number of parts in point labels
+ N_PART_LABELS = 24
+ MASK_SIZE = 256
+
+ def __init__(self, annotation, cleanup=False):
+ self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY])
+ self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY])
+ if (
+ DensePoseDataRelative.I_KEY in annotation
+ and DensePoseDataRelative.U_KEY in annotation
+ and DensePoseDataRelative.V_KEY in annotation
+ ):
+ self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY])
+ self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY])
+ self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY])
+ if (
+ DensePoseDataRelative.VERTEX_IDS_KEY in annotation
+ and DensePoseDataRelative.MESH_NAME_KEY in annotation
+ ):
+ self.vertex_ids = torch.as_tensor(
+ annotation[DensePoseDataRelative.VERTEX_IDS_KEY], dtype=torch.long
+ )
+ self.mesh_id = MeshCatalog.get_mesh_id(annotation[DensePoseDataRelative.MESH_NAME_KEY])
+ if DensePoseDataRelative.S_KEY in annotation:
+ self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation)
+ self.device = torch.device("cpu")
+ if cleanup:
+ DensePoseDataRelative.cleanup_annotation(annotation)
+
+ def to(self, device):
+ if self.device == device:
+ return self
+ new_data = DensePoseDataRelative.__new__(DensePoseDataRelative)
+ new_data.x = self.x.to(device)
+ new_data.y = self.y.to(device)
+ for attr in ["i", "u", "v", "vertex_ids", "segm"]:
+ if hasattr(self, attr):
+ setattr(new_data, attr, getattr(self, attr).to(device))
+ if hasattr(self, "mesh_id"):
+ new_data.mesh_id = self.mesh_id
+ new_data.device = device
+ return new_data
+
+ @staticmethod
+ def extract_segmentation_mask(annotation):
+ import pycocotools.mask as mask_utils
+
+ # TODO: annotation instance is accepted if it contains either
+ # DensePose segmentation or instance segmentation. However, here we
+ # only rely on DensePose segmentation
+ poly_specs = annotation[DensePoseDataRelative.S_KEY]
+ if isinstance(poly_specs, torch.Tensor):
+ # data is already given as mask tensors, no need to decode
+ return poly_specs
+ segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32)
+ if isinstance(poly_specs, dict):
+ if poly_specs:
+ mask = mask_utils.decode(poly_specs)
+ segm[mask > 0] = 1
+ else:
+ for i in range(len(poly_specs)):
+ poly_i = poly_specs[i]
+ if poly_i:
+ mask_i = mask_utils.decode(poly_i)
+ segm[mask_i > 0] = i + 1
+ return segm
+
+ @staticmethod
+ def validate_annotation(annotation):
+ for key in [
+ DensePoseDataRelative.X_KEY,
+ DensePoseDataRelative.Y_KEY,
+ ]:
+ if key not in annotation:
+ return False, "no {key} data in the annotation".format(key=key)
+ valid_for_iuv_setting = all(
+ key in annotation
+ for key in [
+ DensePoseDataRelative.I_KEY,
+ DensePoseDataRelative.U_KEY,
+ DensePoseDataRelative.V_KEY,
+ ]
+ )
+ valid_for_cse_setting = all(
+ key in annotation
+ for key in [
+ DensePoseDataRelative.VERTEX_IDS_KEY,
+ DensePoseDataRelative.MESH_NAME_KEY,
+ ]
+ )
+ if not valid_for_iuv_setting and not valid_for_cse_setting:
+ return (
+ False,
+ "expected either {} (IUV setting) or {} (CSE setting) annotations".format(
+ ", ".join(
+ [
+ DensePoseDataRelative.I_KEY,
+ DensePoseDataRelative.U_KEY,
+ DensePoseDataRelative.V_KEY,
+ ]
+ ),
+ ", ".join(
+ [
+ DensePoseDataRelative.VERTEX_IDS_KEY,
+ DensePoseDataRelative.MESH_NAME_KEY,
+ ]
+ ),
+ ),
+ )
+ return True, None
+
+ @staticmethod
+ def cleanup_annotation(annotation):
+ for key in [
+ DensePoseDataRelative.X_KEY,
+ DensePoseDataRelative.Y_KEY,
+ DensePoseDataRelative.I_KEY,
+ DensePoseDataRelative.U_KEY,
+ DensePoseDataRelative.V_KEY,
+ DensePoseDataRelative.S_KEY,
+ DensePoseDataRelative.VERTEX_IDS_KEY,
+ DensePoseDataRelative.MESH_NAME_KEY,
+ ]:
+ if key in annotation:
+ del annotation[key]
+
+ def apply_transform(self, transforms, densepose_transform_data):
+ self._transform_pts(transforms, densepose_transform_data)
+ if hasattr(self, "segm"):
+ self._transform_segm(transforms, densepose_transform_data)
+
+ def _transform_pts(self, transforms, dp_transform_data):
+ import detectron2.data.transforms as T
+
+ # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+ do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+ if do_hflip:
+ self.x = self.MASK_SIZE - self.x
+ if hasattr(self, "i"):
+ self._flip_iuv_semantics(dp_transform_data)
+ if hasattr(self, "vertex_ids"):
+ self._flip_vertices()
+
+ for t in transforms.transforms:
+ if isinstance(t, T.RotationTransform):
+ xy_scale = np.array((t.w, t.h)) / DensePoseDataRelative.MASK_SIZE
+ xy = t.apply_coords(np.stack((self.x, self.y), axis=1) * xy_scale)
+ self.x, self.y = torch.tensor(xy / xy_scale, dtype=self.x.dtype).T
+
+ def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None:
+ i_old = self.i.clone()
+ uv_symmetries = dp_transform_data.uv_symmetries
+ pt_label_symmetries = dp_transform_data.point_label_symmetries
+ for i in range(self.N_PART_LABELS):
+ if i + 1 in i_old:
+ annot_indices_i = i_old == i + 1
+ if pt_label_symmetries[i + 1] != i + 1:
+ self.i[annot_indices_i] = pt_label_symmetries[i + 1]
+ u_loc = (self.u[annot_indices_i] * 255).long()
+ v_loc = (self.v[annot_indices_i] * 255).long()
+ self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to(
+ device=self.u.device
+ )
+ self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to(
+ device=self.v.device
+ )
+
+ def _flip_vertices(self):
+ mesh_info = MeshCatalog[MeshCatalog.get_mesh_name(self.mesh_id)]
+ mesh_symmetry = (
+ load_mesh_symmetry(mesh_info.symmetry) if mesh_info.symmetry is not None else None
+ )
+ self.vertex_ids = mesh_symmetry["vertex_transforms"][self.vertex_ids]
+
+ def _transform_segm(self, transforms, dp_transform_data):
+ import detectron2.data.transforms as T
+
+ # NOTE: This assumes that HorizFlipTransform is the only one that does flip
+ do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1
+ if do_hflip:
+ self.segm = torch.flip(self.segm, [1])
+ self._flip_segm_semantics(dp_transform_data)
+
+ for t in transforms.transforms:
+ if isinstance(t, T.RotationTransform):
+ self._transform_segm_rotation(t)
+
+ def _flip_segm_semantics(self, dp_transform_data):
+ old_segm = self.segm.clone()
+ mask_label_symmetries = dp_transform_data.mask_label_symmetries
+ for i in range(self.N_BODY_PARTS):
+ if mask_label_symmetries[i + 1] != i + 1:
+ self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1]
+
+ def _transform_segm_rotation(self, rotation):
+ self.segm = F.interpolate(self.segm[None, None, :], (rotation.h, rotation.w)).numpy()
+ self.segm = torch.tensor(rotation.apply_segmentation(self.segm[0, 0]))[None, None, :]
+ self.segm = F.interpolate(self.segm, [DensePoseDataRelative.MASK_SIZE] * 2)[0, 0]
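
The class docstring's formula for converting the stored [0, 255] point coordinates into absolute image coordinates, worked through with a hypothetical bounding box:

```python
import numpy as np

# Hypothetical annotated points (normalized to [0, 255]) and instance box.
x_norm = np.array([0.0, 128.0, 255.0])
y_norm = np.array([64.0, 128.0, 192.0])
x0, y0, w, h = 30.0, 50.0, 200.0, 400.0   # box top-left corner and size in pixels

x_img = x0 + x_norm * w / 256.0   # -> [ 30. , 130. , 229.22]
y_img = y0 + y_norm * h / 256.0   # -> [150. , 250. , 350. ]
```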
diff --git a/densepose/structures/list.py b/densepose/structures/list.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7dde3acd42ff33c103a50bcf6eebff21a59ce53
--- /dev/null
+++ b/densepose/structures/list.py
@@ -0,0 +1,72 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import torch
+
+from densepose.structures.data_relative import DensePoseDataRelative
+
+
+class DensePoseList:
+
+ _TORCH_DEVICE_CPU = torch.device("cpu")
+
+ def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU):
+ assert len(densepose_datas) == len(
+ boxes_xyxy_abs
+        ), "Attempt to initialize DensePoseList with {} DensePose data entries " "and {} boxes".format(
+ len(densepose_datas), len(boxes_xyxy_abs)
+ )
+ self.densepose_datas = []
+ for densepose_data in densepose_datas:
+            assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, (
+                "Attempt to initialize DensePoseList with DensePose data "
+ "of type {}, expected DensePoseDataRelative".format(type(densepose_data))
+ )
+ densepose_data_ondevice = (
+ densepose_data.to(device) if densepose_data is not None else None
+ )
+ self.densepose_datas.append(densepose_data_ondevice)
+ self.boxes_xyxy_abs = boxes_xyxy_abs.to(device)
+ self.image_size_hw = image_size_hw
+ self.device = device
+
+ def to(self, device):
+ if self.device == device:
+ return self
+ return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device)
+
+ def __iter__(self):
+ return iter(self.densepose_datas)
+
+ def __len__(self):
+ return len(self.densepose_datas)
+
+ def __repr__(self):
+ s = self.__class__.__name__ + "("
+ s += "num_instances={}, ".format(len(self.densepose_datas))
+ s += "image_width={}, ".format(self.image_size_hw[1])
+ s += "image_height={})".format(self.image_size_hw[0])
+ return s
+
+ def __getitem__(self, item):
+ if isinstance(item, int):
+ densepose_data_rel = self.densepose_datas[item]
+ return densepose_data_rel
+ elif isinstance(item, slice):
+ densepose_datas_rel = self.densepose_datas[item]
+ boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+ return DensePoseList(
+ densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+ )
+ elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool):
+ densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0]
+ boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+ return DensePoseList(
+ densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+ )
+ else:
+ densepose_datas_rel = [self.densepose_datas[i] for i in item]
+ boxes_xyxy_abs = self.boxes_xyxy_abs[item]
+ return DensePoseList(
+ densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device
+ )
diff --git a/densepose/structures/mesh.py b/densepose/structures/mesh.py
new file mode 100644
index 0000000000000000000000000000000000000000..faaad9cb5650f5e6a1bef76c599d5fd370238e4c
--- /dev/null
+++ b/densepose/structures/mesh.py
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+
+import pickle
+from functools import lru_cache
+from typing import Dict, Optional, Tuple
+import torch
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.data.meshes.catalog import MeshCatalog, MeshInfo
+
+
+def _maybe_copy_to_device(
+ attribute: Optional[torch.Tensor], device: torch.device
+) -> Optional[torch.Tensor]:
+ if attribute is None:
+ return None
+ return attribute.to(device)
+
+
+class Mesh:
+ def __init__(
+ self,
+ vertices: Optional[torch.Tensor] = None,
+ faces: Optional[torch.Tensor] = None,
+ geodists: Optional[torch.Tensor] = None,
+ symmetry: Optional[Dict[str, torch.Tensor]] = None,
+ texcoords: Optional[torch.Tensor] = None,
+ mesh_info: Optional[MeshInfo] = None,
+ device: Optional[torch.device] = None,
+ ):
+ """
+ Args:
+ vertices (tensor [N, 3] of float32): vertex coordinates in 3D
+ faces (tensor [M, 3] of long): triangular face represented as 3
+ vertex indices
+ geodists (tensor [N, N] of float32): geodesic distances from
+ vertex `i` to vertex `j` (optional, default: None)
+ symmetry (dict: str -> tensor): various mesh symmetry data:
+ - "vertex_transforms": vertex mapping under horizontal flip,
+ tensor of size [N] of type long; vertex `i` is mapped to
+ vertex `tensor[i]` (optional, default: None)
+ texcoords (tensor [N, 2] of float32): texture coordinates, i.e. global
+ and normalized mesh UVs (optional, default: None)
+ mesh_info (MeshInfo type): necessary to load the attributes on-the-go,
+ can be used instead of passing all the variables one by one
+ device (torch.device): device of the Mesh. If not provided, will use
+ the device of the vertices
+ """
+ self._vertices = vertices
+ self._faces = faces
+ self._geodists = geodists
+ self._symmetry = symmetry
+ self._texcoords = texcoords
+ self.mesh_info = mesh_info
+ self.device = device
+
+ assert self._vertices is not None or self.mesh_info is not None
+
+ all_fields = [self._vertices, self._faces, self._geodists, self._texcoords]
+
+ if self.device is None:
+ for field in all_fields:
+ if field is not None:
+ self.device = field.device
+ break
+ if self.device is None and symmetry is not None:
+ for key in symmetry:
+ self.device = symmetry[key].device
+ break
+ self.device = torch.device("cpu") if self.device is None else self.device
+
+ assert all([var.device == self.device for var in all_fields if var is not None])
+ if symmetry:
+ assert all(symmetry[key].device == self.device for key in symmetry)
+        if texcoords is not None and vertices is not None:
+ assert len(vertices) == len(texcoords)
+
+ def to(self, device: torch.device):
+ device_symmetry = self._symmetry
+ if device_symmetry:
+ device_symmetry = {key: value.to(device) for key, value in device_symmetry.items()}
+ return Mesh(
+ _maybe_copy_to_device(self._vertices, device),
+ _maybe_copy_to_device(self._faces, device),
+ _maybe_copy_to_device(self._geodists, device),
+ device_symmetry,
+ _maybe_copy_to_device(self._texcoords, device),
+ self.mesh_info,
+ device,
+ )
+
+ @property
+ def vertices(self):
+ if self._vertices is None and self.mesh_info is not None:
+ self._vertices = load_mesh_data(self.mesh_info.data, "vertices", self.device)
+ return self._vertices
+
+ @property
+ def faces(self):
+ if self._faces is None and self.mesh_info is not None:
+ self._faces = load_mesh_data(self.mesh_info.data, "faces", self.device)
+ return self._faces
+
+ @property
+ def geodists(self):
+ if self._geodists is None and self.mesh_info is not None:
+ self._geodists = load_mesh_auxiliary_data(self.mesh_info.geodists, self.device)
+ return self._geodists
+
+ @property
+ def symmetry(self):
+ if self._symmetry is None and self.mesh_info is not None:
+ self._symmetry = load_mesh_symmetry(self.mesh_info.symmetry, self.device)
+ return self._symmetry
+
+ @property
+ def texcoords(self):
+ if self._texcoords is None and self.mesh_info is not None:
+ self._texcoords = load_mesh_auxiliary_data(self.mesh_info.texcoords, self.device)
+ return self._texcoords
+
+ def get_geodists(self):
+ if self.geodists is None:
+ self.geodists = self._compute_geodists()
+ return self.geodists
+
+ def _compute_geodists(self):
+ # TODO: compute using Laplace-Beltrami
+ geodists = None
+ return geodists
+
+
+def load_mesh_data(
+ mesh_fpath: str, field: str, device: Optional[torch.device] = None
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
+ with PathManager.open(mesh_fpath, "rb") as hFile:
+ # pyre-fixme[7]: Expected `Tuple[Optional[Tensor], Optional[Tensor]]` but
+ # got `Tensor`.
+ return torch.as_tensor(pickle.load(hFile)[field], dtype=torch.float).to(device)
+ return None
+
+
+def load_mesh_auxiliary_data(
+ fpath: str, device: Optional[torch.device] = None
+) -> Optional[torch.Tensor]:
+ fpath_local = PathManager.get_local_path(fpath)
+ with PathManager.open(fpath_local, "rb") as hFile:
+ return torch.as_tensor(pickle.load(hFile), dtype=torch.float).to(device)
+ return None
+
+
+@lru_cache()
+def load_mesh_symmetry(
+ symmetry_fpath: str, device: Optional[torch.device] = None
+) -> Optional[Dict[str, torch.Tensor]]:
+ with PathManager.open(symmetry_fpath, "rb") as hFile:
+ symmetry_loaded = pickle.load(hFile)
+ symmetry = {
+ "vertex_transforms": torch.as_tensor(
+ symmetry_loaded["vertex_transforms"], dtype=torch.long
+ ).to(device),
+ }
+ return symmetry
+ return None
+
+
+@lru_cache()
+def create_mesh(mesh_name: str, device: Optional[torch.device] = None) -> Mesh:
+ return Mesh(mesh_info=MeshCatalog[mesh_name], device=device)
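
`Mesh` can be built either lazily from a `MeshInfo` entry in `MeshCatalog` (as `create_mesh` does) or eagerly from in-memory tensors; a minimal in-memory sketch:

```python
import torch
from densepose.structures import Mesh

# A single-triangle mesh; no catalog entry or data files are required.
tri = Mesh(
    vertices=torch.tensor([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
    faces=torch.tensor([[0, 1, 2]], dtype=torch.long),
)
assert tri.device == torch.device("cpu")   # inferred from the vertex tensor
if torch.cuda.is_available():
    tri = tri.to(torch.device("cuda"))     # returns a new Mesh on the target device
```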
diff --git a/densepose/structures/transform_data.py b/densepose/structures/transform_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..c85ec88514205679d39808a794c00613a8c0f495
--- /dev/null
+++ b/densepose/structures/transform_data.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from typing import BinaryIO, Dict, Union
+import torch
+
+
+def normalized_coords_transform(x0, y0, w, h):
+ """
+ Coordinates transform that maps top left corner to (-1, -1) and bottom
+ right corner to (1, 1). Used for torch.grid_sample to initialize the
+ grid
+ """
+
+ def f(p):
+ return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1)
+
+ return f
+
+
+class DensePoseTransformData:
+
+ # Horizontal symmetry label transforms used for horizontal flip
+ MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14]
+ # fmt: off
+ POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23] # noqa
+ # fmt: on
+
+ def __init__(self, uv_symmetries: Dict[str, torch.Tensor], device: torch.device):
+ self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES
+ self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES
+ self.uv_symmetries = uv_symmetries
+ self.device = torch.device("cpu")
+
+ def to(self, device: torch.device, copy: bool = False) -> "DensePoseTransformData":
+ """
+ Convert transform data to the specified device
+
+ Args:
+ device (torch.device): device to convert the data to
+ copy (bool): flag that specifies whether to copy or to reference the data
+ in case the device is the same
+ Return:
+ An instance of `DensePoseTransformData` with data stored on the specified device
+ """
+ if self.device == device and not copy:
+ return self
+ uv_symmetry_map = {}
+ for key in self.uv_symmetries:
+ uv_symmetry_map[key] = self.uv_symmetries[key].to(device=device, copy=copy)
+ return DensePoseTransformData(uv_symmetry_map, device)
+
+ @staticmethod
+ def load(io: Union[str, BinaryIO]):
+ """
+ Args:
+ io: (str or binary file-like object): input file to load data from
+ Returns:
+ An instance of `DensePoseTransformData` with transforms loaded from the file
+ """
+ import scipy.io
+
+ uv_symmetry_map = scipy.io.loadmat(io)
+ uv_symmetry_map_torch = {}
+ for key in ["U_transforms", "V_transforms"]:
+ uv_symmetry_map_torch[key] = []
+ map_src = uv_symmetry_map[key]
+ map_dst = uv_symmetry_map_torch[key]
+ for i in range(map_src.shape[1]):
+ map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float))
+ uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0)
+ transform_data = DensePoseTransformData(uv_symmetry_map_torch, device=torch.device("cpu"))
+ return transform_data
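
`normalized_coords_transform` maps the box corners onto the [-1, 1] grid expected by `torch.nn.functional.grid_sample`; a quick check with an arbitrary box:

```python
from densepose.structures import normalized_coords_transform

f = normalized_coords_transform(x0=10, y0=20, w=100, h=50)
assert f((10, 20)) == (-1.0, -1.0)    # top-left corner
assert f((110, 70)) == (1.0, 1.0)     # bottom-right corner
assert f((60, 45)) == (0.0, 0.0)      # box center
```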
diff --git a/densepose/utils/__init__.py b/densepose/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/densepose/utils/__pycache__/__init__.cpython-39.pyc b/densepose/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45529431ff8dcc517cf540085bd2918a4b48bc20
Binary files /dev/null and b/densepose/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/utils/__pycache__/transform.cpython-39.pyc b/densepose/utils/__pycache__/transform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..979e5ed0aa7dd396155869a64262d1daa6db9f69
Binary files /dev/null and b/densepose/utils/__pycache__/transform.cpython-39.pyc differ
diff --git a/densepose/utils/dbhelper.py b/densepose/utils/dbhelper.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba380303a06f42674aa59f03690504f825b56ed7
--- /dev/null
+++ b/densepose/utils/dbhelper.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from typing import Any, Dict, Optional, Tuple
+
+
+class EntrySelector:
+ """
+ Base class for entry selectors
+ """
+
+ @staticmethod
+ def from_string(spec: str) -> "EntrySelector":
+ if spec == "*":
+ return AllEntrySelector()
+ return FieldEntrySelector(spec)
+
+
+class AllEntrySelector(EntrySelector):
+ """
+ Selector that accepts all entries
+ """
+
+ SPECIFIER = "*"
+
+ def __call__(self, entry):
+ return True
+
+
+class FieldEntrySelector(EntrySelector):
+ """
+ Selector that accepts only entries that match provided field
+ specifier(s). Only a limited set of specifiers is supported for now:
+      <specifiers> ::= <specifier>[<comma><specifiers>]
+      <specifier> ::= <field_name>[<type_delim><type>]<equal><value_or_range>
+      <field_name> is a valid identifier
+      <type> ::= "int" | "str"
+      <equal> ::= "="
+      <comma> ::= ","
+      <type_delim> ::= ":"
+      <value_or_range> ::= <value> | <range>
+      <range> ::= <value><range_delim><value>
+      <range_delim> ::= "-"
+      <value> is a string without spaces and special symbols
+        (e.g. <comma>, <equal>, <type_delim>, <range_delim>)
+ """
+
+ _SPEC_DELIM = ","
+ _TYPE_DELIM = ":"
+ _RANGE_DELIM = "-"
+ _EQUAL = "="
+ _ERROR_PREFIX = "Invalid field selector specifier"
+
+ class _FieldEntryValuePredicate:
+ """
+ Predicate that checks strict equality for the specified entry field
+ """
+
+ def __init__(self, name: str, typespec: Optional[str], value: str):
+ import builtins
+
+ self.name = name
+ self.type = getattr(builtins, typespec) if typespec is not None else str
+ self.value = value
+
+ def __call__(self, entry):
+ return entry[self.name] == self.type(self.value)
+
+ class _FieldEntryRangePredicate:
+ """
+ Predicate that checks whether an entry field falls into the specified range
+ """
+
+ def __init__(self, name: str, typespec: Optional[str], vmin: str, vmax: str):
+ import builtins
+
+ self.name = name
+ self.type = getattr(builtins, typespec) if typespec is not None else str
+ self.vmin = vmin
+ self.vmax = vmax
+
+ def __call__(self, entry):
+ return (entry[self.name] >= self.type(self.vmin)) and (
+ entry[self.name] <= self.type(self.vmax)
+ )
+
+ def __init__(self, spec: str):
+ self._predicates = self._parse_specifier_into_predicates(spec)
+
+ def __call__(self, entry: Dict[str, Any]):
+ for predicate in self._predicates:
+ if not predicate(entry):
+ return False
+ return True
+
+ def _parse_specifier_into_predicates(self, spec: str):
+ predicates = []
+ specs = spec.split(self._SPEC_DELIM)
+ for subspec in specs:
+ eq_idx = subspec.find(self._EQUAL)
+ if eq_idx > 0:
+ field_name_with_type = subspec[:eq_idx]
+ field_name, field_type = self._parse_field_name_type(field_name_with_type)
+ field_value_or_range = subspec[eq_idx + 1 :]
+ if self._is_range_spec(field_value_or_range):
+ vmin, vmax = self._get_range_spec(field_value_or_range)
+ predicate = FieldEntrySelector._FieldEntryRangePredicate(
+ field_name, field_type, vmin, vmax
+ )
+ else:
+ predicate = FieldEntrySelector._FieldEntryValuePredicate(
+ field_name, field_type, field_value_or_range
+ )
+ predicates.append(predicate)
+ elif eq_idx == 0:
+ self._parse_error(f'"{subspec}", field name is empty!')
+ else:
+                self._parse_error(f'"{subspec}", should have format ' "<field>=<value_or_range>!")
+ return predicates
+
+ def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]:
+ type_delim_idx = field_name_with_type.find(self._TYPE_DELIM)
+ if type_delim_idx > 0:
+ field_name = field_name_with_type[:type_delim_idx]
+ field_type = field_name_with_type[type_delim_idx + 1 :]
+ elif type_delim_idx == 0:
+ self._parse_error(f'"{field_name_with_type}", field name is empty!')
+ else:
+ field_name = field_name_with_type
+ field_type = None
+ # pyre-fixme[61]: `field_name` may not be initialized here.
+ # pyre-fixme[61]: `field_type` may not be initialized here.
+ return field_name, field_type
+
+ def _is_range_spec(self, field_value_or_range):
+ delim_idx = field_value_or_range.find(self._RANGE_DELIM)
+ return delim_idx > 0
+
+ def _get_range_spec(self, field_value_or_range):
+ if self._is_range_spec(field_value_or_range):
+ delim_idx = field_value_or_range.find(self._RANGE_DELIM)
+ vmin = field_value_or_range[:delim_idx]
+ vmax = field_value_or_range[delim_idx + 1 :]
+ return vmin, vmax
+ else:
+            self._parse_error(f'"{field_value_or_range}", range of values expected!')
+
+ def _parse_error(self, msg):
+ raise ValueError(f"{self._ERROR_PREFIX}: {msg}")
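
A sketch of the specifier grammar in practice (field names and values below are made up): an equality test on a string field combined with an integer range test.

```python
from densepose.utils.dbhelper import EntrySelector

selector = EntrySelector.from_string("dataset=coco,score:int=3-7")
assert selector({"dataset": "coco", "score": 5})        # both predicates hold
assert not selector({"dataset": "coco", "score": 9})    # 9 is outside 3-7
assert not selector({"dataset": "lvis", "score": 5})    # wrong dataset value

assert EntrySelector.from_string("*")({"anything": 0})  # "*" accepts every entry
```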
diff --git a/densepose/utils/logger.py b/densepose/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aad2c0895afff0514c59b10cc80d01e47d50918
--- /dev/null
+++ b/densepose/utils/logger.py
@@ -0,0 +1,15 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import logging
+
+
+def verbosity_to_level(verbosity) -> int:
+ if verbosity is not None:
+ if verbosity == 0:
+ return logging.WARNING
+ elif verbosity == 1:
+ return logging.INFO
+ elif verbosity >= 2:
+ return logging.DEBUG
+ return logging.WARNING
diff --git a/densepose/utils/transform.py b/densepose/utils/transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f8a8ba038588bf8c014390f8b8feadfcdc40307
--- /dev/null
+++ b/densepose/utils/transform.py
@@ -0,0 +1,17 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from detectron2.data import MetadataCatalog
+from detectron2.utils.file_io import PathManager
+
+from densepose import DensePoseTransformData
+
+
+def load_for_dataset(dataset_name):
+ path = MetadataCatalog.get(dataset_name).densepose_transform_src
+ densepose_transform_data_fpath = PathManager.get_local_path(path)
+ return DensePoseTransformData.load(densepose_transform_data_fpath)
+
+
+def load_from_cfg(cfg):
+ return load_for_dataset(cfg.DATASETS.TEST[0])
diff --git a/densepose/vis/__init__.py b/densepose/vis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/densepose/vis/__pycache__/__init__.cpython-39.pyc b/densepose/vis/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e293aa2ffa8b0ce5cf6b69d26a05c881c7deebea
Binary files /dev/null and b/densepose/vis/__pycache__/__init__.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/base.cpython-39.pyc b/densepose/vis/__pycache__/base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c565bdf280e17a433fc48caec6d3378f6e17251
Binary files /dev/null and b/densepose/vis/__pycache__/base.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/bounding_box.cpython-39.pyc b/densepose/vis/__pycache__/bounding_box.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07c03156bc94a24482afefb71c54c580536020ce
Binary files /dev/null and b/densepose/vis/__pycache__/bounding_box.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-39.pyc b/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11feb65e9310f0ee96445fb31fd31de9e129b8d5
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_outputs_vertex.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_results.cpython-39.pyc b/densepose/vis/__pycache__/densepose_results.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af6e1f537ba749d0897ee69e4036ff43ff40cb26
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_results.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/densepose_results_textures.cpython-39.pyc b/densepose/vis/__pycache__/densepose_results_textures.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..336c6dc9692946ffcd3ddaa47cfab22f3a893bc3
Binary files /dev/null and b/densepose/vis/__pycache__/densepose_results_textures.cpython-39.pyc differ
diff --git a/densepose/vis/__pycache__/extractor.cpython-39.pyc b/densepose/vis/__pycache__/extractor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..940ab2d0ade32e7013675a6c02fecab6a9f2836d
Binary files /dev/null and b/densepose/vis/__pycache__/extractor.cpython-39.pyc differ
diff --git a/densepose/vis/base.py b/densepose/vis/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a7b07000c41f49386de5d7752c0d277b9da1979
--- /dev/null
+++ b/densepose/vis/base.py
@@ -0,0 +1,193 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import logging
+import numpy as np
+import cv2
+import torch
+
+Image = np.ndarray
+Boxes = torch.Tensor
+
+
+class MatrixVisualizer:
+ """
+ Base visualizer for matrix data
+ """
+
+ def __init__(
+ self,
+ inplace=True,
+ cmap=cv2.COLORMAP_PARULA,
+ val_scale=1.0,
+ alpha=0.7,
+ interp_method_matrix=cv2.INTER_LINEAR,
+ interp_method_mask=cv2.INTER_NEAREST,
+ ):
+ self.inplace = inplace
+ self.cmap = cmap
+ self.val_scale = val_scale
+ self.alpha = alpha
+ self.interp_method_matrix = interp_method_matrix
+ self.interp_method_mask = interp_method_mask
+
+ def visualize(self, image_bgr, mask, matrix, bbox_xywh):
+ self._check_image(image_bgr)
+ self._check_mask_matrix(mask, matrix)
+ if self.inplace:
+ image_target_bgr = image_bgr
+ else:
+ image_target_bgr = image_bgr * 0
+ x, y, w, h = [int(v) for v in bbox_xywh]
+ if w <= 0 or h <= 0:
+ return image_bgr
+ mask, matrix = self._resize(mask, matrix, w, h)
+ mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3])
+ matrix_scaled = matrix.astype(np.float32) * self.val_scale
+ _EPSILON = 1e-6
+ if np.any(matrix_scaled > 255 + _EPSILON):
+ logger = logging.getLogger(__name__)
+ logger.warning(
+ f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]"
+ )
+ matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8)
+ matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap)
+ matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg]
+ image_target_bgr[y : y + h, x : x + w, :] = (
+ image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha
+ )
+ return image_target_bgr.astype(np.uint8)
+
+ def _resize(self, mask, matrix, w, h):
+ if (w != mask.shape[1]) or (h != mask.shape[0]):
+            mask = cv2.resize(mask, (w, h), interpolation=self.interp_method_mask)
+ if (w != matrix.shape[1]) or (h != matrix.shape[0]):
+            matrix = cv2.resize(matrix, (w, h), interpolation=self.interp_method_matrix)
+ return mask, matrix
+
+ def _check_image(self, image_rgb):
+ assert len(image_rgb.shape) == 3
+ assert image_rgb.shape[2] == 3
+ assert image_rgb.dtype == np.uint8
+
+ def _check_mask_matrix(self, mask, matrix):
+ assert len(matrix.shape) == 2
+ assert len(mask.shape) == 2
+ assert mask.dtype == np.uint8
+
+
+class RectangleVisualizer:
+
+ _COLOR_GREEN = (18, 127, 15)
+
+ def __init__(self, color=_COLOR_GREEN, thickness=1):
+ self.color = color
+ self.thickness = thickness
+
+ def visualize(self, image_bgr, bbox_xywh, color=None, thickness=None):
+ x, y, w, h = bbox_xywh
+ color = color or self.color
+ thickness = thickness or self.thickness
+ cv2.rectangle(image_bgr, (int(x), int(y)), (int(x + w), int(y + h)), color, thickness)
+ return image_bgr
+
+
+class PointsVisualizer:
+
+ _COLOR_GREEN = (18, 127, 15)
+
+ def __init__(self, color_bgr=_COLOR_GREEN, r=5):
+ self.color_bgr = color_bgr
+ self.r = r
+
+ def visualize(self, image_bgr, pts_xy, colors_bgr=None, rs=None):
+ for j, pt_xy in enumerate(pts_xy):
+ x, y = pt_xy
+ color_bgr = colors_bgr[j] if colors_bgr is not None else self.color_bgr
+ r = rs[j] if rs is not None else self.r
+ cv2.circle(image_bgr, (x, y), r, color_bgr, -1)
+ return image_bgr
+
+
+class TextVisualizer:
+
+ _COLOR_GRAY = (218, 227, 218)
+ _COLOR_WHITE = (255, 255, 255)
+
+ def __init__(
+ self,
+ font_face=cv2.FONT_HERSHEY_SIMPLEX,
+ font_color_bgr=_COLOR_GRAY,
+ font_scale=0.35,
+ font_line_type=cv2.LINE_AA,
+ font_line_thickness=1,
+ fill_color_bgr=_COLOR_WHITE,
+ fill_color_transparency=1.0,
+ frame_color_bgr=_COLOR_WHITE,
+ frame_color_transparency=1.0,
+ frame_thickness=1,
+ ):
+ self.font_face = font_face
+ self.font_color_bgr = font_color_bgr
+ self.font_scale = font_scale
+ self.font_line_type = font_line_type
+ self.font_line_thickness = font_line_thickness
+ self.fill_color_bgr = fill_color_bgr
+ self.fill_color_transparency = fill_color_transparency
+ self.frame_color_bgr = frame_color_bgr
+ self.frame_color_transparency = frame_color_transparency
+ self.frame_thickness = frame_thickness
+
+ def visualize(self, image_bgr, txt, topleft_xy):
+ txt_w, txt_h = self.get_text_size_wh(txt)
+ topleft_xy = tuple(map(int, topleft_xy))
+ x, y = topleft_xy
+ if self.frame_color_transparency < 1.0:
+ t = self.frame_thickness
+ image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] = (
+ image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :]
+ * self.frame_color_transparency
+ + np.array(self.frame_color_bgr) * (1.0 - self.frame_color_transparency)
+ ).astype(float)
+ if self.fill_color_transparency < 1.0:
+ image_bgr[y : y + txt_h, x : x + txt_w, :] = (
+ image_bgr[y : y + txt_h, x : x + txt_w, :] * self.fill_color_transparency
+ + np.array(self.fill_color_bgr) * (1.0 - self.fill_color_transparency)
+ ).astype(float)
+ cv2.putText(
+ image_bgr,
+ txt,
+ topleft_xy,
+ self.font_face,
+ self.font_scale,
+ self.font_color_bgr,
+ self.font_line_thickness,
+ self.font_line_type,
+ )
+ return image_bgr
+
+ def get_text_size_wh(self, txt):
+ ((txt_w, txt_h), _) = cv2.getTextSize(
+ txt, self.font_face, self.font_scale, self.font_line_thickness
+ )
+ return txt_w, txt_h
+
+
+class CompoundVisualizer:
+ def __init__(self, visualizers):
+ self.visualizers = visualizers
+
+ def visualize(self, image_bgr, data):
+ assert len(data) == len(
+ self.visualizers
+        ), "The number of data items {} should match the number of visualizers" " {}".format(
+ len(data), len(self.visualizers)
+ )
+ image = image_bgr
+ for i, visualizer in enumerate(self.visualizers):
+ image = visualizer.visualize(image, data[i])
+ return image
+
+ def __str__(self):
+ visualizer_str = ", ".join([str(v) for v in self.visualizers])
+ return "Compound Visualizer [{}]".format(visualizer_str)
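
A sketch of `MatrixVisualizer` overlaying a fake part-index map onto a blank BGR frame (image size, bbox, and colormap scaling chosen purely for illustration):

```python
import numpy as np
from densepose.vis.base import MatrixVisualizer

image_bgr = np.zeros((240, 320, 3), dtype=np.uint8)
matrix = np.random.randint(0, 15, (64, 64), dtype=np.uint8)   # 14 parts + background
mask = (matrix > 0).astype(np.uint8)                          # foreground mask

vis = MatrixVisualizer(val_scale=255.0 / 14)   # stretch part indices to [0, 255]
image_vis = vis.visualize(image_bgr, mask, matrix, bbox_xywh=(100, 80, 64, 64))
assert image_vis.shape == image_bgr.shape and image_vis.dtype == np.uint8
```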
diff --git a/densepose/vis/bounding_box.py b/densepose/vis/bounding_box.py
new file mode 100644
index 0000000000000000000000000000000000000000..a88ba0ce74b8da539ea3a25c703a9795be8163a6
--- /dev/null
+++ b/densepose/vis/bounding_box.py
@@ -0,0 +1,39 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+from .base import RectangleVisualizer, TextVisualizer
+
+
+class BoundingBoxVisualizer:
+ def __init__(self):
+ self.rectangle_visualizer = RectangleVisualizer()
+
+ def visualize(self, image_bgr, boxes_xywh):
+ for bbox_xywh in boxes_xywh:
+ image_bgr = self.rectangle_visualizer.visualize(image_bgr, bbox_xywh)
+ return image_bgr
+
+
+class ScoredBoundingBoxVisualizer:
+ def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None, **kwargs):
+ if bbox_visualizer_params is None:
+ bbox_visualizer_params = {}
+ if score_visualizer_params is None:
+ score_visualizer_params = {}
+ self.visualizer_bbox = RectangleVisualizer(**bbox_visualizer_params)
+ self.visualizer_score = TextVisualizer(**score_visualizer_params)
+
+ def visualize(self, image_bgr, scored_bboxes):
+ boxes_xywh, box_scores = scored_bboxes
+ assert len(boxes_xywh) == len(
+ box_scores
+ ), "Number of bounding boxes {} should be equal to the number of scores {}".format(
+ len(boxes_xywh), len(box_scores)
+ )
+ for i, box_xywh in enumerate(boxes_xywh):
+ score_i = box_scores[i]
+ image_bgr = self.visualizer_bbox.visualize(image_bgr, box_xywh)
+ score_txt = "{0:6.4f}".format(score_i)
+ topleft_xy = box_xywh[0], box_xywh[1]
+ image_bgr = self.visualizer_score.visualize(image_bgr, score_txt, topleft_xy)
+ return image_bgr
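
`ScoredBoundingBoxVisualizer` expects a `(boxes_xywh, scores)` pair; a minimal sketch on a blank frame with made-up detections:

```python
import numpy as np
from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer

frame = np.zeros((200, 200, 3), dtype=np.uint8)
boxes_xywh = [(10, 30, 50, 80), (90, 40, 60, 60)]   # hypothetical detections
scores = [0.91, 0.47]
frame = ScoredBoundingBoxVisualizer().visualize(frame, (boxes_xywh, scores))
```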
diff --git a/densepose/vis/densepose_data_points.py b/densepose/vis/densepose_data_points.py
new file mode 100644
index 0000000000000000000000000000000000000000..de809f64ee09a50291999774d91443e3edd869ea
--- /dev/null
+++ b/densepose/vis/densepose_data_points.py
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import numpy as np
+from typing import Iterable, Optional, Tuple
+import cv2
+
+from densepose.structures import DensePoseDataRelative
+
+from .base import Boxes, Image, MatrixVisualizer, PointsVisualizer
+
+
+class DensePoseDataCoarseSegmentationVisualizer:
+ """
+ Visualizer for ground truth segmentation
+ """
+
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ self.mask_visualizer = MatrixVisualizer(
+ inplace=inplace,
+ cmap=cmap,
+ val_scale=255.0 / DensePoseDataRelative.N_BODY_PARTS,
+ alpha=alpha,
+ )
+
+ def visualize(
+ self,
+ image_bgr: Image,
+ bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+ ) -> Image:
+ if bbox_densepose_datas is None:
+ return image_bgr
+ for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+ matrix = densepose_data.segm.numpy()
+ mask = np.zeros(matrix.shape, dtype=np.uint8)
+ mask[matrix > 0] = 1
+ image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh.numpy())
+ return image_bgr
+
+
+class DensePoseDataPointsVisualizer:
+ def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA, **kwargs):
+ self.points_visualizer = PointsVisualizer()
+ self.densepose_data_to_value_fn = densepose_data_to_value_fn
+ self.cmap = cmap
+
+ def visualize(
+ self,
+ image_bgr: Image,
+ bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]],
+ ) -> Image:
+ if bbox_densepose_datas is None:
+ return image_bgr
+ for bbox_xywh, densepose_data in zip(*bbox_densepose_datas):
+ x0, y0, w, h = bbox_xywh.numpy()
+ x = densepose_data.x.numpy() * w / 255.0 + x0
+ y = densepose_data.y.numpy() * h / 255.0 + y0
+ pts_xy = zip(x, y)
+ if self.densepose_data_to_value_fn is None:
+ image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy)
+ else:
+ v = self.densepose_data_to_value_fn(densepose_data)
+ img_colors_bgr = cv2.applyColorMap(v, self.cmap)
+ colors_bgr = [
+ [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+ ]
+ image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr)
+ return image_bgr
+
+
+def _densepose_data_u_for_cmap(densepose_data):
+ u = np.clip(densepose_data.u.numpy(), 0, 1) * 255.0
+ return u.astype(np.uint8)
+
+
+def _densepose_data_v_for_cmap(densepose_data):
+ v = np.clip(densepose_data.v.numpy(), 0, 1) * 255.0
+ return v.astype(np.uint8)
+
+
+def _densepose_data_i_for_cmap(densepose_data):
+ i = (
+ np.clip(densepose_data.i.numpy(), 0.0, DensePoseDataRelative.N_PART_LABELS)
+ * 255.0
+ / DensePoseDataRelative.N_PART_LABELS
+ )
+ return i.astype(np.uint8)
+
+
+class DensePoseDataPointsUVisualizer(DensePoseDataPointsVisualizer):
+ def __init__(self, **kwargs):
+ super(DensePoseDataPointsUVisualizer, self).__init__(
+ densepose_data_to_value_fn=_densepose_data_u_for_cmap, **kwargs
+ )
+
+
+class DensePoseDataPointsVVisualizer(DensePoseDataPointsVisualizer):
+ def __init__(self, **kwargs):
+ super(DensePoseDataPointsVVisualizer, self).__init__(
+ densepose_data_to_value_fn=_densepose_data_v_for_cmap, **kwargs
+ )
+
+
+class DensePoseDataPointsIVisualizer(DensePoseDataPointsVisualizer):
+ def __init__(self, **kwargs):
+ super(DensePoseDataPointsIVisualizer, self).__init__(
+ densepose_data_to_value_fn=_densepose_data_i_for_cmap, **kwargs
+ )
diff --git a/densepose/vis/densepose_outputs_iuv.py b/densepose/vis/densepose_outputs_iuv.py
new file mode 100644
index 0000000000000000000000000000000000000000..960ffba0d4146eda0a4dcd2220c724d944834b33
--- /dev/null
+++ b/densepose/vis/densepose_outputs_iuv.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import numpy as np
+from typing import Optional, Tuple
+import cv2
+
+from densepose.structures import DensePoseDataRelative
+
+from ..structures import DensePoseChartPredictorOutput
+from .base import Boxes, Image, MatrixVisualizer
+
+
+class DensePoseOutputsVisualizer:
+ def __init__(
+ self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, to_visualize=None, **kwargs
+ ):
+ assert to_visualize in "IUV", "can only visualize IUV"
+ self.to_visualize = to_visualize
+
+ if self.to_visualize == "I":
+ val_scale = 255.0 / DensePoseDataRelative.N_PART_LABELS
+ else:
+ val_scale = 1.0
+ self.mask_visualizer = MatrixVisualizer(
+ inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+ )
+
+ def visualize(
+ self,
+ image_bgr: Image,
+ dp_output_with_bboxes: Tuple[Optional[DensePoseChartPredictorOutput], Optional[Boxes]],
+ ) -> Image:
+ densepose_output, bboxes_xywh = dp_output_with_bboxes
+ if densepose_output is None or bboxes_xywh is None:
+ return image_bgr
+
+ assert isinstance(
+ densepose_output, DensePoseChartPredictorOutput
+ ), "DensePoseChartPredictorOutput expected, {} encountered".format(type(densepose_output))
+
+ S = densepose_output.coarse_segm
+ I = densepose_output.fine_segm # noqa
+ U = densepose_output.u
+ V = densepose_output.v
+ N = S.size(0)
+ assert N == I.size(
+ 0
+ ), "densepose outputs S {} and I {}" " should have equal first dim size".format(
+ S.size(), I.size()
+ )
+ assert N == U.size(
+ 0
+ ), "densepose outputs S {} and U {}" " should have equal first dim size".format(
+ S.size(), U.size()
+ )
+ assert N == V.size(
+ 0
+ ), "densepose outputs S {} and V {}" " should have equal first dim size".format(
+ S.size(), V.size()
+ )
+ assert N == len(
+ bboxes_xywh
+ ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+ len(bboxes_xywh), N
+ )
+ for n in range(N):
+ Sn = S[n].argmax(dim=0)
+ In = I[n].argmax(dim=0) * (Sn > 0).long()
+ segmentation = In.cpu().numpy().astype(np.uint8)
+ mask = np.zeros(segmentation.shape, dtype=np.uint8)
+ mask[segmentation > 0] = 1
+ bbox_xywh = bboxes_xywh[n]
+
+ if self.to_visualize == "I":
+ vis = segmentation
+ elif self.to_visualize in "UV":
+ U_or_Vn = {"U": U, "V": V}[self.to_visualize][n].cpu().numpy().astype(np.float32)
+ vis = np.zeros(segmentation.shape, dtype=np.float32)
+ for partId in range(U_or_Vn.shape[0]):
+ vis[segmentation == partId] = (
+ U_or_Vn[partId][segmentation == partId].clip(0, 1) * 255
+ )
+
+ # pyre-fixme[61]: `vis` may not be initialized here.
+ image_bgr = self.mask_visualizer.visualize(image_bgr, mask, vis, bbox_xywh)
+
+ return image_bgr
+
+
+class DensePoseOutputsUVisualizer(DensePoseOutputsVisualizer):
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ super().__init__(inplace=inplace, cmap=cmap, alpha=alpha, to_visualize="U", **kwargs)
+
+
+class DensePoseOutputsVVisualizer(DensePoseOutputsVisualizer):
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ super().__init__(inplace=inplace, cmap=cmap, alpha=alpha, to_visualize="V", **kwargs)
+
+
+class DensePoseOutputsFineSegmentationVisualizer(DensePoseOutputsVisualizer):
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ super().__init__(inplace=inplace, cmap=cmap, alpha=alpha, to_visualize="I", **kwargs)
diff --git a/densepose/vis/densepose_outputs_vertex.py b/densepose/vis/densepose_outputs_vertex.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe296fcf81e5711eea21049a1b4de17eb2541b3f
--- /dev/null
+++ b/densepose/vis/densepose_outputs_vertex.py
@@ -0,0 +1,231 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+# pyre-unsafe
+import json
+import numpy as np
+from functools import lru_cache
+from typing import Dict, List, Optional, Tuple
+import cv2
+import torch
+
+from detectron2.utils.file_io import PathManager
+
+from densepose.modeling import build_densepose_embedder
+from densepose.modeling.cse.utils import get_closest_vertices_mask_from_ES
+
+from ..data.utils import get_class_to_mesh_name_mapping
+from ..structures import DensePoseEmbeddingPredictorOutput
+from ..structures.mesh import create_mesh
+from .base import Boxes, Image, MatrixVisualizer
+from .densepose_results_textures import get_texture_atlas
+
+
+@lru_cache()
+def get_xyz_vertex_embedding(mesh_name: str, device: torch.device):
+ if mesh_name == "smpl_27554":
+ embed_path = PathManager.get_local_path(
+ "https://dl.fbaipublicfiles.com/densepose/data/cse/mds_d=256.npy"
+ )
+ embed_map, _ = np.load(embed_path, allow_pickle=True)
+ embed_map = torch.tensor(embed_map).float()[:, 0]
+ embed_map -= embed_map.min()
+ embed_map /= embed_map.max()
+ else:
+ mesh = create_mesh(mesh_name, device)
+ embed_map = mesh.vertices.sum(dim=1)
+ embed_map -= embed_map.min()
+ embed_map /= embed_map.max()
+ embed_map = embed_map**2
+ return embed_map
+
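+# get_xyz_vertex_embedding maps every mesh vertex to a scalar in [0, 1] that is later
+# used as a pseudo-color: for "smpl_27554" it loads a precomputed MDS embedding and
+# min-max normalizes its first column, while for any other mesh it normalizes (and
+# squares) the sum of the vertex coordinates. The lru_cache avoids re-downloading or
+# rebuilding the mesh embedding for every detection.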
+
+class DensePoseOutputsVertexVisualizer:
+ def __init__(
+ self,
+ cfg,
+ inplace=True,
+ cmap=cv2.COLORMAP_JET,
+ alpha=0.7,
+ device="cuda",
+ default_class=0,
+ **kwargs,
+ ):
+ self.mask_visualizer = MatrixVisualizer(
+ inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha
+ )
+ self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+ self.embedder = build_densepose_embedder(cfg)
+ self.device = torch.device(device)
+ self.default_class = default_class
+
+ self.mesh_vertex_embeddings = {
+ mesh_name: self.embedder(mesh_name).to(self.device)
+ for mesh_name in self.class_to_mesh_name.values()
+ if self.embedder.has_embeddings(mesh_name)
+ }
+
+ def visualize(
+ self,
+ image_bgr: Image,
+ outputs_boxes_xywh_classes: Tuple[
+ Optional[DensePoseEmbeddingPredictorOutput], Optional[Boxes], Optional[List[int]]
+ ],
+ ) -> Image:
+ if outputs_boxes_xywh_classes[0] is None:
+ return image_bgr
+
+ S, E, N, bboxes_xywh, pred_classes = self.extract_and_check_outputs_and_boxes(
+ outputs_boxes_xywh_classes
+ )
+
+ for n in range(N):
+ x, y, w, h = bboxes_xywh[n].int().tolist()
+ mesh_name = self.class_to_mesh_name[pred_classes[n]]
+ closest_vertices, mask = get_closest_vertices_mask_from_ES(
+ E[[n]],
+ S[[n]],
+ h,
+ w,
+ self.mesh_vertex_embeddings[mesh_name],
+ self.device,
+ )
+ embed_map = get_xyz_vertex_embedding(mesh_name, self.device)
+ vis = (embed_map[closest_vertices].clip(0, 1) * 255.0).cpu().numpy()
+ mask_numpy = mask.cpu().numpy().astype(dtype=np.uint8)
+ image_bgr = self.mask_visualizer.visualize(image_bgr, mask_numpy, vis, [x, y, w, h])
+
+ return image_bgr
+
+ def extract_and_check_outputs_and_boxes(self, outputs_boxes_xywh_classes):
+
+ densepose_output, bboxes_xywh, pred_classes = outputs_boxes_xywh_classes
+
+ if pred_classes is None:
+ pred_classes = [self.default_class] * len(bboxes_xywh)
+
+ assert isinstance(
+ densepose_output, DensePoseEmbeddingPredictorOutput
+ ), "DensePoseEmbeddingPredictorOutput expected, {} encountered".format(
+ type(densepose_output)
+ )
+
+ S = densepose_output.coarse_segm
+ E = densepose_output.embedding
+ N = S.size(0)
+ assert N == E.size(
+ 0
+ ), "CSE coarse_segm {} and embeddings {}" " should have equal first dim size".format(
+ S.size(), E.size()
+ )
+ assert N == len(
+ bboxes_xywh
+ ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format(
+ len(bboxes_xywh), N
+ )
+ assert N == len(pred_classes), (
+ "number of predicted classes {}"
+ " should be equal to first dim size of outputs {}".format(len(bboxes_xywh), N)
+ )
+
+ return S, E, N, bboxes_xywh, pred_classes
+
+
+def get_texture_atlases(json_str: Optional[str]) -> Optional[Dict[str, Optional[np.ndarray]]]:
+ """
+ json_str is a JSON string representing a mesh_name -> texture_atlas_path dictionary
+ """
+ if json_str is None:
+ return None
+
+ paths = json.loads(json_str)
+ return {mesh_name: get_texture_atlas(path) for mesh_name, path in paths.items()}
+
+
+class DensePoseOutputsTextureVisualizer(DensePoseOutputsVertexVisualizer):
+ def __init__(
+ self,
+ cfg,
+ texture_atlases_dict,
+ device="cuda",
+ default_class=0,
+ **kwargs,
+ ):
+ self.embedder = build_densepose_embedder(cfg)
+
+ self.texture_image_dict = {}
+ self.alpha_dict = {}
+
+ for mesh_name in texture_atlases_dict.keys():
+ if texture_atlases_dict[mesh_name].shape[-1] == 4: # Image with alpha channel
+ self.alpha_dict[mesh_name] = texture_atlases_dict[mesh_name][:, :, -1] / 255.0
+ self.texture_image_dict[mesh_name] = texture_atlases_dict[mesh_name][:, :, :3]
+ else:
+ self.alpha_dict[mesh_name] = texture_atlases_dict[mesh_name].sum(axis=-1) > 0
+ self.texture_image_dict[mesh_name] = texture_atlases_dict[mesh_name]
+
+ self.device = torch.device(device)
+ self.class_to_mesh_name = get_class_to_mesh_name_mapping(cfg)
+ self.default_class = default_class
+
+ self.mesh_vertex_embeddings = {
+ mesh_name: self.embedder(mesh_name).to(self.device)
+ for mesh_name in self.class_to_mesh_name.values()
+ }
+
+ def visualize(
+ self,
+ image_bgr: Image,
+ outputs_boxes_xywh_classes: Tuple[
+ Optional[DensePoseEmbeddingPredictorOutput], Optional[Boxes], Optional[List[int]]
+ ],
+ ) -> Image:
+ image_target_bgr = image_bgr.copy()
+ if outputs_boxes_xywh_classes[0] is None:
+ return image_target_bgr
+
+ S, E, N, bboxes_xywh, pred_classes = self.extract_and_check_outputs_and_boxes(
+ outputs_boxes_xywh_classes
+ )
+
+ meshes = {
+ p: create_mesh(self.class_to_mesh_name[p], self.device) for p in np.unique(pred_classes)
+ }
+
+ for n in range(N):
+ x, y, w, h = bboxes_xywh[n].int().cpu().numpy()
+ mesh_name = self.class_to_mesh_name[pred_classes[n]]
+ closest_vertices, mask = get_closest_vertices_mask_from_ES(
+ E[[n]],
+ S[[n]],
+ h,
+ w,
+ self.mesh_vertex_embeddings[mesh_name],
+ self.device,
+ )
+ uv_array = meshes[pred_classes[n]].texcoords[closest_vertices].permute((2, 0, 1))
+ uv_array = uv_array.cpu().numpy().clip(0, 1)
+ textured_image = self.generate_image_with_texture(
+ image_target_bgr[y : y + h, x : x + w],
+ uv_array,
+ mask.cpu().numpy(),
+ self.class_to_mesh_name[pred_classes[n]],
+ )
+ if textured_image is None:
+ continue
+ image_target_bgr[y : y + h, x : x + w] = textured_image
+
+ return image_target_bgr
+
+ def generate_image_with_texture(self, bbox_image_bgr, uv_array, mask, mesh_name):
+ alpha = self.alpha_dict.get(mesh_name)
+ texture_image = self.texture_image_dict.get(mesh_name)
+ if alpha is None or texture_image is None:
+ return None
+ U, V = uv_array
+ x_index = (U * texture_image.shape[1]).astype(int)
+ y_index = (V * texture_image.shape[0]).astype(int)
+ local_texture = texture_image[y_index, x_index][mask]
+ local_alpha = np.expand_dims(alpha[y_index, x_index][mask], -1)
+ output_image = bbox_image_bgr.copy()
+ output_image[mask] = output_image[mask] * (1 - local_alpha) + local_texture * local_alpha
+ return output_image.astype(np.uint8)
diff --git a/densepose/vis/densepose_results.py b/densepose/vis/densepose_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..d49a3828339b6ff03735924d3621396ca8f00e5c
--- /dev/null
+++ b/densepose/vis/densepose_results.py
@@ -0,0 +1,357 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import logging
+import numpy as np
+from typing import List, Optional, Tuple
+import cv2
+import torch
+
+from densepose.structures import DensePoseDataRelative
+
+from ..structures import DensePoseChartResult
+from .base import Boxes, Image, MatrixVisualizer
+
+
+class DensePoseResultsVisualizer:
+ def visualize(
+ self,
+ image_bgr: Image,
+ results_and_boxes_xywh: Tuple[Optional[List[DensePoseChartResult]], Optional[Boxes]],
+ ) -> Image:
+ densepose_result, boxes_xywh = results_and_boxes_xywh
+ if densepose_result is None or boxes_xywh is None:
+ return image_bgr
+
+ boxes_xywh = boxes_xywh.cpu().numpy()
+ context = self.create_visualization_context(image_bgr)
+ for i, result in enumerate(densepose_result):
+ iuv_array = torch.cat(
+ (result.labels[None].type(torch.float32), result.uv * 255.0)
+ ).type(torch.uint8)
+ self.visualize_iuv_arr(context, iuv_array.cpu().numpy(), boxes_xywh[i])
+ image_bgr = self.context_to_image_bgr(context)
+ return image_bgr
+
+ def create_visualization_context(self, image_bgr: Image):
+ return image_bgr
+
+ def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
+ pass
+
+ def context_to_image_bgr(self, context):
+ return context
+
+ def get_image_bgr_from_context(self, context):
+ return context
+
+
+class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer):
+ def __init__(
+ self,
+ data_extractor,
+ segm_extractor,
+ inplace=True,
+ cmap=cv2.COLORMAP_PARULA,
+ alpha=0.7,
+ val_scale=1.0,
+ **kwargs,
+ ):
+ self.mask_visualizer = MatrixVisualizer(
+ inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha
+ )
+ self.data_extractor = data_extractor
+ self.segm_extractor = segm_extractor
+
+ def context_to_image_bgr(self, context):
+ return context
+
+ def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh) -> None:
+ image_bgr = self.get_image_bgr_from_context(context)
+ matrix = self.data_extractor(iuv_arr)
+ segm = self.segm_extractor(iuv_arr)
+ mask = np.zeros(matrix.shape, dtype=np.uint8)
+ mask[segm > 0] = 1
+ image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh)
+
+
+def _extract_i_from_iuvarr(iuv_arr):
+ return iuv_arr[0, :, :]
+
+
+def _extract_u_from_iuvarr(iuv_arr):
+ return iuv_arr[1, :, :]
+
+
+def _extract_v_from_iuvarr(iuv_arr):
+ return iuv_arr[2, :, :]
+
+
+class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer):
+ def __init__(self, levels=10, **kwargs):
+ self.levels = levels
+ self.plot_args = kwargs
+
+ def create_visualization_context(self, image_bgr: Image):
+ import matplotlib.pyplot as plt
+ from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+
+ context = {}
+ context["image_bgr"] = image_bgr
+ dpi = 100
+ height_inches = float(image_bgr.shape[0]) / dpi
+ width_inches = float(image_bgr.shape[1]) / dpi
+ fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi)
+ plt.axes([0, 0, 1, 1])
+ plt.axis("off")
+ context["fig"] = fig
+ canvas = FigureCanvas(fig)
+ context["canvas"] = canvas
+ extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0)
+ plt.imshow(image_bgr[:, :, ::-1], extent=extent)
+ return context
+
+ def context_to_image_bgr(self, context):
+ fig = context["fig"]
+ w, h = map(int, fig.get_size_inches() * fig.get_dpi())
+ canvas = context["canvas"]
+ canvas.draw()
+        image_1d = np.frombuffer(canvas.tostring_rgb(), dtype="uint8")
+ image_rgb = image_1d.reshape(h, w, 3)
+ image_bgr = image_rgb[:, :, ::-1].copy()
+ return image_bgr
+
+ def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> None:
+ import matplotlib.pyplot as plt
+
+ u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+ v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+ extent = (
+ bbox_xywh[0],
+ bbox_xywh[0] + bbox_xywh[2],
+ bbox_xywh[1],
+ bbox_xywh[1] + bbox_xywh[3],
+ )
+ plt.contour(u, self.levels, extent=extent, **self.plot_args)
+ plt.contour(v, self.levels, extent=extent, **self.plot_args)
+
+
+class DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer):
+ """
+ Contour visualization using marching squares
+ """
+
+ def __init__(self, levels=10, **kwargs):
+ # TODO: colormap is hardcoded
+ cmap = cv2.COLORMAP_PARULA
+ if isinstance(levels, int):
+ self.levels = np.linspace(0, 1, levels)
+ else:
+ self.levels = levels
+ if "linewidths" in kwargs:
+ self.linewidths = kwargs["linewidths"]
+ else:
+ self.linewidths = [1] * len(self.levels)
+ self.plot_args = kwargs
+ img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap)
+ self.level_colors_bgr = [
+ [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr
+ ]
+
+ def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> None:
+ image_bgr = self.get_image_bgr_from_context(context)
+ segm = _extract_i_from_iuvarr(iuv_arr)
+ u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0
+ v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0
+ self._contours(image_bgr, u, segm, bbox_xywh)
+ self._contours(image_bgr, v, segm, bbox_xywh)
+
+ def _contours(self, image_bgr, arr, segm, bbox_xywh):
+ for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1):
+ mask = segm == part_idx
+ if not np.any(mask):
+ continue
+ arr_min = np.amin(arr[mask])
+ arr_max = np.amax(arr[mask])
+ I, J = np.nonzero(mask)
+ i0 = np.amin(I)
+ i1 = np.amax(I) + 1
+ j0 = np.amin(J)
+ j1 = np.amax(J) + 1
+ if (j1 == j0 + 1) or (i1 == i0 + 1):
+ continue
+ Nw = arr.shape[1] - 1
+ Nh = arr.shape[0] - 1
+ for level_idx, level in enumerate(self.levels):
+ if (level < arr_min) or (level > arr_max):
+ continue
+ vp = arr[i0:i1, j0:j1] >= level
+ bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8
+ mp = mask[i0:i1, j0:j1]
+ bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8
+ it = np.nditer(bin_codes, flags=["multi_index"])
+ color_bgr = self.level_colors_bgr[level_idx]
+ linewidth = self.linewidths[level_idx]
+ while not it.finished:
+ if (it[0] != 0) and (it[0] != 15):
+ i, j = it.multi_index
+ if bin_mask_codes[i, j] != 0:
+ self._draw_line(
+ image_bgr,
+ arr,
+ mask,
+ level,
+ color_bgr,
+ linewidth,
+ it[0],
+ it.multi_index,
+ bbox_xywh,
+ Nw,
+ Nh,
+ (i0, j0),
+ )
+ it.iternext()
+
+ def _draw_line(
+ self,
+ image_bgr,
+ arr,
+ mask,
+ v,
+ color_bgr,
+ linewidth,
+ bin_code,
+ multi_idx,
+ bbox_xywh,
+ Nw,
+ Nh,
+ offset,
+ ):
+ lines = self._bin_code_2_lines(arr, v, bin_code, multi_idx, Nw, Nh, offset)
+ x0, y0, w, h = bbox_xywh
+ x1 = x0 + w
+ y1 = y0 + h
+ for line in lines:
+ x0r, y0r = line[0]
+ x1r, y1r = line[1]
+ pt0 = (int(x0 + x0r * (x1 - x0)), int(y0 + y0r * (y1 - y0)))
+ pt1 = (int(x0 + x1r * (x1 - x0)), int(y0 + y1r * (y1 - y0)))
+ cv2.line(image_bgr, pt0, pt1, color_bgr, linewidth)
+
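+    # `bin_code` is the 4-bit marching-squares cell code built in _contours as
+    #   vp[:-1, :-1] * 1 + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8,
+    # so bit 1 corresponds to corner (i, j) = v0, bit 2 to (i + 1, j) = v1,
+    # bit 4 to (i + 1, j + 1) = v2 and bit 8 to (i, j + 1) = v3. Codes 0 and 15
+    # (all corners below / above the level) are skipped by the caller, and
+    # complementary codes such as 1 and 14 cross the same cell edges, which is why
+    # the branches below handle them in pairs; the ambiguous codes 5 and 10 emit
+    # two segments.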
+ def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset):
+ i0, j0 = offset
+ i, j = multi_idx
+ i += i0
+ j += j0
+ v0, v1, v2, v3 = arr[i, j], arr[i + 1, j], arr[i + 1, j + 1], arr[i, j + 1]
+ x0i = float(j) / Nw
+ y0j = float(i) / Nh
+ He = 1.0 / Nh
+ We = 1.0 / Nw
+ if (bin_code == 1) or (bin_code == 14):
+ a = (v - v0) / (v1 - v0)
+ b = (v - v0) / (v3 - v0)
+ pt1 = (x0i, y0j + a * He)
+ pt2 = (x0i + b * We, y0j)
+ return [(pt1, pt2)]
+ elif (bin_code == 2) or (bin_code == 13):
+ a = (v - v0) / (v1 - v0)
+ b = (v - v1) / (v2 - v1)
+ pt1 = (x0i, y0j + a * He)
+ pt2 = (x0i + b * We, y0j + He)
+ return [(pt1, pt2)]
+ elif (bin_code == 3) or (bin_code == 12):
+ a = (v - v0) / (v3 - v0)
+ b = (v - v1) / (v2 - v1)
+ pt1 = (x0i + a * We, y0j)
+ pt2 = (x0i + b * We, y0j + He)
+ return [(pt1, pt2)]
+ elif (bin_code == 4) or (bin_code == 11):
+ a = (v - v1) / (v2 - v1)
+ b = (v - v3) / (v2 - v3)
+ pt1 = (x0i + a * We, y0j + He)
+ pt2 = (x0i + We, y0j + b * He)
+ return [(pt1, pt2)]
+ elif (bin_code == 6) or (bin_code == 9):
+ a = (v - v0) / (v1 - v0)
+ b = (v - v3) / (v2 - v3)
+ pt1 = (x0i, y0j + a * He)
+ pt2 = (x0i + We, y0j + b * He)
+ return [(pt1, pt2)]
+ elif (bin_code == 7) or (bin_code == 8):
+ a = (v - v0) / (v3 - v0)
+ b = (v - v3) / (v2 - v3)
+ pt1 = (x0i + a * We, y0j)
+ pt2 = (x0i + We, y0j + b * He)
+ return [(pt1, pt2)]
+ elif bin_code == 5:
+ a1 = (v - v0) / (v1 - v0)
+ b1 = (v - v1) / (v2 - v1)
+ pt11 = (x0i, y0j + a1 * He)
+ pt12 = (x0i + b1 * We, y0j + He)
+ a2 = (v - v0) / (v3 - v0)
+ b2 = (v - v3) / (v2 - v3)
+ pt21 = (x0i + a2 * We, y0j)
+ pt22 = (x0i + We, y0j + b2 * He)
+ return [(pt11, pt12), (pt21, pt22)]
+ elif bin_code == 10:
+ a1 = (v - v0) / (v3 - v0)
+ b1 = (v - v0) / (v1 - v0)
+ pt11 = (x0i + a1 * We, y0j)
+ pt12 = (x0i, y0j + b1 * He)
+ a2 = (v - v1) / (v2 - v1)
+ b2 = (v - v3) / (v2 - v3)
+ pt21 = (x0i + a2 * We, y0j + He)
+ pt22 = (x0i + We, y0j + b2 * He)
+ return [(pt11, pt12), (pt21, pt22)]
+ return []
+
+
+try:
+ import matplotlib
+
+ matplotlib.use("Agg")
+ DensePoseResultsContourVisualizer = DensePoseResultsMplContourVisualizer
+except ModuleNotFoundError:
+ logger = logging.getLogger(__name__)
+ logger.warning("Could not import matplotlib, using custom contour visualizer")
+ DensePoseResultsContourVisualizer = DensePoseResultsCustomContourVisualizer
+
+
+class DensePoseResultsFineSegmentationVisualizer(DensePoseMaskedColormapResultsVisualizer):
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ super(DensePoseResultsFineSegmentationVisualizer, self).__init__(
+ _extract_i_from_iuvarr,
+ _extract_i_from_iuvarr,
+ inplace,
+ cmap,
+ alpha,
+ val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS,
+ **kwargs,
+ )
+
+
+class DensePoseResultsUVisualizer(DensePoseMaskedColormapResultsVisualizer):
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ super(DensePoseResultsUVisualizer, self).__init__(
+ _extract_u_from_iuvarr,
+ _extract_i_from_iuvarr,
+ inplace,
+ cmap,
+ alpha,
+ val_scale=1.0,
+ **kwargs,
+ )
+
+
+class DensePoseResultsVVisualizer(DensePoseMaskedColormapResultsVisualizer):
+ def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7, **kwargs):
+ super(DensePoseResultsVVisualizer, self).__init__(
+ _extract_v_from_iuvarr,
+ _extract_i_from_iuvarr,
+ inplace,
+ cmap,
+ alpha,
+ val_scale=1.0,
+ **kwargs,
+ )
diff --git a/densepose/vis/densepose_results_textures.py b/densepose/vis/densepose_results_textures.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa33b861100b796f411f3aade1c03a68c279262e
--- /dev/null
+++ b/densepose/vis/densepose_results_textures.py
@@ -0,0 +1,93 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import numpy as np
+from typing import List, Optional, Tuple
+import torch
+
+from detectron2.data.detection_utils import read_image
+
+from ..structures import DensePoseChartResult
+from .base import Boxes, Image
+from .densepose_results import DensePoseResultsVisualizer
+
+
+def get_texture_atlas(path: Optional[str]) -> Optional[np.ndarray]:
+ if path is None:
+ return None
+
+ # Reading images like that downsamples 16-bit images to 8-bit
+ # If 16-bit images are needed, we can replace that by cv2.imread with the
+ # cv2.IMREAD_UNCHANGED flag (with cv2 we also need it to keep alpha channels)
+ # The rest of the pipeline would need to be adapted to 16-bit images too
+ bgr_image = read_image(path)
+ rgb_image = np.copy(bgr_image) # Convert BGR -> RGB
+ rgb_image[:, :, :3] = rgb_image[:, :, 2::-1] # Works with alpha channel
+ return rgb_image
+
+
+class DensePoseResultsVisualizerWithTexture(DensePoseResultsVisualizer):
+ """
+ texture_atlas: An image, size 6N * 4N, with N * N squares for each of the 24 body parts.
+ It must follow the grid found at https://github.com/facebookresearch/DensePose/blob/master/DensePoseData/demo_data/texture_atlas_200.png # noqa
+ For each body part, U is proportional to the x coordinate, and (1 - V) to y
+ """
+
+ def __init__(self, texture_atlas, **kwargs):
+ self.texture_atlas = texture_atlas
+ self.body_part_size = texture_atlas.shape[0] // 6
+ assert self.body_part_size == texture_atlas.shape[1] // 4
+
+ def visualize(
+ self,
+ image_bgr: Image,
+ results_and_boxes_xywh: Tuple[Optional[List[DensePoseChartResult]], Optional[Boxes]],
+ ) -> Image:
+ densepose_result, boxes_xywh = results_and_boxes_xywh
+ if densepose_result is None or boxes_xywh is None:
+ return image_bgr
+
+ boxes_xywh = boxes_xywh.int().cpu().numpy()
+ texture_image, alpha = self.get_texture()
+ for i, result in enumerate(densepose_result):
+ iuv_array = torch.cat((result.labels[None], result.uv.clamp(0, 1)))
+ x, y, w, h = boxes_xywh[i]
+ bbox_image = image_bgr[y : y + h, x : x + w]
+ image_bgr[y : y + h, x : x + w] = self.generate_image_with_texture(
+ texture_image, alpha, bbox_image, iuv_array.cpu().numpy()
+ )
+ return image_bgr
+
+ def get_texture(self):
+ N = self.body_part_size
+ texture_image = np.zeros([24, N, N, self.texture_atlas.shape[-1]])
+ for i in range(4):
+ for j in range(6):
+ texture_image[(6 * i + j), :, :, :] = self.texture_atlas[
+ N * j : N * (j + 1), N * i : N * (i + 1), :
+ ]
+
+ if texture_image.shape[-1] == 4: # Image with alpha channel
+ alpha = texture_image[:, :, :, -1] / 255.0
+ texture_image = texture_image[:, :, :, :3]
+ else:
+ alpha = texture_image.sum(axis=-1) > 0
+
+ return texture_image, alpha
+
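+    # get_texture above slices the 6N x 4N atlas column-major into 24 tiles of size
+    # N x N, so tile index 6 * i + j (i in 0..3, j in 0..5) holds DensePose part label
+    # PartInd = 6 * i + j + 1. Within each tile, U indexes the x axis and (1 - V) the
+    # y axis, which is exactly how the texture is sampled below.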
+ def generate_image_with_texture(self, texture_image, alpha, bbox_image_bgr, iuv_array):
+
+ I, U, V = iuv_array
+ generated_image_bgr = bbox_image_bgr.copy()
+
+ for PartInd in range(1, 25):
+ x, y = np.where(I == PartInd)
+ x_index = (U[x, y] * (self.body_part_size - 1)).astype(int)
+ y_index = ((1 - V[x, y]) * (self.body_part_size - 1)).astype(int)
+ part_alpha = np.expand_dims(alpha[PartInd - 1, y_index, x_index], -1)
+ generated_image_bgr[I == PartInd] = (
+ generated_image_bgr[I == PartInd] * (1 - part_alpha)
+ + texture_image[PartInd - 1, y_index, x_index] * part_alpha
+ )
+
+ return generated_image_bgr.astype(np.uint8)
diff --git a/densepose/vis/extractor.py b/densepose/vis/extractor.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc52a51955750a178521b8ed9442b31dd9f1ebb
--- /dev/null
+++ b/densepose/vis/extractor.py
@@ -0,0 +1,201 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+# pyre-unsafe
+import logging
+from typing import List, Optional, Sequence, Tuple
+import torch
+
+from detectron2.layers.nms import batched_nms
+from detectron2.structures.instances import Instances
+
+from densepose.converters import ToChartResultConverterWithConfidences
+from densepose.structures import (
+ DensePoseChartResultWithConfidences,
+ DensePoseEmbeddingPredictorOutput,
+)
+from densepose.vis.bounding_box import BoundingBoxVisualizer, ScoredBoundingBoxVisualizer
+from densepose.vis.densepose_outputs_vertex import DensePoseOutputsVertexVisualizer
+from densepose.vis.densepose_results import DensePoseResultsVisualizer
+
+from .base import CompoundVisualizer
+
+Scores = Sequence[float]
+DensePoseChartResultsWithConfidences = List[DensePoseChartResultWithConfidences]
+
+
+def extract_scores_from_instances(instances: Instances, select=None):
+ if instances.has("scores"):
+ return instances.scores if select is None else instances.scores[select]
+ return None
+
+
+def extract_boxes_xywh_from_instances(instances: Instances, select=None):
+ if instances.has("pred_boxes"):
+ boxes_xywh = instances.pred_boxes.tensor.clone()
+ boxes_xywh[:, 2] -= boxes_xywh[:, 0]
+ boxes_xywh[:, 3] -= boxes_xywh[:, 1]
+ return boxes_xywh if select is None else boxes_xywh[select]
+ return None
+
+
+def create_extractor(visualizer: object):
+ """
+ Create an extractor for the provided visualizer
+ """
+ if isinstance(visualizer, CompoundVisualizer):
+ extractors = [create_extractor(v) for v in visualizer.visualizers]
+ return CompoundExtractor(extractors)
+ elif isinstance(visualizer, DensePoseResultsVisualizer):
+ return DensePoseResultExtractor()
+ elif isinstance(visualizer, ScoredBoundingBoxVisualizer):
+ return CompoundExtractor([extract_boxes_xywh_from_instances, extract_scores_from_instances])
+ elif isinstance(visualizer, BoundingBoxVisualizer):
+ return extract_boxes_xywh_from_instances
+ elif isinstance(visualizer, DensePoseOutputsVertexVisualizer):
+ return DensePoseOutputsExtractor()
+ else:
+ logger = logging.getLogger(__name__)
+ logger.error(f"Could not create extractor for {visualizer}")
+ return None
+
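+# Usage sketch for pairing extractors with visualizers, assuming `instances` is a
+# detectron2 `Instances` object produced by a DensePose predictor and `image_bgr` is a
+# BGR numpy image (the visualizer import below lives in this package's
+# densepose_results module):
+#
+#   from densepose.vis.densepose_results import DensePoseResultsFineSegmentationVisualizer
+#   visualizer = CompoundVisualizer(
+#       [DensePoseResultsFineSegmentationVisualizer(), BoundingBoxVisualizer()]
+#   )
+#   extractor = create_extractor(visualizer)
+#   data = extractor(instances)
+#   image_vis = visualizer.visualize(image_bgr, data)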
+
+class BoundingBoxExtractor:
+ """
+ Extracts bounding boxes from instances
+ """
+
+ def __call__(self, instances: Instances):
+ boxes_xywh = extract_boxes_xywh_from_instances(instances)
+ return boxes_xywh
+
+
+class ScoredBoundingBoxExtractor:
+ """
+ Extracts bounding boxes from instances
+ """
+
+ def __call__(self, instances: Instances, select=None):
+ scores = extract_scores_from_instances(instances)
+ boxes_xywh = extract_boxes_xywh_from_instances(instances)
+ if (scores is None) or (boxes_xywh is None):
+ return (boxes_xywh, scores)
+ if select is not None:
+ scores = scores[select]
+ boxes_xywh = boxes_xywh[select]
+ return (boxes_xywh, scores)
+
+
+class DensePoseResultExtractor:
+ """
+ Extracts DensePose chart result with confidences from instances
+ """
+
+ def __call__(
+ self, instances: Instances, select=None
+ ) -> Tuple[Optional[DensePoseChartResultsWithConfidences], Optional[torch.Tensor]]:
+ if instances.has("pred_densepose") and instances.has("pred_boxes"):
+ dpout = instances.pred_densepose
+ boxes_xyxy = instances.pred_boxes
+ boxes_xywh = extract_boxes_xywh_from_instances(instances)
+ if select is not None:
+ dpout = dpout[select]
+ boxes_xyxy = boxes_xyxy[select]
+ converter = ToChartResultConverterWithConfidences()
+ results = [converter.convert(dpout[i], boxes_xyxy[[i]]) for i in range(len(dpout))]
+ return results, boxes_xywh
+ else:
+ return None, None
+
+
+class DensePoseOutputsExtractor:
+ """
+ Extracts DensePose result from instances
+ """
+
+ def __call__(
+ self,
+ instances: Instances,
+ select=None,
+ ) -> Tuple[
+ Optional[DensePoseEmbeddingPredictorOutput], Optional[torch.Tensor], Optional[List[int]]
+ ]:
+ if not (instances.has("pred_densepose") and instances.has("pred_boxes")):
+ return None, None, None
+
+ dpout = instances.pred_densepose
+ boxes_xyxy = instances.pred_boxes
+ boxes_xywh = extract_boxes_xywh_from_instances(instances)
+
+ if instances.has("pred_classes"):
+ classes = instances.pred_classes.tolist()
+ else:
+ classes = None
+
+ if select is not None:
+ dpout = dpout[select]
+ boxes_xyxy = boxes_xyxy[select]
+ if classes is not None:
+ classes = classes[select]
+
+ return dpout, boxes_xywh, classes
+
+
+class CompoundExtractor:
+ """
+ Extracts data for CompoundVisualizer
+ """
+
+ def __init__(self, extractors):
+ self.extractors = extractors
+
+ def __call__(self, instances: Instances, select=None):
+ datas = []
+ for extractor in self.extractors:
+ data = extractor(instances, select)
+ datas.append(data)
+ return datas
+
+
+class NmsFilteredExtractor:
+ """
+ Extracts data in the format accepted by NmsFilteredVisualizer
+ """
+
+ def __init__(self, extractor, iou_threshold):
+ self.extractor = extractor
+ self.iou_threshold = iou_threshold
+
+ def __call__(self, instances: Instances, select=None):
+ scores = extract_scores_from_instances(instances)
+ boxes_xywh = extract_boxes_xywh_from_instances(instances)
+ if boxes_xywh is None:
+ return None
+ select_local_idx = batched_nms(
+ boxes_xywh,
+ scores,
+ torch.zeros(len(scores), dtype=torch.int32),
+ iou_threshold=self.iou_threshold,
+ ).squeeze()
+ select_local = torch.zeros(len(boxes_xywh), dtype=torch.bool, device=boxes_xywh.device)
+ select_local[select_local_idx] = True
+ select = select_local if select is None else (select & select_local)
+ return self.extractor(instances, select=select)
+
+
+class ScoreThresholdedExtractor:
+ """
+ Extracts data in the format accepted by ScoreThresholdedVisualizer
+ """
+
+ def __init__(self, extractor, min_score):
+ self.extractor = extractor
+ self.min_score = min_score
+
+ def __call__(self, instances: Instances, select=None):
+ scores = extract_scores_from_instances(instances)
+ if scores is None:
+ return None
+ select_local = scores > self.min_score
+ select = select_local if select is None else (select & select_local)
+ data = self.extractor(instances, select=select)
+ return data
diff --git a/model/DensePose/__pycache__/__init__.cpython-39.pyc b/model/DensePose/__pycache__/__init__.cpython-39.pyc
index e28a4bf3960d96c7a845132b2fba691fa9fb269d..ea0e8015d651fe0abda5d0c48544ee25079c7c24 100644
Binary files a/model/DensePose/__pycache__/__init__.cpython-39.pyc and b/model/DensePose/__pycache__/__init__.cpython-39.pyc differ
diff --git a/model/SCHP/__init__.py b/model/SCHP/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab2f5709a10b48c0f73c7c4d6c176e29fa6ff088
--- /dev/null
+++ b/model/SCHP/__init__.py
@@ -0,0 +1,163 @@
+from model.SCHP import networks
+from model.SCHP.utils.transforms import get_affine_transform, transform_logits
+
+from collections import OrderedDict
+import torch
+import numpy as np
+import cv2
+from PIL import Image
+from torchvision import transforms
+
+def get_palette(num_cls):
+ """ Returns the color map for visualizing the segmentation mask.
+ Args:
+ num_cls: Number of classes
+ Returns:
+ The color map
+ """
+ n = num_cls
+ palette = [0] * (n * 3)
+ for j in range(0, n):
+ lab = j
+ palette[j * 3 + 0] = 0
+ palette[j * 3 + 1] = 0
+ palette[j * 3 + 2] = 0
+ i = 0
+ while lab:
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
+ i += 1
+ lab >>= 3
+ return palette
+
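+# get_palette spreads the bits of each class index across the R/G/B bit planes (the
+# usual PASCAL-VOC-style label colormap). Minimal usage sketch, assuming `parsing` is
+# an HxW uint8 label map such as the one produced in SCHP.__call__ below:
+#
+#   palette = get_palette(20)        # 20 classes for the 'lip' setting
+#   seg = Image.fromarray(parsing)
+#   seg.putpalette(palette)
+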
+dataset_settings = {
+ 'lip': {
+ 'input_size': [473, 473],
+ 'num_classes': 20,
+ 'label': ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat',
+ 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm',
+ 'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe']
+ },
+ 'atr': {
+ 'input_size': [512, 512],
+ 'num_classes': 18,
+ 'label': ['Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt',
+ 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf']
+ },
+ 'pascal': {
+ 'input_size': [512, 512],
+ 'num_classes': 7,
+ 'label': ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'],
+ }
+}
+
+class SCHP:
+ def __init__(self, ckpt_path, device):
+ dataset_type = None
+ if 'lip' in ckpt_path:
+ dataset_type = 'lip'
+ elif 'atr' in ckpt_path:
+ dataset_type = 'atr'
+ elif 'pascal' in ckpt_path:
+ dataset_type = 'pascal'
+ assert dataset_type is not None, 'Dataset type not found in checkpoint path'
+ self.device = device
+ self.num_classes = dataset_settings[dataset_type]['num_classes']
+ self.input_size = dataset_settings[dataset_type]['input_size']
+ self.aspect_ratio = self.input_size[1] * 1.0 / self.input_size[0]
+ self.palette = get_palette(self.num_classes)
+
+ self.label = dataset_settings[dataset_type]['label']
+ self.model = networks.init_model('resnet101', num_classes=self.num_classes, pretrained=None).to(device)
+ self.load_ckpt(ckpt_path)
+ self.model.eval()
+
+ self.transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
+ ])
+ self.upsample = torch.nn.Upsample(size=self.input_size, mode='bilinear', align_corners=True)
+
+
+ def load_ckpt(self, ckpt_path):
+ state_dict = torch.load(ckpt_path, map_location='cpu')['state_dict']
+ new_state_dict = OrderedDict()
+ for k, v in state_dict.items():
+ name = k[7:] # remove `module.`
+ new_state_dict[name] = v
+ self.model.load_state_dict(new_state_dict)
+
+ def _box2cs(self, box):
+ x, y, w, h = box[:4]
+ return self._xywh2cs(x, y, w, h)
+
+ def _xywh2cs(self, x, y, w, h):
+ center = np.zeros((2), dtype=np.float32)
+ center[0] = x + w * 0.5
+ center[1] = y + h * 0.5
+ if w > self.aspect_ratio * h:
+ h = w * 1.0 / self.aspect_ratio
+ elif w < self.aspect_ratio * h:
+ w = h * self.aspect_ratio
+ scale = np.array([w, h], dtype=np.float32)
+ return center, scale
+
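+    # _xywh2cs converts a pixel box into the (center, scale) pair expected by
+    # get_affine_transform: the box is padded so its w / h ratio matches the network
+    # aspect ratio, and `scale` stays in pixels (preprocess below passes the whole
+    # image via _box2cs([0, 0, w - 1, h - 1])).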
+ def preprocess(self, image):
+ if isinstance(image, str):
+ img = cv2.imread(image, cv2.IMREAD_COLOR)
+        elif isinstance(image, Image.Image):
+            # convert a PIL image to an HWC numpy array
+            img = np.array(image)
+        else:
+            raise TypeError('Unsupported input type for SCHP preprocess: {}'.format(type(image)))
+
+ h, w, _ = img.shape
+ # Get person center and scale
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
+ r = 0
+ trans = get_affine_transform(person_center, s, r, self.input_size)
+ input = cv2.warpAffine(
+ img,
+ trans,
+ (int(self.input_size[1]), int(self.input_size[0])),
+ flags=cv2.INTER_LINEAR,
+ borderMode=cv2.BORDER_CONSTANT,
+ borderValue=(0, 0, 0))
+
+ input = self.transform(input).to(self.device).unsqueeze(0)
+ meta = {
+ 'center': person_center,
+ 'height': h,
+ 'width': w,
+ 'scale': s,
+ 'rotation': r
+ }
+ return input, meta
+
+
+ def __call__(self, image_or_path):
+ if isinstance(image_or_path, list):
+ image_list = []
+ meta_list = []
+ for image in image_or_path:
+ image, meta = self.preprocess(image)
+ image_list.append(image)
+ meta_list.append(meta)
+ image = torch.cat(image_list, dim=0)
+ else:
+ image, meta = self.preprocess(image_or_path)
+ meta_list = [meta]
+
+ output = self.model(image)
+ upsample_outputs = self.upsample(output[0][-1])
+ upsample_outputs = upsample_outputs.permute(0, 2, 3, 1) # BCHW -> BHWC
+
+ output_img_list = []
+ for upsample_output, meta in zip(upsample_outputs, meta_list):
+ c, s, w, h = meta['center'], meta['scale'], meta['width'], meta['height']
+ logits_result = transform_logits(upsample_output.data.cpu().numpy(), c, s, w, h, input_size=self.input_size)
+ parsing_result = np.argmax(logits_result, axis=2)
+ output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
+ output_img.putpalette(self.palette)
+ output_img_list.append(output_img)
+
+ return output_img_list[0] if len(output_img_list) == 1 else output_img_list
\ No newline at end of file
diff --git a/model/SCHP/__pycache__/__init__.cpython-39.pyc b/model/SCHP/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ec10278c747231147fa4dfdeda08c9d38446cc4
Binary files /dev/null and b/model/SCHP/__pycache__/__init__.cpython-39.pyc differ
diff --git a/model/SCHP/networks/AugmentCE2P.py b/model/SCHP/networks/AugmentCE2P.py
new file mode 100644
index 0000000000000000000000000000000000000000..246a87ebca53c7ed089f4288ca6d91ba9ded7f32
--- /dev/null
+++ b/model/SCHP/networks/AugmentCE2P.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""
+@Author : Peike Li
+@Contact : peike.li@yahoo.com
+@File : AugmentCE2P.py
+@Time : 8/4/19 3:35 PM
+@Desc :
+@License : This source code is licensed under the license found in the
+ LICENSE file in the root directory of this source tree.
+"""
+
+import functools
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+# Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
+# By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
+from inplace_abn import InPlaceABNSync
+
+BatchNorm2d = functools.partial(InPlaceABNSync, activation='identity')
+
+affine_par = True
+
+pretrained_settings = {
+ 'resnet101': {
+ 'imagenet': {
+ 'input_space': 'BGR',
+ 'input_size': [3, 224, 224],
+ 'input_range': [0, 1],
+ 'mean': [0.406, 0.456, 0.485],
+ 'std': [0.225, 0.224, 0.229],
+ 'num_classes': 1000
+ }
+ },
+}
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+ "3x3 convolution with padding"
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+ padding=1, bias=False)
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, first_dilation=1, multi_grid=1):
+ super(Bottleneck, self).__init__()
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = BatchNorm2d(planes)
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+ padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
+ self.bn2 = BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+ self.bn3 = BatchNorm2d(planes * 4)
+ self.relu = nn.ReLU(inplace=False)
+ self.relu_inplace = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.dilation = dilation
+ self.stride = stride
+
+ def forward(self, x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out = out + residual
+ out = self.relu_inplace(out)
+
+ return out
+
+
+class PSPModule(nn.Module):
+ """
+ Reference:
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
+ """
+
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
+ super(PSPModule, self).__init__()
+
+ self.stages = []
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
+ self.bottleneck = nn.Sequential(
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
+ bias=False),
+ InPlaceABNSync(out_features),
+ )
+
+ def _make_stage(self, features, out_features, size):
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
+ bn = InPlaceABNSync(out_features)
+ return nn.Sequential(prior, conv, bn)
+
+ def forward(self, feats):
+ h, w = feats.size(2), feats.size(3)
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
+ self.stages] + [feats]
+ bottle = self.bottleneck(torch.cat(priors, 1))
+ return bottle
+
+
+class ASPPModule(nn.Module):
+ """
+ Reference:
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
+ """
+
+ def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
+ super(ASPPModule, self).__init__()
+
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
+ bias=False),
+ InPlaceABNSync(inner_features))
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(inner_features))
+ self.conv3 = nn.Sequential(
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
+ InPlaceABNSync(inner_features))
+ self.conv4 = nn.Sequential(
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
+ InPlaceABNSync(inner_features))
+ self.conv5 = nn.Sequential(
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
+ InPlaceABNSync(inner_features))
+
+ self.bottleneck = nn.Sequential(
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(out_features),
+ nn.Dropout2d(0.1)
+ )
+
+ def forward(self, x):
+ _, _, h, w = x.size()
+
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
+
+ feat2 = self.conv2(x)
+ feat3 = self.conv3(x)
+ feat4 = self.conv4(x)
+ feat5 = self.conv5(x)
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
+
+ bottle = self.bottleneck(out)
+ return bottle
+
+
+class Edge_Module(nn.Module):
+ """
+ Edge Learning Branch
+ """
+
+ def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
+ super(Edge_Module, self).__init__()
+
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(mid_fea)
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(mid_fea)
+ )
+ self.conv3 = nn.Sequential(
+ nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(mid_fea)
+ )
+ self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
+ self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
+
+ def forward(self, x1, x2, x3):
+ _, _, h, w = x1.size()
+
+ edge1_fea = self.conv1(x1)
+ edge1 = self.conv4(edge1_fea)
+ edge2_fea = self.conv2(x2)
+ edge2 = self.conv4(edge2_fea)
+ edge3_fea = self.conv3(x3)
+ edge3 = self.conv4(edge3_fea)
+
+ edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
+ edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
+ edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
+ edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
+
+ edge = torch.cat([edge1, edge2, edge3], dim=1)
+ edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
+ edge = self.conv5(edge)
+
+ return edge, edge_fea
+
+
+class Decoder_Module(nn.Module):
+ """
+ Parsing Branch Decoder Module.
+ """
+
+ def __init__(self, num_classes):
+ super(Decoder_Module, self).__init__()
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(256)
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(48)
+ )
+ self.conv3 = nn.Sequential(
+ nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(256),
+ nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(256)
+ )
+
+ self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
+
+ def forward(self, xt, xl):
+ _, _, h, w = xl.size()
+ xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
+ xl = self.conv2(xl)
+ x = torch.cat([xt, xl], dim=1)
+ x = self.conv3(x)
+ seg = self.conv4(x)
+ return seg, x
+
+
+class ResNet(nn.Module):
+ def __init__(self, block, layers, num_classes):
+ self.inplanes = 128
+ super(ResNet, self).__init__()
+ self.conv1 = conv3x3(3, 64, stride=2)
+ self.bn1 = BatchNorm2d(64)
+ self.relu1 = nn.ReLU(inplace=False)
+ self.conv2 = conv3x3(64, 64)
+ self.bn2 = BatchNorm2d(64)
+ self.relu2 = nn.ReLU(inplace=False)
+ self.conv3 = conv3x3(64, 128)
+ self.bn3 = BatchNorm2d(128)
+ self.relu3 = nn.ReLU(inplace=False)
+
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+ self.layer1 = self._make_layer(block, 64, layers[0])
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
+
+ self.context_encoding = PSPModule(2048, 512)
+
+ self.edge = Edge_Module()
+ self.decoder = Decoder_Module(num_classes)
+
+ self.fushion = nn.Sequential(
+ nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
+ InPlaceABNSync(256),
+ nn.Dropout2d(0.1),
+ nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
+ )
+
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion,
+ kernel_size=1, stride=stride, bias=False),
+ BatchNorm2d(planes * block.expansion, affine=affine_par))
+
+ layers = []
+ generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
+ layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
+ multi_grid=generate_multi_grid(0, multi_grid)))
+ self.inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(
+ block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ x = self.relu1(self.bn1(self.conv1(x)))
+ x = self.relu2(self.bn2(self.conv2(x)))
+ x = self.relu3(self.bn3(self.conv3(x)))
+ x = self.maxpool(x)
+ x2 = self.layer1(x)
+ x3 = self.layer2(x2)
+ x4 = self.layer3(x3)
+ x5 = self.layer4(x4)
+ x = self.context_encoding(x5)
+ parsing_result, parsing_fea = self.decoder(x, x2)
+ # Edge Branch
+ edge_result, edge_fea = self.edge(x2, x3, x4)
+ # Fusion Branch
+ x = torch.cat([parsing_fea, edge_fea], dim=1)
+ fusion_result = self.fushion(x)
+ return [[parsing_result, fusion_result], [edge_result]]
+
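+# ResNet.forward returns a nested list [[parsing_result, fusion_result], [edge_result]];
+# the SCHP wrapper in model/SCHP/__init__.py consumes output[0][-1], i.e. the
+# fusion-branch parsing logits, and upsamples them back to the network input size.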
+
+def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
+ model.input_space = settings['input_space']
+ model.input_size = settings['input_size']
+ model.input_range = settings['input_range']
+ model.mean = settings['mean']
+ model.std = settings['std']
+
+ if pretrained is not None:
+ saved_state_dict = torch.load(pretrained)
+ new_params = model.state_dict().copy()
+ for i in saved_state_dict:
+ i_parts = i.split('.')
+ if not i_parts[0] == 'fc':
+ new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
+ model.load_state_dict(new_params)
+
+
+def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
+ model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
+ settings = pretrained_settings['resnet101']['imagenet']
+ initialize_pretrained_model(model, settings, pretrained)
+ return model
diff --git a/model/SCHP/networks/__init__.py b/model/SCHP/networks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d037294541626d38b3ef521b0690bfd4a36e864f
--- /dev/null
+++ b/model/SCHP/networks/__init__.py
@@ -0,0 +1,13 @@
+from __future__ import absolute_import
+
+from model.SCHP.networks.AugmentCE2P import resnet101
+
+__factory = {
+ 'resnet101': resnet101,
+}
+
+
+def init_model(name, *args, **kwargs):
+ if name not in __factory.keys():
+ raise KeyError("Unknown model arch: {}".format(name))
+ return __factory[name](*args, **kwargs)
\ No newline at end of file
diff --git a/model/SCHP/networks/__pycache__/AugmentCE2P.cpython-39.pyc b/model/SCHP/networks/__pycache__/AugmentCE2P.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3d9f79122a57789ab8464a90bf3d7e97eee3e47
Binary files /dev/null and b/model/SCHP/networks/__pycache__/AugmentCE2P.cpython-39.pyc differ
diff --git a/model/SCHP/networks/__pycache__/__init__.cpython-39.pyc b/model/SCHP/networks/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91208568875f4ae9c46b5509b6e0f28f2853e26a
Binary files /dev/null and b/model/SCHP/networks/__pycache__/__init__.cpython-39.pyc differ
diff --git a/model/SCHP/utils/__pycache__/transforms.cpython-39.pyc b/model/SCHP/utils/__pycache__/transforms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d75c45636b662f0f51be89dbd80233e882a7d98
Binary files /dev/null and b/model/SCHP/utils/__pycache__/transforms.cpython-39.pyc differ
diff --git a/model/SCHP/utils/transforms.py b/model/SCHP/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..1442a728938ca19fcb4ac21ae6588266df45631c
--- /dev/null
+++ b/model/SCHP/utils/transforms.py
@@ -0,0 +1,167 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+import torch
+
+class BRG2Tensor_transform(object):
+ def __call__(self, pic):
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
+ if isinstance(img, torch.ByteTensor):
+ return img.float()
+ else:
+ return img
+
+class BGR2RGB_transform(object):
+ def __call__(self, tensor):
+ return tensor[[2,1,0],:,:]
+
+def flip_back(output_flipped, matched_parts):
+ '''
+    output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
+ '''
+ assert output_flipped.ndim == 4,\
+ 'output_flipped should be [batch_size, num_joints, height, width]'
+
+ output_flipped = output_flipped[:, :, :, ::-1]
+
+ for pair in matched_parts:
+ tmp = output_flipped[:, pair[0], :, :].copy()
+ output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
+ output_flipped[:, pair[1], :, :] = tmp
+
+ return output_flipped
+
+
+def fliplr_joints(joints, joints_vis, width, matched_parts):
+ """
+ flip coords
+ """
+ # Flip horizontal
+ joints[:, 0] = width - joints[:, 0] - 1
+
+ # Change left-right parts
+ for pair in matched_parts:
+ joints[pair[0], :], joints[pair[1], :] = \
+ joints[pair[1], :], joints[pair[0], :].copy()
+ joints_vis[pair[0], :], joints_vis[pair[1], :] = \
+ joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
+
+ return joints*joints_vis, joints_vis
+
+
+def transform_preds(coords, center, scale, input_size):
+ target_coords = np.zeros(coords.shape)
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
+ for p in range(coords.shape[0]):
+ target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
+ return target_coords
+
+def transform_parsing(pred, center, scale, width, height, input_size):
+
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
+ target_pred = cv2.warpAffine(
+ pred,
+ trans,
+        (int(width), int(height)),
+ flags=cv2.INTER_NEAREST,
+ borderMode=cv2.BORDER_CONSTANT,
+ borderValue=(0))
+
+ return target_pred
+
+def transform_logits(logits, center, scale, width, height, input_size):
+
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
+ channel = logits.shape[2]
+ target_logits = []
+ for i in range(channel):
+ target_logit = cv2.warpAffine(
+ logits[:,:,i],
+ trans,
+            (int(width), int(height)),
+ flags=cv2.INTER_LINEAR,
+ borderMode=cv2.BORDER_CONSTANT,
+ borderValue=(0))
+ target_logits.append(target_logit)
+ target_logits = np.stack(target_logits,axis=2)
+
+ return target_logits
+
+
+def get_affine_transform(center,
+ scale,
+ rot,
+ output_size,
+ shift=np.array([0, 0], dtype=np.float32),
+ inv=0):
+    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
+        # scalar scale: expand to a symmetric (w, h) pair
+        scale = np.array([scale, scale])
+
+ scale_tmp = scale
+
+ src_w = scale_tmp[0]
+ dst_w = output_size[1]
+ dst_h = output_size[0]
+
+ rot_rad = np.pi * rot / 180
+ src_dir = get_dir([0, src_w * -0.5], rot_rad)
+ dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
+
+ src = np.zeros((3, 2), dtype=np.float32)
+ dst = np.zeros((3, 2), dtype=np.float32)
+ src[0, :] = center + scale_tmp * shift
+ src[1, :] = center + src_dir + scale_tmp * shift
+ dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
+ dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
+
+ src[2:, :] = get_3rd_point(src[0, :], src[1, :])
+ dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
+
+ if inv:
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+ else:
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+ return trans
+
+
+def affine_transform(pt, t):
+ new_pt = np.array([pt[0], pt[1], 1.]).T
+ new_pt = np.dot(t, new_pt)
+ return new_pt[:2]
+
+
+def get_3rd_point(a, b):
+ direct = a - b
+ return b + np.array([-direct[1], direct[0]], dtype=np.float32)
+
+
+def get_dir(src_point, rot_rad):
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+
+ src_result = [0, 0]
+ src_result[0] = src_point[0] * cs - src_point[1] * sn
+ src_result[1] = src_point[0] * sn + src_point[1] * cs
+
+ return src_result
+
+
+def crop(img, center, scale, output_size, rot=0):
+ trans = get_affine_transform(center, scale, rot, output_size)
+
+ dst_img = cv2.warpAffine(img,
+ trans,
+ (int(output_size[1]), int(output_size[0])),
+ flags=cv2.INTER_LINEAR)
+
+ return dst_img
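
The helpers above implement the usual top-down crop pipeline: build an affine transform from a person box (center, scale) to the network input size, run the parser on the warped crop, then invert the transform to project per-class predictions back onto the original image. A minimal sketch of that round trip follows; the 473x473 input size, whole-image person box, dummy image, and class count are placeholder assumptions for illustration, not part of this diff:

```python
import numpy as np
import cv2

from model.SCHP.utils.transforms import get_affine_transform, transform_logits

# Stand-in for a real photo (a real pipeline would cv2.imread a person image).
img = np.zeros((1024, 768, 3), dtype=np.uint8)
h, w = img.shape[:2]

center = np.array([w / 2.0, h / 2.0], dtype=np.float32)  # person-box center (whole image here)
scale = np.array([w, h], dtype=np.float32)                # person-box size
input_size = (473, 473)                                   # (height, width) fed to the parser

# Forward: warp the person region into the network's input resolution.
trans = get_affine_transform(center, scale, rot=0, output_size=input_size)
inp = cv2.warpAffine(img, trans, (input_size[1], input_size[0]), flags=cv2.INTER_LINEAR)

# Stand-in for parser output: (H, W, num_classes) logits on the cropped input.
num_classes = 20
logits = np.random.rand(input_size[0], input_size[1], num_classes).astype(np.float32)

# Inverse: project per-class logits back to the original resolution, then arg-max.
full_logits = transform_logits(logits, center, scale, w, h, input_size)
parsing = np.argmax(full_logits, axis=2).astype(np.uint8)  # full-size label map
```
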
diff --git a/model/__pycache__/attn_processor.cpython-39.pyc b/model/__pycache__/attn_processor.cpython-39.pyc
index a521d17a1e187759ba4e60c5cd5f20d560e21d04..196e72f2acdbcd585a77a2eb0b3de35a831db501 100644
Binary files a/model/__pycache__/attn_processor.cpython-39.pyc and b/model/__pycache__/attn_processor.cpython-39.pyc differ
diff --git a/model/__pycache__/cloth_masker.cpython-39.pyc b/model/__pycache__/cloth_masker.cpython-39.pyc
index ec20a4ab103e71f2571cf50ef4dffc6d26b4b932..4d7eaf357437890e88449578283c341cf27dd8ba 100644
Binary files a/model/__pycache__/cloth_masker.cpython-39.pyc and b/model/__pycache__/cloth_masker.cpython-39.pyc differ
diff --git a/model/__pycache__/pipeline.cpython-39.pyc b/model/__pycache__/pipeline.cpython-39.pyc
index 0e7b8e10ffe9c935375999604205b5e0851d5789..167acbffcdd861af178c127e2be0d7e9e709d6f1 100644
Binary files a/model/__pycache__/pipeline.cpython-39.pyc and b/model/__pycache__/pipeline.cpython-39.pyc differ
diff --git a/model/__pycache__/utils.cpython-39.pyc b/model/__pycache__/utils.cpython-39.pyc
index 93edc07c5160a1afa59318d3d4fdb640cfb8f3e4..5c79849a20c76952b54e2aac3c0470151e4cc59d 100644
Binary files a/model/__pycache__/utils.cpython-39.pyc and b/model/__pycache__/utils.cpython-39.pyc differ
diff --git a/model/cloth_masker.py b/model/cloth_masker.py
index a829bcbb5a1b08e35467c393575e805bdca1c8e7..098793109a41f0dc18bd38ed8f8b9f4efa63234e 100644
--- a/model/cloth_masker.py
+++ b/model/cloth_masker.py
@@ -6,8 +6,8 @@ import cv2
from diffusers.image_processor import VaeImageProcessor
import torch
-from model.DensePose import DensePose
-from model.segformer_b2 import Segformer # type: ignore
+from model.SCHP import SCHP # type: ignore
+from model.DensePose import DensePose # type: ignore
DENSE_INDEX_MAP = {
"background": [0],
@@ -152,37 +152,43 @@ def hull_mask(mask_area: np.ndarray):
return hull_mask
-class AutoMaskerSeg:
+class AutoMasker:
def __init__(
self,
densepose_ckpt='./Models/DensePose',
- segformer_ckpt='./Models/segformer_b3_clothes',
+ schp_ckpt='./Models/SCHP',
device='cuda'):
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
self.densepose_processor = DensePose(densepose_ckpt, device)
- self.segformer_processor = Segformer(segformer_ckpt, device)
+ self.schp_processor_atr = SCHP(ckpt_path=os.path.join(schp_ckpt, 'exp-schp-201908301523-atr.pth'), device=device)
+ self.schp_processor_lip = SCHP(ckpt_path=os.path.join(schp_ckpt, 'exp-schp-201908261155-lip.pth'), device=device)
self.mask_processor = VaeImageProcessor(vae_scale_factor=8, do_normalize=False, do_binarize=True, do_convert_grayscale=True)
def process_densepose(self, image_or_path):
return self.densepose_processor(image_or_path, resize=1024)
- def process_atr(self, image_or_path):
- return self.segformer_processor(image_or_path)
+ def process_schp_lip(self, image_or_path):
+ return self.schp_processor_lip(image_or_path)
+
+ def process_schp_atr(self, image_or_path):
+ return self.schp_processor_atr(image_or_path)
def preprocess_image(self, image_or_path):
return {
'densepose': self.densepose_processor(image_or_path, resize=1024),
- 'atr': self.process_atr(image_or_path),
+ 'schp_atr': self.schp_processor_atr(image_or_path),
+ 'schp_lip': self.schp_processor_lip(image_or_path)
}
@staticmethod
def cloth_agnostic_mask(
densepose_mask: Image.Image,
- atr_mask: Image.Image,
+ schp_lip_mask: Image.Image,
+ schp_atr_mask: Image.Image,
part: str='overall',
**kwargs
):
@@ -197,30 +203,33 @@ class AutoMaskerSeg:
kernal_size = kernal_size if kernal_size % 2 == 1 else kernal_size + 1
densepose_mask = np.array(densepose_mask)
- # schp_lip_mask = np.array(schp_lip_mask)
- atr_mask = np.array(atr_mask)
+ schp_lip_mask = np.array(schp_lip_mask)
+ schp_atr_mask = np.array(schp_atr_mask)
# Strong Protect Area (Hands, Face, Accessory, Feet)
hands_protect_area = part_mask_of(['hands', 'feet'], densepose_mask, DENSE_INDEX_MAP)
hands_protect_area = cv2.dilate(hands_protect_area, dilate_kernel, iterations=1)
- hands_protect_area = hands_protect_area & (part_mask_of(['Left-arm', 'Right-arm', 'Left-leg', 'Right-leg'], atr_mask, ATR_MAPPING))
- # | part_mask_of(['Left-arm', 'Right-arm', 'Left-leg', 'Right-leg'], schp_lip_mask, LIP_MAPPING))
- face_protect_area = part_mask_of('face', densepose_mask, DENSE_INDEX_MAP) & part_mask_of('Face', atr_mask, ATR_MAPPING)
+ hands_protect_area = hands_protect_area & \
+ (part_mask_of(['Left-arm', 'Right-arm', 'Left-leg', 'Right-leg'], schp_atr_mask, ATR_MAPPING) | \
+ part_mask_of(['Left-arm', 'Right-arm', 'Left-leg', 'Right-leg'], schp_lip_mask, LIP_MAPPING))
+ face_protect_area = part_mask_of('Face', schp_lip_mask, LIP_MAPPING)
strong_protect_area = hands_protect_area | face_protect_area
# Weak Protect Area (Hair, Irrelevant Clothes, Body Parts)
- body_protect_area = part_mask_of(PROTECT_BODY_PARTS[part], atr_mask, ATR_MAPPING) # part_mask_of(PROTECT_BODY_PARTS[part], schp_lip_mask, LIP_MAPPING) |
- hair_protect_area = part_mask_of(['Hair'], atr_mask, ATR_MAPPING)#part_mask_of(['Hair'], schp_lip_mask, LIP_MAPPING) | \
-
- cloth_protect_area = part_mask_of(PROTECT_CLOTH_PARTS[part]['ATR'], atr_mask, ATR_MAPPING) #part_mask_of(PROTECT_CLOTH_PARTS[part]['LIP'], schp_lip_mask, LIP_MAPPING) | \
-
- accessory_protect_area = part_mask_of((accessory_parts := ['Hat', 'Glove', 'Sunglasses', 'Bag', 'Left-shoe', 'Right-shoe', 'Scarf', 'Socks']), atr_mask, ATR_MAPPING)
+ body_protect_area = part_mask_of(PROTECT_BODY_PARTS[part], schp_lip_mask, LIP_MAPPING) | part_mask_of(PROTECT_BODY_PARTS[part], schp_atr_mask, ATR_MAPPING)
+ hair_protect_area = part_mask_of(['Hair'], schp_lip_mask, LIP_MAPPING) | \
+ part_mask_of(['Hair'], schp_atr_mask, ATR_MAPPING)
+ cloth_protect_area = part_mask_of(PROTECT_CLOTH_PARTS[part]['LIP'], schp_lip_mask, LIP_MAPPING) | \
+ part_mask_of(PROTECT_CLOTH_PARTS[part]['ATR'], schp_atr_mask, ATR_MAPPING)
+ accessory_protect_area = part_mask_of((accessory_parts := ['Hat', 'Glove', 'Sunglasses', 'Bag', 'Left-shoe', 'Right-shoe', 'Scarf', 'Socks']), schp_lip_mask, LIP_MAPPING) | \
+ part_mask_of(accessory_parts, schp_atr_mask, ATR_MAPPING)
weak_protect_area = body_protect_area | cloth_protect_area | hair_protect_area | strong_protect_area | accessory_protect_area
# Mask Area
- strong_mask_area = part_mask_of(MASK_CLOTH_PARTS[part], atr_mask, ATR_MAPPING)
- background_area = part_mask_of(['Background'], atr_mask, ATR_MAPPING)
+ strong_mask_area = part_mask_of(MASK_CLOTH_PARTS[part], schp_lip_mask, LIP_MAPPING) | \
+ part_mask_of(MASK_CLOTH_PARTS[part], schp_atr_mask, ATR_MAPPING)
+ background_area = part_mask_of(['Background'], schp_lip_mask, LIP_MAPPING) & part_mask_of(['Background'], schp_atr_mask, ATR_MAPPING)
mask_dense_area = part_mask_of(MASK_DENSE_PARTS[part], densepose_mask, DENSE_INDEX_MAP)
mask_dense_area = cv2.resize(mask_dense_area.astype(np.uint8), None, fx=0.25, fy=0.25, interpolation=cv2.INTER_NEAREST)
mask_dense_area = cv2.dilate(mask_dense_area, dilate_kernel, iterations=2)
@@ -248,17 +257,15 @@ class AutoMaskerSeg:
preprocess_results = self.preprocess_image(image)
mask = self.cloth_agnostic_mask(
preprocess_results['densepose'],
- preprocess_results['atr'],
- # preprocess_results['schp_lip'],
- # preprocess_results['schp_atr'],
+ preprocess_results['schp_lip'],
+ preprocess_results['schp_atr'],
part=mask_type,
)
return {
'mask': mask,
'densepose': preprocess_results['densepose'],
- 'atr': preprocess_results['atr'],
- # 'schp_lip': preprocess_results['schp_lip'],
- # 'schp_atr': preprocess_results['schp_atr']
+ 'schp_lip': preprocess_results['schp_lip'],
+ 'schp_atr': preprocess_results['schp_atr']
}
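
With this change the masker composes DensePose with two SCHP parsers (ATR and LIP) and unions their part maps when building the protect and mask areas. A minimal usage sketch, assuming the default checkpoint layout from `__init__` and that the class is still invoked as a callable taking the person image plus a part name such as 'overall'; the paths and the exact call signature are assumptions, not part of this diff:

```python
from PIL import Image

from model.cloth_masker import AutoMasker

automasker = AutoMasker(
    densepose_ckpt="./Models/DensePose",
    schp_ckpt="./Models/SCHP",  # expected to contain the ATR and LIP .pth checkpoints named in __init__
    device="cuda",
)

person = Image.open("person.png")        # placeholder image path
result = automasker(person, "overall")   # assumed signature: (image, mask_type)

# Returned dict also carries 'densepose', 'schp_lip', and 'schp_atr' per the diff above.
result["mask"].save("agnostic_mask.png")
```
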
diff --git a/model/segformer_b2.py b/model/segformer_b2.py
deleted file mode 100644
index 8ae130f984042242f529e77bb55fa9de49f0a13e..0000000000000000000000000000000000000000
--- a/model/segformer_b2.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
-from PIL import Image
-import requests
-import matplotlib.pyplot as plt
-import torch.nn as nn
-
-
-FASHION_MAP = {
- "0":"Everything Else", "1": "shirt, blouse", "2": "top, t-shirt, sweatshirt",
- "3": "sweater", "4": "cardigan", "5": "jacket", "6": "vest", "7": "pants",
- "8": "shorts", "9": "skirt", "10": "coat", "11": "dress", "12": "jumpsuit",
- "13": "cape", "14": "glasses", "15": "hat", "16": "headband, head covering, hair accessory",
- "17": "tie", "18": "glove", "19": "watch", "20": "belt", "21": "leg warmer",
- "22": "tights, stockings", "23": "sock", "24": "shoe", "25": "bag, wallet",
- "26": "scarf", "27": "umbrella", "28": "hood", "29": "collar", "30": "lapel",
- "31": "epaulette", "32": "sleeve", "33": "pocket", "34": "neckline", "35": "buckle",
- "36": "zipper", "37": "applique", "38": "bead", "39": "bow", "40": "flower", "41": "fringe",
- "42": "ribbon", "43": "rivet", "44": "ruffle", "45": "sequin", "46": "tassel"
-}
-
-
-HUMAN_MAP = {
- "0":"Background","1":"shirt, blouse","2":"top, t-shirt, sweatshirt","3":"sweater",
- "4":"cardigan","5":"jacket","6":"vest","7":"pants","8":"shorts","9":"skirt",
- "10":"coat","11":"dress","12":"jumpsuit","13":"cape","14":"glasses","15":"hat",
- "16":"headband, head covering, hair accessory","17":"tie","18":"glove","19":"watch",
- "20":"belt","21":"leg warmer","22":"tights, stockings","23":"sock","24":"shoe",
- "25":"bag, wallet","26":"scarf","27":"umbrella","28":"hood","29":"collar","30":"lapel",
- "31":"epaulette","32":"sleeve","33":"pocket","34":"neckline","35":"buckle","36":"zipper",
- "37":"applique","38":"bead","39":"bow","40":"flower","41":"fringe","42":"ribbon",
- "43":"rivet","44":"ruffle","45":"sequin","46":"tassel","47":"Hair","48":"Sunglasses",
- "49":"Upper-clothes","50":"Left-shoe","51":"Right-shoe","52":"Face","53":"Left-leg",
- "54":"Right-leg","55":"Left-arm","56":"Right-arm"
-}
-
-
-
-class Segformer:
- def __init__(self, model_name, device='cuda'):
- self.device = device
- self.processor = SegformerImageProcessor.from_pretrained(model_name)
- self.model = AutoModelForSemanticSegmentation.from_pretrained(model_name).to(device)
-
-
- def predict(self, image: Image):
- inputs = self.processor(images=image, return_tensors="pt").to(self.device)
- outputs = self.model(**inputs)
- logits = outputs.logits.cpu()
-
- upsampled_logits = nn.functional.interpolate(
- logits,
- size=image.size[::-1],
- mode="bilinear",
- align_corners=False,
- )
- pred_seg = upsampled_logits.argmax(dim=1)[0]
- # to PIL image
- pred_seg = Image.fromarray(pred_seg.byte().cpu().numpy())
- return pred_seg
-
- def __call__(self, image: Image):
- return self.predict(image)
-
-
-
diff --git a/playground.py b/playground.py
deleted file mode 100644
index f2d936db364c945e11ab77e98632ed027149d4ce..0000000000000000000000000000000000000000
--- a/playground.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from model.segformer_b2 import Segformer
-from PIL import Image
-from model.cloth_masker import AutoMaskerSeg
-# model = Segformer("/home/chongzheng_p23/data/Projects/CatVTON-main/Models/segformer_b3_clothes")
-image = Image.open("/home/chongzheng_p23/data/Projects/CatVTON-main/resource/demo/example/person/women/1-model_3.png")
-# result = model(image)
-# result.save("a.png")
-
-masker = AutoMaskerSeg(
- densepose_ckpt="/home/chongzheng_p23/data/Projects/CatVTON-main/Models/densepose",
- segformer_ckpt="/home/chongzheng_p23/data/Projects/CatVTON-main/Models/segformer_b3_clothes")
-
-
-
-result = masker(image)['mask']
-result.save("b.png")
-
diff --git a/requirements.txt b/requirements.txt
index 52aa6e60095703733af7d219da99ae96e5174bad..090a050ec33f478ac3ad0bc9af0a18166d696fba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,10 @@ setuptools==51.0.0
scikit-image==0.24.0
tqdm==4.66.4
transformers==4.27.3
-xformers==0.0.23.post1
-Ninja==1.11.1.1
-git+https://github.com/facebookresearch/detectron2@main#subdirectory=projects/DensePose
\ No newline at end of file
+fvcore==0.1.5.post20221221
+cloudpickle==3.0.0
+omegaconf==2.3.0
+pycocotools==2.0.8
+av==12.3.0
+inplace-abn==1.1.0
+gradio==4.41.0
\ No newline at end of file