File size: 3,584 Bytes
38dbec8
64fccd8
38dbec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64fccd8
38dbec8
 
 
 
 
 
 
 
 
64fccd8
38dbec8
 
64fccd8
 
 
 
38dbec8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from dataclasses import dataclass, field
from typing import Any, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from jaxtyping import Float
from torch import Tensor

from spar3d.models.illumination.reni.env_map import RENIEnvMap
from spar3d.models.utils import BaseModule


def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
    """Convert a 6D rotation representation to a 3x3 rotation matrix.

    The first three components of ``d6`` define the (unnormalized) first
    column of the matrix; the last three are Gram-Schmidt-orthogonalized
    against it to form the second column, and the third column is their
    cross product, yielding a proper rotation.

    Args:
        d6: Tensor of shape (..., 6).

    Returns:
        Tensor of shape (..., 3, 3) whose columns are the orthonormal basis.
    """
    assert d6.shape[-1] == 6, "Input tensor must have shape (..., 6)"

    a1 = d6[..., :3]
    a2 = d6[..., 3:]

    col_x = F.normalize(a1, dim=-1)

    # Project a2 onto col_x and subtract that component (Gram-Schmidt).
    # The clamp and the extra 1e-10 both guard against a degenerate
    # near-zero first column.
    dot = torch.sum(col_x * a2, dim=-1, keepdim=True)
    sq_norm = torch.clamp(torch.sum(col_x**2, dim=-1, keepdim=True), min=1e-8)
    projection = (dot / (sq_norm + 1e-10)) * col_x

    col_y = F.normalize(a2 - projection, dim=-1)
    col_z = torch.cross(col_x, col_y, dim=-1)

    return torch.stack((col_x, col_y, col_z), dim=-1)


class ReniLatentCodeEstimator(BaseModule):
    """Estimate RENI illumination parameters from triplane features.

    A small strided conv stack encodes the flattened triplane, the result is
    spatially pooled, and three linear heads predict the RENI latent codes,
    a 6D rotation (converted to a matrix), and a log-scale.
    """

    @dataclass
    class Config(BaseModule.Config):
        # Feature channels per triplane plane; the encoder input is 3x this.
        triplane_features: int = 40

        # Conv encoder depth, width, and nonlinearity.
        n_layers: int = 5
        hidden_features: int = 512
        activation: str = "relu"

        # Spatial pooling applied after the conv stack: "mean" or "max".
        pool: str = "mean"

        # Forwarded verbatim to RENIEnvMap.
        reni_env_config: dict = field(default_factory=dict)

    cfg: Config

    def configure(self):
        # Build the conv encoder: each layer halves spatial resolution
        # (stride 2, no padding) and maps to hidden_features channels.
        layers = []
        cur_features = self.cfg.triplane_features * 3
        for _ in range(self.cfg.n_layers):
            layers.append(
                nn.Conv2d(
                    cur_features,
                    self.cfg.hidden_features,
                    kernel_size=3,
                    padding=0,
                    stride=2,
                )
            )
            layers.append(self.make_activation(self.cfg.activation))

            cur_features = self.cfg.hidden_features

        self.layers = nn.Sequential(*layers)

        self.reni_env_map = RENIEnvMap(self.cfg.reni_env_config)
        self.latent_dim = self.reni_env_map.field.latent_dim

        # Latent head: wide init so predicted latents start with real spread.
        self.fc_latents = nn.Linear(self.cfg.hidden_features, self.latent_dim * 3)
        nn.init.normal_(self.fc_latents.weight, mean=0.0, std=0.3)

        # Rotation head: near-zero init so rotations start near identity.
        self.fc_rotations = nn.Linear(self.cfg.hidden_features, 6)
        nn.init.constant_(self.fc_rotations.bias, 0.0)
        nn.init.normal_(
            self.fc_rotations.weight, mean=0.0, std=0.01
        )  # Small variance here

        # Scale head: near-zero init for a stable initial scale.
        self.fc_scale = nn.Linear(self.cfg.hidden_features, 1)
        nn.init.constant_(self.fc_scale.bias, 0.0)
        nn.init.normal_(self.fc_scale.weight, mean=0.0, std=0.01)  # Small variance here

    def make_activation(self, activation):
        """Return the activation module named by ``activation`` ("relu"/"silu")."""
        if activation == "relu":
            return nn.ReLU(inplace=True)
        elif activation == "silu":
            return nn.SiLU(inplace=True)
        else:
            raise NotImplementedError

    def _pool(self, x: torch.Tensor) -> torch.Tensor:
        """Spatially pool (B, C, H, W) -> (B, C) per cfg.pool.

        Previously forward() hardcoded mean pooling and ignored cfg.pool;
        this honors the config while keeping the default ("mean") identical.
        """
        if self.cfg.pool == "mean":
            return x.mean(dim=[-2, -1])
        elif self.cfg.pool == "max":
            return x.amax(dim=(-2, -1))
        else:
            raise NotImplementedError(f"Unknown pool: {self.cfg.pool}")

    def forward(
        self,
        triplane: Float[Tensor, "B 3 F Ht Wt"],
        rotation: Optional[Float[Tensor, "B 3 3"]] = None,
    ) -> dict[str, Any]:
        """Predict an environment map from triplane features.

        Args:
            triplane: Per-plane feature maps, flattened to (B, 3*F, Ht, Wt)
                before encoding.
            rotation: Optional extra rotation composed (right-multiplied)
                with the predicted one.

        Returns:
            Dict with key "illumination" holding the RENI env-map RGB.
        """
        x = self.layers(
            triplane.reshape(
                triplane.shape[0], -1, triplane.shape[-2], triplane.shape[-1]
            )
        )
        x = self._pool(x)

        latents = self.fc_latents(x).reshape(-1, self.latent_dim, 3)
        rotations = rotation_6d_to_matrix(self.fc_rotations(x))
        scale = self.fc_scale(x)

        if rotation is not None:
            rotations = rotations @ rotation.to(dtype=rotations.dtype)

        env_map = self.reni_env_map(latents, rotations, scale)

        return {"illumination": env_map["rgb"]}