Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

__init__.py +5 -0
config.json +9 -0
config.py +32 -0
pfsq.py +234 -0
plpq.py +196 -0
wavelet.py +167 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .plpq import PLPQ
+from .pfsq import PFSQ
+from .config import PLPQConfig
+from .wavelet import WaveletTransform

config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_name_or_path": "StanfordNeuroAILab/PLPQ",
+  "architectures": ["PLPQ"],
+  "auto_map": {
+    "AutoConfig": "config.PLPQConfig",
+    "AutoModel": "plpq.PLPQ"
+  },
+  "model_type": "PLPQ"
+}

config.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from typing import Tuple, List
+from transformers import PretrainedConfig
+class PLPQConfig(PretrainedConfig):
+    """Configuration for the PLPQ model.
+    Every constructor argument is stored on the instance under the same name
+    so the model can read it back and the base class can serialize it.
+    Args:
+        image_size: Pair of ints describing the image size.
+            NOTE(review): not read by the model code in this commit.
+        patch_size: Patch size; used as the wavelet patch size or as the
+            kernel size and stride of the input convolution.
+        dropout: Dropout rate. NOTE(review): not read by the model code in
+            this commit.
+        vocab_size: Vocabulary size. NOTE(review): not read by the model
+            code in this commit.
+        levels: Quantization levels handed to the quantizer.
+        num_quantizers: Number of codebooks handed to the quantizer.
+        num_in_channels: Number of channels of the input image.
+        num_out_channels: Number of channels of the decoded image.
+        use_wavelets: Whether the model uses the wavelet input/output
+            projections instead of a strided patch convolution.
+        encoder_blocks: One list per encoder block; the first entry names the
+            block type and the remaining entries are its constructor args.
+        decoder_blocks: Same layout as `encoder_blocks`, for the decoder.
+        **kwargs: Forwarded unchanged to `PretrainedConfig.__init__`.
+    """
+    model_type: str = "PLPQ"
+    def __init__(self,
+        image_size: Tuple[int, int],
+        patch_size: int,
+        dropout: float,
+        vocab_size: int,
+        levels: List[int],
+        num_quantizers: int,
+        num_in_channels: int,
+        num_out_channels: int,
+        use_wavelets: bool,
+        encoder_blocks: List[List],
+        decoder_blocks: List[List],
+        **kwargs
+    ):
+        # Store on `self`; a bare `name = name` only rebinds the local.
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.dropout = dropout
+        self.vocab_size = vocab_size
+        self.levels = levels
+        self.num_quantizers = num_quantizers
+        self.num_in_channels = num_in_channels
+        self.num_out_channels = num_out_channels
+        self.use_wavelets = use_wavelets
+        self.encoder_blocks = encoder_blocks
+        self.decoder_blocks = decoder_blocks
+        # NOTE(review): all arguments are required, so loading from a
+        # config.json that omits them will fail -- confirm intended usage.
+        super().__init__(**kwargs)

pfsq.py ADDED Viewed

	@@ -0,0 +1,234 @@

+"""
+Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
+Code adapted from Jax version in Appendix A.1
+"""
+from __future__ import annotations
+from functools import wraps, partial
+from contextlib import nullcontext
+from typing import List, Tuple
+import torch
+import torch.nn as nn
+from torch.nn import Module
+from torch import Tensor, int32
+from torch.cuda.amp import autocast
+from einops import rearrange, pack, unpack
+# helper functions
+def exists(v):
+    """Return True when `v` is anything other than None."""
+    return not (v is None)
+def default(*args):
+    """Return the first argument that is not None, or None when there is none."""
+    return next((candidate for candidate in args if exists(candidate)), None)
+def maybe(fn):
+    """Wrap `fn` so that a None first argument is returned untouched instead of being passed on."""
+    @wraps(fn)
+    def inner(x, *args, **kwargs):
+        return fn(x, *args, **kwargs) if exists(x) else x
+    return inner
+def pack_one(t, pattern):
+    """Pack the single tensor `t` with einops `pack` and return whatever `pack` returns."""
+    singleton = [t]
+    return pack(singleton, pattern)
+def unpack_one(t, ps, pattern):
+    """Undo `pack_one`: unpack `t` with einops `unpack` and return the first piece."""
+    pieces = unpack(t, ps, pattern)
+    return pieces[0]
+# tensor helpers
+def round_ste(z: Tensor) -> Tensor:
+    """Round `z` elementwise; the rounding offset is detached so gradients flow as if it were identity."""
+    rounding_offset = (z.round() - z).detach()
+    return z + rounding_offset
+# main class
+class PFSQ(Module):
+    """Finite scalar quantizer with optional multiple codebooks and projections.
+    Each feature is bounded with a tanh, rounded to an integer with a
+    straight-through gradient and renormalized to [-1, 1]. Linear input and
+    output projections are created only when `dim` differs from
+    `len(levels) * num_codebooks`.
+    Args:
+        levels: Number of quantization levels for each codebook dimension.
+        dim: Input/output feature size; defaults to
+            `len(levels) * num_codebooks`.
+        num_codebooks: Number of codebooks the projected features are split
+            into.
+        keep_num_codebooks_dim: Keep the codebook axis on returned indices;
+            defaults to `num_codebooks > 1` and must be true in that case.
+        scale: Stored on the module; not read by any method of this class.
+        allowed_dtypes: Dtypes quantized as-is; other dtypes are cast to
+            float32 when `force_quantization_f32` is set.
+        channel_first: Treat inputs as (batch, dim, ...) even when they have
+            fewer than four dimensions.
+        projection_has_bias: Whether the projections carry a bias term.
+        return_indices: Whether `forward` also computes codebook indices.
+        force_quantization_f32: Run the quantization step with autocast off.
+    """
+    def __init__(
+        self,
+        levels: List[int],
+        dim: int | None = None,
+        num_codebooks = 1,
+        keep_num_codebooks_dim: bool | None = None,
+        scale: float | None = None,
+        allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
+        channel_first: bool = False,
+        projection_has_bias: bool = True,
+        return_indices = True,
+        force_quantization_f32 = True
+    ):
+        super().__init__()
+        _levels = torch.tensor(levels, dtype=int32)
+        self.register_buffer("_levels", _levels, persistent = False)
+        # Mixed-radix place values: index = sum(level_index[i] * _basis[i]).
+        _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
+        self.register_buffer("_basis", _basis, persistent = False)
+        self.scale = scale
+        codebook_dim = len(levels)
+        self.codebook_dim = codebook_dim
+        effective_codebook_dim = codebook_dim * num_codebooks
+        self.num_codebooks = num_codebooks
+        self.effective_codebook_dim = effective_codebook_dim
+        keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
+        assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
+        self.keep_num_codebooks_dim = keep_num_codebooks_dim
+        self.dim = default(dim, len(_levels) * num_codebooks)
+        self.channel_first = channel_first
+        has_projections = self.dim != effective_codebook_dim
+        self.project_in = nn.Linear(self.dim, effective_codebook_dim, bias = projection_has_bias) if has_projections else nn.Identity()
+        self.project_out = nn.Linear(effective_codebook_dim, self.dim, bias = projection_has_bias) if has_projections else nn.Identity()
+        self.has_projections = has_projections
+        self.return_indices = return_indices
+        if return_indices:
+            # The codebook is implicit: enumerate every index once and cache its code.
+            self.codebook_size = self._levels.prod().item()
+            implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
+            self.register_buffer("implicit_codebook", implicit_codebook, persistent = False)
+        self.allowed_dtypes = allowed_dtypes
+        self.force_quantization_f32 = force_quantization_f32
+    def bound(self, z, eps: float = 1e-3):
+        """ Bound `z`, an array of shape (..., d). """
+        half_l = (self._levels - 1) * (1 + eps) / 2
+        # Even level counts have no integer at the centre, so shift by half a step.
+        offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
+        shift = (offset / half_l).atanh()
+        return (z + shift).tanh() * half_l - offset
+    def quantize(self, z):
+        """ Quantizes z, returns quantized zhat, same shape as z. """
+        quantized = round_ste(self.bound(z))
+        half_width = self._levels // 2 # Renormalize to [-1, 1].
+        return quantized / half_width
+    def _scale_and_shift(self, zhat_normalized):
+        """ Map normalized codes to non-negative per-dimension level indices. """
+        half_width = self._levels // 2
+        return (zhat_normalized * half_width) + half_width
+    def _scale_and_shift_inverse(self, zhat):
+        """ Map per-dimension level indices back to normalized codes. """
+        half_width = self._levels // 2
+        return (zhat - half_width) / half_width
+    def _indices_to_codes(self, indices):
+        """ Convert flat codebook indices to normalized codes (no projection). """
+        level_indices = self.indices_to_level_indices(indices)
+        codes = self._scale_and_shift_inverse(level_indices)
+        return codes
+    def codes_to_indices(self, zhat):
+        """ Converts a `code` to an index in the codebook. """
+        assert zhat.shape[-1] == self.codebook_dim
+        zhat = self._scale_and_shift(zhat)
+        # Round before the integer cast: the cast truncates, so a sum that is
+        # a hair below an integer due to float error would land one index low.
+        return (zhat * self._basis).sum(dim=-1).round().to(int32)
+    def indices_to_level_indices(self, indices):
+        """ Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings """
+        indices = rearrange(indices, '... -> ... 1')
+        codes_non_centered = (indices // self._basis) % self._levels
+        return codes_non_centered
+    def indices_to_codes(self, indices, return_first=False):
+        """ Inverse of `codes_to_indices`.
+        When the last dimension of `indices` has size 1 the codes are returned
+        without the output projection; otherwise they are projected out and,
+        for image/video or channel-first inputs, moved to channel-first layout.
+        `return_first` is accepted but not used.
+        """
+        assert exists(indices)
+        n_codes = indices.shape[-1]
+        is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
+        codes = self._indices_to_codes(indices)
+        if self.keep_num_codebooks_dim:
+            codes = rearrange(codes, '... c d -> ... (c d)')
+        if n_codes == 1:
+            return codes
+        codes = self.project_out(codes)
+        if is_img_or_video or self.channel_first:
+            codes = rearrange(codes, 'b ... d -> b d ...')
+        return codes
+    @autocast(enabled = False)
+    def forward(self, z):
+        """
+        Quantize `z` and return `(out, first_codes, indices)`.
+        `out` is the projected quantized tensor in the input layout,
+        `first_codes` are the codes of the first codebook, and `indices` are
+        the codebook indices (None when `return_indices` is off).
+        einstein notation
+        b - batch
+        n - sequence (or flattened spatial dimensions)
+        d - feature dimension
+        c - number of codebook dim
+        """
+        is_img_or_video = z.ndim >= 4
+        need_move_channel_last = is_img_or_video or self.channel_first
+        # standardize image or video into (batch, seq, dimension)
+        if need_move_channel_last:
+            z = rearrange(z, 'b d ... -> b ... d')
+            z, ps = pack_one(z, 'b * d')
+        assert z.shape[-1] == self.dim, f'expected dimension of {self.dim} but found dimension of {z.shape[-1]}'
+        z = self.project_in(z)
+        z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
+        # whether to force quantization step to be full precision or not
+        force_f32 = self.force_quantization_f32
+        quantization_context = partial(autocast, enabled = False) if force_f32 else nullcontext
+        with quantization_context():
+            orig_dtype = z.dtype
+            if force_f32 and orig_dtype not in self.allowed_dtypes:
+                z = z.float()
+            codes = self.quantize(z)
+            # returning indices could be optional
+            indices = None
+            if self.return_indices:
+                indices = self.codes_to_indices(codes)
+            first_codes = codes[:, :, 0, :] # first codebook
+            codes = rearrange(codes, 'b n c d -> b n (c d)')
+            codes = codes.type(orig_dtype)
+            first_codes = first_codes.type(orig_dtype)
+        # project out
+        out = self.project_out(codes)
+        # reconstitute image or video dimensions
+        if need_move_channel_last:
+            out = unpack_one(out, ps, 'b * d')
+            out = rearrange(out, 'b ... d -> b d ...')
+            indices = maybe(unpack_one)(indices, ps, 'b * c')
+        if not self.keep_num_codebooks_dim and self.return_indices:
+            indices = maybe(rearrange)(indices, '... 1 -> ...')
+        # return quantized output and indices
+        return out, first_codes, indices

plpq.py ADDED Viewed

	@@ -0,0 +1,196 @@

+from transformers import PreTrainedModel
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+from .wavelet import WaveletTransform
+from .pfsq import PFSQ
+from .config import PLPQConfig
+class PLPQ(PreTrainedModel):
+    """
+    Pyramidal Local Patch Quantizer
+    Encoder -> PFSQ quantizer -> decoder. The encoder and decoder are built
+    from the block lists in the config; a 1x1 `coarse_decoder` maps the codes
+    of a single codebook directly to output channels.
+    """
+    config_class = PLPQConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        # Read through __dict__ so a config without the attribute falls back to False.
+        if config.__dict__.get('use_wavelets', False):
+            wavelets = WaveletTransform(patch_size=config.patch_size)
+            wavelet_channels = wavelets.num_transformed_channels(config.num_in_channels)
+            in_proj = nn.Sequential(
+                wavelets,
+                nn.Conv2d(
+                    wavelet_channels, config.encoder_blocks[0][1],
+                    kernel_size=1, stride=1     # keep fully local
+                )
+            )
+            # NOTE(review): the output channel count is derived from
+            # num_in_channels, not num_out_channels -- confirm they always match.
+            out_proj = nn.Sequential(
+                nn.Conv2d(
+                    config.decoder_blocks[-1][2], wavelet_channels,
+                    kernel_size=3, stride=1, padding=1
+                ),
+                WaveletTransform(patch_size=config.patch_size, inverse=True)
+            )
+        else:
+            # Non-overlapping patch embedding: kernel and stride both equal patch_size.
+            in_proj = nn.Conv2d(
+                config.num_in_channels, config.encoder_blocks[0][1],
+                kernel_size=config.patch_size, stride=config.patch_size
+            )
+            out_proj = nn.Conv2d(
+                config.decoder_blocks[-1][2], config.num_out_channels,
+                kernel_size=3, stride=1, padding=1
+            )
+        # Each block spec is [type_name, *constructor_args]; any type other
+        # than "ResBlock" becomes a Downsample here.
+        self.encoder = nn.Sequential(
+            in_proj,
+            nn.SiLU(),
+            *[
+                PatchResidualConvBlock(*block_params[1:]) if block_params[0] == "ResBlock" else Downsample(*block_params[1:])
+                for block_params in config.encoder_blocks
+            ]
+        )
+        # Pyramidal Quantizer
+        self.quantizer = PFSQ(
+            levels = config.levels,                      # number of levels for each codebook
+            num_codebooks = config.num_quantizers,   # number of quantizers
+            dim = config.encoder_blocks[-1][2],          # this is the input feature dimension, defaults to log2(codebook_size) if not defined
+        )
+        # coarse decoder output -> 32x32 supervision
+        self.coarse_decoder = nn.Conv2d(len(config.levels), config.num_out_channels, kernel_size=1, stride=1)
+        # Same spec layout as the encoder; any type other than "ResBlock"
+        # becomes an Upsample here.
+        self.decoder = nn.Sequential(
+            *[
+                PatchResidualConvBlock(*block_params[1:]) if block_params[0] == "ResBlock" else Upsample(*block_params[1:])
+                for block_params in config.decoder_blocks
+            ],
+            out_proj
+        )
+    def get_num_params(self) -> int:
+        """
+        Return the number of parameters in the model.
+        """
+        return sum(p.numel() for p in self.parameters())
+    @torch.no_grad()
+    def quantize(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Encode the input and return the quantizer's code indices.
+        Parameters:
+            x (torch.Tensor): The input tensor. Size b, c, h, w
+        Returns:
+            torch.Tensor: The third output of `self.quantizer` (the code
+            indices), computed on the encoder output flattened to
+            (b, h * w, c). The spatial grid is not restored.
+        """
+        # encode the input
+        z = self.encoder(x).permute(0, 2, 3, 1).contiguous()
+        # reshape the input
+        b, h, w, c = z.shape
+        z = z.view(b, h * w, -1)
+        # quantize the input
+        quantized, coarse_quantized, all_codes = self.quantizer(z)
+        return all_codes
+    @torch.no_grad()
+    def decode(self, indices: torch.Tensor) -> torch.Tensor:
+        """
+        Decode code indices into a prediction.
+        Parameters:
+            indices: torch.Tensor whose last dimension holds the codes of each
+                position; presumably (b, t, n_codes) -- TODO confirm against callers.
+        Returns:
+            torch.Tensor: output of `coarse_decoder` when the last dimension
+            of `indices` has size 1, otherwise output of the full `decoder`.
+        The token axis is reshaped to an h x w grid with
+        h = w = int(sqrt(t)), so a square grid is assumed.
+        """
+        ncodes = indices.shape[-1]
+        emb = self.quantizer.indices_to_codes(indices).squeeze(-1)
+        # reshape [b t c] -> [b c h w]
+        b, h, w = emb.size(0), int(math.sqrt(emb.size(1))), int(math.sqrt(emb.size(1)))
+        emb = emb.permute(0, 2, 1).view(b, -1, h, w).contiguous()
+        if ncodes == 1:
+            pred = self.coarse_decoder(emb)
+            return pred
+        # full decoder: full image prediction
+        pred = self.decoder(emb)
+        return pred
+class LayerNorm(nn.Module):
+    """Layer normalization over the last dimension with a learnable scale and an optional bias."""
+    def __init__(self, ndim, bias):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(ndim))
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(ndim))
+        else:
+            self.bias = None
+    def forward(self, input):
+        normalized_shape = self.weight.shape
+        return F.layer_norm(input, normalized_shape, self.weight, self.bias, eps=1e-5)
+class PatchResidualConvBlock(nn.Module):
+    """Residual block: per-position LayerNorm over channels, two SiLU-activated convolutions, skip connection.
+    The skip connection adds the block input to the result, so the two
+    convolutions must preserve the input's shape.
+    """
+    def __init__(self, in_dim, out_dim, hidden_dim, kernel_size, stride, padding, dorpout=0.1) -> None:
+        super().__init__()
+        # Both convolutions share the same kernel geometry.
+        conv_kwargs = dict(kernel_size=kernel_size, stride=stride, padding=padding)
+        self.nonlinearity = nn.SiLU()
+        self.ln1 = LayerNorm(in_dim, bias=True)
+        self.dropout = nn.Dropout(dorpout)
+        self.conv1 = nn.Conv2d(in_dim, hidden_dim, **conv_kwargs)
+        self.conv2 = nn.Conv2d(hidden_dim, out_dim, **conv_kwargs)
+    def forward(self, x):
+        b, c, h, w = x.shape
+        # Normalize the channel vector of every spatial position independently.
+        channels_last = x.permute(0, 2, 3, 1)
+        flat = channels_last.reshape(b * h * w, c)
+        normed = self.ln1(flat).reshape(b, h, w, c)
+        hidden = normed.permute(0, 3, 1, 2).contiguous()
+        hidden = self.dropout(self.nonlinearity(self.conv1(hidden)))
+        hidden = self.nonlinearity(self.conv2(hidden))
+        return hidden + x
+class Upsample(nn.Module):
+    """Double the spatial size with nearest-neighbour interpolation, then apply a 3x3 convolution."""
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x):
+        upsampled = F.interpolate(x, scale_factor=2.0, mode="nearest")
+        return self.conv(upsampled)
+class Downsample(nn.Module):
+    """Zero-pad one pixel on the right and bottom, then apply a 3x3 stride-2 convolution."""
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        # The convolution itself is unpadded; the one-sided padding is applied in forward().
+        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x):
+        # F.pad order for the last two dims is (left, right, top, bottom).
+        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)

wavelet.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import math
+class WaveletTransform(nn.Module):
+    """Repeated 2D wavelet transform (or its inverse) using a two-tap Haar filter.
+    One transform level quadruples the channel count and, for even spatial
+    sizes, halves height and width. `log2(patch_size)` levels are applied, so
+    in patchwise mode a `patch_size` x `patch_size` patch is reduced to 1 x 1.
+    When `inverse` is set, `forward` runs `invert` instead of `transform`.
+    """
+    def __init__(self, patch_size: int, inverse: bool = False):
+        '''
+        ### Parameters:
+            `patch_size`: Side length of the patches; the number of levels is
+                `int(log2(patch_size))`.
+            `inverse`: Make `forward` apply the inverse transform.
+        `patchwise` in forward/invert makes *no difference*; the result
+        is numerically identical either way. It's still enabled by default
+        in case we pass in a non-square image, which may not be equivalent.
+        `reshape` is pretty much useless.
+        TODO: Clean up these options.
+        '''
+        super().__init__()
+        self.patch_size = patch_size
+        self.inverse = inverse
+        # From https://github.com/NVIDIA/Cosmos-Tokenizer/blob/3584ae752ce8ebdbe06a420bf60d7513c0e878cc/cosmos_tokenizer/modules/patching.py#L33
+        # Plain tensors, not buffers: the filters built from them are moved to
+        # the input's device and dtype on every dwt/idwt call.
+        self.haar = torch.tensor([0.7071067811865476, 0.7071067811865476])
+        self.arange = torch.arange(len(self.haar))
+        # NOTE(review): int() truncates, so a patch_size that is not a power
+        # of two silently yields fewer levels -- confirm callers never do that.
+        self.steps = int(math.log2(self.patch_size))
+    def num_transformed_channels(self, in_channels: int = 3) -> int:
+        '''
+        Returns the number of channels to expect in the transformed image
+        given the channels in the input image.
+        '''
+        return in_channels * (4 ** self.steps)
+    def forward(self, x: torch.Tensor, patchwise: bool = True, reshape: bool = False) -> torch.Tensor:
+        '''
+        Apply `invert` when the module was built with `inverse=True`,
+        otherwise `transform`; `reshape` is passed to `invert` as
+        `from_reshaped`.
+        '''
+        if self.inverse:
+            return self.invert(x, patchwise=patchwise, from_reshaped=reshape)
+        else:
+            return self.transform(x, patchwise=patchwise, reshape=reshape)
+    def transform(self, x: torch.Tensor, patchwise: bool = True, reshape: bool = False) -> torch.Tensor:
+        '''
+        ### Parameters:
+            `x`: ImageNet-normalized images with shape (B C H W)
+            `patchwise`: Whether to compute independently on patches
+            `reshape`: Reshape the results to match the input HxW
+        ### Returns:
+            If `reshape`, returns (B C H W)
+            otherwise, returns (B C*patch_size**2 H/patch_size W/patch_size)
+        '''
+        p = self.patch_size
+        if patchwise:
+            # Place patches into batch dimension
+            # (B C H W) -> (B*L C H/root(L), W/root(L))
+            b, c, h, w = x.shape
+            init_b = b
+            # (B C H W) -> (B C LH LW P P)
+            x = x.reshape(b, c, h//p, p, w//p, p).moveaxis(4,3)
+            # (B C LH LW P P) -> (B' C P P)
+            x = x.moveaxis(1,3).reshape(-1, c, p, p)
+        # Each level quadruples channels and halves the spatial size.
+        for _ in range(self.steps):
+            x = self.dwt(x)
+        if patchwise:
+            # Extract patches from batch dimension
+            # (B' C' 1 1) -> (B LH LW C') -> (B C' LH LW)
+            x = x.reshape(init_b, h//p, w//p, -1).moveaxis(3,1)
+        if reshape:
+            # (B C*patch_size**2 H/patch_size W/patch_size) -> (B C H W)
+            b, cp2, hdp, wdp = x.shape
+            c, h, w = cp2//(p**2), hdp*p, wdp*p
+            x = x.reshape(b, p, p, c, hdp, wdp)
+            x = x.moveaxis(3,1).moveaxis(3,4).reshape(b, c, h, w).contiguous()
+        return x
+    def invert(self, x: torch.Tensor, patchwise: bool = True, from_reshaped: bool = False) -> torch.Tensor:
+        '''
+        ### Parameters:
+            `x`: Wavelet-space input of either (B C H W) (when `from_reshaped=True`) or
+                (B C*patch_size**2 H/patch_size W/patch_size)
+            `patchwise`: Whether to compute independently on patches
+            `from_reshaped`: Determines the shape of `x`; should match the value of `reshape`
+                used when calling `forward`
+        ### Returns:
+            The tensor after `steps` applications of `idwt`; in patchwise
+            mode the patches are reassembled into (B C H W).
+        '''
+        p = self.patch_size
+        if from_reshaped:
+            # (B C H W) -> (B C*patch_size**2 H/patch_size W/patch_size)
+            b, c, h, w = x.shape
+            cp2, hdp, wdp = c*self.patch_size**2, h//self.patch_size, w//self.patch_size
+            x = x.reshape(b, c, self.patch_size, hdp, self.patch_size, wdp)
+            x = x.moveaxis(4,3).moveaxis(1,3).reshape(b, cp2, hdp, wdp)
+        if patchwise:
+            # Put patches into batch dimension
+            # (B C' LH LW) -> (B LH LW C') -> (B' C' 1 1)
+            init_b, lh, lw = x.shape[0], x.shape[2], x.shape[3]
+            x = x.moveaxis(1,3).reshape(-1, x.shape[1], 1, 1)
+        # Each level divides channels by four and doubles the spatial size.
+        for _ in range(self.steps):
+            x = self.idwt(x)
+        if patchwise:
+            # Extract patches from batch dimension and expand
+            # (B' C P P) -> (B C LH LW P P)
+            x = x.reshape(init_b, lh, lw, *x.shape[1:]).moveaxis(3,1)
+            # (B C LH LW P P) -> (B C H W)
+            x = x.moveaxis(3,4).reshape(*x.shape[:2], lh*p, lw*p)
+        return x
+    def dwt(self, x: torch.Tensor):
+        '''
+        One transform level: filter along the width, then along the height,
+        each with stride 2, and concatenate the four sub-bands on the channel
+        axis in the order (ll, lh, hl, hh), scaled by 0.5.
+        '''
+        dtype = x.dtype
+        h = self.haar
+        n = h.shape[0]
+        g = x.shape[1]
+        # One filter per input channel (grouped convolution): hl is the
+        # flipped filter, hh the same taps with alternating sign.
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self.arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(device=x.device, dtype=dtype)
+        hl = hl.to(device=x.device, dtype=dtype)
+        # With the two-tap filter this pads one reflected row/column at the end.
+        x = F.pad(x, pad=(n - 2, n - 1, n - 2, n - 1), mode='reflect').to(dtype)
+        # Width pass (kernel 1 x n, stride 2 along width).
+        xl = F.conv2d(x, hl.unsqueeze(2), groups=g, stride=(1, 2))
+        xh = F.conv2d(x, hh.unsqueeze(2), groups=g, stride=(1, 2))
+        # Height pass (kernel n x 1, stride 2 along height).
+        xll = F.conv2d(xl, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xlh = F.conv2d(xl, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        xhl = F.conv2d(xh, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xhh = F.conv2d(xh, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        return 0.5 * torch.cat([xll, xlh, xhl, xhh], dim=1)
+    def idwt(self, x: torch.Tensor):
+        '''
+        One inverse level: split the channels into the four sub-bands
+        (ll, lh, hl, hh), undo the height pass and then the width pass with
+        transposed convolutions, and scale the result by 2.0.
+        '''
+        dtype = x.dtype
+        h = self.haar
+        n = h.shape[0]
+        # The input carries four sub-bands per output channel.
+        g = x.shape[1] // 4
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self.arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(device=x.device, dtype=dtype)
+        hl = hl.to(device=x.device, dtype=dtype)
+        xll, xlh, xhl, xhh = torch.chunk(x.to(dtype), 4, dim=1)
+        # Inverse transform.
+        # Undo the height pass for each of the two width bands.
+        yl = torch.nn.functional.conv_transpose2d(
+            xll, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        yl += torch.nn.functional.conv_transpose2d(
+            xlh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        yh = torch.nn.functional.conv_transpose2d(
+            xhl, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        yh += torch.nn.functional.conv_transpose2d(
+            xhh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        # Undo the width pass.
+        y = torch.nn.functional.conv_transpose2d(
+            yl, hl.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2)
+        )
+        y += torch.nn.functional.conv_transpose2d(
+            yh, hh.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2)
+        )
+        return 2.0 * y