[UNet2DConditionModel] add gradient checkpointing (#461)
* add grad ckpt to downsample blocks
* make it work
* don't pass gradient_checkpointing to upsample block
* add tests for UNet2DConditionModel
* add test_gradient_checkpointing
* add gradient_checkpointing for up and down blocks
* add functions to enable and disable grad ckpt
* remove the forward argument
* better naming
* make supports_gradient_checkpointing private
parent 534512bedb
commit e7120bae95
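For orientation before the diff: a minimal usage sketch of the API this commit adds (not part of the commit itself; the constructor arguments and tensor shapes are borrowed from the new UNet2DConditionModel tests further down).

import torch
from diffusers import UNet2DConditionModel

model = UNet2DConditionModel(
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    cross_attention_dim=32,
    attention_head_dim=8,
    in_channels=4,
    out_channels=4,
    layers_per_block=2,
    sample_size=32,
)

model.enable_gradient_checkpointing()   # trade extra compute for lower activation memory
assert model.is_gradient_checkpointing  # the flag is now set on the down/up blocks

model.train()  # checkpointing is only used while the model is in training mode
sample = torch.randn(1, 4, 32, 32)
timestep = torch.tensor([10])
encoder_hidden_states = torch.randn(1, 4, 32)
out = model(sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample
out.sum().backward()  # checkpointed blocks recompute their activations here

model.disable_gradient_checkpointing()  # restore the default behaviour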
@@ -15,6 +15,7 @@
 # limitations under the License.

 import os
+from functools import partial
 from typing import Callable, List, Optional, Tuple, Union

 import torch
@@ -121,10 +122,42 @@ class ModelMixin(torch.nn.Module):
     """

     config_name = CONFIG_NAME
     _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
+    _supports_gradient_checkpointing = False

     def __init__(self):
         super().__init__()

+    @property
+    def is_gradient_checkpointing(self) -> bool:
+        """
+        Whether gradient checkpointing is activated for this model or not.
+
+        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+        activations".
+        """
+        return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
+
+    def enable_gradient_checkpointing(self):
+        """
+        Activates gradient checkpointing for the current model.
+
+        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+        activations".
+        """
+        if not self._supports_gradient_checkpointing:
+            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+        self.apply(partial(self._set_gradient_checkpointing, value=True))
+
+    def disable_gradient_checkpointing(self):
+        """
+        Deactivates gradient checkpointing for the current model.
+
+        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+        activations".
+        """
+        if self._supports_gradient_checkpointing:
+            self.apply(partial(self._set_gradient_checkpointing, value=False))
+
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
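The three helpers above lean on torch.nn.Module.apply, which visits the module itself and every submodule recursively, with functools.partial used to bind the value argument. A standalone toy sketch of that mechanism (illustrative only, not code from this commit):

from functools import partial

import torch.nn as nn


class Block(nn.Module):
    # Stands in for CrossAttnDownBlock2D and friends: it just carries the flag.
    def __init__(self):
        super().__init__()
        self.gradient_checkpointing = False


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = nn.ModuleList([Block(), Block()])

    def _set_gradient_checkpointing(self, module, value=False):
        # Only modules that actually support the feature get the flag flipped.
        if isinstance(module, Block):
            module.gradient_checkpointing = value

    def enable_gradient_checkpointing(self):
        # apply() calls the bound partial once per (sub)module.
        self.apply(partial(self._set_gradient_checkpointing, value=True))


model = ToyModel()
model.enable_gradient_checkpointing()
print([b.gradient_checkpointing for b in model.blocks])  # [True, True]

With this pattern, is_gradient_checkpointing only has to scan self.modules() for any submodule whose gradient_checkpointing attribute is set.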
@@ -3,12 +3,21 @@ from typing import Optional, Tuple, Union

 import torch
 import torch.nn as nn
+import torch.utils.checkpoint

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..modeling_utils import ModelMixin
 from ..utils import BaseOutput
 from .embeddings import TimestepEmbedding, Timesteps
-from .unet_blocks import UNetMidBlock2DCrossAttn, get_down_block, get_up_block
+from .unet_blocks import (
+    CrossAttnDownBlock2D,
+    CrossAttnUpBlock2D,
+    DownBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UpBlock2D,
+    get_down_block,
+    get_up_block,
+)


 @dataclass
@@ -54,6 +63,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
         attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
     """

+    _supports_gradient_checkpointing = True
+
     @register_to_config
     def __init__(
         self,
@@ -188,6 +199,10 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
             if hasattr(block, "attentions") and block.attentions is not None:
                 block.set_attention_slice(slice_size)

+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
+            module.gradient_checkpointing = value
+
     def forward(
         self,
         sample: torch.FloatTensor,
@@ -234,7 +249,9 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin):
         for downsample_block in self.down_blocks:
             if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
                 sample, res_samples = downsample_block(
-                    hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
                 )
             else:
                 sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
@@ -527,6 +527,8 @@ class CrossAttnDownBlock2D(nn.Module):
         else:
             self.downsamplers = None

+        self.gradient_checkpointing = False
+
     def set_attention_slice(self, slice_size):
         if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
             raise ValueError(
@@ -546,8 +548,22 @@ class CrossAttnDownBlock2D(nn.Module):
         output_states = ()

         for resnet, attn in zip(self.resnets, self.attentions):
-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn), hidden_states, encoder_hidden_states
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(hidden_states, context=encoder_hidden_states)
+
             output_states += (hidden_states,)

         if self.downsamplers is not None:
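The branch above is the core of the feature: torch.utils.checkpoint.checkpoint(fn, *args) runs fn without storing intermediate activations and reruns it during the backward pass, trading memory for an extra forward computation. The create_custom_forward closure exists because checkpoint expects a plain callable taking positional tensor arguments, which lets the same wrapper serve both the resnet and the attention module; the branch is only taken when self.training and self.gradient_checkpointing are both true, and the same pattern repeats in DownBlock2D, CrossAttnUpBlock2D and UpBlock2D below. A minimal, self-contained sketch of the pattern (illustrative, not part of the diff):

import torch
import torch.nn as nn
import torch.utils.checkpoint


def create_custom_forward(module):
    # checkpoint() wants a function, not a module; the closure forwards the call.
    def custom_forward(*inputs):
        return module(*inputs)

    return custom_forward


layer = nn.Linear(16, 16)
x = torch.randn(2, 16, requires_grad=True)

# The activations inside `layer` are not kept; they are recomputed on backward.
y = torch.utils.checkpoint.checkpoint(create_custom_forward(layer), x)
y.sum().backward()
print(x.grad.shape)  # torch.Size([2, 16])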
@@ -609,11 +625,24 @@ class DownBlock2D(nn.Module):
         else:
             self.downsamplers = None

+        self.gradient_checkpointing = False
+
     def forward(self, hidden_states, temb=None):
         output_states = ()

         for resnet in self.resnets:
-            hidden_states = resnet(hidden_states, temb)
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = resnet(hidden_states, temb)
+
             output_states += (hidden_states,)

         if self.downsamplers is not None:
@@ -1072,6 +1101,8 @@ class CrossAttnUpBlock2D(nn.Module):
         else:
             self.upsamplers = None

+        self.gradient_checkpointing = False
+
     def set_attention_slice(self, slice_size):
         if slice_size is not None and self.attn_num_head_channels % slice_size != 0:
             raise ValueError(
@@ -1087,15 +1118,36 @@ class CrossAttnUpBlock2D(nn.Module):
         for attn in self.attentions:
             attn._set_attention_slice(slice_size)

-    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, encoder_hidden_states=None):
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+    ):
         for resnet, attn in zip(self.resnets, self.attentions):
             # pop res hidden states
             res_hidden_states = res_hidden_states_tuple[-1]
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
             hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

-            hidden_states = resnet(hidden_states, temb)
-            hidden_states = attn(hidden_states, context=encoder_hidden_states)
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn), hidden_states, encoder_hidden_states
+                )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(hidden_states, context=encoder_hidden_states)

         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
@@ -1150,6 +1202,8 @@ class UpBlock2D(nn.Module):
         else:
             self.upsamplers = None

+        self.gradient_checkpointing = False
+
     def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
         for resnet in self.resnets:
             # pop res hidden states
@@ -1157,7 +1211,17 @@ class UpBlock2D(nn.Module):
             res_hidden_states_tuple = res_hidden_states_tuple[:-1]
             hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

-            hidden_states = resnet(hidden_states, temb)
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = resnet(hidden_states, temb)

         if self.upsamplers is not None:
             for upsampler in self.upsamplers:
@@ -478,7 +478,7 @@ class LDMBertEncoderLayer(nn.Module):
 class LDMBertPreTrainedModel(PreTrainedModel):
     config_class = LDMBertConfig
     base_model_prefix = "model"
-    supports_gradient_checkpointing = True
+    _supports_gradient_checkpointing = True
     _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]

     def _init_weights(self, module):
@@ -246,3 +246,21 @@ class ModelTesterMixin:
         outputs_tuple = model(**inputs_dict, return_dict=False)

         recursive_check(outputs_tuple, outputs_dict)
+
+    def test_enable_disable_gradient_checkpointing(self):
+        if not self.model_class._supports_gradient_checkpointing:
+            return  # Skip test if model does not support gradient checkpointing
+
+        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
+
+        # at init model should have gradient checkpointing disabled
+        model = self.model_class(**init_dict)
+        self.assertFalse(model.is_gradient_checkpointing)
+
+        # check enable works
+        model.enable_gradient_checkpointing()
+        self.assertTrue(model.is_gradient_checkpointing)
+
+        # check disable works
+        model.disable_gradient_checkpointing()
+        self.assertFalse(model.is_gradient_checkpointing)
@@ -18,7 +18,7 @@ import unittest

 import torch

-from diffusers import UNet2DModel
+from diffusers import UNet2DConditionModel, UNet2DModel
 from diffusers.testing_utils import floats_tensor, slow, torch_device

 from .test_modeling_common import ModelTesterMixin
@@ -159,6 +159,82 @@ class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase):
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3))


+class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = UNet2DConditionModel
+
+    @property
+    def dummy_input(self):
+        batch_size = 4
+        num_channels = 4
+        sizes = (32, 32)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+    @property
+    def input_shape(self):
+        return (4, 32, 32)
+
+    @property
+    def output_shape(self):
+        return (4, 32, 32)
+
+    def prepare_init_args_and_inputs_for_common(self):
+        init_dict = {
+            "block_out_channels": (32, 64),
+            "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
+            "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
+            "cross_attention_dim": 32,
+            "attention_head_dim": 8,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 2,
+            "sample_size": 32,
+        }
+        inputs_dict = self.dummy_input
+        return init_dict, inputs_dict
+
+    def test_gradient_checkpointing(self):
+        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        model = self.model_class(**init_dict)
+        model.to(torch_device)
+
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+        out.sum().backward()
+
+        # now we save the output and parameter gradients that we will use for comparison purposes with
+        # the non-checkpointed run.
+        output_not_checkpointed = out.data.clone()
+        grad_not_checkpointed = {}
+        for name, param in model.named_parameters():
+            grad_not_checkpointed[name] = param.grad.data.clone()
+
+        model.enable_gradient_checkpointing()
+        out = model(**inputs_dict).sample
+        # run the backwards pass on the model. For backwards pass, for simplicity purpose,
+        # we won't calculate the loss and rather backprop on out.sum()
+        model.zero_grad()
+        out.sum().backward()
+
+        # now we save the output and parameter gradients that we will use for comparison purposes with
+        # the non-checkpointed run.
+        output_checkpointed = out.data.clone()
+        grad_checkpointed = {}
+        for name, param in model.named_parameters():
+            grad_checkpointed[name] = param.grad.data.clone()
+
+        # compare the output and parameters gradients
+        self.assertTrue((output_checkpointed == output_not_checkpointed).all())
+        for name in grad_checkpointed:
+            self.assertTrue(torch.allclose(grad_checkpointed[name], grad_not_checkpointed[name], atol=5e-5))
+
+
 # TODO(Patrick) - Re-add this test after having cleaned up LDM
 # def test_output_pretrained_spatial_transformer(self):
 # model = UNetLDMModel.from_pretrained("fusing/unet-ldm-dummy-spatial")
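The new test above only asserts correctness: outputs must match exactly and gradients must agree to atol=5e-5 between the plain and the checkpointed run. The practical benefit is lower peak activation memory during training, paid for with one extra forward pass per checkpointed block. A rough way to observe the effect on a CUDA machine (a sketch assuming init_dict and inputs_dict are built as in UNet2DConditionModelTests above; actual numbers depend on the configuration):

import torch


def peak_training_memory_mib(model, inputs):
    torch.cuda.reset_peak_memory_stats()
    out = model(**inputs).sample
    out.sum().backward()
    model.zero_grad()
    return torch.cuda.max_memory_allocated() / 2**20


# model = UNet2DConditionModel(**init_dict).to("cuda").train()
# inputs = {k: v.to("cuda") for k, v in inputs_dict.items()}
# print("baseline     :", peak_training_memory_mib(model, inputs))
# model.enable_gradient_checkpointing()
# print("checkpointed :", peak_training_memory_mib(model, inputs))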