stable-diffusion-webui/modules/sd_hijack_unet.py

import torch
from packaging import version
from einops import repeat
import math

from modules import devices
from modules.sd_hijack_utils import CondFunc


class TorchHijackForUnet:
    """
    This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;
    this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64

    Every attribute other than ``cat`` is forwarded to the real ``torch`` module.
    """

    def __getattr__(self, item):
        """Forward attributes that are not defined on this class to ``torch``.

        Python calls ``__getattr__`` only after normal attribute lookup has failed,
        so ``cat`` (a regular method of this class) never reaches this hook and
        needs no special case here.

        :param item: name of the attribute being looked up.
        :return: ``getattr(torch, item)``.
        :raises AttributeError: if ``torch`` has no attribute called ``item``.
        """
        if hasattr(torch, item):
            return getattr(torch, item)

        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'")

    def cat(self, tensors, *args, **kwargs):
        """Concatenate like ``torch.cat``, reconciling the trailing size of a tensor pair.

        When exactly two tensors are given and their last two dimensions differ,
        the first one is resampled (nearest neighbour) to the last-two-dimension
        size of the second. Any other number of tensors is handed to ``torch.cat``
        untouched.

        :param tensors: sequence of tensors to concatenate.
        :param args: extra positional arguments for ``torch.cat``.
        :param kwargs: extra keyword arguments for ``torch.cat``.
        :return: the result of ``torch.cat``.
        """
        if len(tensors) == 2:
            a, b = tensors
            if a.shape[-2:] != b.shape[-2:]:
                a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")

            tensors = (a, b)

        return torch.cat(tensors, *args, **kwargs)


# Module-level instance of the hijack. Presumably swapped in for the `th` (torch) alias
# used by the UNet code elsewhere — TODO confirm against the code that installs it.
th = TorchHijackForUnet()


# Below are monkey patches to enable upcasting a float16 UNet for float32 sampling
def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
    """Cast every UNet input to ``devices.dtype_unet`` before delegating to ``orig_func``.

    Tensors found in a dict ``cond`` (directly, or inside list values) are cast and
    written back into that same dict, so the caller's dict is updated in place.
    The call runs under ``devices.autocast()``; its result is converted with
    ``.float()`` when ``devices.unet_needs_upcast`` is set and returned as is otherwise.
    """
    if isinstance(cond, dict):
        for key, value in cond.items():
            if isinstance(value, list):
                cond[key] = [
                    entry.to(devices.dtype_unet) if isinstance(entry, torch.Tensor) else entry
                    for entry in value
                ]
            elif isinstance(value, torch.Tensor):
                cond[key] = value.to(devices.dtype_unet)
            # anything else stays in the dict unchanged

    with devices.autocast():
        output = orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs)
        return output.float() if devices.unet_needs_upcast else output


# Monkey patch to create timestep embed tensor on device, avoiding a block.
def timestep_embedding(_, timesteps, dim, max_period=10000, repeat_only=False):
    """
    Build sinusoidal timestep embeddings, creating the frequency table on ``timesteps.device``.

    :param _: placeholder for the hijacked original function; it is never called.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: when true, skip the sinusoids and repeat each timestep ``dim`` times.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, 'b -> b d', d=dim)

    half = dim // 2
    positions = torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device)
    freqs = torch.exp(-math.log(max_period) * positions / half)
    phases = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(phases), torch.sin(phases)], dim=-1)
    if dim % 2:
        # odd dim: the cos/sin halves only give dim - 1 columns, so append one zero column
        zero_column = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_column], dim=-1)
    return embedding


# Monkey patch to SpatialTransformer removing unnecessary contiguous calls.
# Prevents a lot of unnecessary aten::copy_ calls
def spatial_transformer_forward(_, self, x: torch.Tensor, context=None):
    """Replacement forward pass for SpatialTransformer built on permute/reshape/view.

    :param _: placeholder for the hijacked original forward; it is never called.
    :param self: the transformer module; ``norm``, ``proj_in``, ``proj_out``,
                 ``use_linear`` and ``transformer_blocks`` are read from it.
    :param x: input tensor, unpacked as ``b, c, h, w``.
    :param context: either one context (or None) that every block receives, or a
                    list holding one context per transformer block.
    :return: the transformed tensor added to the untouched input.
    """
    # note: if no context is given, cross-attention defaults to self-attention
    if not isinstance(context, list):
        context = [context]
    # A lone context serves every block. Indexing it per block would raise
    # IndexError as soon as the module has more than one transformer block.
    shared_context = len(context) == 1
    b, c, h, w = x.shape
    x_in = x
    x = self.norm(x)
    if not self.use_linear:
        x = self.proj_in(x)
    # (b, c, h, w) -> (b, h * w, c) for the transformer blocks
    x = x.permute(0, 2, 3, 1).reshape(b, h * w, c)
    if self.use_linear:
        x = self.proj_in(x)
    for i, block in enumerate(self.transformer_blocks):
        x = block(x, context=context[0] if shared_context else context[i])
    if self.use_linear:
        x = self.proj_out(x)
    # (b, h * w, c) -> (b, c, h, w), undoing the flattening above
    x = x.view(b, h, w, c).permute(0, 3, 1, 2)
    if not self.use_linear:
        x = self.proj_out(x)
    return x + x_in


class GELUHijack(torch.nn.GELU, torch.nn.Module):
    """GELU whose forward pass runs in float32 while ``devices.unet_needs_upcast`` is set."""

    def __init__(self, *args, **kwargs):
        """Pass every argument straight through to ``torch.nn.GELU.__init__``."""
        torch.nn.GELU.__init__(self, *args, **kwargs)

    def forward(self, x):
        """Apply GELU; when upcasting, compute on ``.float()`` values and cast back to ``devices.dtype_unet``."""
        if not devices.unet_needs_upcast:
            return torch.nn.GELU.forward(self, x)

        upcast_module = self.float()
        activated = torch.nn.GELU.forward(upcast_module, x.float())
        return activated.to(devices.dtype_unet)


ddpm_edit_hijack = None


def hijack_ddpm_edit():
    """Register the first-stage and apply_model hijacks for ddpm_edit's LatentDiffusion, once.

    The value returned by the ``apply_model`` registration is stored in the
    module-level ``ddpm_edit_hijack``; while that value is truthy, later calls
    return without registering anything again.
    """
    global ddpm_edit_hijack
    if ddpm_edit_hijack:
        return

    # first_stage_sub / first_stage_cond are module globals looked up when this function runs
    CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)
    CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
    ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model)


# Condition callable shared by the registrations below: it ignores its arguments and
# returns the current value of devices.unet_needs_upcast each time it is called.
# NOTE(review): presumably CondFunc applies the substitute (2nd argument) only when the
# condition (3rd argument) returns true — confirm in modules.sd_hijack_utils.
unet_needs_upcast = lambda *args, **kwargs: devices.unet_needs_upcast
# apply_model as substitute for ldm's LatentDiffusion.apply_model, with the upcast condition.
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
# These two substitutes are registered without a condition argument.
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding)
CondFunc('ldm.modules.attention.SpatialTransformer.forward', spatial_transformer_forward)
# With the upcast condition: cast the embedding to float32 when the timesteps are int64,
# otherwise to devices.dtype_unet.
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)

# The registrations in this branch only happen on torch <= 1.13.2 or when CUDA is available.
if version.parse(torch.__version__) <= version.parse("1.13.2") or torch.cuda.is_available():
    # Call GroupNorm32.forward on the module after .float(), with the upcast condition.
    CondFunc('ldm.modules.diffusionmodules.util.GroupNorm32.forward', lambda orig_func, self, *args, **kwargs: orig_func(self.float(), *args, **kwargs), unet_needs_upcast)
    # Call GEGLU.forward on .float() module and input, then cast the result to devices.dtype_unet.
    CondFunc('ldm.modules.attention.GEGLU.forward', lambda orig_func, self, x: orig_func(self.float(), x.float()).to(devices.dtype_unet), unet_needs_upcast)
    # dict.update() returns None, so `... and False or orig_func(...)` always falls through to
    # the orig_func call; the expression only serves to put act_layer=GELUHijack into kwargs first.
    # The condition lambda holds when kwargs has no (or a None) act_layer, or it equals torch.nn.GELU.
    CondFunc('open_clip.transformer.ResidualAttentionBlock.__init__', lambda orig_func, *args, **kwargs: kwargs.update({'act_layer': GELUHijack}) and False or orig_func(*args, **kwargs), lambda _, *args, **kwargs: kwargs.get('act_layer') is None or kwargs['act_layer'] == torch.nn.GELU)

# Condition: devices.unet_needs_upcast is set and self.model.diffusion_model.dtype is float16.
first_stage_cond = lambda _, self, *args, **kwargs: devices.unet_needs_upcast and self.model.diffusion_model.dtype == torch.float16
# Substitute: cast the input x to devices.dtype_vae, then call the original with it.
first_stage_sub = lambda orig_func, self, x, **kwargs: orig_func(self, x.to(devices.dtype_vae), **kwargs)
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
# Substitute converts the result of get_first_stage_encoding with .float().
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding', lambda orig_func, *args, **kwargs: orig_func(*args, **kwargs).float(), first_stage_cond)

# apply_model registered without a condition argument, for the ldm model and the sgm wrapper.
# NOTE(review): ldm's LatentDiffusion.apply_model is also registered earlier in this module
# with the unet_needs_upcast condition, so the same target is registered twice — confirm
# that the layering is intended.
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model)
CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model)


def timestep_embedding_cast_result(orig_func, timesteps, *args, **kwargs):
    """Call ``orig_func`` and cast the embedding it returns.

    The result becomes float32 when ``devices.unet_needs_upcast`` is set and the
    timesteps are int64; in every other case it is cast to ``devices.dtype_unet``.
    """
    wants_float32 = devices.unet_needs_upcast and timesteps.dtype == torch.int64
    target_dtype = torch.float32 if wants_float32 else devices.dtype_unet
    embedding = orig_func(timesteps, *args, **kwargs)
    return embedding.to(dtype=target_dtype)


# Register the dtype-casting wrapper for timestep_embedding in both the ldm and the sgm
# openaimodel modules; no condition argument is passed.
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
do not replace entire unet for the resolution hack 2022-12-09 23:14:30 -07:00			`import torch`
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00			`from packaging import version`
Patch timestep embedding to create tensor on-device 2024-05-17 10:12:57 -06:00			`from einops import repeat`
			`import math`
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00
			`from modules import devices`
			`from modules.sd_hijack_utils import CondFunc`
do not replace entire unet for the resolution hack 2022-12-09 23:14:30 -07:00

			`class TorchHijackForUnet:`
			`"""`
			`This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;`
Fix various typos 2022-12-14 19:01:32 -07:00			`this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64`
do not replace entire unet for the resolution hack 2022-12-09 23:14:30 -07:00			`"""`

			`def __getattr__(self, item):`
			`if item == 'cat':`
			`return self.cat`

			`if hasattr(torch, item):`
			`return getattr(torch, item)`

Fix up string formatting/concatenation to f-strings where feasible 2023-05-09 13:17:58 -06:00			`raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'")`
do not replace entire unet for the resolution hack 2022-12-09 23:14:30 -07:00
			`def cat(self, tensors, *args, **kwargs):`
			`if len(tensors) == 2:`
			`a, b = tensors`
			`if a.shape[-2:] != b.shape[-2:]:`
			`a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")`

			`tensors = (a, b)`

			`return torch.cat(tensors, *args, **kwargs)`


			`th = TorchHijackForUnet()`
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00

			`# Below are monkey patches to enable upcasting a float16 UNet for float32 sampling`
			`def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`"""Always make sure inputs to unet are in correct dtype."""`
fix for unet hijack breaking the train tab 2023-01-25 10:11:01 -07:00			`if isinstance(cond, dict):`
			`for y in cond.keys():`
Add support for `--upcast-sampling` with SD XL 2023-07-17 21:39:38 -06:00			`if isinstance(cond[y], list):`
			`cond[y] = [x.to(devices.dtype_unet) if isinstance(x, torch.Tensor) else x for x in cond[y]]`
			`else:`
			`cond[y] = cond[y].to(devices.dtype_unet) if isinstance(cond[y], torch.Tensor) else cond[y]`
fix for unet hijack breaking the train tab 2023-01-25 10:11:01 -07:00
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00			`with devices.autocast():`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`result = orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs)`
			`if devices.unet_needs_upcast:`
			`return result.float()`
			`else:`
			`return result`
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00
Apply hijacks in ddpm_edit for upcast sampling To avoid import errors, ddpm_edit hijacks are done after an instruct pix2pix model is loaded. 2023-02-06 22:05:54 -07:00
Patch timestep embedding to create tensor on-device 2024-05-17 10:12:57 -06:00			`# Monkey patch to create timestep embed tensor on device, avoiding a block.`
			`def timestep_embedding(_, timesteps, dim, max_period=10000, repeat_only=False):`
			`"""`
			`Create sinusoidal timestep embeddings.`
			`:param timesteps: a 1-D Tensor of N indices, one per batch element.`
			`These may be fractional.`
			`:param dim: the dimension of the output.`
			`:param max_period: controls the minimum frequency of the embeddings.`
			`:return: an [N x dim] Tensor of positional embeddings.`
			`"""`
			`if not repeat_only:`
			`half = dim // 2`
			`freqs = torch.exp(`
			`-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half`
			`)`
			`args = timesteps[:, None].float() * freqs[None]`
			`embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)`
			`if dim % 2:`
			`embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)`
			`else:`
			`embedding = repeat(timesteps, 'b -> b d', d=dim)`
			`return embedding`


Add transformer forward patch 2024-05-17 11:14:26 -06:00			`# Monkey patch to SpatialTransformer removing unnecessary contiguous calls.`
			`# Prevents a lot of unnecessary aten::copy_ calls`
			`def spatial_transformer_forward(_, self, x: torch.Tensor, context=None):`
			`# note: if no context is given, cross-attention defaults to self-attention`
			`if not isinstance(context, list):`
			`context = [context]`
			`b, c, h, w = x.shape`
			`x_in = x`
			`x = self.norm(x)`
			`if not self.use_linear:`
			`x = self.proj_in(x)`
			`x = x.permute(0, 2, 3, 1).reshape(b, h * w, c)`
			`if self.use_linear:`
			`x = self.proj_in(x)`
			`for i, block in enumerate(self.transformer_blocks):`
			`x = block(x, context=context[i])`
			`if self.use_linear:`
			`x = self.proj_out(x)`
			`x = x.view(b, h, w, c).permute(0, 3, 1, 2)`
			`if not self.use_linear:`
			`x = self.proj_out(x)`
			`return x + x_in`


Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00			`class GELUHijack(torch.nn.GELU, torch.nn.Module):`
			`def __init__(self, *args, **kwargs):`
			`torch.nn.GELU.__init__(self, *args, **kwargs)`
			`def forward(self, x):`
			`if devices.unet_needs_upcast:`
			`return torch.nn.GELU.forward(self.float(), x.float()).to(devices.dtype_unet)`
			`else:`
			`return torch.nn.GELU.forward(self, x)`

Apply hijacks in ddpm_edit for upcast sampling To avoid import errors, ddpm_edit hijacks are done after an instruct pix2pix model is loaded. 2023-02-06 22:05:54 -07:00
			`ddpm_edit_hijack = None`
			`def hijack_ddpm_edit():`
			`global ddpm_edit_hijack`
			`if not ddpm_edit_hijack:`
			`CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)`
			`CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model)`
Apply hijacks in ddpm_edit for upcast sampling To avoid import errors, ddpm_edit hijacks are done after an instruct pix2pix model is loaded. 2023-02-06 22:05:54 -07:00

Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00			`unet_needs_upcast = lambda *args, **kwargs: devices.unet_needs_upcast`
			`CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)`
Merge branch 'dev' into patch-4 2024-06-08 01:35:09 -06:00			`CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding)`
			`CondFunc('ldm.modules.attention.SpatialTransformer.forward', spatial_transformer_forward)`
Refactor conditional casting, fix upscalers 2023-01-27 08:19:43 -07:00			`CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00
Update sd_hijack_unet.py 2023-03-24 06:25:42 -06:00			`if version.parse(torch.__version__) <= version.parse("1.13.2") or torch.cuda.is_available():`
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00			`CondFunc('ldm.modules.diffusionmodules.util.GroupNorm32.forward', lambda orig_func, self, *args, **kwargs: orig_func(self.float(), *args, **kwargs), unet_needs_upcast)`
			`CondFunc('ldm.modules.attention.GEGLU.forward', lambda orig_func, self, x: orig_func(self.float(), x.float()).to(devices.dtype_unet), unet_needs_upcast)`
			`CondFunc('open_clip.transformer.ResidualAttentionBlock.__init__', lambda orig_func, *args, **kwargs: kwargs.update({'act_layer': GELUHijack}) and False or orig_func(*args, **kwargs), lambda _, *args, **kwargs: kwargs.get('act_layer') is None or kwargs['act_layer'] == torch.nn.GELU)`
Refactor conditional casting, fix upscalers 2023-01-27 08:19:43 -07:00
			`first_stage_cond = lambda _, self, *args, **kwargs: devices.unet_needs_upcast and self.model.diffusion_model.dtype == torch.float16`
			`first_stage_sub = lambda orig_func, self, x, **kwargs: orig_func(self, x.to(devices.dtype_vae), **kwargs)`
			`CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)`
			`CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)`
			`CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding', lambda orig_func, *args, **kwargs: orig_func(*args, **kwargs).float(), first_stage_cond)`
Add support for `--upcast-sampling` with SD XL 2023-07-17 21:39:38 -06:00
Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model)`
			`CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model)`


			`def timestep_embedding_cast_result(orig_func, timesteps, *args, **kwargs):`
			`if devices.unet_needs_upcast and timesteps.dtype == torch.int64:`
			`dtype = torch.float32`
			`else:`
			`dtype = devices.dtype_unet`
			`return orig_func(timesteps, *args, **kwargs).to(dtype=dtype)`


			`CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)`
			`CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)`