stable-diffusion-webui/modules/devices.py

import sys
import contextlib
from functools import lru_cache

import torch
from modules import errors, shared, npu_specific

if sys.platform == "darwin":
    from modules import mac_specific

if shared.cmd_opts.use_ipex:
    from modules import xpu_specific


def has_xpu() -> bool:
    """Tell whether the Intel XPU backend was requested and reports itself usable.

    ``xpu_specific`` is only looked at when ``shared.cmd_opts.use_ipex`` is
    truthy; this file imports that module under the same condition.
    """
    ipex_requested = shared.cmd_opts.use_ipex
    if not ipex_requested:
        # Hand back the falsy option value itself, exactly as `a and b` would.
        return ipex_requested
    return xpu_specific.has_xpu


def has_mps() -> bool:
    """Tell whether the MPS backend can be used.

    Always ``False`` off macOS; on macOS the answer is whatever
    ``mac_specific.has_mps`` holds. ``mac_specific`` is never touched on other
    platforms thanks to the short-circuit.
    """
    return sys.platform == "darwin" and mac_specific.has_mps


def cuda_no_autocast(device_id=None) -> bool:
    """Identify CUDA devices for which ``autocast()`` uses manual casting.

    Args:
        device_id: CUDA device index to inspect. ``None`` means "use the index
            returned by ``get_cuda_device_id()``".

    Returns:
        ``True`` only when the device reports compute capability ``(7, 5)``
        and its name starts with ``"NVIDIA GeForce GTX 16"``.
    """
    if device_id is None:
        device_id = get_cuda_device_id()

    # The device name is only queried once the capability already matches.
    if torch.cuda.get_device_capability(device_id) != (7, 5):
        return False

    return torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")


def get_cuda_device_id():
    """Return the index of the CUDA device the web UI should use.

    A purely numeric ``shared.cmd_opts.device_id`` is honoured as given,
    including ``"0"``. When the option is unset or is not a plain number, the
    device currently selected in torch is returned instead.

    Returns:
        int: a CUDA device index.
    """
    requested = shared.cmd_opts.device_id
    if requested is not None and requested.isdigit():
        # Return explicitly instead of using `value or fallback`: index 0 is
        # falsy, so an explicit request for device 0 used to be discarded in
        # favour of whatever device happened to be current.
        return int(requested)

    return torch.cuda.current_device()


def get_cuda_device_string():
    """Build the torch device string for CUDA.

    Returns ``"cuda:<device_id>"`` when ``shared.cmd_opts.device_id`` is set,
    and plain ``"cuda"`` otherwise.
    """
    selected = shared.cmd_opts.device_id
    return "cuda" if selected is None else f"cuda:{selected}"


def get_optimal_device_name():
    """Pick the name of the preferred compute device.

    Backends are probed in a fixed priority order -- CUDA, MPS, XPU, NPU --
    and the first one that is available wins. ``"cpu"`` is the fallback.
    """
    if torch.cuda.is_available():
        name = get_cuda_device_string()
    elif has_mps():
        name = "mps"
    elif has_xpu():
        name = xpu_specific.get_xpu_device_string()
    elif npu_specific.has_npu:
        name = npu_specific.get_npu_device_string()
    else:
        name = "cpu"

    return name


def get_optimal_device():
    """Return a ``torch.device`` for the name chosen by ``get_optimal_device_name()``."""
    device_name = get_optimal_device_name()
    return torch.device(device_name)


def get_device_for(task):
    """Choose the device on which ``task`` should run.

    The module-level ``cpu`` device is returned when ``task`` itself, or the
    catch-all entry ``"all"``, is listed in ``shared.cmd_opts.use_cpu``;
    otherwise the optimal device is used.
    """
    forced_to_cpu = task in shared.cmd_opts.use_cpu or "all" in shared.cmd_opts.use_cpu
    return cpu if forced_to_cpu else get_optimal_device()


def torch_gc():
    """Ask every available accelerator backend to release cached memory.

    Each backend (CUDA, MPS, XPU, NPU) is handled independently, so more than
    one branch may run in a single call.
    """
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        # Run the cleanup on the configured CUDA device rather than the default one.
        cuda_target = get_cuda_device_string()
        with torch.cuda.device(cuda_target):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

    if has_mps():
        mac_specific.torch_mps_gc()

    if has_xpu():
        xpu_specific.torch_xpu_gc()

    if npu_specific.has_npu:
        # The NPU device is (re)selected before collecting; see torch_npu_set_device().
        torch_npu_set_device()
        npu_specific.torch_npu_gc()


def torch_npu_set_device():
    """Select NPU device 0 when an NPU is present; do nothing otherwise."""
    if not npu_specific.has_npu:
        return

    # Workaround for a torch_npu bug -- revert once it is fixed upstream,
    # @see https://gitee.com/ascend/pytorch/issues/I8KECW?from=project-issue
    torch.npu.set_device(0)


def enable_tf32():
    """Allow TF32 for CUDA matmul and cuDNN; a no-op when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return

    # Turning on the cuDNN benchmark option seems to let a range of cards do fp16
    # that otherwise can't; it is only applied to devices matched by cuda_no_autocast().
    # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407
    if cuda_no_autocast():
        torch.backends.cudnn.benchmark = True

    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
        backend.allow_tf32 = True


# Apply the TF32 / cuDNN settings once, at import time.
# NOTE(review): errors.run presumably traps and reports failures under the given
# label instead of aborting the import -- confirm in modules.errors.
errors.run(enable_tf32, "Enabling TF32")

# Shared CPU device handle; get_device_for() returns it for tasks forced onto the CPU.
cpu: torch.device = torch.device("cpu")
# When set, autocast() uses bfloat16 CPU autocast (device == cpu) or manual_cast()
# (dtype_inference == float32) ahead of its other rules.
fp8: bool = False
# Force fp16 for all models in inference. No casting during inference.
# This flag is controlled by "--precision half" command line arg.
force_fp16: bool = False
# Per-task devices. They start as None and nothing in this file assigns them;
# presumably they are filled in during startup by other modules -- TODO confirm.
device: torch.device = None
device_interrogate: torch.device = None
device_gfpgan: torch.device = None
device_esrgan: torch.device = None
device_codeformer: torch.device = None
# Working dtypes, all defaulting to float16:
#   dtype           - consulted by autocast() and used by first_time_calculation()
#   dtype_vae       - not read in this file; presumably the VAE dtype -- TODO confirm
#   dtype_unet      - cast target of cond_cast_unet() when unet_needs_upcast is set
#   dtype_inference - dtype that manual-cast wrappers convert their outputs to
dtype: torch.dtype = torch.float16
dtype_vae: torch.dtype = torch.float16
dtype_unet: torch.dtype = torch.float16
dtype_inference: torch.dtype = torch.float16
# Switches on the conversions done by cond_cast_unet() / cond_cast_float().
unet_needs_upcast = False


def cond_cast_unet(input):
    """Conditionally convert ``input`` to the dtype the UNet should receive.

    ``force_fp16`` takes priority and always yields float16. Otherwise the
    tensor is cast to ``dtype_unet`` only when ``unet_needs_upcast`` is set,
    and is returned untouched when it is not.
    """
    if force_fp16:
        return input.to(torch.float16)

    if unet_needs_upcast:
        return input.to(dtype_unet)

    return input


def cond_cast_float(input):
    """Return ``input.float()`` when ``unet_needs_upcast`` is set, else ``input`` unchanged."""
    if unet_needs_upcast:
        return input.float()

    return input


# Not assigned or read anywhere else in this file.
# NOTE(review): presumably holds the NV-compatible random number generator set up
# by another module -- confirm against the code that assigns it.
nv_rng = None
# Module classes whose ``forward`` is temporarily replaced while the
# manual_cast() context manager is active.
patch_module_list = [
    torch.nn.Linear,
    torch.nn.Conv2d,
    torch.nn.MultiheadAttention,
    torch.nn.GroupNorm,
    torch.nn.LayerNorm,
]


def manual_cast_forward(target_dtype):
    """Build a replacement ``forward`` that runs a module in ``target_dtype``.

    The returned function is meant to be installed as ``forward`` on a module
    class whose original implementation has been stored as ``org_forward``
    (``manual_cast()`` does this).

    Args:
        target_dtype: dtype the module and its tensor inputs are converted to
            for the duration of the call.

    Returns:
        A ``forward_wrapper(self, *args, **kwargs)`` function. It returns what
        ``org_forward`` returns, with tensors (or tensors inside a tuple
        result) converted to the module-level ``dtype_inference`` whenever
        that differs from ``target_dtype``.
    """
    def forward_wrapper(self, *args, **kwargs):
        # Inputs are only converted when a positional tensor has the wrong
        # dtype; in that case every tensor in args and kwargs is converted.
        if any(
            isinstance(arg, torch.Tensor) and arg.dtype != target_dtype
            for arg in args
        ):
            args = [arg.to(target_dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]
            kwargs = {k: v.to(target_dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}

        # dtype of the first parameter that is not already in target_dtype;
        # equal to target_dtype when no parameter needs converting.
        org_dtype = next(
            (param.dtype for param in self.parameters() if param.dtype != target_dtype),
            target_dtype,
        )
        needs_module_cast = org_dtype != target_dtype

        if needs_module_cast:
            self.to(target_dtype)
        try:
            result = self.org_forward(*args, **kwargs)
        finally:
            # Restore the parameters even when the forward pass raises;
            # otherwise the module would be left in target_dtype for good.
            if needs_module_cast:
                self.to(org_dtype)

        if target_dtype != dtype_inference:
            if isinstance(result, tuple):
                result = tuple(
                    i.to(dtype_inference)
                    if isinstance(i, torch.Tensor)
                    else i
                    for i in result
                )
            elif isinstance(result, torch.Tensor):
                result = result.to(dtype_inference)
        return result
    return forward_wrapper


@contextlib.contextmanager
def manual_cast(target_dtype):
    """Context manager that swaps in dtype-casting ``forward`` methods.

    On entry, every class in ``patch_module_list`` that does not already carry
    an ``org_forward`` attribute gets its ``forward`` replaced by a wrapper
    from ``manual_cast_forward`` and its original kept as ``org_forward``.
    ``torch.nn.MultiheadAttention`` is always wrapped for float32; the other
    classes use ``target_dtype``.

    On exit the originals are put back, but only if this invocation patched
    something -- a nested use that found everything already patched leaves the
    outer patch in place.
    """
    patched_any = False
    for module_cls in patch_module_list:
        if not hasattr(module_cls, "org_forward"):
            patched_any = True
            saved_forward = module_cls.forward
            cast_dtype = torch.float32 if module_cls == torch.nn.MultiheadAttention else target_dtype
            module_cls.forward = manual_cast_forward(cast_dtype)
            module_cls.org_forward = saved_forward
    try:
        yield None
    finally:
        if patched_any:
            for module_cls in patch_module_list:
                if hasattr(module_cls, "org_forward"):
                    module_cls.forward = module_cls.org_forward
                    del module_cls.org_forward


def autocast(disable=False):
    """Return the context manager under which inference should run.

    Rules are checked in order and the first match decides:

    1. ``disable`` or ``force_fp16`` -> no-op context (with ``force_fp16`` all
       dtype conversion happens before inference, so nothing is cast here).
    2. ``fp8`` on the CPU device -> bfloat16 CPU autocast.
    3. ``fp8`` with float32 ``dtype_inference`` -> ``manual_cast(dtype)``.
    4. ``dtype`` or ``dtype_inference`` is float32 -> no-op context.
    5. XPU, MPS, or a device matched by ``cuda_no_autocast()`` ->
       ``manual_cast(dtype)``.
    6. Anything else -> ``torch.autocast("cuda")``.
    """
    if disable or force_fp16:
        return contextlib.nullcontext()

    if fp8:
        if device == cpu:
            return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
        if dtype_inference == torch.float32:
            return manual_cast(dtype)

    if dtype == torch.float32 or dtype_inference == torch.float32:
        return contextlib.nullcontext()

    if has_xpu() or has_mps() or cuda_no_autocast():
        return manual_cast(dtype)

    return torch.autocast("cuda")


def without_autocast(disable=False):
    """Return a context that switches CUDA autocast off, if there is anything to switch off.

    A no-op context is returned when autocast is not currently enabled or when
    ``disable`` is truthy; otherwise ``torch.autocast("cuda", enabled=False)``.
    """
    if not torch.is_autocast_enabled() or disable:
        return contextlib.nullcontext()

    return torch.autocast("cuda", enabled=False)


class NansException(Exception):
    """Raised by ``test_for_nans`` when a tensor consists entirely of NaNs."""


def test_for_nans(x, where):
    """Raise ``NansException`` when every element of ``x`` is NaN.

    Args:
        x: tensor to inspect.
        where: origin of the tensor; ``"unet"`` and ``"vae"`` select a more
            specific message with troubleshooting hints, any other value
            produces a generic message.

    Raises:
        NansException: if ``x`` is non-empty and all of its elements are NaN.
            Nothing is checked when ``shared.cmd_opts.disable_nan_check`` is
            set.
    """
    if shared.cmd_opts.disable_nan_check:
        return

    # torch.all() over zero elements is vacuously True, so an empty tensor has
    # to be excluded explicitly or it would be reported as "all NaNs".
    if x.numel() == 0 or not torch.all(torch.isnan(x)).item():
        return

    if where == "unet":
        message = "A tensor with all NaNs was produced in Unet."

        if not shared.cmd_opts.no_half:
            message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."

    elif where == "vae":
        message = "A tensor with all NaNs was produced in VAE."

        if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:
            message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
    else:
        message = "A tensor with all NaNs was produced."

    message += " Use --disable-nan-check commandline argument to disable this check."

    raise NansException(message)


@lru_cache
def first_time_calculation():
    """
    Warm up pytorch by pushing dummy data through a Linear and a Conv2d layer.

    The very first calculation with pytorch layers allocates about 700MB of memory
    and takes about 2.7 seconds, at least with NVidia. Thanks to the cache decorator
    the body only executes on the first call.
    """

    linear_input = torch.zeros((1, 1)).to(device, dtype)
    linear_layer = torch.nn.Linear(1, 1).to(device, dtype)
    linear_layer(linear_input)

    conv_input = torch.zeros((1, 1, 3, 3)).to(device, dtype)
    conv_layer = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
    conv_layer(conv_input)


def force_model_fp16():
    """
    Swap ``GroupNorm32`` in the ldm and sgm ``diffusionmodules.util`` modules for
    the plain ``torch.nn.GroupNorm``.

    ``GroupNorm32.forward`` in those packages forces its input to float32, which
    has to be avoided when ``force_fp16`` is enabled. Must only be called while
    ``force_fp16`` is set.
    """
    assert force_fp16
    import sgm.modules.diffusionmodules.util as sgm_util
    import ldm.modules.diffusionmodules.util as ldm_util
    for util_module in (sgm_util, ldm_util):
        util_module.GroupNorm32 = torch.nn.GroupNorm
    print("ldm/sgm GroupNorm32 replaced with normal torch.nn.GroupNorm due to `--precision half`.")
Refactor Mac specific code to a separate file Move most Mac related code to a separate file, don't even load it unless web UI is run under macOS. 2023-02-01 07:28:16 -07:00			`import sys`
send all three of GFPGAN's and codeformer's models to CPU memory instead of just one for #1283 2022-10-04 03:32:22 -06:00			`import contextlib`
run basic torch calculation at startup in parallel to reduce the performance impact of first generation 2023-05-21 12:55:14 -06:00			`from functools import lru_cache`

Modular device management 2022-09-10 23:11:27 -06:00			`import torch`
Add NPU Support 2024-01-27 02:21:32 -07:00			`from modules import errors, shared, npu_specific`
Refactor Mac specific code to a separate file Move most Mac related code to a separate file, don't even load it unless web UI is run under macOS. 2023-02-01 07:28:16 -07:00
			`if sys.platform == "darwin":`
			`from modules import mac_specific`
Allow TF32 in CUDA for increased performance #279 2022-09-12 07:34:13 -06:00
Disable ipex autocast due to its bad perf 2023-12-01 23:00:46 -07:00			`if shared.cmd_opts.use_ipex:`
			`from modules import xpu_specific`


			`def has_xpu() -> bool:`
			`return shared.cmd_opts.use_ipex and xpu_specific.has_xpu`

change formatting to match the main program in devices.py 2022-11-12 00:00:49 -07:00
Fix wrong mps selection below MasOS 12.3 2022-11-11 20:02:40 -07:00			`def has_mps() -> bool:`
Refactor Mac specific code to a separate file Move most Mac related code to a separate file, don't even load it unless web UI is run under macOS. 2023-02-01 07:28:16 -07:00			`if sys.platform != "darwin":`
Fix wrong mps selection below MasOS 12.3 2022-11-11 20:02:40 -07:00			`return False`
Refactor Mac specific code to a separate file Move most Mac related code to a separate file, don't even load it unless web UI is run under macOS. 2023-02-01 07:28:16 -07:00			`else:`
			`return mac_specific.has_mps`
CLIP interrogator 2022-09-11 09:48:36 -06:00
change formatting to match the main program in devices.py 2022-11-12 00:00:49 -07:00
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`def cuda_no_autocast(device_id=None) -> bool:`
			`if device_id is None:`
			`device_id = get_cuda_device_id()`
			`return (`
Use options instead of cmd_args 2023-11-19 00:50:06 -07:00			`torch.cuda.get_device_capability(device_id) == (7, 5)`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`and torch.cuda.get_device_name(device_id).startswith("NVIDIA GeForce GTX 16")`
			`)`


			`def get_cuda_device_id():`
			`return (`
Use options instead of cmd_args 2023-11-19 00:50:06 -07:00			`int(shared.cmd_opts.device_id)`
			`if shared.cmd_opts.device_id is not None and shared.cmd_opts.device_id.isdigit()`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`else 0`
			`) or torch.cuda.current_device()`


eliminate duplicated code from #5095 2022-11-27 03:08:54 -07:00			`def get_cuda_device_string():`
			`if shared.cmd_opts.device_id is not None:`
			`return f"cuda:{shared.cmd_opts.device_id}"`
remove parsing command line from devices.py 2022-10-22 05:04:14 -06:00
eliminate duplicated code from #5095 2022-11-27 03:08:54 -07:00			`return "cuda"`
remove parsing command line from devices.py 2022-10-22 05:04:14 -06:00
eliminate duplicated code from #5095 2022-11-27 03:08:54 -07:00
remove the need to place configs near models 2023-01-27 01:28:12 -07:00			`def get_optimal_device_name():`
eliminate duplicated code from #5095 2022-11-27 03:08:54 -07:00			`if torch.cuda.is_available():`
remove the need to place configs near models 2023-01-27 01:28:12 -07:00			`return get_cuda_device_string()`
CLIP interrogator 2022-09-11 09:48:36 -06:00
Fix wrong mps selection below MasOS 12.3 2022-11-11 20:02:40 -07:00			`if has_mps():`
remove the need to place configs near models 2023-01-27 01:28:12 -07:00			`return "mps"`

Disable ipex autocast due to its bad perf 2023-12-01 23:00:46 -07:00			`if has_xpu():`
Initial IPEX support 2023-11-09 20:06:26 -07:00			`return xpu_specific.get_xpu_device_string()`

Add NPU Support 2024-01-27 02:21:32 -07:00			`if npu_specific.has_npu:`
			`return npu_specific.get_npu_device_string()`

remove the need to place configs near models 2023-01-27 01:28:12 -07:00			`return "cpu"`
CLIP interrogator 2022-09-11 09:48:36 -06:00
remove the need to place configs near models 2023-01-27 01:28:12 -07:00
			`def get_optimal_device():`
			`return torch.device(get_optimal_device_name())`
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00

add built-in extension system add support for adding upscalers in extensions move LDSR, ScuNET and SwinIR to built-in extensions 2022-12-03 08:06:33 -07:00			`def get_device_for(task):`
Update devices.py fixes issue where "--use-cpu" all properly makes SD run on CPU but leaves ControlNet (and other extensions, I presume) pointed at GPU, causing a crash in ControlNet caused by a mismatch between devices between SD and CN https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/14097 2023-11-27 17:21:43 -07:00			`if task in shared.cmd_opts.use_cpu or "all" in shared.cmd_opts.use_cpu:`
add built-in extension system add support for adding upscalers in extensions move LDSR, ScuNET and SwinIR to built-in extensions 2022-12-03 08:06:33 -07:00			`return cpu`

			`return get_optimal_device()`


add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`def torch_gc():`
added torch.mps.empty_cache() to torch_gc() changed a bunch of places that use torch.cuda.empty_cache() to use torch_gc() instead 2023-07-08 08:13:18 -06:00
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`if torch.cuda.is_available():`
eliminate duplicated code from #5095 2022-11-27 03:08:54 -07:00			`with torch.cuda.device(get_cuda_device_string()):`
torch.cuda.empty_cache() defaults to cuda:0 device unless explicitly set otherwise first. Updating torch_gc() to use the device set by --device-id if specified to avoid OOM edge cases on multi-GPU systems. 2022-11-26 16:25:16 -07:00			`torch.cuda.empty_cache()`
			`torch.cuda.ipc_collect()`
Fix MPS cache cleanup Importing torch does not import torch.mps so the call failed. 2023-07-10 12:18:34 -06:00
			`if has_mps():`
			`mac_specific.torch_mps_gc()`
Allow TF32 in CUDA for increased performance #279 2022-09-12 07:34:13 -06:00
Disable ipex autocast due to its bad perf 2023-12-01 23:00:46 -07:00			`if has_xpu():`
			`xpu_specific.torch_xpu_gc()`

Add NPU Support 2024-01-27 02:21:32 -07:00			`if npu_specific.has_npu:`
Update 2024-01-30 19:46:53 -07:00			`torch_npu_set_device()`
Add NPU Support 2024-01-27 02:21:32 -07:00			`npu_specific.torch_npu_gc()`

Allow TF32 in CUDA for increased performance #279 2022-09-12 07:34:13 -06:00
Update 2024-01-30 19:46:53 -07:00			`def torch_npu_set_device():`
			`# Work around due to bug in torch_npu, revert me after fixed, @see https://gitee.com/ascend/pytorch/issues/I8KECW?from=project-issue`
			`if npu_specific.has_npu:`
			`torch.npu.set_device(0)`


Allow TF32 in CUDA for increased performance #279 2022-09-12 07:34:13 -06:00			`def enable_tf32():`
			`if torch.cuda.is_available():`
add comment for #4407 and remove seemingly unnecessary cudnn.enabled 2022-12-03 06:01:23 -07:00
			`# enabling benchmark option seems to enable a range of cards to do fp16 when they otherwise can't`
			`# see https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/4407`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`if cuda_no_autocast():`
terrible hack 2022-11-07 19:06:48 -07:00			`torch.backends.cudnn.benchmark = True`
fix #4407 breaking UI entirely for card other than ones related to the PR 2022-12-03 05:57:52 -07:00
Allow TF32 in CUDA for increased performance #279 2022-09-12 07:34:13 -06:00			`torch.backends.cuda.matmul.allow_tf32 = True`
			`torch.backends.cudnn.allow_tf32 = True`


			`errors.run(enable_tf32, "Enabling TF32")`
changes for #294 2022-09-12 11:09:32 -06:00
rework torchsde._brownian.brownian_interval replacement to use device.randn_local and respect the NV setting. 2023-08-02 22:18:55 -06:00			`cpu: torch.device = torch.device("cpu")`
Add CPU fp8 support Since norm layer need fp32, I only convert the linear operation layer(conv2d/linear) And TE have some pytorch function not support bf16 amp in CPU. I add a condition to indicate if the autocast is for unet. 2023-10-23 11:49:05 -06:00			`fp8: bool = False`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`# Force fp16 for all models in inference. No casting during inference.`
			`# This flag is controlled by "--precision half" command line arg.`
			`force_fp16: bool = False`
rework torchsde._brownian.brownian_interval replacement to use device.randn_local and respect the NV setting. 2023-08-02 22:18:55 -06:00			`device: torch.device = None`
			`device_interrogate: torch.device = None`
			`device_gfpgan: torch.device = None`
			`device_esrgan: torch.device = None`
			`device_codeformer: torch.device = None`
			`dtype: torch.dtype = torch.float16`
			`dtype_vae: torch.dtype = torch.float16`
			`dtype_unet: torch.dtype = torch.float16`
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`dtype_inference: torch.dtype = torch.float16`
Add option for float32 sampling with float16 UNet This also handles type casting so that ROCm and MPS torch devices work correctly without --no-half. One cast is required for deepbooru in deepbooru_model.py, some explicit casting is required for img2img and inpainting. depth_model can't be converted to float16 or it won't work correctly on some systems (it's known to have issues on MPS) so in sd_models.py model.depth_model is removed for model.half(). 2023-01-24 21:51:45 -07:00			`unet_needs_upcast = False`
changes for #294 2022-09-12 11:09:32 -06:00
change formatting to match the main program in devices.py 2022-11-12 00:00:49 -07:00
Refactor conditional casting, fix upscalers 2023-01-27 08:19:43 -07:00			`def cond_cast_unet(input):`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`if force_fp16:`
			`return input.to(torch.float16)`
Refactor conditional casting, fix upscalers 2023-01-27 08:19:43 -07:00			`return input.to(dtype_unet) if unet_needs_upcast else input`


			`def cond_cast_float(input):`
			`return input.float() if unet_needs_upcast else input`


add NV option for Random number generator source setting, which allows to generate same pictures on CPU/AMD/Mac as on NVidia videocards. 2023-08-02 15:00:23 -06:00			`nv_rng = None`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`patch_module_list = [`
			`torch.nn.Linear,`
			`torch.nn.Conv2d,`
			`torch.nn.MultiheadAttention,`
			`torch.nn.GroupNorm,`
			`torch.nn.LayerNorm,`
			`]`

Use options instead of cmd_args 2023-11-19 00:50:06 -07:00
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`def manual_cast_forward(target_dtype):`
			`def forward_wrapper(self, *args, **kwargs):`
Fix bugs when arg dtype doesn't match 2024-01-09 07:39:39 -07:00			`if any(`
			`isinstance(arg, torch.Tensor) and arg.dtype != target_dtype`
			`for arg in args`
			`):`
			`args = [arg.to(target_dtype) if isinstance(arg, torch.Tensor) else arg for arg in args]`
			`kwargs = {k: v.to(target_dtype) if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}`
linting and debugs 2024-01-09 07:53:58 -07:00
Revert "Try to reverse the dtype checking mechanism" This reverts commit d243e24f539d717b221992e894a5db5a321bf3cd. 2024-01-29 07:54:12 -07:00			`org_dtype = target_dtype`
			`for param in self.parameters():`
			`if param.dtype != target_dtype:`
			`org_dtype = param.dtype`
			`break`
Fix potential bugs 2024-01-29 07:27:53 -07:00
Fix bugs when arg dtype doesn't match 2024-01-09 07:39:39 -07:00			`if org_dtype != target_dtype:`
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`self.to(target_dtype)`
			`result = self.org_forward(*args, **kwargs)`
Fix bugs when arg dtype doesn't match 2024-01-09 07:39:39 -07:00			`if org_dtype != target_dtype:`
			`self.to(org_dtype)`
linting and debugs 2024-01-09 07:53:58 -07:00
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`if target_dtype != dtype_inference:`
Revert "Apply correct inference precision implementation" This reverts commit e00365962b17550a42235d1fbe2ad2c7cc4b8961. 2024-01-09 08:15:05 -07:00			`if isinstance(result, tuple):`
			`result = tuple(`
			`i.to(dtype_inference)`
			`if isinstance(i, torch.Tensor)`
			`else i`
			`for i in result`
			`)`
			`elif isinstance(result, torch.Tensor):`
			`result = result.to(dtype_inference)`
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`return result`
			`return forward_wrapper`
Use options instead of cmd_args 2023-11-19 00:50:06 -07:00

ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`@contextlib.contextmanager`
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`def manual_cast(target_dtype):`
Avoid early disable 2024-01-20 01:31:12 -07:00			`applied = False`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`for module_type in patch_module_list:`
Fix nested manual cast 2024-01-18 09:14:03 -07:00			`if hasattr(module_type, "org_forward"):`
			`continue`
Avoid early disable 2024-01-20 01:31:12 -07:00			`applied = True`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`org_forward = module_type.forward`
Fix potential bugs 2024-01-29 07:27:53 -07:00			`if module_type == torch.nn.MultiheadAttention:`
improve efficiency and support more device 2024-01-09 07:11:44 -07:00			`module_type.forward = manual_cast_forward(torch.float32)`
			`else:`
			`module_type.forward = manual_cast_forward(target_dtype)`
ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`module_type.org_forward = org_forward`
			`try:`
			`yield None`
			`finally:`
Avoid exceptions to be silenced 2024-01-20 01:33:59 -07:00			`if applied:`
			`for module_type in patch_module_list:`
			`if hasattr(module_type, "org_forward"):`
			`module_type.forward = module_type.org_forward`
			`delattr(module_type, "org_forward")`
add NV option for Random number generator source setting, which allows to generate same pictures on CPU/AMD/Mac as on NVidia videocards. 2023-08-02 15:00:23 -06:00

--no-half-vae 2022-10-10 07:11:14 -06:00			`def autocast(disable=False):`
			`if disable:`
			`return contextlib.nullcontext()`

Add --precision half cmd option 2024-05-16 17:50:06 -06:00			`if force_fp16:`
			`# No casting during inference if force_fp16 is enabled.`
			`# All tensor dtype conversion happens before inference.`
			`return contextlib.nullcontext()`

ManualCast for 10/16 series gpu 2023-10-28 01:24:26 -06:00			`if fp8 and device==cpu:`
Add CPU fp8 support Since norm layer need fp32, I only convert the linear operation layer(conv2d/linear) And TE have some pytorch function not support bf16 amp in CPU. I add a condition to indicate if the autocast is for unet. 2023-10-23 11:49:05 -06:00			`return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)`

Apply the correct behavior of precision='full' 2024-01-09 08:23:40 -07:00			`if fp8 and dtype_inference == torch.float32:`
			`return manual_cast(dtype)`

			`if dtype == torch.float32 or dtype_inference == torch.float32:`
send all three of GFPGAN's and codeformer's models to CPU memory instead of just one for #1283 2022-10-04 03:32:22 -06:00			`return contextlib.nullcontext()`

rearrange if-statements for cpu 2024-01-09 08:30:55 -07:00			`if has_xpu() or has_mps() or cuda_no_autocast():`
			`return manual_cast(dtype)`

send all three of GFPGAN's and codeformer's models to CPU memory instead of just one for #1283 2022-10-04 03:32:22 -06:00			`return torch.autocast("cuda")`
MPS Upscalers Fix Get ESRGAN, SCUNet, and SwinIR working correctly on MPS by ensuring memory is contiguous for tensor views before sending to MPS device. 2022-10-25 00:01:57 -06:00
change formatting to match the main program in devices.py 2022-11-12 00:00:49 -07:00
Add UI setting for upcasting attention to float32 Adds "Upcast cross attention layer to float32" option in Stable Diffusion settings. This allows for generating images using SD 2.1 models without --no-half or xFormers. In order to make upcasting cross attention layer optimizations possible it is necessary to indent several sections of code in sd_hijack_optimizations.py so that a context manager can be used to disable autocast. Also, even though Stable Diffusion (and Diffusers) only upcast q and k, unfortunately my findings were that most of the cross attention layer optimizations could not function unless v is upcast also. 2023-01-24 22:23:10 -07:00			`def without_autocast(disable=False):`
			`return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()`


Add a check and explanation for tensor with all NaNs. 2023-01-16 12:59:46 -07:00			`class NansException(Exception):`
			`pass`


			`def test_for_nans(x, where):`
disable the new NaN check for the CI 2023-01-17 01:04:56 -07:00			`if shared.cmd_opts.disable_nan_check:`
			`return`

Add a check and explanation for tensor with all NaNs. 2023-01-16 12:59:46 -07:00			`if not torch.all(torch.isnan(x)).item():`
			`return`

			`if where == "unet":`
			`message = "A tensor with all NaNs was produced in Unet."`

			`if not shared.cmd_opts.no_half:`
Add UI setting for upcasting attention to float32 Adds "Upcast cross attention layer to float32" option in Stable Diffusion settings. This allows for generating images using SD 2.1 models without --no-half or xFormers. In order to make upcasting cross attention layer optimizations possible it is necessary to indent several sections of code in sd_hijack_optimizations.py so that a context manager can be used to disable autocast. Also, even though Stable Diffusion (and Diffusers) only upcast q and k, unfortunately my findings were that most of the cross attention layer optimizations could not function unless v is upcast also. 2023-01-24 22:23:10 -07:00			`message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."`
Add a check and explanation for tensor with all NaNs. 2023-01-16 12:59:46 -07:00
			`elif where == "vae":`
			`message = "A tensor with all NaNs was produced in VAE."`

			`if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:`
			`message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."`
			`else:`
			`message = "A tensor with all NaNs was produced."`

clarify the option to disable NaN check. 2023-01-27 03:08:00 -07:00			`message += " Use --disable-nan-check commandline argument to disable this check."`

Add a check and explanation for tensor with all NaNs. 2023-01-16 12:59:46 -07:00			`raise NansException(message)`
run basic torch calculation at startup in parallel to reduce the performance impact of first generation 2023-05-21 12:55:14 -06:00

			`@lru_cache`
			`def first_time_calculation():`
			`"""`
			`just do any calculation with pytorch layers - the first time this is done it allocates about 700MB of memory and`
Fix various typos with crate-ci/typos 2024-03-03 23:37:23 -07:00			`spends about 2.7 seconds doing that, at least with NVidia.`
run basic torch calculation at startup in parallel to reduce the performance impact of first generation 2023-05-21 12:55:14 -06:00			`"""`

			`x = torch.zeros((1, 1)).to(device, dtype)`
			`linear = torch.nn.Linear(1, 1).to(device, dtype)`
			`linear(x)`

			`x = torch.zeros((1, 1, 3, 3)).to(device, dtype)`
			`conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)`
			`conv2d(x)`
Add --precision half cmd option 2024-05-16 17:50:06 -06:00

			`def force_model_fp16():`
			`"""`
			`ldm and sgm has modules.diffusionmodules.util.GroupNorm32.forward, which`
			`force conversion of input to float32. If force_fp16 is enabled, we need to`
			`prevent this casting.`
			`"""`
			`assert force_fp16`
			`import sgm.modules.diffusionmodules.util as sgm_util`
			`import ldm.modules.diffusionmodules.util as ldm_util`
			`sgm_util.GroupNorm32 = torch.nn.GroupNorm`
			`ldm_util.GroupNorm32 = torch.nn.GroupNorm`
			print("ldm/sgm GroupNorm32 replaced with normal torch.nn.GroupNorm due to `--precision half`.")