import contextlib

import torch

import k_diffusion
from modules.models.sd3.sd3_impls import BaseModel, SDVAE, SD3LatentFormat
from modules.models.sd3.sd3_cond import SD3Cond

from modules import shared, devices


class SD3Denoiser(k_diffusion.external.DiscreteSchedule):
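    """Adapter that lets k-diffusion samplers drive the SD3 model: exposes the
    model's discrete sigma schedule and forwards denoising calls to the inferencer."""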
    def __init__(self, inner_model, sigmas):
        # register the discrete sigma schedule; quantize follows the UI setting
        super().__init__(sigmas, quantize=shared.opts.enable_quantization)
        self.inner_model = inner_model

    def forward(self, input, sigma, **kwargs):
        # delegate the actual denoising to the wrapped SD3Inferencer
        return self.inner_model.apply_model(input, sigma, **kwargs)


class SD3Inferencer(torch.nn.Module):
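    """Wraps the SD3 diffusion model, VAE and text encoders behind the interface
    (conditioning, apply_model, first-stage encode/decode) that the rest of the
    webui sampling code expects from a Stable Diffusion model."""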
    def __init__(self, state_dict, shift=3, use_ema=False):
        super().__init__()

        self.shift = shift

        with torch.no_grad():
            # build the diffusion model and VAE on CPU with the configured dtypes
            self.model = BaseModel(shift=shift, state_dict=state_dict, prefix="model.diffusion_model.", device="cpu", dtype=devices.dtype)
            self.first_stage_model = SDVAE(device="cpu", dtype=devices.dtype_vae)
            self.first_stage_model.dtype = self.model.diffusion_model.dtype

        # equivalent alphas_cumprod for the sigma schedule: sigma**2 == (1 - a) / a  =>  a == 1 / (sigma**2 + 1)
        self.alphas_cumprod = 1 / (self.model.model_sampling.sigmas ** 2 + 1)

        self.text_encoders = SD3Cond()
        self.cond_stage_key = 'txt'

        # attributes expected by code written for the original LDM-based models
        self.parameterization = "eps"
        self.model.conditioning_key = "crossattn"

        self.latent_format = SD3LatentFormat()
        self.latent_channels = 16

    @property
    def cond_stage_model(self):
        return self.text_encoders

    def before_load_weights(self, state_dict):
        self.cond_stage_model.before_load_weights(state_dict)

    def ema_scope(self):
        # no EMA weights here; kept for interface compatibility
        return contextlib.nullcontext()

    def get_learned_conditioning(self, batch: list[str]):
        return self.cond_stage_model(batch)

    def apply_model(self, x, t, cond):
        # cond is a dict from the text encoders: token embeddings ('crossattn') and a pooled vector ('vector')
        return self.model(x, t, c_crossattn=cond['crossattn'], y=cond['vector'])

    def decode_first_stage(self, latent):
        latent = self.latent_format.process_out(latent)
        return self.first_stage_model.decode(latent)

    def encode_first_stage(self, image):
        latent = self.first_stage_model.encode(image)
        return self.latent_format.process_in(latent)

    def get_first_stage_encoding(self, x):
        # encode_first_stage already applies the latent format, so this is an identity
        return x

    def create_denoiser(self):
        return SD3Denoiser(self, self.model.model_sampling.sigmas)

    def medvram_fields(self):
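        """Submodules that the low-VRAM (--medvram) offload logic manages
        individually, keeping each on CPU until it is actually needed."""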
        return [
            (self, 'first_stage_model'),
            (self, 'text_encoders'),
            (self, 'model'),
        ]

    def add_noise_to_latent(self, x, noise, amount):
        # linear blend of the latent with noise by the given amount
        return x * (1 - amount) + noise * amount

    def fix_dimensions(self, width, height):
        # round width and height down to the nearest multiple of 16
        return width // 16 * 16, height // 16 * 16
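

# Rough usage sketch, for orientation only: the real wiring lives elsewhere in
# webui's model loading and sampling code, and the variable names below are
# illustrative, not an API.
#
#   model = SD3Inferencer(state_dict, shift=3)
#   model.before_load_weights(state_dict)
#   cond = model.get_learned_conditioning(["a prompt"])
#   denoiser = model.create_denoiser()           # k-diffusion compatible wrapper
#   denoised = model.apply_model(latent, sigma, cond)
#   image = model.decode_first_stage(denoised)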