"
+ else:
+ profiling_html = ''
+
# last item is always HTML
- res[-1] += f"
Time taken: {elapsed_text}
{vram_html}
"
+ res[-1] += f"
Time taken: {elapsed_text}
{vram_html}{profiling_html}
"
return tuple(res)
return f
+
diff --git a/modules/cmd_args.py b/modules/cmd_args.py
index 016a33d10..d71982b2c 100644
--- a/modules/cmd_args.py
+++ b/modules/cmd_args.py
@@ -20,6 +20,7 @@ parser.add_argument("--dump-sysinfo", action='store_true', help="launch.py argum
parser.add_argument("--loglevel", type=str, help="log level; one of: CRITICAL, ERROR, WARNING, INFO, DEBUG", default=None)
parser.add_argument("--do-not-download-clip", action='store_true', help="do not download CLIP model even if it's not included in the checkpoint")
parser.add_argument("--data-dir", type=normalized_filepath, default=os.path.dirname(os.path.dirname(os.path.realpath(__file__))), help="base path where all user data is stored")
+parser.add_argument("--models-dir", type=normalized_filepath, default=None, help="base path where models are stored; overrides --data-dir")
parser.add_argument("--config", type=normalized_filepath, default=sd_default_config, help="path to config which constructs model",)
parser.add_argument("--ckpt", type=normalized_filepath, default=sd_model_file, help="path to checkpoint of stable diffusion model; if specified, this checkpoint will be added to the list of checkpoints and loaded",)
parser.add_argument("--ckpt-dir", type=normalized_filepath, default=None, help="Path to directory with stable diffusion checkpoints")
@@ -29,7 +30,7 @@ parser.add_argument("--gfpgan-model", type=normalized_filepath, help="GFPGAN mod
parser.add_argument("--no-half", action='store_true', help="do not switch the model to 16-bit floats")
parser.add_argument("--no-half-vae", action='store_true', help="do not switch the VAE model to 16-bit floats")
parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not hide progressbar in gradio UI (we hide it because it slows down ML if you have hardware acceleration in browser)")
-parser.add_argument("--max-batch-count", type=int, default=16, help="maximum batch count value for the UI")
+parser.add_argument("--max-batch-count", type=int, default=16, help="does not do anything")
parser.add_argument("--embeddings-dir", type=normalized_filepath, default=os.path.join(data_path, 'embeddings'), help="embeddings directory for textual inversion (default: embeddings)")
parser.add_argument("--textual-inversion-templates-dir", type=normalized_filepath, default=os.path.join(script_path, 'textual_inversion_templates'), help="directory with textual inversion templates")
parser.add_argument("--hypernetwork-dir", type=normalized_filepath, default=os.path.join(models_path, 'hypernetworks'), help="hypernetwork directory")
@@ -41,7 +42,7 @@ parser.add_argument("--lowvram", action='store_true', help="enable stable diffus
parser.add_argument("--lowram", action='store_true', help="load stable diffusion checkpoint weights to VRAM instead of RAM")
parser.add_argument("--always-batch-cond-uncond", action='store_true', help="does not do anything")
parser.add_argument("--unload-gfpgan", action='store_true', help="does not do anything.")
-parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast")
+parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "half", "autocast"], default="autocast")
parser.add_argument("--upcast-sampling", action='store_true', help="upcast sampling. No effect with --no-half. Usually produces similar results to --no-half with better performance while using less memory.")
parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site")
parser.add_argument("--ngrok", type=str, help="ngrok authtoken, alternative to gradio --share", default=None)
diff --git a/modules/deepbooru.py b/modules/deepbooru.py
index 547e1b4c6..fb043feb2 100644
--- a/modules/deepbooru.py
+++ b/modules/deepbooru.py
@@ -57,7 +57,7 @@ class DeepDanbooru:
a = np.expand_dims(np.array(pic, dtype=np.float32), 0) / 255
with torch.no_grad(), devices.autocast():
- x = torch.from_numpy(a).to(devices.device)
+ x = torch.from_numpy(a).to(devices.device, devices.dtype)
y = self.model(x)[0].detach().cpu().numpy()
probability_dict = {}
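For context on the .to() change above: passing device and dtype together both moves and casts the tensor in one call, so the input already matches the model's weight dtype even when autocast is a no-op (as it is under the new --precision half). A small illustration:

import numpy as np
import torch

a = np.random.rand(1, 512, 512, 3).astype(np.float32) / 255
x = torch.from_numpy(a).to("cpu", torch.float16)  # move AND cast in a single .to()
assert x.dtype == torch.float16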
diff --git a/modules/devices.py b/modules/devices.py
index e4f671ac6..ee679141a 100644
--- a/modules/devices.py
+++ b/modules/devices.py
@@ -114,6 +114,9 @@ errors.run(enable_tf32, "Enabling TF32")
cpu: torch.device = torch.device("cpu")
fp8: bool = False
+# Force fp16 for all models in inference. No casting during inference.
+# This flag is controlled by "--precision half" command line arg.
+force_fp16: bool = False
device: torch.device = None
device_interrogate: torch.device = None
device_gfpgan: torch.device = None
@@ -127,6 +130,8 @@ unet_needs_upcast = False
def cond_cast_unet(input):
+ if force_fp16:
+ return input.to(torch.float16)
return input.to(dtype_unet) if unet_needs_upcast else input
@@ -206,6 +211,11 @@ def autocast(disable=False):
if disable:
return contextlib.nullcontext()
+ if force_fp16:
+ # No casting during inference if force_fp16 is enabled.
+ # All tensor dtype conversion happens before inference.
+ return contextlib.nullcontext()
+
if fp8 and device==cpu:
return torch.autocast("cpu", dtype=torch.bfloat16, enabled=True)
@@ -233,22 +243,22 @@ def test_for_nans(x, where):
if shared.cmd_opts.disable_nan_check:
return
- if not torch.all(torch.isnan(x)).item():
+ if not torch.isnan(x[(0, ) * len(x.shape)]):
return
if where == "unet":
- message = "A tensor with all NaNs was produced in Unet."
+ message = "A tensor with NaNs was produced in Unet."
if not shared.cmd_opts.no_half:
message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."
elif where == "vae":
- message = "A tensor with all NaNs was produced in VAE."
+ message = "A tensor with NaNs was produced in VAE."
if not shared.cmd_opts.no_half and not shared.cmd_opts.no_half_vae:
message += " This could be because there's not enough precision to represent the picture. Try adding --no-half-vae commandline argument to fix this."
else:
- message = "A tensor with all NaNs was produced."
+ message = "A tensor with NaNs was produced."
message += " Use --disable-nan-check commandline argument to disable this check."
@@ -258,7 +268,7 @@ def test_for_nans(x, where):
@lru_cache
def first_time_calculation():
"""
- just do any calculation with pytorch layers - the first time this is done it allocaltes about 700MB of memory and
+ just do any calculation with pytorch layers - the first time this is done it allocates about 700MB of memory and
spends about 2.7 seconds doing that, at least with NVidia.
"""
@@ -269,3 +279,17 @@ def first_time_calculation():
x = torch.zeros((1, 1, 3, 3)).to(device, dtype)
conv2d = torch.nn.Conv2d(1, 1, (3, 3)).to(device, dtype)
conv2d(x)
+
+
+def force_model_fp16():
+ """
+ ldm and sgm have modules.diffusionmodules.util.GroupNorm32.forward, which
+ forces conversion of the input to float32. If force_fp16 is enabled, we need
+ to prevent this casting.
+ """
+ assert force_fp16
+ import sgm.modules.diffusionmodules.util as sgm_util
+ import ldm.modules.diffusionmodules.util as ldm_util
+ sgm_util.GroupNorm32 = torch.nn.GroupNorm
+ ldm_util.GroupNorm32 = torch.nn.GroupNorm
+ print("ldm/sgm GroupNorm32 replaced with normal torch.nn.GroupNorm due to `--precision half`.")
diff --git a/modules/extensions.py b/modules/extensions.py
index 5ad934b4d..24de766eb 100644
--- a/modules/extensions.py
+++ b/modules/extensions.py
@@ -191,8 +191,9 @@ class Extension:
def check_updates(self):
repo = Repo(self.path)
+ branch_name = f'{repo.remote().name}/{self.branch}'
for fetch in repo.remote().fetch(dry_run=True):
- if self.branch and fetch.name != f'{repo.remote().name}/{self.branch}':
+ if self.branch and fetch.name != branch_name:
continue
if fetch.flags != fetch.HEAD_UPTODATE:
self.can_update = True
@@ -200,7 +201,7 @@ class Extension:
return
try:
- origin = repo.rev_parse('origin')
+ origin = repo.rev_parse(branch_name)
if repo.head.commit != origin:
self.can_update = True
self.status = "behind HEAD"
@@ -213,8 +214,10 @@ class Extension:
self.can_update = False
self.status = "latest"
- def fetch_and_reset_hard(self, commit='origin'):
+ def fetch_and_reset_hard(self, commit=None):
repo = Repo(self.path)
+ if commit is None:
+ commit = f'{repo.remote().name}/{self.branch}'
# Fix: `error: Your local changes to the following files would be overwritten by merge`,
# because WSL2 Docker sets 755 file permissions instead of 644, which results in the error.
repo.git.fetch(all=True)
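Both the update check and the hard reset now target the extension's tracked branch (for example origin/main) instead of the bare 'origin' ref, and the default reset target is computed per instance. A toy stand-in for the late-bound-default pattern used by fetch_and_reset_hard:

class Repoish:
    # Toy class; the real code derives remote/branch from GitPython's Repo.
    def __init__(self, remote: str, branch: str):
        self.remote, self.branch = remote, branch

    def reset_target(self, commit: str | None = None) -> str:
        if commit is None:
            commit = f"{self.remote}/{self.branch}"  # default depends on instance state
        return commit

r = Repoish("origin", "main")
assert r.reset_target() == "origin/main"
assert r.reset_target("origin/dev") == "origin/dev"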
diff --git a/modules/gfpgan_model.py b/modules/gfpgan_model.py
index 445b04092..01ef899e4 100644
--- a/modules/gfpgan_model.py
+++ b/modules/gfpgan_model.py
@@ -36,13 +36,11 @@ class FaceRestorerGFPGAN(face_restoration_utils.CommonFaceRestoration):
ext_filter=['.pth'],
):
if 'GFPGAN' in os.path.basename(model_path):
- model = modelloader.load_spandrel_model(
+ return modelloader.load_spandrel_model(
model_path,
device=self.get_device(),
expected_architecture='GFPGAN',
).model
- model.different_w = True # see https://github.com/chaiNNer-org/spandrel/pull/81
- return model
raise ValueError("No GFPGAN model found")
def restore(self, np_image):
diff --git a/modules/images.py b/modules/images.py
index c0ff8a630..cfdfb3384 100644
--- a/modules/images.py
+++ b/modules/images.py
@@ -54,11 +54,14 @@ def image_grid(imgs, batch_size=1, rows=None):
params = script_callbacks.ImageGridLoopParams(imgs, cols, rows)
script_callbacks.image_grid_callback(params)
- w, h = imgs[0].size
- grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color='black')
+ w, h = map(max, zip(*(img.size for img in imgs)))
+ grid_background_color = ImageColor.getcolor(opts.grid_background_color, 'RGB')
+ grid = Image.new('RGB', size=(params.cols * w, params.rows * h), color=grid_background_color)
for i, img in enumerate(params.imgs):
- grid.paste(img, box=(i % params.cols * w, i // params.cols * h))
+ img_w, img_h = img.size
+ w_offset, h_offset = 0 if img_w == w else (w - img_w) // 2, 0 if img_h == h else (h - img_h) // 2
+ grid.paste(img, box=(i % params.cols * w + w_offset, i // params.cols * h + h_offset))
return grid
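Worked example of the new grid math: the cell size is the per-axis maximum over all images, and a smaller image is centered inside its cell rather than pasted at the top-left of a black grid:

sizes = [(512, 512), (512, 384), (512, 512)]
w, h = map(max, zip(*sizes))                      # cell = (512, 512)
img_w, img_h = sizes[1]                           # the smaller image
w_offset = 0 if img_w == w else (w - img_w) // 2  # 0
h_offset = 0 if img_h == h else (h - img_h) // 2  # 64 -> vertically centered
assert (w_offset, h_offset) == (0, 64)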
@@ -377,6 +380,7 @@ def get_sampler_scheduler(p, sampler):
class FilenameGenerator:
replacements = {
+ 'basename': lambda self: self.basename or 'img',
'seed': lambda self: self.seed if self.seed is not None else '',
'seed_first': lambda self: self.seed if self.p.batch_size == 1 else self.p.all_seeds[0],
'seed_last': lambda self: NOTHING_AND_SKIP_PREVIOUS_TEXT if self.p.batch_size == 1 else self.p.all_seeds[-1],
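With this entry, filename patterns can use a [basename] token that expands to whatever basename save_image() passed in, falling back to 'img' when it is empty. The replacement is just `self.basename or 'img'`:

assert ("" or "img") == "img"       # empty basename falls back to 'img'
assert ("grid" or "img") == "grid"  # an explicit basename wins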
@@ -413,12 +417,13 @@ class FilenameGenerator:
}
default_time_format = '%Y%m%d%H%M%S'
- def __init__(self, p, seed, prompt, image, zip=False):
+ def __init__(self, p, seed, prompt, image, zip=False, basename=""):
self.p = p
self.seed = seed
self.prompt = prompt
self.image = image
self.zip = zip
+ self.basename = basename
def get_vae_filename(self):
"""Get the name of the VAE file."""
@@ -606,9 +611,10 @@ def save_image_with_geninfo(image, geninfo, filename, extension=None, existing_p
piexif.ExifIFD.UserComment: piexif.helper.UserComment.dump(geninfo or "", encoding="unicode")
},
})
+ else:
+ exif_bytes = None
-
- image.save(filename,format=image_format, exif=exif_bytes)
+ image.save(filename,format=image_format, quality=opts.jpeg_quality, exif=exif_bytes)
elif extension.lower() == ".gif":
image.save(filename, format=image_format, comment=geninfo)
else:
@@ -648,12 +654,12 @@ def save_image(image, path, basename, seed=None, prompt=None, extension='png', i
txt_fullfn (`str` or None):
If a text file is saved for this image, this will be its full path. Otherwise None.
"""
- namegen = FilenameGenerator(p, seed, prompt, image)
+ namegen = FilenameGenerator(p, seed, prompt, image, basename=basename)
# WebP and JPG formats have maximum dimension limits of 16383 and 65535 respectively. switch to PNG which has a much higher limit
if (image.height > 65535 or image.width > 65535) and extension.lower() in ("jpg", "jpeg") or (image.height > 16383 or image.width > 16383) and extension.lower() == "webp":
print('Image dimensions too large; saving as PNG')
- extension = ".png"
+ extension = "png"
if save_to_dirs is None:
save_to_dirs = (grid and opts.grid_save_to_dirs) or (not grid and opts.save_to_dirs and not no_prompt)
@@ -789,7 +795,10 @@ def read_info_from_image(image: Image.Image) -> tuple[str | None, dict]:
if exif_comment:
geninfo = exif_comment
elif "comment" in items: # for gif
- geninfo = items["comment"].decode('utf8', errors="ignore")
+ if isinstance(items["comment"], bytes):
+ geninfo = items["comment"].decode('utf8', errors="ignore")
+ else:
+ geninfo = items["comment"]
for field in IGNORED_INFO_KEYS:
items.pop(field, None)
diff --git a/modules/img2img.py b/modules/img2img.py
index a1d042c21..24f869f5c 100644
--- a/modules/img2img.py
+++ b/modules/img2img.py
@@ -17,11 +17,14 @@ from modules.ui import plaintext_to_html
import modules.scripts
-def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=False, scale_by=1.0, use_png_info=False, png_info_props=None, png_info_dir=None):
+def process_batch(p, input, output_dir, inpaint_mask_dir, args, to_scale=False, scale_by=1.0, use_png_info=False, png_info_props=None, png_info_dir=None):
output_dir = output_dir.strip()
processing.fix_seed(p)
- batch_images = list(shared.walk_files(input_dir, allowed_extensions=(".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff")))
+ if isinstance(input, str):
+ batch_images = list(shared.walk_files(input, allowed_extensions=(".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff")))
+ else:
+ batch_images = [os.path.abspath(x.name) for x in input]
is_inpaint_batch = False
if inpaint_mask_dir:
@@ -146,7 +149,7 @@ def process_batch(p, input_dir, output_dir, inpaint_mask_dir, args, to_scale=Fal
return batch_results
-def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_prompt: str, prompt_styles, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, mask_blur: int, mask_alpha: float, inpainting_fill: int, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, selected_scale_tab: int, height: int, width: int, scale_by: float, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, img2img_batch_use_png_info: bool, img2img_batch_png_info_props: list, img2img_batch_png_info_dir: str, *args):
+def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_prompt: str, prompt_styles, init_img, sketch, init_img_with_mask, inpaint_color_sketch, inpaint_color_sketch_orig, init_img_inpaint, init_mask_inpaint, mask_blur: int, mask_alpha: float, inpainting_fill: int, n_iter: int, batch_size: int, cfg_scale: float, image_cfg_scale: float, denoising_strength: float, selected_scale_tab: int, height: int, width: int, scale_by: float, resize_mode: int, inpaint_full_res: bool, inpaint_full_res_padding: int, inpainting_mask_invert: int, img2img_batch_input_dir: str, img2img_batch_output_dir: str, img2img_batch_inpaint_mask_dir: str, override_settings_texts, img2img_batch_use_png_info: bool, img2img_batch_png_info_props: list, img2img_batch_png_info_dir: str, img2img_batch_source_type: str, img2img_batch_upload: list, *args):
override_settings = create_override_settings_dict(override_settings_texts)
is_batch = mode == 5
@@ -221,8 +224,15 @@ def img2img(id_task: str, request: gr.Request, mode: int, prompt: str, negative_
with closing(p):
if is_batch:
- assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
- processed = process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=img2img_batch_png_info_dir)
+ if img2img_batch_source_type == "upload":
+ assert isinstance(img2img_batch_upload, list) and img2img_batch_upload
+ output_dir = ""
+ inpaint_mask_dir = ""
+ png_info_dir = img2img_batch_png_info_dir if not shared.cmd_opts.hide_ui_dir_config else ""
+ processed = process_batch(p, img2img_batch_upload, output_dir, inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=png_info_dir)
+ else: # "from dir"
+ assert not shared.cmd_opts.hide_ui_dir_config, "Launched with --hide-ui-dir-config, batch img2img disabled"
+ processed = process_batch(p, img2img_batch_input_dir, img2img_batch_output_dir, img2img_batch_inpaint_mask_dir, args, to_scale=selected_scale_tab == 1, scale_by=scale_by, use_png_info=img2img_batch_use_png_info, png_info_props=img2img_batch_png_info_props, png_info_dir=img2img_batch_png_info_dir)
if processed is None:
processed = Processed(p, [], p.seed, "")
diff --git a/modules/infotext_utils.py b/modules/infotext_utils.py
index f1e8f54ba..32dbafa65 100644
--- a/modules/infotext_utils.py
+++ b/modules/infotext_utils.py
@@ -146,18 +146,19 @@ def connect_paste_params_buttons():
destination_height_component = next(iter([field for field, name in fields if name == "Size-2"] if fields else []), None)
if binding.source_image_component and destination_image_component:
+ need_send_dimensions = destination_width_component and binding.tabname != 'inpaint'
if isinstance(binding.source_image_component, gr.Gallery):
- func = send_image_and_dimensions if destination_width_component else image_from_url_text
+ func = send_image_and_dimensions if need_send_dimensions else image_from_url_text
jsfunc = "extract_image_from_gallery"
else:
- func = send_image_and_dimensions if destination_width_component else lambda x: x
+ func = send_image_and_dimensions if need_send_dimensions else lambda x: x
jsfunc = None
binding.paste_button.click(
fn=func,
_js=jsfunc,
inputs=[binding.source_image_component],
- outputs=[destination_image_component, destination_width_component, destination_height_component] if destination_width_component else [destination_image_component],
+ outputs=[destination_image_component, destination_width_component, destination_height_component] if need_send_dimensions else [destination_image_component],
show_progress=False,
)
diff --git a/modules/launch_utils.py b/modules/launch_utils.py
index 5812b0e58..20c7dc127 100644
--- a/modules/launch_utils.py
+++ b/modules/launch_utils.py
@@ -9,6 +9,7 @@ import importlib.util
import importlib.metadata
import platform
import json
+import shlex
from functools import lru_cache
from modules import cmd_args, errors
@@ -76,7 +77,7 @@ def git_tag():
except Exception:
try:
- changelog_md = os.path.join(os.path.dirname(os.path.dirname(__file__)), "CHANGELOG.md")
+ changelog_md = os.path.join(script_path, "CHANGELOG.md")
with open(changelog_md, "r", encoding="utf-8") as file:
line = next((line.strip() for line in file if line.strip()), "")
line = line.replace("## ", "")
@@ -231,7 +232,7 @@ def run_extension_installer(extension_dir):
try:
env = os.environ.copy()
- env['PYTHONPATH'] = f"{os.path.abspath('.')}{os.pathsep}{env.get('PYTHONPATH', '')}"
+ env['PYTHONPATH'] = f"{script_path}{os.pathsep}{env.get('PYTHONPATH', '')}"
stdout = run(f'"{python}" "{path_installer}"', errdesc=f"Error running install.py for extension {extension_dir}", custom_env=env).strip()
if stdout:
@@ -445,7 +446,6 @@ def prepare_environment():
exit(0)
-
def configure_for_tests():
if "--api" not in sys.argv:
sys.argv.append("--api")
@@ -461,7 +461,7 @@ def configure_for_tests():
def start():
- print(f"Launching {'API server' if '--nowebui' in sys.argv else 'Web UI'} with arguments: {' '.join(sys.argv[1:])}")
+ print(f"Launching {'API server' if '--nowebui' in sys.argv else 'Web UI'} with arguments: {shlex.join(sys.argv[1:])}")
import webui
if '--nowebui' in sys.argv:
webui.api_only()
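shlex.join (Python 3.8+) quotes arguments containing spaces or shell metacharacters, so the launch banner becomes copy-pasteable back into a shell:

import shlex

argv = ["--ckpt", "my model.safetensors", "--lowvram"]
print(" ".join(argv))    # --ckpt my model.safetensors --lowvram   (ambiguous)
print(shlex.join(argv))  # --ckpt 'my model.safetensors' --lowvram (unambiguous)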
diff --git a/modules/lowvram.py b/modules/lowvram.py
index 45701046b..6728c337b 100644
--- a/modules/lowvram.py
+++ b/modules/lowvram.py
@@ -1,9 +1,12 @@
+from collections import namedtuple
+
import torch
from modules import devices, shared
module_in_gpu = None
cpu = torch.device("cpu")
+ModuleWithParent = namedtuple('ModuleWithParent', ['module', 'parent'], defaults=[None])
def send_everything_to_cpu():
global module_in_gpu
@@ -75,13 +78,14 @@ def setup_for_low_vram(sd_model, use_medvram):
(sd_model, 'depth_model'),
(sd_model, 'embedder'),
(sd_model, 'model'),
- (sd_model, 'embedder'),
]
is_sdxl = hasattr(sd_model, 'conditioner')
is_sd2 = not is_sdxl and hasattr(sd_model.cond_stage_model, 'model')
- if is_sdxl:
+ if hasattr(sd_model, 'medvram_fields'):
+ to_remain_in_cpu = sd_model.medvram_fields()
+ elif is_sdxl:
to_remain_in_cpu.append((sd_model, 'conditioner'))
elif is_sd2:
to_remain_in_cpu.append((sd_model.cond_stage_model, 'model'))
@@ -103,7 +107,21 @@ def setup_for_low_vram(sd_model, use_medvram):
setattr(obj, field, module)
# register hooks for the first three models
- if is_sdxl:
+ if hasattr(sd_model, "cond_stage_model") and hasattr(sd_model.cond_stage_model, "medvram_modules"):
+ for module in sd_model.cond_stage_model.medvram_modules():
+ if isinstance(module, ModuleWithParent):
+ parent = module.parent
+ module = module.module
+ else:
+ parent = None
+
+ if module:
+ module.register_forward_pre_hook(send_me_to_gpu)
+
+ if parent:
+ parents[module] = parent
+
+ elif is_sdxl:
sd_model.conditioner.register_forward_pre_hook(send_me_to_gpu)
elif is_sd2:
sd_model.cond_stage_model.model.register_forward_pre_hook(send_me_to_gpu)
@@ -117,9 +135,9 @@ def setup_for_low_vram(sd_model, use_medvram):
sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu)
sd_model.first_stage_model.encode = first_stage_model_encode_wrap
sd_model.first_stage_model.decode = first_stage_model_decode_wrap
- if sd_model.depth_model:
+ if getattr(sd_model, 'depth_model', None) is not None:
sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu)
- if sd_model.embedder:
+ if getattr(sd_model, 'embedder', None) is not None:
sd_model.embedder.register_forward_pre_hook(send_me_to_gpu)
if use_medvram:
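These hooks let models other than ldm's opt into medvram/lowvram without lowvram.py knowing their layout: a model may expose medvram_fields() listing (owner, attribute) pairs to keep on CPU, and its cond-stage model may expose medvram_modules(), where ModuleWithParent marks a module whose listed parent is what actually gets moved to the GPU. A hypothetical opt-in using only names from this hunk:

import torch
from modules.lowvram import ModuleWithParent

class MyCondStage(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = torch.nn.Linear(8, 8)

    def medvram_modules(self):
        # the forward pre-hook is registered on `inner`, but `self` is the
        # module recorded as its parent for GPU moves
        return [ModuleWithParent(self.inner, parent=self)]

class MyModel(torch.nn.Module):
    def medvram_fields(self):
        # (owner, attribute) pairs that setup_for_low_vram keeps on the CPU
        return [(self, "cond_stage_model"), (self, "first_stage_model"), (self, "model")]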
diff --git a/modules/modelloader.py b/modules/modelloader.py
index 115415c8e..36e7415af 100644
--- a/modules/modelloader.py
+++ b/modules/modelloader.py
@@ -23,6 +23,7 @@ def load_file_from_url(
model_dir: str,
progress: bool = True,
file_name: str | None = None,
+ hash_prefix: str | None = None,
) -> str:
"""Download a file from `url` into `model_dir`, using the file present if possible.
@@ -36,11 +37,11 @@ def load_file_from_url(
if not os.path.exists(cached_file):
print(f'Downloading: "{url}" to {cached_file}\n')
from torch.hub import download_url_to_file
- download_url_to_file(url, cached_file, progress=progress)
+ download_url_to_file(url, cached_file, progress=progress, hash_prefix=hash_prefix)
return cached_file
-def load_models(model_path: str, model_url: str = None, command_path: str = None, ext_filter=None, download_name=None, ext_blacklist=None) -> list:
+def load_models(model_path: str, model_url: str = None, command_path: str = None, ext_filter=None, download_name=None, ext_blacklist=None, hash_prefix=None) -> list:
"""
A one-and done loader to try finding the desired models in specified directories.
@@ -49,6 +50,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None
@param model_path: The location to store/find models in.
@param command_path: A command-line argument to search for models in first.
@param ext_filter: An optional list of filename extensions to filter by
+ @param hash_prefix: the expected sha256 of the model_url
@return: A list of paths containing the desired model(s)
"""
output = []
@@ -78,7 +80,7 @@ def load_models(model_path: str, model_url: str = None, command_path: str = None
if model_url is not None and len(output) == 0:
if download_name is not None:
- output.append(load_file_from_url(model_url, model_dir=places[0], file_name=download_name))
+ output.append(load_file_from_url(model_url, model_dir=places[0], file_name=download_name, hash_prefix=hash_prefix))
else:
output.append(model_url)
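hash_prefix is forwarded straight to torch.hub.download_url_to_file, which verifies that the SHA-256 of the downloaded file starts with the given prefix and fails the download otherwise. An illustrative call, with placeholder URL and prefix:

path = load_file_from_url(
    "https://example.com/upscaler.pth",
    model_dir="models/ESRGAN",
    hash_prefix="9c6e6f",  # placeholder; pass the expected sha256 prefix here
)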
@@ -137,6 +139,27 @@ def load_upscalers():
key=lambda x: x.name.lower() if not isinstance(x.scaler, (UpscalerNone, UpscalerLanczos, UpscalerNearest)) else ""
)
+# None: not loaded, False: failed to load, True: loaded
+_spandrel_extra_init_state = None
+
+
+def _init_spandrel_extra_archs() -> None:
+ """
+ Try to initialize `spandrel_extra_archs` (exactly once).
+ """
+ global _spandrel_extra_init_state
+ if _spandrel_extra_init_state is not None:
+ return
+
+ try:
+ import spandrel
+ import spandrel_extra_arches
+ spandrel.MAIN_REGISTRY.add(*spandrel_extra_arches.EXTRA_REGISTRY)
+ _spandrel_extra_init_state = True
+ except Exception:
+ logger.warning("Failed to load spandrel_extra_arches", exc_info=True)
+ _spandrel_extra_init_state = False
+
def load_spandrel_model(
path: str | os.PathLike,
@@ -146,11 +169,16 @@ def load_spandrel_model(
dtype: str | torch.dtype | None = None,
expected_architecture: str | None = None,
) -> spandrel.ModelDescriptor:
+ global _spandrel_extra_init_state
+
import spandrel
+ _init_spandrel_extra_archs()
+
model_descriptor = spandrel.ModelLoader(device=device).load_from_file(str(path))
- if expected_architecture and model_descriptor.architecture != expected_architecture:
+ arch = model_descriptor.architecture
+ if expected_architecture and arch.name != expected_architecture:
logger.warning(
- f"Model {path!r} is not a {expected_architecture!r} model (got {model_descriptor.architecture!r})",
+ f"Model {path!r} is not a {expected_architecture!r} model (got {arch.name!r})",
)
half = False
if prefer_half:
@@ -164,6 +192,6 @@ def load_spandrel_model(
model_descriptor.model.eval()
logger.debug(
"Loaded %s from %s (device=%s, half=%s, dtype=%s)",
- model_descriptor, path, device, half, dtype,
+ arch, path, device, half, dtype,
)
return model_descriptor
diff --git a/modules/models/diffusion/uni_pc/uni_pc.py b/modules/models/diffusion/uni_pc/uni_pc.py
index d257a7286..3333bc808 100644
--- a/modules/models/diffusion/uni_pc/uni_pc.py
+++ b/modules/models/diffusion/uni_pc/uni_pc.py
@@ -323,7 +323,7 @@ def model_wrapper(
def model_fn(x, t_continuous, condition, unconditional_condition):
"""
- The noise predicition model function that is used for DPM-Solver.
+ The noise prediction model function that is used for DPM-Solver.
"""
if t_continuous.reshape((-1,)).shape[0] == 1:
t_continuous = t_continuous.expand((x.shape[0]))
diff --git a/modules/models/sd3/mmdit.py b/modules/models/sd3/mmdit.py
new file mode 100644
index 000000000..8ddf49a4e
--- /dev/null
+++ b/modules/models/sd3/mmdit.py
@@ -0,0 +1,622 @@
+### This file contains impls for MM-DiT, the core model component of SD3
+
+import math
+from typing import Dict, Optional
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+from modules.models.sd3.other_impls import attention, Mlp
+
+
+class PatchEmbed(nn.Module):
+ """ 2D Image to Patch Embedding"""
+ def __init__(
+ self,
+ img_size: Optional[int] = 224,
+ patch_size: int = 16,
+ in_chans: int = 3,
+ embed_dim: int = 768,
+ flatten: bool = True,
+ bias: bool = True,
+ strict_img_size: bool = True,
+ dynamic_img_pad: bool = False,
+ dtype=None,
+ device=None,
+ ):
+ super().__init__()
+ self.patch_size = (patch_size, patch_size)
+ if img_size is not None:
+ self.img_size = (img_size, img_size)
+ self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
+ else:
+ self.img_size = None
+ self.grid_size = None
+ self.num_patches = None
+
+ # flatten spatial dim and transpose to channels last, kept for bwd compat
+ self.flatten = flatten
+ self.strict_img_size = strict_img_size
+ self.dynamic_img_pad = dynamic_img_pad
+
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, dtype=dtype, device=device)
+
+ def forward(self, x):
+ B, C, H, W = x.shape
+ x = self.proj(x)
+ if self.flatten:
+ x = x.flatten(2).transpose(1, 2) # NCHW -> NLC
+ return x
+
+
+def modulate(x, shift, scale):
+ if shift is None:
+ shift = torch.zeros_like(scale)
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+
+#################################################################################
+# Sine/Cosine Positional Embedding Functions #
+#################################################################################
+
+
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, scaling_factor=None, offset=None):
+ """
+ grid_size: int of the grid height and width
+ return:
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+ """
+ grid_h = np.arange(grid_size, dtype=np.float32)
+ grid_w = np.arange(grid_size, dtype=np.float32)
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
+ grid = np.stack(grid, axis=0)
+ if scaling_factor is not None:
+ grid = grid / scaling_factor
+ if offset is not None:
+ grid = grid - offset
+ grid = grid.reshape([2, 1, grid_size, grid_size])
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if cls_token and extra_tokens > 0:
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ assert embed_dim % 2 == 0
+ # use half of dimensions to encode grid_h
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ embed_dim: output dimension for each position
+ pos: a list of positions to be encoded: size (M,)
+ out: (M, D)
+ """
+ assert embed_dim % 2 == 0
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)
+ pos = pos.reshape(-1) # (M,)
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
+ emb_sin = np.sin(out) # (M, D/2)
+ emb_cos = np.cos(out) # (M, D/2)
+ return np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
+
+
+#################################################################################
+# Embedding Layers for Timesteps and Class Labels #
+#################################################################################
+
+
+class TimestepEmbedder(nn.Module):
+ """Embeds scalar timesteps into vector representations."""
+
+ def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None):
+ super().__init__()
+ self.mlp = nn.Sequential(
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True, dtype=dtype, device=device),
+ nn.SiLU(),
+ nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device),
+ )
+ self.frequency_embedding_size = frequency_embedding_size
+
+ @staticmethod
+ def timestep_embedding(t, dim, max_period=10000):
+ """
+ Create sinusoidal timestep embeddings.
+ :param t: a 1-D Tensor of N indices, one per batch element.
+ These may be fractional.
+ :param dim: the dimension of the output.
+ :param max_period: controls the minimum frequency of the embeddings.
+ :return: an (N, D) Tensor of positional embeddings.
+ """
+ half = dim // 2
+ freqs = torch.exp(
+ -math.log(max_period)
+ * torch.arange(start=0, end=half, dtype=torch.float32)
+ / half
+ ).to(device=t.device)
+ args = t[:, None].float() * freqs[None]
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+ if dim % 2:
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+ if torch.is_floating_point(t):
+ embedding = embedding.to(dtype=t.dtype)
+ return embedding
+
+ def forward(self, t, dtype, **kwargs):
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+ t_emb = self.mlp(t_freq)
+ return t_emb
+
+
+class VectorEmbedder(nn.Module):
+ """Embeds a flat vector of dimension input_dim"""
+
+ def __init__(self, input_dim: int, hidden_size: int, dtype=None, device=None):
+ super().__init__()
+ self.mlp = nn.Sequential(
+ nn.Linear(input_dim, hidden_size, bias=True, dtype=dtype, device=device),
+ nn.SiLU(),
+ nn.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device),
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return self.mlp(x)
+
+
+#################################################################################
+# Core DiT Model #
+#################################################################################
+
+
+class QkvLinear(torch.nn.Linear):
+ pass
+
+def split_qkv(qkv, head_dim):
+ qkv = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, -1, head_dim).movedim(2, 0)
+ return qkv[0], qkv[1], qkv[2]
+
+def optimized_attention(qkv, num_heads):
+ return attention(qkv[0], qkv[1], qkv[2], num_heads)
+
+class SelfAttention(nn.Module):
+ ATTENTION_MODES = ("xformers", "torch", "torch-hb", "math", "debug")
+
+ def __init__(
+ self,
+ dim: int,
+ num_heads: int = 8,
+ qkv_bias: bool = False,
+ qk_scale: Optional[float] = None,
+ attn_mode: str = "xformers",
+ pre_only: bool = False,
+ qk_norm: Optional[str] = None,
+ rmsnorm: bool = False,
+ dtype=None,
+ device=None,
+ ):
+ super().__init__()
+ self.num_heads = num_heads
+ self.head_dim = dim // num_heads
+
+ self.qkv = QkvLinear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
+ if not pre_only:
+ self.proj = nn.Linear(dim, dim, dtype=dtype, device=device)
+ assert attn_mode in self.ATTENTION_MODES
+ self.attn_mode = attn_mode
+ self.pre_only = pre_only
+
+ if qk_norm == "rms":
+ self.ln_q = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
+ self.ln_k = RMSNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
+ elif qk_norm == "ln":
+ self.ln_q = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
+ self.ln_k = nn.LayerNorm(self.head_dim, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
+ elif qk_norm is None:
+ self.ln_q = nn.Identity()
+ self.ln_k = nn.Identity()
+ else:
+ raise ValueError(qk_norm)
+
+ def pre_attention(self, x: torch.Tensor):
+ B, L, C = x.shape
+ qkv = self.qkv(x)
+ q, k, v = split_qkv(qkv, self.head_dim)
+ q = self.ln_q(q).reshape(q.shape[0], q.shape[1], -1)
+ k = self.ln_k(k).reshape(q.shape[0], q.shape[1], -1)
+ return (q, k, v)
+
+ def post_attention(self, x: torch.Tensor) -> torch.Tensor:
+ assert not self.pre_only
+ x = self.proj(x)
+ return x
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ (q, k, v) = self.pre_attention(x)
+ x = attention(q, k, v, self.num_heads)
+ x = self.post_attention(x)
+ return x
+
+
+class RMSNorm(torch.nn.Module):
+ def __init__(
+ self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6, device=None, dtype=None
+ ):
+ """
+ Initialize the RMSNorm normalization layer.
+ Args:
+ dim (int): The dimension of the input tensor.
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+ Attributes:
+ eps (float): A small value added to the denominator for numerical stability.
+ weight (nn.Parameter): Learnable scaling parameter.
+ """
+ super().__init__()
+ self.eps = eps
+ self.learnable_scale = elementwise_affine
+ if self.learnable_scale:
+ self.weight = nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
+ else:
+ self.register_parameter("weight", None)
+
+ def _norm(self, x):
+ """
+ Apply the RMSNorm normalization to the input tensor.
+ Args:
+ x (torch.Tensor): The input tensor.
+ Returns:
+ torch.Tensor: The normalized tensor.
+ """
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+ def forward(self, x):
+ """
+ Forward pass through the RMSNorm layer.
+ Args:
+ x (torch.Tensor): The input tensor.
+ Returns:
+ torch.Tensor: The output tensor after applying RMSNorm.
+ """
+ x = self._norm(x)
+ if self.learnable_scale:
+ return x * self.weight.to(device=x.device, dtype=x.dtype)
+ else:
+ return x
+
+
+class SwiGLUFeedForward(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ hidden_dim: int,
+ multiple_of: int,
+ ffn_dim_multiplier: Optional[float] = None,
+ ):
+ """
+ Initialize the FeedForward module.
+
+ Args:
+ dim (int): Input dimension.
+ hidden_dim (int): Hidden dimension of the feedforward layer.
+ multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+ ffn_dim_multiplier (float, optional): Custom multiplier for hidden dimension. Defaults to None.
+
+ Attributes:
+ w1 (ColumnParallelLinear): Linear transformation for the first layer.
+ w2 (RowParallelLinear): Linear transformation for the second layer.
+ w3 (ColumnParallelLinear): Linear transformation for the third layer.
+
+ """
+ super().__init__()
+ hidden_dim = int(2 * hidden_dim / 3)
+ # custom dim factor multiplier
+ if ffn_dim_multiplier is not None:
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+ def forward(self, x):
+ return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x))
+
+
+class DismantledBlock(nn.Module):
+ """A DiT block with gated adaptive layer norm (adaLN) conditioning."""
+
+ ATTENTION_MODES = ("xformers", "torch", "torch-hb", "math", "debug")
+
+ def __init__(
+ self,
+ hidden_size: int,
+ num_heads: int,
+ mlp_ratio: float = 4.0,
+ attn_mode: str = "xformers",
+ qkv_bias: bool = False,
+ pre_only: bool = False,
+ rmsnorm: bool = False,
+ scale_mod_only: bool = False,
+ swiglu: bool = False,
+ qk_norm: Optional[str] = None,
+ dtype=None,
+ device=None,
+ **block_kwargs,
+ ):
+ super().__init__()
+ assert attn_mode in self.ATTENTION_MODES
+ if not rmsnorm:
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+ else:
+ self.norm1 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+ self.attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, attn_mode=attn_mode, pre_only=pre_only, qk_norm=qk_norm, rmsnorm=rmsnorm, dtype=dtype, device=device)
+ if not pre_only:
+ if not rmsnorm:
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+ else:
+ self.norm2 = RMSNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
+ if not pre_only:
+ if not swiglu:
+ self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=nn.GELU(approximate="tanh"), dtype=dtype, device=device)
+ else:
+ self.mlp = SwiGLUFeedForward(dim=hidden_size, hidden_dim=mlp_hidden_dim, multiple_of=256)
+ self.scale_mod_only = scale_mod_only
+ if not scale_mod_only:
+ n_mods = 6 if not pre_only else 2
+ else:
+ n_mods = 4 if not pre_only else 1
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, n_mods * hidden_size, bias=True, dtype=dtype, device=device))
+ self.pre_only = pre_only
+
+ def pre_attention(self, x: torch.Tensor, c: torch.Tensor):
+ assert x is not None, "pre_attention called with None input"
+ if not self.pre_only:
+ if not self.scale_mod_only:
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
+ else:
+ shift_msa = None
+ shift_mlp = None
+ scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(4, dim=1)
+ qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa))
+ return qkv, (x, gate_msa, shift_mlp, scale_mlp, gate_mlp)
+ else:
+ if not self.scale_mod_only:
+ shift_msa, scale_msa = self.adaLN_modulation(c).chunk(2, dim=1)
+ else:
+ shift_msa = None
+ scale_msa = self.adaLN_modulation(c)
+ qkv = self.attn.pre_attention(modulate(self.norm1(x), shift_msa, scale_msa))
+ return qkv, None
+
+ def post_attention(self, attn, x, gate_msa, shift_mlp, scale_mlp, gate_mlp):
+ assert not self.pre_only
+ x = x + gate_msa.unsqueeze(1) * self.attn.post_attention(attn)
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
+ return x
+
+ def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+ assert not self.pre_only
+ (q, k, v), intermediates = self.pre_attention(x, c)
+ attn = attention(q, k, v, self.attn.num_heads)
+ return self.post_attention(attn, *intermediates)
+
+
+def block_mixing(context, x, context_block, x_block, c):
+ assert context is not None, "block_mixing called with None context"
+ context_qkv, context_intermediates = context_block.pre_attention(context, c)
+
+ x_qkv, x_intermediates = x_block.pre_attention(x, c)
+
+ o = []
+ for t in range(3):
+ o.append(torch.cat((context_qkv[t], x_qkv[t]), dim=1))
+ q, k, v = tuple(o)
+
+ attn = attention(q, k, v, x_block.attn.num_heads)
+ context_attn, x_attn = (attn[:, : context_qkv[0].shape[1]], attn[:, context_qkv[0].shape[1] :])
+
+ if not context_block.pre_only:
+ context = context_block.post_attention(context_attn, *context_intermediates)
+ else:
+ context = None
+ x = x_block.post_attention(x_attn, *x_intermediates)
+ return context, x
+
+
+class JointBlock(nn.Module):
+ """just a small wrapper to serve as a fsdp unit"""
+
+ def __init__(self, *args, **kwargs):
+ super().__init__()
+ pre_only = kwargs.pop("pre_only")
+ qk_norm = kwargs.pop("qk_norm", None)
+ self.context_block = DismantledBlock(*args, pre_only=pre_only, qk_norm=qk_norm, **kwargs)
+ self.x_block = DismantledBlock(*args, pre_only=False, qk_norm=qk_norm, **kwargs)
+
+ def forward(self, *args, **kwargs):
+ return block_mixing(*args, context_block=self.context_block, x_block=self.x_block, **kwargs)
+
+
+class FinalLayer(nn.Module):
+ """
+ The final layer of DiT.
+ """
+
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int, total_out_channels: Optional[int] = None, dtype=None, device=None):
+ super().__init__()
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+ self.linear = (
+ nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+ if (total_out_channels is None)
+ else nn.Linear(hidden_size, total_out_channels, bias=True, dtype=dtype, device=device)
+ )
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
+
+ def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+ x = modulate(self.norm_final(x), shift, scale)
+ x = self.linear(x)
+ return x
+
+
+class MMDiT(nn.Module):
+ """Diffusion model with a Transformer backbone."""
+
+ def __init__(
+ self,
+ input_size: int = 32,
+ patch_size: int = 2,
+ in_channels: int = 4,
+ depth: int = 28,
+ mlp_ratio: float = 4.0,
+ learn_sigma: bool = False,
+ adm_in_channels: Optional[int] = None,
+ context_embedder_config: Optional[Dict] = None,
+ register_length: int = 0,
+ attn_mode: str = "torch",
+ rmsnorm: bool = False,
+ scale_mod_only: bool = False,
+ swiglu: bool = False,
+ out_channels: Optional[int] = None,
+ pos_embed_scaling_factor: Optional[float] = None,
+ pos_embed_offset: Optional[float] = None,
+ pos_embed_max_size: Optional[int] = None,
+ num_patches = None,
+ qk_norm: Optional[str] = None,
+ qkv_bias: bool = True,
+ dtype = None,
+ device = None,
+ ):
+ super().__init__()
+ self.dtype = dtype
+ self.learn_sigma = learn_sigma
+ self.in_channels = in_channels
+ default_out_channels = in_channels * 2 if learn_sigma else in_channels
+ self.out_channels = out_channels if out_channels is not None else default_out_channels
+ self.patch_size = patch_size
+ self.pos_embed_scaling_factor = pos_embed_scaling_factor
+ self.pos_embed_offset = pos_embed_offset
+ self.pos_embed_max_size = pos_embed_max_size
+
+ # apply magic --> this defines a head_size of 64
+ hidden_size = 64 * depth
+ num_heads = depth
+
+ self.num_heads = num_heads
+
+ self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True, strict_img_size=self.pos_embed_max_size is None, dtype=dtype, device=device)
+ self.t_embedder = TimestepEmbedder(hidden_size, dtype=dtype, device=device)
+
+ if adm_in_channels is not None:
+ assert isinstance(adm_in_channels, int)
+ self.y_embedder = VectorEmbedder(adm_in_channels, hidden_size, dtype=dtype, device=device)
+
+ self.context_embedder = nn.Identity()
+ if context_embedder_config is not None:
+ if context_embedder_config["target"] == "torch.nn.Linear":
+ self.context_embedder = nn.Linear(**context_embedder_config["params"], dtype=dtype, device=device)
+
+ self.register_length = register_length
+ if self.register_length > 0:
+ self.register = nn.Parameter(torch.randn(1, register_length, hidden_size, dtype=dtype, device=device))
+
+ # num_patches = self.x_embedder.num_patches
+ # Will use fixed sin-cos embedding:
+ # just use a buffer already
+ if num_patches is not None:
+ self.register_buffer(
+ "pos_embed",
+ torch.zeros(1, num_patches, hidden_size, dtype=dtype, device=device),
+ )
+ else:
+ self.pos_embed = None
+
+ self.joint_blocks = nn.ModuleList(
+ [
+ JointBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, attn_mode=attn_mode, pre_only=i == depth - 1, rmsnorm=rmsnorm, scale_mod_only=scale_mod_only, swiglu=swiglu, qk_norm=qk_norm, dtype=dtype, device=device)
+ for i in range(depth)
+ ]
+ )
+
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels, dtype=dtype, device=device)
+
+ def cropped_pos_embed(self, hw):
+ assert self.pos_embed_max_size is not None
+ p = self.x_embedder.patch_size[0]
+ h, w = hw
+ # patched size
+ h = h // p
+ w = w // p
+ assert h <= self.pos_embed_max_size, (h, self.pos_embed_max_size)
+ assert w <= self.pos_embed_max_size, (w, self.pos_embed_max_size)
+ top = (self.pos_embed_max_size - h) // 2
+ left = (self.pos_embed_max_size - w) // 2
+ spatial_pos_embed = rearrange(
+ self.pos_embed,
+ "1 (h w) c -> 1 h w c",
+ h=self.pos_embed_max_size,
+ w=self.pos_embed_max_size,
+ )
+ spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
+ spatial_pos_embed = rearrange(spatial_pos_embed, "1 h w c -> 1 (h w) c")
+ return spatial_pos_embed
+
+ def unpatchify(self, x, hw=None):
+ """
+ x: (N, T, patch_size**2 * C)
+ imgs: (N, H, W, C)
+ """
+ c = self.out_channels
+ p = self.x_embedder.patch_size[0]
+ if hw is None:
+ h = w = int(x.shape[1] ** 0.5)
+ else:
+ h, w = hw
+ h = h // p
+ w = w // p
+ assert h * w == x.shape[1]
+
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+ x = torch.einsum("nhwpqc->nchpwq", x)
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
+ return imgs
+
+ def forward_core_with_concat(self, x: torch.Tensor, c_mod: torch.Tensor, context: Optional[torch.Tensor] = None) -> torch.Tensor:
+ if self.register_length > 0:
+ context = torch.cat((repeat(self.register, "1 ... -> b ...", b=x.shape[0]), context if context is not None else torch.Tensor([]).type_as(x)), 1)
+
+ # context is B, L', D
+ # x is B, L, D
+ for block in self.joint_blocks:
+ context, x = block(context, x, c=c_mod)
+
+ x = self.final_layer(x, c_mod) # (N, T, patch_size ** 2 * out_channels)
+ return x
+
+ def forward(self, x: torch.Tensor, t: torch.Tensor, y: Optional[torch.Tensor] = None, context: Optional[torch.Tensor] = None) -> torch.Tensor:
+ """
+ Forward pass of DiT.
+ x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+ t: (N,) tensor of diffusion timesteps
+ y: (N,) tensor of class labels
+ """
+ hw = x.shape[-2:]
+ x = self.x_embedder(x) + self.cropped_pos_embed(hw)
+ c = self.t_embedder(t, dtype=x.dtype) # (N, D)
+ if y is not None:
+ y = self.y_embedder(y) # (N, D)
+ c = c + y # (N, D)
+
+ context = self.context_embedder(context)
+
+ x = self.forward_core_with_concat(x, c, context)
+
+ x = self.unpatchify(x, hw=hw) # (N, out_channels, H, W)
+ return x
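A smoke-test sketch for the new module, assuming the webui environment is importable. The configuration is deliberately tiny (real SD3 checkpoints use depth 24+). Note that hidden_size is 64 * depth, and that context must already be projected to that width when context_embedder_config is omitted, since context_embedder defaults to nn.Identity:

import torch
from modules.models.sd3.mmdit import MMDiT

model = MMDiT(
    patch_size=2, in_channels=4, depth=2,    # hidden_size = 64 * 2 = 128
    adm_in_channels=8,                       # width of the pooled vector y
    pos_embed_max_size=16, num_patches=256,  # 16x16 positional grid to crop from
)
x = torch.randn(1, 4, 16, 16)                # latent image -> 8x8 = 64 patches
t = torch.zeros(1)                           # diffusion timestep
y = torch.randn(1, 8)                        # pooled conditioning
context = torch.randn(1, 77, 128)            # token conditioning, dim = hidden_size
out = model(x, t, y=y, context=context)
assert out.shape == (1, 4, 16, 16)           # unpatchified back to the latent shape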
diff --git a/modules/models/sd3/other_impls.py b/modules/models/sd3/other_impls.py
new file mode 100644
index 000000000..78c1dc687
--- /dev/null
+++ b/modules/models/sd3/other_impls.py
@@ -0,0 +1,510 @@
+### This file contains impls for underlying related models (CLIP, T5, etc)
+
+import torch
+import math
+from torch import nn
+from transformers import CLIPTokenizer, T5TokenizerFast
+
+from modules import sd_hijack
+
+
+#################################################################################################
+### Core/Utility
+#################################################################################################
+
+
+class AutocastLinear(nn.Linear):
+ """Same as usual linear layer, but casts its weights to whatever the parameter type is.
+
+ This is different from torch.autocast in a way that float16 layer processing float32 input
+ will return float16 with autocast on, and float32 with this. T5 seems to be fucked
+ if you do it in full float16 (returning almost all zeros in the final output).
+ """
+
+ def forward(self, x):
+ return torch.nn.functional.linear(x, self.weight.to(x.dtype), self.bias.to(x.dtype) if self.bias is not None else None)
+
+
+def attention(q, k, v, heads, mask=None):
+ """Convenience wrapper around a basic attention operation"""
+ b, _, dim_head = q.shape
+ dim_head //= heads
+ q, k, v = [t.view(b, -1, heads, dim_head).transpose(1, 2) for t in (q, k, v)]
+ out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+ return out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+
+
+class Mlp(nn.Module):
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, dtype=None, device=None):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias, dtype=dtype, device=device)
+ self.act = act_layer
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias, dtype=dtype, device=device)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.fc2(x)
+ return x
+
+
+#################################################################################################
+### CLIP
+#################################################################################################
+
+
+class CLIPAttention(torch.nn.Module):
+ def __init__(self, embed_dim, heads, dtype, device):
+ super().__init__()
+ self.heads = heads
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
+
+ def forward(self, x, mask=None):
+ q = self.q_proj(x)
+ k = self.k_proj(x)
+ v = self.v_proj(x)
+ out = attention(q, k, v, self.heads, mask)
+ return self.out_proj(out)
+
+
+ACTIVATIONS = {
+ "quick_gelu": lambda a: a * torch.sigmoid(1.702 * a),
+ "gelu": torch.nn.functional.gelu,
+}
+
+class CLIPLayer(torch.nn.Module):
+ def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device):
+ super().__init__()
+ self.layer_norm1 = nn.LayerNorm(embed_dim, dtype=dtype, device=device)
+ self.self_attn = CLIPAttention(embed_dim, heads, dtype, device)
+ self.layer_norm2 = nn.LayerNorm(embed_dim, dtype=dtype, device=device)
+ #self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device)
+ self.mlp = Mlp(embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation], dtype=dtype, device=device)
+
+ def forward(self, x, mask=None):
+ x += self.self_attn(self.layer_norm1(x), mask)
+ x += self.mlp(self.layer_norm2(x))
+ return x
+
+
+class CLIPEncoder(torch.nn.Module):
+ def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device):
+ super().__init__()
+ self.layers = torch.nn.ModuleList([CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) for i in range(num_layers)])
+
+ def forward(self, x, mask=None, intermediate_output=None):
+ if intermediate_output is not None:
+ if intermediate_output < 0:
+ intermediate_output = len(self.layers) + intermediate_output
+ intermediate = None
+ for i, layer in enumerate(self.layers):
+ x = layer(x, mask)
+ if i == intermediate_output:
+ intermediate = x.clone()
+ return x, intermediate
+
+
+class CLIPEmbeddings(torch.nn.Module):
+ def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None, textual_inversion_key="clip_l"):
+ super().__init__()
+ self.token_embedding = sd_hijack.TextualInversionEmbeddings(vocab_size, embed_dim, dtype=dtype, device=device, textual_inversion_key=textual_inversion_key)
+ self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
+
+ def forward(self, input_tokens):
+ return self.token_embedding(input_tokens) + self.position_embedding.weight
+
+
+class CLIPTextModel_(torch.nn.Module):
+ def __init__(self, config_dict, dtype, device):
+ num_layers = config_dict["num_hidden_layers"]
+ embed_dim = config_dict["hidden_size"]
+ heads = config_dict["num_attention_heads"]
+ intermediate_size = config_dict["intermediate_size"]
+ intermediate_activation = config_dict["hidden_act"]
+ super().__init__()
+ self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device, textual_inversion_key=config_dict.get('textual_inversion_key', 'clip_l'))
+ self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device)
+ self.final_layer_norm = nn.LayerNorm(embed_dim, dtype=dtype, device=device)
+
+ def forward(self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True):
+ x = self.embeddings(input_tokens)
+ causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
+ x, i = self.encoder(x, mask=causal_mask, intermediate_output=intermediate_output)
+ x = self.final_layer_norm(x)
+ if i is not None and final_layer_norm_intermediate:
+ i = self.final_layer_norm(i)
+ pooled_output = x[torch.arange(x.shape[0], device=x.device), input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),]
+ return x, i, pooled_output
+
+
+class CLIPTextModel(torch.nn.Module):
+ def __init__(self, config_dict, dtype, device):
+ super().__init__()
+ self.num_layers = config_dict["num_hidden_layers"]
+ self.text_model = CLIPTextModel_(config_dict, dtype, device)
+ embed_dim = config_dict["hidden_size"]
+ self.text_projection = nn.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
+ self.text_projection.weight.copy_(torch.eye(embed_dim))
+ self.dtype = dtype
+
+ def get_input_embeddings(self):
+ return self.text_model.embeddings.token_embedding
+
+ def set_input_embeddings(self, embeddings):
+ self.text_model.embeddings.token_embedding = embeddings
+
+ def forward(self, *args, **kwargs):
+ x = self.text_model(*args, **kwargs)
+ out = self.text_projection(x[2])
+ return (x[0], x[1], out, x[2])
+
+
+class SDTokenizer:
+ def __init__(self, max_length=77, pad_with_end=True, tokenizer=None, has_start_token=True, pad_to_max_length=True, min_length=None):
+ self.tokenizer = tokenizer
+ self.max_length = max_length
+ self.min_length = min_length
+ empty = self.tokenizer('')["input_ids"]
+ if has_start_token:
+ self.tokens_start = 1
+ self.start_token = empty[0]
+ self.end_token = empty[1]
+ else:
+ self.tokens_start = 0
+ self.start_token = None
+ self.end_token = empty[0]
+ self.pad_with_end = pad_with_end
+ self.pad_to_max_length = pad_to_max_length
+ vocab = self.tokenizer.get_vocab()
+ self.inv_vocab = {v: k for k, v in vocab.items()}
+ self.max_word_length = 8
+
+
+ def tokenize_with_weights(self, text:str):
+ """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here. The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3."""
+ if self.pad_with_end:
+ pad_token = self.end_token
+ else:
+ pad_token = 0
+ batch = []
+ if self.start_token is not None:
+ batch.append((self.start_token, 1.0))
+ to_tokenize = text.replace("\n", " ").split(' ')
+ to_tokenize = [x for x in to_tokenize if x != ""]
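+        # tokenize word by word, dropping each word's start/end special tokens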
+ for word in to_tokenize:
+ batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start:-1]])
+ batch.append((self.end_token, 1.0))
+ if self.pad_to_max_length:
+ batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch)))
+ if self.min_length is not None and len(batch) < self.min_length:
+ batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch)))
+ return [batch]
+
+
+class SDXLClipGTokenizer(SDTokenizer):
+ def __init__(self, tokenizer):
+ super().__init__(pad_with_end=False, tokenizer=tokenizer)
+
+
+class SD3Tokenizer:
+ def __init__(self):
+ clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+ self.clip_l = SDTokenizer(tokenizer=clip_tokenizer)
+ self.clip_g = SDXLClipGTokenizer(clip_tokenizer)
+ self.t5xxl = T5XXLTokenizer()
+
+ def tokenize_with_weights(self, text:str):
+ out = {}
+ out["g"] = self.clip_g.tokenize_with_weights(text)
+ out["l"] = self.clip_l.tokenize_with_weights(text)
+ out["t5xxl"] = self.t5xxl.tokenize_with_weights(text)
+ return out
+
+
+class ClipTokenWeightEncoder:
+ def encode_token_weights(self, token_weight_pairs):
+ tokens = [a[0] for a in token_weight_pairs[0]]
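+        # the per-token weights are ignored in this reference implementation; only the ids are encoded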
+ out, pooled = self([tokens])
+ if pooled is not None:
+ first_pooled = pooled[0:1].cpu()
+ else:
+ first_pooled = pooled
+ output = [out[0:1]]
+ return torch.cat(output, dim=-2).cpu(), first_pooled
+
+
+class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
+ """Uses the CLIP transformer encoder for text (from huggingface)"""
+ LAYERS = ["last", "pooled", "hidden"]
+ def __init__(self, device="cpu", max_length=77, layer="last", layer_idx=None, textmodel_json_config=None, dtype=None, model_class=CLIPTextModel,
+ special_tokens=None, layer_norm_hidden_state=True, return_projected_pooled=True):
+ super().__init__()
+ assert layer in self.LAYERS
+ self.transformer = model_class(textmodel_json_config, dtype, device)
+ self.num_layers = self.transformer.num_layers
+ self.max_length = max_length
+ self.transformer = self.transformer.eval()
+ for param in self.parameters():
+ param.requires_grad = False
+ self.layer = layer
+ self.layer_idx = None
+ self.special_tokens = special_tokens if special_tokens is not None else {"start": 49406, "end": 49407, "pad": 49407}
+ self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
+ self.layer_norm_hidden_state = layer_norm_hidden_state
+ self.return_projected_pooled = return_projected_pooled
+ if layer == "hidden":
+ assert layer_idx is not None
+ assert abs(layer_idx) < self.num_layers
+ self.set_clip_options({"layer": layer_idx})
+ self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled)
+
+ def set_clip_options(self, options):
+ layer_idx = options.get("layer", self.layer_idx)
+ self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
+ if layer_idx is None or abs(layer_idx) > self.num_layers:
+ self.layer = "last"
+ else:
+ self.layer = "hidden"
+ self.layer_idx = layer_idx
+
+ def forward(self, tokens):
+ backup_embeds = self.transformer.get_input_embeddings()
+ tokens = torch.asarray(tokens, dtype=torch.int64, device=backup_embeds.weight.device)
+ outputs = self.transformer(tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state)
+ self.transformer.set_input_embeddings(backup_embeds)
+ if self.layer == "last":
+ z = outputs[0]
+ else:
+ z = outputs[1]
+ pooled_output = None
+ if len(outputs) >= 3:
+ if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None:
+ pooled_output = outputs[3].float()
+ elif outputs[2] is not None:
+ pooled_output = outputs[2].float()
+ return z.float(), pooled_output
+
+
+class SDXLClipG(SDClipModel):
+ """Wraps the CLIP-G model into the SD-CLIP-Model interface"""
+ def __init__(self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None):
+ if layer == "penultimate":
+ layer="hidden"
+ layer_idx=-2
+ super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=config, dtype=dtype, special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False)
+
+
+class T5XXLModel(SDClipModel):
+ """Wraps the T5-XXL model into the SD-CLIP-Model interface for convenience"""
+ def __init__(self, config, device="cpu", layer="last", layer_idx=None, dtype=None):
+ super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5)
+
+
+#################################################################################################
+### T5 implementation, for the T5-XXL text encoder portion, largely pulled from upstream impl
+#################################################################################################
+
+class T5XXLTokenizer(SDTokenizer):
+ """Wraps the T5 Tokenizer from HF into the SDTokenizer interface"""
+ def __init__(self):
+ super().__init__(pad_with_end=False, tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"), has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
+
+
+class T5LayerNorm(torch.nn.Module):
+ def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None):
+ super().__init__()
+ self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device))
+ self.variance_epsilon = eps
+
+ def forward(self, x):
+ variance = x.pow(2).mean(-1, keepdim=True)
+ x = x * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight.to(device=x.device, dtype=x.dtype) * x
+
+
+class T5DenseGatedActDense(torch.nn.Module):
+ def __init__(self, model_dim, ff_dim, dtype, device):
+ super().__init__()
+ self.wi_0 = AutocastLinear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
+ self.wi_1 = AutocastLinear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
+ self.wo = AutocastLinear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
+
+ def forward(self, x):
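+        # T5 v1.1 gated feed-forward (GEGLU): GELU(wi_0(x)) * wi_1(x), projected back down by wo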
+ hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh")
+ hidden_linear = self.wi_1(x)
+ x = hidden_gelu * hidden_linear
+ x = self.wo(x)
+ return x
+
+
+class T5LayerFF(torch.nn.Module):
+ def __init__(self, model_dim, ff_dim, dtype, device):
+ super().__init__()
+ self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device)
+ self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device)
+
+ def forward(self, x):
+ forwarded_states = self.layer_norm(x)
+ forwarded_states = self.DenseReluDense(forwarded_states)
+ x += forwarded_states
+ return x
+
+
+class T5Attention(torch.nn.Module):
+ def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device):
+ super().__init__()
+ # Mesh TensorFlow initialization to avoid scaling before softmax
+ self.q = AutocastLinear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
+ self.k = AutocastLinear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
+ self.v = AutocastLinear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
+ self.o = AutocastLinear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)
+ self.num_heads = num_heads
+ self.relative_attention_bias = None
+ if relative_attention_bias:
+ self.relative_attention_num_buckets = 32
+ self.relative_attention_max_distance = 128
+ self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device)
+
+ @staticmethod
+ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+ """
+ Adapted from Mesh Tensorflow:
+ https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+ Translate relative position to a bucket number for relative attention. The relative position is defined as
+ memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
+ position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
+ small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
+ positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
+ This should allow for more graceful generalization to longer sequences than the model has been trained on
+
+ Args:
+ relative_position: an int32 Tensor
+ bidirectional: a boolean - whether the attention is bidirectional
+ num_buckets: an integer
+ max_distance: an integer
+
+ Returns:
+ a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
+ """
+ relative_buckets = 0
+ if bidirectional:
+ num_buckets //= 2
+ relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
+ relative_position = torch.abs(relative_position)
+ else:
+ relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
+ # now relative_position is in the range [0, inf)
+ # half of the buckets are for exact increments in positions
+ max_exact = num_buckets // 2
+ is_small = relative_position < max_exact
+ # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+ relative_position_if_large = max_exact + (
+ torch.log(relative_position.float() / max_exact)
+ / math.log(max_distance / max_exact)
+ * (num_buckets - max_exact)
+ ).to(torch.long)
+ relative_position_if_large = torch.min(relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1))
+ relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
+ return relative_buckets
+
+ def compute_bias(self, query_length, key_length, device):
+ """Compute binned relative position bias"""
+ context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
+ memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
+ relative_position = memory_position - context_position # shape (query_length, key_length)
+ relative_position_bucket = self._relative_position_bucket(
+ relative_position, # shape (query_length, key_length)
+ bidirectional=True,
+ num_buckets=self.relative_attention_num_buckets,
+ max_distance=self.relative_attention_max_distance,
+ )
+ values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads)
+ values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length)
+ return values
+
+ def forward(self, x, past_bias=None):
+ q = self.q(x)
+ k = self.k(x)
+ v = self.v(x)
+
+ if self.relative_attention_bias is not None:
+ past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device)
+ if past_bias is not None:
+ mask = past_bias
+ else:
+ mask = None
+
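+        # T5 attention is unscaled; multiplying k by sqrt(d_head) is assumed to cancel the
+        # 1/sqrt(d_head) scaling applied inside the shared attention() helper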
+ out = attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask.to(x.dtype) if mask is not None else None)
+
+ return self.o(out), past_bias
+
+
+class T5LayerSelfAttention(torch.nn.Module):
+ def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device):
+ super().__init__()
+ self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device)
+ self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device)
+
+ def forward(self, x, past_bias=None):
+ output, past_bias = self.SelfAttention(self.layer_norm(x), past_bias=past_bias)
+ x += output
+ return x, past_bias
+
+
+class T5Block(torch.nn.Module):
+ def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device):
+ super().__init__()
+ self.layer = torch.nn.ModuleList()
+ self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device))
+ self.layer.append(T5LayerFF(model_dim, ff_dim, dtype, device))
+
+ def forward(self, x, past_bias=None):
+ x, past_bias = self.layer[0](x, past_bias)
+ x = self.layer[-1](x)
+ return x, past_bias
+
+
+class T5Stack(torch.nn.Module):
+ def __init__(self, num_layers, model_dim, inner_dim, ff_dim, num_heads, vocab_size, dtype, device):
+ super().__init__()
+ self.embed_tokens = torch.nn.Embedding(vocab_size, model_dim, device=device)
+ self.block = torch.nn.ModuleList([T5Block(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device) for i in range(num_layers)])
+ self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device)
+
+ def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True):
+ intermediate = None
+ x = self.embed_tokens(input_ids).to(torch.float32) # needs float32 or else T5 returns all zeroes
+ past_bias = None
+ for i, layer in enumerate(self.block):
+ x, past_bias = layer(x, past_bias)
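+            # clone the snapshot so later in-place residual adds don't overwrite the captured intermediate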
+ if i == intermediate_output:
+ intermediate = x.clone()
+ x = self.final_layer_norm(x)
+ if intermediate is not None and final_layer_norm_intermediate:
+ intermediate = self.final_layer_norm(intermediate)
+ return x, intermediate
+
+
+class T5(torch.nn.Module):
+ def __init__(self, config_dict, dtype, device):
+ super().__init__()
+ self.num_layers = config_dict["num_layers"]
+ self.encoder = T5Stack(self.num_layers, config_dict["d_model"], config_dict["d_model"], config_dict["d_ff"], config_dict["num_heads"], config_dict["vocab_size"], dtype, device)
+ self.dtype = dtype
+
+ def get_input_embeddings(self):
+ return self.encoder.embed_tokens
+
+ def set_input_embeddings(self, embeddings):
+ self.encoder.embed_tokens = embeddings
+
+ def forward(self, *args, **kwargs):
+ return self.encoder(*args, **kwargs)
diff --git a/modules/models/sd3/sd3_cond.py b/modules/models/sd3/sd3_cond.py
new file mode 100644
index 000000000..325c512d5
--- /dev/null
+++ b/modules/models/sd3/sd3_cond.py
@@ -0,0 +1,222 @@
+import os
+import safetensors
+import torch
+import typing
+
+from transformers import CLIPTokenizer, T5TokenizerFast
+
+from modules import shared, devices, modelloader, sd_hijack_clip, prompt_parser
+from modules.models.sd3.other_impls import SDClipModel, SDXLClipG, T5XXLModel, SD3Tokenizer
+
+
+class SafetensorsMapping(typing.Mapping):
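+    """Read-only Mapping over a safetensors file, letting load_state_dict pull tensors lazily instead of reading the whole file into memory"""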
+ def __init__(self, file):
+ self.file = file
+
+ def __len__(self):
+ return len(self.file.keys())
+
+ def __iter__(self):
+ for key in self.file.keys():
+ yield key
+
+ def __getitem__(self, key):
+ return self.file.get_tensor(key)
+
+
+CLIPL_URL = "https://huggingface.co/AUTOMATIC/stable-diffusion-3-medium-text-encoders/resolve/main/clip_l.safetensors"
+CLIPL_CONFIG = {
+ "hidden_act": "quick_gelu",
+ "hidden_size": 768,
+ "intermediate_size": 3072,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+}
+
+CLIPG_URL = "https://huggingface.co/AUTOMATIC/stable-diffusion-3-medium-text-encoders/resolve/main/clip_g.safetensors"
+CLIPG_CONFIG = {
+ "hidden_act": "gelu",
+ "hidden_size": 1280,
+ "intermediate_size": 5120,
+ "num_attention_heads": 20,
+ "num_hidden_layers": 32,
+ "textual_inversion_key": "clip_g",
+}
+
+T5_URL = "https://huggingface.co/AUTOMATIC/stable-diffusion-3-medium-text-encoders/resolve/main/t5xxl_fp16.safetensors"
+T5_CONFIG = {
+ "d_ff": 10240,
+ "d_model": 4096,
+ "num_heads": 64,
+ "num_layers": 24,
+ "vocab_size": 32128,
+}
+
+
+class Sd3ClipLG(sd_hijack_clip.TextConditionalModel):
+ def __init__(self, clip_l, clip_g):
+ super().__init__()
+
+ self.clip_l = clip_l
+ self.clip_g = clip_g
+
+ self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+
+ empty = self.tokenizer('')["input_ids"]
+ self.id_start = empty[0]
+ self.id_end = empty[1]
+ self.id_pad = empty[1]
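+        # CLIP-L pads with the end-of-text token, so the pad id equals the end id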
+
+ self.return_pooled = True
+
+ def tokenize(self, texts):
+ return self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
+
+ def encode_with_transformers(self, tokens):
+ tokens_g = tokens.clone()
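+        # CLIP-G expects pad id 0, so zero out everything after the first end-of-text token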
+
+ for batch_pos in range(tokens_g.shape[0]):
+ index = tokens_g[batch_pos].cpu().tolist().index(self.id_end)
+ tokens_g[batch_pos, index+1:tokens_g.shape[1]] = 0
+
+ l_out, l_pooled = self.clip_l(tokens)
+ g_out, g_pooled = self.clip_g(tokens_g)
+
+ lg_out = torch.cat([l_out, g_out], dim=-1)
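+        # zero-pad the 2048 (768 + 1280) CLIP channels up to T5's 4096 so the CLIP and T5
+        # outputs can later be concatenated along the sequence axis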
+ lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1]))
+
+ vector_out = torch.cat((l_pooled, g_pooled), dim=-1)
+
+ lg_out.pooled = vector_out
+ return lg_out
+
+ def encode_embedding_init_text(self, init_text, nvpt):
+ return torch.zeros((nvpt, 768+1280), device=devices.device) # XXX
+
+
+class Sd3T5(torch.nn.Module):
+ def __init__(self, t5xxl):
+ super().__init__()
+
+ self.t5xxl = t5xxl
+ self.tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl")
+
+ empty = self.tokenizer('', padding='max_length', max_length=2)["input_ids"]
+ self.id_end = empty[0]
+ self.id_pad = empty[1]
+
+ def tokenize(self, texts):
+ return self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"]
+
+ def tokenize_line(self, line, *, target_token_count=None):
+ if shared.opts.emphasis != "None":
+ parsed = prompt_parser.parse_prompt_attention(line)
+ else:
+ parsed = [[line, 1.0]]
+
+ tokenized = self.tokenize([text for text, _ in parsed])
+
+ tokens = []
+ multipliers = []
+
+ for text_tokens, (text, weight) in zip(tokenized, parsed):
+ if text == 'BREAK' and weight == -1:
+ continue
+
+ tokens += text_tokens
+ multipliers += [weight] * len(text_tokens)
+
+ tokens += [self.id_end]
+ multipliers += [1.0]
+
+ if target_token_count is not None:
+            if len(tokens) < target_token_count:
+                # compute the pad count before extending so tokens and multipliers stay the same length
+                pad_count = target_token_count - len(tokens)
+                tokens += [self.id_pad] * pad_count
+                multipliers += [1.0] * pad_count
+ else:
+ tokens = tokens[0:target_token_count]
+ multipliers = multipliers[0:target_token_count]
+
+ return tokens, multipliers
+
+ def forward(self, texts, *, token_count):
+ if not self.t5xxl or not shared.opts.sd3_enable_t5:
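+            # with T5 disabled, substitute zeros of the expected shape so downstream concatenation still works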
+ return torch.zeros((len(texts), token_count, 4096), device=devices.device, dtype=devices.dtype)
+
+ tokens_batch = []
+
+ for text in texts:
+ tokens, multipliers = self.tokenize_line(text, target_token_count=token_count)
+ tokens_batch.append(tokens)
+
+ t5_out, t5_pooled = self.t5xxl(tokens_batch)
+
+ return t5_out
+
+ def encode_embedding_init_text(self, init_text, nvpt):
+ return torch.zeros((nvpt, 4096), device=devices.device) # XXX
+
+
+class SD3Cond(torch.nn.Module):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.tokenizer = SD3Tokenizer()
+
+ with torch.no_grad():
+ self.clip_g = SDXLClipG(CLIPG_CONFIG, device="cpu", dtype=devices.dtype)
+ self.clip_l = SDClipModel(layer="hidden", layer_idx=-2, device="cpu", dtype=devices.dtype, layer_norm_hidden_state=False, return_projected_pooled=False, textmodel_json_config=CLIPL_CONFIG)
+
+ if shared.opts.sd3_enable_t5:
+ self.t5xxl = T5XXLModel(T5_CONFIG, device="cpu", dtype=devices.dtype)
+ else:
+ self.t5xxl = None
+
+ self.model_lg = Sd3ClipLG(self.clip_l, self.clip_g)
+ self.model_t5 = Sd3T5(self.t5xxl)
+
+ def forward(self, prompts: list[str]):
+ with devices.without_autocast():
+ lg_out, vector_out = self.model_lg(prompts)
+ t5_out = self.model_t5(prompts, token_count=lg_out.shape[1])
+ lgt_out = torch.cat([lg_out, t5_out], dim=-2)
+
+ return {
+ 'crossattn': lgt_out,
+ 'vector': vector_out,
+ }
+
+ def before_load_weights(self, state_dict):
+ clip_path = os.path.join(shared.models_path, "CLIP")
+
+ if 'text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight' not in state_dict:
+ clip_g_file = modelloader.load_file_from_url(CLIPG_URL, model_dir=clip_path, file_name="clip_g.safetensors")
+ with safetensors.safe_open(clip_g_file, framework="pt") as file:
+ self.clip_g.transformer.load_state_dict(SafetensorsMapping(file))
+
+ if 'text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight' not in state_dict:
+ clip_l_file = modelloader.load_file_from_url(CLIPL_URL, model_dir=clip_path, file_name="clip_l.safetensors")
+ with safetensors.safe_open(clip_l_file, framework="pt") as file:
+ self.clip_l.transformer.load_state_dict(SafetensorsMapping(file), strict=False)
+
+ if self.t5xxl and 'text_encoders.t5xxl.transformer.encoder.embed_tokens.weight' not in state_dict:
+ t5_file = modelloader.load_file_from_url(T5_URL, model_dir=clip_path, file_name="t5xxl_fp16.safetensors")
+ with safetensors.safe_open(t5_file, framework="pt") as file:
+ self.t5xxl.transformer.load_state_dict(SafetensorsMapping(file), strict=False)
+
+ def encode_embedding_init_text(self, init_text, nvpt):
+ return self.model_lg.encode_embedding_init_text(init_text, nvpt)
+
+ def tokenize(self, texts):
+ return self.model_lg.tokenize(texts)
+
+ def medvram_modules(self):
+ return [self.clip_g, self.clip_l, self.t5xxl]
+
+ def get_token_count(self, text):
+ _, token_count = self.model_lg.process_texts([text])
+
+ return token_count
+
+ def get_target_prompt_token_count(self, token_count):
+ return self.model_lg.get_target_prompt_token_count(token_count)
diff --git a/modules/models/sd3/sd3_impls.py b/modules/models/sd3/sd3_impls.py
new file mode 100644
index 000000000..59f11b2cb
--- /dev/null
+++ b/modules/models/sd3/sd3_impls.py
@@ -0,0 +1,374 @@
+### Impls of the SD3 core diffusion model and VAE
+
+import torch
+import math
+import einops
+from modules.models.sd3.mmdit import MMDiT
+from PIL import Image
+
+
+#################################################################################################
+### MMDiT Model Wrapping
+#################################################################################################
+
+
+class ModelSamplingDiscreteFlow(torch.nn.Module):
+    """Helper for sampler scheduling (i.e. timestep/sigma calculations) for Discrete Flow models"""
+ def __init__(self, shift=1.0):
+ super().__init__()
+ self.shift = shift
+ timesteps = 1000
+ ts = self.sigma(torch.arange(1, timesteps + 1, 1))
+ self.register_buffer('sigmas', ts)
+
+ @property
+ def sigma_min(self):
+ return self.sigmas[0]
+
+ @property
+ def sigma_max(self):
+ return self.sigmas[-1]
+
+ def timestep(self, sigma):
+ return sigma * 1000
+
+ def sigma(self, timestep: torch.Tensor):
+ timestep = timestep / 1000.0
+ if self.shift == 1.0:
+ return timestep
+ return self.shift * timestep / (1 + (self.shift - 1) * timestep)
+
+ def calculate_denoised(self, sigma, model_output, model_input):
+ sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
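+        # the model output is treated as a velocity, so the denoised sample is x0 = x - sigma * v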
+ return model_input - model_output * sigma
+
+ def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
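+        # rectified-flow forward process: linear interpolation between pure noise (sigma=1) and the clean latent (sigma=0)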
+ return sigma * noise + (1.0 - sigma) * latent_image
+
+
+class BaseModel(torch.nn.Module):
+ """Wrapper around the core MM-DiT model"""
+ def __init__(self, shift=1.0, device=None, dtype=torch.float32, state_dict=None, prefix=""):
+ super().__init__()
+ # Important configuration values can be quickly determined by checking shapes in the source file
+        # Some of these will vary between models (e.g. 2B vs 8B primarily differ in depth, but other details change too)
+ patch_size = state_dict[f"{prefix}x_embedder.proj.weight"].shape[2]
+ depth = state_dict[f"{prefix}x_embedder.proj.weight"].shape[0] // 64
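+        # MMDiT's hidden size is 64 * depth, so depth is recoverable from the patch embedding's output channels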
+ num_patches = state_dict[f"{prefix}pos_embed"].shape[1]
+ pos_embed_max_size = round(math.sqrt(num_patches))
+ adm_in_channels = state_dict[f"{prefix}y_embedder.mlp.0.weight"].shape[1]
+ context_shape = state_dict[f"{prefix}context_embedder.weight"].shape
+ context_embedder_config = {
+ "target": "torch.nn.Linear",
+ "params": {
+ "in_features": context_shape[1],
+ "out_features": context_shape[0]
+ }
+ }
+ self.diffusion_model = MMDiT(input_size=None, pos_embed_scaling_factor=None, pos_embed_offset=None, pos_embed_max_size=pos_embed_max_size, patch_size=patch_size, in_channels=16, depth=depth, num_patches=num_patches, adm_in_channels=adm_in_channels, context_embedder_config=context_embedder_config, device=device, dtype=dtype)
+ self.model_sampling = ModelSamplingDiscreteFlow(shift=shift)
+ self.depth = depth
+
+ def apply_model(self, x, sigma, c_crossattn=None, y=None):
+ dtype = self.get_dtype()
+ timestep = self.model_sampling.timestep(sigma).float()
+ model_output = self.diffusion_model(x.to(dtype), timestep, context=c_crossattn.to(dtype), y=y.to(dtype)).float()
+ return self.model_sampling.calculate_denoised(sigma, model_output, x)
+
+ def forward(self, *args, **kwargs):
+ return self.apply_model(*args, **kwargs)
+
+ def get_dtype(self):
+ return self.diffusion_model.dtype
+
+
+class CFGDenoiser(torch.nn.Module):
+ """Helper for applying CFG Scaling to diffusion outputs"""
+ def __init__(self, model):
+ super().__init__()
+ self.model = model
+
+ def forward(self, x, timestep, cond, uncond, cond_scale):
+ # Run cond and uncond in a batch together
+ batched = self.model.apply_model(torch.cat([x, x]), torch.cat([timestep, timestep]), c_crossattn=torch.cat([cond["c_crossattn"], uncond["c_crossattn"]]), y=torch.cat([cond["y"], uncond["y"]]))
+ # Then split and apply CFG Scaling
+ pos_out, neg_out = batched.chunk(2)
+ scaled = neg_out + (pos_out - neg_out) * cond_scale
+ return scaled
+
+
+class SD3LatentFormat:
+    """SD3 latents are slightly shifted from center, so this class shifts/scales them between the model's latent space and the VAE's: process_in after VAE encode, process_out before VAE decode"""
+ def __init__(self):
+ self.scale_factor = 1.5305
+ self.shift_factor = 0.0609
+
+ def process_in(self, latent):
+ return (latent - self.shift_factor) * self.scale_factor
+
+ def process_out(self, latent):
+ return (latent / self.scale_factor) + self.shift_factor
+
+ def decode_latent_to_preview(self, x0):
+        """Quick approximate RGB preview of SD3 latents"""
+ factors = torch.tensor([
+ [-0.0645, 0.0177, 0.1052], [ 0.0028, 0.0312, 0.0650],
+ [ 0.1848, 0.0762, 0.0360], [ 0.0944, 0.0360, 0.0889],
+ [ 0.0897, 0.0506, -0.0364], [-0.0020, 0.1203, 0.0284],
+ [ 0.0855, 0.0118, 0.0283], [-0.0539, 0.0658, 0.1047],
+ [-0.0057, 0.0116, 0.0700], [-0.0412, 0.0281, -0.0039],
+ [ 0.1106, 0.1171, 0.1220], [-0.0248, 0.0682, -0.0481],
+ [ 0.0815, 0.0846, 0.1207], [-0.0120, -0.0055, -0.0867],
+ [-0.0749, -0.0634, -0.0456], [-0.1418, -0.1457, -0.1259]
+ ], device="cpu")
+ latent_image = x0[0].permute(1, 2, 0).cpu() @ factors
+
+ latents_ubyte = (((latent_image + 1) / 2)
+ .clamp(0, 1) # change scale from -1..1 to 0..1
+ .mul(0xFF) # to 0..255
+ .byte()).cpu()
+
+ return Image.fromarray(latents_ubyte.numpy())
+
+
+#################################################################################################
+### K-Diffusion Sampling
+#################################################################################################
+
+
+def append_dims(x, target_dims):
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+ dims_to_append = target_dims - x.ndim
+ return x[(...,) + (None,) * dims_to_append]
+
+
+def to_d(x, sigma, denoised):
+ """Converts a denoiser output to a Karras ODE derivative."""
+ return (x - denoised) / append_dims(sigma, x.ndim)
+
+
+@torch.no_grad()
+@torch.autocast("cuda", dtype=torch.float16)
+def sample_euler(model, x, sigmas, extra_args=None):
+ """Implements Algorithm 2 (Euler steps) from Karras et al. (2022)."""
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones([x.shape[0]])
+ for i in range(len(sigmas) - 1):
+ sigma_hat = sigmas[i]
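+        # no churn is applied in this reference sampler, so sigma_hat is simply the current sigma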
+ denoised = model(x, sigma_hat * s_in, **extra_args)
+ d = to_d(x, sigma_hat, denoised)
+ dt = sigmas[i + 1] - sigma_hat
+ # Euler method
+ x = x + d * dt
+ return x
+
+
+#################################################################################################
+### VAE
+#################################################################################################
+
+
+def Normalize(in_channels, num_groups=32, dtype=torch.float32, device=None):
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)
+
+
+class ResnetBlock(torch.nn.Module):
+ def __init__(self, *, in_channels, out_channels=None, dtype=torch.float32, device=None):
+ super().__init__()
+ self.in_channels = in_channels
+ out_channels = in_channels if out_channels is None else out_channels
+ self.out_channels = out_channels
+
+ self.norm1 = Normalize(in_channels, dtype=dtype, device=device)
+ self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+ self.norm2 = Normalize(out_channels, dtype=dtype, device=device)
+ self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+ if self.in_channels != self.out_channels:
+ self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+ else:
+ self.nin_shortcut = None
+ self.swish = torch.nn.SiLU(inplace=True)
+
+ def forward(self, x):
+ hidden = x
+ hidden = self.norm1(hidden)
+ hidden = self.swish(hidden)
+ hidden = self.conv1(hidden)
+ hidden = self.norm2(hidden)
+ hidden = self.swish(hidden)
+ hidden = self.conv2(hidden)
+ if self.in_channels != self.out_channels:
+ x = self.nin_shortcut(x)
+ return x + hidden
+
+
+class AttnBlock(torch.nn.Module):
+ def __init__(self, in_channels, dtype=torch.float32, device=None):
+ super().__init__()
+ self.norm = Normalize(in_channels, dtype=dtype, device=device)
+ self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+ self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+ self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+ self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0, dtype=dtype, device=device)
+
+ def forward(self, x):
+ hidden = self.norm(x)
+ q = self.q(hidden)
+ k = self.k(hidden)
+ v = self.v(hidden)
+ b, c, h, w = q.shape
+ q, k, v = [einops.rearrange(x, "b c h w -> b 1 (h w) c").contiguous() for x in (q, k, v)]
+        hidden = torch.nn.functional.scaled_dot_product_attention(q, k, v)  # scale is dim ** -0.5 by default
+ hidden = einops.rearrange(hidden, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+ hidden = self.proj_out(hidden)
+ return x + hidden
+
+
+class Downsample(torch.nn.Module):
+ def __init__(self, in_channels, dtype=torch.float32, device=None):
+ super().__init__()
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0, dtype=dtype, device=device)
+
+ def forward(self, x):
+ pad = (0,1,0,1)
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+ x = self.conv(x)
+ return x
+
+
+class Upsample(torch.nn.Module):
+ def __init__(self, in_channels, dtype=torch.float32, device=None):
+ super().__init__()
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+
+ def forward(self, x):
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+ x = self.conv(x)
+ return x
+
+
+class VAEEncoder(torch.nn.Module):
+ def __init__(self, ch=128, ch_mult=(1,2,4,4), num_res_blocks=2, in_channels=3, z_channels=16, dtype=torch.float32, device=None):
+ super().__init__()
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ # downsampling
+ self.conv_in = torch.nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+ in_ch_mult = (1,) + tuple(ch_mult)
+ self.in_ch_mult = in_ch_mult
+ self.down = torch.nn.ModuleList()
+ for i_level in range(self.num_resolutions):
+ block = torch.nn.ModuleList()
+ attn = torch.nn.ModuleList()
+ block_in = ch*in_ch_mult[i_level]
+ block_out = ch*ch_mult[i_level]
+ for _ in range(num_res_blocks):
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device))
+ block_in = block_out
+ down = torch.nn.Module()
+ down.block = block
+ down.attn = attn
+ if i_level != self.num_resolutions - 1:
+ down.downsample = Downsample(block_in, dtype=dtype, device=device)
+ self.down.append(down)
+ # middle
+ self.mid = torch.nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+ self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+ # end
+ self.norm_out = Normalize(block_in, dtype=dtype, device=device)
+ self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+ self.swish = torch.nn.SiLU(inplace=True)
+
+ def forward(self, x):
+ # downsampling
+ hs = [self.conv_in(x)]
+ for i_level in range(self.num_resolutions):
+ for i_block in range(self.num_res_blocks):
+ h = self.down[i_level].block[i_block](hs[-1])
+ hs.append(h)
+ if i_level != self.num_resolutions-1:
+ hs.append(self.down[i_level].downsample(hs[-1]))
+ # middle
+ h = hs[-1]
+ h = self.mid.block_1(h)
+ h = self.mid.attn_1(h)
+ h = self.mid.block_2(h)
+ # end
+ h = self.norm_out(h)
+ h = self.swish(h)
+ h = self.conv_out(h)
+ return h
+
+
+class VAEDecoder(torch.nn.Module):
+ def __init__(self, ch=128, out_ch=3, ch_mult=(1, 2, 4, 4), num_res_blocks=2, resolution=256, z_channels=16, dtype=torch.float32, device=None):
+ super().__init__()
+ self.num_resolutions = len(ch_mult)
+ self.num_res_blocks = num_res_blocks
+ block_in = ch * ch_mult[self.num_resolutions - 1]
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
+ # z to block_in
+ self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+ # middle
+ self.mid = torch.nn.Module()
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+ self.mid.attn_1 = AttnBlock(block_in, dtype=dtype, device=device)
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dtype=dtype, device=device)
+ # upsampling
+ self.up = torch.nn.ModuleList()
+ for i_level in reversed(range(self.num_resolutions)):
+ block = torch.nn.ModuleList()
+ block_out = ch * ch_mult[i_level]
+ for _ in range(self.num_res_blocks + 1):
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dtype=dtype, device=device))
+ block_in = block_out
+ up = torch.nn.Module()
+ up.block = block
+ if i_level != 0:
+ up.upsample = Upsample(block_in, dtype=dtype, device=device)
+ curr_res = curr_res * 2
+ self.up.insert(0, up) # prepend to get consistent order
+ # end
+ self.norm_out = Normalize(block_in, dtype=dtype, device=device)
+ self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1, dtype=dtype, device=device)
+ self.swish = torch.nn.SiLU(inplace=True)
+
+ def forward(self, z):
+ # z to block_in
+ hidden = self.conv_in(z)
+ # middle
+ hidden = self.mid.block_1(hidden)
+ hidden = self.mid.attn_1(hidden)
+ hidden = self.mid.block_2(hidden)
+ # upsampling
+ for i_level in reversed(range(self.num_resolutions)):
+ for i_block in range(self.num_res_blocks + 1):
+ hidden = self.up[i_level].block[i_block](hidden)
+ if i_level != 0:
+ hidden = self.up[i_level].upsample(hidden)
+ # end
+ hidden = self.norm_out(hidden)
+ hidden = self.swish(hidden)
+ hidden = self.conv_out(hidden)
+ return hidden
+
+
+class SDVAE(torch.nn.Module):
+ def __init__(self, dtype=torch.float32, device=None):
+ super().__init__()
+ self.encoder = VAEEncoder(dtype=dtype, device=device)
+ self.decoder = VAEDecoder(dtype=dtype, device=device)
+
+ @torch.autocast("cuda", dtype=torch.float16)
+ def decode(self, latent):
+ return self.decoder(latent)
+
+ @torch.autocast("cuda", dtype=torch.float16)
+ def encode(self, image):
+ hidden = self.encoder(image)
+ mean, logvar = torch.chunk(hidden, 2, dim=1)
+ logvar = torch.clamp(logvar, -30.0, 20.0)
+ std = torch.exp(0.5 * logvar)
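+        # sample from the diagonal Gaussian posterior via the reparameterization trick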
+ return mean + std * torch.randn_like(mean)
diff --git a/modules/models/sd3/sd3_model.py b/modules/models/sd3/sd3_model.py
new file mode 100644
index 000000000..37cf85eb3
--- /dev/null
+++ b/modules/models/sd3/sd3_model.py
@@ -0,0 +1,96 @@
+import contextlib
+
+import torch
+
+import k_diffusion
+from modules.models.sd3.sd3_impls import BaseModel, SDVAE, SD3LatentFormat
+from modules.models.sd3.sd3_cond import SD3Cond
+
+from modules import shared, devices
+
+
+class SD3Denoiser(k_diffusion.external.DiscreteSchedule):
+ def __init__(self, inner_model, sigmas):
+ super().__init__(sigmas, quantize=shared.opts.enable_quantization)
+ self.inner_model = inner_model
+
+ def forward(self, input, sigma, **kwargs):
+ return self.inner_model.apply_model(input, sigma, **kwargs)
+
+
+class SD3Inferencer(torch.nn.Module):
+ def __init__(self, state_dict, shift=3, use_ema=False):
+ super().__init__()
+
+ self.shift = shift
+
+ with torch.no_grad():
+ self.model = BaseModel(shift=shift, state_dict=state_dict, prefix="model.diffusion_model.", device="cpu", dtype=devices.dtype)
+ self.first_stage_model = SDVAE(device="cpu", dtype=devices.dtype_vae)
+ self.first_stage_model.dtype = self.model.diffusion_model.dtype
+
+ self.alphas_cumprod = 1 / (self.model.model_sampling.sigmas ** 2 + 1)
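+        # inverse of the k-diffusion relation sigma^2 = (1 - alpha_cumprod) / alpha_cumprod,
+        # kept so code paths expecting a DDPM-style schedule keep working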
+
+ self.text_encoders = SD3Cond()
+ self.cond_stage_key = 'txt'
+
+ self.parameterization = "eps"
+ self.model.conditioning_key = "crossattn"
+
+ self.latent_format = SD3LatentFormat()
+ self.latent_channels = 16
+
+ @property
+ def cond_stage_model(self):
+ return self.text_encoders
+
+ def before_load_weights(self, state_dict):
+ self.cond_stage_model.before_load_weights(state_dict)
+
+ def ema_scope(self):
+ return contextlib.nullcontext()
+
+ def get_learned_conditioning(self, batch: list[str]):
+ return self.cond_stage_model(batch)
+
+ def apply_model(self, x, t, cond):
+ return self.model(x, t, c_crossattn=cond['crossattn'], y=cond['vector'])
+
+ def decode_first_stage(self, latent):
+ latent = self.latent_format.process_out(latent)
+ return self.first_stage_model.decode(latent)
+
+ def encode_first_stage(self, image):
+ latent = self.first_stage_model.encode(image)
+ return self.latent_format.process_in(latent)
+
+ def get_first_stage_encoding(self, x):
+ return x
+
+ def create_denoiser(self):
+ return SD3Denoiser(self, self.model.model_sampling.sigmas)
+
+ def medvram_fields(self):
+ return [
+ (self, 'first_stage_model'),
+ (self, 'text_encoders'),
+ (self, 'model'),
+ ]
+
+ def add_noise_to_latent(self, x, noise, amount):
+ return x * (1 - amount) + noise * amount
+
+ def fix_dimensions(self, width, height):
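+        # snap dimensions to multiples of 16: the 8x VAE downscale times the MMDiT patch size of 2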
+ return width // 16 * 16, height // 16 * 16
+
+ def diffusers_weight_mapping(self):
+ for i in range(self.model.depth):
+ yield f"transformer.transformer_blocks.{i}.attn.to_q", f"diffusion_model_joint_blocks_{i}_x_block_attn_qkv_q_proj"
+ yield f"transformer.transformer_blocks.{i}.attn.to_k", f"diffusion_model_joint_blocks_{i}_x_block_attn_qkv_k_proj"
+ yield f"transformer.transformer_blocks.{i}.attn.to_v", f"diffusion_model_joint_blocks_{i}_x_block_attn_qkv_v_proj"
+ yield f"transformer.transformer_blocks.{i}.attn.to_out.0", f"diffusion_model_joint_blocks_{i}_x_block_attn_proj"
+
+ yield f"transformer.transformer_blocks.{i}.attn.add_q_proj", f"diffusion_model_joint_blocks_{i}_context_block.attn_qkv_q_proj"
+ yield f"transformer.transformer_blocks.{i}.attn.add_k_proj", f"diffusion_model_joint_blocks_{i}_context_block.attn_qkv_k_proj"
+ yield f"transformer.transformer_blocks.{i}.attn.add_v_proj", f"diffusion_model_joint_blocks_{i}_context_block.attn_qkv_v_proj"
+ yield f"transformer.transformer_blocks.{i}.attn.add_out_proj.0", f"diffusion_model_joint_blocks_{i}_context_block_attn_proj"
diff --git a/modules/paths_internal.py b/modules/paths_internal.py
index cf9da45ab..67521f5cd 100644
--- a/modules/paths_internal.py
+++ b/modules/paths_internal.py
@@ -24,11 +24,12 @@ default_sd_model_file = sd_model_file
# Parse the --data-dir flag first so we can use it as a base for our other argument default values
parser_pre = argparse.ArgumentParser(add_help=False)
parser_pre.add_argument("--data-dir", type=str, default=os.path.dirname(modules_path), help="base path where all user data is stored", )
+parser_pre.add_argument("--models-dir", type=str, default=None, help="base path where models are stored; overrides --data-dir", )
cmd_opts_pre = parser_pre.parse_known_args()[0]
data_path = cmd_opts_pre.data_dir
-models_path = os.path.join(data_path, "models")
+models_path = cmd_opts_pre.models_dir if cmd_opts_pre.models_dir else os.path.join(data_path, "models")
extensions_dir = os.path.join(data_path, "extensions")
extensions_builtin_dir = os.path.join(script_path, "extensions-builtin")
config_states_dir = os.path.join(script_path, "config_states")
diff --git a/modules/postprocessing.py b/modules/postprocessing.py
index 812cbccae..a413d1027 100644
--- a/modules/postprocessing.py
+++ b/modules/postprocessing.py
@@ -51,7 +51,7 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir,
shared.state.textinfo = name
shared.state.skipped = False
- if shared.state.interrupted:
+ if shared.state.interrupted or shared.state.stopping_generation:
break
if isinstance(image_placeholder, str):
@@ -62,11 +62,13 @@ def run_postprocessing(extras_mode, image, image_folder, input_dir, output_dir,
else:
image_data = image_placeholder
+ image_data = image_data if image_data.mode in ("RGBA", "RGB") else image_data.convert("RGB")
+
parameters, existing_pnginfo = images.read_info_from_image(image_data)
if parameters:
existing_pnginfo["parameters"] = parameters
- initial_pp = scripts_postprocessing.PostprocessedImage(image_data if image_data.mode in ("RGBA", "RGB") else image_data.convert("RGB"))
+ initial_pp = scripts_postprocessing.PostprocessedImage(image_data)
scripts.scripts_postproc.run(initial_pp, args)
diff --git a/modules/processing.py b/modules/processing.py
index 76557dd7f..7535b56e1 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -16,7 +16,7 @@ from skimage import exposure
from typing import Any
import modules.sd_hijack
-from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext_utils, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng
+from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext_utils, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng, profiling
from modules.rng import slerp # noqa: F401
from modules.sd_hijack import model_hijack
from modules.sd_samplers_common import images_tensor_to_samples, decode_first_stage, approximation_indexes
@@ -115,20 +115,17 @@ def txt2img_image_conditioning(sd_model, x, width, height):
return x.new_zeros(x.shape[0], 2*sd_model.noise_augmentor.time_embed.dim, dtype=x.dtype, device=x.device)
else:
- sd = sd_model.model.state_dict()
- diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
- if diffusion_model_input is not None:
- if diffusion_model_input.shape[1] == 9:
- # The "masked-image" in this case will just be all 0.5 since the entire image is masked.
- image_conditioning = torch.ones(x.shape[0], 3, height, width, device=x.device) * 0.5
- image_conditioning = images_tensor_to_samples(image_conditioning,
- approximation_indexes.get(opts.sd_vae_encode_method))
+ if sd_model.is_sdxl_inpaint:
+ # The "masked-image" in this case will just be all 0.5 since the entire image is masked.
+ image_conditioning = torch.ones(x.shape[0], 3, height, width, device=x.device) * 0.5
+ image_conditioning = images_tensor_to_samples(image_conditioning,
+ approximation_indexes.get(opts.sd_vae_encode_method))
- # Add the fake full 1s mask to the first dimension.
- image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
- image_conditioning = image_conditioning.to(x.dtype)
+ # Add the fake full 1s mask to the first dimension.
+ image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
+ image_conditioning = image_conditioning.to(x.dtype)
- return image_conditioning
+ return image_conditioning
# Dummy zero conditioning if we're not using inpainting or unclip models.
# Still takes up a bit of memory, but no encoder call.
@@ -238,11 +235,6 @@ class StableDiffusionProcessing:
self.styles = []
self.sampler_noise_scheduler_override = None
- self.s_min_uncond = self.s_min_uncond if self.s_min_uncond is not None else opts.s_min_uncond
- self.s_churn = self.s_churn if self.s_churn is not None else opts.s_churn
- self.s_tmin = self.s_tmin if self.s_tmin is not None else opts.s_tmin
- self.s_tmax = (self.s_tmax if self.s_tmax is not None else opts.s_tmax) or float('inf')
- self.s_noise = self.s_noise if self.s_noise is not None else opts.s_noise
self.extra_generation_params = self.extra_generation_params or {}
self.override_settings = self.override_settings or {}
@@ -259,6 +251,13 @@ class StableDiffusionProcessing:
self.cached_uc = StableDiffusionProcessing.cached_uc
self.cached_c = StableDiffusionProcessing.cached_c
+ def fill_fields_from_opts(self):
+ self.s_min_uncond = self.s_min_uncond if self.s_min_uncond is not None else opts.s_min_uncond
+ self.s_churn = self.s_churn if self.s_churn is not None else opts.s_churn
+ self.s_tmin = self.s_tmin if self.s_tmin is not None else opts.s_tmin
+ self.s_tmax = (self.s_tmax if self.s_tmax is not None else opts.s_tmax) or float('inf')
+ self.s_noise = self.s_noise if self.s_noise is not None else opts.s_noise
+
@property
def sd_model(self):
return shared.sd_model
@@ -390,11 +389,8 @@ class StableDiffusionProcessing:
if self.sampler.conditioning_key == "crossattn-adm":
return self.unclip_image_conditioning(source_image)
- sd = self.sampler.model_wrap.inner_model.model.state_dict()
- diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
- if diffusion_model_input is not None:
- if diffusion_model_input.shape[1] == 9:
- return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
+ if self.sampler.model_wrap.inner_model.is_sdxl_inpaint:
+ return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
# Dummy zero conditioning if we're not using inpainting or depth model.
return latent_image.new_zeros(latent_image.shape[0], 5, 1, 1)
@@ -569,7 +565,7 @@ class Processed:
self.all_negative_prompts = all_negative_prompts or p.all_negative_prompts or [self.negative_prompt]
self.all_seeds = all_seeds or p.all_seeds or [self.seed]
self.all_subseeds = all_subseeds or p.all_subseeds or [self.subseed]
- self.infotexts = infotexts or [info]
+ self.infotexts = infotexts or [info] * len(images_list)
self.version = program_version()
def js(self):
@@ -629,6 +625,9 @@ class DecodedSamples(list):
def decode_latent_batch(model, batch, target_device=None, check_for_nans=False):
samples = DecodedSamples()
+ if check_for_nans:
+ devices.test_for_nans(batch, "unet")
+
for i in range(batch.shape[0]):
sample = decode_first_stage(model, batch[i:i + 1])[0]
@@ -794,7 +793,6 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter
"Token merging ratio hr": None if not enable_hr or token_merging_ratio_hr == 0 else token_merging_ratio_hr,
"Init image hash": getattr(p, 'init_img_hash', None),
"RNG": opts.randn_source if opts.randn_source != "GPU" else None,
- "NGMS": None if p.s_min_uncond == 0 else p.s_min_uncond,
"Tiling": "True" if p.tiling else None,
**p.extra_generation_params,
"Version": program_version() if opts.add_version_to_infotext else None,
@@ -842,7 +840,11 @@ def process_images(p: StableDiffusionProcessing) -> Processed:
sd_models.apply_token_merging(p.sd_model, p.get_token_merging_ratio())
- res = process_images_inner(p)
+ # backwards compatibility, fix sampler and scheduler if invalid
+ sd_samplers.fix_p_invalid_sampler_and_scheduler(p)
+
+ with profiling.Profiler():
+ res = process_images_inner(p)
finally:
sd_models.apply_token_merging(p.sd_model, 0)
@@ -882,6 +884,9 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
if p.refiner_checkpoint_info is None:
raise Exception(f'Could not find checkpoint with name {p.refiner_checkpoint}')
+ if hasattr(shared.sd_model, 'fix_dimensions'):
+ p.width, p.height = shared.sd_model.fix_dimensions(p.width, p.height)
+
p.sd_model_name = shared.sd_model.sd_checkpoint_info.name_for_extra
p.sd_model_hash = shared.sd_model.sd_model_hash
p.sd_vae_name = sd_vae.get_loaded_vae_name()
@@ -890,6 +895,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
modules.sd_hijack.model_hijack.apply_circular(p.tiling)
modules.sd_hijack.model_hijack.clear_comments()
+ p.fill_fields_from_opts()
p.setup_prompts()
if isinstance(seed, list):
@@ -939,7 +945,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
p.seeds = p.all_seeds[n * p.batch_size:(n + 1) * p.batch_size]
p.subseeds = p.all_subseeds[n * p.batch_size:(n + 1) * p.batch_size]
- p.rng = rng.ImageRNG((opt_C, p.height // opt_f, p.width // opt_f), p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, seed_resize_from_h=p.seed_resize_from_h, seed_resize_from_w=p.seed_resize_from_w)
+ latent_channels = getattr(shared.sd_model, 'latent_channels', opt_C)
+ p.rng = rng.ImageRNG((latent_channels, p.height // opt_f, p.width // opt_f), p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, seed_resize_from_h=p.seed_resize_from_h, seed_resize_from_w=p.seed_resize_from_w)
if p.scripts is not None:
p.scripts.before_process_batch(p, batch_number=n, prompts=p.prompts, seeds=p.seeds, subseeds=p.subseeds)
@@ -988,6 +995,8 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
if getattr(samples_ddim, 'already_decoded', False):
x_samples_ddim = samples_ddim
else:
+ devices.test_for_nans(samples_ddim, "unet")
+
if opts.sd_vae_decode_method != 'Full':
p.extra_generation_params['VAE Decoder'] = opts.sd_vae_decode_method
x_samples_ddim = decode_latent_batch(p.sd_model, samples_ddim, target_device=devices.cpu, check_for_nans=True)
@@ -1325,6 +1334,15 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
# here we generate an image normally
x = self.rng.next()
+ if self.scripts is not None:
+ self.scripts.process_before_every_sampling(
+ p=self,
+ x=x,
+ noise=x,
+ c=conditioning,
+ uc=unconditional_conditioning
+ )
+
samples = self.sampler.sample(self, x, conditioning, unconditional_conditioning, image_conditioning=self.txt2img_image_conditioning(x))
del x
@@ -1425,6 +1443,13 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
if self.scripts is not None:
self.scripts.before_hr(self)
+ self.scripts.process_before_every_sampling(
+ p=self,
+ x=samples,
+ noise=noise,
+ c=self.hr_c,
+ uc=self.hr_uc,
+ )
samples = self.sampler.sample_img2img(self, samples, noise, self.hr_c, self.hr_uc, steps=self.hr_second_pass_steps or self.steps, image_conditioning=image_conditioning)
@@ -1715,10 +1740,10 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
latmask = latmask[0]
if self.mask_round:
latmask = np.around(latmask)
- latmask = np.tile(latmask[None], (4, 1, 1))
+ latmask = np.tile(latmask[None], (self.init_latent.shape[1], 1, 1))
- self.mask = torch.asarray(1.0 - latmask).to(shared.device).type(self.sd_model.dtype)
- self.nmask = torch.asarray(latmask).to(shared.device).type(self.sd_model.dtype)
+ self.mask = torch.asarray(1.0 - latmask).to(shared.device).type(devices.dtype)
+ self.nmask = torch.asarray(latmask).to(shared.device).type(devices.dtype)
# this needs to be fixed to be done in sample() using actual seeds for batches
if self.inpainting_fill == 2:
@@ -1738,6 +1763,14 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
self.extra_generation_params["Noise multiplier"] = self.initial_noise_multiplier
x *= self.initial_noise_multiplier
+ if self.scripts is not None:
+ self.scripts.process_before_every_sampling(
+ p=self,
+ x=self.init_latent,
+ noise=x,
+ c=conditioning,
+ uc=unconditional_conditioning
+ )
samples = self.sampler.sample_img2img(self, self.init_latent, x, conditioning, unconditional_conditioning, image_conditioning=self.image_conditioning)
if self.mask is not None:
diff --git a/modules/profiling.py b/modules/profiling.py
new file mode 100644
index 000000000..95b59f71a
--- /dev/null
+++ b/modules/profiling.py
@@ -0,0 +1,46 @@
+import torch
+
+from modules import shared, ui_gradio_extensions
+
+
+class Profiler:
+ def __init__(self):
+ if not shared.opts.profiling_enable:
+ self.profiler = None
+ return
+
+ activities = []
+ if "CPU" in shared.opts.profiling_activities:
+ activities.append(torch.profiler.ProfilerActivity.CPU)
+ if "CUDA" in shared.opts.profiling_activities:
+ activities.append(torch.profiler.ProfilerActivity.CUDA)
+
+ if not activities:
+ self.profiler = None
+ return
+
+ self.profiler = torch.profiler.profile(
+ activities=activities,
+ record_shapes=shared.opts.profiling_record_shapes,
+ profile_memory=shared.opts.profiling_profile_memory,
+ with_stack=shared.opts.profiling_with_stack
+ )
+
+ def __enter__(self):
+ if self.profiler:
+ self.profiler.__enter__()
+
+ return self
+
+ def __exit__(self, exc_type, exc, exc_tb):
+ if self.profiler:
+ shared.state.textinfo = "Finishing profile..."
+
+ self.profiler.__exit__(exc_type, exc, exc_tb)
+
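+            # the exported .json trace can be opened in chrome://tracing or https://ui.perfetto.dev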
+ self.profiler.export_chrome_trace(shared.opts.profiling_filename)
+
+
+def webpath():
+ return ui_gradio_extensions.webpath(shared.opts.profiling_filename)
+
diff --git a/modules/prompt_parser.py b/modules/prompt_parser.py
index cba134554..4e393d286 100644
--- a/modules/prompt_parser.py
+++ b/modules/prompt_parser.py
@@ -268,7 +268,7 @@ def get_multicond_learned_conditioning(model, prompts, steps, hires_steps=None,
class DictWithShape(dict):
- def __init__(self, x, shape):
+ def __init__(self, x, shape=None):
super().__init__()
self.update(x)
diff --git a/modules/safe.py b/modules/safe.py
index b1d08a792..af019ffd9 100644
--- a/modules/safe.py
+++ b/modules/safe.py
@@ -64,8 +64,8 @@ class RestrictedUnpickler(pickle.Unpickler):
raise Exception(f"global '{module}/{name}' is forbidden")
-# Regular expression that accepts 'dirname/version', 'dirname/data.pkl', and 'dirname/data/'
-allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|(data\.pkl))$")
+# Regular expression that accepts 'dirname/version', 'dirname/byteorder', 'dirname/data.pkl', 'dirname/.data/serialization_id', and 'dirname/data/<number>'
+allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|byteorder|\.data/serialization_id|(data\.pkl))$")
data_pkl_re = re.compile(r"^([^/]+)/data\.pkl$")
def check_zip_filenames(filename, names):
diff --git a/modules/scripts.py b/modules/scripts.py
index 70ccfbe46..8eca396b1 100644
--- a/modules/scripts.py
+++ b/modules/scripts.py
@@ -187,6 +187,13 @@ class Script:
"""
pass
+ def process_before_every_sampling(self, p, *args, **kwargs):
+ """
+ Similar to process(), called before every sampling.
+        If you use high-res fix, this will be called twice.
+ """
+ pass
+
def process_batch(self, p, *args, **kwargs):
"""
Same as process(), but called for every batch.
@@ -826,6 +833,14 @@ class ScriptRunner:
except Exception:
errors.report(f"Error running process: {script.filename}", exc_info=True)
+ def process_before_every_sampling(self, p, **kwargs):
+ for script in self.ordered_scripts('process_before_every_sampling'):
+ try:
+ script_args = p.script_args[script.args_from:script.args_to]
+ script.process_before_every_sampling(p, *script_args, **kwargs)
+ except Exception:
+ errors.report(f"Error running process_before_every_sampling: {script.filename}", exc_info=True)
+
def before_process_batch(self, p, **kwargs):
for script in self.ordered_scripts('before_process_batch'):
try:
diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py
index e139d9964..0de830541 100644
--- a/modules/sd_hijack.py
+++ b/modules/sd_hijack.py
@@ -325,7 +325,10 @@ class StableDiffusionModelHijack:
if self.clip is None:
return "-", "-"
- _, token_count = self.clip.process_texts([text])
+ if hasattr(self.clip, 'get_token_count'):
+ token_count = self.clip.get_token_count(text)
+ else:
+ _, token_count = self.clip.process_texts([text])
return token_count, self.clip.get_target_prompt_token_count(token_count)
@@ -356,13 +359,28 @@ class EmbeddingsWithFixes(torch.nn.Module):
vec = embedding.vec[self.textual_inversion_key] if isinstance(embedding.vec, dict) else embedding.vec
emb = devices.cond_cast_unet(vec)
emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0])
- tensor = torch.cat([tensor[0:offset + 1], emb[0:emb_len], tensor[offset + 1 + emb_len:]])
+ tensor = torch.cat([tensor[0:offset + 1], emb[0:emb_len], tensor[offset + 1 + emb_len:]]).to(dtype=inputs_embeds.dtype)
vecs.append(tensor)
return torch.stack(vecs)
+class TextualInversionEmbeddings(torch.nn.Embedding):
+ def __init__(self, num_embeddings: int, embedding_dim: int, textual_inversion_key='clip_l', **kwargs):
+ super().__init__(num_embeddings, embedding_dim, **kwargs)
+
+ self.embeddings = model_hijack
+ self.textual_inversion_key = textual_inversion_key
+
+ @property
+ def wrapped(self):
+ return super().forward
+
+ def forward(self, input_ids):
+ return EmbeddingsWithFixes.forward(self, input_ids)
+
+
def add_circular_option_to_conv_2d():
conv2d_constructor = torch.nn.Conv2d.__init__
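TextualInversionEmbeddings.forward above calls EmbeddingsWithFixes.forward as an unbound function with a `self` of a different class; this works by duck typing, because the instance supplies everything that forward reads (`wrapped`, `embeddings`, `textual_inversion_key`). The same borrowed-method pattern in miniature:

    class A:
        def greet(self):
            return f"hello, {self.name}"

    class B:
        name = "world"

        def greet(self):
            return A.greet(self)  # borrow A's method; it only needs self.name

    assert B().greet() == "hello, world"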
diff --git a/modules/sd_hijack_clip.py b/modules/sd_hijack_clip.py
index 6ef10ac7c..a479148fc 100644
--- a/modules/sd_hijack_clip.py
+++ b/modules/sd_hijack_clip.py
@@ -27,24 +27,21 @@ chunk. Those objects are found in PromptChunk.fixes and, are placed into FrozenC
are applied by sd_hijack.EmbeddingsWithFixes's forward function."""
-class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
- """A pytorch module that is a wrapper for FrozenCLIPEmbedder module. it enhances FrozenCLIPEmbedder, making it possible to
- have unlimited prompt length and assign weights to tokens in prompt.
- """
-
- def __init__(self, wrapped, hijack):
+class TextConditionalModel(torch.nn.Module):
+ def __init__(self):
super().__init__()
- self.wrapped = wrapped
- """Original FrozenCLIPEmbedder module; can also be FrozenOpenCLIPEmbedder or xlmr.BertSeriesModelWithTransformation,
- depending on model."""
-
- self.hijack: sd_hijack.StableDiffusionModelHijack = hijack
+ self.hijack = sd_hijack.model_hijack
self.chunk_length = 75
- self.is_trainable = getattr(wrapped, 'is_trainable', False)
- self.input_key = getattr(wrapped, 'input_key', 'txt')
- self.legacy_ucg_val = None
+ self.is_trainable = False
+ self.input_key = 'txt'
+ self.return_pooled = False
+
+ self.comma_token = None
+ self.id_start = None
+ self.id_end = None
+ self.id_pad = None
def empty_chunk(self):
"""creates an empty PromptChunk and returns it"""
@@ -210,10 +207,6 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
is when you do prompt editing: "a picture of a [cat:dog:0.4] eating ice cream"
"""
- if opts.use_old_emphasis_implementation:
- import modules.sd_hijack_clip_old
- return modules.sd_hijack_clip_old.forward_old(self, texts)
-
batch_chunks, token_count = self.process_texts(texts)
used_embeddings = {}
@@ -252,7 +245,7 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
if any(x for x in texts if "(" in x or "[" in x) and opts.emphasis != "Original":
self.hijack.extra_generation_params["Emphasis"] = opts.emphasis
- if getattr(self.wrapped, 'return_pooled', False):
+ if self.return_pooled:
return torch.hstack(zs), zs[0].pooled
else:
return torch.hstack(zs)
@@ -292,6 +285,34 @@ class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
return z
+class FrozenCLIPEmbedderWithCustomWordsBase(TextConditionalModel):
+ """A pytorch module that is a wrapper for FrozenCLIPEmbedder module. it enhances FrozenCLIPEmbedder, making it possible to
+ have unlimited prompt length and assign weights to tokens in prompt.
+ """
+
+ def __init__(self, wrapped, hijack):
+ super().__init__()
+
+ self.hijack = hijack
+
+ self.wrapped = wrapped
+ """Original FrozenCLIPEmbedder module; can also be FrozenOpenCLIPEmbedder or xlmr.BertSeriesModelWithTransformation,
+ depending on model."""
+
+ self.is_trainable = getattr(wrapped, 'is_trainable', False)
+ self.input_key = getattr(wrapped, 'input_key', 'txt')
+ self.return_pooled = getattr(self.wrapped, 'return_pooled', False)
+
+ self.legacy_ucg_val = None # for sgm codebase
+
+ def forward(self, texts):
+ if opts.use_old_emphasis_implementation:
+ import modules.sd_hijack_clip_old
+ return modules.sd_hijack_clip_old.forward_old(self, texts)
+
+ return super().forward(texts)
+
+
class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase):
def __init__(self, wrapped, hijack):
super().__init__(wrapped, hijack)
@@ -353,7 +374,9 @@ class FrozenCLIPEmbedderForSDXLWithCustomWords(FrozenCLIPEmbedderWithCustomWords
def encode_with_transformers(self, tokens):
outputs = self.wrapped.transformer(input_ids=tokens, output_hidden_states=self.wrapped.layer == "hidden")
- if self.wrapped.layer == "last":
+ if opts.sdxl_clip_l_skip:
+ z = outputs.hidden_states[-opts.CLIP_stop_at_last_layers]
+ elif self.wrapped.layer == "last":
z = outputs.last_hidden_state
else:
z = outputs.hidden_states[self.wrapped.layer_idx]
diff --git a/modules/sd_hijack_optimizations.py b/modules/sd_hijack_optimizations.py
index 7f9e328d0..0269f1f5b 100644
--- a/modules/sd_hijack_optimizations.py
+++ b/modules/sd_hijack_optimizations.py
@@ -486,7 +486,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs):
k_in = self.to_k(context_k)
v_in = self.to_v(context_v)
- q, k, v = (rearrange(t, 'b n (h d) -> b n h d', h=h) for t in (q_in, k_in, v_in))
+ q, k, v = (t.reshape(t.shape[0], t.shape[1], h, -1) for t in (q_in, k_in, v_in))
+
del q_in, k_in, v_in
dtype = q.dtype
@@ -497,7 +498,8 @@ def xformers_attention_forward(self, x, context=None, mask=None, **kwargs):
out = out.to(dtype)
- out = rearrange(out, 'b n h d -> b n (h d)', h=h)
+ b, n, h, d = out.shape
+ out = out.reshape(b, n, h * d)
return self.to_out(out)
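Dropping einops from this hot path is safe because both rearrange patterns here are pure reshapes (no axis permutation), so the replacements produce bit-identical tensors; this can be checked directly:

    import torch
    from einops import rearrange

    b, n, h, d = 2, 77, 8, 40
    t = torch.randn(b, n, h * d)
    assert torch.equal(rearrange(t, 'b n (h d) -> b n h d', h=h),
                       t.reshape(t.shape[0], t.shape[1], h, -1))

    out = torch.randn(b, n, h, d)
    assert torch.equal(rearrange(out, 'b n h d -> b n (h d)', h=h),
                       out.reshape(b, n, h * d))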
diff --git a/modules/sd_hijack_unet.py b/modules/sd_hijack_unet.py
index 2101f1a04..b4f03b138 100644
--- a/modules/sd_hijack_unet.py
+++ b/modules/sd_hijack_unet.py
@@ -1,5 +1,7 @@
import torch
from packaging import version
+from einops import repeat
+import math
from modules import devices
from modules.sd_hijack_utils import CondFunc
@@ -36,7 +38,7 @@ th = TorchHijackForUnet()
# Below are monkey patches to enable upcasting a float16 UNet for float32 sampling
def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
-
+ """Always make sure inputs to unet are in correct dtype."""
if isinstance(cond, dict):
for y in cond.keys():
if isinstance(cond[y], list):
@@ -45,7 +47,59 @@ def apply_model(orig_func, self, x_noisy, t, cond, **kwargs):
cond[y] = cond[y].to(devices.dtype_unet) if isinstance(cond[y], torch.Tensor) else cond[y]
with devices.autocast():
- return orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs).float()
+ result = orig_func(self, x_noisy.to(devices.dtype_unet), t.to(devices.dtype_unet), cond, **kwargs)
+ if devices.unet_needs_upcast:
+ return result.float()
+ else:
+ return result
+
+
+# Monkey patch to create the timestep embedding tensor directly on device, avoiding a blocking host-to-device transfer.
+def timestep_embedding(_, timesteps, dim, max_period=10000, repeat_only=False):
+ """
+ Create sinusoidal timestep embeddings.
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
+ These may be fractional.
+ :param dim: the dimension of the output.
+ :param max_period: controls the minimum frequency of the embeddings.
+ :return: an [N x dim] Tensor of positional embeddings.
+ """
+ if not repeat_only:
+ half = dim // 2
+ freqs = torch.exp(
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half
+ )
+ args = timesteps[:, None].float() * freqs[None]
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+ if dim % 2:
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+ else:
+ embedding = repeat(timesteps, 'b -> b d', d=dim)
+ return embedding
+
+
+# Monkey patch for SpatialTransformer that removes unnecessary contiguous calls,
+# preventing a lot of needless aten::copy_ calls
+def spatial_transformer_forward(_, self, x: torch.Tensor, context=None):
+ # note: if no context is given, cross-attention defaults to self-attention
+ if not isinstance(context, list):
+ context = [context]
+ b, c, h, w = x.shape
+ x_in = x
+ x = self.norm(x)
+ if not self.use_linear:
+ x = self.proj_in(x)
+ x = x.permute(0, 2, 3, 1).reshape(b, h * w, c)
+ if self.use_linear:
+ x = self.proj_in(x)
+ for i, block in enumerate(self.transformer_blocks):
+ x = block(x, context=context[i])
+ if self.use_linear:
+ x = self.proj_out(x)
+ x = x.view(b, h, w, c).permute(0, 3, 1, 2)
+ if not self.use_linear:
+ x = self.proj_out(x)
+ return x + x_in
class GELUHijack(torch.nn.GELU, torch.nn.Module):
@@ -64,12 +118,15 @@ def hijack_ddpm_edit():
if not ddpm_edit_hijack:
CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.decode_first_stage', first_stage_sub, first_stage_cond)
CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
- ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
+ ddpm_edit_hijack = CondFunc('modules.models.diffusion.ddpm_edit.LatentDiffusion.apply_model', apply_model)
unet_needs_upcast = lambda *args, **kwargs: devices.unet_needs_upcast
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model, unet_needs_upcast)
+CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding)
+CondFunc('ldm.modules.attention.SpatialTransformer.forward', spatial_transformer_forward)
CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)
+
if version.parse(torch.__version__) <= version.parse("1.13.2") or torch.cuda.is_available():
CondFunc('ldm.modules.diffusionmodules.util.GroupNorm32.forward', lambda orig_func, self, *args, **kwargs: orig_func(self.float(), *args, **kwargs), unet_needs_upcast)
CondFunc('ldm.modules.attention.GEGLU.forward', lambda orig_func, self, x: orig_func(self.float(), x.float()).to(devices.dtype_unet), unet_needs_upcast)
@@ -81,5 +138,17 @@ CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.decode_first_stage', first_s
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.encode_first_stage', first_stage_sub, first_stage_cond)
CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding', lambda orig_func, *args, **kwargs: orig_func(*args, **kwargs).float(), first_stage_cond)
-CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model, unet_needs_upcast)
-CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', lambda orig_func, timesteps, *args, **kwargs: orig_func(timesteps, *args, **kwargs).to(torch.float32 if timesteps.dtype == torch.int64 else devices.dtype_unet), unet_needs_upcast)
+CondFunc('ldm.models.diffusion.ddpm.LatentDiffusion.apply_model', apply_model)
+CondFunc('sgm.modules.diffusionmodules.wrappers.OpenAIWrapper.forward', apply_model)
+
+
+def timestep_embedding_cast_result(orig_func, timesteps, *args, **kwargs):
+ if devices.unet_needs_upcast and timesteps.dtype == torch.int64:
+ dtype = torch.float32
+ else:
+ dtype = devices.dtype_unet
+ return orig_func(timesteps, *args, **kwargs).to(dtype=dtype)
+
+
+CondFunc('ldm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
+CondFunc('sgm.modules.diffusionmodules.openaimodel.timestep_embedding', timestep_embedding_cast_result)
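The point of the patched timestep_embedding is that `freqs` is created directly on `timesteps.device`, so no host-side tensor has to be transferred mid-step. A condensed standalone sketch of the non-repeat branch (CPU here, even `dim` so the padding branch is skipped):

    import math
    import torch

    def timestep_embedding(timesteps, dim, max_period=10000):
        half = dim // 2
        freqs = torch.exp(-math.log(max_period) *
                          torch.arange(half, dtype=torch.float32, device=timesteps.device) / half)
        args = timesteps[:, None].float() * freqs[None]
        return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

    emb = timestep_embedding(torch.tensor([999, 500]), dim=320)
    assert emb.shape == (2, 320)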
diff --git a/modules/sd_hijack_utils.py b/modules/sd_hijack_utils.py
index 79bf6e468..546f2eda4 100644
--- a/modules/sd_hijack_utils.py
+++ b/modules/sd_hijack_utils.py
@@ -1,7 +1,11 @@
import importlib
+
+always_true_func = lambda *args, **kwargs: True
+
+
class CondFunc:
- def __new__(cls, orig_func, sub_func, cond_func):
+ def __new__(cls, orig_func, sub_func, cond_func=always_true_func):
self = super(CondFunc, cls).__new__(cls)
if isinstance(orig_func, str):
func_path = orig_func.split('.')
@@ -20,13 +24,13 @@ class CondFunc:
print(f"Warning: Failed to resolve {orig_func} for CondFunc hijack")
pass
self.__init__(orig_func, sub_func, cond_func)
- return lambda *args, **kwargs: self(*args, **kwargs)
- def __init__(self, orig_func, sub_func, cond_func):
- self.__orig_func = orig_func
- self.__sub_func = sub_func
- self.__cond_func = cond_func
- def __call__(self, *args, **kwargs):
- if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
- return self.__sub_func(self.__orig_func, *args, **kwargs)
- else:
- return self.__orig_func(*args, **kwargs)
+ return lambda *args, **kwargs: self(*args, **kwargs)
+ def __init__(self, orig_func, sub_func, cond_func):
+ self.__orig_func = orig_func
+ self.__sub_func = sub_func
+ self.__cond_func = cond_func
+ def __call__(self, *args, **kwargs):
+ if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
+ return self.__sub_func(self.__orig_func, *args, **kwargs)
+ else:
+ return self.__orig_func(*args, **kwargs)
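With the new default, a hijack that should always apply no longer needs an explicit condition. A sketch of the behavior (a plain callable is passed instead of a dotted path, which CondFunc also accepts):

    from modules.sd_hijack_utils import CondFunc

    def slow_op(x):
        return x * 2

    # cond_func omitted: defaults to always-true, so sub_func always runs
    patched = CondFunc(slow_op, lambda orig, x: orig(x) + 1)
    assert patched(3) == 7  # slow_op(3) + 1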
diff --git a/modules/sd_models.py b/modules/sd_models.py
index ff245b7a6..55bd9ca5e 100644
--- a/modules/sd_models.py
+++ b/modules/sd_models.py
@@ -1,7 +1,9 @@
import collections
+import importlib
import os
import sys
import threading
+import enum
import torch
import re
@@ -10,8 +12,6 @@ from omegaconf import OmegaConf, ListConfig
from urllib import request
import ldm.modules.midas as midas
-from ldm.util import instantiate_from_config
-
from modules import paths, shared, modelloader, devices, script_callbacks, sd_vae, sd_disable_initialization, errors, hashes, sd_models_config, sd_unet, sd_models_xl, cache, extra_networks, processing, lowvram, sd_hijack, patches
from modules.timer import Timer
from modules.shared import opts
@@ -27,6 +27,14 @@ checkpoint_alisases = checkpoint_aliases # for compatibility with old name
checkpoints_loaded = collections.OrderedDict()
+class ModelType(enum.Enum):
+ SD1 = 1
+ SD2 = 2
+ SDXL = 3
+ SSD = 4
+ SD3 = 5
+
+
def replace_key(d, key, new_key, value):
keys = list(d.keys())
@@ -149,10 +157,12 @@ def list_models():
cmd_ckpt = shared.cmd_opts.ckpt
if shared.cmd_opts.no_download_sd_model or cmd_ckpt != shared.sd_model_file or os.path.exists(cmd_ckpt):
model_url = None
+ expected_sha256 = None
else:
model_url = f"{shared.hf_endpoint}/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors"
+ expected_sha256 = '6ce0161689b3853acaa03779ec93eafe75a02f4ced659bee03f50797806fa2fa'
- model_list = modelloader.load_models(model_path=model_path, model_url=model_url, command_path=shared.cmd_opts.ckpt_dir, ext_filter=[".ckpt", ".safetensors"], download_name="v1-5-pruned-emaonly.safetensors", ext_blacklist=[".vae.ckpt", ".vae.safetensors"])
+ model_list = modelloader.load_models(model_path=model_path, model_url=model_url, command_path=shared.cmd_opts.ckpt_dir, ext_filter=[".ckpt", ".safetensors"], download_name="v1-5-pruned-emaonly.safetensors", ext_blacklist=[".vae.ckpt", ".vae.safetensors"], hash_prefix=expected_sha256)
if os.path.exists(cmd_ckpt):
checkpoint_info = CheckpointInfo(cmd_ckpt)
@@ -280,17 +290,21 @@ def read_metadata_from_safetensors(filename):
json_start = file.read(2)
assert metadata_len > 2 and json_start in (b'{"', b"{'"), f"{filename} is not a safetensors file"
- json_data = json_start + file.read(metadata_len-2)
- json_obj = json.loads(json_data)
res = {}
- for k, v in json_obj.get("__metadata__", {}).items():
- res[k] = v
- if isinstance(v, str) and v[0:1] == '{':
- try:
- res[k] = json.loads(v)
- except Exception:
- pass
+
+ try:
+ json_data = json_start + file.read(metadata_len-2)
+ json_obj = json.loads(json_data)
+ for k, v in json_obj.get("__metadata__", {}).items():
+ res[k] = v
+ if isinstance(v, str) and v[0:1] == '{':
+ try:
+ res[k] = json.loads(v)
+ except Exception:
+ pass
+ except Exception:
+ errors.report(f"Error reading metadata from file: {filename}", exc_info=True)
return res
@@ -362,6 +376,37 @@ def check_fp8(model):
return enable_fp8
+def set_model_type(model, state_dict):
+ model.is_sd1 = False
+ model.is_sd2 = False
+ model.is_sdxl = False
+ model.is_ssd = False
+ model.is_sd3 = False
+
+ if "model.diffusion_model.x_embedder.proj.weight" in state_dict:
+ model.is_sd3 = True
+ model.model_type = ModelType.SD3
+ elif hasattr(model, 'conditioner'):
+ model.is_sdxl = True
+
+ if 'model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight' not in state_dict.keys():
+ model.is_ssd = True
+ model.model_type = ModelType.SSD
+ else:
+ model.model_type = ModelType.SDXL
+ elif hasattr(model.cond_stage_model, 'model'):
+ model.is_sd2 = True
+ model.model_type = ModelType.SD2
+ else:
+ model.is_sd1 = True
+ model.model_type = ModelType.SD1
+
+
+def set_model_fields(model):
+ if not hasattr(model, 'latent_channels'):
+ model.latent_channels = 4
+
+
def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer):
sd_model_hash = checkpoint_info.calculate_shorthash()
timer.record("calculate hash")
@@ -376,10 +421,9 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
if state_dict is None:
state_dict = get_checkpoint_state_dict(checkpoint_info, timer)
- model.is_sdxl = hasattr(model, 'conditioner')
- model.is_sd2 = not model.is_sdxl and hasattr(model.cond_stage_model, 'model')
- model.is_sd1 = not model.is_sdxl and not model.is_sd2
- model.is_ssd = model.is_sdxl and 'model.diffusion_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight' not in state_dict.keys()
+ set_model_type(model, state_dict)
+ set_model_fields(model)
+
if model.is_sdxl:
sd_models_xl.extend_sdxl(model)
@@ -390,11 +434,30 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
# cache newly loaded model
checkpoints_loaded[checkpoint_info] = state_dict.copy()
+ if hasattr(model, "before_load_weights"):
+ model.before_load_weights(state_dict)
+
model.load_state_dict(state_dict, strict=False)
timer.record("apply weights to model")
+ if hasattr(model, "after_load_weights"):
+ model.after_load_weights(state_dict)
+
del state_dict
+ # Set is_sdxl_inpaint flag.
+ # Detects an inpaint model from the UNet structure: its first input block,
+ # 'diffusion_model.input_blocks.0.0.weight', takes 9 input channels instead of 4.
+ # Some checkpoints lack the key entirely, hence the None check below.
+ diffusion_model_input = model.model.state_dict().get(
+ 'diffusion_model.input_blocks.0.0.weight'
+ )
+ model.is_sdxl_inpaint = (
+ model.is_sdxl and
+ diffusion_model_input is not None and
+ diffusion_model_input.shape[1] == 9
+ )
+
if shared.cmd_opts.opt_channelslast:
model.to(memory_format=torch.channels_last)
timer.record("apply channels_last")
@@ -403,6 +466,7 @@ def load_model_weights(model, checkpoint_info: CheckpointInfo, state_dict, timer
model.float()
model.alphas_cumprod_original = model.alphas_cumprod
devices.dtype_unet = torch.float32
+ assert shared.cmd_opts.precision != "half", "Cannot use --precision half with --no-half"
timer.record("apply float()")
else:
vae = model.first_stage_model
@@ -532,25 +596,34 @@ def patch_given_betas():
original_register_schedule = patches.patch(__name__, ldm.models.diffusion.ddpm.DDPM, 'register_schedule', patched_register_schedule)
-def repair_config(sd_config):
-
+def repair_config(sd_config, state_dict=None):
if not hasattr(sd_config.model.params, "use_ema"):
sd_config.model.params.use_ema = False
if hasattr(sd_config.model.params, 'unet_config'):
if shared.cmd_opts.no_half:
sd_config.model.params.unet_config.params.use_fp16 = False
- elif shared.cmd_opts.upcast_sampling:
+ elif shared.cmd_opts.upcast_sampling or shared.cmd_opts.precision == "half":
sd_config.model.params.unet_config.params.use_fp16 = True
- if getattr(sd_config.model.params.first_stage_config.params.ddconfig, "attn_type", None) == "vanilla-xformers" and not shared.xformers_available:
- sd_config.model.params.first_stage_config.params.ddconfig.attn_type = "vanilla"
+ if hasattr(sd_config.model.params, 'first_stage_config'):
+ if getattr(sd_config.model.params.first_stage_config.params.ddconfig, "attn_type", None) == "vanilla-xformers" and not shared.xformers_available:
+ sd_config.model.params.first_stage_config.params.ddconfig.attn_type = "vanilla"
# For UnCLIP-L, override the hardcoded karlo directory
if hasattr(sd_config.model.params, "noise_aug_config") and hasattr(sd_config.model.params.noise_aug_config.params, "clip_stats_path"):
karlo_path = os.path.join(paths.models_path, 'karlo')
sd_config.model.params.noise_aug_config.params.clip_stats_path = sd_config.model.params.noise_aug_config.params.clip_stats_path.replace("checkpoints/karlo_models", karlo_path)
+ # Do not use gradient checkpointing for inference.
+ # This avoids the extra overhead of checking parameters on every forward pass;
+ # the overhead is about 100ms/it on a 4090 for SDXL.
+ if hasattr(sd_config.model.params, "network_config"):
+ sd_config.model.params.network_config.params.use_checkpoint = False
+ if hasattr(sd_config.model.params, "unet_config"):
+ sd_config.model.params.unet_config.params.use_checkpoint = False
+
+
def rescale_zero_terminal_snr_abar(alphas_cumprod):
alphas_bar_sqrt = alphas_cumprod.sqrt()
@@ -651,18 +724,23 @@ def get_empty_cond(sd_model):
p = processing.StableDiffusionProcessingTxt2Img()
extra_networks.activate(p, {})
- if hasattr(sd_model, 'conditioner'):
+ if hasattr(sd_model, 'get_learned_conditioning'):
d = sd_model.get_learned_conditioning([""])
- return d['crossattn']
else:
- return sd_model.cond_stage_model([""])
+ d = sd_model.cond_stage_model([""])
+
+ if isinstance(d, dict):
+ d = d['crossattn']
+
+ return d
def send_model_to_cpu(m):
- if m.lowvram:
- lowvram.send_everything_to_cpu()
- else:
- m.to(devices.cpu)
+ if m is not None:
+ if m.lowvram:
+ lowvram.send_everything_to_cpu()
+ else:
+ m.to(devices.cpu)
devices.torch_gc()
@@ -686,6 +764,25 @@ def send_model_to_trash(m):
devices.torch_gc()
+def instantiate_from_config(config, state_dict=None):
+ constructor = get_obj_from_str(config["target"])
+
+ params = {**config.get("params", {})}
+
+ if state_dict and "state_dict" in params and params["state_dict"] is None:
+ params["state_dict"] = state_dict
+
+ return constructor(**params)
+
+
+def get_obj_from_str(string, reload=False):
+ module, cls = string.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+
def load_model(checkpoint_info=None, already_loaded_state_dict=None):
from modules import sd_hijack
checkpoint_info = checkpoint_info or select_checkpoint()
@@ -710,7 +807,7 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
timer.record("find config")
sd_config = OmegaConf.load(checkpoint_config)
- repair_config(sd_config)
+ repair_config(sd_config, state_dict)
timer.record("load config")
@@ -720,7 +817,7 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
try:
with sd_disable_initialization.DisableInitialization(disable_clip=clip_is_included_into_sd or shared.cmd_opts.do_not_download_clip):
with sd_disable_initialization.InitializeOnMeta():
- sd_model = instantiate_from_config(sd_config.model)
+ sd_model = instantiate_from_config(sd_config.model, state_dict)
except Exception as e:
errors.display(e, "creating model quickly", full_traceback=True)
@@ -729,7 +826,7 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
print('Failed to create model quickly; will retry using slow method.', file=sys.stderr)
with sd_disable_initialization.InitializeOnMeta():
- sd_model = instantiate_from_config(sd_config.model)
+ sd_model = instantiate_from_config(sd_config.model, state_dict)
sd_model.used_config = checkpoint_config
@@ -746,6 +843,7 @@ def load_model(checkpoint_info=None, already_loaded_state_dict=None):
with sd_disable_initialization.LoadStateDictOnMeta(state_dict, device=model_target_device(sd_model), weight_dtype_conversion=weight_dtype_conversion):
load_model_weights(sd_model, checkpoint_info, state_dict, timer)
+
timer.record("load weights from state dict")
send_model_to_device(sd_model)
diff --git a/modules/sd_models_config.py b/modules/sd_models_config.py
index b38137eb5..fb44c5a8d 100644
--- a/modules/sd_models_config.py
+++ b/modules/sd_models_config.py
@@ -23,6 +23,8 @@ config_inpainting = os.path.join(sd_configs_path, "v1-inpainting-inference.yaml"
config_instruct_pix2pix = os.path.join(sd_configs_path, "instruct-pix2pix.yaml")
config_alt_diffusion = os.path.join(sd_configs_path, "alt-diffusion-inference.yaml")
config_alt_diffusion_m18 = os.path.join(sd_configs_path, "alt-diffusion-m18-inference.yaml")
+config_sd3 = os.path.join(sd_configs_path, "sd3-inference.yaml")
+
def is_using_v_parameterization_for_sd2(state_dict):
"""
@@ -31,11 +33,11 @@ def is_using_v_parameterization_for_sd2(state_dict):
import ldm.modules.diffusionmodules.openaimodel
- device = devices.cpu
+ device = devices.device
with sd_disable_initialization.DisableInitialization():
unet = ldm.modules.diffusionmodules.openaimodel.UNetModel(
- use_checkpoint=True,
+ use_checkpoint=False,
use_fp16=False,
image_size=32,
in_channels=4,
@@ -56,12 +58,13 @@ def is_using_v_parameterization_for_sd2(state_dict):
with torch.no_grad():
unet_sd = {k.replace("model.diffusion_model.", ""): v for k, v in state_dict.items() if "model.diffusion_model." in k}
unet.load_state_dict(unet_sd, strict=True)
- unet.to(device=device, dtype=torch.float)
+ unet.to(device=device, dtype=devices.dtype_unet)
test_cond = torch.ones((1, 2, 1024), device=device) * 0.5
x_test = torch.ones((1, 4, 8, 8), device=device) * 0.5
- out = (unet(x_test, torch.asarray([999], device=device), context=test_cond) - x_test).mean().item()
+ with devices.autocast():
+ out = (unet(x_test, torch.asarray([999], device=device), context=test_cond) - x_test).mean().cpu().item()
return out < -1
@@ -71,11 +74,15 @@ def guess_model_config_from_state_dict(sd, filename):
diffusion_model_input = sd.get('model.diffusion_model.input_blocks.0.0.weight', None)
sd2_variations_weight = sd.get('embedder.model.ln_final.weight', None)
+ if "model.diffusion_model.x_embedder.proj.weight" in sd:
+ return config_sd3
+
if sd.get('conditioner.embedders.1.model.ln_final.weight', None) is not None:
if diffusion_model_input.shape[1] == 9:
return config_sdxl_inpainting
else:
return config_sdxl
+
if sd.get('conditioner.embedders.0.model.ln_final.weight', None) is not None:
return config_sdxl_refiner
elif sd.get('depth_model.model.pretrained.act_postprocess3.0.project.0.bias', None) is not None:
@@ -99,7 +106,6 @@ def guess_model_config_from_state_dict(sd, filename):
if diffusion_model_input.shape[1] == 8:
return config_instruct_pix2pix
-
if sd.get('cond_stage_model.roberta.embeddings.word_embeddings.weight', None) is not None:
if sd.get('cond_stage_model.transformation.weight').size()[0] == 1024:
return config_alt_diffusion_m18
diff --git a/modules/sd_models_types.py b/modules/sd_models_types.py
index f911fbb68..2fce2777b 100644
--- a/modules/sd_models_types.py
+++ b/modules/sd_models_types.py
@@ -32,3 +32,9 @@ class WebuiSdModel(LatentDiffusion):
is_sd1: bool
"""True if the model's architecture is SD 1.x"""
+
+ is_sd3: bool
+ """True if the model's architecture is SD 3"""
+
+ latent_channels: int
+ """number of layer in latent image representation; will be 16 in SD3 and 4 in other version"""
diff --git a/modules/sd_models_xl.py b/modules/sd_models_xl.py
index 94ff973fb..1242a5936 100644
--- a/modules/sd_models_xl.py
+++ b/modules/sd_models_xl.py
@@ -35,11 +35,10 @@ def get_learned_conditioning(self: sgm.models.diffusion.DiffusionEngine, batch:
def apply_model(self: sgm.models.diffusion.DiffusionEngine, x, t, cond):
- sd = self.model.state_dict()
- diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
- if diffusion_model_input is not None:
- if diffusion_model_input.shape[1] == 9:
- x = torch.cat([x] + cond['c_concat'], dim=1)
+ """WARNING: This function is called once per denoising iteration. DO NOT add
+ expensive functionc calls such as `model.state_dict`. """
+ if self.is_sdxl_inpaint:
+ x = torch.cat([x] + cond['c_concat'], dim=1)
return self.model(x, t, cond)
diff --git a/modules/sd_samplers.py b/modules/sd_samplers.py
index 6b7b84b6d..963da5be0 100644
--- a/modules/sd_samplers.py
+++ b/modules/sd_samplers.py
@@ -1,7 +1,7 @@
from __future__ import annotations
import functools
-
+import logging
from modules import sd_samplers_kdiffusion, sd_samplers_timesteps, sd_samplers_lcm, shared, sd_samplers_common, sd_schedulers
# imports for functions that previously were here and are used by other modules
@@ -98,7 +98,7 @@ def get_hr_scheduler_from_infotext(d: dict):
@functools.cache
-def get_sampler_and_scheduler(sampler_name, scheduler_name):
+def get_sampler_and_scheduler(sampler_name, scheduler_name, *, convert_automatic=True):
default_sampler = samplers[0]
found_scheduler = sd_schedulers.schedulers_map.get(scheduler_name, sd_schedulers.schedulers[0])
@@ -116,10 +116,17 @@ def get_sampler_and_scheduler(sampler_name, scheduler_name):
sampler = all_samplers_map.get(name, default_sampler)
# revert back to Automatic if it's the default scheduler for the selected sampler
- if sampler.options.get('scheduler', None) == found_scheduler.name:
+ if convert_automatic and sampler.options.get('scheduler', None) == found_scheduler.name:
found_scheduler = sd_schedulers.schedulers[0]
return sampler.name, found_scheduler.label
+def fix_p_invalid_sampler_and_scheduler(p):
+ i_sampler_name, i_scheduler = p.sampler_name, p.scheduler
+ p.sampler_name, p.scheduler = get_sampler_and_scheduler(p.sampler_name, p.scheduler, convert_automatic=False)
+ if p.sampler_name != i_sampler_name or i_scheduler != p.scheduler:
+ logging.warning(f'Sampler Scheduler autocorrection: "{i_sampler_name}" -> "{p.sampler_name}", "{i_scheduler}" -> "{p.scheduler}"')
+
+
set_samplers()
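Note that get_sampler_and_scheduler is functools.cache'd, and the keyword-only convert_automatic flag participates in the cache key, so the autocorrection path and the UI path keep separate entries; in miniature:

    import functools

    @functools.cache
    def resolve(name, *, convert_automatic=True):
        return (name, convert_automatic)

    assert resolve("Euler") is resolve("Euler")  # same cache entry
    assert resolve("Euler") is not resolve("Euler", convert_automatic=False)  # separate entry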
diff --git a/modules/sd_samplers_cfg_denoiser.py b/modules/sd_samplers_cfg_denoiser.py
index 93581c9ac..b6fbf3372 100644
--- a/modules/sd_samplers_cfg_denoiser.py
+++ b/modules/sd_samplers_cfg_denoiser.py
@@ -1,5 +1,5 @@
import torch
-from modules import prompt_parser, devices, sd_samplers_common
+from modules import prompt_parser, sd_samplers_common
from modules.shared import opts, state
import modules.shared as shared
@@ -58,6 +58,11 @@ class CFGDenoiser(torch.nn.Module):
self.model_wrap = None
self.p = None
+ self.cond_scale_multiplier = 1.0
+
+ self.need_last_noise_uncond = False
+ self.last_noise_uncond = None
+
# NOTE: masking before denoising can cause the original latents to be oversmoothed
# as the original latents do not have noise
self.mask_before_denoising = False
@@ -212,9 +217,16 @@ class CFGDenoiser(torch.nn.Module):
uncond = denoiser_params.text_uncond
skip_uncond = False
- # alternating uncond allows for higher thresholds without the quality loss normally expected from raising it
- if self.step % 2 and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
+ if shared.opts.skip_early_cond != 0. and self.step / self.total_steps <= shared.opts.skip_early_cond:
skip_uncond = True
+ self.p.extra_generation_params["Skip Early CFG"] = shared.opts.skip_early_cond
+ elif (self.step % 2 or shared.opts.s_min_uncond_all) and s_min_uncond > 0 and sigma[0] < s_min_uncond and not is_edit_model:
+ skip_uncond = True
+ self.p.extra_generation_params["NGMS"] = s_min_uncond
+ if shared.opts.s_min_uncond_all:
+ self.p.extra_generation_params["NGMS all steps"] = shared.opts.s_min_uncond_all
+
+ if skip_uncond:
x_in = x_in[:-batch_size]
sigma_in = sigma_in[:-batch_size]
@@ -266,14 +278,15 @@ class CFGDenoiser(torch.nn.Module):
denoised_params = CFGDenoisedParams(x_out, state.sampling_step, state.sampling_steps, self.inner_model)
cfg_denoised_callback(denoised_params)
- devices.test_for_nans(x_out, "unet")
+ if self.need_last_noise_uncond:
+ self.last_noise_uncond = torch.clone(x_out[-uncond.shape[0]:])
if is_edit_model:
- denoised = self.combine_denoised_for_edit_model(x_out, cond_scale)
+ denoised = self.combine_denoised_for_edit_model(x_out, cond_scale * self.cond_scale_multiplier)
elif skip_uncond:
denoised = self.combine_denoised(x_out, conds_list, uncond, 1.0)
else:
- denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale)
+ denoised = self.combine_denoised(x_out, conds_list, uncond, cond_scale * self.cond_scale_multiplier)
# Blend in the original latents (after)
if not self.mask_before_denoising and self.mask is not None:
diff --git a/modules/sd_samplers_common.py b/modules/sd_samplers_common.py
index bda578cc5..c060cccb2 100644
--- a/modules/sd_samplers_common.py
+++ b/modules/sd_samplers_common.py
@@ -54,7 +54,7 @@ def samples_to_images_tensor(sample, approximation=None, model=None):
else:
if model is None:
model = shared.sd_model
- with devices.without_autocast(): # fixes an issue with unstable VAEs that are flaky even in fp32
+ with torch.no_grad(), devices.without_autocast(): # fixes an issue with unstable VAEs that are flaky even in fp32
x_sample = model.decode_first_stage(sample.to(model.first_stage_model.dtype))
return x_sample
@@ -163,7 +163,7 @@ def apply_refiner(cfg_denoiser, sigma=None):
else:
# torch.max(sigma) only to handle rare case where we might have different sigmas in the same batch
try:
- timestep = torch.argmin(torch.abs(cfg_denoiser.inner_model.sigmas - torch.max(sigma)))
+ timestep = torch.argmin(torch.abs(cfg_denoiser.inner_model.sigmas.to(sigma.device) - torch.max(sigma)))
except AttributeError: # for samplers that don't use sigmas (DDIM) sigma is actually the timestep
timestep = torch.max(sigma).to(dtype=int)
completed_ratio = (999 - timestep) / 1000
@@ -246,7 +246,7 @@ class Sampler:
self.eta_infotext_field = 'Eta'
self.eta_default = 1.0
- self.conditioning_key = shared.sd_model.model.conditioning_key
+ self.conditioning_key = getattr(shared.sd_model.model, 'conditioning_key', 'crossattn')
self.p = None
self.model_wrap_cfg = None
diff --git a/modules/sd_samplers_kdiffusion.py b/modules/sd_samplers_kdiffusion.py
index b45f85b07..0c94d100d 100644
--- a/modules/sd_samplers_kdiffusion.py
+++ b/modules/sd_samplers_kdiffusion.py
@@ -1,7 +1,7 @@
import torch
import inspect
import k_diffusion.sampling
-from modules import sd_samplers_common, sd_samplers_extra, sd_samplers_cfg_denoiser, sd_schedulers
+from modules import sd_samplers_common, sd_samplers_extra, sd_samplers_cfg_denoiser, sd_schedulers, devices
from modules.sd_samplers_cfg_denoiser import CFGDenoiser # noqa: F401
from modules.script_callbacks import ExtraNoiseParams, extra_noise_callback
@@ -53,8 +53,13 @@ class CFGDenoiserKDiffusion(sd_samplers_cfg_denoiser.CFGDenoiser):
@property
def inner_model(self):
if self.model_wrap is None:
- denoiser = k_diffusion.external.CompVisVDenoiser if shared.sd_model.parameterization == "v" else k_diffusion.external.CompVisDenoiser
- self.model_wrap = denoiser(shared.sd_model, quantize=shared.opts.enable_quantization)
+ denoiser_constructor = getattr(shared.sd_model, 'create_denoiser', None)
+
+ if denoiser_constructor is not None:
+ self.model_wrap = denoiser_constructor()
+ else:
+ denoiser = k_diffusion.external.CompVisVDenoiser if shared.sd_model.parameterization == "v" else k_diffusion.external.CompVisDenoiser
+ self.model_wrap = denoiser(shared.sd_model, quantize=shared.opts.enable_quantization)
return self.model_wrap
@@ -115,12 +120,16 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
if scheduler.need_inner_model:
sigmas_kwargs['inner_model'] = self.model_wrap
- sigmas = scheduler.function(n=steps, **sigmas_kwargs, device=shared.device)
+ if scheduler.label == 'Beta':
+ p.extra_generation_params["Beta schedule alpha"] = opts.beta_dist_alpha
+ p.extra_generation_params["Beta schedule beta"] = opts.beta_dist_beta
+
+ sigmas = scheduler.function(n=steps, **sigmas_kwargs, device=devices.cpu)
if discard_next_to_last_sigma:
sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
- return sigmas
+ return sigmas.cpu()
def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning, steps=None, image_conditioning=None):
steps, t_enc = sd_samplers_common.setup_img2img_steps(p, steps)
@@ -128,7 +137,10 @@ class KDiffusionSampler(sd_samplers_common.Sampler):
sigmas = self.get_sigmas(p, steps)
sigma_sched = sigmas[steps - t_enc - 1:]
- xi = x + noise * sigma_sched[0]
+ if hasattr(shared.sd_model, 'add_noise_to_latent'):
+ xi = shared.sd_model.add_noise_to_latent(x, noise, sigma_sched[0])
+ else:
+ xi = x + noise * sigma_sched[0]
if opts.img2img_extra_noise > 0:
p.extra_generation_params["Extra noise"] = opts.img2img_extra_noise
diff --git a/modules/sd_samplers_timesteps.py b/modules/sd_samplers_timesteps.py
index 8cc7d3848..81edd67d6 100644
--- a/modules/sd_samplers_timesteps.py
+++ b/modules/sd_samplers_timesteps.py
@@ -10,6 +10,7 @@ import modules.shared as shared
samplers_timesteps = [
('DDIM', sd_samplers_timesteps_impl.ddim, ['ddim'], {}),
+ ('DDIM CFG++', sd_samplers_timesteps_impl.ddim_cfgpp, ['ddim_cfgpp'], {}),
('PLMS', sd_samplers_timesteps_impl.plms, ['plms'], {}),
('UniPC', sd_samplers_timesteps_impl.unipc, ['unipc'], {}),
]
diff --git a/modules/sd_samplers_timesteps_impl.py b/modules/sd_samplers_timesteps_impl.py
index 930a64af5..180e43899 100644
--- a/modules/sd_samplers_timesteps_impl.py
+++ b/modules/sd_samplers_timesteps_impl.py
@@ -5,13 +5,14 @@ import numpy as np
from modules import shared
from modules.models.diffusion.uni_pc import uni_pc
+from modules.torch_utils import float64
@torch.no_grad()
def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=0.0):
alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
alphas = alphas_cumprod[timesteps]
- alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
+ alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))
@@ -39,11 +40,51 @@ def ddim(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=
return x
+@torch.no_grad()
+def ddim_cfgpp(model, x, timesteps, extra_args=None, callback=None, disable=None, eta=0.0):
+ """ Implements CFG++: Manifold-constrained Classifier Free Guidance For Diffusion Models (2024).
+ Uses the unconditional noise prediction instead of the conditional noise to guide the denoising direction.
+ The CFG scale is divided by 12.5 to map CFG from [0.0, 12.5] to [0, 1.0].
+ """
+ alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
+ alphas = alphas_cumprod[timesteps]
+ alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
+ sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
+ sigmas = eta * np.sqrt((1 - alphas_prev.cpu().numpy()) / (1 - alphas.cpu()) * (1 - alphas.cpu() / alphas_prev.cpu().numpy()))
+
+ model.cond_scale_multiplier = 1 / 12.5
+ model.need_last_noise_uncond = True
+
+ extra_args = {} if extra_args is None else extra_args
+ s_in = x.new_ones((x.shape[0]))
+ s_x = x.new_ones((x.shape[0], 1, 1, 1))
+ for i in tqdm.trange(len(timesteps) - 1, disable=disable):
+ index = len(timesteps) - 1 - i
+
+ e_t = model(x, timesteps[index].item() * s_in, **extra_args)
+ last_noise_uncond = model.last_noise_uncond
+
+ a_t = alphas[index].item() * s_x
+ a_prev = alphas_prev[index].item() * s_x
+ sigma_t = sigmas[index].item() * s_x
+ sqrt_one_minus_at = sqrt_one_minus_alphas[index].item() * s_x
+
+ pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+ dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * last_noise_uncond
+ noise = sigma_t * k_diffusion.sampling.torch.randn_like(x)
+ x = a_prev.sqrt() * pred_x0 + dir_xt + noise
+
+ if callback is not None:
+ callback({'x': x, 'i': i, 'sigma': 0, 'sigma_hat': 0, 'denoised': pred_x0})
+
+ return x
+
+
@torch.no_grad()
def plms(model, x, timesteps, extra_args=None, callback=None, disable=None):
alphas_cumprod = model.inner_model.inner_model.alphas_cumprod
alphas = alphas_cumprod[timesteps]
- alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(torch.float64 if x.device.type != 'mps' and x.device.type != 'xpu' else torch.float32)
+ alphas_prev = alphas_cumprod[torch.nn.functional.pad(timesteps[:-1], pad=(1, 0))].to(float64(x))
sqrt_one_minus_alphas = torch.sqrt(1 - alphas)
extra_args = {} if extra_args is None else extra_args
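In standard DDIM notation (with cumulative product \bar\alpha_t), the ddim_cfgpp step above computes the x_0 prediction from the guided noise but walks the direction term with the unconditional noise, which is the core CFG++ substitution:

    \hat{x}_0 = \frac{x_t - \sqrt{1-\bar\alpha_t}\,\epsilon_\theta^{\mathrm{cfg}}(x_t)}{\sqrt{\bar\alpha_t}}, \qquad
    x_{t-1} = \sqrt{\bar\alpha_{t-1}}\,\hat{x}_0 + \sqrt{1-\bar\alpha_{t-1}-\sigma_t^2}\;\epsilon_\theta^{\mathrm{uncond}}(x_t) + \sigma_t z, \quad z \sim \mathcal{N}(0, I)

Plain DDIM uses \epsilon_\theta^{\mathrm{cfg}} in both places; setting \sigma_t = 0 (eta = 0) makes the step deterministic.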
diff --git a/modules/sd_schedulers.py b/modules/sd_schedulers.py
index 75eb3ac03..f4d16e309 100644
--- a/modules/sd_schedulers.py
+++ b/modules/sd_schedulers.py
@@ -1,8 +1,18 @@
import dataclasses
-
import torch
-
import k_diffusion
+import numpy as np
+from scipy import stats
+
+from modules import shared
+
+
+def to_d(x, sigma, denoised):
+ """Converts a denoiser output to a Karras ODE derivative."""
+ return (x - denoised) / sigma
+
+
+k_diffusion.sampling.to_d = to_d
@dataclasses.dataclass
@@ -17,7 +27,7 @@ class Scheduler:
def uniform(n, sigma_min, sigma_max, inner_model, device):
- return inner_model.get_sigmas(n)
+ return inner_model.get_sigmas(n).to(device)
def sgm_uniform(n, sigma_min, sigma_max, inner_model, device):
@@ -31,6 +41,92 @@ def sgm_uniform(n, sigma_min, sigma_max, inner_model, device):
return torch.FloatTensor(sigs).to(device)
+def get_align_your_steps_sigmas(n, sigma_min, sigma_max, device):
+ # https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
+ def loglinear_interp(t_steps, num_steps):
+ """
+ Performs log-linear interpolation of a given array of decreasing numbers.
+ """
+ xs = np.linspace(0, 1, len(t_steps))
+ ys = np.log(t_steps[::-1])
+
+ new_xs = np.linspace(0, 1, num_steps)
+ new_ys = np.interp(new_xs, xs, ys)
+
+ interped_ys = np.exp(new_ys)[::-1].copy()
+ return interped_ys
+
+ if shared.sd_model.is_sdxl:
+ sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.029]
+ else:
+ # Default to SD 1.5 sigmas.
+ sigmas = [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.029]
+
+ if n != len(sigmas):
+ sigmas = np.append(loglinear_interp(sigmas, n), [0.0])
+ else:
+ sigmas.append(0.0)
+
+ return torch.FloatTensor(sigmas).to(device)
+
+
+def kl_optimal(n, sigma_min, sigma_max, device):
+ alpha_min = torch.arctan(torch.tensor(sigma_min, device=device))
+ alpha_max = torch.arctan(torch.tensor(sigma_max, device=device))
+ step_indices = torch.arange(n + 1, device=device)
+ sigmas = torch.tan(step_indices / n * alpha_min + (1.0 - step_indices / n) * alpha_max)
+ return sigmas
+
+
+def simple_scheduler(n, sigma_min, sigma_max, inner_model, device):
+ sigs = []
+ ss = len(inner_model.sigmas) / n
+ for x in range(n):
+ sigs += [float(inner_model.sigmas[-(1 + int(x * ss))])]
+ sigs += [0.0]
+ return torch.FloatTensor(sigs).to(device)
+
+
+def normal_scheduler(n, sigma_min, sigma_max, inner_model, device, sgm=False, floor=False):
+ start = inner_model.sigma_to_t(torch.tensor(sigma_max))
+ end = inner_model.sigma_to_t(torch.tensor(sigma_min))
+
+ if sgm:
+ timesteps = torch.linspace(start, end, n + 1)[:-1]
+ else:
+ timesteps = torch.linspace(start, end, n)
+
+ sigs = []
+ for x in range(len(timesteps)):
+ ts = timesteps[x]
+ sigs.append(inner_model.t_to_sigma(ts))
+ sigs += [0.0]
+ return torch.FloatTensor(sigs).to(device)
+
+
+def ddim_scheduler(n, sigma_min, sigma_max, inner_model, device):
+ sigs = []
+ ss = max(len(inner_model.sigmas) // n, 1)
+ x = 1
+ while x < len(inner_model.sigmas):
+ sigs += [float(inner_model.sigmas[x])]
+ x += ss
+ sigs = sigs[::-1]
+ sigs += [0.0]
+ return torch.FloatTensor(sigs).to(device)
+
+
+def beta_scheduler(n, sigma_min, sigma_max, inner_model, device):
+ # From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024) """
+ alpha = shared.opts.beta_dist_alpha
+ beta = shared.opts.beta_dist_beta
+ timesteps = 1 - np.linspace(0, 1, n)
+ timesteps = [stats.beta.ppf(x, alpha, beta) for x in timesteps]
+ sigmas = [sigma_min + (x * (sigma_max-sigma_min)) for x in timesteps]
+ sigmas += [0.0]
+ return torch.FloatTensor(sigmas).to(device)
+
+
schedulers = [
Scheduler('automatic', 'Automatic', None),
Scheduler('uniform', 'Uniform', uniform, need_inner_model=True),
@@ -38,6 +134,12 @@ schedulers = [
Scheduler('exponential', 'Exponential', k_diffusion.sampling.get_sigmas_exponential),
Scheduler('polyexponential', 'Polyexponential', k_diffusion.sampling.get_sigmas_polyexponential, default_rho=1.0),
Scheduler('sgm_uniform', 'SGM Uniform', sgm_uniform, need_inner_model=True, aliases=["SGMUniform"]),
+ Scheduler('kl_optimal', 'KL Optimal', kl_optimal),
+ Scheduler('align_your_steps', 'Align Your Steps', get_align_your_steps_sigmas),
+ Scheduler('simple', 'Simple', simple_scheduler, need_inner_model=True),
+ Scheduler('normal', 'Normal', normal_scheduler, need_inner_model=True),
+ Scheduler('ddim', 'DDIM', ddim_scheduler, need_inner_model=True),
+ Scheduler('beta', 'Beta', beta_scheduler, need_inner_model=True),
]
schedulers_map = {**{x.name: x for x in schedulers}, **{x.label: x for x in schedulers}}
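For a feel of the new Beta schedule, the ppf-based construction runs standalone; alpha = beta = 0.6 is assumed here as the option defaults:

    import numpy as np
    from scipy import stats

    def beta_sigmas(n, sigma_min, sigma_max, alpha=0.6, beta=0.6):
        timesteps = 1 - np.linspace(0, 1, n)
        timesteps = [stats.beta.ppf(x, alpha, beta) for x in timesteps]
        return [sigma_min + x * (sigma_max - sigma_min) for x in timesteps] + [0.0]

    # alpha, beta < 1 gives a U-shaped density, so steps cluster near sigma_max and sigma_min
    print(beta_sigmas(8, 0.03, 14.6))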
diff --git a/modules/sd_vae_approx.py b/modules/sd_vae_approx.py
index 3965e223e..c5dda7431 100644
--- a/modules/sd_vae_approx.py
+++ b/modules/sd_vae_approx.py
@@ -8,9 +8,9 @@ sd_vae_approx_models = {}
class VAEApprox(nn.Module):
- def __init__(self):
+ def __init__(self, latent_channels=4):
super(VAEApprox, self).__init__()
- self.conv1 = nn.Conv2d(4, 8, (7, 7))
+ self.conv1 = nn.Conv2d(latent_channels, 8, (7, 7))
self.conv2 = nn.Conv2d(8, 16, (5, 5))
self.conv3 = nn.Conv2d(16, 32, (3, 3))
self.conv4 = nn.Conv2d(32, 64, (3, 3))
@@ -40,7 +40,13 @@ def download_model(model_path, model_url):
def model():
- model_name = "vaeapprox-sdxl.pt" if getattr(shared.sd_model, 'is_sdxl', False) else "model.pt"
+ if shared.sd_model.is_sd3:
+ model_name = "vaeapprox-sd3.pt"
+ elif shared.sd_model.is_sdxl:
+ model_name = "vaeapprox-sdxl.pt"
+ else:
+ model_name = "model.pt"
+
loaded_model = sd_vae_approx_models.get(model_name)
if loaded_model is None:
@@ -52,7 +58,7 @@ def model():
model_path = os.path.join(paths.models_path, "VAE-approx", model_name)
download_model(model_path, 'https://github.com/AUTOMATIC1111/stable-diffusion-webui/releases/download/v1.0.0-pre/' + model_name)
- loaded_model = VAEApprox()
+ loaded_model = VAEApprox(latent_channels=shared.sd_model.latent_channels)
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu' if devices.device.type != 'cuda' else None))
loaded_model.eval()
loaded_model.to(devices.device, devices.dtype)
@@ -64,7 +70,18 @@ def model():
def cheap_approximation(sample):
# https://discuss.huggingface.co/t/decoding-latents-to-rgb-without-upscaling/23204/2
- if shared.sd_model.is_sdxl:
+ if shared.sd_model.is_sd3:
+ coeffs = [
+ [-0.0645, 0.0177, 0.1052], [ 0.0028, 0.0312, 0.0650],
+ [ 0.1848, 0.0762, 0.0360], [ 0.0944, 0.0360, 0.0889],
+ [ 0.0897, 0.0506, -0.0364], [-0.0020, 0.1203, 0.0284],
+ [ 0.0855, 0.0118, 0.0283], [-0.0539, 0.0658, 0.1047],
+ [-0.0057, 0.0116, 0.0700], [-0.0412, 0.0281, -0.0039],
+ [ 0.1106, 0.1171, 0.1220], [-0.0248, 0.0682, -0.0481],
+ [ 0.0815, 0.0846, 0.1207], [-0.0120, -0.0055, -0.0867],
+ [-0.0749, -0.0634, -0.0456], [-0.1418, -0.1457, -0.1259],
+ ]
+ elif shared.sd_model.is_sdxl:
coeffs = [
[ 0.3448, 0.4168, 0.4395],
[-0.1953, -0.0290, 0.0250],
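The cheap approximation is a per-channel linear map from latent channels to RGB; the coefficient tables above are just the weights of that map. In miniature (random coefficients standing in for the tuned values):

    import torch

    latent = torch.randn(16, 64, 64)   # SD3 latent: 16 channels
    coeffs = torch.randn(16, 3)        # placeholder for the tuned table above
    rgb = torch.einsum("lxy,lc->cxy", latent, coeffs)
    assert rgb.shape == (3, 64, 64)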
diff --git a/modules/sd_vae_taesd.py b/modules/sd_vae_taesd.py
index 808eb3624..d06253d2a 100644
--- a/modules/sd_vae_taesd.py
+++ b/modules/sd_vae_taesd.py
@@ -34,9 +34,9 @@ class Block(nn.Module):
return self.fuse(self.conv(x) + self.skip(x))
-def decoder():
+def decoder(latent_channels=4):
return nn.Sequential(
- Clamp(), conv(4, 64), nn.ReLU(),
+ Clamp(), conv(latent_channels, 64), nn.ReLU(),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
@@ -44,13 +44,13 @@ def decoder():
)
-def encoder():
+def encoder(latent_channels=4):
return nn.Sequential(
conv(3, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
- conv(64, 4),
+ conv(64, latent_channels),
)
@@ -58,10 +58,14 @@ class TAESDDecoder(nn.Module):
latent_magnitude = 3
latent_shift = 0.5
- def __init__(self, decoder_path="taesd_decoder.pth"):
+ def __init__(self, decoder_path="taesd_decoder.pth", latent_channels=None):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
- self.decoder = decoder()
+
+ if latent_channels is None:
+ latent_channels = 16 if "taesd3" in str(decoder_path) else 4
+
+ self.decoder = decoder(latent_channels)
self.decoder.load_state_dict(
torch.load(decoder_path, map_location='cpu' if devices.device.type != 'cuda' else None))
@@ -70,10 +74,14 @@ class TAESDEncoder(nn.Module):
latent_magnitude = 3
latent_shift = 0.5
- def __init__(self, encoder_path="taesd_encoder.pth"):
+ def __init__(self, encoder_path="taesd_encoder.pth", latent_channels=None):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
- self.encoder = encoder()
+
+ if latent_channels is None:
+ latent_channels = 16 if "taesd3" in str(encoder_path) else 4
+
+ self.encoder = encoder(latent_channels)
self.encoder.load_state_dict(
torch.load(encoder_path, map_location='cpu' if devices.device.type != 'cuda' else None))
@@ -87,7 +95,13 @@ def download_model(model_path, model_url):
def decoder_model():
- model_name = "taesdxl_decoder.pth" if getattr(shared.sd_model, 'is_sdxl', False) else "taesd_decoder.pth"
+ if shared.sd_model.is_sd3:
+ model_name = "taesd3_decoder.pth"
+ elif shared.sd_model.is_sdxl:
+ model_name = "taesdxl_decoder.pth"
+ else:
+ model_name = "taesd_decoder.pth"
+
loaded_model = sd_vae_taesd_models.get(model_name)
if loaded_model is None:
@@ -106,7 +120,13 @@ def decoder_model():
def encoder_model():
- model_name = "taesdxl_encoder.pth" if getattr(shared.sd_model, 'is_sdxl', False) else "taesd_encoder.pth"
+ if shared.sd_model.is_sd3:
+ model_name = "taesd3_encoder.pth"
+ elif shared.sd_model.is_sdxl:
+ model_name = "taesdxl_encoder.pth"
+ else:
+ model_name = "taesd_encoder.pth"
+
loaded_model = sd_vae_taesd_models.get(model_name)
if loaded_model is None:
diff --git a/modules/shared.py b/modules/shared.py
index a41cd457c..2a3787f99 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -47,7 +47,7 @@ restricted_opts: set[str] = None
sd_model: sd_models_types.WebuiSdModel = None
settings_components: dict = None
-"""assigned from ui.py, a mapping on setting names to gradio components repsponsible for those settings"""
+"""assigned from ui.py, a mapping on setting names to gradio components responsible for those settings"""
tab_names = []
diff --git a/modules/shared_gradio_themes.py b/modules/shared_gradio_themes.py
index b6dc31450..b4e3f32bc 100644
--- a/modules/shared_gradio_themes.py
+++ b/modules/shared_gradio_themes.py
@@ -69,3 +69,44 @@ def reload_gradio_theme(theme_name=None):
# append additional values gradio_theme
shared.gradio_theme.sd_webui_modal_lightbox_toolbar_opacity = shared.opts.sd_webui_modal_lightbox_toolbar_opacity
shared.gradio_theme.sd_webui_modal_lightbox_icon_opacity = shared.opts.sd_webui_modal_lightbox_icon_opacity
+
+
+def resolve_var(name: str, gradio_theme=None, history=None):
+ """
+ Attempt to resolve a theme variable name to its value
+
+ Parameters:
+ name (str): The name of the theme variable,
+ e.g. "background_fill_primary", "background_fill_primary_dark";
+ spaces and a leading asterisk (*) are stripped from the name before lookup
+ gradio_theme (gradio.themes.ThemeClass): The theme object to resolve the variable from;
+ leave blank to use the webui default shared.gradio_theme
+ history (list): A list of previously resolved variables, used to detect circular references;
+ leave blank for regular use
+ Returns:
+ str: The resolved value
+
+ Error handling:
+ returns either #000000 or #ffffff, depending on whether the initial name ends with "_dark"
+ """
+ try:
+ if history is None:
+ history = []
+ if gradio_theme is None:
+ gradio_theme = shared.gradio_theme
+
+ name = name.strip()
+ name = name[1:] if name.startswith("*") else name
+
+ if name in history:
+ raise ValueError(f'Circular references: name "{name}" in {history}')
+
+ if value := getattr(gradio_theme, name, None):
+ return resolve_var(value, gradio_theme, history + [name])
+ else:
+ return name
+
+ except Exception:
+ name = history[0] if history else name
+ errors.report(f'resolve_var({name})', exc_info=True)
+ return '#000000' if name.endswith("_dark") else '#ffffff'
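Hypothetical usage, assuming a theme has been loaded into shared.gradio_theme:

    from modules import shared_gradio_themes

    # follows chained references like "*background_fill_primary" until a literal value remains
    color = shared_gradio_themes.resolve_var("background_fill_primary_dark")
    print(color)  # a literal css value; falls back to #000000 on error for *_dark names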
diff --git a/modules/shared_init.py b/modules/shared_init.py
index 935e3a21c..a6ad0433d 100644
--- a/modules/shared_init.py
+++ b/modules/shared_init.py
@@ -31,6 +31,14 @@ def initialize():
devices.dtype_vae = torch.float32 if cmd_opts.no_half or cmd_opts.no_half_vae else torch.float16
devices.dtype_inference = torch.float32 if cmd_opts.precision == 'full' else devices.dtype
+ if cmd_opts.precision == "half":
+ msg = "--no-half and --no-half-vae conflict with --precision half"
+ assert devices.dtype == torch.float16, msg
+ assert devices.dtype_vae == torch.float16, msg
+ assert devices.dtype_inference == torch.float16, msg
+ devices.force_fp16 = True
+ devices.force_model_fp16()
+
shared.device = devices.device
shared.weight_load_location = None if cmd_opts.lowram else "cpu"
diff --git a/modules/shared_options.py b/modules/shared_options.py
index 326a317e0..9f4520274 100644
--- a/modules/shared_options.py
+++ b/modules/shared_options.py
@@ -54,7 +54,7 @@ options_templates.update(options_section(('saving-images', "Saving images/grids"
"save_images_before_color_correction": OptionInfo(False, "Save a copy of image before applying color correction to img2img results"),
"save_mask": OptionInfo(False, "For inpainting, save a copy of the greyscale mask"),
"save_mask_composite": OptionInfo(False, "For inpainting, save a masked composite"),
- "jpeg_quality": OptionInfo(80, "Quality for saved jpeg images", gr.Slider, {"minimum": 1, "maximum": 100, "step": 1}),
+ "jpeg_quality": OptionInfo(80, "Quality for saved jpeg and avif images", gr.Slider, {"minimum": 1, "maximum": 100, "step": 1}),
"webp_lossless": OptionInfo(False, "Use lossless compression for webp images"),
"export_for_4chan": OptionInfo(True, "Save copy of large images as JPG").info("if the file size is above the limit, or either width or height are above the limit"),
"img_downscale_threshold": OptionInfo(4.0, "File size limit for the above option, MB", gr.Number),
@@ -64,6 +64,7 @@ options_templates.update(options_section(('saving-images', "Saving images/grids"
"use_original_name_batch": OptionInfo(True, "Use original name for output filename during batch process in extras tab"),
"use_upscaler_name_as_suffix": OptionInfo(False, "Use upscaler name as filename suffix in the extras tab"),
"save_selected_only": OptionInfo(True, "When using 'Save' button, only save a single selected image"),
+ "save_write_log_csv": OptionInfo(True, "Write log.csv when saving images using 'Save' button"),
"save_init_img": OptionInfo(False, "Save init images when using img2img"),
"temp_dir": OptionInfo("", "Directory for temporary images; leave empty for default"),
@@ -129,6 +130,22 @@ options_templates.update(options_section(('system', "System", "system"), {
"dump_stacks_on_signal": OptionInfo(False, "Print stack traces before exiting the program with ctrl+c."),
}))
+options_templates.update(options_section(('profiler', "Profiler", "system"), {
+ "profiling_explanation": OptionHTML("""
+These settings allow you to enable the torch profiler when generating pictures.
+Profiling lets you see how much of the computer's resources each piece of code uses during generation.
+Each generation writes its own profile to a single file, overwriting the previous one.
+The file can be viewed in Chrome, or on the Perfetto website.
+Warning: writing the profile can take a lot of time, up to 30 seconds, and the file itself can be around 500MB in size.
+"""),
+ "profiling_enable": OptionInfo(False, "Enable profiling"),
+ "profiling_activities": OptionInfo(["CPU"], "Activities", gr.CheckboxGroup, {"choices": ["CPU", "CUDA"]}),
+ "profiling_record_shapes": OptionInfo(True, "Record shapes"),
+ "profiling_profile_memory": OptionInfo(True, "Profile memory"),
+ "profiling_with_stack": OptionInfo(True, "Include python stack"),
+ "profiling_filename": OptionInfo("trace.json", "Profile filename"),
+}))
+
options_templates.update(options_section(('API', "API", "system"), {
"api_enable_requests": OptionInfo(True, "Allow http:// and https:// URLs for input images in API", restrict_api=True),
"api_forbid_local_requests": OptionInfo(True, "Forbid URLs to local resources", restrict_api=True),
@@ -160,6 +177,7 @@ options_templates.update(options_section(('sd', "Stable Diffusion", "sd"), {
"emphasis": OptionInfo("Original", "Emphasis mode", gr.Radio, lambda: {"choices": [x.name for x in sd_emphasis.options]}, infotext="Emphasis").info("makes it possible to make model to pay (more:1.1) or (less:0.9) attention to text when you use the syntax in prompt; " + sd_emphasis.get_options_descriptions()),
"enable_batch_seeds": OptionInfo(True, "Make K-diffusion samplers produce same images in a batch as when making a single image"),
"comma_padding_backtrack": OptionInfo(20, "Prompt word wrap length limit", gr.Slider, {"minimum": 0, "maximum": 74, "step": 1}).info("in tokens - for texts shorter than specified, if they don't fit into 75 token limit, move them to the next 75 token chunk"),
+ "sdxl_clip_l_skip": OptionInfo(False, "Clip skip SDXL", gr.Checkbox).info("Enable CLIP skip for the secondary CLIP model in SDXL. Has no effect on SD 1.5 or SD 2.0/2.1."),
"CLIP_stop_at_last_layers": OptionInfo(1, "Clip skip", gr.Slider, {"minimum": 1, "maximum": 12, "step": 1}, infotext="Clip skip").link("wiki", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#clip-skip").info("ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer"),
"upcast_attn": OptionInfo(False, "Upcast cross attention layer to float32"),
"randn_source": OptionInfo("GPU", "Random number generator source.", gr.Radio, {"choices": ["GPU", "CPU", "NV"]}, infotext="RNG").info("changes seeds drastically; use CPU to produce the same picture across different videocard vendors; use NV to produce same picture as on NVidia videocards"),
@@ -174,6 +192,10 @@ options_templates.update(options_section(('sdxl', "Stable Diffusion XL", "sd"),
"sdxl_refiner_high_aesthetic_score": OptionInfo(6.0, "SDXL high aesthetic score", gr.Number).info("used for refiner model prompt"),
}))
+options_templates.update(options_section(('sd3', "Stable Diffusion 3", "sd"), {
+ "sd3_enable_t5": OptionInfo(False, "Enable T5").info("load T5 text encoder; increases VRAM use by a lot, potentially improving quality of generation; requires model reload to apply"),
+}))
+
options_templates.update(options_section(('vae', "VAE", "sd"), {
"sd_vae_explanation": OptionHTML("""
VAE is a neural network that transforms a standard RGB
@@ -209,7 +231,8 @@ options_templates.update(options_section(('img2img', "img2img", "sd"), {
options_templates.update(options_section(('optimizations', "Optimizations", "sd"), {
"cross_attention_optimization": OptionInfo("Automatic", "Cross attention optimization", gr.Dropdown, lambda: {"choices": shared_items.cross_attention_optimizations()}),
- "s_min_uncond": OptionInfo(0.0, "Negative Guidance minimum sigma", gr.Slider, {"minimum": 0.0, "maximum": 15.0, "step": 0.01}).link("PR", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/9177").info("skip negative prompt for some steps when the image is almost ready; 0=disable, higher=faster"),
+ "s_min_uncond": OptionInfo(0.0, "Negative Guidance minimum sigma", gr.Slider, {"minimum": 0.0, "maximum": 15.0, "step": 0.01}, infotext='NGMS').link("PR", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/9177").info("skip negative prompt for some steps when the image is almost ready; 0=disable, higher=faster"),
+ "s_min_uncond_all": OptionInfo(False, "Negative Guidance minimum sigma all steps", infotext='NGMS all steps').info("By default, NGMS above skips every other step; this makes it skip all steps"),
"token_merging_ratio": OptionInfo(0.0, "Token merging ratio", gr.Slider, {"minimum": 0.0, "maximum": 0.9, "step": 0.1}, infotext='Token merging ratio').link("PR", "https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/9256").info("0=disable, higher=faster"),
"token_merging_ratio_img2img": OptionInfo(0.0, "Token merging ratio for img2img", gr.Slider, {"minimum": 0.0, "maximum": 0.9, "step": 0.1}).info("only applies if non-zero and overrides above"),
"token_merging_ratio_hr": OptionInfo(0.0, "Token merging ratio for high-res pass", gr.Slider, {"minimum": 0.0, "maximum": 0.9, "step": 0.1}, infotext='Token merging ratio hr').info("only applies if non-zero and overrides above"),
@@ -227,7 +250,6 @@ options_templates.update(options_section(('compatibility', "Compatibility", "sd"
"use_old_karras_scheduler_sigmas": OptionInfo(False, "Use old karras scheduler sigmas (0.1 to 10)."),
"no_dpmpp_sde_batch_determinism": OptionInfo(False, "Do not make DPM++ SDE deterministic across different batch sizes."),
"use_old_hires_fix_width_height": OptionInfo(False, "For hires fix, use width/height sliders to set final resolution rather than first pass (disables Upscale by, Resize width/height to)."),
- "dont_fix_second_order_samplers_schedule": OptionInfo(False, "Do not fix prompt schedule for second order samplers."),
"hires_fix_use_firstpass_conds": OptionInfo(False, "For hires fix, calculate conds of second pass using extra networks of first pass."),
"use_old_scheduling": OptionInfo(False, "Use old prompt editing timelines.", infotext="Old prompt editing timelines").info("For [red:green:N]; old: If N < 1, it's a fraction of steps (and hires fix uses range from 0 to 1), if N >= 1, it's an absolute number of steps; new: If N has a decimal point in it, it's a fraction of steps (and hires fix uses range from 1 to 2), othewrwise it's an absolute number of steps"),
"use_downcasted_alpha_bar": OptionInfo(False, "Downcast model alphas_cumprod to fp16 before sampling. For reproducing old seeds.", infotext="Downcast alphas_cumprod"),
@@ -359,6 +381,7 @@ options_templates.update(options_section(('ui', "Live previews", "ui"), {
"live_preview_refresh_period": OptionInfo(1000, "Progressbar and preview update period").info("in milliseconds"),
"live_preview_fast_interrupt": OptionInfo(False, "Return image with chosen live preview method on interrupt").info("makes interrupts faster"),
"js_live_preview_in_modal_lightbox": OptionInfo(False, "Show Live preview in full page image viewer"),
+ "prevent_screen_sleep_during_generation": OptionInfo(True, "Prevent screen sleep during generation"),
}))
options_templates.update(options_section(('sampler-params', "Sampler parameters", "sd"), {
@@ -380,7 +403,10 @@ options_templates.update(options_section(('sampler-params', "Sampler parameters"
'uni_pc_skip_type': OptionInfo("time_uniform", "UniPC skip type", gr.Radio, {"choices": ["time_uniform", "time_quadratic", "logSNR"]}, infotext='UniPC skip type'),
'uni_pc_order': OptionInfo(3, "UniPC order", gr.Slider, {"minimum": 1, "maximum": 50, "step": 1}, infotext='UniPC order').info("must be < sampling steps"),
'uni_pc_lower_order_final': OptionInfo(True, "UniPC lower order final", infotext='UniPC lower order final'),
- 'sd_noise_schedule': OptionInfo("Default", "Noise schedule for sampling", gr.Radio, {"choices": ["Default", "Zero Terminal SNR"]}, infotext="Noise Schedule").info("for use with zero terminal SNR trained models")
+ 'sd_noise_schedule': OptionInfo("Default", "Noise schedule for sampling", gr.Radio, {"choices": ["Default", "Zero Terminal SNR"]}, infotext="Noise Schedule").info("for use with zero terminal SNR trained models"),
+ 'skip_early_cond': OptionInfo(0.0, "Ignore negative prompt during early sampling", gr.Slider, {"minimum": 0.0, "maximum": 1.0, "step": 0.01}, infotext="Skip Early CFG").info("disables CFG on a proportion of steps at the beginning of generation; 0=skip none; 1=skip all; can both improve sample diversity/quality and speed up sampling"),
+ 'beta_dist_alpha': OptionInfo(0.6, "Beta scheduler - alpha", gr.Slider, {"minimum": 0.01, "maximum": 1.0, "step": 0.01}, infotext='Beta scheduler alpha').info('Default = 0.6; the alpha parameter of the beta distribution used in Beta sampling'),
+ 'beta_dist_beta': OptionInfo(0.6, "Beta scheduler - beta", gr.Slider, {"minimum": 0.01, "maximum": 1.0, "step": 0.01}, infotext='Beta scheduler beta').info('Default = 0.6; the beta parameter of the beta distribution used in Beta sampling'),
}))
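
The two beta options parameterize a beta-distribution sigma schedule. A sketch of the construction, assuming the usual inverse-CDF approach via scipy's `beta.ppf` (exact clamping in the webui scheduler may differ):

```python
import numpy as np
import scipy.stats
import torch

def beta_sigmas(n: int, sigma_min: float, sigma_max: float, alpha: float = 0.6, beta: float = 0.6) -> torch.Tensor:
    # walk t from 1 down to 0 and warp the spacing through the Beta(alpha, beta) inverse CDF
    timesteps = 1.0 - np.linspace(0.0, 1.0, n)
    warped = scipy.stats.beta.ppf(timesteps, alpha, beta)
    sigmas = sigma_min + warped * (sigma_max - sigma_min)
    return torch.cat([torch.as_tensor(sigmas, dtype=torch.float32), torch.zeros(1)])
```

With alpha = beta = 0.6 the schedule spends more steps near both ends of the sigma range; alpha = beta = 1.0 degenerates to a linear schedule.
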
options_templates.update(options_section(('postprocessing', "Postprocessing", "postprocessing"), {
diff --git a/modules/shared_state.py b/modules/shared_state.py
index f74eafc58..4cd53af62 100644
--- a/modules/shared_state.py
+++ b/modules/shared_state.py
@@ -162,7 +162,7 @@ class State:
errors.record_exception()
def assign_current_image(self, image):
- if shared.opts.live_previews_image_format == 'jpeg' and image.mode == 'RGBA':
+ if shared.opts.live_previews_image_format == 'jpeg' and image.mode in ('RGBA', 'P'):
image = image.convert('RGB')
self.current_image = image
self.id_live_preview += 1
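
The added `'P'` check matters because Pillow's JPEG encoder accepts neither palette nor alpha images. A quick illustration of the same guard:

```python
from PIL import Image

for mode in ('RGBA', 'P'):
    image = Image.new(mode, (8, 8))
    if image.mode in ('RGBA', 'P'):  # same conversion as assign_current_image
        image = image.convert('RGB')
    image.save(f'preview_{mode}.jpg', format='JPEG')  # raises OSError without the convert
```
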
diff --git a/modules/sysinfo.py b/modules/sysinfo.py
index f336251e4..e9a83d74e 100644
--- a/modules/sysinfo.py
+++ b/modules/sysinfo.py
@@ -1,15 +1,13 @@
import json
import os
import sys
-
+import subprocess
import platform
import hashlib
-import pkg_resources
-import psutil
import re
+from pathlib import Path
-import launch
-from modules import paths_internal, timer, shared, extensions, errors
+from modules import paths_internal, timer, shared_cmd_options, errors, launch_utils
checksum_token = "DontStealMyGamePlz__WINNERS_DONT_USE_DRUGS__DONT_COPY_THAT_FLOPPY"
environment_whitelist = {
@@ -69,14 +67,46 @@ def check(x):
return h.hexdigest() == m.group(1)
-def get_dict():
- ram = psutil.virtual_memory()
+def get_cpu_info():
+ cpu_info = {"model": platform.processor()}
+ try:
+ import psutil
+ cpu_info["count logical"] = psutil.cpu_count(logical=True)
+ cpu_info["count physical"] = psutil.cpu_count(logical=False)
+ except Exception as e:
+ cpu_info["error"] = str(e)
+ return cpu_info
+
+def get_ram_info():
+ try:
+ import psutil
+ ram = psutil.virtual_memory()
+ return {x: pretty_bytes(getattr(ram, x, 0)) for x in ["total", "used", "free", "active", "inactive", "buffers", "cached", "shared"] if getattr(ram, x, 0) != 0}
+ except Exception as e:
+ return str(e)
+
+
+def get_packages():
+ try:
+ return subprocess.check_output([sys.executable, '-m', 'pip', 'freeze', '--all']).decode("utf8").splitlines()
+ except Exception as pip_error:
+ try:
+ import importlib.metadata
+ packages = importlib.metadata.distributions()
+ return sorted([f"{package.metadata['Name']}=={package.version}" for package in packages])
+ except Exception as e2:
+ return {'error pip': pip_error, 'error importlib': str(e2)}
+
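
The rewritten package listing drops the deprecated `pkg_resources` API. The same two-tier fallback works standalone; `importlib.metadata` has been in the stdlib since Python 3.8:

```python
import subprocess
import sys

def list_packages() -> list[str]:
    try:
        # preferred: exact `pip freeze` output, including pip and setuptools themselves
        return subprocess.check_output([sys.executable, '-m', 'pip', 'freeze', '--all']).decode('utf8').splitlines()
    except Exception:
        # fallback when pip is unavailable: read installed distributions directly
        import importlib.metadata
        return sorted(f"{d.metadata['Name']}=={d.version}" for d in importlib.metadata.distributions())
```
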
+
+def get_dict():
+ config = get_config()
res = {
"Platform": platform.platform(),
"Python": platform.python_version(),
- "Version": launch.git_tag(),
- "Commit": launch.commit_hash(),
+ "Version": launch_utils.git_tag(),
+ "Commit": launch_utils.commit_hash(),
+ "Git status": git_status(paths_internal.script_path),
"Script path": paths_internal.script_path,
"Data path": paths_internal.data_path,
"Extensions dir": paths_internal.extensions_dir,
@@ -84,20 +114,14 @@ def get_dict():
"Commandline": get_argv(),
"Torch env info": get_torch_sysinfo(),
"Exceptions": errors.get_exceptions(),
- "CPU": {
- "model": platform.processor(),
- "count logical": psutil.cpu_count(logical=True),
- "count physical": psutil.cpu_count(logical=False),
- },
- "RAM": {
- x: pretty_bytes(getattr(ram, x, 0)) for x in ["total", "used", "free", "active", "inactive", "buffers", "cached", "shared"] if getattr(ram, x, 0) != 0
- },
- "Extensions": get_extensions(enabled=True),
- "Inactive extensions": get_extensions(enabled=False),
+ "CPU": get_cpu_info(),
+ "RAM": get_ram_info(),
+ "Extensions": get_extensions(enabled=True, fallback_disabled_extensions=config.get('disabled_extensions', [])),
+ "Inactive extensions": get_extensions(enabled=False, fallback_disabled_extensions=config.get('disabled_extensions', [])),
"Environment": get_environment(),
- "Config": get_config(),
+ "Config": config,
"Startup": timer.startup_record,
- "Packages": sorted([f"{pkg.key}=={pkg.version}" for pkg in pkg_resources.working_set]),
+ "Packages": get_packages(),
}
return res
@@ -111,11 +135,11 @@ def get_argv():
res = []
for v in sys.argv:
- if shared.cmd_opts.gradio_auth and shared.cmd_opts.gradio_auth == v:
+ if shared_cmd_options.cmd_opts.gradio_auth and shared_cmd_options.cmd_opts.gradio_auth == v:
res.append("")
continue
- if shared.cmd_opts.api_auth and shared.cmd_opts.api_auth == v:
+ if shared_cmd_options.cmd_opts.api_auth and shared_cmd_options.cmd_opts.api_auth == v:
res.append("")
continue
@@ -123,6 +147,7 @@ def get_argv():
return res
+
re_newline = re.compile(r"\r*\n")
@@ -136,25 +161,55 @@ def get_torch_sysinfo():
return str(e)
-def get_extensions(*, enabled):
-
+def run_git(path, *args):
try:
- def to_json(x: extensions.Extension):
- return {
- "name": x.name,
- "path": x.path,
- "version": x.version,
- "branch": x.branch,
- "remote": x.remote,
- }
+ return subprocess.check_output([launch_utils.git, '-C', path, *args], shell=False, encoding='utf8').strip()
+ except Exception as e:
+ return str(e)
- return [to_json(x) for x in extensions.extensions if not x.is_builtin and x.enabled == enabled]
+
+def git_status(path):
+ if (Path(path) / '.git').is_dir():
+ return run_git(path, 'status')
+
+
+def get_info_from_repo_path(path: Path):
+ is_repo = (path / '.git').is_dir()
+ return {
+ 'name': path.name,
+ 'path': str(path),
+ 'commit': run_git(path, 'rev-parse', 'HEAD') if is_repo else None,
+ 'branch': run_git(path, 'branch', '--show-current') if is_repo else None,
+ 'remote': run_git(path, 'remote', 'get-url', 'origin') if is_repo else None,
+ }
+
+
+def get_extensions(*, enabled, fallback_disabled_extensions=None):
+ try:
+ from modules import extensions
+ if extensions.extensions:
+ def to_json(x: extensions.Extension):
+ return {
+ "name": x.name,
+ "path": x.path,
+ "commit": x.commit_hash,
+ "branch": x.branch,
+ "remote": x.remote,
+ }
+ return [to_json(x) for x in extensions.extensions if not x.is_builtin and x.enabled == enabled]
+ else:
+ return [get_info_from_repo_path(d) for d in Path(paths_internal.extensions_dir).iterdir() if d.is_dir() and enabled != (str(d.name) in fallback_disabled_extensions)]
except Exception as e:
return str(e)
def get_config():
try:
+ from modules import shared
return shared.opts.data
- except Exception as e:
- return str(e)
+ except Exception as _:
+ try:
+ with open(shared_cmd_options.cmd_opts.ui_settings_file, 'r') as f:
+ return json.load(f)
+ except Exception as e:
+ return str(e)
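
Outside the webui the same subprocess approach recovers repository metadata with a plain `git` binary (the diff goes through `launch_utils.git` so a configured git path is respected):

```python
import subprocess
from pathlib import Path

def run_git(path: Path, *args: str) -> str:
    try:
        return subprocess.check_output(['git', '-C', str(path), *args], encoding='utf8').strip()
    except Exception as e:
        return str(e)  # sysinfo reports the error text instead of raising

repo = Path('.')
if (repo / '.git').is_dir():
    print(run_git(repo, 'rev-parse', 'HEAD'))
    print(run_git(repo, 'branch', '--show-current'))
    print(run_git(repo, 'remote', 'get-url', 'origin'))
```
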
diff --git a/modules/textual_inversion/textual_inversion.py b/modules/textual_inversion/textual_inversion.py
index 253f219c4..dc7833e93 100644
--- a/modules/textual_inversion/textual_inversion.py
+++ b/modules/textual_inversion/textual_inversion.py
@@ -181,12 +181,16 @@ class EmbeddingDatabase:
else:
return
- embedding = create_embedding_from_data(data, name, filename=filename, filepath=path)
+ if data is not None:
+ embedding = create_embedding_from_data(data, name, filename=filename, filepath=path)
- if self.expected_shape == -1 or self.expected_shape == embedding.shape:
- self.register_embedding(embedding, shared.sd_model)
+ if self.expected_shape == -1 or self.expected_shape == embedding.shape:
+ self.register_embedding(embedding, shared.sd_model)
+ else:
+ self.skipped_embeddings[name] = embedding
else:
- self.skipped_embeddings[name] = embedding
+ print(f"Unable to load Textual inversion embedding due to data issue: '{name}'.")
+
def load_from_dir(self, embdir):
if not os.path.isdir(embdir.path):
diff --git a/modules/torch_utils.py b/modules/torch_utils.py
index e5b52393e..5ea3da094 100644
--- a/modules/torch_utils.py
+++ b/modules/torch_utils.py
@@ -1,6 +1,7 @@
from __future__ import annotations
import torch.nn
+import torch
def get_param(model) -> torch.nn.Parameter:
@@ -15,3 +16,10 @@ def get_param(model) -> torch.nn.Parameter:
return param
raise ValueError(f"No parameters found in model {model!r}")
+
+
+def float64(t: torch.Tensor):
+ """return torch.float64 if device is not mps or xpu, else return torch.float32"""
+ if t.device.type in ['mps', 'xpu']:
+ return torch.float32
+ return torch.float64
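
A usage sketch for the new helper: pick the widest float dtype the tensor's device supports (MPS and XPU lack float64 kernels), accumulate there, and cast back:

```python
import torch

def float64(t: torch.Tensor):
    """return torch.float64 if device is not mps or xpu, else return torch.float32"""
    if t.device.type in ['mps', 'xpu']:
        return torch.float32
    return torch.float64

t = torch.randn(1024)
mean = t.to(float64(t)).mean().to(t.dtype)  # high-precision reduction, result in the original dtype
```
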
diff --git a/modules/ui.py b/modules/ui.py
index 403425f29..f48638f69 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -10,7 +10,7 @@ import gradio as gr
import gradio.utils
import numpy as np
from PIL import Image, PngImagePlugin # noqa: F401
-from modules.call_queue import wrap_gradio_gpu_call, wrap_queued_call, wrap_gradio_call
+from modules.call_queue import wrap_gradio_gpu_call, wrap_queued_call, wrap_gradio_call, wrap_gradio_call_no_job # noqa: F401
from modules import gradio_extensons, sd_schedulers # noqa: F401
from modules import sd_hijack, sd_models, script_callbacks, ui_extensions, deepbooru, extra_networks, ui_common, ui_postprocessing, progress, ui_loadsave, shared_items, ui_settings, timer, sysinfo, ui_checkpoint_merger, scripts, sd_samplers, processing, ui_extra_networks, ui_toprow, launch_utils
@@ -38,9 +38,11 @@ warnings.filterwarnings("default" if opts.show_gradio_deprecation_warnings else
# this is a fix for Windows users. Without it, javascript files will be served with text/html content-type and the browser will not show any UI
mimetypes.init()
mimetypes.add_type('application/javascript', '.js')
+mimetypes.add_type('application/javascript', '.mjs')
# Likewise, add explicit content-type header for certain missing image types
mimetypes.add_type('image/webp', '.webp')
+mimetypes.add_type('image/avif', '.avif')
if not cmd_opts.share and not cmd_opts.listen:
# fix gradio phoning home
@@ -566,18 +568,25 @@ def create_ui():
init_mask_inpaint = gr.Image(label="Mask", source="upload", interactive=True, type="pil", image_mode="RGBA", elem_id="img_inpaint_mask")
with gr.TabItem('Batch', id='batch', elem_id="img2img_batch_tab") as tab_batch:
- hidden = ' Disabled when launched with --hide-ui-dir-config.' if shared.cmd_opts.hide_ui_dir_config else ''
- gr.HTML(
- "
Process images in a directory on the same machine where the server is running." +
- " Use an empty output directory to save pictures normally instead of writing to the output directory." +
- f" Add inpaint batch mask directory to enable inpaint batch processing."
- f"{hidden}
"
- )
- img2img_batch_input_dir = gr.Textbox(label="Input directory", **shared.hide_dirs, elem_id="img2img_batch_input_dir")
- img2img_batch_output_dir = gr.Textbox(label="Output directory", **shared.hide_dirs, elem_id="img2img_batch_output_dir")
- img2img_batch_inpaint_mask_dir = gr.Textbox(label="Inpaint batch mask directory (required for inpaint batch processing only)", **shared.hide_dirs, elem_id="img2img_batch_inpaint_mask_dir")
+ with gr.Tabs(elem_id="img2img_batch_source"):
+ img2img_batch_source_type = gr.Textbox(visible=False, value="upload")
+ with gr.TabItem('Upload', id='batch_upload', elem_id="img2img_batch_upload_tab") as tab_batch_upload:
+ img2img_batch_upload = gr.Files(label="Files", interactive=True, elem_id="img2img_batch_upload")
+ with gr.TabItem('From directory', id='batch_from_dir', elem_id="img2img_batch_from_dir_tab") as tab_batch_from_dir:
+ hidden = ' Disabled when launched with --hide-ui-dir-config.' if shared.cmd_opts.hide_ui_dir_config else ''
+ gr.HTML(
+ "
Process images in a directory on the same machine where the server is running." +
+ " Use an empty output directory to save pictures normally instead of writing to the output directory." +
+ f" Add inpaint batch mask directory to enable inpaint batch processing."
+ f"{hidden}
"
+ )
+ img2img_batch_input_dir = gr.Textbox(label="Input directory", **shared.hide_dirs, elem_id="img2img_batch_input_dir")
+ img2img_batch_output_dir = gr.Textbox(label="Output directory", **shared.hide_dirs, elem_id="img2img_batch_output_dir")
+ img2img_batch_inpaint_mask_dir = gr.Textbox(label="Inpaint batch mask directory (required for inpaint batch processing only)", **shared.hide_dirs, elem_id="img2img_batch_inpaint_mask_dir")
+ tab_batch_upload.select(fn=lambda: "upload", inputs=[], outputs=[img2img_batch_source_type])
+ tab_batch_from_dir.select(fn=lambda: "from dir", inputs=[], outputs=[img2img_batch_source_type])
with gr.Accordion("PNG info", open=False):
- img2img_batch_use_png_info = gr.Checkbox(label="Append png info to prompts", **shared.hide_dirs, elem_id="img2img_batch_use_png_info")
+ img2img_batch_use_png_info = gr.Checkbox(label="Append png info to prompts", elem_id="img2img_batch_use_png_info")
img2img_batch_png_info_dir = gr.Textbox(label="PNG info directory", **shared.hide_dirs, placeholder="Leave empty to use input directory", elem_id="img2img_batch_png_info_dir")
img2img_batch_png_info_props = gr.CheckboxGroup(["Prompt", "Negative prompt", "Seed", "CFG scale", "Sampler", "Steps", "Model hash"], label="Parameters to take from png info", info="Prompts from png info will be appended to prompts set in ui.")
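
The hidden `img2img_batch_source_type` textbox is a small gradio pattern worth noting: each tab's `select` event writes its identity into an invisible component, so the backend can tell which batch source is active without any new API surface. A self-contained sketch:

```python
import gradio as gr

with gr.Blocks() as demo:
    source_type = gr.Textbox(visible=False, value="upload")  # invisible state carrier
    with gr.Tabs():
        with gr.TabItem('Upload') as tab_upload:
            gr.Files(label="Files")
        with gr.TabItem('From directory') as tab_dir:
            gr.Textbox(label="Input directory")
    tab_upload.select(fn=lambda: "upload", inputs=[], outputs=[source_type])
    tab_dir.select(fn=lambda: "from dir", inputs=[], outputs=[source_type])
```
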
@@ -613,8 +622,8 @@ def create_ui():
with gr.Column(elem_id="img2img_column_size", scale=4):
selected_scale_tab = gr.Number(value=0, visible=False)
- with gr.Tabs():
- with gr.Tab(label="Resize to", elem_id="img2img_tab_resize_to") as tab_scale_to:
+ with gr.Tabs(elem_id="img2img_tabs_resize"):
+ with gr.Tab(label="Resize to", id="to", elem_id="img2img_tab_resize_to") as tab_scale_to:
with FormRow():
with gr.Column(elem_id="img2img_column_size", scale=4):
width = gr.Slider(minimum=64, maximum=2048, step=8, label="Width", value=512, elem_id="img2img_width")
@@ -623,7 +632,7 @@ def create_ui():
res_switch_btn = ToolButton(value=switch_values_symbol, elem_id="img2img_res_switch_btn", tooltip="Switch width/height")
detect_image_size_btn = ToolButton(value=detect_image_size_symbol, elem_id="img2img_detect_image_size_btn", tooltip="Auto detect size from img2img")
- with gr.Tab(label="Resize by", elem_id="img2img_tab_resize_by") as tab_scale_by:
+ with gr.Tab(label="Resize by", id="by", elem_id="img2img_tab_resize_by") as tab_scale_by:
scale_by = gr.Slider(minimum=0.05, maximum=4.0, step=0.05, label="Scale", value=1.0, elem_id="img2img_scale")
with FormRow():
@@ -759,6 +768,8 @@ def create_ui():
img2img_batch_use_png_info,
img2img_batch_png_info_props,
img2img_batch_png_info_dir,
+ img2img_batch_source_type,
+ img2img_batch_upload,
] + custom_inputs,
outputs=[
output_panel.gallery,
@@ -878,7 +889,7 @@ def create_ui():
))
image.change(
- fn=wrap_gradio_call(modules.extras.run_pnginfo),
+ fn=wrap_gradio_call_no_job(modules.extras.run_pnginfo),
inputs=[image],
outputs=[html, generation_info, html2],
)
diff --git a/modules/ui_common.py b/modules/ui_common.py
index 48992a3c1..395bb3b61 100644
--- a/modules/ui_common.py
+++ b/modules/ui_common.py
@@ -3,6 +3,7 @@ import dataclasses
import json
import html
import os
+from contextlib import nullcontext
import gradio as gr
@@ -103,14 +104,15 @@ def save_files(js_data, images, do_make_zip, index):
# NOTE: ensure csv integrity when fields are added by
# updating headers and padding with delimiters where needed
- if os.path.exists(logfile_path):
+ if shared.opts.save_write_log_csv and os.path.exists(logfile_path):
update_logfile(logfile_path, fields)
- with open(logfile_path, "a", encoding="utf8", newline='') as file:
- at_start = file.tell() == 0
- writer = csv.writer(file)
- if at_start:
- writer.writerow(fields)
+ with (open(logfile_path, "a", encoding="utf8", newline='') if shared.opts.save_write_log_csv else nullcontext()) as file:
+ if file:
+ at_start = file.tell() == 0
+ writer = csv.writer(file)
+ if at_start:
+ writer.writerow(fields)
for image_index, filedata in enumerate(images, start_index):
image = image_from_url_text(filedata)
@@ -130,7 +132,8 @@ def save_files(js_data, images, do_make_zip, index):
filenames.append(os.path.basename(txt_fullfn))
fullfns.append(txt_fullfn)
- writer.writerow([parsed_infotexts[0]['Prompt'], parsed_infotexts[0]['Seed'], data["width"], data["height"], data["sampler_name"], data["cfg_scale"], data["steps"], filenames[0], parsed_infotexts[0]['Negative prompt'], data["sd_model_name"], data["sd_model_hash"]])
+ if file:
+ writer.writerow([parsed_infotexts[0]['Prompt'], parsed_infotexts[0]['Seed'], data["width"], data["height"], data["sampler_name"], data["cfg_scale"], data["steps"], filenames[0], parsed_infotexts[0]['Negative prompt'], data["sd_model_name"], data["sd_model_hash"]])
# Make Zip
if do_make_zip:
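
The `nullcontext()` pattern above keeps a single code path whether or not CSV logging is enabled: when `save_write_log_csv` is off, the `with` statement yields `None` instead of a file handle and the body guards on `if file:`. Distilled:

```python
import csv
from contextlib import nullcontext

def append_rows(rows, logfile_path=None):
    # nullcontext() stands in for the file handle when logging is disabled
    with (open(logfile_path, 'a', encoding='utf8', newline='') if logfile_path else nullcontext()) as file:
        writer = csv.writer(file) if file else None
        for row in rows:
            if writer:
                writer.writerow(row)

append_rows([['a', 1], ['b', 2]])  # logging disabled: iterates, writes nothing
```
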
@@ -228,7 +231,7 @@ def create_output_panel(tabname, outdir, toprow=None):
)
save.click(
- fn=call_queue.wrap_gradio_call(save_files),
+ fn=call_queue.wrap_gradio_call_no_job(save_files),
_js="(x, y, z, w) => [x, y, false, selected_gallery_index()]",
inputs=[
res.generation_info,
@@ -244,7 +247,7 @@ def create_output_panel(tabname, outdir, toprow=None):
)
save_zip.click(
- fn=call_queue.wrap_gradio_call(save_files),
+ fn=call_queue.wrap_gradio_call_no_job(save_files),
_js="(x, y, z, w) => [x, y, true, selected_gallery_index()]",
inputs=[
res.generation_info,
diff --git a/modules/ui_extensions.py b/modules/ui_extensions.py
index d822c0b89..23aff7096 100644
--- a/modules/ui_extensions.py
+++ b/modules/ui_extensions.py
@@ -396,15 +396,15 @@ def install_extension_from_url(dirname, url, branch_name=None):
shutil.rmtree(tmpdir, True)
-def install_extension_from_index(url, hide_tags, sort_column, filter_text):
+def install_extension_from_index(url, selected_tags, showing_type, filtering_type, sort_column, filter_text):
ext_table, message = install_extension_from_url(None, url)
- code, _ = refresh_available_extensions_from_data(hide_tags, sort_column, filter_text)
+ code, _ = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text)
return code, ext_table, message, ''
-def refresh_available_extensions(url, hide_tags, sort_column):
+def refresh_available_extensions(url, selected_tags, showing_type, filtering_type, sort_column):
global available_extensions
import urllib.request
@@ -413,19 +413,19 @@ def refresh_available_extensions(url, hide_tags, sort_column):
available_extensions = json.loads(text)
- code, tags = refresh_available_extensions_from_data(hide_tags, sort_column)
+ code, tags = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column)
return url, code, gr.CheckboxGroup.update(choices=tags), '', ''
-def refresh_available_extensions_for_tags(hide_tags, sort_column, filter_text):
- code, _ = refresh_available_extensions_from_data(hide_tags, sort_column, filter_text)
+def refresh_available_extensions_for_tags(selected_tags, showing_type, filtering_type, sort_column, filter_text):
+ code, _ = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text)
return code, ''
-def search_extensions(filter_text, hide_tags, sort_column):
- code, _ = refresh_available_extensions_from_data(hide_tags, sort_column, filter_text)
+def search_extensions(filter_text, selected_tags, showing_type, filtering_type, sort_column):
+ code, _ = refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text)
return code, ''
@@ -450,13 +450,13 @@ def get_date(info: dict, key):
return ''
-def refresh_available_extensions_from_data(hide_tags, sort_column, filter_text=""):
+def refresh_available_extensions_from_data(selected_tags, showing_type, filtering_type, sort_column, filter_text=""):
extlist = available_extensions["extensions"]
installed_extensions = {extension.name for extension in extensions.extensions}
installed_extension_urls = {normalize_git_url(extension.remote) for extension in extensions.extensions if extension.remote is not None}
tags = available_extensions.get("tags", {})
- tags_to_hide = set(hide_tags)
+ selected_tags = set(selected_tags)
hidden = 0
code = f"""
@@ -489,9 +489,19 @@ def refresh_available_extensions_from_data(hide_tags, sort_column, filter_text="
existing = get_extension_dirname_from_url(url) in installed_extensions or normalize_git_url(url) in installed_extension_urls
extension_tags = extension_tags + ["installed"] if existing else extension_tags
- if any(x for x in extension_tags if x in tags_to_hide):
- hidden += 1
- continue
+ if len(selected_tags) > 0:
+ matched_tags = [x for x in extension_tags if x in selected_tags]
+ if filtering_type == 'or':
+ need_hide = len(matched_tags) > 0
+ else:
+ need_hide = len(matched_tags) == len(selected_tags)
+
+ if showing_type == 'show':
+ need_hide = not need_hide
+
+ if need_hide:
+ hidden += 1
+ continue
if filter_text and filter_text.strip():
if filter_text.lower() not in html.escape(name).lower() and filter_text.lower() not in html.escape(description).lower():
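
The reworked filter generalizes the old hide-list into four modes (hide/show crossed with or/and). A distilled version of the decision, reusing the hunk's variable names:

```python
def need_hide_extension(extension_tags, selected_tags, showing_type='hide', filtering_type='or'):
    if not selected_tags:
        return False
    matched_tags = [x for x in extension_tags if x in set(selected_tags)]
    if filtering_type == 'or':
        need_hide = len(matched_tags) > 0                     # any selected tag matches
    else:
        need_hide = len(matched_tags) == len(selected_tags)   # all selected tags match
    if showing_type == 'show':
        need_hide = not need_hide                             # selection becomes an allow-list
    return need_hide

assert need_hide_extension(['ads'], ['ads']) is True                   # hide/or: match is hidden
assert need_hide_extension(['script'], ['ads'], 'show', 'or') is True  # show/or: non-match is hidden
```
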
@@ -594,8 +604,12 @@ def create_ui():
install_extension_button = gr.Button(elem_id="install_extension_button", visible=False)
with gr.Row():
- hide_tags = gr.CheckboxGroup(value=["ads", "localization", "installed"], label="Hide extensions with tags", choices=["script", "ads", "localization", "installed"])
- sort_column = gr.Radio(value="newest first", label="Order", choices=["newest first", "oldest first", "a-z", "z-a", "internal order",'update time', 'create time', "stars"], type="index")
+ selected_tags = gr.CheckboxGroup(value=["ads", "localization", "installed"], label="Extension tags", choices=["script", "ads", "localization", "installed"], elem_classes=['compact-checkbox-group'])
+ sort_column = gr.Radio(value="newest first", label="Order", choices=["newest first", "oldest first", "a-z", "z-a", "internal order",'update time', 'create time', "stars"], type="index", elem_classes=['compact-checkbox-group'])
+
+ with gr.Row():
+ showing_type = gr.Radio(value="hide", label="Showing type", choices=["hide", "show"], elem_classes=['compact-checkbox-group'])
+ filtering_type = gr.Radio(value="or", label="Filtering type", choices=["or", "and"], elem_classes=['compact-checkbox-group'])
with gr.Row():
search_extensions_text = gr.Text(label="Search", container=False)
@@ -605,31 +619,43 @@ def create_ui():
refresh_available_extensions_button.click(
fn=modules.ui.wrap_gradio_call(refresh_available_extensions, extra_outputs=[gr.update(), gr.update(), gr.update(), gr.update()]),
- inputs=[available_extensions_index, hide_tags, sort_column],
- outputs=[available_extensions_index, available_extensions_table, hide_tags, search_extensions_text, install_result],
+ inputs=[available_extensions_index, selected_tags, showing_type, filtering_type, sort_column],
+ outputs=[available_extensions_index, available_extensions_table, selected_tags, search_extensions_text, install_result],
)
install_extension_button.click(
- fn=modules.ui.wrap_gradio_call(install_extension_from_index, extra_outputs=[gr.update(), gr.update()]),
- inputs=[extension_to_install, hide_tags, sort_column, search_extensions_text],
+ fn=modules.ui.wrap_gradio_call_no_job(install_extension_from_index, extra_outputs=[gr.update(), gr.update()]),
+ inputs=[extension_to_install, selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
outputs=[available_extensions_table, extensions_table, install_result],
)
search_extensions_text.change(
- fn=modules.ui.wrap_gradio_call(search_extensions, extra_outputs=[gr.update()]),
- inputs=[search_extensions_text, hide_tags, sort_column],
+ fn=modules.ui.wrap_gradio_call_no_job(search_extensions, extra_outputs=[gr.update()]),
+ inputs=[search_extensions_text, selected_tags, showing_type, filtering_type, sort_column],
outputs=[available_extensions_table, install_result],
)
- hide_tags.change(
- fn=modules.ui.wrap_gradio_call(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
- inputs=[hide_tags, sort_column, search_extensions_text],
+ selected_tags.change(
+ fn=modules.ui.wrap_gradio_call_no_job(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
+ inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
+ outputs=[available_extensions_table, install_result]
+ )
+
+ showing_type.change(
+ fn=modules.ui.wrap_gradio_call_no_job(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
+ inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
+ outputs=[available_extensions_table, install_result]
+ )
+
+ filtering_type.change(
+ fn=modules.ui.wrap_gradio_call_no_job(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
+ inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
outputs=[available_extensions_table, install_result]
)
sort_column.change(
- fn=modules.ui.wrap_gradio_call(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
- inputs=[hide_tags, sort_column, search_extensions_text],
+ fn=modules.ui.wrap_gradio_call_no_job(refresh_available_extensions_for_tags, extra_outputs=[gr.update()]),
+ inputs=[selected_tags, showing_type, filtering_type, sort_column, search_extensions_text],
outputs=[available_extensions_table, install_result]
)
@@ -641,7 +667,7 @@ def create_ui():
install_result = gr.HTML(elem_id="extension_install_result")
install_button.click(
- fn=modules.ui.wrap_gradio_call(lambda *args: [gr.update(), *install_extension_from_url(*args)], extra_outputs=[gr.update(), gr.update()]),
+ fn=modules.ui.wrap_gradio_call_no_job(lambda *args: [gr.update(), *install_extension_from_url(*args)], extra_outputs=[gr.update(), gr.update()]),
inputs=[install_dirname, install_url, install_branch],
outputs=[install_url, extensions_table, install_result],
)
diff --git a/modules/ui_extra_networks_user_metadata.py b/modules/ui_extra_networks_user_metadata.py
index fde093700..3a07db105 100644
--- a/modules/ui_extra_networks_user_metadata.py
+++ b/modules/ui_extra_networks_user_metadata.py
@@ -194,7 +194,7 @@ class UserMetadataEditor:
def setup_ui(self, gallery):
self.button_replace_preview.click(
fn=self.save_preview,
- _js="function(x, y, z){return [selected_gallery_index(), y, z]}",
+ _js=f"function(x, y, z){{return [selected_gallery_index_id('{self.tabname + '_gallery_container'}'), y, z]}}",
inputs=[self.edit_name_input, gallery, self.edit_name_input],
outputs=[self.html_preview, self.html_status]
).then(
diff --git a/modules/ui_gradio_extensions.py b/modules/ui_gradio_extensions.py
index f5278d22f..ed57c1e98 100644
--- a/modules/ui_gradio_extensions.py
+++ b/modules/ui_gradio_extensions.py
@@ -41,6 +41,11 @@ def css_html():
if os.path.exists(user_css):
head += stylesheet(user_css)
+ from modules.shared_gradio_themes import resolve_var
+ light = resolve_var('background_fill_primary')
+ dark = resolve_var('background_fill_primary_dark')
+ head += f'<style>html {{ background-color: {light}; }} @media (prefers-color-scheme: dark) {{ html {{ background-color: {dark}; }} }}</style>'
+
return head
@@ -50,7 +55,7 @@ def reload_javascript():
def template_response(*args, **kwargs):
res = shared.GradioTemplateResponseOriginal(*args, **kwargs)
- res.body = res.body.replace(b'', f'{js}'.encode("utf8"))
+ res.body = res.body.replace(b'', f'{js}'.encode("utf8"))
res.body = res.body.replace(b'