changes for inpainting for #35

support for --medvram
attempt to support share
AUTOMATIC 2022-09-01 11:41:42 +03:00
parent 3e4103541c
commit e1648fc1d1
2 changed files with 76 additions and 53 deletions


@@ -71,10 +71,10 @@ Run the command to start web ui:
 python stable-diffusion-webui/webui.py
 ```
-If you have a 4GB video card, run the command with `--lowvram` argument:
+If you have a 4GB video card, run the command with either the `--lowvram` or the `--medvram` argument:
 ```
-python stable-diffusion-webui/webui.py --lowvram
+python stable-diffusion-webui/webui.py --medvram
 ```
 After a while, you will get a message like this:
@@ -280,17 +280,18 @@ print("Seed was: " + str(processed.seed))
 display(processed.images, processed.seed, processed.info)
 ```
 
-### `--lowvram`
+### 4GB videocard support
 Optimizations for GPUs with low VRAM. This should make it possible to generate 512x512 images on videocards with 4GB memory.
-The original idea of those optimizations is by basujindal: https://github.com/basujindal/stable-diffusion. Model is separated into modules,
-and only one module is kept in GPU memory; when another module needs to run, the previous is removed from GPU memory.
-
-It should be obvious but the nature of those optimizations makes the processing run slower -- about 10 times slower
+
+`--lowvram` is a reimplementation of the optimization idea by [basujindal](https://github.com/basujindal/stable-diffusion).
+The model is separated into modules, and only one module is kept in GPU memory; when another module needs to run, the previous
+is removed from GPU memory. The nature of this optimization makes processing run slower -- about 10 times slower
 compared to normal operation on my RTX 3090.
 
-This is an independent implementation that does not require any modification to original Stable Diffusion code, and
-with all code concenrated in one place rather than scattered around the program.
+`--medvram` is another optimization that should reduce VRAM usage significantly by not processing conditional and
+unconditional denoising in the same batch.
+
+This implementation of the optimization does not require any modification to the original Stable Diffusion code.
 
 ### Inpainting
 In img2img tab, draw a mask over a part of the image, and that part will be in-painted.
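Editor's note: to make the `--medvram` trade-off described above concrete, here is a minimal, hypothetical sketch (not the webui code; `toy_model`, `denoise_batched`, and `denoise_split` are made-up names, and the convolution is a placeholder for the real denoiser) of the difference between batching conditional and unconditional denoising and running them as two separate passes. The real change lands in `CFGDenoiser.forward` further down in this commit.

```python
import torch
import torch.nn as nn

conv = nn.Conv2d(4, 4, 3, padding=1)

def toy_model(x, cond):
    # stand-in for the real denoiser: conditioning just gets added in
    return conv(x) + cond

x = torch.randn(2, 4, 64, 64)        # latents being denoised
cond = torch.randn(2, 4, 64, 64)     # placeholder conditioning
uncond = torch.zeros(2, 4, 64, 64)   # placeholder unconditional conditioning
cond_scale = 7.5

def denoise_batched(x, cond, uncond):
    # one pass over a doubled batch: faster, but peak activation memory is roughly doubled
    x_in = torch.cat([x] * 2)
    cond_in = torch.cat([uncond, cond])
    out_uncond, out_cond = toy_model(x_in, cond_in).chunk(2)
    return out_uncond + (out_cond - out_uncond) * cond_scale

def denoise_split(x, cond, uncond):
    # two passes at the original batch size: slower, but lower peak memory
    out_uncond = toy_model(x, uncond)
    out_cond = toy_model(x, cond)
    return out_uncond + (out_cond - out_uncond) * cond_scale

# both paths produce (numerically) the same guided result
assert torch.allclose(denoise_batched(x, cond, uncond), denoise_split(x, cond, uncond), atol=1e-5)
```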

webui.py

@@ -6,7 +6,10 @@ script_path = os.path.dirname(os.path.realpath(__file__))
 sd_path = os.path.dirname(script_path)
 
 # add parent directory to path; this is where Stable diffusion repo should be
-path_dirs = [(sd_path, 'ldm', 'Stable Diffusion'), ('../../taming-transformers', 'taming', 'Taming Transformers')]
+path_dirs = [
+    (sd_path, 'ldm', 'Stable Diffusion'),
+    ('../../taming-transformers', 'taming', 'Taming Transformers')
+]
 
 for d, must_exist, what in path_dirs:
     must_exist_path = os.path.abspath(os.path.join(script_path, d, must_exist))
     if not os.path.exists(must_exist_path):
@@ -38,15 +41,10 @@ from ldm.util import instantiate_from_config
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.models.diffusion.plms import PLMSSampler
 
-# fix gradio phoning home
-gradio.utils.version_check = lambda: None
-gradio.utils.get_local_ip_address = lambda: '127.0.0.1'
-
 # this is a fix for Windows users. Without it, javascript files will be served with text/html content-type and the browser will not show any UI
 mimetypes.init()
 mimetypes.add_type('application/javascript', '.js')
 
 # some of those options should not be changed at all because they would break the model, so I removed them from options.
 opt_C = 4
 opt_f = 8
@@ -65,14 +63,21 @@ parser.add_argument("--no-progressbar-hiding", action='store_true', help="do not
 parser.add_argument("--max-batch-count", type=int, default=16, help="maximum batch count value for the UI")
 parser.add_argument("--embeddings-dir", type=str, default='embeddings', help="embeddings directory for textual inversion (default: embeddings)")
 parser.add_argument("--allow-code", action='store_true', help="allow custom script execution from webui")
-parser.add_argument("--lowvram", action='store_true', help="enamble stable diffusion model optimizations for low vram")
+parser.add_argument("--medvram", action='store_true', help="enable stable diffusion model optimizations, sacrificing a little speed for low VRAM usage")
+parser.add_argument("--lowvram", action='store_true', help="enable stable diffusion model optimizations, sacrificing a lot of speed for very low VRAM usage")
 parser.add_argument("--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast")
+parser.add_argument("--share", action='store_true', help="use share=True for gradio and make the UI accessible through their site (doesn't work for me but you might have better luck)")
 
 cmd_opts = parser.parse_args()
 
 cpu = torch.device("cpu")
 gpu = torch.device("cuda")
 device = gpu if torch.cuda.is_available() else cpu
+batch_cond_uncond = not (cmd_opts.lowvram or cmd_opts.medvram)
+
+if not cmd_opts.share:
+    # fix gradio phoning home
+    gradio.utils.version_check = lambda: None
+    gradio.utils.get_local_ip_address = lambda: '127.0.0.1'
 
 css_hide_progressbar = """
 .wrap .m-12 svg { display:none!important; }
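Editor's note: the phone-home patch is now applied only when `--share` is off, presumably because gradio's own networking helpers are needed to set up the public tunnel. As a minimal standalone illustration of what the flag ultimately does (this is plain gradio API, not the webui interface; `echo` is a made-up placeholder function):

```python
import gradio as gr

def echo(text):
    # trivial placeholder standing in for the real UI callbacks
    return text

demo = gr.Interface(fn=echo, inputs="text", outputs="text")

# share=True asks gradio to create a temporary public URL via its servers;
# with share=False the app is served only on localhost.
demo.launch(share=False)
```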
@@ -294,21 +299,25 @@ def setup_for_low_vram(sd_model):
     sd_model.first_stage_model.decode = lambda z, de=sd_model.first_stage_model.decode: first_stage_model_decode_wrap(sd_model.first_stage_model, de, z)
     parents[sd_model.cond_stage_model.transformer] = sd_model.cond_stage_model
 
-    # the third remaining model is still too big for 4GB, so we also do the same for its submodules
-    # so that only one of them is in GPU at a time
-    diff_model = sd_model.model.diffusion_model
-    stored = diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed
-    diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = None, None, None, None
-    sd_model.model.to(device)
-    diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = stored
-
-    # install hooks for bits of third model
-    diff_model.time_embed.register_forward_pre_hook(send_me_to_gpu)
-    for block in diff_model.input_blocks:
-        block.register_forward_pre_hook(send_me_to_gpu)
-    diff_model.middle_block.register_forward_pre_hook(send_me_to_gpu)
-    for block in diff_model.output_blocks:
-        block.register_forward_pre_hook(send_me_to_gpu)
+    if cmd_opts.medvram:
+        sd_model.model.register_forward_pre_hook(send_me_to_gpu)
+    else:
+        diff_model = sd_model.model.diffusion_model
+
+        # the third remaining model is still too big for 4GB, so we also do the same for its submodules
+        # so that only one of them is in GPU at a time
+        stored = diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed
+        diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = None, None, None, None
+        sd_model.model.to(device)
+        diff_model.input_blocks, diff_model.middle_block, diff_model.output_blocks, diff_model.time_embed = stored
+
+        # install hooks for bits of third model
+        diff_model.time_embed.register_forward_pre_hook(send_me_to_gpu)
+        for block in diff_model.input_blocks:
+            block.register_forward_pre_hook(send_me_to_gpu)
+        diff_model.middle_block.register_forward_pre_hook(send_me_to_gpu)
+        for block in diff_model.output_blocks:
+            block.register_forward_pre_hook(send_me_to_gpu)
 
 
 def create_random_tensors(shape, seeds):
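Editor's note: `send_me_to_gpu` is defined earlier in `setup_for_low_vram` and is not part of this hunk. The sketch below is a hypothetical reconstruction of the general pattern it relies on, not the exact webui implementation (the toy `stages` list and the global bookkeeping variable are illustrative): a forward pre-hook moves the module that is about to run onto the GPU and evicts whichever module was resident before it.

```python
import torch
import torch.nn as nn

cpu = torch.device("cpu")
gpu = torch.device("cuda") if torch.cuda.is_available() else cpu

module_in_gpu = None  # which module currently lives on the GPU

def send_me_to_gpu(module, _inputs):
    # forward pre-hook: called right before `module` runs
    global module_in_gpu
    if module is module_in_gpu:
        return
    if module_in_gpu is not None:
        module_in_gpu.to(cpu)   # evict the previously loaded module
    module.to(gpu)              # load the one that is about to run
    module_in_gpu = module

# toy submodules standing in for cond_stage_model / first_stage_model / diffusion model
stages = nn.ModuleList([nn.Linear(16, 16) for _ in range(3)])
for stage in stages:
    stage.register_forward_pre_hook(send_me_to_gpu)

x = torch.randn(1, 16).to(gpu)
for stage in stages:
    x = stage(x)                # only one stage is resident on the GPU at a time
```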
@@ -860,7 +869,7 @@ class VanillaStableDiffusionSampler:
     def sample_img2img(self, p, x, noise, conditioning, unconditional_conditioning):
         t_enc = int(min(p.denoising_strength, 0.999) * p.steps)
 
-        # existing code fail with cetin step counts, like 9
+        # existing code fails with certain step counts, like 9
         try:
             self.sampler.make_schedule(ddim_num_steps=p.steps, verbose=False)
         except Exception:
@@ -887,13 +896,26 @@ class CFGDenoiser(nn.Module):
     def __init__(self, model):
         super().__init__()
         self.inner_model = model
+        self.mask = None
+        self.nmask = None
+        self.init_latent = None
 
     def forward(self, x, sigma, uncond, cond, cond_scale):
-        x_in = torch.cat([x] * 2)
-        sigma_in = torch.cat([sigma] * 2)
-        cond_in = torch.cat([uncond, cond])
-        uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
-        return uncond + (cond - uncond) * cond_scale
+        if batch_cond_uncond:
+            x_in = torch.cat([x] * 2)
+            sigma_in = torch.cat([sigma] * 2)
+            cond_in = torch.cat([uncond, cond])
+            uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
+            denoised = uncond + (cond - uncond) * cond_scale
+        else:
+            uncond = self.inner_model(x, sigma, cond=uncond)
+            cond = self.inner_model(x, sigma, cond=cond)
+            denoised = uncond + (cond - uncond) * cond_scale
+
+        if self.mask is not None:
+            denoised = self.init_latent * self.mask + self.nmask * denoised
+
+        return denoised
 
 
 class KDiffusionSampler:
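Editor's note: the new `mask`/`nmask` blend at the end of `forward` is what anchors the un-painted region to the source image; the sampler (next hunk) now just sets these attributes on the wrapper instead of passing a per-step callback as before. After every denoising step, latents under `mask` are reset to `init_latent`, and only the `nmask` region keeps the freshly denoised values. A toy illustration of the arithmetic with made-up shapes:

```python
import torch

init_latent = torch.zeros(1, 4, 8, 8)   # stand-in for the encoded source image
denoised = torch.ones(1, 4, 8, 8)       # stand-in for the sampler's current estimate

mask = torch.zeros(1, 1, 8, 8)          # 1 = keep original, 0 = region being inpainted
mask[..., :4] = 1.0                     # keep the left half
nmask = 1.0 - mask

blended = init_latent * mask + nmask * denoised

print(blended[0, 0, 0, :4])   # left half: values from init_latent (0.0)
print(blended[0, 0, 0, 4:])   # right half: values from the denoised estimate (1.0)
```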
@@ -910,19 +932,13 @@ class KDiffusionSampler:
         xi = x + noise
 
-        if p.mask is not None:
-            if p.inpainting_fill == 2:
-                xi = xi * p.mask + noise * p.nmask
-            elif p.inpainting_fill == 3:
-                xi = xi * p.mask
-
         sigma_sched = sigmas[p.steps - t_enc - 1:]
 
-        def mask_cb(v):
-            v["denoised"][:] = v["denoised"][:] * p.nmask + p.init_latent * p.mask
+        self.model_wrap_cfg.mask = p.mask
+        self.model_wrap_cfg.nmask = p.nmask
+        self.model_wrap_cfg.init_latent = p.init_latent
 
-        return self.func(self.model_wrap_cfg, xi, sigma_sched, extra_args={'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': p.cfg_scale}, disable=False, callback=mask_cb if p.mask is not None else None)
+        return self.func(self.model_wrap_cfg, xi, sigma_sched, extra_args={'cond': conditioning, 'uncond': unconditional_conditioning, 'cond_scale': p.cfg_scale}, disable=False)
 
     def sample(self, p: StableDiffusionProcessing, x, conditioning, unconditional_conditioning):
         sigmas = self.model_wrap.get_sigmas(p.steps)
@@ -932,7 +948,7 @@ class KDiffusionSampler:
         return samples_ddim
 
 
-Processed = namedtuple('Processed', ['images','seed', 'info'])
+Processed = namedtuple('Processed', ['images', 'seed', 'info'])
 
 
 def process_images(p: StableDiffusionProcessing) -> Processed:
@@ -1315,7 +1331,6 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
             if self.mask_blur > 0:
                 self.image_mask = self.image_mask.filter(ImageFilter.GaussianBlur(self.mask_blur)).convert('L')
             if self.inpaint_full_res:
                 self.mask_for_overlay = self.image_mask
                 mask = self.image_mask.convert('L')
@@ -1383,6 +1398,13 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
             self.nmask = torch.asarray(latmask).to(device).type(sd_model.dtype)
 
     def sample(self, x, conditioning, unconditional_conditioning):
+        if self.mask is not None:
+            if self.inpainting_fill == 2:
+                x = x * self.mask + create_random_tensors(x.shape[1:], [self.seed + i + 1 for i in range(x.shape[0])]) * self.nmask
+            elif self.inpainting_fill == 3:
+                x = x * self.mask
+
         samples = self.sampler.sample_img2img(self, self.init_latent, x, conditioning, unconditional_conditioning)
 
         if self.mask is not None:
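Editor's note: the relocated `inpainting_fill` branch seeds the masked-out region before sampling even starts: mode 2 replaces it with fresh random noise (one seed per image in the batch), mode 3 zeroes it out. A hypothetical toy version of the same arithmetic, with webui's `create_random_tensors` helper replaced by plain `torch.randn_like` and the mode value hard-coded:

```python
import torch

torch.manual_seed(0)

x = torch.randn(2, 4, 8, 8)       # initial latents, one per image in the batch
mask = torch.zeros(1, 1, 8, 8)    # 1 = keep, 0 = region to be inpainted
mask[..., :4] = 1.0
nmask = 1.0 - mask

inpainting_fill = 2               # hypothetical setting, mirroring the diff

if inpainting_fill == 2:
    # fill the inpainted region with fresh noise instead of the source latent
    x = x * mask + torch.randn_like(x) * nmask
elif inpainting_fill == 3:
    # fill the inpainted region with zeros ("latent nothing")
    x = x * mask

print(x.shape)
```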
@@ -1805,10 +1827,10 @@ sd_config = OmegaConf.load(cmd_opts.config)
 sd_model = load_model_from_config(sd_config, cmd_opts.ckpt)
 sd_model = (sd_model if cmd_opts.no_half else sd_model.half())
 
-if not cmd_opts.lowvram:
-    sd_model = sd_model.to(device)
-else:
+if cmd_opts.lowvram or cmd_opts.medvram:
     setup_for_low_vram(sd_model)
+else:
+    sd_model = sd_model.to(device)
 
 model_hijack = StableDiffusionModelHijack()
 model_hijack.hijack(sd_model)
@@ -1855,5 +1877,5 @@ def inject_gradio_html(javascript):
 inject_gradio_html(javascript)
 
 demo.queue(concurrency_count=1)
-demo.launch()
+demo.launch(share=cmd_opts.share)