1. Added AdaCoor optimizer.
2. Added pyramid noise.
3. Fixed problem with log_writer missing from EveryDreamOptimizer.
alexds9 2023-11-17 16:08:43 +02:00
parent bf3c022489
commit 5dc9f18061
5 changed files with 83 additions and 6 deletions

@ -275,3 +275,13 @@ While the calculation makes sense in how it compensates for interval and total tr
--ema_strength_target 0.10 ^
If you use `ema_strength_target`, the calculated `ema_decay_rate` will be printed in your logs; pay attention to this value and use it to inform future EMA tuning decisions.
## AdaCoor optimizer
This optimizer was made by stripping the non-functional components out of CoordinateDoWG and applying several tweaks for high memory efficiency. It is a learning-rate-free adaptive optimizer whose only recommended parameter is an epsilon value of 1e-8. It does not scale well with high batch sizes, so batch sizes no greater than 8 are recommended unless slow and careful training is desired.
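As a minimal usage sketch, AdaCoor behaves like any other torch optimizer once constructed (in actual training it is selected by setting the optimizer name to `adacoor`, wired up in optimizer.py below); the model here is only a stand-in:

import torch
from optimizer.adacoor import AdaCoor

model = torch.nn.Linear(16, 4)                 # stand-in for the module being trained
opt = AdaCoor(model.parameters(), eps=1e-8)    # epsilon is the only recommended parameter

x, y = torch.randn(8, 16), torch.randn(8, 4)   # batch size of 8 or lower is recommended
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()
opt.step()                                     # learning-rate free: the internal lr is fixed at 1
opt.zero_grad()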
## Pyramid_Noise_Discount parameter
This is an implementation of pyramid noise as first introduced here: https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2
Pyramid noise can be used to improve dynamic range in short finetunes of fewer than 2000 steps when the discount is greater than 0.40. At all discount levels, pyramid noise appears to increase the amount of detail in generated images. However, it is not advised to use pyramid noise for a full training run: the noise affects the whole model rapidly and can destabilize it if trained for too many steps. A discount of 0 disables pyramid noise.
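For example, a short finetune could enable it by adding the new flag to the training command (the value shown is only illustrative):
--pyramid_noise_discount 0.45 ^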

optimizer/adacoor.py Normal file

@ -0,0 +1,37 @@
import torch


class AdaCoor(torch.optim.Optimizer):
    def __init__(self, params, eps=1e-8, *args, **kwargs):
        defaults = dict(epsilon=eps, lr=1)
        super(AdaCoor, self).__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            with torch.no_grad():
                # Initialize epsilon as a tensor
                epsilon = torch.tensor([group['epsilon']], dtype=torch.bfloat16, device=next(iter(group['params'])).device)

                for p in group['params']:
                    if p.grad is None:
                        continue

                    state = self.state[p]

                    # Initialize state variable for vt
                    if 'vt' not in state:
                        state['vt'] = torch.zeros_like(p.data, device=p.device).to(dtype=torch.bfloat16, device=p.device)

                    vt = state['vt']
                    vt.add_((epsilon * p.grad.data ** 2).to(dtype=torch.bfloat16, device=p.device))

                    gt_hat = (epsilon * p.grad.data).to(dtype=torch.float32, device=p.device)
                    denom = vt.sqrt().add_(group['epsilon']).to(dtype=p.dtype, device=p.device)
                    p.data.addcdiv_(gt_hat, denom, value=-group['lr'])

        return loss
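Read off the code above, each step accumulates `vt += epsilon * grad**2` into a bfloat16 buffer and applies `p -= (epsilon * grad) / (sqrt(vt) + epsilon)`, with `lr` fixed at 1 in the defaults; consequently the lr value passed when the optimizer is constructed in optimizer.py below has no effect, and `eps` is the only knob that matters.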

@ -355,6 +355,8 @@ class EveryDreamOptimizer():
            logging.warning(f"No LR setting found, defaulting to {default_lr}")

        if optimizer_name:
            optimizer_name = optimizer_name.lower()

            if optimizer_name == "lion":
                from lion_pytorch import Lion
                opt_class = Lion
@ -385,14 +387,14 @@ class EveryDreamOptimizer():
                    use_bias_correction=use_bias_correction,
                    growth_rate=growth_rate,
                    d0=d0,
                    safeguard_warmup=safeguard_warmup
                )
            elif optimizer_name == "adamw":
                opt_class = torch.optim.AdamW

            if "dowg" in optimizer_name:
                # coordinate_dowg, scalar_dowg require no additional parameters. Epsilon is overrideable but is unnecessary in all stable diffusion training situations.
                import dowg
                if optimizer_name == "coordinate_dowg":
                    opt_class = dowg.CoordinateDoWG
                elif optimizer_name == "scalar_dowg":
                    opt_class = dowg.ScalarDoWG
@ -453,6 +455,15 @@ class EveryDreamOptimizer():
                    log_every=args.log_step,
                    growth_rate=growth_rate,
                )
            elif optimizer_name == "adacoor":
                from optimizer.adacoor import AdaCoor
                opt_class = AdaCoor
                optimizer = opt_class(
                    itertools.chain(parameters),
                    lr=curr_lr,
                    eps=epsilon
                )

        if not optimizer:
            optimizer = opt_class(

@ -41,6 +41,7 @@
    "rated_dataset": false,
    "rated_dataset_target_dropout_percent": 50,
    "zero_frequency_noise_ratio": 0.02,
    "pyramid_noise_discount": null,
    "enable_zero_terminal_snr": false,
    "load_settings_every_epoch": false,
    "min_snr_gamma": null,

@ -261,6 +261,17 @@ def setup_local_logger(args):
# """
# optimizer.load_state_dict(torch.load(path))


def pyramid_noise_like(x, discount=0.8):
    b, c, w, h = x.shape  # EDIT: w and h get over-written, rename for a different variant!
    u = torch.nn.Upsample(size=(w, h), mode='bilinear')
    noise = torch.randn_like(x)
    for i in range(10):
        r = random.random() * 2 + 2  # Rather than always going 2x,
        w, h = max(1, int(w / (r ** i))), max(1, int(h / (r ** i)))
        noise += u(torch.randn(b, c, w, h).to(x)) * discount ** i
        if w == 1 or h == 1:
            break  # Lowest resolution is 1x1
    return noise / noise.std()  # Scaled back to roughly unit variance


def get_gpu_memory(nvsmi):
    """
    returns the gpu memory usage
@ -821,7 +832,8 @@ def main(args):
        optimizer_config,
        text_encoder,
        unet,
        epoch_len)
        epoch_len,
        log_writer)

    log_args(log_writer, args)
@ -922,13 +934,19 @@ def main(args):
                del pixel_values
                latents = latents[0].sample() * 0.18215

                noise = torch.randn_like(latents)

                if args.pyramid_noise_discount != None:
                    if 0 < args.pyramid_noise_discount:
                        noise = pyramid_noise_like(noise, discount=args.pyramid_noise_discount)

                if zero_frequency_noise_ratio != None:
                    if zero_frequency_noise_ratio < 0:
                        zero_frequency_noise_ratio = 0

                    # see https://www.crosslabs.org//blog/diffusion-with-offset-noise
                    zero_frequency_noise = zero_frequency_noise_ratio * torch.randn(latents.shape[0], latents.shape[1], 1, 1, device=latents.device)
                    noise = torch.randn_like(latents) + zero_frequency_noise
                    noise = noise + zero_frequency_noise

                bsz = latents.shape[0]
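Note that the last pair of lines above is the substance of this hunk: the offset (zero-frequency) noise is now added on top of the existing noise tensor, which may already carry pyramid noise, instead of being added to a freshly drawn torch.randn_like(latents) that would have discarded the pyramid contribution.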
@ -1376,7 +1394,7 @@ if __name__ == "__main__":
    argparser.add_argument("--ema_sample_nonema_model", action="store_true", default=False, help="Will show samples from non-EMA trained model, just like regular training. Can be used with: --ema_sample_ema_model")
    argparser.add_argument("--ema_sample_ema_model", action="store_true", default=False, help="Will show samples from EMA model. May be slower when using ema cpu offloading. Can be used with: --ema_sample_nonema_model")
    argparser.add_argument("--ema_resume_model", type=str, default=None, help="The EMA decay checkpoint to resume from for EMA decay, either a local .ckpt file, a converted Diffusers format folder, or a Huggingface.co repo id such as stabilityai/stable-diffusion-2-1-ema-decay")
    argparser.add_argument("--pyramid_noise_discount", type=float, default=None, help="Enables pyramid noise and uses the specified discount factor for it")

    # load CLI args to overwrite existing config args
    args = argparser.parse_args(args=argv, namespace=args)