dadapt stuff

This commit is contained in:
Victor Hall 2023-06-03 11:26:53 -04:00
parent 9ee2effacd
commit a96c6e2166
5 changed files with 104 additions and 3 deletions

View File

@ -1,5 +1,5 @@
"""
Copyright [2022] Victor C Hall
Copyright [2022-2023] Victor C Hall
Licensed under the GNU Affero General Public License;
You may not use this code except in compliance with the License.
@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from typing import List, Tuple
from typing import Tuple
"""
Notes:

View File

@ -45,7 +45,21 @@ Recommended settings for lion based on the paper are as follows:
"epsilon": 1e-8,
"weight_decay": 0.10
The recommendations are based on "1/10th LR" but "10x the weight decay" compared to AdamW when training diffusion models. There are no known recommendations for the CLIP text encoder. Lion converges quickly, so take care with the learning rate, and even lower learning rates may be effective.
The recommendations are based on "1/10th LR" but "10x the weight decay" compared to AdamW when training diffusion models. Lion converges quickly, so take care with the learning rate, and even lower learning rates may be effective.
There are no known recommendations for the CLIP text encoder. Using an even larger weight decay, increased epsilon, or an even lower LR may be effective for the text encoder. Further investigation into betas for the text encoder is needed as well.
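For concreteness, here is a minimal sketch of what those recommendations look like when constructing the optimizer by hand. The `lion-pytorch` package and the AdamW baseline values (`lr` 2e-6, `weight_decay` 0.01) are assumptions for illustration only; the trainer itself configures Lion through the optimizer JSON config.

```python
# Illustrative sketch only -- the trainer normally builds this from optimizer.json.
# Assumes the community `lion-pytorch` package; the AdamW baseline values are hypothetical.
import torch
from lion_pytorch import Lion

model = torch.nn.Linear(8, 8)  # stand-in for the real unet parameters

adamw_lr = 2e-6            # hypothetical AdamW baseline LR
adamw_weight_decay = 0.01  # hypothetical AdamW baseline weight decay

optimizer = Lion(
    model.parameters(),
    lr=adamw_lr / 10,                      # "1/10th LR" -> 2e-7
    weight_decay=adamw_weight_decay * 10,  # "10x the weight decay" -> 0.10
)
```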
#### D-Adaptation optimizers
[D-Adaptation](https://arxiv.org/abs/2301.07733) [versions](https://github.com/facebookresearch/dadaptation) of various optimizers. These require drastically different hyperparameters: early indications suggest an LR of 0.1 to 1.0 and a weight decay of 0.8 may work well. There is a `decouple` parameter that appears to need to be set to `true` for D-Adaptation to work, and it is defaulted to `true`. Another parameter, `d0`, is defaulted to 1e-6 as suggested by the paper authors and, according to them, does not need to be tuned, but it can be set explicitly. See `optimizer_dadapt.json` for an example of a fully configured `dadapt_adam` training run.
These are not memory efficient. You should use gradient checkpointing even with a 24GB GPU.
Available optimizer values for D-Adaptation are:
* dadapt_lion, dadapt_adam, dadapt_sgd
These are fairly experimental but have been tested as working. Gradient checkpointing may be required even on 24GB GPUs. Performance is slower than the compiled and optimized AdamW8bit optimizer unless you increase gradient accumulation, as the accumulation steps seem to process slowly with the current implementation of D-Adaptation.
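As a rough sketch of what those hyperparameters mean when building the optimizer directly (keyword names mirror the shipped `optimizer_dadapt.json`; check the `dadaptation` package for the exact constructor signature):

```python
# Illustrative sketch only -- the trainer builds this itself from optimizer_dadapt.json.
# Keyword names mirror the shipped config; verify against the dadaptation package docs.
import torch
import dadaptation

model = torch.nn.Linear(8, 8)  # stand-in for the real unet parameters

optimizer = dadaptation.DAdaptAdam(
    model.parameters(),
    lr=1e-1,            # D-Adaptation wants an LR around 0.1 to 1.0, not ~1e-6
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=0.80,  # far larger than typical AdamW weight decay
    decouple=True,      # suggested to be true (see the decouple note in optimizer_dadapt.json)
    d0=1e-6,            # scale of the initial steps; usually left at the default
)
```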
## Optimizer parameters

View File

@ -297,6 +297,29 @@ class EveryDreamOptimizer():
                )
            elif optimizer_name == "adamw":
                opt_class = torch.optim.AdamW
            elif optimizer_name in ["dadapt_adam", "dadapt_lion"]:
                import dadaptation

                # D-Adaptation expects an LR near 1.0 and a large weight decay,
                # so warn if the values look like typical AdamW settings
                if curr_lr < 1e-4:
                    logging.warning(f"{Fore.YELLOW} LR, {curr_lr}, is very low for Dadaptation. Consider reviewing Dadaptation documentation, but proceeding anyway.{Style.RESET_ALL}")
                if weight_decay < 1e-3:
                    logging.warning(f"{Fore.YELLOW} Weight decay, {weight_decay}, is very low for Dadaptation. Consider reviewing Dadaptation documentation, but proceeding anyway.{Style.RESET_ALL}")

                # dadapt_adam and dadapt_lion share the same construction path below
                if optimizer_name == "dadapt_adam":
                    opt_class = dadaptation.DAdaptAdam
                elif optimizer_name == "dadapt_lion":
                    opt_class = dadaptation.DAdaptLion

                optimizer = opt_class(
                    itertools.chain(parameters),
                    lr=curr_lr,
                    betas=(betas[0], betas[1]),
                    weight_decay=weight_decay,
                    eps=epsilon,
                )
            else:
                import bitsandbytes as bnb
                opt_class = bnb.optim.AdamW8bit

optimizer_dadapt.json Normal file
View File

@ -0,0 +1,53 @@
{
    "doc": {
        "base": "base optimizer configuration for unet and text encoder",
        "text_encoder_overrides": "text encoder config overrides",
        "text_encoder_lr_scale": "if LR not set on text encoder, sets the LR to a multiple of the base LR. for example, if base `lr` is 2e-6 and `text_encoder_lr_scale` is 0.5, the text encoder's LR will be set to `1e-6`.",
        "-----------------": "-----------------",
        "optimizer": "adamw, adamw8bit, lion, dadapt_adam, dadapt_lion",
        "optimizer_desc": "'adamw' in standard 32bit, 'adamw8bit' is bitsandbytes, 'lion' is EvoLved Sign Momentum, 'dadapt_...' are D-Adaptation methods",
        "lr": "learning rate, if null will use CLI or main JSON config value",
        "lr_scheduler": "'constant' or 'cosine'",
        "lr_warmup_steps": "number of steps to warmup LR to target LR, if null will use CLI or default a value based on max epochs",
        "lr_decay_steps": "number of steps to decay LR to zero for cosine, if null will use CLI or default a value based on max epochs",
        "betas": "exponential decay rates for the moment estimates",
        "epsilon": "value added to denominator for numerical stability, unused for lion, also used as d0 for dadaptation",
        "weight_decay": "weight decay (L2 penalty)",
        "d0": "for dadaptation only, scale of initial steps (def: 1e-6)",
        "decouple": "for dadapt_adam only, whether to decouple the learning rates of the two distributions, suggested true",
        "momentum": "for dadapt_sgd only, the momentum factor",
        "------------------": "-----------------",
        "freeze_embeddings": "whether to freeze the text embeddings",
        "freeze_front_n_layers": "if not null, freeze the front N layers of the text encoder (you can pass eg -2 to leave only the last 2 layers unfrozen)",
        "freeze_final_layer_norm": "whether to freeze the text encoder's final layer norm"
    },
    "base": {
        "optimizer": "dadapt_adam",
        "lr": 1e-1,
        "lr_scheduler": "constant",
        "lr_decay_steps": null,
        "lr_warmup_steps": null,
        "betas": [0.9, 0.999],
        "epsilon": 1e-8,
        "weight_decay": 0.80,
        "d0": 1e-6,
        "decouple": true
    },
    "text_encoder_overrides": {
        "optimizer": null,
        "lr": 1e-1,
        "lr_scheduler": null,
        "lr_decay_steps": null,
        "lr_warmup_steps": null,
        "betas": null,
        "epsilon": 1e-8,
        "weight_decay": 0.80,
        "d0": 1e-6,
        "decouple": true
    },
    "text_encoder_freezing": {
        "freeze_embeddings": true,
        "freeze_front_n_layers": -6,
        "freeze_final_layer_norm": false
    }
}
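The `text_encoder_lr_scale` behavior documented in the `doc` block above reduces to a simple derivation. A minimal sketch, with a hypothetical helper name rather than the trainer's actual code:

```python
# Minimal sketch of the text_encoder_lr_scale rule described in the "doc" block above.
# The helper name is hypothetical and not part of the trainer's code.
from typing import Optional

def resolve_text_encoder_lr(base_lr: float, te_lr: Optional[float], te_lr_scale: float) -> float:
    """If no LR is set for the text encoder, derive it from the base LR and the scale."""
    return te_lr if te_lr is not None else base_lr * te_lr_scale

# Example from the doc string: base lr 2e-6 with a 0.5 scale gives 1e-6.
assert resolve_text_encoder_lr(2e-6, None, 0.5) == 1e-6
```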

View File

@ -458,6 +458,8 @@ def main(args):
    if args.zero_frequency_noise_ratio == -1.0:
        from utils.unet_utils import enforce_zero_terminal_snr
        noise_scheduler.betas = enforce_zero_terminal_snr(noise_scheduler.betas)
        noise_scheduler.alphas = 1.0 - noise_scheduler.betas
        noise_scheduler.alphas_cumprod = torch.cumprod(noise_scheduler.alphas, dim=0)

    tokenizer = CLIPTokenizer.from_pretrained(model_root_folder, subfolder="tokenizer", use_fast=False)
@ -493,6 +495,15 @@ def main(args):
    else:
        text_encoder = text_encoder.to(device, dtype=torch.float32)

    try:
        # torch.compile returns the optimized module rather than compiling in place,
        # so keep the return values; on failure we fall through and train uncompiled
        unet = torch.compile(unet)
        text_encoder = torch.compile(text_encoder)
        vae = torch.compile(vae)
        logging.info("Successfully compiled models")
    except Exception as ex:
        logging.warning(f"Failed to compile model, continuing anyway, ex: {ex}")

    optimizer_config = None
    optimizer_config_path = args.optimizer_config if args.optimizer_config else "optimizer.json"
    if os.path.exists(os.path.join(os.curdir, optimizer_config_path)):