dadapt stuff
This commit is contained in:
parent
9ee2effacd
commit
a96c6e2166
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
Copyright [2022] Victor C Hall
|
||||
Copyright [2022-2023] Victor C Hall
|
||||
|
||||
Licensed under the GNU Affero General Public License;
|
||||
You may not use this code except in compliance with the License.
|
||||
|
@ -13,7 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
from typing import List, Tuple
|
||||
from typing import Tuple
|
||||
|
||||
"""
|
||||
Notes:
|
||||
|
|
|
@ -45,7 +45,21 @@ Recommended settings for lion based on the paper are as follows:
|
|||
"epsilon": 1e-8,
|
||||
"weight_decay": 0.10
|
||||
|
||||
The recommendations are based on "1/10th LR" but "10x the weight decay" compared to AdamW when training diffusion models. There are no known recommendations for the CLIP text encoder. Lion converges quickly, so take care with the learning rate, and even lower learning rates may be effective.
|
||||
The recommendations are based on "1/10th LR" but "10x the weight decay" compared to AdamW when training diffusion models. Lion converges quickly, so take care with the learning rate, and even lower learning rates may be effective.
|
||||
|
||||
There are no known recommendations for the CLIP text encoder. Using an even larger weight decay, increased epsilon, or even lower LR may be effective for the text encoder. Further investigation on betas for text encoder is needed as well.
|
||||
|
||||
#### D-Adaptation optimizers
|
||||
|
||||
[Dadaptation](https://arxiv.org/abs/2301.07733) [version](https://github.com/facebookresearch/dadaptation) of various optimizers. These require drastically different hyperparameters. Early indications suggest an LR of 0.1 to 1.0 and a weight decay of 0.8 may work well for these. There is a `decouple` parameter that appears to need to be `true` for D-Adaptation to work; it defaults to `true`. Another parameter, `d0`, defaults to 1e-6 as suggested by the paper authors and, according to them, does not need to be tuned, but it can optionally be set. See `optimizer_dadapt.json` for an example of a fully configured `dadapt_adam` training.
|
||||
|
||||
These are not memory efficient. You should use gradient checkpointing even with 24GB GPU.
|
||||
|
||||
Available optimizer values for Dadaptation are:
|
||||
|
||||
* dadapt_lion, dadapt_adam, dadapt_sgd
|
||||
|
||||
These are fairly experimental but tested as working. Gradient checkpointing may be required even on 24GB GPUs. Performance is slower than the compiled and optimized AdamW8bit optimizer unless you increase gradient accumulation, as the accumulation steps appear to process slowly with the current implementation of D-Adaptation.
|
||||
|
||||
## Optimizer parameters
|
||||
|
||||
|
|
|
@ -297,6 +297,29 @@ class EveryDreamOptimizer():
|
|||
)
|
||||
elif optimizer_name == "adamw":
|
||||
opt_class = torch.optim.AdamW
|
||||
elif optimizer_name in ["dadapt_adam", "dadapt_lion"]:
|
||||
import dadaptation
|
||||
|
||||
if curr_lr < 1e-4:
|
||||
logging.warning(f"{Fore.YELLOW} LR, {curr_lr}, is very low for Dadaptation. Consider reviewing Dadaptation documentation, but proceeding anyway.{Style.RESET_ALL}")
|
||||
if weight_decay < 1e-3:
|
||||
logging.warning(f"{Fore.YELLOW} Weight decay, {weight_decay}, is very low for Dadaptation. Consider reviewing Dadaptation documentation, but proceeding anyway.{Style.RESET_ALL}")
|
||||
|
||||
if optimizer_name == "dadapt_adam":
|
||||
opt_class = dadaptation.DAdaptAdam
|
||||
elif optimizer_name == "dadapt_lion":
|
||||
opt_class = dadaptation.DAdaptLion
|
||||
|
||||
optimizer = opt_class(
|
||||
itertools.chain(parameters),
|
||||
lr=curr_lr,
|
||||
betas=(betas[0], betas[1]),
|
||||
weight_decay=weight_decay,
|
||||
eps=epsilon,
|
||||
)
|
||||
elif optimizer_name == "dadapt_lion":
|
||||
import dadaptation
|
||||
opt_class = dadaptation.DAdaptLion
|
||||
else:
|
||||
import bitsandbytes as bnb
|
||||
opt_class = bnb.optim.AdamW8bit
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
{
|
||||
"doc": {
|
||||
"base": "base optimizer configuration for unet and text encoder",
|
||||
"text_encoder_overrides": "text encoder config overrides",
|
||||
"text_encoder_lr_scale": "if LR not set on text encoder, sets the Lr to a multiple of the Base LR. for example, if base `lr` is 2e-6 and `text_encoder_lr_scale` is 0.5, the text encoder's LR will be set to `1e-6`.",
|
||||
"-----------------": "-----------------",
|
||||
"optimizer": "adamw, adamw8bit, lion, dadapt_adam, dadapt_lion",
|
||||
"optimizer_desc": "'adamw' in standard 32bit, 'adamw8bit' is bitsandbytes, 'lion' is EvoLved Sign Momentum, 'dadapt_...' are D-Adaptation methods",
|
||||
"lr": "learning rate, if null will use CLI or main JSON config value",
|
||||
"lr_scheduler": "'constant' or 'cosine'",
|
||||
"lr_warmup_steps": "number of steps to warmup LR to target LR, if null will use CLI or default a value based on max epochs",
|
||||
"lr_decay_steps": "number of steps to decay LR to zero for cosine, if null will use CLI or default a value based on max epochs",
|
||||
"betas": "exponential decay rates for the moment estimates",
|
||||
"epsilon": "value added to denominator for numerical stability, unused for lion, also used as d0 for dadaptation",
|
||||
"weight_decay": "weight decay (L2 penalty)",
|
||||
"d0": "for dadaptation only, scale of initial steps (def: 1e-6)",
|
||||
"decouple": "for dadapt_adam only, whether to use AdamW-style decoupled weight decay, suggested true",
|
||||
"momentum": "for dadapt_sgd only, the momentum factor",
|
||||
"------------------": "-----------------",
|
||||
"freeze_embeddings": "whether to freeze the text embeddings",
|
||||
"freeze_front_n_layers": "if not null, freeze the front N layers of the text encoder (you can pass eg -2 to leave only the last 2 layers unfrozen)",
|
||||
"freeze_final_layer_norm": "whether to freeze the text encoder's final layer norm"
|
||||
},
|
||||
"base": {
|
||||
"optimizer": "dadapt_adam",
|
||||
"lr": 1e-1,
|
||||
"lr_scheduler": "constant",
|
||||
"lr_decay_steps": null,
|
||||
"lr_warmup_steps": null,
|
||||
"betas": [0.9, 0.999],
|
||||
"epsilon": 1e-8,
|
||||
"weight_decay": 0.80,
|
||||
"d0": 1e-6,
|
||||
"decouple": true
|
||||
},
|
||||
"text_encoder_overrides": {
|
||||
"optimizer": null,
|
||||
"lr": 1e-1,
|
||||
"lr_scheduler": null,
|
||||
"lr_decay_steps": null,
|
||||
"lr_warmup_steps": null,
|
||||
"betas": null,
|
||||
"epsilon": 1e-8,
|
||||
"weight_decay": 0.80,
|
||||
"d0": 1e-6,
|
||||
"decouple": true
|
||||
},
|
||||
"text_encoder_freezing": {
|
||||
"freeze_embeddings": true,
|
||||
"freeze_front_n_layers": -6,
|
||||
"freeze_final_layer_norm": false
|
||||
}
|
||||
}
|
11
train.py
11
train.py
|
@ -458,6 +458,8 @@ def main(args):
|
|||
if args.zero_frequency_noise_ratio == -1.0:
|
||||
from utils.unet_utils import enforce_zero_terminal_snr
|
||||
noise_scheduler.betas = enforce_zero_terminal_snr(noise_scheduler.betas)
|
||||
noise_scheduler.alphas = 1.0 - noise_scheduler.betas
|
||||
noise_scheduler.alphas_cumprod = torch.cumprod(noise_scheduler.alphas, dim=0)
|
||||
|
||||
tokenizer = CLIPTokenizer.from_pretrained(model_root_folder, subfolder="tokenizer", use_fast=False)
|
||||
|
||||
|
@ -493,6 +495,15 @@ def main(args):
|
|||
else:
|
||||
text_encoder = text_encoder.to(device, dtype=torch.float32)
|
||||
|
||||
try:
|
||||
torch.compile(unet)
|
||||
torch.compile(text_encoder)
|
||||
torch.compile(vae)
|
||||
logging.info("Successfully compiled models")
|
||||
except Exception as ex:
|
||||
logging.warning(f"Failed to compile model, continuing anyway, ex: {ex}")
|
||||
pass
|
||||
|
||||
optimizer_config = None
|
||||
optimizer_config_path = args.optimizer_config if args.optimizer_config else "optimizer.json"
|
||||
if os.path.exists(os.path.join(os.curdir, optimizer_config_path)):
|
||||
|
|
Loading…
Reference in New Issue