From eca442f7fb2049a65a13e89fbfe5077724e77b13 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Sat, 17 Jun 2023 18:54:06 +0200 Subject: [PATCH 1/2] simplify freezing text encoder layers config --- doc/ADVANCED_TWEAKING.md | 5 ++-- doc/OPTIMIZER.md | 18 +++----------- optimizer.json | 8 ++---- optimizer/optimizers.py | 54 ++++++++++++++++++++++++++++++---------- optimizerSD21.json | 6 ++--- 5 files changed, 51 insertions(+), 40 deletions(-) diff --git a/doc/ADVANCED_TWEAKING.md b/doc/ADVANCED_TWEAKING.md index b8afccc..f075eb4 100644 --- a/doc/ADVANCED_TWEAKING.md +++ b/doc/ADVANCED_TWEAKING.md @@ -197,6 +197,7 @@ The files will be in ```logs/[your project folder]/ep[N]_batch_schedule.txt``` a Clips the gradient normals to a maximum value. Default is None (no clipping). This is typically used for gradient explosion problems, which are generally not an issue with EveryDream and the grad scaler in AMP mode keeps this from being too much of an issue, but it may be worth experimenting with. - --clip_grad_norm 100000.0 ^ + --clip_grad_norm 1.0 ^ + +Default is no gradient normal clipping. There are also other ways to deal with gradient explosion, such as increasing optimizer epsilon. -Early indications seem to show high values such as 100000 may be helpful. Low values like 1.0 will drastically reduce training speed. Default is no gradient normal clipping. There are also other ways to deal with gradient explosion, such as increasing optimizer epsilon. 
\ No newline at end of file diff --git a/doc/OPTIMIZER.md b/doc/OPTIMIZER.md index 2efa76c..50084d3 100644 --- a/doc/OPTIMIZER.md +++ b/doc/OPTIMIZER.md @@ -75,25 +75,13 @@ If you're training SD2.1 you will likely experience great benefit from partially ``` "text_encoder_freezing": { - "freeze_embeddings": true, - "freeze_front_n_layers": -6, - "freeze_final_layer_norm": false + "unfreeze_last_n_layers": 2 } ``` -The SD2.1 text encoder is arranged as follows: +This will freeze the text encoder up to the last 2 layers, leaving the earlier layers and the embeddings intact. -``` -embeddings -> CLIP text encoder (23 layers) -> final layer norm -``` - -(The SD1.5 text encoder is similar but it has only 12 CLIP layers.) Typically you would apply freezing starting from the left and moving to the right, although it might be interesting to experiment with different freezing patterns. You can control this using the following parameters: - -* `freeze_embeddings` freezes the front 2 layers (the text embeddings - recommend). -* `freeze_front_n_layers` freezes the front N layers of the CLIP text encoder. You can also pass null to leave the CLIP layers unfrozen, or negative values to count from the back. In the example above, `-6` will freeze all but the last 6 layers. -* `freeze_final_layer_norm` freezes the parameters for the text encoder's final `LayerNorm` operation. - -Recommended settings for SD2.1 are provided in `optimizerSD21.json`: frozen embeddings, all CLIP layers frozen except for the last 6, final layer norm unfrozen. If you want to experiment, start by trying different values for `freeze_front_n_layers`: `-2` is slower but seems to produce a higher quality model, whereas `-10` is faster but can be more difficult to control. +Recommended settings for SD2.1 are provided in `optimizerSD21.json`. Unfreezing more layers will speed up training at the expense of text encoder stability. 
You can also try unfreezing the embeddings as well, by setting `"unfreeze_embeddings": true`. This may improve training, but it also seems to lead to quicker frying. ## General Beta, weight decay, epsilon, etc tuning diff --git a/optimizer.json b/optimizer.json index 2030c86..ecd0dea 100644 --- a/optimizer.json +++ b/optimizer.json @@ -14,9 +14,7 @@ "epsilon": "value added to denominator for numerical stability, unused for lion", "weight_decay": "weight decay (L2 penalty)", "------------------": "-----------------", - "freeze_embeddings": "whether to freeze the text embeddings", - "freeze_front_n_layers": "if not null, freeze the front N layers of the text encoder (you can pass eg -2 to leave only the last 2 layers unfrozen)", - "freeze_final_layer_norm": "whether to freeze the text encoder's final layer norm" + "unfreeze_last_n_layers": "if not null, freeze all parameters in the text encoder except for the last n layers and the final layer norm" }, "base": { "optimizer": "adamw8bit", @@ -39,8 +37,6 @@ "weight_decay": null }, "text_encoder_freezing": { - "freeze_embeddings": false, - "freeze_front_n_layers": null, - "freeze_final_layer_norm": false + "unfreeze_last_n_layers": null } } diff --git a/optimizer/optimizers.py b/optimizer/optimizers.py index 7b2f03c..6aa0a33 100644 --- a/optimizer/optimizers.py +++ b/optimizer/optimizers.py @@ -373,28 +373,56 @@ class EveryDreamOptimizer(): return optimizer def _apply_text_encoder_freeze(self, text_encoder) -> chain[Any]: - parameters = itertools.chain([]) + num_layers = len(text_encoder.text_model.encoder.layers) + unfreeze_embeddings = True + unfreeze_last_n_layers = None + unfreeze_final_layer_norm = True + if "freeze_front_n_layers" in self.te_freeze_config: + logging.warning( + ' * Found "freeze_front_n_layers" in JSON, please use "unfreeze_last_n_layers" instead') + freeze_front_n_layers = self.te_freeze_config["freeze_front_n_layers"] + if freeze_front_n_layers<0: + # eg -2 = freeze all but the last 2 + 
unfreeze_last_n_layers = -freeze_front_n_layers + else: + unfreeze_last_n_layers = num_layers - freeze_front_n_layers + if "unfreeze_last_n_layers" in self.te_freeze_config: + unfreeze_last_n_layers = self.te_freeze_config["unfreeze_last_n_layers"] - if self.te_freeze_config.get('freeze_embeddings', False): - # freeze embeddings - print(" ❄️ freezing embeddings") + if unfreeze_last_n_layers is None: + # nothing specified: default behaviour + unfreeze_last_n_layers = num_layers else: - parameters = itertools.chain(parameters, text_encoder.text_model.embeddings.parameters()) + # something specified: + assert(unfreeze_last_n_layers > 0) + if unfreeze_last_n_layers < num_layers: + # if we're unfreezing layers then by default we ought to freeze the embeddings + unfreeze_embeddings = False - freeze_front_n_layers = self.te_freeze_config.get('freeze_front_n_layers', None) - if freeze_front_n_layers is None: + if "freeze_embeddings" in self.te_freeze_config: + unfreeze_embeddings = not self.te_freeze_config["freeze_embeddings"] + if "freeze_final_layer_norm" in self.te_freeze_config: + unfreeze_final_layer_norm = not self.te_freeze_config["freeze_final_layer_norm"] + + parameters = itertools.chain([]) + if unfreeze_embeddings: + parameters = itertools.chain(parameters, text_encoder.text_model.embeddings.parameters()) + else: + print(" ❄️ freezing embeddings") + + if unfreeze_last_n_layers >= num_layers: parameters = itertools.chain(parameters, text_encoder.text_model.encoder.layers.parameters()) else: # freeze the specified CLIP text encoder layers layers = text_encoder.text_model.encoder.layers - print(f" ❄️ freezing text encoder layers 0-{len(layers[:freeze_front_n_layers])} of {len(layers)}") - parameters = itertools.chain(parameters, layers[freeze_front_n_layers:].parameters()) + first_layer_to_unfreeze = num_layers - unfreeze_last_n_layers + print(f" ❄️ freezing text encoder layers 1-{first_layer_to_unfreeze} out of {num_layers} layers total") + parameters = 
itertools.chain(parameters, layers[first_layer_to_unfreeze:].parameters()) - if self.te_freeze_config.get('freeze_final_layer_norm', False): - # instead of freezing the final layer norm parameters, we simply do not return them - print(" ❄️ freezing final layer norm") - else: + if unfreeze_final_layer_norm: parameters = itertools.chain(parameters, text_encoder.text_model.final_layer_norm.parameters()) + else: + print(" ❄️ freezing final layer norm") return parameters diff --git a/optimizerSD21.json b/optimizerSD21.json index e0a698e..5417b3a 100644 --- a/optimizerSD21.json +++ b/optimizerSD21.json @@ -20,7 +20,7 @@ }, "base": { "optimizer": "adamw8bit", - "lr": 1e-6, + "lr": 2e-6, "lr_scheduler": "constant", "lr_decay_steps": null, "lr_warmup_steps": null, @@ -39,8 +39,6 @@ "weight_decay": null }, "text_encoder_freezing": { - "freeze_embeddings": true, - "freeze_front_n_layers": -6, - "freeze_final_layer_norm": false + "unfreeze_last_n_layers": 2 } } From 888118ffff734b90a95bc1526d8947dba966f255 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Sat, 17 Jun 2023 18:56:25 +0200 Subject: [PATCH 2/2] correct docs --- doc/OPTIMIZER.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/OPTIMIZER.md b/doc/OPTIMIZER.md index 50084d3..d391f83 100644 --- a/doc/OPTIMIZER.md +++ b/doc/OPTIMIZER.md @@ -81,7 +81,7 @@ If you're training SD2.1 you will likely experience great benefit from partially This will freeze the text encoder up to the last 2 layers, leaving the earlier layers and the embeddings intact. -Recommended settings for SD2.1 are provided in `optimizerSD21.json`. Unfreezing more layers will speed up training at the expense of text encoder stability. You can also try unfreezing the embeddings as well, by setting `"unfreeze_embeddings": true`. This may improve training, but it also seems to lead to quicker frying. +Recommended settings for SD2.1 are provided in `optimizerSD21.json`. 
Unfreezing more layers will speed up training at the expense of text encoder stability. You can also try unfreezing the embeddings by setting `"freeze_embeddings": false`. This may improve training, but it also seems to lead to quicker frying. ## General Beta, weight decay, epsilon, etc tuning