Merge pull request #197 from damian0815/fix_simplify_freezing_text_encoder_layers

simplify freezing text encoder layers config
This commit is contained in:
Victor Hall 2023-06-18 00:54:01 -04:00 committed by GitHub
commit e097757ff6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 51 additions and 40 deletions

View File

@ -197,6 +197,7 @@ The files will be in ```logs/[your project folder]/ep[N]_batch_schedule.txt``` a
Clips the gradient norm to a maximum value. Default is None (no clipping). Clipping is typically used to combat gradient explosion, which is generally not a problem with EveryDream because the grad scaler in AMP mode keeps it in check, but it may be worth experimenting with.
--clip_grad_norm 100000.0 ^
--clip_grad_norm 1.0 ^
Default is no gradient norm clipping. There are also other ways to deal with gradient explosion, such as increasing optimizer epsilon.
Early indications seem to show that high values such as 100000 may be helpful. Low values like 1.0 will drastically reduce training speed. Default is no gradient norm clipping. There are also other ways to deal with gradient explosion, such as increasing optimizer epsilon.

View File

@ -75,25 +75,13 @@ If you're training SD2.1 you will likely experience great benefit from partially
```
"text_encoder_freezing": {
"freeze_embeddings": true,
"freeze_front_n_layers": -6,
"freeze_final_layer_norm": false
"unfreeze_last_n_layers": 2
}
```
The SD2.1 text encoder is arranged as follows:
This will freeze the embeddings and every text encoder layer except the last 2, so that only those last 2 layers (and the final layer norm) are trained.
```
embeddings -> CLIP text encoder (23 layers) -> final layer norm
```
(The SD1.5 text encoder is similar but it has only 12 CLIP layers.) Typically you would apply freezing starting from the left and moving to the right, although it might be interesting to experiment with different freezing patterns. You can control this using the following parameters:
* `freeze_embeddings` freezes the front 2 layers (the text embeddings - recommended).
* `freeze_front_n_layers` freezes the front N layers of the CLIP text encoder. You can also pass null to leave the CLIP layers unfrozen, or negative values to count from the back. In the example above, `-6` will freeze all but the last 6 layers.
* `freeze_final_layer_norm` freezes the parameters for the text encoder's final `LayerNorm` operation.
Recommended settings for SD2.1 are provided in `optimizerSD21.json`: frozen embeddings, all CLIP layers frozen except for the last 6, final layer norm unfrozen. If you want to experiment, start by trying different values for `freeze_front_n_layers`: `-2` is slower but seems to produce a higher quality model, whereas `-10` is faster but can be more difficult to control.
Recommended settings for SD2.1 are provided in `optimizerSD21.json`. Unfreezing more layers will speed up training at the expense of text encoder stability. You can also try unfreezing the embeddings by setting `"freeze_embeddings": false`. This may improve training, but it also seems to lead to quicker frying.
## General Beta, weight decay, epsilon, etc tuning

View File

@ -14,9 +14,7 @@
"epsilon": "value added to denominator for numerical stability, unused for lion",
"weight_decay": "weight decay (L2 penalty)",
"------------------": "-----------------",
"freeze_embeddings": "whether to freeze the text embeddings",
"freeze_front_n_layers": "if not null, freeze the front N layers of the text encoder (you can pass eg -2 to leave only the last 2 layers unfrozen)",
"freeze_final_layer_norm": "whether to freeze the text encoder's final layer norm"
"unfreeze_last_n_layers": "if not null, freeze all parameters in the text encoder except for the last n layers and the final layer norm"
},
"base": {
"optimizer": "adamw8bit",
@ -39,9 +37,7 @@
"weight_decay": null
},
"text_encoder_freezing": {
"freeze_embeddings": false,
"freeze_front_n_layers": null,
"freeze_final_layer_norm": false
"unfreeze_last_n_layers": null
},
"apply_grad_scaler_step_tweaks": true
}

View File

@ -375,28 +375,56 @@ class EveryDreamOptimizer():
return optimizer
def _apply_text_encoder_freeze(self, text_encoder) -> chain[Any]:
parameters = itertools.chain([])
if self.te_freeze_config.get('freeze_embeddings', False):
# freeze embeddings
print(" ❄️ freezing embeddings")
num_layers = len(text_encoder.text_model.encoder.layers)
unfreeze_embeddings = True
unfreeze_last_n_layers = None
unfreeze_final_layer_norm = True
if "freeze_front_n_layers" in self.te_freeze_config:
logging.warning(
' * Found "freeze_front_n_layers" in JSON, please use "unfreeze_last_n_layers" instead')
freeze_front_n_layers = self.te_freeze_config["freeze_front_n_layers"]
if freeze_front_n_layers<0:
# eg -2 = freeze all but the last 2
unfreeze_last_n_layers = -freeze_front_n_layers
else:
parameters = itertools.chain(parameters, text_encoder.text_model.embeddings.parameters())
unfreeze_last_n_layers = num_layers - freeze_front_n_layers
if "unfreeze_last_n_layers" in self.te_freeze_config:
unfreeze_last_n_layers = self.te_freeze_config["unfreeze_last_n_layers"]
freeze_front_n_layers = self.te_freeze_config.get('freeze_front_n_layers', None)
if freeze_front_n_layers is None:
if unfreeze_last_n_layers is None:
# nothing specified: default behaviour
unfreeze_last_n_layers = num_layers
else:
# something specified:
assert(unfreeze_last_n_layers > 0)
if unfreeze_last_n_layers < num_layers:
# if we're unfreezing layers then by default we ought to freeze the embeddings
unfreeze_embeddings = False
if "freeze_embeddings" in self.te_freeze_config:
unfreeze_embeddings = not self.te_freeze_config["freeze_embeddings"]
if "freeze_final_layer_norm" in self.te_freeze_config:
unfreeze_final_layer_norm = not self.te_freeze_config["freeze_final_layer_norm"]
parameters = itertools.chain([])
if unfreeze_embeddings:
parameters = itertools.chain(parameters, text_encoder.text_model.embeddings.parameters())
else:
print(" ❄️ freezing embeddings")
if unfreeze_last_n_layers >= num_layers:
parameters = itertools.chain(parameters, text_encoder.text_model.encoder.layers.parameters())
else:
# freeze the specified CLIP text encoder layers
layers = text_encoder.text_model.encoder.layers
print(f" ❄️ freezing text encoder layers 0-{len(layers[:freeze_front_n_layers])} of {len(layers)}")
parameters = itertools.chain(parameters, layers[freeze_front_n_layers:].parameters())
first_layer_to_unfreeze = num_layers - unfreeze_last_n_layers
print(f" ❄️ freezing text encoder layers 1-{first_layer_to_unfreeze} out of {num_layers} layers total")
parameters = itertools.chain(parameters, layers[first_layer_to_unfreeze:].parameters())
if self.te_freeze_config.get('freeze_final_layer_norm', False):
# instead of freezing the final layer norm parameters, we simply do not return them
print(" ❄️ freezing final layer norm")
else:
if unfreeze_final_layer_norm:
parameters = itertools.chain(parameters, text_encoder.text_model.final_layer_norm.parameters())
else:
print(" ❄️ freezing final layer norm")
return parameters

View File

@ -20,7 +20,7 @@
},
"base": {
"optimizer": "adamw8bit",
"lr": 1e-6,
"lr": 2e-6,
"lr_scheduler": "constant",
"lr_decay_steps": null,
"lr_warmup_steps": null,
@ -39,8 +39,6 @@
"weight_decay": null
},
"text_encoder_freezing": {
"freeze_embeddings": true,
"freeze_front_n_layers": -6,
"freeze_final_layer_norm": false
"unfreeze_last_n_layers": 2
}
}