simplify freezing text encoder layers config
parent dadf881f9a
commit eca442f7fb
@@ -197,6 +197,7 @@ The files will be in ```logs/[your project folder]/ep[N]_batch_schedule.txt``` a

 Clips the gradient norms to a maximum value. Default is None (no clipping). This is typically used for gradient explosion problems, which are generally not an issue with EveryDream, and the grad scaler in AMP mode keeps this from being much of a concern, but it may be worth experimenting with.

-    --clip_grad_norm 100000.0 ^
+    --clip_grad_norm 1.0 ^

-Early indications seem to show high values such as 100000 may be helpful. Low values like 1.0 will drastically reduce training speed. Default is no gradient normal clipping. There are also other ways to deal with gradient explosion, such as increasing optimizer epsilon.
+Default is no gradient norm clipping. There are also other ways to deal with gradient explosion, such as increasing optimizer epsilon.
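For context on what this flag controls, here is a minimal, hypothetical sketch of gradient-norm clipping in a generic PyTorch training step. It is not EveryDream's actual training loop; the toy model, data, and learning rate are made up purely to illustrate the operation that `--clip_grad_norm` configures.

```
import torch

# Hypothetical toy model and batch, purely to illustrate gradient-norm clipping.
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

x, y = torch.randn(4, 8), torch.randn(4, 1)
loss = torch.nn.functional.mse_loss(model(x), y)
loss.backward()

# Rescale gradients so their global L2 norm does not exceed max_norm.
# A huge max_norm (e.g. 100000.0) only catches pathological explosions,
# while a small one (e.g. 1.0) aggressively limits every update.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

optimizer.step()
optimizer.zero_grad()
```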
@@ -75,25 +75,13 @@ If you're training SD2.1 you will likely experience great benefit from partially

 ```
     "text_encoder_freezing": {
-        "freeze_embeddings": true,
-        "freeze_front_n_layers": -6,
-        "freeze_final_layer_norm": false
+        "unfreeze_last_n_layers": 2
     }
 ```

-The SD2.1 text encoder is arranged as follows:
-
-```
-embeddings -> CLIP text encoder (23 layers) -> final layer norm
-```
-
-(The SD1.5 text encoder is similar but it has only 12 CLIP layers.) Typically you would apply freezing starting from the left and moving to the right, although it might be interesting to experiment with different freezing patterns. You can control this using the following parameters:
-
-* `freeze_embeddings` freezes the front 2 layers (the text embeddings - recommend).
-* `freeze_front_n_layers` freezes the front N layers of the CLIP text encoder. You can also pass null to leave the CLIP layers unfrozen, or negative values to count from the back. In the example above, `-6` will freeze all but the last 6 layers.
-* `freeze_final_layer_norm` freezes the parameters for the text encoder's final `LayerNorm` operation.
-
-Recommended settings for SD2.1 are provided in `optimizerSD21.json`: frozen embeddings, all CLIP layers frozen except for the last 6, final layer norm unfrozen. If you want to experiment, start by trying different values for `freeze_front_n_layers`: `-2` is slower but seems to produce a higher quality model, whereas `-10` is faster but can be more difficult to control.
+This will freeze the text encoder up to, but not including, the last 2 layers; the earlier layers and the embeddings stay frozen at their original values.
+
+Recommended settings for SD2.1 are provided in `optimizerSD21.json`. Unfreezing more layers will speed up training at the expense of text encoder stability. You can also try unfreezing the embeddings by setting `"freeze_embeddings": false`. This may improve training, but it also seems to lead to quicker frying.

 ## General Beta, weight decay, epsilon, etc tuning

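To make the new setting concrete, here is a small, hypothetical sketch of how `unfreeze_last_n_layers` partitions the 23-layer SD2.1 text encoder (the layer count comes from the old documentation removed above). It mirrors the intent of the config rather than the project's exact code.

```
# Hypothetical illustration: which CLIP text encoder layers train for a given
# "unfreeze_last_n_layers" value, assuming SD2.1's 23 encoder layers.
num_layers = 23
unfreeze_last_n_layers = 2

first_unfrozen = num_layers - unfreeze_last_n_layers      # 0-based index 21
frozen = list(range(first_unfrozen))                      # layers 0-20 stay frozen
trainable = list(range(first_unfrozen, num_layers))       # layers 21-22 receive gradients

print(f"frozen: {len(frozen)} layers, trainable: {trainable}")
# By default the embeddings are also frozen whenever some layers are frozen,
# and the final layer norm stays trainable unless "freeze_final_layer_norm" is true.
```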
@@ -14,9 +14,7 @@
        "epsilon": "value added to denominator for numerical stability, unused for lion",
        "weight_decay": "weight decay (L2 penalty)",
        "------------------": "-----------------",
-       "freeze_embeddings": "whether to freeze the text embeddings",
-       "freeze_front_n_layers": "if not null, freeze the front N layers of the text encoder (you can pass eg -2 to leave only the last 2 layers unfrozen)",
-       "freeze_final_layer_norm": "whether to freeze the text encoder's final layer norm"
+       "unfreeze_last_n_layers": "if not null, freeze all parameters in the text encoder except for the last n layers and the final layer norm"
    },
    "base": {
        "optimizer": "adamw8bit",
@@ -39,8 +37,6 @@
        "weight_decay": null
    },
    "text_encoder_freezing": {
-       "freeze_embeddings": false,
-       "freeze_front_n_layers": null,
-       "freeze_final_layer_norm": false
+       "unfreeze_last_n_layers": null
    }
 }
@@ -373,28 +373,56 @@ class EveryDreamOptimizer():
         return optimizer

     def _apply_text_encoder_freeze(self, text_encoder) -> chain[Any]:
-        parameters = itertools.chain([])
-        if self.te_freeze_config.get('freeze_embeddings', False):
-            # freeze embeddings
-            print(" ❄️ freezing embeddings")
-        else:
-            parameters = itertools.chain(parameters, text_encoder.text_model.embeddings.parameters())
-        freeze_front_n_layers = self.te_freeze_config.get('freeze_front_n_layers', None)
-        if freeze_front_n_layers is None:
+        num_layers = len(text_encoder.text_model.encoder.layers)
+        unfreeze_embeddings = True
+        unfreeze_last_n_layers = None
+        unfreeze_final_layer_norm = True
+        if "freeze_front_n_layers" in self.te_freeze_config:
+            logging.warning(
+                ' * Found "freeze_front_n_layers" in JSON, please use "unfreeze_last_n_layers" instead')
+            freeze_front_n_layers = self.te_freeze_config["freeze_front_n_layers"]
+            if freeze_front_n_layers < 0:
+                # eg -2 = freeze all but the last 2
+                unfreeze_last_n_layers = -freeze_front_n_layers
+            else:
+                unfreeze_last_n_layers = num_layers - freeze_front_n_layers
+        if "unfreeze_last_n_layers" in self.te_freeze_config:
+            unfreeze_last_n_layers = self.te_freeze_config["unfreeze_last_n_layers"]
+
+        if unfreeze_last_n_layers is None:
+            # nothing specified: default behaviour
+            unfreeze_last_n_layers = num_layers
+        else:
+            # something specified:
+            assert(unfreeze_last_n_layers > 0)
+            if unfreeze_last_n_layers < num_layers:
+                # if we're unfreezing layers then by default we ought to freeze the embeddings
+                unfreeze_embeddings = False
+
+        if "freeze_embeddings" in self.te_freeze_config:
+            unfreeze_embeddings = not self.te_freeze_config["freeze_embeddings"]
+        if "freeze_final_layer_norm" in self.te_freeze_config:
+            unfreeze_final_layer_norm = not self.te_freeze_config["freeze_final_layer_norm"]
+
+        parameters = itertools.chain([])
+        if unfreeze_embeddings:
+            parameters = itertools.chain(parameters, text_encoder.text_model.embeddings.parameters())
+        else:
+            print(" ❄️ freezing embeddings")
+
+        if unfreeze_last_n_layers >= num_layers:
             parameters = itertools.chain(parameters, text_encoder.text_model.encoder.layers.parameters())
         else:
             # freeze the specified CLIP text encoder layers
             layers = text_encoder.text_model.encoder.layers
-            print(f" ❄️ freezing text encoder layers 0-{len(layers[:freeze_front_n_layers])} of {len(layers)}")
-            parameters = itertools.chain(parameters, layers[freeze_front_n_layers:].parameters())
+            first_layer_to_unfreeze = num_layers - unfreeze_last_n_layers
+            print(f" ❄️ freezing text encoder layers 1-{first_layer_to_unfreeze} out of {num_layers} layers total")
+            parameters = itertools.chain(parameters, layers[first_layer_to_unfreeze:].parameters())

-        if self.te_freeze_config.get('freeze_final_layer_norm', False):
-            # instead of freezing the final layer norm parameters, we simply do not return them
-            print(" ❄️ freezing final layer norm")
-        else:
+        if unfreeze_final_layer_norm:
             parameters = itertools.chain(parameters, text_encoder.text_model.final_layer_norm.parameters())
+        else:
+            print(" ❄️ freezing final layer norm")

         return parameters
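As a side note on the backwards-compatibility branch above, the deprecated `freeze_front_n_layers` value is translated into the new `unfreeze_last_n_layers` form. Here is a standalone, hypothetical sketch of that mapping (the helper name and the 23-layer default are illustrative, not the project's code), showing that the same arithmetic is applied:

```
def to_unfreeze_last_n_layers(freeze_front_n_layers: int, num_layers: int = 23) -> int:
    # Negative values count from the back: -6 means "freeze all but the last 6".
    if freeze_front_n_layers < 0:
        return -freeze_front_n_layers
    # Positive values freeze the first N layers, so only the remainder stays trainable.
    return num_layers - freeze_front_n_layers

# Both spellings of the old SD2.1 recommendation map to the same new value:
assert to_unfreeze_last_n_layers(-6) == 6
assert to_unfreeze_last_n_layers(17) == 6
```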
@@ -20,7 +20,7 @@
    },
    "base": {
        "optimizer": "adamw8bit",
-       "lr": 1e-6,
+       "lr": 2e-6,
        "lr_scheduler": "constant",
        "lr_decay_steps": null,
        "lr_warmup_steps": null,
@@ -39,8 +39,6 @@
        "weight_decay": null
    },
    "text_encoder_freezing": {
-       "freeze_embeddings": true,
-       "freeze_front_n_layers": -6,
-       "freeze_final_layer_norm": false
+       "unfreeze_last_n_layers": 2
    }
 }