From 43984f2ad34df02b7dd122f697f8e231523eaf44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gabriel=20Rold=C3=A1n?=
Date: Mon, 17 Jul 2023 01:33:52 -0300
Subject: [PATCH 1/3] Add --keep_tags to keep first N tags fixed on shuffle

---
 data/every_dream.py      |  4 +++-
 data/image_train_item.py | 15 +++++++++++----
 train.py                 |  5 +++++
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/data/every_dream.py b/data/every_dream.py
index 3cd84f9..56f3a3f 100644
--- a/data/every_dream.py
+++ b/data/every_dream.py
@@ -41,6 +41,7 @@ class EveryDreamBatch(Dataset):
                  seed=555,
                  tokenizer=None,
                  shuffle_tags=False,
+                 keep_tags=0,
                  rated_dataset=False,
                  rated_dataset_dropout_target=0.5,
                  name='train'
@@ -54,6 +55,7 @@ class EveryDreamBatch(Dataset):
         self.tokenizer = tokenizer
         self.max_token_length = self.tokenizer.model_max_length
         self.shuffle_tags = shuffle_tags
+        self.keep_tags = keep_tags
         self.seed = seed
         self.rated_dataset = rated_dataset
         self.rated_dataset_dropout_target = rated_dataset_dropout_target
@@ -94,7 +96,7 @@ class EveryDreamBatch(Dataset):
         )
 
         if self.shuffle_tags or train_item["shuffle_tags"]:
-            example["caption"] = train_item["caption"].get_shuffled_caption(self.seed)
+            example["caption"] = train_item["caption"].get_shuffled_caption(self.seed, keep_tags=self.keep_tags)
         else:
             example["caption"] = train_item["caption"].get_caption()
 
diff --git a/data/image_train_item.py b/data/image_train_item.py
index 0af9b4d..27ad6ef 100644
--- a/data/image_train_item.py
+++ b/data/image_train_item.py
@@ -56,7 +56,7 @@ class ImageCaption:
     def rating(self) -> float:
         return self.__rating
 
-    def get_shuffled_caption(self, seed: int) -> str:
+    def get_shuffled_caption(self, seed: int, keep_tags: int) -> str:
         """
         returns the caption a string with a random selection of the tags in random order
         :param seed used to initialize the randomizer
@@ -74,7 +74,7 @@ class ImageCaption:
             if self.__use_weights:
                 tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length)
             else:
-                tags_caption = self.__get_shuffled_tags(seed, self.__tags)
+                tags_caption = self.__get_shuffled_tags(seed, self.__tags, keep_tags)
 
             return self.__main_prompt + ", " + tags_caption
         return self.__main_prompt
@@ -111,8 +111,15 @@ class ImageCaption:
         return caption
 
     @staticmethod
-    def __get_shuffled_tags(seed: int, tags: list[str]) -> str:
-        random.Random(seed).shuffle(tags)
+    def __get_shuffled_tags(seed: int, tags: list[str], keep_tags: int) -> str:
+        tags = tags.copy()
+
+        if len(tags) > keep_tags:
+            fixed_tags = tags[:keep_tags]
+            rest = tags[keep_tags:]
+            random.Random(seed).shuffle(rest)
+            tags = fixed_tags + rest
+
         return ", ".join(tags)
 
 class ImageTrainItem:
diff --git a/train.py b/train.py
index 58a2e3f..bc94961 100644
--- a/train.py
+++ b/train.py
@@ -350,6 +350,9 @@ def setup_args(args):
     if not args.shuffle_tags:
         args.shuffle_tags = False
 
+    if not args.keep_tags:
+        args.keep_tags = 0
+
     args.clip_skip = max(min(4, args.clip_skip), 0)
 
     if args.useadam8bit:
@@ -779,6 +782,7 @@ def main(args):
         tokenizer=tokenizer,
         seed = seed,
         shuffle_tags=args.shuffle_tags,
+        keep_tags=args.keep_tags,
         rated_dataset=args.rated_dataset,
         rated_dataset_dropout_target=(1.0 - (args.rated_dataset_target_dropout_percent / 100.0))
     )
@@ -1326,6 +1330,7 @@ if __name__ == "__main__":
     argparser.add_argument("--save_optimizer", action="store_true", default=False, help="saves optimizer state with ckpt, useful for resuming training later")
     argparser.add_argument("--seed", type=int, default=555, help="seed used for samples and shuffling, use -1 for random")
     argparser.add_argument("--shuffle_tags", action="store_true", default=False, help="randomly shuffles CSV tags in captions, for booru datasets")
+    argparser.add_argument("--keep_tags", type=int, default=0, help="number of leading tags to keep in place when shuffling, def: 0 (shuffle all)")
    argparser.add_argument("--useadam8bit", action="store_true", default=False, help="deprecated, use --optimizer_config and optimizer.json instead")
     argparser.add_argument("--wandb", action="store_true", default=False, help="enable wandb logging instead of tensorboard, requires env var WANDB_API_KEY")
     argparser.add_argument("--validation_config", default=None, help="Path to a JSON configuration file for the validator. Default is no validation.")

From 99a0431d0fd5831e95f36762f16930d0261c76a1 Mon Sep 17 00:00:00 2001
From: Gabriel Roldan
Date: Wed, 20 Sep 2023 19:50:34 -0300
Subject: [PATCH 2/3] Ignore negative keep_tags values

---
 data/image_train_item.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/image_train_item.py b/data/image_train_item.py
index 27ad6ef..31815c3 100644
--- a/data/image_train_item.py
+++ b/data/image_train_item.py
@@ -113,6 +113,7 @@ class ImageCaption:
     @staticmethod
     def __get_shuffled_tags(seed: int, tags: list[str], keep_tags: int) -> str:
         tags = tags.copy()
+        keep_tags = max(keep_tags, 0)
 
         if len(tags) > keep_tags:
             fixed_tags = tags[:keep_tags]

From f2d5a40f72d37d54ae29fe8ec04db3bf19384dd7 Mon Sep 17 00:00:00 2001
From: Gabriel Roldan
Date: Wed, 20 Sep 2023 19:51:14 -0300
Subject: [PATCH 3/3] Add --keep_tags docs and fix some typos

---
 doc/ADVANCED_TWEAKING.md | 28 +++++++++++++++++++---------
 doc/SHUFFLING_TAGS.md    |  1 +
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/doc/ADVANCED_TWEAKING.md b/doc/ADVANCED_TWEAKING.md
index dab9a47..fcfb81f 100644
--- a/doc/ADVANCED_TWEAKING.md
+++ b/doc/ADVANCED_TWEAKING.md
@@ -6,7 +6,7 @@ Start with the [Low VRAM guide](TWEAKING.md) if you are having trouble training
 
 ## Resolution
 
-You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost speed of training and higher VRAM use! Ex. 768 takes a significant amount more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
+You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost speed of training and higher VRAM use! Ex. 768 takes a significant amount of additional VRAM compared to 512, so you will need to compensate for that by reducing ```batch_size```.
 
     --resolution 640 ^
 
@@ -14,21 +14,21 @@ For instance, if training from the base 1.5 model, you can try trying at 576, 64
 
 If you are training on a base model that is 768, such as "SD 2.1 768-v", you should also probably use 768 as a base number and adjust from there.
 
-Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspcision is that the higher resolutions increase the gradients as more information is presented to the model per image.
+Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspicion is that the higher resolutions increase the gradients as more information is presented to the model per image.
 
- You may need to experiment with LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
+ You may need to experiment with the LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
 
 ## Log and ckpt save folders
 
 If you want to use a nondefault location for saving logs or ckpt files, these:
 
-Logdir defaults to the "logs" folder in the trainer directory. If you wan to save all logs (including diffuser copies of ckpts, sample images, and tensbooard events) use this:
+Logdir defaults to the "logs" folder in the trainer directory. If you want to save all logs (including diffuser copies of ckpts, sample images, and tensorboard events) use this:
 
     --logdir "/workspace/mylogs"
 
 Remember to use the same folder when you launch tensorboard (```tensorboard --logdir "/worksapce/mylogs"```) or it won't find your logs.
 
-By default the CKPT format copies of ckpts that are peroidically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
+By default the CKPT format copies of ckpts that are periodically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
 
     --save_ckpt_dir "r:\webui\models\stable-diffusion"
 
@@ -125,11 +125,11 @@ Seed can be used to make training either more or less deterministic. The seed v
 
 To use a random seed, use -1:
 
-    -- seed -1
+    --seed -1
 
 Default behavior is to use a fixed seed of 555. The seed you set is fixed for all samples if you set a value other than -1. If you set a seed it is also incrememted for shuffling your training data every epoch (i.e. 555, 556, 557, etc). This makes training more deterministic. I suggest a fixed seed when you are trying A/B test tweaks to your general training setup, or when you want all your test samples to use the same seed.
 
-Fixed seed should be using when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
+Fixed seed should be used when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
 
 ## Shuffle tags
 
@@ -139,15 +139,25 @@ For those training booru tagged models, you can use this arg to randomly (but de
 
 This simply chops the captions in to parts based on the commas and shuffles the order.
 
+If you want to keep the first N tags fixed (not shuffled), you can also add this parameter (`--shuffle_tags` must also be set):
+
+    --keep_tags 4 ^
+
+The above example will keep the first 4 tags (after the main prompt) fixed and shuffle the rest.
+
 ## Zero frequency noise
 
 Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffusion-with-offset-noise) zero frequency noise offsets the noise added to the image during training/denoising, which can help improve contrast and the ability to render very dark or very bright scenes more accurately, and may help slightly with color saturation.
 
     --zero_frequency_noise_ratio 0.05 ^
 
 Test results: https://huggingface.co/panopstor/ff7r-stable-diffusion/blob/main/zero_freq_test_biggs.webp
 
-Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergance over time.
+Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergence over time.
+
+## Zero terminal SNR
+
+Set `zero_frequency_noise_ratio` to -1.
 
 ## Keeping images together (custom batching)
 
@@ -264,4 +274,4 @@ While the calculation makes sense in how it compensates for inteval and total tr
 
     --ema_strength_target 0.10 ^
 
-If you use `ema_strength_target` the actual calculated `ema_decay_rate` used will be printed in your logs, and you should pay attention to this value and use it to inform your future decisions on EMA tuning.
\ No newline at end of file
+If you use `ema_strength_target` the actual calculated `ema_decay_rate` used will be printed in your logs, and you should pay attention to this value and use it to inform your future decisions on EMA tuning.
diff --git a/doc/SHUFFLING_TAGS.md b/doc/SHUFFLING_TAGS.md
index 5bf0f44..c740004 100644
--- a/doc/SHUFFLING_TAGS.md
+++ b/doc/SHUFFLING_TAGS.md
@@ -65,3 +65,4 @@ The effect of the limit is that the caption will always be truncated when the ma
 exceeded. This process does not consider if the cutoff is in the middle of a tag
 or even in the middle of a word if it is translated into several tokens.
 
+To mitigate this token limitation (when not using weighted shuffling), the `--keep_tags n` parameter can be employed. This ensures that the first n tags following the initial chunk remain static, while the remaining tags are shuffled.
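
As a quick illustration of the behavior this series implements (not part of the patches themselves; the helper name and sample tags below are made up for the example), the fixed-prefix shuffle boils down to keeping the first `keep_tags` tags in place and shuffling the remainder with a seeded RNG:

```python
import random

def shuffle_keeping_first(tags: list[str], seed: int, keep_tags: int = 0) -> str:
    """Shuffle comma-separated caption tags, leaving the first `keep_tags` in place."""
    tags = tags.copy()                      # never mutate the caller's list
    keep_tags = max(keep_tags, 0)           # negative values behave like 0 (patch 2/3)
    fixed, rest = tags[:keep_tags], tags[keep_tags:]
    random.Random(seed).shuffle(rest)       # deterministic for a given --seed
    return ", ".join(fixed + rest)

tags = ["1girl", "red hair", "outdoors", "smile", "night sky"]
print(shuffle_keeping_first(tags, seed=555, keep_tags=2))
# "1girl, red hair" always lead; the remaining tags appear in a seeded random order.
```

With `keep_tags=0` (the default) every tag is shuffled, which matches the pre-patch behavior of `--shuffle_tags`.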