Merge pull request #214 from luisgabrielroldan/keep_tags

Add --keep_tags to keep first N tags fixed on shuffle
This commit is contained in:
Victor Hall 2023-09-25 13:10:21 -04:00 committed by GitHub
commit e8e4f0c2ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 14 deletions

View File

@ -41,6 +41,7 @@ class EveryDreamBatch(Dataset):
seed=555,
tokenizer=None,
shuffle_tags=False,
keep_tags=0,
rated_dataset=False,
rated_dataset_dropout_target=0.5,
name='train'
@ -54,6 +55,7 @@ class EveryDreamBatch(Dataset):
self.tokenizer = tokenizer
self.max_token_length = self.tokenizer.model_max_length
self.shuffle_tags = shuffle_tags
self.keep_tags = keep_tags
self.seed = seed
self.rated_dataset = rated_dataset
self.rated_dataset_dropout_target = rated_dataset_dropout_target
@ -94,7 +96,7 @@ class EveryDreamBatch(Dataset):
)
if self.shuffle_tags or train_item["shuffle_tags"]:
example["caption"] = train_item["caption"].get_shuffled_caption(self.seed)
example["caption"] = train_item["caption"].get_shuffled_caption(self.seed, keep_tags=self.keep_tags)
else:
example["caption"] = train_item["caption"].get_caption()

View File

@ -56,7 +56,7 @@ class ImageCaption:
def rating(self) -> float:
return self.__rating
def get_shuffled_caption(self, seed: int) -> str:
def get_shuffled_caption(self, seed: int, keep_tags: int) -> str:
"""
returns the caption a string with a random selection of the tags in random order
:param seed used to initialize the randomizer
@ -74,7 +74,7 @@ class ImageCaption:
if self.__use_weights:
tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length)
else:
tags_caption = self.__get_shuffled_tags(seed, self.__tags)
tags_caption = self.__get_shuffled_tags(seed, self.__tags, keep_tags)
return self.__main_prompt + ", " + tags_caption
return self.__main_prompt
@ -111,8 +111,16 @@ class ImageCaption:
return caption
@staticmethod
def __get_shuffled_tags(seed: int, tags: list[str]) -> str:
random.Random(seed).shuffle(tags)
def __get_shuffled_tags(seed: int, tags: list[str], keep_tags: int) -> str:
    """
    returns the tags joined by ", " with the first keep_tags tags left in place
    and the remaining tags shuffled deterministically
    :param seed used to initialize the randomizer
    :param tags the tags to shuffle; the caller's list is never modified
    :param keep_tags number of leading tags to keep fixed; negative values are
           treated as 0 (shuffle all), values >= len(tags) leave the order unchanged
    :return: the tags as a single comma separated string
    """
    # Clamp from below: a negative count must mean "keep nothing", not
    # "slice from the end", and a positive count must survive the clamp.
    keep_tags = max(keep_tags, 0)
    # Slicing copies, so the shuffle below cannot touch the caller's list.
    fixed_tags = tags[:keep_tags]
    rest = tags[keep_tags:]
    random.Random(seed).shuffle(rest)
    return ", ".join(fixed_tags + rest)
class ImageTrainItem:

View File

@ -6,7 +6,7 @@ Start with the [Low VRAM guide](TWEAKING.md) if you are having trouble training
## Resolution
You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost speed of training and higher VRAM use! Ex. 768 takes a significant amount more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost speed of training and higher VRAM use! Ex. 768 takes significantly more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
--resolution 640 ^
@ -14,21 +14,21 @@ For instance, if training from the base 1.5 model, you can try trying at 576, 64
If you are training on a base model that is 768, such as "SD 2.1 768-v", you should also probably use 768 as a base number and adjust from there.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspcision is that the higher resolutions increase the gradients as more information is presented to the model per image.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspicion is that the higher resolutions increase the gradients as more information is presented to the model per image.
You may need to experiment with LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
You may need to experiment with the LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
## Log and ckpt save folders
If you want to use a nondefault location for saving logs or ckpt files, use these options:
Logdir defaults to the "logs" folder in the trainer directory. If you wan to save all logs (including diffuser copies of ckpts, sample images, and tensbooard events) use this:
Logdir defaults to the "logs" folder in the trainer directory. If you want to save all logs (including diffuser copies of ckpts, sample images, and tensorboard events) use this:
--logdir "/workspace/mylogs"
Remember to use the same folder when you launch tensorboard (```tensorboard --logdir "/workspace/mylogs"```) or it won't find your logs.
By default the CKPT format copies of ckpts that are peroidically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
By default the CKPT format copies of ckpts that are periodically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
--save_ckpt_dir "r:\webui\models\stable-diffusion"
@ -129,7 +129,7 @@ To use a random seed, use -1:
Default behavior is to use a fixed seed of 555. The seed you set is fixed for all samples if you set a value other than -1. If you set a seed it is also incremented for shuffling your training data every epoch (i.e. 555, 556, 557, etc). This makes training more deterministic. I suggest a fixed seed when you are trying to A/B test tweaks to your general training setup, or when you want all your test samples to use the same seed.
Fixed seed should be using when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
Fixed seed should be used when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
## Shuffle tags
@ -139,6 +139,12 @@ For those training booru tagged models, you can use this arg to randomly (but de
This simply chops the captions into parts based on the commas and shuffles the order.
If you want to keep the first N tags static, you can also add this parameter (`--shuffle_tags` must also be set):
--keep_tags 4 ^
The above example will keep the first 4 additional tags static, and shuffle the rest.
## Zero frequency noise
Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffusion-with-offset-noise) zero frequency noise offsets the noise added to the image during training/denoising, which can help improve contrast and the ability to render very dark or very bright scenes more accurately, and may help slightly with color saturation.
@ -149,7 +155,11 @@ Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffu
Test results: https://huggingface.co/panopstor/ff7r-stable-diffusion/blob/main/zero_freq_test_biggs.webp
Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergance over time.
Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergence over time.
## Zero terminal SNR
Set `zero_frequency_noise_ratio` to -1.
## Keeping images together (custom batching)

View File

@ -65,3 +65,4 @@ The effect of the limit is that the caption will always be truncated when the ma
exceeded. This process does not consider if the cutoff is in the middle of a tag or even in the middle of a
word if it is translated into several tokens.
To mitigate this token limitation (when not using weighted shuffling), the `--keep_tags n` parameter can be employed. This ensures that the first n tags following the initial chunk remain static, while the remaining tags are shuffled.

View File

@ -349,6 +349,9 @@ def setup_args(args):
if not args.shuffle_tags:
args.shuffle_tags = False
if not args.keep_tags:
args.keep_tags = 0
args.clip_skip = max(min(4, args.clip_skip), 0)
if args.useadam8bit:
@ -778,6 +781,7 @@ def main(args):
tokenizer=tokenizer,
seed = seed,
shuffle_tags=args.shuffle_tags,
keep_tags=args.keep_tags,
rated_dataset=args.rated_dataset,
rated_dataset_dropout_target=(1.0 - (args.rated_dataset_target_dropout_percent / 100.0))
)
@ -1325,6 +1329,7 @@ if __name__ == "__main__":
argparser.add_argument("--save_optimizer", action="store_true", default=False, help="saves optimizer state with ckpt, useful for resuming training later")
argparser.add_argument("--seed", type=int, default=555, help="seed used for samples and shuffling, use -1 for random")
argparser.add_argument("--shuffle_tags", action="store_true", default=False, help="randomly shuffles CSV tags in captions, for booru datasets")
argparser.add_argument("--keep_tags", type=int, default=0, help="Number of tags to keep when shuffle, def: 0 (shuffle all)")
argparser.add_argument("--useadam8bit", action="store_true", default=False, help="deprecated, use --optimizer_config and optimizer.json instead")
argparser.add_argument("--wandb", action="store_true", default=False, help="enable wandb logging instead of tensorboard, requires env var WANDB_API_KEY")
argparser.add_argument("--validation_config", default=None, help="Path to a JSON configuration file for the validator. Default is no validation.")