Merge remote-tracking branch 'upstream/main' into feat_the_acculmunator

This commit is contained in:
Damian Stewart 2023-10-22 19:35:32 +02:00
commit 9396d2156e
12 changed files with 145 additions and 28 deletions

View File

@ -124,7 +124,8 @@
" 'wandb',\n",
" 'colorama',\n",
" 'keyboard',\n",
" 'lion-pytorch'\n",
" 'lion-pytorch',\n",
" 'safetensors'\n",
"]\n",
"\n",
"print(colored(0, 255, 0, 'Installing packages...'))\n",
@ -543,7 +544,7 @@
" --resolution $Resolution \\\n",
" --sample_prompts \"$Sample_File\" \\\n",
" --sample_steps $Steps_between_samples \\\n",
" --save_every_n_epoch $Save_every_N_epoch \\\n",
" --save_every_n_epochs $Save_every_N_epoch \\\n",
" --seed $Training_Seed \\\n",
" --zero_frequency_noise_ratio $zero_frequency_noise\n",
"\n",

View File

@ -41,6 +41,7 @@ class EveryDreamBatch(Dataset):
seed=555,
tokenizer=None,
shuffle_tags=False,
keep_tags=0,
rated_dataset=False,
rated_dataset_dropout_target=0.5,
name='train'
@ -54,6 +55,7 @@ class EveryDreamBatch(Dataset):
self.tokenizer = tokenizer
self.max_token_length = self.tokenizer.model_max_length
self.shuffle_tags = shuffle_tags
self.keep_tags = keep_tags
self.seed = seed
self.rated_dataset = rated_dataset
self.rated_dataset_dropout_target = rated_dataset_dropout_target
@ -94,7 +96,7 @@ class EveryDreamBatch(Dataset):
)
if self.shuffle_tags or train_item["shuffle_tags"]:
example["caption"] = train_item["caption"].get_shuffled_caption(self.seed)
example["caption"] = train_item["caption"].get_shuffled_caption(self.seed, keep_tags=self.keep_tags)
else:
example["caption"] = train_item["caption"].get_caption()

View File

@ -56,7 +56,7 @@ class ImageCaption:
def rating(self) -> float:
return self.__rating
def get_shuffled_caption(self, seed: int) -> str:
def get_shuffled_caption(self, seed: int, keep_tags: int) -> str:
"""
returns the caption as a string with a random selection of the tags in random order
:param seed used to initialize the randomizer
:param keep_tags number of leading tags that are kept in place (not shuffled)
@ -74,7 +74,7 @@ class ImageCaption:
if self.__use_weights:
tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length)
else:
tags_caption = self.__get_shuffled_tags(seed, self.__tags)
tags_caption = self.__get_shuffled_tags(seed, self.__tags, keep_tags)
return self.__main_prompt + ", " + tags_caption
return self.__main_prompt
@ -111,8 +111,16 @@ class ImageCaption:
return caption
@staticmethod
def __get_shuffled_tags(seed: int, tags: list[str]) -> str:
random.Random(seed).shuffle(tags)
def __get_shuffled_tags(seed: int, tags: list[str], keep_tags: int) -> str:
tags = tags.copy()
keep_tags = max(keep_tags, 0)
if len(tags) > keep_tags:
fixed_tags = tags[:keep_tags]
rest = tags[keep_tags:]
random.Random(seed).shuffle(rest)
tags = fixed_tags + rest
return ", ".join(tags)
class ImageTrainItem:
@ -306,8 +314,10 @@ class ImageTrainItem:
image_aspect = width / height
target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
self.is_undersized = (width * height) < (target_wh[0]*1.02 * target_wh[1]*1.02)
self.is_undersized = (width != target_wh[0] and height != target_wh[1]) and (width * height) < (target_wh[0]*1.02 * target_wh[1]*1.02)
self.target_wh = target_wh
self.image_size = image.size
except Exception as e:
self.error = e
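To make the updated undersized check easier to follow, here is a hedged standalone sketch; it assumes aspect buckets are `[width, height]` pairs (as in `data.aspects.ASPECTS_512` used by the tests below), and `pick_target_and_check_undersized` is an illustrative helper, not repo code.

```python
def pick_target_and_check_undersized(width: int, height: int, aspects: list[list[int]]):
    image_aspect = width / height
    # choose the bucket whose aspect ratio is closest to the image's
    target_wh = min(aspects, key=lambda wh: abs(wh[0] / wh[1] - image_aspect))
    # flag as undersized only if neither side already matches the bucket
    # and the pixel count is below the bucket area (with a ~2% tolerance per side)
    is_undersized = (width != target_wh[0] and height != target_wh[1]) \
        and (width * height) < (target_wh[0] * 1.02 * target_wh[1] * 1.02)
    return target_wh, is_undersized

# a 640x384 image exactly matches the 640x384 bucket and is not flagged
print(pick_target_and_check_undersized(640, 384, [[512, 512], [640, 384], [384, 640]]))
```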

View File

@ -6,7 +6,7 @@ Start with the [Low VRAM guide](TWEAKING.md) if you are having trouble training
## Resolution
You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost speed of training and higher VRAM use! Ex. 768 takes a significant amount more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost training speed and increase VRAM use! Ex. 768 takes significantly more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
--resolution 640 ^
@ -14,21 +14,21 @@ For instance, if training from the base 1.5 model, you can try trying at 576, 64
If you are training on a base model that is 768, such as "SD 2.1 768-v", you should also probably use 768 as a base number and adjust from there.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspcision is that the higher resolutions increase the gradients as more information is presented to the model per image.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspicion is that the higher resolutions increase the gradients as more information is presented to the model per image.
You may need to experiment with LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
You may need to experiment with the LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
## Log and ckpt save folders
If you want to use a non-default location for saving logs or ckpt files, use these arguments:
Logdir defaults to the "logs" folder in the trainer directory. If you wan to save all logs (including diffuser copies of ckpts, sample images, and tensbooard events) use this:
Logdir defaults to the "logs" folder in the trainer directory. If you want to save all logs (including diffuser copies of ckpts, sample images, and tensorboard events) use this:
--logdir "/workspace/mylogs"
Remember to use the same folder when you launch tensorboard (```tensorboard --logdir "/workspace/mylogs"```) or it won't find your logs.
By default the CKPT format copies of ckpts that are peroidically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
By default the CKPT format copies of ckpts that are periodically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
--save_ckpt_dir "r:\webui\models\stable-diffusion"
@ -125,11 +125,11 @@ Seed can be used to make training either more or less deterministic. The seed v
To use a random seed, use -1:
-- seed -1
--seed -1
Default behavior is to use a fixed seed of 555. The seed you set is fixed for all samples if you set a value other than -1. If you set a seed it is also incremented for shuffling your training data every epoch (i.e. 555, 556, 557, etc). This makes training more deterministic. I suggest a fixed seed when you are A/B testing tweaks to your general training setup, or when you want all your test samples to use the same seed.
Fixed seed should be using when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
Fixed seed should be used when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
## Shuffle tags
@ -139,6 +139,12 @@ For those training booru tagged models, you can use this arg to randomly (but de
This simply chops the captions into parts based on the commas and shuffles the order.
If you want to keep the first N tags static, you can also add this parameter (`--shuffle_tags` must also be set):
--keep_tags 4 ^
The above example will keep the first 4 additional tags static and shuffle the rest.
## Zero frequency noise
Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffusion-with-offset-noise) zero frequency noise offsets the noise added to the image during training/denoising, which can help improve contrast and the ability to render very dark or very bright scenes more accurately, and may help slightly with color saturation.
@ -149,7 +155,11 @@ Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffu
Test results: https://huggingface.co/panopstor/ff7r-stable-diffusion/blob/main/zero_freq_test_biggs.webp
Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergance over time.
Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergence over time.
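For orientation, a minimal sketch of the offset-noise idea from the linked blog post; this illustrates the technique itself and is not necessarily the trainer's exact implementation.

```python
import torch

def offset_noise(latents: torch.Tensor, zero_frequency_noise_ratio: float) -> torch.Tensor:
    """Add a per-sample, per-channel constant offset to the usual Gaussian noise."""
    noise = torch.randn_like(latents)
    if zero_frequency_noise_ratio > 0:
        # one scalar per (batch, channel), broadcast over the spatial dims
        offset = torch.randn(latents.shape[0], latents.shape[1], 1, 1, device=latents.device)
        noise = noise + zero_frequency_noise_ratio * offset
    return noise
```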
## Zero terminal SNR
Set `zero_frequency_noise_ratio` to -1.
## Keeping images together (custom batching)
@ -264,4 +274,4 @@ While the calculation makes sense in how it compensates for inteval and total tr
--ema_strength_target 0.10 ^
If you use `ema_strength_target` the actual calculated `ema_decay_rate` used will be printed in your logs, and you should pay attention to this value and use it to inform your future decisions on EMA tuning.
If you use `ema_strength_target` the actual calculated `ema_decay_rate` used will be printed in your logs, and you should pay attention to this value and use it to inform your future decisions on EMA tuning.
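As a rough orientation only, the conversion could be reasoned about as below, assuming `ema_strength_target` is interpreted as the weight the original parameters retain after all EMA updates; this is a hedged sketch of one plausible calculation, not the trainer's verified formula, so trust the `ema_decay_rate` printed in your logs.

```python
def decay_rate_from_strength_target(strength_target: float,
                                    total_steps: int,
                                    ema_update_interval: int) -> float:
    """If the EMA is applied total_steps / ema_update_interval times, the original
    weights keep a factor of decay ** num_updates; solving
    decay ** num_updates = strength_target gives the rate below.
    (Assumption about what 'strength target' means, for illustration only.)"""
    num_updates = max(1, total_steps // ema_update_interval)
    return strength_target ** (1.0 / num_updates)

# e.g. a 0.10 target over 10,000 steps with an update every 50 steps
print(decay_rate_from_strength_target(0.10, 10_000, 50))  # ~0.9886
```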

View File

@ -4,7 +4,7 @@
`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"`
This script uses two example image/caption pairs located in the `/example` folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the captoin in the same folder.
This script uses two example image/caption pairs located in the `/example` folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the caption in the same folder.
This script currently requires an AMPERE or newer GPU due to using bfloat16.

View File

@ -29,12 +29,13 @@ For each of the `unet` and `text_encoder` sections, you can set the following pr
Standard full precision AdamW optimizer exposed by PyTorch. Not recommended. Slower and uses more memory than adamw8bit. Widely documented on the web.
* adamw8bit
* lion8bit
Tim Dettmers / bitsandbytes AdamW 8bit optimizer. This is the default and recommended setting. Widely documented on the web.
Tim Dettmers / bitsandbytes AdamW and Lion 8bit optimizers. adamw8bit is the default and recommended setting as it is well understood, and lion8bit is very VRAM efficient. Widely documented on the web.
* lion
Lucidrains' [implementation](https://github.com/lucidrains/lion-pytorch) of the [lion optimizer](https://arxiv.org/abs/2302.06675). Click links to read more. `Epsilon` is not used by lion.
Lucidrains' [implementation](https://github.com/lucidrains/lion-pytorch) of the [lion optimizer](https://arxiv.org/abs/2302.06675). Click links to read more. `Epsilon` is not used by lion. You should prefer lion8bit over this optimizer as it is more memory efficient.
Recommended settings for lion based on the paper are as follows:
@ -61,7 +62,13 @@ Available optimizer values for Dadaptation are:
* dadapt_lion, dadapt_adam, dadapt_sgd
These are fairly experimental but tested as working. Gradient checkpointing may be required even on 24GB GPUs. Performance is slower than the compiled and optimized AdamW8bit optimizer unless you increae gradient accumulation as it seems the accumulation steps process slowly with the current implementation of D-Adaption
These are fairly experimental but tested as working. Gradient checkpointing may be required even on 24GB GPUs. Performance is slower than the compiled and optimized AdamW8bit optimizer unless you increase gradient accumulation, as it seems the accumulation steps process slowly with the current implementation of D-Adaptation.
#### Prodigy
Another adaptive optimizer. It is not very VRAM efficient. [Github](https://github.com/konstmish/prodigy), [Paper](https://arxiv.org/pdf/2306.06101.pdf)
* prodigy
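For context, a minimal standalone use of the `prodigyopt` package outside the trainer; `lr=1.0` and the other settings follow the package's general guidance and are assumptions here, not ED2 defaults.

```python
import torch
from prodigyopt import Prodigy

model = torch.nn.Linear(16, 16)  # stand-in model
# Prodigy adapts the step size itself, so lr is typically left at 1.0;
# safeguard_warmup and use_bias_correction follow the package's suggestions
optimizer = Prodigy(model.parameters(), lr=1.0,
                    weight_decay=0.01,
                    use_bias_correction=True,
                    safeguard_warmup=True)

loss = model(torch.randn(4, 16)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```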
## Optimizer parameters

View File

@ -65,3 +65,4 @@ The effect of the limit is that the caption will always be truncated when the ma
exceeded. This process does not consider if the cutoff is in the middle of a tag or even in the middle of a
word if it is translated into several tokens.
To mitigate this token limitation (when not using weighted shuffling), the `--keep_tags n` parameter can be employed. This ensures that the first n tags following the initial chunk remain static, while the remaining tags are shuffled.

View File

@ -16,3 +16,4 @@ speedtest-cli
tensorboard==2.12.0
wandb
safetensors
prodigyopt

View File

@ -276,6 +276,7 @@ class EveryDreamOptimizer():
decouple = True # seems bad to turn off, dadapt_adam only
momentum = 0.0 # dadapt_sgd
no_prox = False # ????, dadapt_adan
use_bias_correction = True # suggested by prodigy github
growth_rate=float("inf") # dadapt various, no idea what a sane default is
if local_optimizer_config is not None:
@ -309,6 +310,7 @@ class EveryDreamOptimizer():
)
elif optimizer_name == "lion8bit":
from bitsandbytes.optim import Lion8bit
opt_class = Lion8bit
optimizer = opt_class(
itertools.chain(parameters),
lr=curr_lr,
@ -316,8 +318,19 @@ class EveryDreamOptimizer():
weight_decay=weight_decay,
percentile_clipping=100,
min_8bit_size=4096,
)
elif optimizer_name == "prodigy":
from prodigyopt import Prodigy
opt_class = Prodigy
safeguard_warmup = True # per recommendation from prodigy documentation
optimizer = opt_class(
itertools.chain(parameters),
lr=curr_lr,
weight_decay=weight_decay,
use_bias_correction=use_bias_correction,
growth_rate=growth_rate,
d0=d0,
log_every=args.log_step,
safeguard_warmup=safeguard_warmup
)
elif optimizer_name == "adamw":
opt_class = torch.optim.AdamW
@ -329,7 +342,7 @@ class EveryDreamOptimizer():
elif optimizer_name == "scalar_dowg":
opt_class = dowg.ScalarDoWG
else:
raise ValueError(f"Unknown DoWG optimizer {optimizer_name}. Available options are coordinate_dowg and scalar_dowg")
raise ValueError(f"Unknown DoWG optimizer {optimizer_name}. Available options are 'coordinate_dowg' and 'scalar_dowg'")
elif optimizer_name in ["dadapt_adam", "dadapt_lion", "dadapt_sgd"]:
import dadaptation

View File

@ -4,6 +4,7 @@ import pathlib
import PIL.Image as Image
from data.image_train_item import ImageCaption, ImageTrainItem
import data.aspects as aspects
DATA_PATH = pathlib.Path('./test/data')
@ -32,4 +33,70 @@ class TestImageCaption(unittest.TestCase):
self.assertEqual(caption.get_caption(), "hello world, one, two, three")
caption = ImageCaption("hello world", 1.0, [], [], 2048, False)
self.assertEqual(caption.get_caption(), "hello world")
self.assertEqual(caption.get_caption(), "hello world")
class TestImageTrainItemConstructor(unittest.TestCase):
def tearDown(self) -> None:
for file in DATA_PATH.glob("img_*"):
file.unlink()
return super().tearDown()
@staticmethod
def image_with_size(width, height):
filename = DATA_PATH / "img_{}x{}.jpg".format(width, height)
Image.new("RGB", (width, height)).save(filename)
caption = ImageCaption("hello world", 1.0, [], [], 2048, False)
return ImageTrainItem(None, caption, aspects.ASPECTS_512, filename, 0.0, 1.0, False, False, 0)
def test_target_size_computation(self):
# Square images
image = self.image_with_size(30, 30)
self.assertEqual(image.target_wh, [512,512])
self.assertTrue(image.is_undersized)
self.assertEqual(image.image_size, (30,30))
image = self.image_with_size(512, 512)
self.assertEqual(image.target_wh, [512,512])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (512,512))
image = self.image_with_size(580, 580)
self.assertEqual(image.target_wh, [512,512])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (580,580))
# Landscape images
image = self.image_with_size(64, 38)
self.assertEqual(image.target_wh, [640,384])
self.assertTrue(image.is_undersized)
self.assertEqual(image.image_size, (64,38))
image = self.image_with_size(640, 384)
self.assertEqual(image.target_wh, [640,384])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (640,384))
image = self.image_with_size(704, 422)
self.assertEqual(image.target_wh, [640,384])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (704,422))
# Portrait images
image = self.image_with_size(38, 64)
self.assertEqual(image.target_wh, [384,640])
self.assertTrue(image.is_undersized)
self.assertEqual(image.image_size, (38,64))
image = self.image_with_size(384, 640)
self.assertEqual(image.target_wh, [384,640])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (384,640))
image = self.image_with_size(422, 704)
self.assertEqual(image.target_wh, [384,640])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (422,704))

View File

@ -159,7 +159,6 @@ def save_model(save_path, ed_state: EveryDreamTrainingState, global_step: int, s
logging.warning(" No model to save, something likely blew up on startup, not saving")
return
if args.ema_decay_rate != None:
pipeline_ema = StableDiffusionPipeline(
vae=ed_state.vae,
@ -350,6 +349,9 @@ def setup_args(args):
if not args.shuffle_tags:
args.shuffle_tags = False
if not args.keep_tags:
args.keep_tags = 0
args.clip_skip = max(min(4, args.clip_skip), 0)
if args.useadam8bit:
@ -779,6 +781,7 @@ def main(args):
tokenizer=tokenizer,
seed = seed,
shuffle_tags=args.shuffle_tags,
keep_tags=args.keep_tags,
rated_dataset=args.rated_dataset,
rated_dataset_dropout_target=(1.0 - (args.rated_dataset_target_dropout_percent / 100.0))
)
@ -1208,15 +1211,15 @@ def main(args):
last_epoch_saved_time = time.time()
logging.info(f"Saving model, {args.ckpt_every_n_minutes} mins at step {global_step}")
needs_save = True
if epoch > 0 and epoch % args.save_every_n_epochs == 0 and step == 0 and epoch < args.max_epochs - 1 and epoch >= args.save_ckpts_from_n_epochs:
if epoch > 0 and epoch % args.save_every_n_epochs == 0 and step == 0 and epoch < args.max_epochs and epoch >= args.save_ckpts_from_n_epochs:
logging.info(f" Saving model, {args.save_every_n_epochs} epochs at step {global_step}")
needs_save = True
if needs_save:
save_path = make_save_path(epoch, global_step)
save_model(save_path, global_step=global_step, ed_state=make_current_ed_state(),
save_ckpt_dir=None, yaml_name=None,
save_ckpt_dir=args.save_ckpt_dir, yaml_name=None,
save_full_precision=args.save_full_precision,
save_optimizer_flag=args.save_optimizer, save_ckpt=False)
save_optimizer_flag=args.save_optimizer, save_ckpt=not args.no_save_ckpt)
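For clarity, the updated periodic-save condition above can be read as a small predicate; `should_save` below is an illustrative standalone sketch, not a function in the trainer.

```python
def should_save(epoch, step, save_every_n_epochs, max_epochs, save_ckpts_from_n_epochs):
    """Mirrors the updated periodic-save condition in the hunk above."""
    return (epoch > 0
            and epoch % save_every_n_epochs == 0
            and step == 0
            and epoch < max_epochs
            and epoch >= save_ckpts_from_n_epochs)

# e.g. with save_every_n_epochs=2 and max_epochs=10, saves fire at the start of epochs 2, 4, 6, 8
print([e for e in range(10) if should_save(e, 0, 2, 10, 0)])
```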
plugin_runner.run_on_step_end(epoch=epoch,
global_step=global_step,
@ -1335,6 +1338,7 @@ if __name__ == "__main__":
argparser.add_argument("--save_optimizer", action="store_true", default=False, help="saves optimizer state with ckpt, useful for resuming training later")
argparser.add_argument("--seed", type=int, default=555, help="seed used for samples and shuffling, use -1 for random")
argparser.add_argument("--shuffle_tags", action="store_true", default=False, help="randomly shuffles CSV tags in captions, for booru datasets")
argparser.add_argument("--keep_tags", type=int, default=0, help="Number of tags to keep when shuffle, def: 0 (shuffle all)")
argparser.add_argument("--useadam8bit", action="store_true", default=False, help="deprecated, use --optimizer_config and optimizer.json instead")
argparser.add_argument("--wandb", action="store_true", default=False, help="enable wandb logging instead of tensorboard, requires env var WANDB_API_KEY")
argparser.add_argument("--validation_config", default=None, help="Path to a JSON configuration file for the validator. Default is no validation.")

View File

@ -23,6 +23,7 @@ pip install compel~=1.1.3
pip install dadaptation
pip install safetensors
pip install open-flamingo==2.0.0
pip install prodigyopt
python utils/get_yamls.py
GOTO :eof