Merge remote-tracking branch 'upstream/main' into feat_the_acculmunator

This commit is contained in:
Damian Stewart 2023-10-22 19:35:32 +02:00
commit 9396d2156e
12 changed files with 145 additions and 28 deletions

View File

@ -124,7 +124,8 @@
" 'wandb',\n",
" 'colorama',\n",
" 'keyboard',\n",
" 'lion-pytorch'\n",
" 'lion-pytorch',\n",
" 'safetensors'\n",
"]\n",
"\n",
"print(colored(0, 255, 0, 'Installing packages...'))\n",
@ -543,7 +544,7 @@
" --resolution $Resolution \\\n",
" --sample_prompts \"$Sample_File\" \\\n",
" --sample_steps $Steps_between_samples \\\n",
" --save_every_n_epoch $Save_every_N_epoch \\\n",
" --save_every_n_epochs $Save_every_N_epoch \\\n",
" --seed $Training_Seed \\\n",
" --zero_frequency_noise_ratio $zero_frequency_noise\n",
"\n",

View File

@ -41,6 +41,7 @@ class EveryDreamBatch(Dataset):
seed=555,
tokenizer=None,
shuffle_tags=False,
keep_tags=0,
rated_dataset=False,
rated_dataset_dropout_target=0.5,
name='train'
@ -54,6 +55,7 @@ class EveryDreamBatch(Dataset):
self.tokenizer = tokenizer
self.max_token_length = self.tokenizer.model_max_length
self.shuffle_tags = shuffle_tags
self.keep_tags = keep_tags
self.seed = seed
self.rated_dataset = rated_dataset
self.rated_dataset_dropout_target = rated_dataset_dropout_target
@ -94,7 +96,7 @@ class EveryDreamBatch(Dataset):
)
if self.shuffle_tags or train_item["shuffle_tags"]:
example["caption"] = train_item["caption"].get_shuffled_caption(self.seed)
example["caption"] = train_item["caption"].get_shuffled_caption(self.seed, keep_tags=self.keep_tags)
else:
example["caption"] = train_item["caption"].get_caption()

View File

@ -56,7 +56,7 @@ class ImageCaption:
def rating(self) -> float:
return self.__rating
def get_shuffled_caption(self, seed: int) -> str:
def get_shuffled_caption(self, seed: int, keep_tags: int) -> str:
"""
returns the caption as a string with a random selection of the tags in random order
:param seed used to initialize the randomizer
:param keep_tags number of leading tags that are kept in place (not shuffled)
@ -74,7 +74,7 @@ class ImageCaption:
if self.__use_weights:
tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length)
else:
tags_caption = self.__get_shuffled_tags(seed, self.__tags)
tags_caption = self.__get_shuffled_tags(seed, self.__tags, keep_tags)
return self.__main_prompt + ", " + tags_caption
return self.__main_prompt
@ -111,8 +111,16 @@ class ImageCaption:
return caption
@staticmethod
def __get_shuffled_tags(seed: int, tags: list[str]) -> str:
random.Random(seed).shuffle(tags)
def __get_shuffled_tags(seed: int, tags: list[str], keep_tags: int) -> str:
tags = tags.copy()
keep_tags = max(keep_tags, 0)
if len(tags) > keep_tags:
fixed_tags = tags[:keep_tags]
rest = tags[keep_tags:]
random.Random(seed).shuffle(rest)
tags = fixed_tags + rest
return ", ".join(tags)
class ImageTrainItem:
@ -306,8 +314,10 @@ class ImageTrainItem:
image_aspect = width / height
target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
self.is_undersized = (width * height) < (target_wh[0]*1.02 * target_wh[1]*1.02)
self.is_undersized = (width != target_wh[0] and height != target_wh[1]) and (width * height) < (target_wh[0]*1.02 * target_wh[1]*1.02)
self.target_wh = target_wh
self.image_size = image.size
except Exception as e:
self.error = e
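To make the updated undersized check easier to follow, here is a hedged standalone sketch; it assumes aspect buckets are `[width, height]` pairs (as in `data.aspects.ASPECTS_512` used by the tests below), and `pick_target_and_check_undersized` is an illustrative helper, not repo code.

```python
def pick_target_and_check_undersized(width: int, height: int, aspects: list[list[int]]):
    image_aspect = width / height
    # choose the bucket whose aspect ratio is closest to the image's
    target_wh = min(aspects, key=lambda wh: abs(wh[0] / wh[1] - image_aspect))
    # flag as undersized only if neither side already matches the bucket
    # and the pixel count is below the bucket area (with a ~2% tolerance per side)
    is_undersized = (width != target_wh[0] and height != target_wh[1]) \
        and (width * height) < (target_wh[0] * 1.02 * target_wh[1] * 1.02)
    return target_wh, is_undersized

# a 640x384 image exactly matches the 640x384 bucket and is not flagged
print(pick_target_and_check_undersized(640, 384, [[512, 512], [640, 384], [384, 640]]))
```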

View File

@ -6,7 +6,7 @@ Start with the [Low VRAM guide](TWEAKING.md) if you are having trouble training
## Resolution
You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost speed of training and higher VRAM use! Ex. 768 takes a significant amount more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
You can train resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This will work out better when you have a lot of training data (hundreds+) and enable slightly higher resolution at inference time without seeing repeats in your generated images. This does cost training speed and increase VRAM use! Ex. 768 takes significantly more VRAM than 512, so you will need to compensate for that by reducing ```batch_size```.
--resolution 640 ^
@ -14,21 +14,21 @@ For instance, if training from the base 1.5 model, you can try trying at 576, 64
If you are training on a base model that is 768, such as "SD 2.1 768-v", you should also probably use 768 as a base number and adjust from there.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspcision is that the higher resolutions increase the gradients as more information is presented to the model per image.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspicion is that the higher resolutions increase the gradients as more information is presented to the model per image.
You may need to experiment with LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
You may need to experiment with the LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest if you train SD1.5 which is a 512 model at resolution 768 you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
## Log and ckpt save folders
If you want to use a non-default location for saving logs or ckpt files, use these arguments:
Logdir defaults to the "logs" folder in the trainer directory. If you wan to save all logs (including diffuser copies of ckpts, sample images, and tensbooard events) use this:
Logdir defaults to the "logs" folder in the trainer directory. If you want to save all logs (including diffuser copies of ckpts, sample images, and tensorboard events) use this:
--logdir "/workspace/mylogs"
Remember to use the same folder when you launch tensorboard (```tensorboard --logdir "/workspace/mylogs"```) or it won't find your logs.
By default the CKPT format copies of ckpts that are peroidically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
By default the CKPT format copies of ckpts that are periodically saved are saved in the trainer root folder. If you want to save them elsewhere, use this:
--save_ckpt_dir "r:\webui\models\stable-diffusion"
@ -125,11 +125,11 @@ Seed can be used to make training either more or less deterministic. The seed v
To use a random seed, use -1:
-- seed -1
--seed -1
Default behavior is to use a fixed seed of 555. The seed you set is fixed for all samples if you set a value other than -1. If you set a seed it is also incremented for shuffling your training data every epoch (i.e. 555, 556, 557, etc). This makes training more deterministic. I suggest a fixed seed when you are A/B testing tweaks to your general training setup, or when you want all your test samples to use the same seed.
Fixed seed should be using when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
Fixed seed should be used when performing A/B tests or hyperparameter sweeps. Random seed (-1) may be better if you are stopping and resuming training often so every restart is using random values for all of the various randomness sources used in training such as noising and data shuffling.
## Shuffle tags
@ -139,6 +139,12 @@ For those training booru tagged models, you can use this arg to randomly (but de
This simply chops the captions into parts based on the commas and shuffles the order.
If you want to keep the first N tags static, you can also add this parameter (`--shuffle_tags` must also be set):
--keep_tags 4 ^
The above example will keep the first 4 additional tags static and shuffle the rest.
## Zero frequency noise
Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffusion-with-offset-noise) zero frequency noise offsets the noise added to the image during training/denoising, which can help improve contrast and the ability to render very dark or very bright scenes more accurately, and may help slightly with color saturation.
@ -149,7 +155,11 @@ Based on [Nicholas Guttenberg's blog post](https://www.crosslabs.org//blog/diffu
Test results: https://huggingface.co/panopstor/ff7r-stable-diffusion/blob/main/zero_freq_test_biggs.webp
Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergance over time.
Very tentatively, I suggest closer to 0.10 for short term training, and lower values of around 0.02 to 0.03 for longer runs (50k+ steps). Early indications seem to suggest values like 0.10 can cause divergence over time.
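For orientation, a minimal sketch of the offset-noise idea from the linked blog post; this illustrates the technique itself and is not necessarily the trainer's exact implementation.

```python
import torch

def offset_noise(latents: torch.Tensor, zero_frequency_noise_ratio: float) -> torch.Tensor:
    """Add a per-sample, per-channel constant offset to the usual Gaussian noise."""
    noise = torch.randn_like(latents)
    if zero_frequency_noise_ratio > 0:
        # one scalar per (batch, channel), broadcast over the spatial dims
        offset = torch.randn(latents.shape[0], latents.shape[1], 1, 1, device=latents.device)
        noise = noise + zero_frequency_noise_ratio * offset
    return noise
```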
## Zero terminal SNR
Set `zero_frequency_noise_ratio` to -1.
## Keeping images together (custom batching)
@ -264,4 +274,4 @@ While the calculation makes sense in how it compensates for inteval and total tr
--ema_strength_target 0.10 ^
If you use `ema_strength_target` the actual calculated `ema_decay_rate` used will be printed in your logs, and you should pay attention to this value and use it to inform your future decisions on EMA tuning.
If you use `ema_strength_target` the actual calculated `ema_decay_rate` used will be printed in your logs, and you should pay attention to this value and use it to inform your future decisions on EMA tuning.
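As a rough orientation only, the conversion could be reasoned about as below, assuming `ema_strength_target` is interpreted as the weight the original parameters retain after all EMA updates; this is a hedged sketch of one plausible calculation, not the trainer's verified formula, so trust the `ema_decay_rate` printed in your logs.

```python
def decay_rate_from_strength_target(strength_target: float,
                                    total_steps: int,
                                    ema_update_interval: int) -> float:
    """If the EMA is applied total_steps / ema_update_interval times, the original
    weights keep a factor of decay ** num_updates; solving
    decay ** num_updates = strength_target gives the rate below.
    (Assumption about what 'strength target' means, for illustration only.)"""
    num_updates = max(1, total_steps // ema_update_interval)
    return strength_target ** (1.0 / num_updates)

# e.g. a 0.10 target over 10,000 steps with an update every 50 steps
print(decay_rate_from_strength_target(0.10, 10_000, 50))  # ~0.9886
```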

View File

@ -4,7 +4,7 @@
`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"`
This script uses two example image/caption pairs located in the `/example` folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the captoin in the same folder.
This script uses two example image/caption pairs located in the `/example` folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the caption in the same folder.
This script currently requires an AMPERE or newer GPU due to using bfloat16.

View File

@ -29,12 +29,13 @@ For each of the `unet` and `text_encoder` sections, you can set the following pr
Standard full precision AdamW optimizer exposed by PyTorch. Not recommended. Slower and uses more memory than adamw8bit. Widely documented on the web.
* adamw8bit
* lion8bit
Tim Dettmers / bitsandbytes AdamW 8bit optimizer. This is the default and recommended setting. Widely documented on the web.
Tim Dettmers / bitsandbytes AdamW and Lion 8bit optimizers. adamw8bit is the default and recommended setting as it is well understood, and lion8bit is very VRAM efficient. Widely documented on the web.
* lion
Lucidrains' [implementation](https://github.com/lucidrains/lion-pytorch) of the [lion optimizer](https://arxiv.org/abs/2302.06675). Click links to read more. `Epsilon` is not used by lion.
Lucidrains' [implementation](https://github.com/lucidrains/lion-pytorch) of the [lion optimizer](https://arxiv.org/abs/2302.06675). Click links to read more. `Epsilon` is not used by lion. You should prefer lion8bit over this optimizer as it is more memory efficient.
Recommended settings for lion based on the paper are as follows:
@ -61,7 +62,13 @@ Available optimizer values for Dadaptation are:
* dadapt_lion, dadapt_adam, dadapt_sgd
These are fairly experimental but tested as working. Gradient checkpointing may be required even on 24GB GPUs. Performance is slower than the compiled and optimized AdamW8bit optimizer unless you increae gradient accumulation as it seems the accumulation steps process slowly with the current implementation of D-Adaption
These are fairly experimental but tested as working. Gradient checkpointing may be required even on 24GB GPUs. Performance is slower than the compiled and optimized AdamW8bit optimizer unless you increase gradient accumulation, as it seems the accumulation steps process slowly with the current implementation of D-Adaptation.
#### Prodigy
Another adaptive optimizer. It is not very VRAM efficient. [Github](https://github.com/konstmish/prodigy), [Paper](https://arxiv.org/pdf/2306.06101.pdf)
* prodigy
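For context, a minimal standalone use of the `prodigyopt` package outside the trainer; `lr=1.0` and the other settings follow the package's general guidance and are assumptions here, not ED2 defaults.

```python
import torch
from prodigyopt import Prodigy

model = torch.nn.Linear(16, 16)  # stand-in model
# Prodigy adapts the step size itself, so lr is typically left at 1.0;
# safeguard_warmup and use_bias_correction follow the package's suggestions
optimizer = Prodigy(model.parameters(), lr=1.0,
                    weight_decay=0.01,
                    use_bias_correction=True,
                    safeguard_warmup=True)

loss = model(torch.randn(4, 16)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```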
## Optimizer parameters

View File

@ -65,3 +65,4 @@ The effect of the limit is that the caption will always be truncated when the ma
exceeded. This process does not consider if the cutoff is in the middle of a tag or even in the middle of a
word if it is translated into several tokens.
To mitigate this token limitation (when not using weighted shuffling), the `--keep_tags n` parameter can be employed. This ensures that the first n tags following the initial chunk remain static, while the remaining tags are shuffled.

View File

@ -16,3 +16,4 @@ speedtest-cli
tensorboard==2.12.0
wandb
safetensors
prodigyopt

View File

@ -276,6 +276,7 @@ class EveryDreamOptimizer():
decouple = True # seems bad to turn off, dadapt_adam only
momentum = 0.0 # dadapt_sgd
no_prox = False # ????, dadapt_adan
use_bias_correction = True # suggested by prodigy github
growth_rate=float("inf") # dadapt various, no idea what a sane default is
if local_optimizer_config is not None:
@ -309,6 +310,7 @@ class EveryDreamOptimizer():
)
elif optimizer_name == "lion8bit":
from bitsandbytes.optim import Lion8bit
opt_class = Lion8bit
optimizer = opt_class(
itertools.chain(parameters),
lr=curr_lr,
@ -316,8 +318,19 @@ class EveryDreamOptimizer():
weight_decay=weight_decay,
percentile_clipping=100,
min_8bit_size=4096,
)
elif optimizer_name == "prodigy":
from prodigyopt import Prodigy
opt_class = Prodigy
safeguard_warmup = True # per recommendation from prodigy documentation
optimizer = opt_class(
itertools.chain(parameters),
lr=curr_lr,
weight_decay=weight_decay,
use_bias_correction=use_bias_correction,
growth_rate=growth_rate,
d0=d0,
log_every=args.log_step,
safeguard_warmup=safeguard_warmup
)
elif optimizer_name == "adamw":
opt_class = torch.optim.AdamW
@ -329,7 +342,7 @@ class EveryDreamOptimizer():
elif optimizer_name == "scalar_dowg":
opt_class = dowg.ScalarDoWG
else:
raise ValueError(f"Unknown DoWG optimizer {optimizer_name}. Available options are coordinate_dowg and scalar_dowg")
raise ValueError(f"Unknown DoWG optimizer {optimizer_name}. Available options are 'coordinate_dowg' and 'scalar_dowg'")
elif optimizer_name in ["dadapt_adam", "dadapt_lion", "dadapt_sgd"]:
import dadaptation

View File

@ -4,6 +4,7 @@ import pathlib
import PIL.Image as Image
from data.image_train_item import ImageCaption, ImageTrainItem
import data.aspects as aspects
DATA_PATH = pathlib.Path('./test/data')
@ -32,4 +33,70 @@ class TestImageCaption(unittest.TestCase):
self.assertEqual(caption.get_caption(), "hello world, one, two, three")
caption = ImageCaption("hello world", 1.0, [], [], 2048, False)
self.assertEqual(caption.get_caption(), "hello world")
self.assertEqual(caption.get_caption(), "hello world")
class TestImageTrainItemConstructor(unittest.TestCase):
def tearDown(self) -> None:
for file in DATA_PATH.glob("img_*"):
file.unlink()
return super().tearDown()
@staticmethod
def image_with_size(width, height):
filename = DATA_PATH / "img_{}x{}.jpg".format(width, height)
Image.new("RGB", (width, height)).save(filename)
caption = ImageCaption("hello world", 1.0, [], [], 2048, False)
return ImageTrainItem(None, caption, aspects.ASPECTS_512, filename, 0.0, 1.0, False, False, 0)
def test_target_size_computation(self):
# Square images
image = self.image_with_size(30, 30)
self.assertEqual(image.target_wh, [512,512])
self.assertTrue(image.is_undersized)
self.assertEqual(image.image_size, (30,30))
image = self.image_with_size(512, 512)
self.assertEqual(image.target_wh, [512,512])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (512,512))
image = self.image_with_size(580, 580)
self.assertEqual(image.target_wh, [512,512])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (580,580))
# Landscape images
image = self.image_with_size(64, 38)
self.assertEqual(image.target_wh, [640,384])
self.assertTrue(image.is_undersized)
self.assertEqual(image.image_size, (64,38))
image = self.image_with_size(640, 384)
self.assertEqual(image.target_wh, [640,384])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (640,384))
image = self.image_with_size(704, 422)
self.assertEqual(image.target_wh, [640,384])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (704,422))
# Portrait images
image = self.image_with_size(38, 64)
self.assertEqual(image.target_wh, [384,640])
self.assertTrue(image.is_undersized)
self.assertEqual(image.image_size, (38,64))
image = self.image_with_size(384, 640)
self.assertEqual(image.target_wh, [384,640])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (384,640))
image = self.image_with_size(422, 704)
self.assertEqual(image.target_wh, [384,640])
self.assertFalse(image.is_undersized)
self.assertEqual(image.image_size, (422,704))

View File

@ -159,7 +159,6 @@ def save_model(save_path, ed_state: EveryDreamTrainingState, global_step: int, s
logging.warning(" No model to save, something likely blew up on startup, not saving")
return
if args.ema_decay_rate != None:
pipeline_ema = StableDiffusionPipeline(
vae=ed_state.vae,
@ -350,6 +349,9 @@ def setup_args(args):
if not args.shuffle_tags:
args.shuffle_tags = False
if not args.keep_tags:
args.keep_tags = 0
args.clip_skip = max(min(4, args.clip_skip), 0)
if args.useadam8bit:
@ -779,6 +781,7 @@ def main(args):
tokenizer=tokenizer,
seed = seed,
shuffle_tags=args.shuffle_tags,
keep_tags=args.keep_tags,
rated_dataset=args.rated_dataset,
rated_dataset_dropout_target=(1.0 - (args.rated_dataset_target_dropout_percent / 100.0))
)
@ -1208,15 +1211,15 @@ def main(args):
last_epoch_saved_time = time.time()
logging.info(f"Saving model, {args.ckpt_every_n_minutes} mins at step {global_step}")
needs_save = True
if epoch > 0 and epoch % args.save_every_n_epochs == 0 and step == 0 and epoch < args.max_epochs - 1 and epoch >= args.save_ckpts_from_n_epochs:
if epoch > 0 and epoch % args.save_every_n_epochs == 0 and step == 0 and epoch < args.max_epochs and epoch >= args.save_ckpts_from_n_epochs:
logging.info(f" Saving model, {args.save_every_n_epochs} epochs at step {global_step}")
needs_save = True
if needs_save:
save_path = make_save_path(epoch, global_step)
save_model(save_path, global_step=global_step, ed_state=make_current_ed_state(),
save_ckpt_dir=None, yaml_name=None,
save_ckpt_dir=args.save_ckpt_dir, yaml_name=None,
save_full_precision=args.save_full_precision,
save_optimizer_flag=args.save_optimizer, save_ckpt=False)
save_optimizer_flag=args.save_optimizer, save_ckpt=not args.no_save_ckpt)
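For clarity, the updated periodic-save condition above can be read as a small predicate; `should_save` below is an illustrative standalone sketch, not a function in the trainer.

```python
def should_save(epoch, step, save_every_n_epochs, max_epochs, save_ckpts_from_n_epochs):
    """Mirrors the updated periodic-save condition in the hunk above."""
    return (epoch > 0
            and epoch % save_every_n_epochs == 0
            and step == 0
            and epoch < max_epochs
            and epoch >= save_ckpts_from_n_epochs)

# e.g. with save_every_n_epochs=2 and max_epochs=10, saves fire at the start of epochs 2, 4, 6, 8
print([e for e in range(10) if should_save(e, 0, 2, 10, 0)])
```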
plugin_runner.run_on_step_end(epoch=epoch,
global_step=global_step,
@ -1335,6 +1338,7 @@ if __name__ == "__main__":
argparser.add_argument("--save_optimizer", action="store_true", default=False, help="saves optimizer state with ckpt, useful for resuming training later")
argparser.add_argument("--seed", type=int, default=555, help="seed used for samples and shuffling, use -1 for random")
argparser.add_argument("--shuffle_tags", action="store_true", default=False, help="randomly shuffles CSV tags in captions, for booru datasets")
argparser.add_argument("--keep_tags", type=int, default=0, help="Number of tags to keep when shuffle, def: 0 (shuffle all)")
argparser.add_argument("--useadam8bit", action="store_true", default=False, help="deprecated, use --optimizer_config and optimizer.json instead")
argparser.add_argument("--wandb", action="store_true", default=False, help="enable wandb logging instead of tensorboard, requires env var WANDB_API_KEY")
argparser.add_argument("--validation_config", default=None, help="Path to a JSON configuration file for the validator. Default is no validation.")

View File

@ -23,6 +23,7 @@ pip install compel~=1.1.3
pip install dadaptation
pip install safetensors
pip install open-flamingo==2.0.0
pip install prodigyopt
python utils/get_yamls.py
GOTO :eof