bunch of updates, grad ckpting, no drop bucket, shuffle every epoch

This commit is contained in:
Victor Hall 2023-01-01 10:45:18 -05:00
parent 46f84fc791
commit b316684bdb
7 changed files with 264 additions and 117 deletions

View File

@ -22,6 +22,10 @@ from data.image_train_item import ImageTrainItem
import data.aspects as aspects
from colorama import Fore, Style
import zipfile
import tqdm
import PIL
PIL.Image.MAX_IMAGE_PIXELS = 715827880*4 # increase decompression bomb error limit to 4x default
class DataLoaderMultiAspect():
"""
@ -36,6 +40,9 @@ class DataLoaderMultiAspect():
self.debug_level = debug_level
self.flip_p = flip_p
self.log_folder = log_folder
self.seed = seed
self.batch_size = batch_size
self.runts = []
self.aspects = aspects.get_aspect_buckets(resolution=resolution, square_only=False)
logging.info(f"* DLMA resolution {resolution}, buckets: {self.aspects}")
@ -45,11 +52,16 @@ class DataLoaderMultiAspect():
self.__recurse_data_root(self=self, recurse_root=data_root)
random.Random(seed).shuffle(self.image_paths)
prepared_train_data = self.__prescan_images(self.image_paths, flip_p) # ImageTrainItem[]
self.image_caption_pairs = self.__bucketize_images(prepared_train_data, batch_size=batch_size, debug_level=debug_level)
self.prepared_train_data = self.__prescan_images(self.image_paths, flip_p) # ImageTrainItem[]
self.image_caption_pairs = self.__bucketize_images(self.prepared_train_data, batch_size=batch_size, debug_level=debug_level)
def shuffle(self):
self.runts = []
self.seed = self.seed + 1
random.Random(self.seed).shuffle(self.prepared_train_data)
self.image_caption_pairs = self.__bucketize_images(self.prepared_train_data, batch_size=self.batch_size, debug_level=0)
def unzip_all(self, path):
#recursively unzip all files in path
try:
for root, dirs, files in os.walk(path):
for file in files:
@ -81,7 +93,7 @@ class DataLoaderMultiAspect():
"""
decorated_image_train_items = []
for pathname in image_paths:
for pathname in tqdm.tqdm(image_paths):
caption_from_filename = os.path.splitext(os.path.basename(pathname))[0].split("_")[0]
txt_file_path = os.path.splitext(pathname)[0] + ".txt"
@ -119,27 +131,27 @@ class DataLoaderMultiAspect():
buckets = {}
for image_caption_pair in prepared_train_data:
image_caption_pair.runt_size = 0
target_wh = image_caption_pair.target_wh
if (target_wh[0],target_wh[1]) not in buckets:
buckets[(target_wh[0],target_wh[1])] = []
buckets[(target_wh[0],target_wh[1])].append(image_caption_pair)
logging.info(f" ** Number of buckets used: {len(buckets)}")
buckets[(target_wh[0],target_wh[1])].append(image_caption_pair)
if len(buckets) > 1:
for bucket in buckets:
truncate_count = len(buckets[bucket]) % batch_size
if truncate_count > 0:
with open(os.path.join(self.log_folder, "bucket_drops.txt"), "a") as f:
f.write(f"{bucket} {truncate_count} dropped files:\n")
for item in buckets[bucket][-truncate_count:]:
f.write(f"- {item.pathname}\n")
current_bucket_size = len(buckets[bucket])
buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
runt_bucket = buckets[bucket][-truncate_count:]
for item in runt_bucket:
item.runt_size = truncate_count
while len(runt_bucket) < batch_size:
runt_bucket.append(random.choice(runt_bucket))
if debug_level > 0:
logging.warning(f" ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}")
current_bucket_size = len(buckets[bucket])
buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
buckets[bucket].extend(runt_bucket)
# flatten the buckets
image_caption_pairs = []

View File

@ -24,6 +24,7 @@ import random
from torchvision import transforms
from transformers import CLIPTokenizer
import torch.nn.functional as F
import numpy
class EveryDreamBatch(Dataset):
"""
@ -47,6 +48,8 @@ class EveryDreamBatch(Dataset):
seed=555,
tokenizer=None,
log_folder=None,
retain_contrast=False,
write_schedule=False,
):
self.data_root = data_root
self.batch_size = batch_size
@ -58,6 +61,8 @@ class EveryDreamBatch(Dataset):
self.log_folder = log_folder
#print(f"tokenizer: {tokenizer}")
self.max_token_length = self.tokenizer.model_max_length
self.retain_contrast = retain_contrast
self.write_schedule = write_schedule
if seed == -1:
seed = random.randint(0, 99999)
@ -79,14 +84,28 @@ class EveryDreamBatch(Dataset):
self._length = self.num_images
self.image_transforms = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5]),
]
)
logging.info(f" ** Trainer Set: {self._length / batch_size:.0f}, num_images: {self.num_images}, batch_size: {self.batch_size}")
if self.write_schedule:
self.write_batch_schedule(0)
def write_batch_schedule(self, epoch_n):
with open(f"{self.log_folder}/ep{epoch_n}_batch_schedule.txt", "w") as f:
for i in range(len(self.image_train_items)):
f.write(f"step:{int(i / self.batch_size)}, wh:{self.image_train_items[i].target_wh}, r:{self.image_train_items[i].runt_size}, path:{self.image_train_items[i].pathname}\n")
#exit()
def get_runts():
return dls.shared_dataloader.runts
def shuffle(self, epoch_n):
if dls.shared_dataloader:
dls.shared_dataloader.shuffle()
self.image_train_items = dls.shared_dataloader.get_all_images()
else:
raise Exception("No dataloader singleton to shuffle")
if self.write_schedule:
self.write_batch_schedule(epoch_n)
def __len__(self):
return self._length
@ -95,30 +114,48 @@ class EveryDreamBatch(Dataset):
example = {}
train_item = self.__get_image_for_trainer(self.image_train_items[i], self.debug_level)
example["image"] = self.image_transforms(train_item["image"])
#if random.random() > 9999:
example["tokens"] = self.tokenizer(train_item["caption"],
truncation=True,
padding="max_length",
max_length=self.tokenizer.model_max_length,
).input_ids
if self.retain_contrast:
std_dev = 1.0
mean = 0.0
else:
std_dev = 0.5
mean = 0.5
image_transforms = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize([mean], [std_dev]),
]
)
example["image"] = image_transforms(train_item["image"])
if random.random() > self.conditional_dropout:
example["tokens"] = self.tokenizer(train_item["caption"],
truncation=True,
padding="max_length",
max_length=self.tokenizer.model_max_length,
).input_ids
else:
example["tokens"] = self.tokenizer(" ",
truncation=True,
padding="max_length",
max_length=self.tokenizer.model_max_length,
).input_ids
example["tokens"] = torch.tensor(example["tokens"])
example["caption"] = train_item["caption"] # for sampling if needed
example["runt_size"] = train_item["runt_size"]
return example
def __get_image_for_trainer(self, image_train_item: ImageTrainItem, debug_level=0):
example = {}
save = debug_level > 2
image_train_tmp = image_train_item.hydrate(crop=False, save=save, crop_jitter=self.crop_jitter)
example["image"] = image_train_tmp.image
if random.random() > self.conditional_dropout:
example["caption"] = image_train_tmp.caption
else:
example["caption"] = " "
example["caption"] = image_train_tmp.caption
example["runt_size"] = image_train_tmp.runt_size
return example

View File

@ -37,6 +37,7 @@ class ImageTrainItem():
self.pathname = pathname
self.flip = transforms.RandomHorizontalFlip(p=flip_p)
self.cropped_img = None
self.runt_size = 0
if image is None:
self.image = []

View File

@ -1,5 +1,9 @@
# Advanced Tweaking
This document is a bit more geared to experienced users who have trained several models. It is not required reading for new users.
Start with the [Low VRAM guide](TWEAKING.md) if you are having trouble training on a 12GB card.
## Resolution
You can train at resolutions from 512 to 1024 in 64 pixel increments. General results from the community indicate you can push the base model a bit beyond what it was designed for *with enough training*. This works out better when you have a lot of training data (hundreds+) and enables slightly higher resolution at inference time without seeing repeats in your generated images. This does cost training speed and increases VRAM use! Ex. 768 takes significantly more VRAM than 512, so you will need to compensate by reducing ```batch_size```.
@ -10,6 +14,10 @@ For instance, if training from the base 1.5 model, you can try training at 576, 64
If you are training on a base model that is 768, such as "SD 2.1 768-v", you should also probably use 768 as a base number and adjust from there.
Some results from the community seem to indicate training at a higher resolution on SD1.x models may increase how fast the model learns, and it may be a good idea to slightly reduce your learning rate as you increase resolution. My suspicion is that the higher resolutions increase the gradients as more information is presented to the model per image.
You may need to experiment with LR as you increase resolution. I don't have a perfect rule of thumb here, but I might suggest that if you train SD1.5 (a 512 model) at resolution 768, you reduce your LR by about half. ED2 tends to prefer ~2e-6 to ~5e-6 for normal 512 training on SD1.X models around batch 6-8, so if you train SD1.X at 768 consider 1e-6 to 2.5e-6 instead.
## Log and ckpt save folders
If you want to use a nondefault location for saving logs or ckpt files, use these:
@ -24,46 +32,46 @@ By default the CKPT format copies of ckpts that are periodically saved are saved
--ckpt_dir "r:\webui\models\stable-diffusion"
This is useful if you want to dump the CKPT files directly to your webui/inference program model folder.
## Clip skip
Aka "penultimate layer", this takes the output from the text encoder not from its last output layer, but from an earlier layer.
--clip_skip 2 ^
A value of "2" is the canonical form of "penultimate layer" used by various webuis, but 1 and 3 are accepted as well if you wish to experiment. Default is "0" which takes the "last hidden layer" or standard output of the text encoder as Stable Diffusion was originally designed. Training with this setting may necessitate using the same setting in your webui/inference program, or at least work better with it.
This is useful if you want to dump the CKPT files directly to your webui/inference program model folder so you don't have to manually cut and paste it over.
## Conditional dropout
Conditional dropout means the prompt or caption on the training image is dropped, and the caption is "blank". The theory is this can help with unconditional guidance, per the original paper and authors of Latent Diffusion and Stable Diffusion.
The value is defaulted at 0.04, which means 4% conditional dropout. You can set it to 0.0 to disable it, or increase it. Many users of EveryDream 1.0 have had great success tweaking this, especially for larger models. You may wish to try 0.10. This may also be useful to really "force" a style. Setting it very high may lead to bleeding or overfitting.
The value is defaulted at 0.04, which means 4% conditional dropout. You can set it to 0.0 to disable it, or increase it. Many users of EveryDream 1.0 have had great success tweaking this, especially for larger models. You may wish to try 0.10. This may also be useful to really "force" a style into the model with a high setting such as 0.15. However, setting it very high may lead to bleeding or overfitting to your training data, especially if your data is not very diverse, which may or may not be desirable for your project.
--cond_dropout 0.1 ^
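Mechanically this is very simple; here is a minimal sketch of the per-example decision (the function name and values are just illustrative, this is not the trainer's exact code):

```python
import random

def maybe_drop_caption(caption: str, cond_dropout: float = 0.04) -> str:
    # With probability cond_dropout, train this example against a blank caption
    # so the model also learns unconditional (promptless) denoising.
    return caption if random.random() > cond_dropout else " "

print(maybe_drop_caption("a photo of a red bicycle", cond_dropout=0.1))
```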
## LR tweaking
Learning rate adjustment is a very important part of training. You can use the default settings, or you can tweak it. Currently the default is 3e-6, which is higher than EveryDream1's default of 1e-6, based on the results of testing and the ability to use larger batch sizes. You should consider increasing this further if you increase your batch size further (10+) using [gradient checkpointing](#gradient_checkpointing).
--lr 3e-6 ^
By default, the learning rate is constant for the entire training session. However, if you want it to change by itself during training, you can use cosine.
### Cosine LR scheduler
Cosine LR scheduler will "taper off" your learning rate over time. It will reach a peak value of your ```--lr``` value then taper off following a cosine curve.
Cosine LR scheduler will "taper off" your learning rate over time. It will peak at your ```--lr``` value, then taper off following a cosine curve. In other words, it allows you to set a high initial learning rate which lowers as training progresses. This *may* help speed up training without overfitting. If you wish to use this, I would set the initial [learning rate](#lr-tweaking) slightly higher, maybe 25-50% higher than you would use with a normal constant LR schedule.
Example:
--lr_scheduler cosine ^
There is also warmup, which will default to 2% of the decay steps. You can manually set warmup, but it is typically more useful when training a brand new model from scratch, not for continuation training, which is what we're all doing. But, if you want to tweak manually anyway, use this:
*I don't recommend trying to set the warmup and decay steps if you are using cosine, but they're here if you want them.*
There is also warmup with the cosine scheduler, which will default to 2% of the decay steps. You can manually set warmup, but it is typically more useful when training a brand new model from scratch, not for continuation training, which is what we're all doing, thus the very short 2% default value.
--lr_warmup_steps 100 ^
Cosine also has a decay period to define how long it takes to get to zero LR as it tapers. By default, the trainer sets this to slightly longer than it will take to get to your ```--max_epochs``` number of steps so LR doesn't go all the way to zero and waste compute time. However, if you want to tweak, you have to set the number of steps yourself and estimate what that will be. If you set this, be sure to watch your LR log in tensorboard to make sure it does what you expect.
Cosine scheduler also has a "decay period" to define how long it takes to get to zero LR as it tapers. By default, the trainer sets this to slightly longer than it will take to get to your ```--max_epochs``` number of steps, so LR doesn't go all the way to zero and waste compute time near the end of training. However, if you want to tweak, you have to set the number of steps yourself and estimate what that will be based on your max_epochs, batch_size, and number of training images. If you set this, be sure to watch your LR log in tensorboard to make sure it does what you expect.
--lr_decay_steps 2500 ^
If decay steps is too low, your LR will bottom out to zero, then start rising again, following a cosine waveform, which is probably a dumb idea. If it is way too high, it will just never taper off and you might as well use constant LR scheduler instead.
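If you want to see the shape of the schedule before spending GPU time on it, here is a small self-contained sketch using the same ```get_scheduler``` helper from diffusers that the trainer uses (the step counts below are made up for illustration):

```python
import torch
from diffusers.optimization import get_scheduler

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=3e-6)

decay_steps = 2500
warmup_steps = int(0.02 * decay_steps)  # the 2% default warmup described above

lr_scheduler = get_scheduler("cosine", optimizer=optimizer,
                             num_warmup_steps=warmup_steps,
                             num_training_steps=decay_steps)

for step in range(decay_steps):
    optimizer.step()
    lr_scheduler.step()
    if step % 500 == 0:
        print(step, lr_scheduler.get_last_lr()[0])  # the same curve you would see in the tensorboard LR log
```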
## Gradient accumulation
Gradient accumulation is sort of like a virtual batch size increase, averaging the learning over more than one step (batch) before applying it to the model as an update to weights.
Gradient accumulation is sort of like a virtual batch size increase, averaging the learning over more than one step (batch of images) before applying it to the model as an update to weights.
Example:
@ -71,8 +79,21 @@ Example:
The above example will combine the loss from 2 batches before applying updates. This *may* be a good idea for higher resolution training that requires a smaller batch size, but mega batch sizes are also not the be-all end-all.
Some experimentation shows if you already have batch size in the 6-8 range then grad accumulation of more than 2 just reduces quality, but you can experiment.
Some experimentation shows if you already have batch size in the 6-8 range then grad accumulation of more than 1 just reduces quality, but you can experiment.
*There is some VRAM overhead to set grad_accum > 1*, about equal to increasing batch size by 1, but continuing to increase grad_accum to 3+ does not continue to increase VRAM use, while increasing batch size does. This can still be useful if you are trying to train higher resolutions with a smaller batch size and gain the benefit of larger batch sizes in terms of generalization. You will have to decide if this is worth using. Currently it will not work on 12GB GPUs due to VRAM limitations.
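For the curious, gradient accumulation is conceptually just the following (a toy, self-contained loop on random data, not the trainer's actual code):

```python
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-6)
batch_size, grad_accum = 2, 2
data = [(torch.randn(batch_size, 4), torch.randn(batch_size, 1)) for _ in range(8)]

optimizer.zero_grad()
for i, (x, y) in enumerate(data):
    loss = torch.nn.functional.mse_loss(model(x), y) / grad_accum  # average over the accumulated batches
    loss.backward()                                                # gradients add up in .grad between updates
    if (i + 1) % grad_accum == 0:
        optimizer.step()        # one weight update per grad_accum batches ("virtual" batch of 4 here)
        optimizer.zero_grad()
```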
## Gradient checkpointing
While traditionally used to reduce VRAM for smaller GPUs, gradient checkpointing can offer a higher batch size and/or higher resolution within whatever VRAM you have, so it may be useful even on a 24GB+ GPU.
--gradient_checkpointing ^
This drastically reduces VRAM (by many GB) and will allow a considerably larger batch size or resolution, for example, 13-14 instead of 7-8 on a 24GB card using 512 training resolution.
While gradient checkpointing reduces performance, the ability to run a higher batch size brings performance back fairly close to without it. My personal tests show a 25% performance hit simply turning on gradient checkpointing on a 3090 (batch 7, 512), but almost all of that is made up by the ability to use a larger batch size (up to 14). You may NOT want to use a batch size as large as 13-14, or you may find you need to tweak learning rate all over again to find the right balance.
This probably IS a good idea for training at higher resolutions. Balancing this toggle, resolution, batch_size, and grad_accum will take some experimentation, but you might try using this with 768+ resolutions, grad_accum 3-4, and then as high of a batch size as you can get to work without crashing, while adjusting LR with respect to your (batch_size * grad_accum) value.
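Under the hood the flag simply turns checkpointing on for both models being trained. A rough sketch of the equivalent calls (the model path is just an example; point it at your own diffusers-format folder):

```python
from diffusers import UNet2DConditionModel
from transformers import CLIPTextModel

# Example path only -- substitute the diffusers folder you pass to --resume_ckpt.
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")

unet.enable_gradient_checkpointing()          # diffusers UNet: recompute activations during the backward pass
text_encoder.gradient_checkpointing_enable()  # the transformers text encoder uses a different method name
```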
## Flip_p
@ -80,26 +101,52 @@ If you wish for your training images to be randomly flipped horizontally, use th
--flip_p 0.5 ^
This is useful for styles or other training that is not symmetrical. It is not suggested for training specific human faces as it may wash out facial features. It is also not suggested if any of your captions included directions like "left" or "right". Default is 0.0 (no flipping)
This is useful for styles or other training where left/right orientation does not matter. It is not suggested for training specific human faces, as it may wash out facial features; real people typically have at least some asymmetric facial features. It may also cause problems if you are training fictional characters with asymmetrical outfits, such as washing out the asymmetries in the outfit. It is also not suggested if any of your captions include directions like "left" or "right". Default is 0.0 (no flipping)
# Stuff you probably don't need to mess with
# Stuff you probably don't need to mess with, but here it is anyway:
## Clip skip
Aka "penultimate layer", this takes the output from the text encoder not from its last output layer, but from an earlier layer.
--clip_skip 2 ^
A value of "2" is the canonical form of "penultimate layer" used by various webuis, but 2 to 4 are accepted as well if you wish to experiment. Default is "0" which takes the "last hidden layer" or standard output of the text encoder as Stable Diffusion 1.X was originally designed. Training with this setting may necessitate using the same setting in your webui/inference program, or at least work better with it.
I would consider this a very "experimental" setting.
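If you are curious what "layers before" means concretely, here is a small illustrative sketch using the SD1.x text encoder directly (this downloads the OpenAI CLIP weights and is not the trainer's exact implementation):

```python
from transformers import CLIPTokenizer, CLIPTextModel

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

ids = tokenizer("a photo of a cat", return_tensors="pt").input_ids
out = text_encoder(ids, output_hidden_states=True)

clip_skip = 2
hidden = out.hidden_states[-clip_skip]  # -1 is the last hidden layer (default), -2 is the "penultimate layer"
print(hidden.shape)
```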
## log_step
Change how often log items are written. Default is 25 and probably good for most situations. This does not affect how often samples or ckpts are saved, just log scalar items.
Change how often log items are written. Default is 25 and probably good for most situations. This does not affect how often samples or ckpts are saved, just how often log scalar items are posted to Tensorboard.
--log_step 50 ^
## scale_lr
Here, the log step is set to a less frequent value of 50. Logging has virtually no impact on performance, and there is usually no reason to change this.
Attempts to automatically scale your learning rate up or down based on changes to batch size and gradient accumulation.
## Scale learning rate
Attempts to automatically scale your learning rate up or down based on changes to batch size and gradient accumulation number.
--scale_lr ^
This multiplies your ```--lr``` setting by ```sqrt of (batch_size times grad_accum)```. This can be useful if you're tweaking batch size and grad accum and want to keep your LR to a sane value.
This multiplies your ```--lr``` setting by ```(batch_size times grad_accum)^0.55```. This can be useful if you're tweaking batch size and grad accum a lot and want to keep your LR to a sane value.
The value ```0.55``` was derived from the original authors of Stable Diffusion using an LR of 1e-4 for a batch size of 2048 with gradient accumulation 2 (effectively 4096), compared to the original Xavier Xiao dreambooth (and forks) commonly using 1e-6 with batch size 1 or 2. Keep in mind this always *increases* your set ```--lr``` value, so it is suggested to use a lower value for ```--lr``` and let this scale it up, such as ```--lr 2e-6```. The actual LR used is recorded in your log file and tensorboard, and you should pay attention to the logged value as you tweak your batch size and gradient accumulation numbers.
This is mostly useful for batch size and grad accum tweaking, not for LR tweaking. Again, watch what actual LR is used to inform your future decisions on LR tweaking.
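A quick worked example of the math, just to show the magnitude of the scaling:

```python
lr, batch_size, grad_accum = 2e-6, 8, 2
scaled_lr = lr * (batch_size * grad_accum) ** 0.55
print(f"{scaled_lr:.2e}")  # ~9.2e-06 -- always check the logged LR to see what was actually used
```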
## Write batch schedule
If you are interested in seeing exactly how the images are loaded into batches (i.e. steps), their resolution bucket, and how they are shuffled between epochs, you can use ```--write_schedule``` to output the schedule for every epoch to a file in your log folder. Keep in mind these files can be large if you are training on a large dataset. It's not recommended for regular use; it's more of an informational tool for those curious about the inner workings of the trainer.
--write_schedule ^
The files will be in ```logs/[your project folder]/ep[N]_batch_schedule.txt``` and created every epoch. ex ```ep9_batch_schedule.txt```
## clip_grad_norm
Clips the gradient norm to a maximum value. This is an experimental feature; you can read online about gradient clipping. Default is None (no clipping). This is typically used for gradient explosion problems, but might be a fun thing to experiment with.
Clips the gradient norm to a maximum value. This is an experimental feature; you can read online about gradient clipping. Default is None (no clipping). This is typically used for gradient explosion problems, which are not an issue with EveryDream, but might be a fun thing to experiment with?
--clip_grad_norm 1.0 ^
--clip_grad_norm 1.0 ^
This may drastically reduce training speed or have other undesirable effects. My brief toying was mostly unsuccessful. I would not recommend using this unless you know what you're doing or are bored, but you might discover something cool or interesting.
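For reference, the whole mechanism is one call; the trainer applies ```torch.nn.utils.clip_grad_norm_``` to the unet parameters, which is roughly:

```python
import torch

model = torch.nn.Linear(4, 1)                       # stand-in for the unet
loss = model(torch.randn(2, 4)).pow(2).mean()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # rescales grads if their total norm exceeds 1.0
```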

View File

@ -1,5 +1,9 @@
# Tweaking settings
This document should be read by all users who are trying to get the best results out of EveryDream 2.0. These are the key settings you'll need to understand to get started.
## Logging
Make sure you pay attention to your logs and sample images. Launch tensorboard in a second command line. See [logging](doc/LOGGING.md) for more info.
tensorboard --logdir logs
@ -14,13 +18,19 @@ You may wish to consider adding "sd1" or "sd2v" or similar to remember what the
## Epochs
EveryDream 2.0 has done away with repeats and instead you should set your max_epochs. Changing epochs has the same effect as changing repeats. For example, if you had 50 repeats and 5 epochs, you would now set max_epochs to 250. This is a bit more intuitive as there is no more double meaning for epochs and repeats.
EveryDream 2.0 has done away with repeats and instead you should set your max_epochs. Changing epochs has the same effect as changing repeats in DreamBooth or EveryDream1. For example, if you had 50 repeats and 5 epochs, you would now set max_epochs to 250 (50x5=250). This is a bit more intuitive as there is no more double meaning for epochs and repeats.
--max_epochs 250 ^
## Save interval
This is like your "amount" of training.
While EveryDream 1.0 saved a checkpoint every epoch, this is no longer the case as it would produce too many files when repeats are removed. To balance both people training large and small datasets, you can now set the interval at which checkpoints are saved. The default is 30 minutes, but you can change it to whatever you want.
With more training data for your subjects and concepts, you can slowly scale this value down. More example images mean an epoch is longer, and more training is done simply by the fact there is more training data.
With less training data, this value should be higher, because more repetition on the images is needed to learn.
## Save interval for checkpoints
While EveryDream 1.0 saved a checkpoint every epoch, this is no longer the case as it would produce too many files as "repeats" are removed in favor of just using epochs instead. To balance the fact EveryDream users are sometimes training small datasets and sometimes huge datasets, you can now set the interval at which checkpoints are saved. The default is 30 minutes, but you can change it to whatever you want.
For instance, if you are working on a very large dataset of thousands of images and lots of different concepts and know it will run for a few hours, you may want to save every hour instead, so you would set it to 60.
@ -30,26 +40,34 @@ Every save interval, a full ckpt in Diffusers format is saved from which you can
Additionally, these are saved at the end of training.
If you wish instead to save every certain number of epochs, you can set the minutes interval 0 and use save_every_n_epochs instead. This is not recommended for large datasets as it will produce a lot of files.
If you wish instead to save every certain number of epochs, use save_every_n_epochs instead.
--ckpt_every_n_minutes 0 ^
--save_every_n_epochs 25 ^
If you are training a huge dataset (20k+) then saving every 1 epoch may not be very often, so consider using ckpt_every_n_minutes as mentioned above instead.
*A "last" checkpoint is always saved at the end of training.*
Diffusers copies of checkpoints are saved in your /logs/[project_name]/ckpts folder, and can be used to continue training if you want to pick up where you left off. CKPT files are saved in the root training folder by default. These folders can be changed. See [Advanced Tweaking](doc/ATWEAKING.md) for more info.
## Resuming training from previous runs
If you want to resume training from a previous run, you can do so by pointing to the diffusers copy in the logs folder from which you want to resume. This is the same --resume_ckpt argument you would use to start training, just pointing to a different location.
--resume_ckpt "logs\city_gradckptng2_20221231-234604\ckpts\last-city_gradckptng2-ep59-gs00600" ^
## Learning Rate
The learning rate affects how much "training" is done on the model. It is a very careful balance to select a value that will learn your data, but not overfit it. If you set the LR too high, the model will "fry" or could "overtrain" and become too rigid, only learning to exactly mimic your training data images, and will not be able to generalize to new data or be "stylable". If you set the LR too low, you may take longer to train, or it may have difficulty learning the concepts at all. Usually sane values are 1e-6 to 3e-6.
The learning rate affects how much "training" is done on the model per training step. It is a very careful balance to select a value that will learn your data. See [Advanced Tweaking](doc/ATWEAKING.md) for more info. Once you have started, the learning rate is a good first knob to turn as you move into more advanced tweaking.
## Batch Size
Batch size is also another "hyperparameter" in itself and there are tradeoffs. It may not always be best to use the highest batch size possible.
While very small batch sizes can impact performance negatively, at some point larger sizes have little impact on overall speed.
Larger batch size may also impact what learning rate you use. Often a suggestion is to multiply your LR by the sqrt of batch size. For example, if you change from batch size 2 to 6, you may consider increasing your LR by sqrt(6/2) or about 1.5x. This is not a hard rule, but it may help you find a good LR.
Batch size is also another "hyperparameter" in itself and there are tradeoffs. It may not always be best to use the highest batch size possible. One of the primary reasons to change it is if you get "CUDA out of memory" errors, where lowering the value may help.
--batch_size 4 ^
While very small batch sizes can impact performance negatively, at some point larger sizes have little impact on overall speed as well, so shooting for the moon is not always advisable. Changing batch size may also impact what learning rate you use, with typically larger batch_size requiring a slightly higher learning rate. More info is provided in the [Advanced Tweaking](doc/ATWEAKING.md) document.
## LR Scheduler
A learning rate scheduler can change your learning rate as training progresses.
@ -60,12 +78,14 @@ The constant scheduler is the default and keeps your LR set to the value you set
## AdamW vs AdamW 8bit
The AdamW optimizer is the default and what was used by EveryDream 1.0. It's a good optimizer for stable diffusion and appears to be what was used to train SD itself.
The AdamW optimizer is the default and what was used by EveryDream 1.0. It's a good optimizer for Stable Diffusion and appears to be what was used to train SD itself.
AdamW 8bit is quite a bit faster and uses less VRAM. I currently **recommend** using it for most cases as it seems worth a potential reduction in quality for a significant speed boost and lower VRAM cost.
AdamW 8bit is quite a bit faster and uses less VRAM while still having the same basic behavior. I currently **recommend** using it for most cases as it seems worth a potential slight reduction in quality for a *significant speed boost and lower VRAM cost*.
--useadam8bit ^
This may become a default in the future, and replaced with an option to use standard AdamW instead. For now, it's an option, *but I recommend always using it.*
## Sampling
You can set your own sample prompts by adding them, one line at a time, to sample_prompts.txt. Or you can point to another file with --sample_prompts.
@ -74,4 +94,8 @@ You can set your own sample prompts by adding them, one line at a time, to sampl
Keep in mind a longer list of prompts will take longer to generate. You may also want to adjust sample_steps to a different value to get samples less often. This is probably a good idea when training a larger dataset that you know will take longer to train, where more frequent samples will not help you.
--sample_steps 500 ^
Sample steps declares how often samples are generated and put into the logs and Tensorboard.
--sample_steps 300 ^
Keep in mind if you drastically change your batch_size, the frequency (in time between samples) of samples will change. Going from batch size 2 to batch size 10 may reduce how fast steps process, so you may want to reduce sample_steps to compensate.
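A quick back-of-the-envelope illustration (simple arithmetic with made-up numbers):

```python
sample_steps = 300
for batch_size in (2, 10):
    images_between_samples = sample_steps * batch_size
    print(f"batch_size {batch_size}: ~{images_between_samples} images processed between sample generations")
```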

train.py
View File

@ -52,7 +52,6 @@ from utils.convert_diffusers_to_stable_diffusion import convert as converter
from utils.gpu import GPU
_GRAD_ACCUM_STEPS = 1 # future use...
_SIGTERM_EXIT_CODE = 130
_VERY_LARGE_NUMBER = 1e9
@ -171,19 +170,40 @@ def append_epoch_log(global_step: int, epoch_pbar, gpu, log_writer, **logs):
if logs is not None:
epoch_pbar.set_postfix(**logs, vram=f"{epoch_mem_color}{gpu_used_mem}/{gpu_total_mem} MB{Style.RESET_ALL} gs:{global_step}")
def set_args_12gb(args):
logging.info(" Setting args to 12GB mode")
if not args.gradient_checkpointing:
logging.info(" Overriding gradient checkpointing")
args.gradient_checkpointing = True
if args.batch_size != 1:
logging.info(" Overriding batch size to 1")
args.batch_size = 1
if args.grad_accum != 1:
logging.info(" Overriding grad accum to 1")
args.grad_accum = 1
if args.resolution != 512:
logging.info(" Overriding resolution to 512")
args.resolution = 512
if not args.useadam8bit:
logging.info(" Overriding adam8bit to True")
args.useadam8bit = True
def main(args):
"""
Main entry point
"""
log_time = setup_local_logger(args)
#notebook = is_notebook()
if args.lowvram:
set_args_12gb(args)
seed = args.seed if args.seed != -1 else random.randint(0, 2**30)
set_seed(seed)
gpu = GPU()
device = torch.device(f"cuda:{args.gpuid}")
torch.backends.cudnn.benchmark = False
args.clip_skip = max(min(3, args.clip_skip), 0)
args.clip_skip = max(min(4, args.clip_skip), 0)
if args.ckpt_every_n_minutes is None and args.save_every_n_epochs is None:
logging.info(f"{Fore.LIGHTCYAN_EX} No checkpoint saving specified, defaulting to every 20 minutes.{Style.RESET_ALL}")
@ -209,7 +229,7 @@ def main(args):
if args.scale_lr is not None and args.scale_lr:
tmp_lr = args.lr
args.lr = args.lr * (total_batch_size**0.5)
args.lr = args.lr * (total_batch_size**0.55)
logging.info(f"{Fore.CYAN} * Scaling learning rate {tmp_lr} by {total_batch_size**0.55}, new value: {args.lr}{Style.RESET_ALL}")
log_folder = os.path.join(args.logdir, f"{args.project_name}_{log_time}")
@ -368,19 +388,21 @@ def main(args):
except:
logging.error(" * Failed to load checkpoint *")
if args.gradient_checkpointing:
unet.enable_gradient_checkpointing()
text_encoder.gradient_checkpointing_enable()
if is_xformers_available():
try:
#pass
unet.enable_xformers_memory_efficient_attention()
#unet.set_attention_slice(4)
#logging.info(" Enabled memory efficient attention")
print(" Enabled xformers")
except Exception as e:
logging.warning(
"Could not enable memory efficient attention. Make sure xformers is installed"
f" correctly and a GPU is available: {e}"
)
default_lr = 2e-6
default_lr = 3e-6
curr_lr = args.lr if args.lr is not None else default_lr
vae = vae.to(device, dtype=torch.float32 if not args.amp else torch.float16)
@ -419,8 +441,6 @@ def main(args):
amsgrad=False,
)
#log_optimizer(optimizer, betas, epsilon)
train_batch = EveryDreamBatch(
data_root=args.data_root,
flip_p=args.flip_p,
@ -431,6 +451,7 @@ def main(args):
tokenizer=tokenizer,
seed = seed,
log_folder=log_folder,
write_schedule=args.write_schedule,
)
torch.cuda.benchmark = False
@ -445,8 +466,8 @@ def main(args):
lr_scheduler = get_scheduler(
args.lr_scheduler,
optimizer=optimizer,
num_warmup_steps=lr_warmup_steps * _GRAD_ACCUM_STEPS,
num_training_steps=args.lr_decay_steps * _GRAD_ACCUM_STEPS,
num_warmup_steps=lr_warmup_steps,
num_training_steps=args.lr_decay_steps,
)
sample_prompts = []
@ -532,16 +553,19 @@ def main(args):
images = [example["image"] for example in batch]
captions = [example["caption"] for example in batch]
tokens = [example["tokens"] for example in batch]
runt_size = batch[0]["runt_size"]
images = torch.stack(images)
images = images.to(memory_format=torch.contiguous_format).float()
batch = {
ret = {
"tokens": torch.stack(tuple(tokens)),
"image": images,
"captions": captions,
"runt_size": runt_size,
}
return batch
del batch
return ret
train_dataloader = torch.utils.data.DataLoader(
train_batch,
@ -581,16 +605,10 @@ def main(args):
append_epoch_log(global_step=global_step, epoch_pbar=epoch_pbar, gpu=gpu, log_writer=log_writer)
torch.cuda.empty_cache()
loss = torch.tensor(0.0, device=device, dtype=torch.float32)
#loss = torch.tensor(0.0, device=device, dtype=torch.float32)
try:
for epoch in range(args.max_epochs):
if epoch > 0 and epoch % args.save_every_n_epochs == 0:
logging.info(f" Saving model")
save_path = os.path.join(f"{log_folder}/ckpts/{args.project_name}-ep{epoch:02}-gs{global_step:05}")
__save_model(save_path, unet, text_encoder, tokenizer, scheduler, vae, args.save_ckpt_dir)
torch.cuda.empty_cache()
epoch_start_time = time.time()
steps_pbar.reset()
images_per_sec_epoch = []
@ -603,18 +621,16 @@ def main(args):
pixel_values = batch["image"].to(memory_format=torch.contiguous_format).to(unet.device)
with autocast(enabled=args.amp):
latents = vae.encode(pixel_values, return_dict=False)
del pixel_values
latents = latents[0].sample() * 0.18215
latent = latents[0]
latents = latent.sample()
latents = latents * 0.18215
noise = torch.randn_like(latents)
bsz = latents.shape[0]
noise = torch.randn_like(latents)
bsz = latents.shape[0]
timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
timesteps = timesteps.long()
timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
timesteps = timesteps.long()
cuda_caption = batch["tokens"].to(text_encoder.device)
cuda_caption = batch["tokens"].to(text_encoder.device)
#with autocast(enabled=args.amp):
encoder_hidden_states = text_encoder(cuda_caption, output_hidden_states=True)
@ -632,7 +648,7 @@ def main(args):
target = scheduler.get_velocity(latents, noise, timesteps)
else:
raise ValueError(f"Unknown prediction type {scheduler.config.prediction_type}")
del noise, latents
del noise, latents, cuda_caption
with autocast(enabled=args.amp):
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
@ -640,6 +656,10 @@ def main(args):
del timesteps, encoder_hidden_states, noisy_latents
#with autocast(enabled=args.amp):
loss = torch_functional.mse_loss(model_pred.float(), target.float(), reduction="mean")
if batch["runt_size"] > 0:
loss = loss * (batch["runt_size"] / args.batch_size)  # scale loss down for runt (padded) batches so duplicated images don't overcount
del target, model_pred
if args.clip_grad_norm is not None:
torch.nn.utils.clip_grad_norm_(parameters=unet.parameters(), max_norm=args.clip_grad_norm)
@ -661,8 +681,9 @@ def main(args):
if (global_step + 1) % args.log_step == 0:
curr_lr = lr_scheduler.get_last_lr()[0]
logs = {"loss/step": loss.detach().item(), "lr": curr_lr, "img/s": images_per_sec}
log_writer.add_scalar(tag="loss/step", scalar_value=loss, global_step=global_step)
loss_local = loss.detach().item()
logs = {"loss/step": loss_local, "lr": curr_lr, "img/s": images_per_sec}
log_writer.add_scalar(tag="loss/step", scalar_value=loss_local, global_step=global_step)
log_writer.add_scalar(tag="hyperparamater/lr", scalar_value=curr_lr, global_step=global_step)
sum_img = sum(images_per_sec_epoch)
avg = sum_img / len(images_per_sec_epoch)
@ -672,7 +693,6 @@ def main(args):
append_epoch_log(global_step=global_step, epoch_pbar=epoch_pbar, gpu=gpu, log_writer=log_writer, **logs)
if (global_step + 1) % args.sample_steps == 0:
#(unet, text_encoder, tokenizer, scheduler):
pipe = __create_inference_pipe(unet=unet, text_encoder=text_encoder, tokenizer=tokenizer, scheduler=scheduler, vae=vae)
pipe = pipe.to(device)
@ -692,18 +712,24 @@ def main(args):
if args.ckpt_every_n_minutes is not None and (min_since_last_ckpt > args.ckpt_every_n_minutes):
last_epoch_saved_time = time.time()
logging.info(f"Saving model at {args.ckpt_every_n_minutes} mins at step {global_step}")
logging.info(f"Saving model, {args.ckpt_every_n_minutes} mins at step {global_step}")
save_path = os.path.join(f"{log_folder}/ckpts/{args.project_name}-ep{epoch:02}-gs{global_step:05}")
__save_model(save_path, unet, text_encoder, tokenizer, scheduler, vae, args.save_ckpt_dir)
if epoch > 0 and epoch % args.save_every_n_epochs == 0 and step == 1 and epoch < args.max_epochs - 1:
logging.info(f" Saving model, {args.save_every_n_epochs} epochs at step {global_step}")
save_path = os.path.join(f"{log_folder}/ckpts/{args.project_name}-ep{epoch:02}-gs{global_step:05}")
__save_model(save_path, unet, text_encoder, tokenizer, scheduler, vae, args.save_ckpt_dir)
# end of step
elapsed_epoch_time = (time.time() - epoch_start_time) / 60
elapsed_epoch_time = (time.time() - epoch_start_time) / 60
epoch_times.append(dict(epoch=epoch, time=elapsed_epoch_time))
log_writer.add_scalar("performance/minutes per epoch", elapsed_epoch_time, global_step)
epoch_pbar.update(1)
if epoch < args.max_epochs - 1:
train_batch.shuffle(epoch_n=epoch+1)
# end of epoch
# end of training
@ -729,7 +755,7 @@ def main(args):
if __name__ == "__main__":
supported_resolutions = [512, 576, 640, 704, 768, 832, 896, 960, 1024]
argparser = argparse.ArgumentParser(description="EveryDream Training options")
argparser = argparse.ArgumentParser(description="EveryDream2 Training options")
argparser.add_argument("--resume_ckpt", type=str, required=True, default="sd_v1-5_vae.ckpt")
argparser.add_argument("--lr_scheduler", type=str, default="constant", help="LR scheduler, (default: constant)", choices=["constant", "linear", "cosine", "polynomial"])
argparser.add_argument("--lr_warmup_steps", type=int, default=None, help="Steps to reach max LR during warmup (def: 0.02 of lr_decay_steps), non-functional for constant")
@ -752,14 +778,19 @@ if __name__ == "__main__":
argparser.add_argument("--wandb", action="store_true", default=False, help="enable wandb logging instead of tensorboard, requires env var WANDB_API_KEY")
argparser.add_argument("--save_optimizer", action="store_true", default=False, help="saves optimizer state with ckpt, useful for resuming training later")
argparser.add_argument("--resolution", type=int, default=512, help="resolution to train", choices=supported_resolutions)
argparser.add_argument("--amp", action="store_true", default=False, help="use floating point 16 bit training")
argparser.add_argument("--amp", action="store_true", default=False, help="use floating point 16 bit training, experimental, reduces quality")
argparser.add_argument("--cond_dropout", type=float, default=0.04, help="Conditional drop out as decimal 0.0-1.0, see docs for more info (def: 0.04)")
argparser.add_argument("--logdir", type=str, default="logs", help="folder to save logs to (def: logs)")
argparser.add_argument("--save_ckpt_dir", type=str, default=None, help="folder to save checkpoints to (def: root training folder)")
argparser.add_argument("--scale_lr", action="store_true", default=False, help="automatically scale up learning rate based on batch size and grad accumulation (def: False)")
argparser.add_argument("--seed", type=int, default=555, help="seed used for samples and shuffling, use -1 for random")
argparser.add_argument("--flip_p", type=float, default=0.0, help="probability of flipping image horizontally (def: 0.0) use 0.0 to 1.0, ex 0.5")
argparser.add_argument("--flip_p", type=float, default=0.0, help="probability of flipping image horizontally (def: 0.0) use 0.0 to 1.0, ex 0.5, not good for specific faces!")
argparser.add_argument("--gpuid", type=int, default=0, help="id of gpu to use for training, (def: 0) (ex: 1 to use GPU_ID 1)")
argparser.add_argument("--write_schedule", action="store_true", default=False, help="write schedule of images and their batches to file (def: False)")
argparser.add_argument("--gradient_checkpointing", action="store_true", default=False, help="enable gradient checkpointing to reduce VRAM use, may reduce performance (def: False)")
argparser.add_argument("--lowvram", action="store_true", default=False, help="automatically overrides various args to support 12GB gpu")
args = argparser.parse_args()
main(args)

View File

@ -5,8 +5,6 @@ _V2V_URL = ["v2-inference-v.yaml","https://raw.githubusercontent.com/Stability-A
_V2_URL = ["v2-inference.yaml","https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference.yaml"]
_V1_URL = ["v1-inference.yaml","https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"]
# download https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml
def download_all():
list = [_V2V_URL,_V2_URL,_V1_URL]
for file in list:
@ -19,9 +17,6 @@ def get_yaml(file):
f.write(res.content)
print(f" downloaded: {file[0]}")
def isWindows():
return sys.platform.startswith('win')
if __name__ == '__main__':
download_all()
print("SD1.x and SD2.x yamls downloaded")