unload old images for better sys ram use, fix up crop jitter

This commit is contained in:
Victor Hall 2022-11-16 13:52:06 -05:00
parent d26fabac14
commit b47c5c0ed3
17 changed files with 485 additions and 109 deletions

View File

@@ -65,7 +65,7 @@ model:
data:
target: main.DataModuleFromConfig
params:
batch_size: 6 # prefer highest possible without getting CUDA Out of Memory error
batch_size: 6 # prefer highest possible without getting a CUDA Out of Memory error; A100 40GB ~20, 80GB ~48
num_workers: 6
wrap: false
train:
@@ -73,9 +73,10 @@ data:
params:
repeats: 5 # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
debug_level: 1 # 1 to print if images are dropped due to multiple-aspect ratio image batching
conditional_dropout: 0.01 # experimental, likelihood to drop the caption, may help with poorly captioned images
conditional_dropout: 0.08 # experimental, likelihood to drop the caption, may help with poorly captioned images
crop_jitter: 5 # adds N pixels of jitter to cropping algorithm for non-square images only
big_mode: 0 # set to 1 or 2 to use larger image sizes for training, USES LOTS OF VRAM! Requires 40GB+
resolution: 512 # 512, 576, or 640, increases VRAM substantially
seed: 555 # seed used to shuffle the dataset ordering, keep constant for reproducibility
validation:
target: ldm.data.ed_validate.EDValidateBatch
params:

View File

@@ -0,0 +1,109 @@
model:
base_learning_rate: 1.2e-6
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
linear_end: 0.0120
num_timesteps_cond: 1
log_every_t: 300
timesteps: 1000
first_stage_key: image
cond_stage_key: caption
image_size: 64
channels: 4
cond_stage_trainable: true
conditioning_key: crossattn
monitor: val/loss_simple_ema
scale_factor: 0.18215
use_ema: False
unfreeze_model: True
model_lr: 1.2e-6
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
params:
image_size: 32 # unused
in_channels: 4
out_channels: 4
model_channels: 320
attention_resolutions: [ 4, 2, 1 ]
num_res_blocks: 2
channel_mult: [ 1, 2, 4, 4 ]
num_heads: 8
use_spatial_transformer: True
transformer_depth: 1
context_dim: 768
use_checkpoint: True
legacy: False
first_stage_config:
target: ldm.models.autoencoder.AutoencoderKL
params:
embed_dim: 4
monitor: val/rec_loss
ddconfig:
double_z: true
z_channels: 4
resolution: 512
in_channels: 3
out_ch: 3
ch: 128
ch_mult:
- 1
- 2
- 4
- 4
num_res_blocks: 2
attn_resolutions: []
dropout: 0.0
lossconfig:
target: torch.nn.Identity
cond_stage_config:
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
data:
target: main.DataModuleFromConfig
params:
batch_size: 6 # prefer highest possible without getting a CUDA Out of Memory error; A100 40GB ~20, 80GB ~48
num_workers: 6
wrap: false
train:
target: ldm.data.every_dream.EveryDreamBatch
params:
repeats: 1 # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
debug_level: 1 # 1 to print if images are dropped due to multiple-aspect ratio image batching
conditional_dropout: 0.08 # experimental, likelihood to drop the caption, may help with poorly captioned images
crop_jitter: 15 # adds N pixels of jitter to cropping algorithm for non-square images only
resolution: 512 # 512, 576, 640, 704, or 768; defines max pixels for all aspect buckets, increases VRAM substantially
validation:
target: ldm.data.ed_validate.EDValidateBatch
params:
repeats: 0.3
test:
target: ldm.data.ed_validate.EDValidateBatch
params:
repeats: 0.2
lightning:
modelcheckpoint:
params:
every_n_epochs: 1 # produce a ckpt every epoch, leave 1!
#every_n_train_steps: 1400 # can only use epoch or train step checkpoints
save_top_k: 6 # save the best N ckpts according to loss; can reduce to save disk space but keep at LEAST 2, more if you raise max_epochs below
save_last: True
filename: "{epoch:02d}-{step:05d}"
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 500
max_images: 16
increase_log_steps: False
trainer:
benchmark: True
max_epochs: 1 # better to run several epochs and test your checkpoints! Try 4-5, you get a checkpoint every epoch to test!
max_steps: 99000 # better to end on epochs not steps, especially with >500 images to ensure even distribution, but you can set this if you really want...
check_val_every_n_epoch: 1
gpus: 0,

View File

@@ -1,5 +1,5 @@
model:
base_learning_rate: 1.0e-6
base_learning_rate: 1.2e-6
target: ldm.models.diffusion.ddpm.LatentDiffusion
params:
linear_start: 0.00085
@@ -17,16 +17,17 @@ model:
scale_factor: 0.18215
use_ema: False
unfreeze_model: True
#model_lr: 1.0e-6
model_lr: 1.1e-6
#use_scheduler: True
scheduler_config:
target: ldm.lr_scheduler.LambdaLinearScheduler
target: ldm.lr_scheduler.EveryDreamScheduler
params:
warm_up_steps: [ 5 ]
cycle_lengths: [ 1000 ] # incredibly large number to prevent corner cases
verbosity_interval: 25 # how often to print LR updates
f_start: [ 1.e-6 ]
f_max: [ 1.e-6 ] # 1.
f_min: [ 1.e-8 ] # 1.
f_start: 5.0e-1 # starting LR multiplier
warm_up_steps: 50 # number of steps to warm up to f_start before decaying LR
f_max: 1.0 # maximum LR multiplier
f_min: 5.0e-1 # minimum LR multiplier
steps_to_min: 10000 # number of steps to decay from f_max to f_min
verbosity_interval: 200 # how often to print LR multiplier (steps)
unet_config:
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
@@ -74,19 +75,22 @@ model:
data:
target: main.DataModuleFromConfig
params:
batch_size: 6
batch_size: 6
num_workers: 12
wrap: false
train:
target: ldm.data.every_dream.EveryDreamBatch
params:
repeats: 5
flip_p: 0
debug_level: 1
repeats: 1 # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
debug_level: 1 # 1 to print if images are dropped due to multiple-aspect ratio image batching
conditional_dropout: 0.08 # experimental, likelihood to drop the caption, may help with poorly captioned images
crop_jitter: 20 # adds N pixels of jitter to cropping algorithm for non-square images only
resolution: 512 # defines max pixels for all aspects, 512, 576, 640, 704, or 768
seed: 555 # seed used to shuffle the dataset, keep constant for reproducibility
validation:
target: ldm.data.ed_validate.EDValidateBatch
params:
repeats: 0.5
repeats: 0.25
test:
target: ldm.data.ed_validate.EDValidateBatch
params:
@@ -96,21 +100,21 @@ lightning:
modelcheckpoint:
params:
every_n_epochs: 1
#every_n_train_steps: 1400 # can only use every_n_epochs OR every_n_train_steps, suggest you stick with epochs
#every_n_train_steps: 1500 # can only use every_n_epochs OR every_n_train_steps, suggest you stick with epochs
save_last: True
save_top_k: 5
save_top_k: 99
filename: "{epoch:02d}-{step:05d}"
callbacks:
image_logger:
target: main.ImageLogger
params:
batch_frequency: 100
batch_frequency: 400
max_images: 16
increase_log_steps: False
trainer:
benchmark: True
max_epochs: 4
max_epochs: 5
max_steps: 99000 # better to end on epochs not steps, especially with >500 images to ensure even distribution, but you can set this if you really want...
check_val_every_n_epoch: 1
gpus: 0,

BIN
demo/crop.gif Normal file

Binary file not shown (196 KiB).

76
ldm/data/aspects.py Normal file
View File

@@ -0,0 +1,76 @@
GOD_ASPECTS = [[768,768], # 589824 1:1
[832,704],[704,832], # 585728 1.182:1
[896,640],[640,896], # 573440 1.4:1
[960,576],[576,960], # 552960 1.667:1
[1024,576],[576,1024], # 589824 1.778:1
[1088,512],[512,1088], # 557056 2.125:1
[1152,512],[512,1152], # 589824 2.25:1
[1216,448],[448,1216], # 544768 2.714:1
[1280,448],[448,1280], # 573440 2.857:1
[1344,384],[384,1344], # 516096 3.5:1
[1408,384],[384,1408], # 540672 3.667:1
[1472,320],[320,1472], # 471040 4.6:1
[1536,320],[320,1536], # 491520 4.8:1
]
MASSIVE_ASPECTS = [[704,704], # 495,616 1:1
[768,640],[640,768], # 491,520 1.2:1
[832,576],[576,832], # 479,232 1.444:1
[896,512],[512,896], # 458,752 1.75:1
[960,512],[512,960], # 491,520 1.875:1
[1024,448],[448,1024], # 458,752 2.286:1
[1088,448],[448,1088], # 487,424 2.429:1
[1152,384],[384,1152], # 442,368 3:1
[1216,384],[384,1216], # 466,944 3.167:1
[1280,384],[384,1280], # 491,520 3.333:1
[1280,320],[320,1280], # 409,600 4:1
[1408,320],[320,1408], # 450,560 4.4:1
[1536,320],[320,1536], # 491,520 4.8:1
]
HUGE_ASPECTS = [[640,640], # 409600 1:1
[704,576],[576,704], # 405504 1.222:1
[768,512],[512,768], # 393216 1.5:1
[896,448],[448,896], # 401408 2:1
[1024,384],[384,1024], # 393216 2.667:1
[1280,320],[320,1280], # 409600 4:1
[1408,256],[256,1408], # 360448 5.5:1
[1472,256],[256,1472], # 376832 5.75:1
[1536,256],[256,1536], # 393216 6:1
[1600,256],[256,1600], # 409600 6.25:1
]
BIG_ASPECTS = [[576,576], # 331776 1:1
[640,512],[512,640], # 327680 1.25:1
[640,448],[448,640], # 286720 1.4286:1
[704,448],[448,704], # 315392 1.5714:1
[832,384],[384,832], # 319488 2.1667:1
[1024,320],[320,1024], # 327680 3.2:1
[1280,256],[256,1280], # 327680 5:1
]
ASPECTS = [[512,512], # 262144 1:1
[576,448],[448,576], # 258048 1.29:1
[640,384],[384,640], # 245760 1.667:1
[768,320],[320,768], # 245760 2.4:1
[832,256],[256,832], # 212992 3.25:1
[896,256],[256,896], # 229376 3.5:1
[960,256],[256,960], # 245760 3.75:1
[1024,256],[256,1024], # 262144 4:1
]
def get_aspect_buckets(resolution):
if resolution < 512:
raise ValueError("Resolution must be at least 512")
try:
rounded_resolution = int(resolution / 64) * 64 # round down to nearest 64
all_image_sizes = __get_all_aspects()
aspects = next(filter(lambda sizes: sizes[0][0]==rounded_resolution, all_image_sizes), None) # find matching set of aspect ratios
return aspects
except Exception as e:
print(f" *** Could not find selected resolution: {rounded_resolution}, check your resolution in config YAML")
raise e
def __get_all_aspects():
return [ASPECTS, BIG_ASPECTS, HUGE_ASPECTS, MASSIVE_ASPECTS, GOD_ASPECTS]
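For reference, a minimal usage sketch of the lookup above (not part of the diff; the resolution value is illustrative and assumes the repo root is on the Python path):

import ldm.data.aspects as aspects

buckets = aspects.get_aspect_buckets(640)  # rounds down to the nearest 64, then matches HUGE_ASPECTS by its [640, 640] first entry
print(buckets[0])   # [640, 640], the square bucket for this resolution class
print(len(buckets)) # number of width/height pairs available at this resolution class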

View File

@@ -2,37 +2,7 @@ import os
from PIL import Image
import random
from ldm.data.image_train_item import ImageTrainItem
HUGE_ASPECTS = [[640,640], # 409600 1:1
[704,576],[576,704], # 405504 1:1.25
[768,512],[512,768], # 393216 1:1.5
[896,448],[448,896], # 401408 1:2
[1024,384],[384,1024], # 393216 1:2.667
[1280,320],[320,1280], # 409600 1:4
[1408,256],[256,1408], # 360448 1:5.5
[1472,256],[256,1472], # 376832 1:5.75
[1536,256],[256,1536], # 393216 1:6
[1600,256],[256,1600], # 409600 1:6.25
]
BIG_ASPECTS = [[576,576], # 331776 1:1\
[640,512],[512,640], # 327680 1.25:1\
[704,448],[448,704], # 314928 1.5625:1
[832,384],[384,832], # 317440 2.1667:1\
[1024,320],[320,1024], # 327680 3.2:1\
[1280,256],[256,1280], # 327680 5:1\
]
ASPECTS = [[512,512], # 1 262144\
[576,448],[448,576], # 1.29 258048\
[640,384],[384,640], # 1.67 245760\
[704,384],[384,704], # 1.83 245760\
[768,320],[320,768], # 2.4 245760\
[832,256],[256,832], # 3.25 212992\
[896,256],[256,896], # 3.5 229376\
[960,256],[256,960], # 3.75 245760\
[1024,256],[256,1024], # 4 245760\
]
import ldm.data.aspects as aspects
class DataLoaderMultiAspect():
"""
@@ -42,12 +12,13 @@ class DataLoaderMultiAspect():
batch_size: number of images per batch
flip_p: probability of flipping image horizontally (i.e. 0-0.5)
"""
def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, big_mode=0):
def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, resolution=512):
self.image_paths = []
self.debug_level = debug_level
self.flip_p = flip_p
self.big_mode = big_mode
self.aspects = aspects.get_aspect_buckets(resolution)
print(f"* DLMA resolution {resolution}, buckets: {self.aspects}")
print(" Preloading images...")
self.__recurse_data_root(self=self, recurse_root=data_root)
@@ -56,12 +27,13 @@ class DataLoaderMultiAspect():
self.image_caption_pairs = self.__bucketize_images(prepared_train_data, batch_size=batch_size, debug_level=debug_level)
if debug_level > 0: print(f" * DLMA Example: {self.image_caption_pairs[0]} images")
def get_all_images(self):
return self.image_caption_pairs
@staticmethod
def __read_caption_from_file(self, file_path, fallback_caption):
def __read_caption_from_file(file_path, fallback_caption):
caption = fallback_caption
try:
with open(file_path, 'r') as caption_file:
@@ -91,15 +63,13 @@ class DataLoaderMultiAspect():
else:
caption = caption_from_filename
if debug_level > 1: print(f" * DLMA file: {pathname} with caption: {caption}")
#if debug_level > 1: print(f" * DLMA file: {pathname} with caption: {caption}")
image = Image.open(pathname)
width, height = image.size
image_aspect = width / height
aspects = [ASPECTS, BIG_ASPECTS, HUGE_ASPECTS][self.big_mode]
target_wh = min(aspects, key=lambda x:abs(x[0]/x[1]-image_aspect))
target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
image_train_item = ImageTrainItem(image=None, caption=caption, target_wh=target_wh, pathname=pathname, flip_p=flip_p)
@@ -129,7 +99,9 @@ class DataLoaderMultiAspect():
truncate_count = len(buckets[bucket]) % batch_size
current_bucket_size = len(buckets[bucket])
buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
print(f" ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}") if debug_level > 0 else None
if debug_level > 0:
print(f" ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}")
# flatten the buckets
image_caption_pairs = []
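As a standalone illustration (not part of the diff) of how an image is assigned to its nearest aspect bucket by the min(...) call above; the bucket list and image size here are made up:

aspects = [[512, 512], [576, 448], [448, 576], [640, 384], [384, 640]]
width, height = 1000, 700      # example source image
image_aspect = width / height  # ~1.43
target_wh = min(aspects, key=lambda wh: abs(wh[0] / wh[1] - image_aspect))
print(target_wh)               # [576, 448], since 1.29:1 is the closest ratio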

View File

@@ -1,6 +1,5 @@
import numpy as np
from torch.utils.data import Dataset
from torchvision import transforms
from ldm.data.data_loader import DataLoaderMultiAspect as dlma
import math
import ldm.data.dl_singleton as dls

View File

@@ -10,10 +10,10 @@ class EveryDreamBatch(Dataset):
data_root: root path of all your training images, will be recursively searched for images
repeats: how many times to repeat each image in the dataset
flip_p: probability of flipping the image horizontally
debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=save crops to disk for inspection
debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=debug info per image, 3=save crops to disk for inspection
batch_size: how many images to return in a batch
conditional_dropout: probability of dropping the caption for a given image
big_mode: 0=normal, 1=big, 2=biggest
resolution: base resolution used to select the aspect ratio buckets (512, 576, 640, 704, or 768)
crop_jitter: number of pixels to jitter the crop by, only for non-square images
"""
def __init__(self,
@@ -24,18 +24,22 @@ class EveryDreamBatch(Dataset):
batch_size=1,
set='train',
conditional_dropout=0.0,
big_mode=0,
resolution=512,
crop_jitter=0,
seed=555,
image_cache_size=200
):
self.data_root = data_root
self.batch_size = batch_size
self.debug_level = debug_level
self.conditional_dropout = conditional_dropout
self.crop_jitter = crop_jitter
self.unloaded_to_idx = 0
self.image_cache_size = image_cache_size
if not dls.shared_dataloader:
print(" * Creating new dataloader singleton")
dls.shared_dataloader = dlma(data_root=data_root, debug_level=debug_level, batch_size=self.batch_size, flip_p=flip_p, big_mode=big_mode)
dls.shared_dataloader = dlma(data_root=data_root, seed=seed, debug_level=debug_level, batch_size=self.batch_size, flip_p=flip_p, resolution=resolution)
self.image_train_items = dls.shared_dataloader.get_all_images()
@@ -54,20 +58,35 @@ class EveryDreamBatch(Dataset):
idx = i % self.num_images
image_train_item = self.image_train_items[idx]
example = self.__get_image_for_trainer(image_train_item, self.debug_level)
if self.unloaded_to_idx > idx:
self.unloaded_to_idx = 0
if idx % (self.batch_size*3) == 0 and idx > (self.batch_size * 5) and idx > self.image_cache_size:
start_del = max(self.image_cache_size, self.unloaded_to_idx)
self.unloaded_to_idx = int(idx / self.batch_size)*self.batch_size - self.batch_size*8
print(f"{idx}: {start_del}, {self.unloaded_to_idx}") if self.debug_level > 1 else None
if self.unloaded_to_idx > self.image_cache_size:
for j in range(start_del, self.unloaded_to_idx):
del self.image_train_items[j].image
if self.debug_level > 1: print(f" * Unloaded images from idx {start_del} to {self.unloaded_to_idx}")
return example
def __get_image_for_trainer(self, image_train_item: ImageTrainItem, debug_level=0):
example = {}
save = debug_level > 1
save = debug_level > 2
image_train_tmp = image_train_item.hydrate(crop=False, save=save, crop_jitter=self.crop_jitter)
example["image"] = image_train_tmp.image
#if random.random() > self.conditional_dropout:
example["caption"] = image_train_tmp.caption
#else:
# example["caption"] = " "
if random.random() > self.conditional_dropout:
example["caption"] = image_train_tmp.caption
else:
example["caption"] = " "
return example
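In plain terms, the new unloading block in __getitem__ keeps roughly the first image_cache_size items plus a trailing window of 8 batches resident, and every 3 batches it deletes the PIL image from anything older so system RAM stays bounded. A standalone sketch of the window arithmetic with illustrative numbers:

batch_size, image_cache_size = 6, 200
idx = 402                                        # current dataset index
start_del = max(image_cache_size, 0)             # assuming nothing has been unloaded yet
unload_to = (idx // batch_size) * batch_size - batch_size * 8
print(start_del, unload_to)                      # 200 354 -> items 200..353 drop their PIL image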

View File

@@ -8,7 +8,11 @@ import os
class ImageTrainItem():
"""
# [image, identifier, target_aspect, closest_aspect_wh(w,h), pathname]
image: PIL.Image
caption: caption/identifier text for the image
target_wh: (width, height) of the assigned aspect bucket
pathname: path to image file
flip_p: probability of flipping image (0.0 to 1.0)
"""
def __init__(self, image: PIL.Image, caption: str, target_wh: list, pathname: str, flip_p=0.0):
self.caption = caption
@@ -18,49 +22,62 @@ class ImageTrainItem():
self.cropped_img = None
if image is None:
self.image = PIL.Image.new(mode='RGB',size=(1,1))
self.image = []
else:
self.image = image
def hydrate(self, crop=False, save=False, crop_jitter=0):
self.image = PIL.Image.open(self.pathname).convert('RGB')
"""
crop: hard center crop to 512x512
save: save the cropped image to disk, for manual inspection of resize/crop
crop_jitter: randomly shift the crop by up to N pixels when using multiple aspect ratios to improve training quality
"""
if not hasattr(self, 'image') or len(self.image) == 0:
self.image = PIL.Image.open(self.pathname).convert('RGB')
width, height = self.image.size
if crop:
cropped_img = self.__autocrop(self.image)
self.image = cropped_img.resize((512,512), resample=PIL.Image.BICUBIC)
else:
if width == 512 and height == 512:
pass
elif self.target_wh[0] == self.target_wh[1]:
pass
else:
width, height = self.image.size
image_aspect = width / height
jitter_amount = random.randint(-crop_jitter, crop_jitter)
jitter_amount = min(jitter_amount, int(abs(width-height)/2))
target_aspect = self.target_wh[0] / self.target_wh[1]
if image_aspect > target_aspect:
new_width = int(height * target_aspect)
left = int((width - new_width) / 2) + jitter_amount
right = left + new_width
self.image = self.image.crop((left, 0, right, height))
else:
new_height = int(width / target_aspect)
top = int((height - new_height) / 2) + jitter_amount
bottom = top + new_height
self.image = self.image.crop((0, top, width, bottom))
self.image = self.image.resize(self.target_wh, resample=PIL.Image.BICUBIC)
width, height = self.image.size
if crop:
cropped_img = self.__autocrop(self.image)
self.image = cropped_img.resize((512,512), resample=PIL.Image.BICUBIC)
else:
if self.target_wh[0] == self.target_wh[1]:
pass
else:
width, height = self.image.size
image_aspect = width / height
jitter_amount = random.randint(0, crop_jitter)
target_aspect = self.target_wh[0] / self.target_wh[1]
print(f"{target_aspect}, {self.target_wh}")
if image_aspect > target_aspect:
new_width = int(height * target_aspect)
jitter_amount = max(min(jitter_amount, int(abs(width-new_width)/2)), 0)
left = jitter_amount
right = left + new_width
print(f"crop left: {left}, right: {right}, jitteramt:{jitter_amount}, [{width}, {height}] img: {self.pathname}")
self.image = self.image.crop((left, 0, right, height))
else:
new_height = int(width / target_aspect)
jitter_amount = max(min(jitter_amount, int(abs(height-new_height)/2)), 0)
top = jitter_amount
bottom = top + new_height
print(f"crop top: {top}, bottom: {bottom}, jitteramt:{jitter_amount}, [{width}, {height}] img: {self.pathname}")
self.image = self.image.crop((0, top, width, bottom))
self.image = self.image.resize(self.target_wh, resample=PIL.Image.BICUBIC)
self.image = self.flip(self.image)
self.image = self.flip(self.image)
if save: # for manual inspection
base_name = os.path.basename(self.pathname)
self.image.save(f"test/output/{random.randint(0,4)}/{base_name}")
if type(self.image) is not np.ndarray:
if save:
base_name = os.path.basename(self.pathname)
if not os.path.exists("test/output"):
os.makedirs("test/output")
self.image.save(f"test/output/{base_name}")
self.image = np.array(self.image).astype(np.uint8)
self.image = (self.image / 127.5 - 1.0).astype(np.float32)
self.image = np.array(self.image).astype(np.uint8)
self.image = (self.image / 127.5 - 1.0).astype(np.float32)
print(self.image.shape)
return self
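A worked example of the jitter clamp in the non-square path above (numbers are illustrative): a 900x600 source assigned to a 640x384 bucket keeps roughly a 900x540 window, and the crop can slide down by at most 30 px before resizing.

import random

width, height = 900, 600                   # source image
target_w, target_h = 640, 384              # assigned aspect bucket
target_aspect = target_w / target_h        # ~1.667, wider than the 1.5:1 source
new_height = int(width / target_aspect)    # ~540 px of source height survive the crop
jitter = random.randint(0, 30)             # crop_jitter = 30 for this example
jitter = max(min(jitter, int(abs(height - new_height) / 2)), 0)  # clamps to ~30 px here
top, bottom = jitter, jitter + new_height  # crop window shifted down by the jitter
print(top, bottom)                         # e.g. 12 552 (give or take a pixel of float rounding), then resized to 640x384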

View File

@@ -96,3 +96,35 @@ class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
self.last_f = f
return f
class EveryDreamScheduler:
"""
f_min: minimum lr multiplier
f_max: maximum lr multiplier
f_start: lr multiplier at the beginning of the warm-up phase
warm_up_steps: number of steps in the warm-up phase
steps_to_min: number of steps to reach the minimum lr multiplier
"""
def __init__(self, f_min=0.5, f_max=1.0, f_start=1.0, warm_up_steps=1000, steps_to_min=5000, verbosity_interval=100) -> None:
self.f_min = f_min
self.f_max = f_max
self.f_start = f_start
self.warm_up_steps = warm_up_steps
self.steps_to_min = steps_to_min
self.last_f = 0.
self.verbosity_interval = verbosity_interval
def __call__(self, n, **kwargs):
return self.schedule(n, **kwargs)
def schedule(self, n, **kwargs):
if self.verbosity_interval > 0:
if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f:0.3f}, current cycle: {0}")
if n < self.warm_up_steps:
self.last_f = self.f_start
elif n < self.steps_to_min:
self.last_f = self.f_min + (self.f_max - self.f_min) * (self.steps_to_min - n) / (self.steps_to_min)
else:
self.last_f = self.f_min
return self.last_f
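A quick sketch of how the new scheduler evaluates with the values used in the updated YAML above (f_start=0.5, warm_up_steps=50, f_max=1.0, f_min=0.5, steps_to_min=10000); the trainer presumably multiplies base_learning_rate by the returned factor through a LambdaLR-style wrapper:

import ldm.lr_scheduler as lrs

sch = lrs.EveryDreamScheduler(f_min=0.5, f_max=1.0, f_start=0.5, warm_up_steps=50, steps_to_min=10000)
for step in (0, 49, 50, 5000, 10000, 20000):
    print(step, sch(step))  # 0.5 held through warm-up, then linear decay from ~1.0 down to 0.5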

View File

@@ -453,6 +453,7 @@ class LatentDiffusion(DDPM):
conditioning_key=None,
scale_factor=1.0,
scale_by_std=False,
scheduler_config=None,
*args, **kwargs):
self.num_timesteps_cond = default(num_timesteps_cond, 1)
@@ -465,7 +466,7 @@ class LatentDiffusion(DDPM):
conditioning_key = None
ckpt_path = kwargs.pop("ckpt_path", None)
ignore_keys = kwargs.pop("ignore_keys", [])
super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
super().__init__(conditioning_key=conditioning_key, scheduler_config=scheduler_config, *args, **kwargs)
self.concat_mode = concat_mode
self.cond_stage_trainable = cond_stage_trainable
self.cond_stage_key = cond_stage_key
@@ -704,8 +705,6 @@ class LatentDiffusion(DDPM):
if cond_key != self.first_stage_key:
if cond_key in ['caption', 'coordinates_bbox']:
xc = batch[cond_key]
elif cond_key == 'class_label':
xc = batch
else:
xc = super().get_input(batch, cond_key).to(self.device)
else:

15
test/test_aspects.py Normal file
View File

@@ -0,0 +1,15 @@
import ldm.data.aspects as aspects
resolutions = [512, 576, 640, 704, 768]
oops = [532, 576, 640, 704, 768]
for res in resolutions:
example_aspects = aspects.get_aspect_buckets(res)
print(f" *{res} buckets: {example_aspects}")
max_pixels = example_aspects[0][0] * example_aspects[0][1]
for aspect in example_aspects:
pixels = aspect[0] * aspect[1]
print (f"max: {max_pixels}: {aspect}: {pixels}, pct {pixels/max_pixels:.2f}")
assert pixels <= max_pixels, f" * {aspect} is larger than {max_pixels}"

36
test/test_batch.py Normal file
View File

@@ -0,0 +1,36 @@
# script to test data loader by itself
# run from training root, edit the data_root manually
from ldm.data.every_dream import EveryDreamBatch
import time
s = time.perf_counter()
#data_root = "r:/everydream-trainer/test/input"
data_root = "r:/everydream-trainer/training_samples"
batch_size = 6
repeats=3
every_dream_batch = EveryDreamBatch(data_root=data_root, flip_p=0.0, debug_level=2, batch_size=batch_size, repeats=repeats, crop_jitter=25, conditional_dropout=0.3, resolution=512)
print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
print(f" max test cycles: {int(len(every_dream_batch) / batch_size)}, batch_size: {batch_size}, repeats: {repeats}")
i = 0
while i < 99: # and i < len(every_dream_batch):
curr_batch = []
for j in range(i,i+batch_size):
curr_batch.append(every_dream_batch[j])
# all in batch must have the same image size
assert all(x == curr_batch[0]['image'].shape for x in [e['image'].shape for e in curr_batch])
assert all(x[0] > 2 for x in [e['image'].shape for e in curr_batch])
#print(f"idx: {i}, batch sample: shape: {curr_batch[0]['image'].shape}: {curr_batch[0]['caption']}")
i += batch_size
print(f" *TEST* test cycles: {i}")
print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
elapsed = time.perf_counter() - s
print(f"{__file__} executed in {elapsed:5.2f} seconds.")

48
test/test_crop.py Normal file
View File

@@ -0,0 +1,48 @@
# script to see what cropping does to your images
# execute from root everydream-trainer folder
# ex.
# (everydream) R:\everydream-trainer>python scripts/test_crop.py
# dumps to /test/output
from ldm.data.every_dream import EveryDreamBatch
import time
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default=None, help='root path of all your training images, will be recursively searched for images')
parser.add_argument('--resolution', type=int, default=512, help='resolution class, 512, 576, 640, 704, or 768')
args = parser.parse_args()
s = time.perf_counter()
# put in your own data_root here, WARNING don't do this on a lot of images unless you are prepared for it...
if args.data_root is None:
data_root = "R:/everydream-trainer/test/input"
else:
data_root = args.data_root
debug_level = 3 # 3 = dump cropped images to disk and spam a lot of info to the console, be warned
batch_size = 1
repeats = 1
crop_jitter = 50
resolution = args.resolution # 512, 576, 640, 704, 768
every_dream_batch = EveryDreamBatch(data_root=data_root, flip_p=0.0, debug_level=3, batch_size=batch_size, repeats=repeats, crop_jitter=crop_jitter, conditional_dropout=0.1, resolution=resolution)
print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
print(f" max test cycles: {int(len(every_dream_batch) / batch_size)}, batch_size: {batch_size}, repeats: {repeats}")
i = 0
while i < len(every_dream_batch):
curr_batch = []
for j in range(i,i+batch_size):
curr_batch.append(every_dream_batch[j])
assert all(x == curr_batch[0]['image'].shape for x in [e['image'].shape for e in curr_batch])
assert all(x[0] > 2 for x in [e['image'].shape for e in curr_batch])
i += batch_size
print(f" *TEST* test cycles: {i}")
print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
elapsed = time.perf_counter() - s
print(f"{__file__} executed in {elapsed:5.2f} seconds.")

18
test/test_dl.py Normal file
View File

@@ -0,0 +1,18 @@
# script to test data loader by itself
# run from training root, edit the data_root manually
# python test/test_dl.py
import ldm.data.data_loader as dl
data_root = "r:/everydream-trainer/test/input"
data_loader = dl.DataLoaderMultiAspect(data_root=data_root, batch_size=2, seed=555, debug_level=2)
image_caption_pairs = data_loader.get_all_images()
print(f"Loaded {len(image_caption_pairs)} image-caption pairs")
for image_caption_pair in image_caption_pairs:
print(image_caption_pair)
print(f"**** Done loading. Loaded {len(image_caption_pairs)} images from data_root: {data_root} ****")

View File

@@ -0,0 +1,24 @@
# script to test data loader by itself
# run from training root, edit the data_root manually
# python ldm/data/test_dl.py
from ldm.data.image_train_item import ImageTrainItem
import glob
import os
data_root = "training_samples\multiaspect"
for idx, f in enumerate(glob.iglob(f"{data_root}/*.jpg")):
for i in range(0, 40):
#print(f)
#image: PIL.Image, caption: str, target_wh: list, pathname: str, flip_p=0.0):
caption = os.path.basename(f)
caption = os.path.splitext(caption)[0]
my_iti = ImageTrainItem(None,caption,[512,512],f,0.0)
my_iti = my_iti.hydrate()
out_file_path = os.path.join(data_root, "output", f"{caption}_{i}.jpg")
#print(out_file_path)
my_iti.cropped_img.save(out_file_path)

7
test/test_linearsch.py Normal file
View File

@@ -0,0 +1,7 @@
import ldm.lr_scheduler as lrs
#def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
sch = lrs.EveryDreamScheduler(warm_up_steps=10, f_min=5.0e-1, f_max=1.0, f_start=1.0, steps_to_min=25, verbosity_interval=5)
for i in range(50):
print(f"step {i}: {sch(i)}")