unload old images for better sys ram use, fix up crop jitter

2022-11-16 13:52:06 -05:00 · 2022-11-16 13:52:06 -05:00 · b47c5c0ed3
parent d26fabac14
commit b47c5c0ed3
17 changed files with 485 additions and 109 deletions
--- a/configs/stable-diffusion/v1-finetune_everydream.yaml
+++ b/configs/stable-diffusion/v1-finetune_everydream.yaml
@ -65,7 +65,7 @@ model:
 data:
  target: main.DataModuleFromConfig
  params:
-    batch_size: 6  # prefer highest possible without getting CUDA Out of Memory error
+    batch_size: 6  # prefer highest possible without getting CUDA Out of Memory error, A100 40GB =~20 80GB= ~48
    num_workers: 6
    wrap: false
    train:
@ -73,9 +73,10 @@ data:
      params:
        repeats: 5   # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
        debug_level: 1   # 1 to print if images are dropped due to multiple-aspect ratio image batching
-        conditional_dropout: 0.01   # experimental, likelihood to drop the caption, may help with poorly captioned images
+        conditional_dropout: 0.08   # experimental, likelihood to drop the caption, may help with poorly captioned images
        crop_jitter: 5   # adds N pixels of jitter to cropping algorithm for non-square images only
-        big_mode: 0   # set to 1 or 2 to use larger image sizes for training, USES LOTS OF VRAM! Requires 40GB+
+        resolution: 512   # 512, 576, or 640, increases VRAM substantially
        seed: 555  # seed used to shuffle the dataset ordering, keep constant for reproducibility
    validation:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
--- a/configs/stable-diffusion/v1-finetune_huge.yaml
+++ b/configs/stable-diffusion/v1-finetune_huge.yaml
@ -0,0 +1,109 @@
 model:
  base_learning_rate: 1.2e-6
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 300
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    unfreeze_model: True
    model_lr: 1.2e-6
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 512
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
 data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6  # prefer highest possible without getting CUDA Out of Memory error, A100 40GB =~20 80GB= ~48
    num_workers: 6
    wrap: false
    train:
      target: ldm.data.every_dream.EveryDreamBatch
      params:
        repeats: 1   # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
        debug_level: 1   # 1 to print if images are dropped due to multiple-aspect ratio image batching
        conditional_dropout: 0.08   # experimental, likelihood to drop the caption, may help with poorly captioned images
        crop_jitter: 15   # adds N pixels of jitter to cropping algorithm for non-square images only
        resolution: 512   # 512, 576, 640, 704, or 768, increases VRAM substantially
    validation:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
        repeats: 0.3
    test:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
        repeats: 0.2
 lightning:
  modelcheckpoint:
    params:
      every_n_epochs: 1  # produce a ckpt every epoch, leave 1!
      #every_n_train_steps: 1400 # can only use epoch or train step checkpoints
      save_top_k: 6   # save the best N ckpts according to loss, can reduce to save disk space but suggest at LEAST 2, more if you have max_epochs below higher!
      save_last: True
      filename: "{epoch:02d}-{step:05d}"
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 500
        max_images: 16
        increase_log_steps: False
  trainer:
    benchmark: True
    max_epochs: 1   # better to run several epochs and test your checkpoints!  Try 4-5, you get a checkpoint every epoch to test! 
    max_steps: 99000   # better to end on epochs not steps, especially with >500 images to ensure even distribution, but you can set this if you really want...
    check_val_every_n_epoch: 1
    gpus: 0,
--- a/configs/stable-diffusion/v1-finetune_test.yaml
+++ b/configs/stable-diffusion/v1-finetune_test.yaml
@ -1,5 +1,5 @@
 model:
-  base_learning_rate: 1.0e-6
+  base_learning_rate: 1.2e-6
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
@ -17,16 +17,17 @@ model:
    scale_factor: 0.18215
    use_ema: False
    unfreeze_model: True
-    #model_lr: 1.0e-6
+    model_lr: 1.1e-6
    #use_scheduler: True
    scheduler_config: 
-      target: ldm.lr_scheduler.LambdaLinearScheduler
+      target: ldm.lr_scheduler.EveryDreamScheduler
      params:
-        warm_up_steps: [ 5 ]
+        f_start: 5.0e-1 # starting LR multiplier
-        cycle_lengths: [ 1000 ] # incredibly large number to prevent corner cases
+        warm_up_steps: 50 # number of steps to warm up to f_start before decaying LR
-        verbosity_interval: 25  # how often to print LR updates
+        f_max: 1.0 # maximum LR multiplier
-        f_start: [ 1.e-6 ]
+        f_min: 5.0e-1 # minimum LR multiplier
-        f_max: [ 1.e-6 ] # 1.
+        steps_to_min: 10000 # number of steps to decay from f_max to f_min
-        f_min: [ 1.e-8 ] # 1.
+        verbosity_interval: 200 # how often to print LR multiplier (steps)
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
@ -80,13 +81,16 @@ data:
    train:
      target: ldm.data.every_dream.EveryDreamBatch
      params:
-        repeats: 5
+        repeats: 1   # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
-        flip_p: 0   
+        debug_level: 1   # 1 to print if images are dropped due to multiple-aspect ratio image batching
-        debug_level: 1  
+        conditional_dropout: 0.08   # experimental, likelihood to drop the caption, may help with poorly captioned images
        crop_jitter: 20   # adds N pixels of jitter to cropping algorithm for non-square images only
        resolution: 512  # defines max pixels for all aspects, 512, 576, 640, 704, or 768
        seed: 555  # seed used to shuffle the dataset, keep constant for reproducibility
    validation:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
-        repeats: 0.5
+        repeats: 0.25
    test:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
@ -96,21 +100,21 @@ lightning:
  modelcheckpoint:
    params:
      every_n_epochs: 1
-      #every_n_train_steps: 1400 # can only use every_n_epochs OR every_n_train_steps, suggest you stick with epochs
+      #every_n_train_steps: 1500 # can only use every_n_epochs OR every_n_train_steps, suggest you stick with epochs
      save_last: True
-      save_top_k: 5
+      save_top_k: 99
      filename: "{epoch:02d}-{step:05d}"
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
-        batch_frequency: 100
+        batch_frequency: 400
        max_images: 16
        increase_log_steps: False
  trainer:
    benchmark: True
-    max_epochs: 4
+    max_epochs: 5
    max_steps: 99000  # better to end on epochs not steps, especially with >500 images to ensure even distribution, but you can set this if you really want...
    check_val_every_n_epoch: 1
    gpus: 0,
--- a/demo/crop.gif
+++ b/demo/crop.gif
--- a/ldm/data/aspects.py
+++ b/ldm/data/aspects.py
@ -0,0 +1,76 @@
 # 768 resolution class: [width, height] buckets, each listed with its mirrored orientation.
 # Inline notes give the pixel count and the long:short side ratio of the pair.
 # The square bucket stays first: get_aspect_buckets() identifies a table by sizes[0][0].
 GOD_ASPECTS = [[768,768],  # 589824 1:1
    [832,704],[704,832],   # 585728 1.182:1
    [896,640],[640,896],   # 573440 1.4:1
    [960,576],[576,960],   # 552960 1.667:1
    [1024,576],[576,1024], # 589824 1.778:1
    [1088,512],[512,1088], # 557056 2.125:1
    [1152,512],[512,1152], # 589824 2.25:1
    [1216,448],[448,1216], # 544768 2.714:1
    [1280,448],[448,1280], # 573440 2.857:1
    [1344,384],[384,1344], # 516096 3.5:1
    [1408,384],[384,1408], # 540672 3.667:1
    [1472,320],[320,1472], # 471040 4.6:1
    [1536,320],[320,1536], # 491520 4.8:1
 ]
 # 704 resolution class: [width, height] buckets, each listed with its mirrored orientation.
 # Inline notes give the pixel count and the long:short side ratio of the pair.
 # The square bucket stays first: get_aspect_buckets() identifies a table by sizes[0][0].
 MASSIVE_ASPECTS = [[704,704], # 495,616 1:1
    [768,640],[640,768],   # 491,520 1.2:1
    [832,576],[576,832],   # 479,232 1.444:1
    [896,512],[512,896],   # 458,752 1.75:1
    [960,512],[512,960],   # 491,520 1.875:1
    [1024,448],[448,1024], # 458,752 2.286:1
    [1088,448],[448,1088], # 487,424 2.429:1
    [1152,384],[384,1152], # 442,368 3:1
    [1216,384],[384,1216], # 466,944 3.167:1
    [1280,384],[384,1280], # 491,520 3.333:1
    [1280,320],[320,1280], # 409,600 4:1
    [1408,320],[320,1408], # 450,560 4.4:1
    [1536,320],[320,1536], # 491,520 4.8:1
 ]
 # 640 resolution class: [width, height] buckets, each listed with its mirrored orientation.
 # Inline notes give the pixel count and the long:short side ratio of the pair.
 # The square bucket stays first: get_aspect_buckets() identifies a table by sizes[0][0].
 HUGE_ASPECTS = [[640,640], # 409600 1:1
    [704,576],[576,704],  # 405504 1.222:1
    [768,512],[512,768],  # 393216 1.5:1
    [896,448],[448,896],  # 401408 2:1
    [1024,384],[384,1024], # 393216 2.667:1
    [1280,320],[320,1280], # 409600 4:1
    [1408,256],[256,1408], # 360448 5.5:1
    [1472,256],[256,1472], # 376832 5.75:1
    [1536,256],[256,1536], # 393216 6:1
    [1600,256],[256,1600], # 409600 6.25:1
 ]
 # 576 resolution class: [width, height] buckets, each listed with its mirrored orientation.
 # Inline notes give the pixel count and the long:short side ratio of the pair.
 # The square bucket stays first: get_aspect_buckets() identifies a table by sizes[0][0].
 BIG_ASPECTS = [[576,576], # 331776 1:1
    [640,512],[512,640],  # 327680 1.25:1
    [640,448],[448,640],  # 286720 1.4286:1
    [704,448],[448,704],  # 315392 1.5714:1
    [832,384],[384,832],  # 319488 2.1667:1
    [1024,320],[320,1024], # 327680 3.2:1
    [1280,256],[256,1280], # 327680 5:1
 ]
 # 512 resolution class (the default): [width, height] buckets, each listed with its mirrored orientation.
 # Inline notes give the pixel count and the long:short side ratio of the pair.
 # The square bucket stays first: get_aspect_buckets() identifies a table by sizes[0][0].
 ASPECTS = [[512,512],      # 262144 1:1
    [576,448],[448,576],   # 258048 1.29:1
    [640,384],[384,640],   # 245760 1.667:1
    [768,320],[320,768],   # 245760 2.4:1
    [832,256],[256,832],   # 212992 3.25:1
    [896,256],[256,896],   # 229376 3.5:1
    [960,256],[256,960],   # 245760 3.75:1
    [1024,256],[256,1024], # 262144 4:1
    ]
 def get_aspect_buckets(resolution):
    """
    Return the aspect-ratio bucket table for a resolution class.

    resolution: requested square edge length in pixels; it is rounded down to a
        multiple of 64 before the lookup (e.g. 532 -> 512)
    returns: the bucket table whose first (square) entry has that edge length
    raises: ValueError if resolution is below 512, or if no table exists for the
        rounded value (previously None was returned silently and callers failed later)
    """
    if resolution < 512:
        raise ValueError("Resolution must be at least 512")

    rounded_resolution = int(resolution / 64) * 64 # round down to nearest 64

    for sizes in __get_all_aspects():
        # each table starts with its square bucket, so that edge identifies the table
        if sizes[0][0] == rounded_resolution:
            return sizes

    print(f" *** Could not find selected resolution: {rounded_resolution}, check your resolution in config YAML")
    raise ValueError(f"No aspect buckets defined for resolution {rounded_resolution}")
 def __get_all_aspects():
    """
    Collect every bucket table, ordered from the 512 class up to the 768 class.
    """
    tables = [ASPECTS, BIG_ASPECTS]
    tables += [HUGE_ASPECTS, MASSIVE_ASPECTS, GOD_ASPECTS]
    return tables
--- a/ldm/data/data_loader.py
+++ b/ldm/data/data_loader.py
@ -2,37 +2,7 @@ import os
 from PIL import Image
 import random
 from ldm.data.image_train_item import ImageTrainItem
-
+import ldm.data.aspects as aspects
 HUGE_ASPECTS = [[640,640], # 409600 1:1 
    [704,576],[576,704], # 405504 1:1.25
    [768,512],[512,768], # 393216 1:1.5
    [896,448],[448,896], # 401408 1:2
    [1024,384],[384,1024], # 393216 1:2.667
    [1280,320],[320,1280], # 409600 1:4
    [1408,256],[256,1408], # 360448 1:5.5
    [1472,256],[256,1472], # 376832 1:5.75
    [1536,256],[256,1536], # 393216 1:6
    [1600,256],[256,1600], # 409600 1:6.25
 ]
 BIG_ASPECTS = [[576,576], # 331776 1:1\
    [640,512],[512,640], # 327680 1.25:1\
    [704,448],[448,704], # 314928 1.5625:1
    [832,384],[384,832], # 317440 2.1667:1\
    [1024,320],[320,1024], # 327680 3.2:1\
    [1280,256],[256,1280], # 327680 5:1\
 ]
 ASPECTS = [[512,512], # 1 262144\
        [576,448],[448,576], # 1.29 258048\
        [640,384],[384,640], # 1.67 245760\
        [704,384],[384,704], # 1.83 245760\
        [768,320],[320,768], # 2.4 245760\
        [832,256],[256,832], # 3.25 212992\
        [896,256],[256,896], # 3.5 229376\
        [960,256],[256,960],  # 3.75 245760\
        [1024,256],[256,1024],  # 4 245760\
    ]
 class DataLoaderMultiAspect():
    """
@ -42,12 +12,13 @@ class DataLoaderMultiAspect():
    batch_size: number of images per batch
    flip_p: probability of flipping image horizontally (i.e. 0-0.5)
    """
-    def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, big_mode=0):
+    def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, resolution=512):
        self.image_paths = []
        self.debug_level = debug_level
        self.flip_p = flip_p
        self.big_mode = big_mode
        self.aspects = aspects.get_aspect_buckets(resolution)
        print(f"* DLMA resolution {resolution}, buckets: {self.aspects}")
        print(" Preloading images...")
        self.__recurse_data_root(self=self, recurse_root=data_root)
@ -57,11 +28,12 @@ class DataLoaderMultiAspect():
        if debug_level > 0: print(f" * DLMA Example: {self.image_caption_pairs[0]} images")
    def get_all_images(self):
        return self.image_caption_pairs
    @staticmethod
-    def __read_caption_from_file(self, file_path, fallback_caption):
+    def __read_caption_from_file(file_path, fallback_caption):
        caption = fallback_caption
        try:
            with open(file_path, 'r') as caption_file:
@ -91,15 +63,13 @@ class DataLoaderMultiAspect():
            else:
                caption = caption_from_filename
-            if debug_level > 1: print(f" * DLMA file: {pathname} with caption: {caption}")
+            #if debug_level > 1: print(f" * DLMA file: {pathname} with caption: {caption}")
            image = Image.open(pathname)
            width, height = image.size
            image_aspect = width / height
-            aspects = [ASPECTS, BIG_ASPECTS, HUGE_ASPECTS][self.big_mode]
+            target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
            target_wh = min(aspects, key=lambda x:abs(x[0]/x[1]-image_aspect))
            image_train_item = ImageTrainItem(image=None, caption=caption, target_wh=target_wh, pathname=pathname, flip_p=flip_p)
@ -129,7 +99,9 @@ class DataLoaderMultiAspect():
                truncate_count = len(buckets[bucket]) % batch_size
                current_bucket_size = len(buckets[bucket])
                buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
-                print(f"  ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}") if debug_level > 0 else None
+
                if debug_level > 0:
                    print(f"  ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}")
        # flatten the buckets
        image_caption_pairs = []
--- a/ldm/data/ed_validate.py
+++ b/ldm/data/ed_validate.py
@ -1,6 +1,5 @@
 import numpy as np
 from torch.utils.data import Dataset
 from torchvision import transforms
 from ldm.data.data_loader import DataLoaderMultiAspect as dlma
 import math
 import ldm.data.dl_singleton as dls
--- a/ldm/data/every_dream.py
+++ b/ldm/data/every_dream.py
@ -10,10 +10,10 @@ class EveryDreamBatch(Dataset):
    data_root: root path of all your training images, will be recursively searched for images
    repeats: how many times to repeat each image in the dataset
    flip_p: probability of flipping the image horizontally
-    debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=save crops to disk for inspection
+    debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=debug info per image, 3=save crops to disk for inspection
    batch_size: how many images to return in a batch
    conditional_dropout: probability of dropping the caption for a given image
-    big_mode: 0=normal, 1=big, 2=biggest
+    resolution: max resolution (relative to square)
    jitter: number of pixels to jitter the crop by, only for non-square images
    """
    def __init__(self,
@ -24,18 +24,22 @@ class EveryDreamBatch(Dataset):
                 batch_size=1,
                 set='train',
                 conditional_dropout=0.0,
-                 big_mode=0,
+                 resolution=512,
                 crop_jitter=0,
                 seed=555,
                 image_cache_size=200
                 ):
        self.data_root = data_root
        self.batch_size = batch_size
        self.debug_level = debug_level
        self.conditional_dropout = conditional_dropout
        self.crop_jitter = crop_jitter
        self.unloaded_to_idx = 0
        self.image_cache_size = image_cache_size
        if not dls.shared_dataloader:
            print(" * Creating new dataloader singleton")
-            dls.shared_dataloader = dlma(data_root=data_root, debug_level=debug_level, batch_size=self.batch_size, flip_p=flip_p, big_mode=big_mode)
+            dls.shared_dataloader = dlma(data_root=data_root, seed=seed, debug_level=debug_level, batch_size=self.batch_size, flip_p=flip_p, resolution=resolution)
        self.image_train_items = dls.shared_dataloader.get_all_images()
@ -54,20 +58,35 @@ class EveryDreamBatch(Dataset):
        idx = i % self.num_images
        image_train_item = self.image_train_items[idx]
        example = self.__get_image_for_trainer(image_train_item, self.debug_level)
        if self.unloaded_to_idx > idx:
            self.unloaded_to_idx = 0
        if idx % (self.batch_size*3) == 0 and idx > (self.batch_size * 5) and idx > self.image_cache_size:
            start_del = max(self.image_cache_size, self.unloaded_to_idx)
            self.unloaded_to_idx = int(idx / self.batch_size)*self.batch_size - self.batch_size*8
            print(f"{idx}: {start_del}, {self.unloaded_to_idx}") if self.debug_level > 1 else None
            if self.unloaded_to_idx > self.image_cache_size:
                for j in range(start_del, self.unloaded_to_idx):
                    del self.image_train_items[j].image
                if self.debug_level > 1: print(f" * Unloaded images from idx {start_del} to {self.unloaded_to_idx}")
        return example
    def __get_image_for_trainer(self, image_train_item: ImageTrainItem, debug_level=0):
        example = {}
-        save = debug_level > 1
+        save = debug_level > 2
        image_train_tmp = image_train_item.hydrate(crop=False, save=save, crop_jitter=self.crop_jitter)
        example["image"] = image_train_tmp.image
-        #if random.random() > self.conditional_dropout:
+        if random.random() > self.conditional_dropout:
-        example["caption"] = image_train_tmp.caption
+            example["caption"] = image_train_tmp.caption
-        #else:
+        else:
-        #    example["caption"] = " "
+            example["caption"] = " "
        return example
--- a/ldm/data/image_train_item.py
+++ b/ldm/data/image_train_item.py
@ -8,7 +8,11 @@ import os
 class ImageTrainItem(): 
    """
-    # [image, identifier, target_aspect, closest_aspect_wh(w,h), pathname]
+    image: PIL.Image
    identifier: caption,
    target_aspect: (width, height), 
    pathname: path to image file
    flip_p: probability of flipping image (0.0 to 1.0)
    """    
    def __init__(self, image: PIL.Image, caption: str, target_wh: list, pathname: str, flip_p=0.0):
        self.caption = caption
@ -18,49 +22,62 @@ class ImageTrainItem():
        self.cropped_img = None
        if image is None:
-            self.image = PIL.Image.new(mode='RGB',size=(1,1))
+            self.image = []
        else:
            self.image = image
    def hydrate(self, crop=False, save=False, crop_jitter=0):
-        self.image = PIL.Image.open(self.pathname).convert('RGB')
+        """
        crop: hard center crop to 512x512
        save: save the cropped image to disk, for manual inspection of resize/crop
        crop_jitter: randomly shift cropp by N pixels when using multiple aspect ratios to improve training quality
        """
        if not hasattr(self, 'image') or len(self.image) == 0:
            self.image = PIL.Image.open(self.pathname).convert('RGB')
-        width, height = self.image.size
+            width, height = self.image.size
-        if crop:
+            if crop:
-            cropped_img = self.__autocrop(self.image)
+                cropped_img = self.__autocrop(self.image)
-            self.image = cropped_img.resize((512,512), resample=PIL.Image.BICUBIC)
+                self.image = cropped_img.resize((512,512), resample=PIL.Image.BICUBIC)
        else:
            if width == 512 and height == 512:
                pass
            elif self.target_wh[0] == self.target_wh[1]:
                pass
            else:
-                width, height = self.image.size
+                if self.target_wh[0] == self.target_wh[1]:
-                image_aspect = width / height
+                    pass
                jitter_amount = random.randint(-crop_jitter, crop_jitter)
                jitter_amount = min(jitter_amount, int(abs(width-height)/2))
                target_aspect = self.target_wh[0] / self.target_wh[1]
                if image_aspect > target_aspect:
                    new_width = int(height * target_aspect) 
                    left = int((width - new_width) / 2) + jitter_amount
                    right = left + new_width
                    self.image = self.image.crop((left, 0, right, height))
                else: 
-                    new_height = int(width / target_aspect)
+                    width, height = self.image.size
-                    top = int((height - new_height) / 2) + jitter_amount
+                    image_aspect = width / height
-                    bottom = top + new_height
+                    jitter_amount = random.randint(0, crop_jitter)
-                    self.image = self.image.crop((0, top, width, bottom))
+                    target_aspect = self.target_wh[0] / self.target_wh[1]
-            self.image = self.image.resize(self.target_wh, resample=PIL.Image.BICUBIC)
+                    print(f"{target_aspect}, {self.target_wh}")
                    if image_aspect > target_aspect:
                        new_width = int(height * target_aspect)
                        jitter_amount = max(min(jitter_amount, int(abs(width-new_width)/2)), 0)
                        left = jitter_amount
                        right = left + new_width
                        print(f"crop left: {left}, right: {right}, jitteramt:{jitter_amount}, [{width}, {height}] img: {self.pathname}")
                        self.image = self.image.crop((left, 0, right, height))
                    else:
                        new_height = int(width / target_aspect)
                        jitter_amount = max(min(jitter_amount, int(abs(height-new_height)/2)), 0)
                        top = jitter_amount
                        bottom = top + new_height
                        print(f"crop top: {top}, bottom: {bottom}, jitteramt:{jitter_amount}, [{width}, {height}] img: {self.pathname}")
                        self.image = self.image.crop((0, top, width, bottom))
                self.image = self.image.resize(self.target_wh, resample=PIL.Image.BICUBIC)
-        self.image = self.flip(self.image)
+            self.image = self.flip(self.image)
-        if save: # for manual inspection
+        if type(self.image) is not np.ndarray:
-            base_name = os.path.basename(self.pathname)
+            if save: 
-            self.image.save(f"test/output/{random.randint(0,4)}/{base_name}")
+                base_name = os.path.basename(self.pathname)
                if not os.path.exists("test/output"):
                    os.makedirs("test/output")
                self.image.save(f"test/output/{base_name}")
-        self.image = np.array(self.image).astype(np.uint8)
+            self.image = np.array(self.image).astype(np.uint8)
-        self.image = (self.image / 127.5 - 1.0).astype(np.float32)
+            self.image = (self.image / 127.5 - 1.0).astype(np.float32)
        print(self.image.shape)
        return self
--- a/ldm/lr_scheduler.py
+++ b/ldm/lr_scheduler.py
@ -96,3 +96,35 @@ class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
            self.last_f = f
            return f
 class EveryDreamScheduler:
    """
    Learning-rate multiplier schedule: constant hold, then linear decay, then a floor.

    f_min: multiplier returned once the step reaches steps_to_min
    f_max: multiplier the decay line would have at step 0
    f_start: multiplier returned while the step is below warm_up_steps
    warm_up_steps: number of initial steps that return f_start
    steps_to_min: step at which the decay line reaches f_min
    verbosity_interval: print a status line every N steps, values <= 0 disable printing
    """
    def __init__(self, f_min=0.5, f_max=1.0, f_start=1.0, warm_up_steps=1000, steps_to_min=5000, verbosity_interval=100) -> None:
        self.f_min, self.f_max, self.f_start = f_min, f_max, f_start
        self.warm_up_steps, self.steps_to_min = warm_up_steps, steps_to_min
        self.last_f = 0.
        self.verbosity_interval = verbosity_interval

    def __call__(self, n, **kwargs):
        # calling the instance is shorthand for schedule()
        return self.schedule(n, **kwargs)

    def schedule(self, n, **kwargs):
        """
        n: current step
        returns: the multiplier for step n, also remembered in self.last_f
        """
        # the status line reports the multiplier from the previous call, not this one
        if self.verbosity_interval > 0 and n % self.verbosity_interval == 0:
            print(f"current step: {n}, recent lr-multiplier: {self.last_f:0.3f}, current cycle: {0}")

        if n < self.warm_up_steps:
            multiplier = self.f_start
        elif n >= self.steps_to_min:
            multiplier = self.f_min
        else:
            # the decay line is measured from step 0, not from the end of the hold phase
            span = self.f_max - self.f_min
            steps_left = self.steps_to_min - n
            multiplier = self.f_min + span * steps_left / (self.steps_to_min)

        self.last_f = multiplier
        return self.last_f
--- a/ldm/models/diffusion/ddpm.py
+++ b/ldm/models/diffusion/ddpm.py
@ -453,6 +453,7 @@ class LatentDiffusion(DDPM):
                 conditioning_key=None,
                 scale_factor=1.0,
                 scale_by_std=False,
                 scheduler_config=None,
                 *args, **kwargs):
        self.num_timesteps_cond = default(num_timesteps_cond, 1)
@ -465,7 +466,7 @@ class LatentDiffusion(DDPM):
            conditioning_key = None
        ckpt_path = kwargs.pop("ckpt_path", None)
        ignore_keys = kwargs.pop("ignore_keys", [])
-        super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
+        super().__init__(conditioning_key=conditioning_key, scheduler_config=scheduler_config, *args, **kwargs)
        self.concat_mode = concat_mode
        self.cond_stage_trainable = cond_stage_trainable
        self.cond_stage_key = cond_stage_key
@ -704,8 +705,6 @@ class LatentDiffusion(DDPM):
            if cond_key != self.first_stage_key:
                if cond_key in ['caption', 'coordinates_bbox']:
                    xc = batch[cond_key]
                elif cond_key == 'class_label':
                    xc = batch
                else:
                    xc = super().get_input(batch, cond_key).to(self.device)
            else:
--- a/test/test_aspects.py
+++ b/test/test_aspects.py
@ -0,0 +1,15 @@
 import ldm.data.aspects as aspects
 # every supported resolution class must keep all of its buckets at or below the square's pixel count
 resolutions = [512, 576, 640, 704, 768]
 # NOTE(review): never iterated below; 532 would round down to the 512 class in get_aspect_buckets
 oops = [532, 576, 640, 704, 768]
 for res in resolutions:
    buckets = aspects.get_aspect_buckets(res)
    print(f" *{res} buckets: {buckets}")
    square = buckets[0]
    max_pixels = square[0] * square[1]
    for bucket in buckets:
        bucket_w, bucket_h = bucket
        pixels = bucket_w * bucket_h
        print (f"max: {max_pixels}: {bucket}: {pixels}, pct {pixels/max_pixels:.2f}")
        assert pixels <= max_pixels, f" * {bucket} is larger than {max_pixels}"
--- a/test/test_batch.py
+++ b/test/test_batch.py
@ -0,0 +1,36 @@
 # script to test data loader by itself
 # run from training root, edit the data_root manually
 from  ldm.data.every_dream import EveryDreamBatch
 import time
 start_time = time.perf_counter()
 #data_root = "r:/everydream-trainer/test/input"
 data_root = "r:/everydream-trainer/training_samples"
 batch_size = 6
 repeats = 3
 every_dream_batch = EveryDreamBatch(
    data_root=data_root,
    flip_p=0.0,
    debug_level=2,
    batch_size=batch_size,
    repeats=repeats,
    crop_jitter=25,
    conditional_dropout=0.3,
    resolution=512,
 )
 print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
 print(f" max test cycles: {int(len(every_dream_batch) / batch_size)}, batch_size: {batch_size}, repeats: {repeats}")
 i = 0
 # walk the dataset one batch at a time until at least 99 items have been pulled
 while i < 99: # and i < len(every_dream_batch):
    curr_batch = [every_dream_batch[j] for j in range(i, i + batch_size)]
    shapes = [example['image'].shape for example in curr_batch]
    # all in batch must have the same image size
    assert all(shape == shapes[0] for shape in shapes)
    # the first dimension must be larger than 2
    assert all(shape[0] > 2 for shape in shapes)
    #print(f"idx: {i}, batch sample: shape: {curr_batch[0]['image'].shape}: {curr_batch[0]['caption']}")
    i += batch_size
 print(f" *TEST* test cycles: {i}")
 print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
 elapsed = time.perf_counter() - start_time
 print(f"{__file__} executed in {elapsed:5.2f} seconds.")
--- a/test/test_crop.py
+++ b/test/test_crop.py
@ -0,0 +1,48 @@
 # script to test what cropping does to your images
 # execute from root everydream-trainer folder
 # ex.
 #      (everydream) R:\everydream-trainer>python test/test_crop.py
 # dumps to /test/output
 from  ldm.data.every_dream import EveryDreamBatch
 import time
 import argparse
 # command line: optional --data_root and --resolution overrides
 parser = argparse.ArgumentParser()
 parser.add_argument('--data_root', type=str, default=None, help='root path of all your training images, will be recursively searched for images')
 parser.add_argument('--resolution', type=int, default=512, help='resolution class, 512, 576, 640, 704, or 768')
 args = parser.parse_args()
 s = time.perf_counter()
 # put in your own data_root here, WARNING don't do this on a lot of images unless you are prepared for it...
 if args.data_root is None:
    data_root = "R:/everydream-trainer/test/input"
 else:
    data_root = args.data_root
 debug_level = 3 # 3 = dump images to disk after cropping and a bunch of crap into the console be warned
 batch_size = 1
 repeats = 1
 crop_jitter = 50
 resolution = args.resolution # 512, 576, 640, 704, 768
 # debug_level is passed through (it used to be hard-coded to 3 here) so editing the setting above takes effect
 every_dream_batch = EveryDreamBatch(data_root=data_root, flip_p=0.0, debug_level=debug_level, batch_size=batch_size, repeats=repeats, crop_jitter=crop_jitter, conditional_dropout=0.1, resolution=resolution)
 print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
 print(f" max test cycles: {int(len(every_dream_batch) / batch_size)}, batch_size: {batch_size}, repeats: {repeats}")
 i = 0
 # pull every item once, one batch at a time; fetching an item is what triggers the crop
 while i < len(every_dream_batch):
    curr_batch = [every_dream_batch[j] for j in range(i, i + batch_size)]
    shapes = [e['image'].shape for e in curr_batch]
    # all images in a batch must share one shape
    assert all(x == shapes[0] for x in shapes)
    assert all(x[0] > 2 for x in shapes)
    i += batch_size
 print(f" *TEST* test cycles: {i}")
 print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
 elapsed = time.perf_counter() - s
 print(f"{__file__} executed in {elapsed:5.2f} seconds.")
--- a/test/test_dl.py
+++ b/test/test_dl.py
@ -0,0 +1,18 @@
 # script to test data loader by itself
 # run from training root, edit the data_root manually
 # python test/test_dl.py
 import ldm.data.data_loader as dl
 # build the multi-aspect loader on its own and dump every item it produced
 data_root = "r:/everydream-trainer/test/input"
 loader = dl.DataLoaderMultiAspect(data_root=data_root, batch_size=2, seed=555, debug_level=2)
 pairs = loader.get_all_images()
 pair_count = len(pairs)
 print(f"Loaded {pair_count} image-caption pairs")
 for pair in pairs:
    print(pair)
 print(f"**** Done loading. Loaded {len(pairs)} images from data_root: {data_root} ****")
--- a/test/test_image_train_item.py
+++ b/test/test_image_train_item.py
@ -0,0 +1,24 @@
 # script to test ImageTrainItem hydration by itself
 # run from training root, edit the data_root manually
 # python test/test_image_train_item.py
 from ldm.data.image_train_item import ImageTrainItem
 import glob
 import os
 # build the path with os.path.join: the old literal relied on the invalid "\m" escape and only worked on Windows
 data_root = os.path.join("training_samples", "multiaspect")
 output_dir = os.path.join(data_root, "output")
 # saving fails if the target folder is missing, so create it up front
 os.makedirs(output_dir, exist_ok=True)
 for f in glob.iglob(f"{data_root}/*.jpg"):
    # caption is the file name without its extension
    caption = os.path.splitext(os.path.basename(f))[0]
    # hydrate the same file 40 times so the results can be compared side by side
    for i in range(40):
        # positional args: image, caption, target_wh, pathname, flip_p
        my_iti = ImageTrainItem(None, caption, [512,512], f, 0.0)
        my_iti = my_iti.hydrate()
        out_file_path = os.path.join(output_dir, f"{caption}_{i}.jpg")
        # NOTE(review): the hydrate() shown in this commit never assigns self.cropped_img,
        # so this may be None here -- confirm against ImageTrainItem before relying on it
        my_iti.cropped_img.save(out_file_path)
--- a/test/test_linearsch.py
+++ b/test/test_linearsch.py
@ -0,0 +1,7 @@
 import ldm.lr_scheduler as lrs
 # prints the multiplier for the first 50 steps: hold until step 10, decay until step 25, then the floor
 scheduler_args = dict(warm_up_steps=10, f_min=5.0e-1, f_max=1.0, f_start=1.0, steps_to_min=25, verbosity_interval=5)
 sch = lrs.EveryDreamScheduler(**scheduler_args)
 for step in range(50):
    multiplier = sch(step)
    print(f"step {step}: {multiplier}")