unload old images for better sys ram use, fix up crop jitter
This commit is contained in:
parent
d26fabac14
commit
b47c5c0ed3
|
@ -65,7 +65,7 @@ model:
|
|||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 6 # prefer highest possible without getting CUDA Out of Memory error
|
||||
batch_size: 6 # prefer highest possible without getting CUDA Out of Memory error, A100 40GB =~20 80GB= ~48
|
||||
num_workers: 6
|
||||
wrap: falsegit
|
||||
train:
|
||||
|
@ -73,9 +73,10 @@ data:
|
|||
params:
|
||||
repeats: 5 # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
|
||||
debug_level: 1 # 1 to print if images are dropped due to multiple-aspect ratio image batching
|
||||
conditional_dropout: 0.01 # experimental, likelihood to drop the caption, may help with poorly captioned images
|
||||
conditional_dropout: 0.08 # experimental, likelihood to drop the caption, may help with poorly captioned images
|
||||
crop_jitter: 5 # adds N pixels of jitter to cropping algorithm for non-square images only
|
||||
big_mode: 0 # set to 1 or 2 to use larger image sizes for training, USES LOTS OF VRAM! Requires 40GB+
|
||||
resolution: 512 # 512, 576, or 640, increases VRAM substantially
|
||||
seed: 555 # seed used to shuffle the dataset ordering, keep constant for reproducibility
|
||||
validation:
|
||||
target: ldm.data.ed_validate.EDValidateBatch
|
||||
params:
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
model:
|
||||
base_learning_rate: 1.2e-6
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
linear_end: 0.0120
|
||||
num_timesteps_cond: 1
|
||||
log_every_t: 300
|
||||
timesteps: 1000
|
||||
first_stage_key: image
|
||||
cond_stage_key: caption
|
||||
image_size: 64
|
||||
channels: 4
|
||||
cond_stage_trainable: true
|
||||
conditioning_key: crossattn
|
||||
monitor: val/loss_simple_ema
|
||||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
unfreeze_model: True
|
||||
model_lr: 1.2e-6
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
params:
|
||||
image_size: 32 # unused
|
||||
in_channels: 4
|
||||
out_channels: 4
|
||||
model_channels: 320
|
||||
attention_resolutions: [ 4, 2, 1 ]
|
||||
num_res_blocks: 2
|
||||
channel_mult: [ 1, 2, 4, 4 ]
|
||||
num_heads: 8
|
||||
use_spatial_transformer: True
|
||||
transformer_depth: 1
|
||||
context_dim: 768
|
||||
use_checkpoint: True
|
||||
legacy: False
|
||||
|
||||
first_stage_config:
|
||||
target: ldm.models.autoencoder.AutoencoderKL
|
||||
params:
|
||||
embed_dim: 4
|
||||
monitor: val/rec_loss
|
||||
ddconfig:
|
||||
double_z: true
|
||||
z_channels: 4
|
||||
resolution: 512
|
||||
in_channels: 3
|
||||
out_ch: 3
|
||||
ch: 128
|
||||
ch_mult:
|
||||
- 1
|
||||
- 2
|
||||
- 4
|
||||
- 4
|
||||
num_res_blocks: 2
|
||||
attn_resolutions: []
|
||||
dropout: 0.0
|
||||
lossconfig:
|
||||
target: torch.nn.Identity
|
||||
|
||||
cond_stage_config:
|
||||
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||
|
||||
# Data module configuration: batch/worker settings plus the train, validation and test datasets.
data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6 # prefer highest possible without getting CUDA Out of Memory error, A100 40GB =~20 80GB= ~48
    num_workers: 6
    wrap: false # was "falsegit", which YAML reads as a non-empty string instead of boolean false
    train:
      target: ldm.data.every_dream.EveryDreamBatch
      params:
        repeats: 1 # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
        debug_level: 1 # 1 to print if images are dropped due to multiple-aspect ratio image batching
        conditional_dropout: 0.08 # experimental, likelihood to drop the caption, may help with poorly captioned images
        crop_jitter: 15 # adds N pixels of jitter to cropping algorithm for non-square images only
        # NOTE(review): the EveryDreamBatch signature in this commit appears to take `resolution` instead of `big_mode` - confirm this key is still accepted
        big_mode: 0 # set to 1 or 2 to use larger image sizes for training, USES LOTS OF VRAM! Requires 40GB+
    validation:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
        repeats: 0.3
    test:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
        repeats: 0.2
|
||||
|
||||
lightning:
|
||||
modelcheckpoint:
|
||||
params:
|
||||
every_n_epochs: 1 # produce a ckpt every epoch, leave 1!
|
||||
#every_n_train_steps: 1400 # can only use epoch or train step checkpoints
|
||||
save_top_k: 6 # save the best N ckpts according to loss, can reduce to save disk space but suggest at LEAST 2, more if you have max_epochs below higher!
|
||||
save_last: True
|
||||
filename: "{epoch:02d}-{step:05d}"
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 500
|
||||
max_images: 16
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
max_epochs: 1 # better to run several epochs and test your checkpoints! Try 4-5, you get a checkpoint every epoch to test!
|
||||
max_steps: 99000 # better to end on epochs not steps, especially with >500 images to ensure even distribution, but you can set this if you really want...
|
||||
check_val_every_n_epoch: 1
|
||||
gpus: 0,
|
|
@ -1,5 +1,5 @@
|
|||
model:
|
||||
base_learning_rate: 1.0e-6
|
||||
base_learning_rate: 1.2e-6
|
||||
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||
params:
|
||||
linear_start: 0.00085
|
||||
|
@ -17,16 +17,17 @@ model:
|
|||
scale_factor: 0.18215
|
||||
use_ema: False
|
||||
unfreeze_model: True
|
||||
#model_lr: 1.0e-6
|
||||
model_lr: 1.1e-6
|
||||
#use_scheduler: True
|
||||
scheduler_config:
|
||||
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||
target: ldm.lr_scheduler.EveryDreamScheduler
|
||||
params:
|
||||
warm_up_steps: [ 5 ]
|
||||
cycle_lengths: [ 1000 ] # incredibly large number to prevent corner cases
|
||||
verbosity_interval: 25 # how often to print LR updates
|
||||
f_start: [ 1.e-6 ]
|
||||
f_max: [ 1.e-6 ] # 1.
|
||||
f_min: [ 1.e-8 ] # 1.
|
||||
f_start: 5.0e-1 # starting LR multiplier
|
||||
warm_up_steps: 50 # number of steps to warm up to f_start before decaying LR
|
||||
f_max: 1.0 # maximum LR multiplier
|
||||
f_min: 5.0e-1 # minimum LR multiplier
|
||||
steps_to_min: 10000 # number of steps to decay from f_max to f_min
|
||||
verbosity_interval: 200 # how often to print LR multiplier (steps)
|
||||
|
||||
unet_config:
|
||||
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||
|
@ -74,19 +75,22 @@ model:
|
|||
data:
|
||||
target: main.DataModuleFromConfig
|
||||
params:
|
||||
batch_size: 6
|
||||
batch_size: 6
|
||||
num_workers: 12
|
||||
wrap: falsegit
|
||||
train:
|
||||
target: ldm.data.every_dream.EveryDreamBatch
|
||||
params:
|
||||
repeats: 5
|
||||
flip_p: 0
|
||||
debug_level: 1
|
||||
repeats: 1 # rough suggestions: 5 with 5000+ images, 15 for 1000 images, use micro yaml for <100
|
||||
debug_level: 1 # 1 to print if images are dropped due to multiple-aspect ratio image batching
|
||||
conditional_dropout: 0.08 # experimental, likelihood to drop the caption, may help with poorly captioned images
|
||||
crop_jitter: 20 # adds N pixels of jitter to cropping algorithm for non-square images only
|
||||
resolution: 512 # defines max pixels for all aspects, 512, 576, 640, 704, or 768
|
||||
seed: 555 # seed used to shuffle the dataset, keep constant for reproducibility
|
||||
validation:
|
||||
target: ldm.data.ed_validate.EDValidateBatch
|
||||
params:
|
||||
repeats: 0.5
|
||||
repeats: 0.25
|
||||
test:
|
||||
target: ldm.data.ed_validate.EDValidateBatch
|
||||
params:
|
||||
|
@ -96,21 +100,21 @@ lightning:
|
|||
modelcheckpoint:
|
||||
params:
|
||||
every_n_epochs: 1
|
||||
#every_n_train_steps: 1400 # can only use every_n_epochs OR every_n_train_steps, suggest you stick with epochs
|
||||
#every_n_train_steps: 1500 # can only use every_n_epochs OR every_n_train_steps, suggest you stick with epochs
|
||||
save_last: True
|
||||
save_top_k: 5
|
||||
save_top_k: 99
|
||||
filename: "{epoch:02d}-{step:05d}"
|
||||
callbacks:
|
||||
image_logger:
|
||||
target: main.ImageLogger
|
||||
params:
|
||||
batch_frequency: 100
|
||||
batch_frequency: 400
|
||||
max_images: 16
|
||||
increase_log_steps: False
|
||||
|
||||
trainer:
|
||||
benchmark: True
|
||||
max_epochs: 4
|
||||
max_epochs: 5
|
||||
max_steps: 99000 # better to end on epochs not steps, especially with >500 images to ensure even distribution, but you can set this if you really want...
|
||||
check_val_every_n_epoch: 1
|
||||
gpus: 0,
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 196 KiB |
|
@ -0,0 +1,76 @@
|
|||
|
||||
# Aspect-ratio bucket tables for multi-aspect training.
#
# Every table is a list of [width, height] pairs. The first entry is the square
# bucket whose side length names the table (512, 576, 640, 704 or 768); the
# remaining entries come in landscape/portrait pairs. The trailing comment on
# each row gives width * height and the long-side:short-side ratio.

GOD_ASPECTS = [[768,768],     # 589824 1:1
    [832,704],[704,832],      # 585728 1.182:1
    [896,640],[640,896],      # 573440 1.4:1
    [960,576],[576,960],      # 552960 1.667:1
    [1024,576],[576,1024],    # 589824 1.778:1
    [1088,512],[512,1088],    # 557056 2.125:1
    [1152,512],[512,1152],    # 589824 2.25:1
    [1216,448],[448,1216],    # 544768 2.714:1
    [1280,448],[448,1280],    # 573440 2.857:1
    [1344,384],[384,1344],    # 516096 3.5:1
    [1408,384],[384,1408],    # 540672 3.667:1
    [1472,320],[320,1472],    # 471040 4.6:1
    [1536,320],[320,1536],    # 491520 4.8:1
]

MASSIVE_ASPECTS = [[704,704], # 495616 1:1
    [768,640],[640,768],      # 491520 1.2:1
    [832,576],[576,832],      # 479232 1.444:1
    [896,512],[512,896],      # 458752 1.75:1
    [960,512],[512,960],      # 491520 1.875:1
    [1024,448],[448,1024],    # 458752 2.286:1
    [1088,448],[448,1088],    # 487424 2.429:1
    [1152,384],[384,1152],    # 442368 3:1
    [1216,384],[384,1216],    # 466944 3.167:1
    [1280,384],[384,1280],    # 491520 3.333:1
    [1280,320],[320,1280],    # 409600 4:1
    [1408,320],[320,1408],    # 450560 4.4:1
    [1536,320],[320,1536],    # 491520 4.8:1
]

HUGE_ASPECTS = [[640,640],    # 409600 1:1
    [704,576],[576,704],      # 405504 1.222:1
    [768,512],[512,768],      # 393216 1.5:1
    [896,448],[448,896],      # 401408 2:1
    [1024,384],[384,1024],    # 393216 2.667:1
    [1280,320],[320,1280],    # 409600 4:1
    [1408,256],[256,1408],    # 360448 5.5:1
    [1472,256],[256,1472],    # 376832 5.75:1
    [1536,256],[256,1536],    # 393216 6:1
    [1600,256],[256,1600],    # 409600 6.25:1
]

BIG_ASPECTS = [[576,576],     # 331776 1:1
    [640,512],[512,640],      # 327680 1.25:1
    [640,448],[448,640],      # 286720 1.429:1
    [704,448],[448,704],      # 315392 1.571:1
    [832,384],[384,832],      # 319488 2.167:1
    [1024,320],[320,1024],    # 327680 3.2:1
    [1280,256],[256,1280],    # 327680 5:1
]

ASPECTS = [[512,512],         # 262144 1:1
    [576,448],[448,576],      # 258048 1.286:1
    [640,384],[384,640],      # 245760 1.667:1
    [768,320],[320,768],      # 245760 2.4:1
    [832,256],[256,832],      # 212992 3.25:1
    [896,256],[256,896],      # 229376 3.5:1
    [960,256],[256,960],      # 245760 3.75:1
    [1024,256],[256,1024],    # 262144 4:1
]

def get_aspect_buckets(resolution):
    """
    Return the bucket table whose square bucket matches the requested resolution.

    resolution: requested square side length; rounded down to the nearest multiple of 64 before lookup
    returns: one of the module-level tables (a list of [width, height] pairs)
    raises: ValueError if resolution is below 512, or if no table starts with a square of the rounded size
    """
    if resolution < 512:
        raise ValueError("Resolution must be at least 512")

    rounded_resolution = int(resolution / 64) * 64 # round down to nearest 64

    for sizes in __get_all_aspects():
        # the first entry of each table is its square [N, N] bucket, which identifies the table
        if sizes[0][0] == rounded_resolution:
            return sizes

    # Previously a failed lookup returned None without any message; fail loudly so a
    # bad config value is reported here instead of crashing later in the data loader.
    raise ValueError(f" *** Could not find selected resolution: {rounded_resolution}, check your resolution in config YAML")

def __get_all_aspects():
    """
    Return every bucket table, ordered from the smallest square size to the largest.
    """
    return [ASPECTS, BIG_ASPECTS, HUGE_ASPECTS, MASSIVE_ASPECTS, GOD_ASPECTS]
|
|
@ -2,37 +2,7 @@ import os
|
|||
from PIL import Image
|
||||
import random
|
||||
from ldm.data.image_train_item import ImageTrainItem
|
||||
|
||||
HUGE_ASPECTS = [[640,640], # 409600 1:1
|
||||
[704,576],[576,704], # 405504 1:1.25
|
||||
[768,512],[512,768], # 393216 1:1.5
|
||||
[896,448],[448,896], # 401408 1:2
|
||||
[1024,384],[384,1024], # 393216 1:2.667
|
||||
[1280,320],[320,1280], # 409600 1:4
|
||||
[1408,256],[256,1408], # 360448 1:5.5
|
||||
[1472,256],[256,1472], # 376832 1:5.75
|
||||
[1536,256],[256,1536], # 393216 1:6
|
||||
[1600,256],[256,1600], # 409600 1:6.25
|
||||
]
|
||||
|
||||
BIG_ASPECTS = [[576,576], # 331776 1:1\
|
||||
[640,512],[512,640], # 327680 1.25:1\
|
||||
[704,448],[448,704], # 314928 1.5625:1
|
||||
[832,384],[384,832], # 317440 2.1667:1\
|
||||
[1024,320],[320,1024], # 327680 3.2:1\
|
||||
[1280,256],[256,1280], # 327680 5:1\
|
||||
]
|
||||
|
||||
ASPECTS = [[512,512], # 1 262144\
|
||||
[576,448],[448,576], # 1.29 258048\
|
||||
[640,384],[384,640], # 1.67 245760\
|
||||
[704,384],[384,704], # 1.83 245760\
|
||||
[768,320],[320,768], # 2.4 245760\
|
||||
[832,256],[256,832], # 3.25 212992\
|
||||
[896,256],[256,896], # 3.5 229376\
|
||||
[960,256],[256,960], # 3.75 245760\
|
||||
[1024,256],[256,1024], # 4 245760\
|
||||
]
|
||||
import ldm.data.aspects as aspects
|
||||
|
||||
class DataLoaderMultiAspect():
|
||||
"""
|
||||
|
@ -42,12 +12,13 @@ class DataLoaderMultiAspect():
|
|||
batch_size: number of images per batch
|
||||
flip_p: probability of flipping image horizontally (i.e. 0-0.5)
|
||||
"""
|
||||
def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, big_mode=0):
|
||||
def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, resolution=512):
|
||||
self.image_paths = []
|
||||
self.debug_level = debug_level
|
||||
self.flip_p = flip_p
|
||||
self.big_mode = big_mode
|
||||
|
||||
self.aspects = aspects.get_aspect_buckets(resolution)
|
||||
print(f"* DLMA resolution {resolution}, buckets: {self.aspects}")
|
||||
print(" Preloading images...")
|
||||
|
||||
self.__recurse_data_root(self=self, recurse_root=data_root)
|
||||
|
@ -56,12 +27,13 @@ class DataLoaderMultiAspect():
|
|||
self.image_caption_pairs = self.__bucketize_images(prepared_train_data, batch_size=batch_size, debug_level=debug_level)
|
||||
|
||||
if debug_level > 0: print(f" * DLMA Example: {self.image_caption_pairs[0]} images")
|
||||
|
||||
|
||||
def get_all_images(self):
|
||||
return self.image_caption_pairs
|
||||
|
||||
@staticmethod
|
||||
def __read_caption_from_file(self, file_path, fallback_caption):
|
||||
def __read_caption_from_file(file_path, fallback_caption):
|
||||
caption = fallback_caption
|
||||
try:
|
||||
with open(file_path, 'r') as caption_file:
|
||||
|
@ -91,15 +63,13 @@ class DataLoaderMultiAspect():
|
|||
else:
|
||||
caption = caption_from_filename
|
||||
|
||||
if debug_level > 1: print(f" * DLMA file: {pathname} with caption: {caption}")
|
||||
#if debug_level > 1: print(f" * DLMA file: {pathname} with caption: {caption}")
|
||||
|
||||
image = Image.open(pathname)
|
||||
width, height = image.size
|
||||
image_aspect = width / height
|
||||
|
||||
aspects = [ASPECTS, BIG_ASPECTS, HUGE_ASPECTS][self.big_mode]
|
||||
|
||||
target_wh = min(aspects, key=lambda x:abs(x[0]/x[1]-image_aspect))
|
||||
target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
|
||||
|
||||
image_train_item = ImageTrainItem(image=None, caption=caption, target_wh=target_wh, pathname=pathname, flip_p=flip_p)
|
||||
|
||||
|
@ -129,7 +99,9 @@ class DataLoaderMultiAspect():
|
|||
truncate_count = len(buckets[bucket]) % batch_size
|
||||
current_bucket_size = len(buckets[bucket])
|
||||
buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
|
||||
print(f" ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}") if debug_level > 0 else None
|
||||
|
||||
if debug_level > 0:
|
||||
print(f" ** Bucket {bucket} with {current_bucket_size} will drop {truncate_count} images due to batch size {batch_size}")
|
||||
|
||||
# flatten the buckets
|
||||
image_caption_pairs = []
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import numpy as np
|
||||
from torch.utils.data import Dataset
|
||||
from torchvision import transforms
|
||||
from ldm.data.data_loader import DataLoaderMultiAspect as dlma
|
||||
import math
|
||||
import ldm.data.dl_singleton as dls
|
||||
|
|
|
@ -10,10 +10,10 @@ class EveryDreamBatch(Dataset):
|
|||
data_root: root path of all your training images, will be recursively searched for images
|
||||
repeats: how many times to repeat each image in the dataset
|
||||
flip_p: probability of flipping the image horizontally
|
||||
debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=save crops to disk for inspection
|
||||
debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=debug info per image, 3=save crops to disk for inspection
|
||||
batch_size: how many images to return in a batch
|
||||
conditional_dropout: probability of dropping the caption for a given image
|
||||
big_mode: 0=normal, 1=big, 2=biggest
|
||||
resolution: max resolution (relative to square)
|
||||
jitter: number of pixels to jitter the crop by, only for non-square images
|
||||
"""
|
||||
def __init__(self,
|
||||
|
@ -24,18 +24,22 @@ class EveryDreamBatch(Dataset):
|
|||
batch_size=1,
|
||||
set='train',
|
||||
conditional_dropout=0.0,
|
||||
big_mode=0,
|
||||
resolution=512,
|
||||
crop_jitter=0,
|
||||
seed=555,
|
||||
image_cache_size=200
|
||||
):
|
||||
self.data_root = data_root
|
||||
self.batch_size = batch_size
|
||||
self.debug_level = debug_level
|
||||
self.conditional_dropout = conditional_dropout
|
||||
self.crop_jitter = crop_jitter
|
||||
self.unloaded_to_idx = 0
|
||||
self.image_cache_size = image_cache_size
|
||||
|
||||
if not dls.shared_dataloader:
|
||||
print(" * Creating new dataloader singleton")
|
||||
dls.shared_dataloader = dlma(data_root=data_root, debug_level=debug_level, batch_size=self.batch_size, flip_p=flip_p, big_mode=big_mode)
|
||||
dls.shared_dataloader = dlma(data_root=data_root, seed=seed, debug_level=debug_level, batch_size=self.batch_size, flip_p=flip_p, resolution=resolution)
|
||||
|
||||
self.image_train_items = dls.shared_dataloader.get_all_images()
|
||||
|
||||
|
@ -54,20 +58,35 @@ class EveryDreamBatch(Dataset):
|
|||
idx = i % self.num_images
|
||||
image_train_item = self.image_train_items[idx]
|
||||
example = self.__get_image_for_trainer(image_train_item, self.debug_level)
|
||||
|
||||
if self.unloaded_to_idx > idx:
|
||||
self.unloaded_to_idx = 0
|
||||
|
||||
if idx % (self.batch_size*3) == 0 and idx > (self.batch_size * 5) and idx > self.image_cache_size:
|
||||
start_del = max(self.image_cache_size, self.unloaded_to_idx)
|
||||
self.unloaded_to_idx = int(idx / self.batch_size)*self.batch_size - self.batch_size*8
|
||||
|
||||
print(f"{idx}: {start_del}, {self.unloaded_to_idx}") if self.debug_level > 1 else None
|
||||
|
||||
if self.unloaded_to_idx > self.image_cache_size:
|
||||
for j in range(start_del, self.unloaded_to_idx):
|
||||
del self.image_train_items[j].image
|
||||
if self.debug_level > 1: print(f" * Unloaded images from idx {start_del} to {self.unloaded_to_idx}")
|
||||
|
||||
return example
|
||||
|
||||
def __get_image_for_trainer(self, image_train_item: ImageTrainItem, debug_level=0):
|
||||
example = {}
|
||||
|
||||
save = debug_level > 1
|
||||
save = debug_level > 2
|
||||
|
||||
image_train_tmp = image_train_item.hydrate(crop=False, save=save, crop_jitter=self.crop_jitter)
|
||||
|
||||
example["image"] = image_train_tmp.image
|
||||
|
||||
#if random.random() > self.conditional_dropout:
|
||||
example["caption"] = image_train_tmp.caption
|
||||
#else:
|
||||
# example["caption"] = " "
|
||||
if random.random() > self.conditional_dropout:
|
||||
example["caption"] = image_train_tmp.caption
|
||||
else:
|
||||
example["caption"] = " "
|
||||
|
||||
return example
|
||||
|
|
|
@ -8,7 +8,11 @@ import os
|
|||
|
||||
class ImageTrainItem():
|
||||
"""
|
||||
# [image, identifier, target_aspect, closest_aspect_wh(w,h), pathname]
|
||||
image: PIL.Image
|
||||
identifier: caption,
|
||||
target_aspect: (width, height),
|
||||
pathname: path to image file
|
||||
flip_p: probability of flipping image (0.0 to 1.0)
|
||||
"""
|
||||
def __init__(self, image: PIL.Image, caption: str, target_wh: list, pathname: str, flip_p=0.0):
|
||||
self.caption = caption
|
||||
|
@ -18,49 +22,62 @@ class ImageTrainItem():
|
|||
self.cropped_img = None
|
||||
|
||||
if image is None:
|
||||
self.image = PIL.Image.new(mode='RGB',size=(1,1))
|
||||
self.image = []
|
||||
else:
|
||||
self.image = image
|
||||
|
||||
def hydrate(self, crop=False, save=False, crop_jitter=0):
|
||||
self.image = PIL.Image.open(self.pathname).convert('RGB')
|
||||
"""
|
||||
crop: hard center crop to 512x512
|
||||
save: save the cropped image to disk, for manual inspection of resize/crop
|
||||
crop_jitter: randomly shift crop by N pixels when using multiple aspect ratios to improve training quality
|
||||
"""
|
||||
if not hasattr(self, 'image') or len(self.image) == 0:
|
||||
self.image = PIL.Image.open(self.pathname).convert('RGB')
|
||||
|
||||
width, height = self.image.size
|
||||
if crop:
|
||||
cropped_img = self.__autocrop(self.image)
|
||||
self.image = cropped_img.resize((512,512), resample=PIL.Image.BICUBIC)
|
||||
else:
|
||||
if width == 512 and height == 512:
|
||||
pass
|
||||
elif self.target_wh[0] == self.target_wh[1]:
|
||||
pass
|
||||
else:
|
||||
width, height = self.image.size
|
||||
image_aspect = width / height
|
||||
jitter_amount = random.randint(-crop_jitter, crop_jitter)
|
||||
jitter_amount = min(jitter_amount, int(abs(width-height)/2))
|
||||
target_aspect = self.target_wh[0] / self.target_wh[1]
|
||||
if image_aspect > target_aspect:
|
||||
new_width = int(height * target_aspect)
|
||||
left = int((width - new_width) / 2) + jitter_amount
|
||||
right = left + new_width
|
||||
self.image = self.image.crop((left, 0, right, height))
|
||||
else:
|
||||
new_height = int(width / target_aspect)
|
||||
top = int((height - new_height) / 2) + jitter_amount
|
||||
bottom = top + new_height
|
||||
self.image = self.image.crop((0, top, width, bottom))
|
||||
self.image = self.image.resize(self.target_wh, resample=PIL.Image.BICUBIC)
|
||||
width, height = self.image.size
|
||||
if crop:
|
||||
cropped_img = self.__autocrop(self.image)
|
||||
self.image = cropped_img.resize((512,512), resample=PIL.Image.BICUBIC)
|
||||
else:
|
||||
if self.target_wh[0] == self.target_wh[1]:
|
||||
pass
|
||||
else:
|
||||
width, height = self.image.size
|
||||
image_aspect = width / height
|
||||
jitter_amount = random.randint(0, crop_jitter)
|
||||
target_aspect = self.target_wh[0] / self.target_wh[1]
|
||||
print(f"{target_aspect}, {self.target_wh}")
|
||||
if image_aspect > target_aspect:
|
||||
new_width = int(height * target_aspect)
|
||||
jitter_amount = max(min(jitter_amount, int(abs(width-new_width)/2)), 0)
|
||||
left = jitter_amount
|
||||
right = left + new_width
|
||||
print(f"crop left: {left}, right: {right}, jitteramt:{jitter_amount}, [{width}, {height}] img: {self.pathname}")
|
||||
self.image = self.image.crop((left, 0, right, height))
|
||||
else:
|
||||
new_height = int(width / target_aspect)
|
||||
jitter_amount = max(min(jitter_amount, int(abs(height-new_height)/2)), 0)
|
||||
top = jitter_amount
|
||||
bottom = top + new_height
|
||||
print(f"crop top: {top}, bottom: {bottom}, jitteramt:{jitter_amount}, [{width}, {height}] img: {self.pathname}")
|
||||
self.image = self.image.crop((0, top, width, bottom))
|
||||
self.image = self.image.resize(self.target_wh, resample=PIL.Image.BICUBIC)
|
||||
|
||||
self.image = self.flip(self.image)
|
||||
self.image = self.flip(self.image)
|
||||
|
||||
if save: # for manual inspection
|
||||
base_name = os.path.basename(self.pathname)
|
||||
self.image.save(f"test/output/{random.randint(0,4)}/{base_name}")
|
||||
if type(self.image) is not np.ndarray:
|
||||
if save:
|
||||
base_name = os.path.basename(self.pathname)
|
||||
if not os.path.exists("test/output"):
|
||||
os.makedirs("test/output")
|
||||
self.image.save(f"test/output/{base_name}")
|
||||
|
||||
self.image = np.array(self.image).astype(np.uint8)
|
||||
|
||||
self.image = (self.image / 127.5 - 1.0).astype(np.float32)
|
||||
|
||||
self.image = np.array(self.image).astype(np.uint8)
|
||||
|
||||
self.image = (self.image / 127.5 - 1.0).astype(np.float32)
|
||||
print(self.image.shape)
|
||||
|
||||
return self
|
||||
|
||||
|
|
|
@ -96,3 +96,35 @@ class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
|
|||
self.last_f = f
|
||||
return f
|
||||
|
||||
class EveryDreamScheduler:
    """
    Step-indexed learning-rate multiplier: constant hold, linear ramp down, then a floor.

    f_min: multiplier returned once the step reaches steps_to_min
    f_max: upper end of the linear ramp f_min + (f_max - f_min) * (steps_to_min - n) / steps_to_min
    f_start: multiplier returned while the step is below warm_up_steps
    warm_up_steps: number of initial steps that return f_start
    steps_to_min: step at which the multiplier becomes f_min
    verbosity_interval: print a status line every this many steps, values <= 0 disable printing
    """
    def __init__(self, f_min=0.5, f_max=1.0, f_start=1.0, warm_up_steps=1000, steps_to_min=5000, verbosity_interval=100) -> None:
        self.verbosity_interval = verbosity_interval
        self.warm_up_steps, self.steps_to_min = warm_up_steps, steps_to_min
        self.f_start, self.f_max, self.f_min = f_start, f_max, f_min
        # most recently returned multiplier; this is what the status line reports
        self.last_f = 0.

    def __call__(self, n, **kwargs):
        return self.schedule(n, **kwargs)

    def schedule(self, n, **kwargs):
        """
        Return the multiplier for step n and remember it in last_f.
        Extra keyword arguments are accepted and ignored.
        """
        # the status line is printed before last_f is updated, so it shows the previous call's value
        report_due = self.verbosity_interval > 0 and n % self.verbosity_interval == 0
        if report_due:
            print(f"current step: {n}, recent lr-multiplier: {self.last_f:0.3f}, current cycle: {0}")

        if n < self.warm_up_steps:
            multiplier = self.f_start
        elif n >= self.steps_to_min:
            multiplier = self.f_min
        else:
            span = self.f_max - self.f_min
            remaining = self.steps_to_min - n
            multiplier = self.f_min + span * remaining / (self.steps_to_min)

        self.last_f = multiplier
        return multiplier
|
|
@ -453,6 +453,7 @@ class LatentDiffusion(DDPM):
|
|||
conditioning_key=None,
|
||||
scale_factor=1.0,
|
||||
scale_by_std=False,
|
||||
scheduler_config=None,
|
||||
*args, **kwargs):
|
||||
|
||||
self.num_timesteps_cond = default(num_timesteps_cond, 1)
|
||||
|
@ -465,7 +466,7 @@ class LatentDiffusion(DDPM):
|
|||
conditioning_key = None
|
||||
ckpt_path = kwargs.pop("ckpt_path", None)
|
||||
ignore_keys = kwargs.pop("ignore_keys", [])
|
||||
super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
|
||||
super().__init__(conditioning_key=conditioning_key, scheduler_config=scheduler_config, *args, **kwargs)
|
||||
self.concat_mode = concat_mode
|
||||
self.cond_stage_trainable = cond_stage_trainable
|
||||
self.cond_stage_key = cond_stage_key
|
||||
|
@ -704,8 +705,6 @@ class LatentDiffusion(DDPM):
|
|||
if cond_key != self.first_stage_key:
|
||||
if cond_key in ['caption', 'coordinates_bbox']:
|
||||
xc = batch[cond_key]
|
||||
elif cond_key == 'class_label':
|
||||
xc = batch
|
||||
else:
|
||||
xc = super().get_input(batch, cond_key).to(self.device)
|
||||
else:
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
import ldm.data.aspects as aspects
|
||||
|
||||
# Sanity check for the aspect bucket tables: for every supported resolution,
# print the table and verify no bucket has more pixels than the square bucket.
resolutions = [512, 576, 640, 704, 768]
oops = [532, 576, 640, 704, 768]

for square_side in resolutions:
    buckets = aspects.get_aspect_buckets(square_side)
    print(f" *{square_side} buckets: {buckets}")

    # the first bucket of a table is the square one and sets the pixel budget
    square_w, square_h = buckets[0]
    budget = square_w * square_h

    for bucket in buckets:
        bucket_w, bucket_h = bucket
        area = bucket_w * bucket_h
        print (f"max: {budget}: {bucket}: {area}, pct {area/budget:.2f}")
        assert area <= budget, f" * {bucket} is larger than {budget}"
|
|
@ -0,0 +1,36 @@
|
|||
# script to test data loader by itself
|
||||
# run from training root, edit the data_root manually
|
||||
|
||||
from ldm.data.every_dream import EveryDreamBatch
|
||||
import time
|
||||
|
||||
# Pull batches out of an EveryDreamBatch by index and check that every item
# in a batch has the same image shape, then report the elapsed time.
s = time.perf_counter()

#data_root = "r:/everydream-trainer/test/input"
data_root = "r:/everydream-trainer/training_samples"

batch_size = 6
repeats = 3
every_dream_batch = EveryDreamBatch(
    data_root=data_root,
    flip_p=0.0,
    debug_level=2,
    batch_size=batch_size,
    repeats=repeats,
    crop_jitter=25,
    conditional_dropout=0.3,
    resolution=512,
)

print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
print(f"  max test cycles: {int(len(every_dream_batch) / batch_size)}, batch_size: {batch_size}, repeats: {repeats}")
i = 0

while i < 99: # and i < len(every_dream_batch):
    curr_batch = [every_dream_batch[j] for j in range(i, i + batch_size)]
    shapes = [item['image'].shape for item in curr_batch]

    # all in batch must have the same image size
    assert all(shape == curr_batch[0]['image'].shape for shape in shapes)
    assert all(shape[0] > 2 for shape in shapes)

    #print(f"idx: {i}, batch sample: shape: {curr_batch[0]['image'].shape}: {curr_batch[0]['caption']}")

    i += batch_size

print(f" *TEST* test cycles: {i}")
print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
elapsed = time.perf_counter() - s
print(f"{__file__} executed in {elapsed:5.2f} seconds.")
|
|
@ -0,0 +1,48 @@
|
|||
# script to show what cropping does to your images
|
||||
# execute from root everydream-trainer folder
|
||||
# ex.
|
||||
# (everydream) R:\everydream-trainer>python scripts/test_crop.py
|
||||
# dumps to /test/output
|
||||
|
||||
from ldm.data.every_dream import EveryDreamBatch
|
||||
import time
|
||||
import argparse
|
||||
|
||||
# Command line: optional data root and resolution class for the crop inspection run.
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default=None, help='root path of all your training images, will be recursively searched for images')
parser.add_argument('--resolution', type=int, default=512, help='resolution class, 512, 576, 640, 704, or 768')
args = parser.parse_args()

s = time.perf_counter()

# put in your own data_root here, WARNING don't do this on a lot of images unless you are prepared for it...
data_root = "R:/everydream-trainer/test/input" if args.data_root is None else args.data_root

debug_level = 3 # 3 = dump images to disk after cropping and a bunch of crap into the console be warned
batch_size = 1
repeats = 1
crop_jitter = 50
resolution = args.resolution # 512, 576, 640, 704, 768
every_dream_batch = EveryDreamBatch(
    data_root=data_root,
    flip_p=0.0,
    debug_level=debug_level,
    batch_size=batch_size,
    repeats=repeats,
    crop_jitter=crop_jitter,
    conditional_dropout=0.1,
    resolution=resolution,
)

print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
print(f"  max test cycles: {int(len(every_dream_batch) / batch_size)}, batch_size: {batch_size}, repeats: {repeats}")
i = 0

# walk the whole epoch one batch at a time; fetching an item is what triggers the crop
while i < len(every_dream_batch):
    curr_batch = [every_dream_batch[j] for j in range(i, i + batch_size)]
    shapes = [item['image'].shape for item in curr_batch]

    assert all(shape == curr_batch[0]['image'].shape for shape in shapes)
    assert all(shape[0] > 2 for shape in shapes)

    i += batch_size

print(f" *TEST* test cycles: {i}")
print(f" *TEST* EveryDreamBatch epoch image length: {len(every_dream_batch)}")
elapsed = time.perf_counter() - s
print(f"{__file__} executed in {elapsed:5.2f} seconds.")
|
|
@ -0,0 +1,18 @@
|
|||
# script to test data loader by itself
|
||||
# run from training root, edit the data_root manually
|
||||
# python ldm/data/test_dl.py
|
||||
import ldm.data.data_loader as dl
|
||||
|
||||
# Build a DataLoaderMultiAspect over the sample folder and dump every item it produced.
data_root = "r:/everydream-trainer/test/input"

data_loader = dl.DataLoaderMultiAspect(
    data_root=data_root,
    batch_size=2,
    seed=555,
    debug_level=2,
)

image_caption_pairs = data_loader.get_all_images()
pair_count = len(image_caption_pairs)

print(f"Loaded {pair_count} image-caption pairs")

for pair in image_caption_pairs:
    print(pair)

print(f"**** Done loading. Loaded {pair_count} images from data_root: {data_root} ****")
|
|
@ -0,0 +1,24 @@
|
|||
# script to test ImageTrainItem hydration by itself
|
||||
# run from training root, edit the data_root manually
|
||||
# python ldm/data/test_dl.py
|
||||
from ldm.data.image_train_item import ImageTrainItem
|
||||
import glob
|
||||
import os
|
||||
|
||||
# Hydrate every .jpg under data_root 40 times and save each result to
# <data_root>/output/<caption>_<i>.jpg for manual inspection.

# Built with os.path.join: the previous "training_samples\multiaspect" literal relied
# on "\m" being an unrecognised (deprecated) escape and only worked as a Windows path.
data_root = os.path.join("training_samples", "multiaspect")

# create the destination up front; saving into a missing folder would fail
output_dir = os.path.join(data_root, "output")
os.makedirs(output_dir, exist_ok=True)

for f in glob.iglob(f"{data_root}/*.jpg"):
    # the caption is the file name without its extension, same for every repetition
    caption = os.path.splitext(os.path.basename(f))[0]

    for i in range(0, 40):
        #image: PIL.Image, caption: str, target_wh: list, pathname: str, flip_p=0.0):
        my_iti = ImageTrainItem(None,caption,[512,512],f,0.0)

        my_iti = my_iti.hydrate()

        out_file_path = os.path.join(output_dir, f"{caption}_{i}.jpg")
        # NOTE(review): ImageTrainItem.__init__ sets cropped_img to None and the hydrate()
        # visible in this commit does not appear to assign it - confirm it is populated
        # before relying on this save.
        my_iti.cropped_img.save(out_file_path)
|
||||
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
import ldm.lr_scheduler as lrs
|
||||
|
||||
# Print the multiplier EveryDreamScheduler returns for each of the first 50 steps.
sch = lrs.EveryDreamScheduler(
    warm_up_steps=10,
    f_min=5.0e-1,
    f_max=1.0,
    f_start=1.0,
    steps_to_min=25,
    verbosity_interval=5,
)

for step in range(50):
    print(f"step {step}: {sch(step)}")
|
Loading…
Reference in New Issue