From f90d8e5b53f456ca35406a5cc57fdb0bb9690c6f Mon Sep 17 00:00:00 2001
From: Victor Hall
Date: Tue, 18 Apr 2023 22:11:51 -0400
Subject: [PATCH] data loader tweaks

---
 data/data_loader.py      |  56 +----------
 data/dataset.py          | 201 +++++++++++++++++++++++++++++++--------
 data/every_dream.py      |  20 ++--
 data/image_train_item.py |   6 +-
 train.py                 |  12 +--
 utils/fs_helpers.py      |   4 +-
 6 files changed, 181 insertions(+), 118 deletions(-)

diff --git a/data/data_loader.py b/data/data_loader.py
index 9d9f7c4..d77ce74 100644
--- a/data/data_loader.py
+++ b/data/data_loader.py
@@ -1,5 +1,5 @@
 """
-Copyright [2022] Victor C Hall
+Copyright [2022-2023] Victor C Hall
 
 Licensed under the GNU Affero General Public License;
 You may not use this code except in compliance with the License.
@@ -39,7 +39,6 @@ class DataLoaderMultiAspect():
         self.batch_size = batch_size
         self.prepared_train_data = image_train_items
         random.Random(self.seed).shuffle(self.prepared_train_data)
-        self.prepared_train_data = sorted(self.prepared_train_data, key=lambda img: img.caption.rating())
         self.expected_epoch_size = math.floor(sum([i.multiplier for i in self.prepared_train_data]))
         if self.expected_epoch_size != len(self.prepared_train_data):
             logging.info(f" * DLMA initialized with {len(image_train_items)} source images. After applying multipliers, each epoch will train on at least {self.expected_epoch_size} images.")
@@ -48,8 +47,6 @@ class DataLoaderMultiAspect():
         self.rating_overall_sum: float = 0.0
         self.ratings_summed: list[float] = []
 
-        self.__update_rating_sums()
-
     def __pick_multiplied_set(self, randomizer: random.Random):
         """
@@ -78,7 +75,7 @@ class DataLoaderMultiAspect():
 
         return picked_images
 
-    def get_shuffled_image_buckets(self, dropout_fraction: float = 1.0) -> list[ImageTrainItem]:
+    def get_shuffled_image_buckets(self) -> list[ImageTrainItem]:
         """
         Returns the current list of `ImageTrainItem` in randomized order,
         sorted into buckets with same sized images.
@@ -94,10 +91,7 @@ class DataLoaderMultiAspect():
         self.seed += 1
         randomizer = random.Random(self.seed)
 
-        if dropout_fraction < 1.0:
-            picked_images = self.__pick_random_subset(dropout_fraction, randomizer)
-        else:
-            picked_images = self.__pick_multiplied_set(randomizer)
+        picked_images = self.__pick_multiplied_set(randomizer)
 
         randomizer.shuffle(picked_images)
 
@@ -131,47 +125,3 @@ class DataLoaderMultiAspect():
             items.extend(buckets[bucket])
 
         return items
-
-    def __pick_random_subset(self, dropout_fraction: float, picker: random.Random) -> list[ImageTrainItem]:
-        """
-        Picks a random subset of all images
-        - The size of the subset is limited by dropout_faction
-        - The chance of an image to be picked is influenced by its rating. Double that rating -> double the chance
-        :param dropout_fraction: must be between 0.0 and 1.0
-        :param picker: seeded random picker
-        :return: list of picked ImageTrainItem
-        """
-
-        prepared_train_data = self.prepared_train_data.copy()
-        ratings_summed = self.ratings_summed.copy()
-        rating_overall_sum = self.rating_overall_sum
-
-        num_images = len(prepared_train_data)
-        num_images_to_pick = math.ceil(num_images * dropout_fraction)
-        num_images_to_pick = max(min(num_images_to_pick, num_images), 0)
-
-        # logging.info(f"Picking {num_images_to_pick} images out of the {num_images} in the dataset for drop_fraction {dropout_fraction}")
-
-        picked_images: list[ImageTrainItem] = []
-        while num_images_to_pick > len(picked_images):
-            # find random sample in dataset
-            point = picker.uniform(0.0, rating_overall_sum)
-            pos = min(bisect.bisect_left(ratings_summed, point), len(prepared_train_data) -1 )
-
-            # pick random sample
-            picked_image = prepared_train_data[pos]
-            picked_images.append(picked_image)
-
-            # kick picked item out of data set to not pick it again
-            rating_overall_sum = max(rating_overall_sum - picked_image.caption.rating(), 0.0)
-            ratings_summed.pop(pos)
-            prepared_train_data.pop(pos)
-
-        return picked_images
-
-    def __update_rating_sums(self):
-        self.rating_overall_sum: float = 0.0
-        self.ratings_summed: list[float] = []
-        for item in self.prepared_train_data:
-            self.rating_overall_sum += item.caption.rating()
-            self.ratings_summed.append(self.rating_overall_sum)
\ No newline at end of file
diff --git a/data/dataset.py b/data/dataset.py
index a0c761a..4538a10 100644
--- a/data/dataset.py
+++ b/data/dataset.py
@@ -1,16 +1,21 @@
+import cProfile
+from contextlib import nullcontext
 import os
 import logging
+import time
 import yaml
 import json
 
-from functools import total_ordering
-from attrs import define, field, Factory
+from functools import partial
+from attrs import define, field
 from data.image_train_item import ImageCaption, ImageTrainItem
 from utils.fs_helpers import *
 from typing import Iterable
 from tqdm import tqdm
 
+from multiprocessing import Pool, Lock
+
 DEFAULT_MAX_CAPTION_LENGTH = 2048
 
 def overlay(overlay, base):
@@ -163,12 +168,14 @@ class Dataset:
                 cfgs.append(ImageConfig.from_file(fileset['local.yml']))
         return ImageConfig.fold(cfgs)
 
-    def __sidecar_cfg(imagepath, fileset):
+    def __sidecar_cfg(imagepath, fileset, lock):
         cfgs = []
         for cfgext in ['.txt', '.caption', '.yml', '.yaml']:
             cfgfile = barename(imagepath) + cfgext
             if cfgfile in fileset:
-                cfgs.append(ImageConfig.from_file(fileset[cfgfile]))
+                cfg = ImageConfig.from_file(fileset[cfgfile])
+                with lock:
+                    cfgs.append(cfg)
         return ImageConfig.fold(cfgs)
 
     # Use file name for caption only as a last resort
@@ -179,22 +186,52 @@ class Dataset:
             cap_cfg = ImageConfig.from_caption_text(barename(file).split("_")[0])
         return cfg.merge(cap_cfg)
 
+    @classmethod
+    def scan_one(cls, img, image_configs, fileset, global_cfg, local_cfg, lock):
+        img_cfg = Dataset.__sidecar_cfg(img, fileset, lock)
+        resolved_cfg = ImageConfig.fold([global_cfg, local_cfg, img_cfg])
+        with lock:
+            image_configs[img] = Dataset.__ensure_caption(resolved_cfg, img)
+
+    @classmethod
+    def scan_one_full(cls, img, image_configs, fileset, global_cfg, local_cfg, lock):
+        # scan_one already resolves the sidecar config and stores the captioned
+        # result under the lock, so it is not repeated here
+        Dataset.scan_one(img, image_configs, fileset, global_cfg, local_cfg, lock)
+        #print(f"{image_configs[img].main_prompts} {image_configs[img].tags} {image_configs[img].rating}")
 
     @classmethod
     def from_path(cls, data_root):
        # Create a visitor that maintains global config stack
        # and accumulates image configs as it traverses dataset
+        image_configs = {}
         def process_dir(files, parent_globals):
+            #pool = Pool(int(os.cpu_count()/2))
+            lock = Lock()
+
             fileset = {os.path.basename(f): f for f in files}
             global_cfg = parent_globals.merge(Dataset.__global_cfg(fileset))
             local_cfg = Dataset.__local_cfg(fileset)
             for img in filter(is_image, files):
-                img_cfg = Dataset.__sidecar_cfg(img, fileset)
-                resolved_cfg = ImageConfig.fold([global_cfg, local_cfg, img_cfg])
-                image_configs[img] = Dataset.__ensure_caption(resolved_cfg, img)
+                #pool.apply_async(Dataset.scan_one_full, args=(img, image_configs, fileset, global_cfg, local_cfg, lock))
+                Dataset.scan_one_full(img, image_configs, fileset, global_cfg, local_cfg, lock)
+                #Dataset.scan_one(img, image_configs, fileset, global_cfg, local_cfg, lock)
+            #pool.close()
+            #pool.join()
+            # img_cfg = Dataset.__sidecar_cfg(img, fileset)
+            # resolved_cfg = ImageConfig.fold([global_cfg, local_cfg, img_cfg])
+            # image_configs[img] = Dataset.__ensure_caption(resolved_cfg, img)
+
             return global_cfg
 
+        time_start = time.time()
         walk_and_visit(data_root, process_dir, ImageConfig())
+        time_end = time.time()
+        logging.info(f" ... walk_and_visit took {(time_end - time_start)/60:.2f} minutes and found {len(image_configs)} images")
+
         return Dataset(image_configs)
 
     @classmethod
@@ -212,45 +249,125 @@ class Dataset:
                 continue
             image_configs[img] = cfg
         return Dataset(image_configs)
-    
+
+    def get_one_image_train_item(self, image, aspects, profile=False) -> ImageTrainItem:
+        config = self.image_configs[image]
+
+        tags = []
+        tag_weights = []
+        for tag in sorted(config.tags, key=lambda x: x.weight or 1.0, reverse=True):
+            tags.append(tag.value)
+            tag_weights.append(tag.weight)
+        use_weights = len(set(tag_weights)) > 1
+
+        try:
+            if profile:
+                profiler = cProfile.Profile()
+                import random
+                random_n = f"{random.randint(0,999):03d}"
+                profiler.enable()
+            caption = ImageCaption(
+                main_prompt=next(iter(config.main_prompts)),
+                rating=config.rating or 1.0,
+                tags=tags,
+                tag_weights=tag_weights,
+                max_target_length=config.max_caption_length or DEFAULT_MAX_CAPTION_LENGTH,
+                use_weights=use_weights)
+            if profile:
+                profiler.disable()
+                profiler.dump_stats(f'profile{random_n}.prof')
+                #exit()
+
+            item = ImageTrainItem(
+                image=None,
+                caption=caption,
+                aspects=aspects,
+                pathname=os.path.abspath(image),
+                flip_p=config.flip_p or 0.0,
+                multiplier=config.multiply or 1.0,
+                cond_dropout=config.cond_dropout
+            )
+        except Exception as e:
+            logging.error(f" *** Error preloading image or caption for: {image}, error: {e}")
+            raise e
+
+        return item
+
     def image_train_items(self, aspects):
+        print(f" * using async loader")
+        run_profiler = False
         items = []
-        for image in tqdm(self.image_configs, desc="preloading", dynamic_ncols=True):
-            config = self.image_configs[image]
+        process_count = int(os.cpu_count()/2)
+        pool = Pool(process_count)
+        async_results = []
 
-            if len(config.main_prompts) > 1:
-                logging.warning(f" *** Found multiple multiple main_prompts for image {image}, but only one will be applied: {config.main_prompts}")
+        time_start = time.time()
+        with tqdm(total=len(self.image_configs), desc=f"preloading {process_count}", dynamic_ncols=True) as pbar:
+            for image in self.image_configs:
+                async_result = pool.apply_async(self.get_one_image_train_item, args=(image, aspects, run_profiler), callback=lambda _: pbar.update())
+                async_results.append(async_result)
+            pool.close()
+            pool.join()
 
-            if len(config.main_prompts) < 1:
-                logging.warning(f" *** No main_prompts for image {image}")
+        for async_result in async_results:
+            result = async_result.get()
+            if result is not None:
+                # ImageTrainItem
+                items.append(result)
+            else:
+                raise ValueError(" *** image_train_items(): Async load item missing")
 
-            tags = []
-            tag_weights = []
-            for tag in sorted(config.tags, key=lambda x: x.weight or 1.0, reverse=True):
-                tags.append(tag.value)
-                tag_weights.append(tag.weight)
-            use_weights = len(set(tag_weights)) > 1
+        time_end = time.time()
+        logging.info(f" *** Preloading took {(time_end - time_start)/60:.2f} minutes and found {len(items)} images")
+        return items
 
+    def image_train_items_newish(self, aspects):
+        print(f" * using async loader")
+        items = []
+        process_count = int(os.cpu_count()/2)
+        pool = Pool(process_count)
 
-            try:
-                caption = ImageCaption(
-                    main_prompt=next(iter(config.main_prompts)),
-                    rating=config.rating or 1.0,
-                    tags=tags,
-                    tag_weights=tag_weights,
-                    max_target_length=config.max_caption_length or DEFAULT_MAX_CAPTION_LENGTH,
-                    use_weights=use_weights)
+        time_start = time.time()
+        with tqdm(total=len(self.image_configs), desc=f"preloading {process_count}", dynamic_ncols=True) as pbar:
+            async_results = []
+
+            # queue one async task per image
+            for image in self.image_configs:
+                # profile the task
+                #cProfile.runctx('self.get_one(image,aspects)', globals(), locals(), 'profile.prof')
+                async_result = pool.apply_async(self.get_one_image_train_item, args=(image, aspects), callback=lambda _: pbar.update())
+                async_results.append(async_result)
+            pool.close()
+            #pool.join()
+            print(f" * async pool closed")
 
-                item = ImageTrainItem(
-                    image=None,
-                    caption=caption,
-                    aspects=aspects,
-                    pathname=os.path.abspath(image),
-                    flip_p=config.flip_p or 0.0,
-                    multiplier=config.multiply or 1.0,
-                    cond_dropout=config.cond_dropout
-                )
-                items.append(item)
-            except Exception as e:
-                logging.error(f" *** Error preloading image or caption for: {image}, error: {e}")
-                raise e
-        return items
\ No newline at end of file
+        for async_result in async_results:
+            result = async_result.get()
+            if result is not None:
+                # ImageTrainItem
+                items.append(result)
+                print(f"{result.pathname} {result.caption.main_prompt}")
+            else:
+                raise ValueError(" *** image_train_items(): Async load item missing")
+
+        time_end = time.time()
+        logging.info(f" *** Preloading took {(time_end - time_start)/60:.2f} minutes and found {len(items)} images")
+        return items
+
+    def image_train_items_old(self, aspects):
+        print(f" * using single threaded loader")
+        items = []
+
+        time_start = time.time()
+        with tqdm(total=len(self.image_configs), desc="preloading", dynamic_ncols=True) as pbar:
+            for image in self.image_configs:
+                items.append(self.get_one_image_train_item(image, aspects))
+                pbar.update()
+        time_end = time.time()
+        logging.info(f" *** Preloading took {(time_end - time_start)/60:.2f} minutes and found {len(items)} images")
+        return items
diff --git a/data/every_dream.py b/data/every_dream.py
index ba9ea60..c3b97b2 100644
--- a/data/every_dream.py
+++ b/data/every_dream.py
@@ -1,5 +1,5 @@
 """
-Copyright [2022] Victor C Hall
+Copyright [2022-2023] Victor C Hall
 
 Licensed under the GNU Affero General Public License;
 You may not use this code except in compliance with the License.
@@ -57,11 +57,11 @@ class EveryDreamBatch(Dataset):
         self.retain_contrast = retain_contrast
         self.shuffle_tags = shuffle_tags
         self.seed = seed
-        self.rated_dataset = rated_dataset
-        self.rated_dataset_dropout_target = rated_dataset_dropout_target
+        #self.rated_dataset = rated_dataset
+        #self.rated_dataset_dropout_target = rated_dataset_dropout_target
         # First epoch always trains on all images
         self.image_train_items = []
-        self.__update_image_train_items(1.0)
+        self.__update_image_train_items()
         self.name = name
 
         num_images = len(self.image_train_items)
@@ -69,13 +69,7 @@ class EveryDreamBatch(Dataset):
 
     def shuffle(self, epoch_n: int, max_epochs: int):
         self.seed += 1
-
-        if self.rated_dataset:
-            dropout_fraction = (max_epochs - (epoch_n * self.rated_dataset_dropout_target)) / max_epochs
-        else:
-            dropout_fraction = 1.0
-
-        self.__update_image_train_items(dropout_fraction)
+        self.__update_image_train_items()
 
     def __len__(self):
         return len(self.image_train_items)
@@ -140,8 +134,8 @@ class EveryDreamBatch(Dataset):
 
         return example
 
-    def __update_image_train_items(self, dropout_fraction: float):
-        self.image_train_items = self.data_loader.get_shuffled_image_buckets(dropout_fraction)
+    def __update_image_train_items(self):
+        self.image_train_items = self.data_loader.get_shuffled_image_buckets()
 
 def build_torch_dataloader(dataset, batch_size) -> torch.utils.data.DataLoader:
     dataloader = torch.utils.data.DataLoader(
diff --git a/data/image_train_item.py b/data/image_train_item.py
index 0346dda..741bb1f 100644
--- a/data/image_train_item.py
+++ b/data/image_train_item.py
@@ -56,6 +56,9 @@ class ImageCaption:
         if use_weights and len(tag_weights) > len(tags):
             self.__tag_weights = tag_weights[:len(tags)]
 
+    def __repr__(self) -> str:
+        return f"ImageCaption({self.__main_prompt}, {self.__rating}, {self.__tags}, {self.__tag_weights}, {self.__max_target_length}, {self.__use_weights})"
+
     def rating(self) -> float:
         return self.__rating
 
@@ -143,7 +146,6 @@ class ImageTrainItem:
         else:
             self.image = image
             self.image_size = image.size
-            self.target_size = None
 
         self.is_undersized = False
         self.error = None
@@ -245,7 +247,7 @@ class ImageTrainItem:
         self.target_wh = None
         try:
             with PIL.Image.open(self.pathname) as image:
-                image = self._try_transpose(image, print_error=True).convert('RGB')
+                image = self._try_transpose(image, print_error=True)
                 width, height = image.size
                 image_aspect = width / height
                 target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))
diff --git a/train.py b/train.py
index b0a6ca8..96ad494 100644
--- a/train.py
+++ b/train.py
@@ -241,8 +241,8 @@ def setup_args(args):
 
     args.clip_skip = max(min(4, args.clip_skip), 0)
 
-    if args.useadam8bit:
-        logging.warning(f"{Fore.LIGHTYELLOW_EX} Useadam8bit arg is deprecated, use optimizer.json instead, which defaults to useadam8bit anyway{Style.RESET_ALL}")
+    #if args.useadam8bit:
+    #    logging.warning(f"{Fore.LIGHTYELLOW_EX} Useadam8bit arg is deprecated, use optimizer.json instead, which defaults to useadam8bit anyway{Style.RESET_ALL}")
 
     if args.ckpt_every_n_minutes is None and args.save_every_n_epochs is None:
         logging.info(f"{Fore.LIGHTCYAN_EX} No checkpoint saving specified, defaulting to every 20 minutes.{Style.RESET_ALL}")
@@ -932,7 +932,7 @@ def main(args):
             if validator:
                 validator.do_validation_if_appropriate(epoch+1, global_step, get_model_prediction_and_target)
 
-            gc.collect()
+            #gc.collect()
             # end of epoch
 
         # end of training
@@ -1011,12 +1011,12 @@ if __name__ == "__main__":
     argparser.add_argument("--scale_lr", action="store_true", default=False, help="automatically scale up learning rate based on batch size and grad accumulation (def: False)")
     argparser.add_argument("--seed", type=int, default=555, help="seed used for samples and shuffling, use -1 for random")
     argparser.add_argument("--shuffle_tags", action="store_true", default=False, help="randomly shuffles CSV tags in captions, for booru datasets")
-    argparser.add_argument("--useadam8bit", action="store_true", default=False, help="deprecated, use --optimizer_config and optimizer.json instead")
+    #argparser.add_argument("--useadam8bit", action="store_true", default=False, help="deprecated, use --optimizer_config and optimizer.json instead")
     argparser.add_argument("--wandb", action="store_true", default=False, help="enable wandb logging instead of tensorboard, requires env var WANDB_API_KEY")
     argparser.add_argument("--validation_config", default=None, help="Path to a JSON configuration file for the validator. Default is no validation.")
    argparser.add_argument("--write_schedule", action="store_true", default=False, help="write schedule of images and their batches to file (def: False)")
-    argparser.add_argument("--rated_dataset", action="store_true", default=False, help="enable rated image set training, to less often train on lower rated images through the epochs")
-    argparser.add_argument("--rated_dataset_target_dropout_percent", type=int, default=50, help="how many images (in percent) should be included in the last epoch (Default 50)")
+    #argparser.add_argument("--rated_dataset", action="store_true", default=False, help="enable rated image set training, to less often train on lower rated images through the epochs")
+    #argparser.add_argument("--rated_dataset_target_dropout_percent", type=int, default=50, help="how many images (in percent) should be included in the last epoch (Default 50)")
     argparser.add_argument("--zero_frequency_noise_ratio", type=float, default=0.02, help="adds zero frequency noise, for improving contrast (def: 0.0) use 0.0 to 0.15")
 
     # load CLI args to overwrite existing config args
diff --git a/utils/fs_helpers.py b/utils/fs_helpers.py
index 775ca0f..df016ae 100644
--- a/utils/fs_helpers.py
+++ b/utils/fs_helpers.py
@@ -25,7 +25,7 @@ def read_float(file):
     try:
         return float(read_text(file))
     except Exception as e:
-        logging.warning(f" *** Could not parse '{data}' to float in file {file}: {e}")
+        logging.warning(f" *** Could not parse number to float in file {file}: {e}")
 
 import os
 
@@ -48,4 +48,4 @@ def walk_and_visit(path, visit_fn, context=None):
         subcontext = visit_fn(files, context)
 
         for subdir in dirs:
-            walk_and_visit(subdir, visit_fn, subcontext)
\ No newline at end of file
+            walk_and_visit(subdir, visit_fn, subcontext)
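
Note on the new preloading path: the rewritten Dataset.image_train_items() fans out one apply_async task per image, ticks a tqdm bar from the task callback, closes and joins the pool, and only then collects results with .get(), which re-raises any worker exception in the parent process. Below is a minimal, self-contained sketch of that pattern; load_item and fake_paths are hypothetical stand-ins for the real per-image work, not names from this patch.

    import os
    import time
    from multiprocessing import Pool

    from tqdm import tqdm

    def load_item(path: str) -> str:
        # stand-in for Dataset.get_one_image_train_item: pretend per-image work
        time.sleep(0.01)
        return path.upper()

    if __name__ == "__main__":
        fake_paths = [f"img_{i:04d}.jpg" for i in range(200)]
        # same half-the-cores heuristic the patch uses
        process_count = max(1, (os.cpu_count() or 2) // 2)

        with Pool(process_count) as pool, \
             tqdm(total=len(fake_paths), desc=f"preloading {process_count}", dynamic_ncols=True) as pbar:
            # queue one async task per item; the callback ticks the bar as results arrive
            async_results = [pool.apply_async(load_item, args=(p,), callback=lambda _: pbar.update())
                             for p in fake_paths]
            pool.close()
            pool.join()

        # .get() re-raises any exception raised inside a worker process
        items = [r.get() for r in async_results]
        assert len(items) == len(fake_paths)

One caveat with this layout: the pool must be able to pickle the callable and its arguments to send them to the workers, which is why the sketch uses a module-level function for the work itself and keeps the lambda only for the callback, which runs in the parent.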