EveryDream2trainer/data/data_loader.py

"""
Copyright [2022] Victor C Hall

Licensed under the GNU Affero General Public License;
You may not use this code except in compliance with the License.
You may obtain a copy of the License at

    https://www.gnu.org/licenses/agpl-3.0.en.html

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import bisect
import math
import os
import logging

import yaml
from PIL import Image
import random
from data.image_train_item import ImageTrainItem, ImageCaption
import data.aspects as aspects
from colorama import Fore, Style
import zipfile
import tqdm
import PIL

# Raise Pillow's decompression bomb pixel limit so that very large training images can still be opened.
# NOTE(review): the previous comment said "4x default"; Pillow's documented default appears to be
# 89478485 pixels, which would make this value roughly 32x the default - confirm the intended limit.
PIL.Image.MAX_IMAGE_PIXELS = 715827880*4

# Caption length limit handed to ImageCaption whenever a caption source does not provide "max_caption_length".
DEFAULT_MAX_CAPTION_LENGTH = 2048

class DataLoaderMultiAspect():
    """
    Data loader for multi-aspect-ratio training and bucketing

    data_root: root folder of training data
    batch_size: number of images per batch
    flip_p: probability of flipping image horizontally (i.e. 0-0.5)
    """
    def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, resolution=512, log_folder=None):
        """
        Scan data_root and prepare every image found there for bucketed training.

        :param data_root: root folder of training data; unzip_all(data_root) is run before the folder is scanned
        :param seed: seed for the initial shuffle of the image paths, also stored for the per-call reshuffles
        :param debug_level: stored on the instance as-is
        :param batch_size: number of images per batch
        :param flip_p: flip probability handed to every ImageTrainItem
        :param resolution: resolution used to look up the aspect buckets
        :param log_folder: stored on the instance as-is
        """
        # plain configuration values
        self.seed = seed
        self.batch_size = batch_size
        self.flip_p = flip_p
        self.debug_level = debug_level
        self.log_folder = log_folder
        self.image_paths = []

        self.aspects = aspects.get_aspect_buckets(resolution=resolution, square_only=False)
        logging.info(f"* DLMA resolution {resolution}, buckets: {self.aspects}")
        logging.info(" Preloading images...")

        # archives are handled before the tree is walked
        self.unzip_all(data_root)

        # collect the image paths, then bring them into a seed-dependent order
        self.__recurse_data_root(self, data_root)
        path_shuffler = random.Random(seed)
        path_shuffler.shuffle(self.image_paths)

        self.prepared_train_data = self.__prescan_images(self.image_paths, flip_p)
        rating_total, rating_prefix_sums = self.__sort_and_precalc_image_ratings()
        self.rating_overall_sum = rating_total
        self.ratings_summed = rating_prefix_sums

    def get_shuffled_image_buckets(self, dropout_fraction: float = 1.0):
        """
        returns the current list of images including their captions in a randomized order,
        sorted into buckets with same sized images
        if dropout_fraction < 1.0, only a subset of the images will be returned
        :param dropout_fraction: must be between 0.0 and 1.0.
        :return: randomized list of (image, caption) pairs, sorted into same sized buckets
        """
        """
        Put images into buckets based on aspect ratio with batch_size*n images per bucket, discards remainder
        """
        # TODO: this is not terribly efficient but at least linear time

        self.seed += 1
        randomizer = random.Random(self.seed)

        if dropout_fraction < 1.0:
            picked_images = self.__pick_random_subset(dropout_fraction, randomizer)
        else:
            picked_images = self.prepared_train_data

        randomizer.shuffle(picked_images)

        buckets = {}
        batch_size = self.batch_size
        for image_caption_pair in picked_images:
            image_caption_pair.runt_size = 0
            target_wh = image_caption_pair.target_wh

            if (target_wh[0],target_wh[1]) not in buckets:
                buckets[(target_wh[0],target_wh[1])] = []
            buckets[(target_wh[0],target_wh[1])].append(image_caption_pair)

        if len(buckets) > 1:
            for bucket in buckets:
                truncate_count = len(buckets[bucket]) % batch_size
                if truncate_count > 0:
                    runt_bucket = buckets[bucket][-truncate_count:]
                    for item in runt_bucket:
                        item.runt_size = truncate_count
                    while len(runt_bucket) < batch_size:
                        runt_bucket.append(random.choice(runt_bucket))

                    current_bucket_size = len(buckets[bucket])

                    buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]
                    buckets[bucket].extend(runt_bucket)

        # flatten the buckets
        image_caption_pairs = []
        for bucket in buckets:
            image_caption_pairs.extend(buckets[bucket])

        return image_caption_pairs

    @staticmethod
    def unzip_all(path):
        try:
            for root, dirs, files in os.walk(path):
                for file in files:
                    if file.endswith('.zip'):
                        logging.info(f"Unzipping {file}")
                        with zipfile.ZipFile(path, 'r') as zip_ref:
                            zip_ref.extractall(path)
        except Exception as e:
            logging.error(f"Error unzipping files {e}")

    def __sort_and_precalc_image_ratings(self) -> tuple[float, list[float]]:
        self.prepared_train_data = sorted(self.prepared_train_data, key=lambda img: img.caption.rating())

        rating_overall_sum: float = 0.0
        ratings_summed: list[float] = []
        for image in self.prepared_train_data:
            rating_overall_sum += image.caption.rating()
            ratings_summed.append(rating_overall_sum)

        return rating_overall_sum, ratings_summed

    @staticmethod
    def __read_caption_from_file(file_path, fallback_caption: ImageCaption) -> ImageCaption:
        try:
            with open(file_path, encoding='utf-8', mode='r') as caption_file:
                caption_text = caption_file.read()
                caption = DataLoaderMultiAspect.__split_caption_into_tags(caption_text)
        except:
            logging.error(f" *** Error reading {file_path} to get caption, falling back to filename")
            caption = fallback_caption
            pass
        return caption

    @staticmethod
    def __read_caption_from_yaml(file_path: str, fallback_caption: ImageCaption) -> ImageCaption:
        with open(file_path, "r") as stream:
            try:
                file_content = yaml.safe_load(stream)
                main_prompt = file_content.get("main_prompt", "")
                rating = file_content.get("rating", 1.0)
                unparsed_tags = file_content.get("tags", [])

                max_caption_length = file_content.get("max_caption_length", DEFAULT_MAX_CAPTION_LENGTH)

                tags = []
                tag_weights = []
                last_weight = None
                weights_differ = False
                for unparsed_tag in unparsed_tags:
                    tag = unparsed_tag.get("tag", "").strip()
                    if len(tag) == 0:
                        continue

                    tags.append(tag)
                    tag_weight = unparsed_tag.get("weight", 1.0)
                    tag_weights.append(tag_weight)

                    if last_weight is not None and weights_differ is False:
                        weights_differ = last_weight != tag_weight

                    last_weight = tag_weight

                return ImageCaption(main_prompt, rating, tags, tag_weights, max_caption_length, weights_differ)

            except:
                logging.error(f" *** Error reading {file_path} to get caption, falling back to filename")
                return fallback_caption

    @staticmethod
    def __split_caption_into_tags(caption_string: str) -> ImageCaption:
        """
        Turn a comma separated string into a caption.

        The part before the first "," becomes the main prompt, every further part becomes a tag.
        All parts are stripped, every tag gets the weight 1.0 and the caption gets the rating 1.0.
        """
        main_prompt, *raw_tags = caption_string.split(",")
        tags = [raw_tag.strip() for raw_tag in raw_tags]
        uniform_weights = [1.0] * len(tags)

        return ImageCaption(main_prompt.strip(), 1.0, tags, uniform_weights, DEFAULT_MAX_CAPTION_LENGTH, False)

    def __prescan_images(self, image_paths: list, flip_p=0.0) -> list[ImageTrainItem]:
        """
        Create ImageTrainItem objects with metadata for hydration later

        For every path the caption is taken from the first existing sidecar file with the same
        base name, checked in the order .yaml, .txt, .caption. Without a sidecar file the caption
        is the file name up to the first "_". The image is opened only to read its size; the
        aspect bucket whose width/height ratio is closest to the image's ratio becomes target_wh.
        Images that cannot be opened are logged and skipped.

        :param image_paths: paths of the images to scan
        :param flip_p: flip probability handed to every ImageTrainItem
        :return: one ImageTrainItem (created with image=None) per readable image
        """
        decorated_image_train_items = []

        for pathname in tqdm.tqdm(image_paths):
            caption_from_filename = os.path.splitext(os.path.basename(pathname))[0].split("_")[0]
            caption = DataLoaderMultiAspect.__split_caption_into_tags(caption_from_filename)

            file_path_without_ext = os.path.splitext(pathname)[0]
            yaml_file_path = file_path_without_ext + ".yaml"
            txt_file_path = file_path_without_ext + ".txt"
            caption_file_path = file_path_without_ext + ".caption"

            if os.path.exists(yaml_file_path):
                caption = self.__read_caption_from_yaml(yaml_file_path, caption)
            elif os.path.exists(txt_file_path):
                caption = self.__read_caption_from_file(txt_file_path, caption)
            elif os.path.exists(caption_file_path):
                caption = self.__read_caption_from_file(caption_file_path, caption)

            try:
                # the context manager closes the file handle as soon as the size is known
                with Image.open(pathname) as image:
                    width, height = image.size
                image_aspect = width / height

                target_wh = min(self.aspects, key=lambda bucket_wh: abs(bucket_wh[0] / bucket_wh[1] - image_aspect))

                image_train_item = ImageTrainItem(image=None, caption=caption, target_wh=target_wh, pathname=pathname, flip_p=flip_p)

                decorated_image_train_items.append(image_train_item)
            except Exception as e:
                logging.error(f"{Fore.LIGHTRED_EX} *** Error opening {Fore.LIGHTYELLOW_EX}{pathname}{Fore.LIGHTRED_EX} to get metadata. File may be corrupt and will be skipped.{Style.RESET_ALL}")
                logging.error(f" *** exception: {e}")

        return decorated_image_train_items

    def __pick_random_subset(self, dropout_fraction: float, picker: random.Random) -> list[ImageTrainItem]:
        """
        Picks a random subset of all images
        - The size of the subset is limited by dropout_faction
        - The chance of an image to be picked is influenced by its rating. Double that rating -> double the chance
        :param dropout_fraction: must be between 0.0 and 1.0
        :param picker: seeded random picker
        :return: list of picked ImageTrainItem
        """

        prepared_train_data = self.prepared_train_data.copy()
        ratings_summed = self.ratings_summed.copy()
        rating_overall_sum = self.rating_overall_sum

        num_images = len(prepared_train_data)
        num_images_to_pick = math.ceil(num_images * dropout_fraction)
        num_images_to_pick = max(min(num_images_to_pick, num_images), 0)

        # logging.info(f"Picking {num_images_to_pick} images out of the {num_images} in the dataset for drop_fraction {dropout_fraction}")

        picked_images: list[ImageTrainItem] = []
        while num_images_to_pick > len(picked_images):
            # find random sample in dataset
            point = picker.uniform(0.0, rating_overall_sum)
            pos = min(bisect.bisect_left(ratings_summed, point), len(prepared_train_data) -1 )

            # pick random sample
            picked_image = prepared_train_data[pos]
            picked_images.append(picked_image)

            # kick picked item out of data set to not pick it again
            rating_overall_sum = max(rating_overall_sum - picked_image.caption.rating(), 0.0)
            ratings_summed.pop(pos)
            prepared_train_data.pop(pos)

        return picked_images

    @staticmethod
    def __recurse_data_root(self, recurse_root):
        multiply = 1
        multiply_path = os.path.join(recurse_root, "multiply.txt")
        if os.path.exists(multiply_path):
            try:
                with open(multiply_path, encoding='utf-8', mode='r') as f:
                    multiply = int(float(f.read().strip()))
                    logging.info(f" * DLMA multiply.txt in {recurse_root} set to {multiply}")
            except:
                logging.error(f" *** Error reading multiply.txt in {recurse_root}, defaulting to 1")
                pass

        for f in os.listdir(recurse_root):
            current = os.path.join(recurse_root, f)

            if os.path.isfile(current):
                ext = os.path.splitext(f)[1].lower()
                if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.jfif']:
                    # add image multiplyrepeats number of times
                    for _ in range(multiply):
                        self.image_paths.append(current)

        sub_dirs = []

        for d in os.listdir(recurse_root):
            current = os.path.join(recurse_root, d)
            if os.path.isdir(current):
                sub_dirs.append(current)

        for dir in sub_dirs:
            self.__recurse_data_root(self=self, recurse_root=dir)
hey look ed2 2022-12-17 20:32:48 -07:00			`"""`
			`Copyright [2022] Victor C Hall`

			`Licensed under the GNU Affero General Public License;`
			`You may not use this code except in compliance with the License.`
			`You may obtain a copy of the License at`

			`https://www.gnu.org/licenses/agpl-3.0.en.html`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`"""`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`import bisect`
			`import math`
hey look ed2 2022-12-17 20:32:48 -07:00			`import os`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`import logging`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00
			`import yaml`
hey look ed2 2022-12-17 20:32:48 -07:00			`from PIL import Image`
			`import random`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`from data.image_train_item import ImageTrainItem, ImageCaption`
hey look ed2 2022-12-17 20:32:48 -07:00			`import data.aspects as aspects`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`from colorama import Fore, Style`
add gpu id support 2022-12-29 19:11:06 -07:00			`import zipfile`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`import tqdm`
			`import PIL`

			`PIL.Image.MAX_IMAGE_PIXELS = 715827880*4 # increase decompression bomb error limit to 4x default`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented an optimization for the shuffling if all tags have the same weight and added documentation. 2023-01-07 14:59:51 -07:00			`DEFAULT_MAX_CAPTION_LENGTH = 2048`

hey look ed2 2022-12-17 20:32:48 -07:00			`class DataLoaderMultiAspect():`
			`"""`
			`Data loader for multi-aspect-ratio training and bucketing`

			`data_root: root folder of training data`
			`batch_size: number of images per batch`
			`flip_p: probability of flipping image horizontally (i.e. 0-0.5)`
			`"""`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`def __init__(self, data_root, seed=555, debug_level=0, batch_size=1, flip_p=0.0, resolution=512, log_folder=None):`
hey look ed2 2022-12-17 20:32:48 -07:00			`self.image_paths = []`
			`self.debug_level = debug_level`
			`self.flip_p = flip_p`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`self.log_folder = log_folder`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`self.seed = seed`
			`self.batch_size = batch_size`
hey look ed2 2022-12-17 20:32:48 -07:00
			`self.aspects = aspects.get_aspect_buckets(resolution=resolution, square_only=False)`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`logging.info(f"* DLMA resolution {resolution}, buckets: {self.aspects}")`
			`logging.info(" Preloading images...")`
hey look ed2 2022-12-17 20:32:48 -07:00
add gpu id support 2022-12-29 19:11:06 -07:00			`self.unzip_all(data_root)`

hey look ed2 2022-12-17 20:32:48 -07:00			`self.__recurse_data_root(self=self, recurse_root=data_root)`
			`random.Random(seed).shuffle(self.image_paths)`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`self.prepared_train_data = self.__prescan_images(self.image_paths, flip_p)`
			`(self.rating_overall_sum, self.ratings_summed) = self.__sort_and_precalc_image_ratings()`

			`def get_shuffled_image_buckets(self, dropout_fraction: float = 1.0):`
			`"""`
			`returns the current list of images including their captions in a randomized order,`
			`sorted into buckets with same sized images`
			`if dropout_fraction < 1.0, only a subset of the images will be returned`
			`:param dropout_fraction: must be between 0.0 and 1.0.`
			`:return: randomized list of (image, caption) pairs, sorted into same sized buckets`
			`"""`
			`"""`
			`Put images into buckets based on aspect ratio with batch_size*n images per bucket, discards remainder`
			`"""`
			`# TODO: this is not terribly efficient but at least linear time`

			`self.seed += 1`
			`randomizer = random.Random(self.seed)`

			`if dropout_fraction < 1.0:`
			`picked_images = self.__pick_random_subset(dropout_fraction, randomizer)`
			`else:`
			`picked_images = self.prepared_train_data`

			`randomizer.shuffle(picked_images)`

			`buckets = {}`
			`batch_size = self.batch_size`
			`for image_caption_pair in picked_images:`
			`image_caption_pair.runt_size = 0`
			`target_wh = image_caption_pair.target_wh`

			`if (target_wh[0],target_wh[1]) not in buckets:`
			`buckets[(target_wh[0],target_wh[1])] = []`
			`buckets[(target_wh[0],target_wh[1])].append(image_caption_pair)`

			`if len(buckets) > 1:`
			`for bucket in buckets:`
			`truncate_count = len(buckets[bucket]) % batch_size`
			`if truncate_count > 0:`
			`runt_bucket = buckets[bucket][-truncate_count:]`
			`for item in runt_bucket:`
			`item.runt_size = truncate_count`
			`while len(runt_bucket) < batch_size:`
			`runt_bucket.append(random.choice(runt_bucket))`

			`current_bucket_size = len(buckets[bucket])`

			`buckets[bucket] = buckets[bucket][:current_bucket_size - truncate_count]`
			`buckets[bucket].extend(runt_bucket)`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`# flatten the buckets`
			`image_caption_pairs = []`
			`for bucket in buckets:`
			`image_caption_pairs.extend(buckets[bucket])`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`return image_caption_pairs`

			`@staticmethod`
			`def unzip_all(path):`
add gpu id support 2022-12-29 19:11:06 -07:00			`try:`
			`for root, dirs, files in os.walk(path):`
			`for file in files:`
			`if file.endswith('.zip'):`
			`logging.info(f"Unzipping {file}")`
			`with zipfile.ZipFile(path, 'r') as zip_ref:`
			`zip_ref.extractall(path)`
			`except Exception as e:`
			`logging.error(f"Error unzipping files {e}")`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`def __sort_and_precalc_image_ratings(self) -> tuple[float, list[float]]:`
			`self.prepared_train_data = sorted(self.prepared_train_data, key=lambda img: img.caption.rating())`

			`rating_overall_sum: float = 0.0`
			`ratings_summed: list[float] = []`
			`for image in self.prepared_train_data:`
			`rating_overall_sum += image.caption.rating()`
			`ratings_summed.append(rating_overall_sum)`

			`return rating_overall_sum, ratings_summed`
hey look ed2 2022-12-17 20:32:48 -07:00
			`@staticmethod`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`def __read_caption_from_file(file_path, fallback_caption: ImageCaption) -> ImageCaption:`
hey look ed2 2022-12-17 20:32:48 -07:00			`try:`
			`with open(file_path, encoding='utf-8', mode='r') as caption_file:`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`caption_text = caption_file.read()`
			`caption = DataLoaderMultiAspect.__split_caption_into_tags(caption_text)`
hey look ed2 2022-12-17 20:32:48 -07:00			`except:`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`logging.error(f" *** Error reading {file_path} to get caption, falling back to filename")`
hey look ed2 2022-12-17 20:32:48 -07:00			`caption = fallback_caption`
			`pass`
			`return caption`

Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`@staticmethod`
			`def __read_caption_from_yaml(file_path: str, fallback_caption: ImageCaption) -> ImageCaption:`
			`with open(file_path, "r") as stream:`
			`try:`
			`file_content = yaml.safe_load(stream)`
			`main_prompt = file_content.get("main_prompt", "")`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`rating = file_content.get("rating", 1.0)`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`unparsed_tags = file_content.get("tags", [])`

Implemented an optimization for the shuffling if all tags have the same weight and added documentation. 2023-01-07 14:59:51 -07:00			`max_caption_length = file_content.get("max_caption_length", DEFAULT_MAX_CAPTION_LENGTH)`

Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`tags = []`
			`tag_weights = []`
Implemented an optimization for the shuffling if all tags have the same weight and added documentation. 2023-01-07 14:59:51 -07:00			`last_weight = None`
			`weights_differ = False`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`for unparsed_tag in unparsed_tags:`
			`tag = unparsed_tag.get("tag", "").strip()`
			`if len(tag) == 0:`
			`continue`

			`tags.append(tag)`
Implemented an optimization for the shuffling if all tags have the same weight and added documentation. 2023-01-07 14:59:51 -07:00			`tag_weight = unparsed_tag.get("weight", 1.0)`
			`tag_weights.append(tag_weight)`

			`if last_weight is not None and weights_differ is False:`
			`weights_differ = last_weight != tag_weight`

			`last_weight = tag_weight`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`return ImageCaption(main_prompt, rating, tags, tag_weights, max_caption_length, weights_differ)`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00
			`except:`
			`logging.error(f" *** Error reading {file_path} to get caption, falling back to filename")`
			`return fallback_caption`

Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`@staticmethod`
			`def __split_caption_into_tags(caption_string: str) -> ImageCaption:`
			`"""`
			`Splits a string by "," into the main prompt and additional tags with equal weights`
			`"""`
			`split_caption = caption_string.split(",")`
			`main_prompt = split_caption.pop(0).strip()`
			`tags = []`
			`for tag in split_caption:`
			`tags.append(tag.strip())`

Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`return ImageCaption(main_prompt, 1.0, tags, [1.0] * len(tags), DEFAULT_MAX_CAPTION_LENGTH, False)`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`def __prescan_images(self, image_paths: list, flip_p=0.0) -> list[ImageTrainItem]:`
hey look ed2 2022-12-17 20:32:48 -07:00			`"""`
			`Create ImageTrainItem objects with metadata for hydration later`
			`"""`
			`decorated_image_train_items = []`

bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`for pathname in tqdm.tqdm(image_paths):`
hey look ed2 2022-12-17 20:32:48 -07:00			`caption_from_filename = os.path.splitext(os.path.basename(pathname))[0].split("_")[0]`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`caption = DataLoaderMultiAspect.__split_caption_into_tags(caption_from_filename)`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`file_path_without_ext = os.path.splitext(pathname)[0]`
			`yaml_file_path = file_path_without_ext + ".yaml"`
			`txt_file_path = file_path_without_ext + ".txt"`
			`caption_file_path = file_path_without_ext + ".caption"`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`if os.path.exists(yaml_file_path):`
			`caption = self.__read_caption_from_yaml(yaml_file_path, caption)`
			`elif os.path.exists(txt_file_path):`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`caption = self.__read_caption_from_file(txt_file_path, caption)`
hey look ed2 2022-12-17 20:32:48 -07:00			`elif os.path.exists(caption_file_path):`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`caption = self.__read_caption_from_file(caption_file_path, caption)`
hey look ed2 2022-12-17 20:32:48 -07:00
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`try:`
			`image = Image.open(pathname)`
			`width, height = image.size`
			`image_aspect = width / height`
hey look ed2 2022-12-17 20:32:48 -07:00
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect))`
hey look ed2 2022-12-17 20:32:48 -07:00
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`image_train_item = ImageTrainItem(image=None, caption=caption, target_wh=target_wh, pathname=pathname, flip_p=flip_p)`
hey look ed2 2022-12-17 20:32:48 -07:00
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`decorated_image_train_items.append(image_train_item)`
			`except Exception as e:`
			`logging.error(f"{Fore.LIGHTRED_EX} *** Error opening {Fore.LIGHTYELLOW_EX}{pathname}{Fore.LIGHTRED_EX} to get metadata. File may be corrupt and will be skipped.{Style.RESET_ALL}")`
			`logging.error(f" *** exception: {e}")`
			`pass`
hey look ed2 2022-12-17 20:32:48 -07:00
			`return decorated_image_train_items`

Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`def __pick_random_subset(self, dropout_fraction: float, picker: random.Random) -> list[ImageTrainItem]:`
hey look ed2 2022-12-17 20:32:48 -07:00			`"""`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`Picks a random subset of all images`
			`- The size of the subset is limited by dropout_faction`
			`- The chance of an image to be picked is influenced by its rating. Double that rating -> double the chance`
			`:param dropout_fraction: must be between 0.0 and 1.0`
			`:param picker: seeded random picker`
			`:return: list of picked ImageTrainItem`
hey look ed2 2022-12-17 20:32:48 -07:00			`"""`

Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`prepared_train_data = self.prepared_train_data.copy()`
			`ratings_summed = self.ratings_summed.copy()`
			`rating_overall_sum = self.rating_overall_sum`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`num_images = len(prepared_train_data)`
			`num_images_to_pick = math.ceil(num_images * dropout_fraction)`
			`num_images_to_pick = max(min(num_images_to_pick, num_images), 0)`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`# logging.info(f"Picking {num_images_to_pick} images out of the {num_images} in the dataset for drop_fraction {dropout_fraction}")`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`picked_images: list[ImageTrainItem] = []`
			`while num_images_to_pick > len(picked_images):`
			`# find random sample in dataset`
			`point = picker.uniform(0.0, rating_overall_sum)`
			`pos = min(bisect.bisect_left(ratings_summed, point), len(prepared_train_data) -1 )`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`# pick random sample`
			`picked_image = prepared_train_data[pos]`
			`picked_images.append(picked_image)`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`# kick picked item out of data set to not pick it again`
			`rating_overall_sum = max(rating_overall_sum - picked_image.caption.rating(), 0.0)`
			`ratings_summed.pop(pos)`
			`prepared_train_data.pop(pos)`
hey look ed2 2022-12-17 20:32:48 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`return picked_images`
hey look ed2 2022-12-17 20:32:48 -07:00
			`@staticmethod`
			`def __recurse_data_root(self, recurse_root):`
docs, update setup 2022-12-18 11:03:44 -07:00			`multiply = 1`
			`multiply_path = os.path.join(recurse_root, "multiply.txt")`
			`if os.path.exists(multiply_path):`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`try:`
docs, update setup 2022-12-18 11:03:44 -07:00			`with open(multiply_path, encoding='utf-8', mode='r') as f:`
			`multiply = int(float(f.read().strip()))`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`logging.info(f" * DLMA multiply.txt in {recurse_root} set to {multiply}")`
docs, update setup 2022-12-18 11:03:44 -07:00			`except:`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`logging.error(f" *** Error reading multiply.txt in {recurse_root}, defaulting to 1")`
docs, update setup 2022-12-18 11:03:44 -07:00			`pass`

hey look ed2 2022-12-17 20:32:48 -07:00			`for f in os.listdir(recurse_root):`
			`current = os.path.join(recurse_root, f)`

			`if os.path.isfile(current):`
Make file name extension check case insensitive, so .JPG or .PNG files will also be detected 2023-01-10 03:54:26 -07:00			`ext = os.path.splitext(f)[1].lower()`
docs, update setup 2022-12-18 11:03:44 -07:00			`if ext in ['.jpg', '.jpeg', '.png', '.bmp', '.webp', '.jfif']:`
			`# add image multiplyrepeats number of times`
			`for _ in range(multiply):`
			`self.image_paths.append(current)`
hey look ed2 2022-12-17 20:32:48 -07:00
			`sub_dirs = []`

			`for d in os.listdir(recurse_root):`
			`current = os.path.join(recurse_root, d)`
			`if os.path.isdir(current):`
			`sub_dirs.append(current)`

			`for dir in sub_dirs:`
			`self.__recurse_data_root(self=self, recurse_root=dir)`