EveryDream2trainer/data/every_dream.py

"""
Copyright [2022] Victor C Hall

Licensed under the GNU Affero General Public License;
You may not use this code except in compliance with the License.
You may obtain a copy of the License at

    https://www.gnu.org/licenses/agpl-3.0.en.html

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
import os

import torch
from torch.utils.data import Dataset
from data.data_loader import DataLoaderMultiAspect
from data.image_train_item import ImageTrainItem
import random
from torchvision import transforms
from transformers import CLIPTokenizer
import torch.nn.functional as F

class EveryDreamBatch(Dataset):
    """
    Torch `Dataset` that serves single training examples drawn from a `DataLoaderMultiAspect`.

    data_loader: `DataLoaderMultiAspect` object; supplies `batch_size` and the shuffled image buckets
    debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=debug info per image, 3=save crops to disk for inspection
    conditional_dropout: probability of dropping the caption for a given image
    crop_jitter: number of pixels to jitter the crop by, only for non-square images
    seed: random seed; bumped by one on every `shuffle()` and handed to the caption shuffler
    tokenizer: callable tokenizer exposing `model_max_length`; required despite the `None` default
    retain_contrast: if set, normalize with mean 0.0 / std 1.0 instead of mean 0.5 / std 0.5
    shuffle_tags: if set, captions come from `get_shuffled_caption(seed)` rather than `get_caption()`
    rated_dataset: if set, `shuffle()` hands the data loader a fraction computed from the epoch number
    rated_dataset_dropout_target: weight applied to the epoch number when computing that fraction
    name: label used in the log line emitted on construction
    """
    def __init__(self,
                 data_loader: DataLoaderMultiAspect,
                 debug_level=0,
                 conditional_dropout=0.02,
                 crop_jitter=20,
                 seed=555,
                 tokenizer=None,
                 retain_contrast=False,
                 shuffle_tags=False,
                 rated_dataset=False,
                 rated_dataset_dropout_target=0.5,
                 name='train'
                 ):
        # Source of items and the batch size it was configured with
        self.data_loader = data_loader
        self.batch_size = data_loader.batch_size

        # Image handling options
        self.debug_level = debug_level
        self.crop_jitter = crop_jitter
        self.retain_contrast = retain_contrast
        self.unloaded_to_idx = 0

        # Caption / tokenization options
        self.conditional_dropout = conditional_dropout
        self.tokenizer = tokenizer
        self.max_token_length = self.tokenizer.model_max_length
        self.shuffle_tags = shuffle_tags
        self.seed = seed

        # Rated-dataset options consumed by shuffle()
        self.rated_dataset = rated_dataset
        self.rated_dataset_dropout_target = rated_dataset_dropout_target

        # First epoch always trains on all images
        self.image_train_items = []
        self.__update_image_train_items(1.0)
        self.name = name

        num_images = len(self.image_train_items)
        logging.info(f" ** Dataset '{name}': {num_images / self.batch_size:.0f} batches, num_images: {num_images}, batch_size: {self.batch_size}")

    def shuffle(self, epoch_n: int, max_epochs: int):
        """
        Advance the seed and fetch a freshly shuffled item list from the data loader.

        epoch_n: current epoch number
        max_epochs: total number of epochs; used as the divisor for rated datasets
        """
        self.seed += 1

        # Everything is requested unless this is a rated dataset
        dropout_fraction = 1.0
        if self.rated_dataset:
            dropout_fraction = (max_epochs - (epoch_n * self.rated_dataset_dropout_target)) / max_epochs

        self.__update_image_train_items(dropout_fraction)

    def __len__(self):
        """Number of items currently held for this epoch."""
        return len(self.image_train_items)

    def __getitem__(self, i):
        """
        Build the training example for index `i`.

        returns: dict with "caption" (str), "image" (normalized tensor),
                 "tokens" (tensor of token ids) and "runt_size"
        """
        hydrated = self.__get_image_for_trainer(self.image_train_items[i], self.debug_level)

        # retain_contrast keeps ToTensor's output as-is (mean 0, std 1 is a no-op normalize)
        mean, std_dev = (0.0, 1.0) if self.retain_contrast else (0.5, 0.5)
        image_transforms = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize([mean], [std_dev]),
            ]
        )

        caption_source = hydrated["caption"]
        caption = (
            caption_source.get_shuffled_caption(self.seed)
            if self.shuffle_tags
            else caption_source.get_caption()
        )

        image = image_transforms(hydrated["image"])

        # A per-item cond_dropout, when present, overrides the dataset-wide probability
        keep_caption = random.random() > hydrated.get("cond_dropout", self.conditional_dropout)
        prompt = caption if keep_caption else " "
        token_ids = self.tokenizer(prompt,
                                   truncation=True,
                                   padding="max_length",
                                   max_length=self.tokenizer.model_max_length,
                                   ).input_ids

        return {
            "caption": caption,
            "image": image,
            "tokens": torch.tensor(token_ids),
            "runt_size": hydrated["runt_size"],
        }

    def __get_image_for_trainer(self, image_train_item: ImageTrainItem, debug_level=0):
        """
        Hydrate `image_train_item` and pull out the fields `__getitem__` needs.

        returns: dict with "image", "caption", "runt_size" and, only when the
                 hydrated item defines one, "cond_dropout"
        """
        hydrated = image_train_item.hydrate(crop=False, save=debug_level > 2, crop_jitter=self.crop_jitter)

        # hack for now to avoid memory leak: keep a copy and drop the item's own reference
        image_copy = hydrated.image.copy()
        hydrated.image = None

        fields = {"image": image_copy, "caption": hydrated.caption}
        if hydrated.cond_dropout is not None:
            fields["cond_dropout"] = hydrated.cond_dropout
        fields["runt_size"] = hydrated.runt_size

        return fields

    def __update_image_train_items(self, dropout_fraction: float):
        """Replace the current item list with a newly shuffled one from the data loader."""
        refreshed = self.data_loader.get_shuffled_image_buckets(dropout_fraction)
        self.image_train_items = refreshed

def build_torch_dataloader(dataset, batch_size) -> torch.utils.data.DataLoader:
    """
    Wrap `dataset` in a torch `DataLoader` that batches with `collate_fn` and does not shuffle.

    dataset: dataset to read examples from
    batch_size: examples per batch; also the upper bound on the number of worker processes
    returns: the configured `torch.utils.data.DataLoader`
    """
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # min(batch_size, None) would raise TypeError, so fall back to a single worker.
    cpu_count = os.cpu_count() or 1
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=min(batch_size, cpu_count),
        collate_fn=collate_fn
    )
    return dataloader


def collate_fn(batch):
    """
    Merge a list of per-example dicts into a single batch dict.

    batch: list of dicts, each with "image", "caption", "tokens" and "runt_size"
    returns: dict with stacked "tokens", stacked float "image", the list of
             "captions", and the "runt_size" taken from the first example
    """
    # Pull each field out across the whole batch, one field at a time
    images, captions, tokens = (
        [example[key] for example in batch] for key in ("image", "caption", "tokens")
    )
    # Only the first example's runt_size is carried over to the batch
    runt_size = batch[0]["runt_size"]

    stacked_images = torch.stack(images).to(memory_format=torch.contiguous_format).float()

    return {
        "tokens": torch.stack(tuple(tokens)),
        "image": stacked_images,
        "captions": captions,
        "runt_size": runt_size,
    }
hey look ed2 2022-12-17 20:32:48 -07:00			`"""`
			`Copyright [2022] Victor C Hall`

			`Licensed under the GNU Affero General Public License;`
			`You may not use this code except in compliance with the License.`
			`You may obtain a copy of the License at`

			`https://www.gnu.org/licenses/agpl-3.0.en.html`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`"""`
various tweaks and bugfixes over holidays 2022-12-27 12:25:32 -07:00			`import logging`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00			`import os`

hey look ed2 2022-12-17 20:32:48 -07:00			`import torch`
			`from torch.utils.data import Dataset`
Don't need to set data loader singleton; formatting tweaks 2023-01-29 18:31:57 -07:00			`from data.data_loader import DataLoaderMultiAspect`
hey look ed2 2022-12-17 20:32:48 -07:00			`from data.image_train_item import ImageTrainItem`
			`import random`
			`from torchvision import transforms`
			`from transformers import CLIPTokenizer`
			`import torch.nn.functional as F`

			`class EveryDreamBatch(Dataset):`
			`"""`
Update documentation 2023-01-29 18:58:42 -07:00			data_loader: `DataLoaderMultiAspect` object
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`debug_level: 0=none, 1=print drops due to unfilled batches on aspect ratio buckets, 2=debug info per image, 3=save crops to disk for inspection`
hey look ed2 2022-12-17 20:32:48 -07:00			`conditional_dropout: probability of dropping the caption for a given image`
Update documentation 2023-01-29 18:58:42 -07:00			`crop_jitter: number of pixels to jitter the crop by, only for non-square images`
			`seed: random seed`
hey look ed2 2022-12-17 20:32:48 -07:00			`"""`
			`def __init__(self,`
Don't need to set data loader singleton; formatting tweaks 2023-01-29 18:31:57 -07:00			`data_loader: DataLoaderMultiAspect,`
hey look ed2 2022-12-17 20:32:48 -07:00			`debug_level=0,`
			`conditional_dropout=0.02,`
			`crop_jitter=20,`
			`seed=555,`
			`tokenizer=None,`
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`retain_contrast=False,`
shuffle tags arg 2023-01-06 17:12:52 -07:00			`shuffle_tags=False,`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`rated_dataset=False,`
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00			`rated_dataset_dropout_target=0.5,`
			`name='train'`
hey look ed2 2022-12-17 20:32:48 -07:00			`):`
Push DLMA into main, pass config to resolve This patch * passes the configuration (`argparse.Namespace`) to the resolver, * pushes the DLMA code into the main function, * makes DLMA take a `list[ImageTrainItem]` instead of `data_root`, * makes `EveryDreamBatch` take `DLMA` instead of `data_root`, etc. * allows `data_root` to be a list. By doing these things, both `EveryDreamBatch` and DLMA can be free from data resolution logic. It also reduces the number of arguments which need to be passed down to EDB and DLMA. 2023-01-29 18:08:54 -07:00			`self.data_loader = data_loader`
			`self.batch_size = data_loader.batch_size`
hey look ed2 2022-12-17 20:32:48 -07:00			`self.debug_level = debug_level`
			`self.conditional_dropout = conditional_dropout`
			`self.crop_jitter = crop_jitter`
			`self.unloaded_to_idx = 0`
			`self.tokenizer = tokenizer`
			`self.max_token_length = self.tokenizer.model_max_length`
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`self.retain_contrast = retain_contrast`
shuffle tags arg 2023-01-06 17:12:52 -07:00			`self.shuffle_tags = shuffle_tags`
			`self.seed = seed`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`self.rated_dataset = rated_dataset`
			`self.rated_dataset_dropout_target = rated_dataset_dropout_target`
Don't need to set data loader singleton; formatting tweaks 2023-01-29 18:31:57 -07:00			`# First epoch always trains on all images`
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00			`self.image_train_items = []`
			`self.__update_image_train_items(1.0)`
			`self.name = name`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`num_images = len(self.image_train_items)`
log ed batch name on creation 2023-02-07 10:08:19 -07:00			`logging.info(f" ** Dataset '{name}': {num_images / self.batch_size:.0f} batches, num_images: {num_images}, batch_size: {self.batch_size}")`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`def shuffle(self, epoch_n: int, max_epochs: int):`
Implemented loading captions from yaml file 2023-01-07 11:57:23 -07:00			`self.seed += 1`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00
Don't need to set data loader singleton; formatting tweaks 2023-01-29 18:31:57 -07:00			`if self.rated_dataset:`
			`dropout_fraction = (max_epochs - (epoch_n * self.rated_dataset_dropout_target)) / max_epochs`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`else:`
Don't need to set data loader singleton; formatting tweaks 2023-01-29 18:31:57 -07:00			`dropout_fraction = 1.0`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00			`self.__update_image_train_items(dropout_fraction)`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00
hey look ed2 2022-12-17 20:32:48 -07:00			`def __len__(self):`
Implemented system to train on a subset of the dataset, favouring higher rated images 2023-01-14 06:00:30 -07:00			`return len(self.image_train_items)`
hey look ed2 2022-12-17 20:32:48 -07:00
			`def __getitem__(self, i):`
			`example = {}`

revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`train_item = self.__get_image_for_trainer(self.image_train_items[i], self.debug_level)`

			`if self.retain_contrast:`
			`std_dev = 1.0`
			`mean = 0.0`
			`else:`
			`std_dev = 0.5`
			`mean = 0.5`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00
			`image_transforms = transforms.Compose(`
			`[`
			`transforms.ToTensor(),`
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`transforms.Normalize([mean], [std_dev]),`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`]`
			`)`

Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`if self.shuffle_tags:`
			`example["caption"] = train_item["caption"].get_shuffled_caption(self.seed)`
			`else:`
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`example["caption"] = train_item["caption"].get_caption()`
shuffle tags arg 2023-01-06 17:12:52 -07:00
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`example["image"] = image_transforms(train_item["image"])`

Fix cond_dropout and rating handling 2023-03-12 17:36:59 -06:00			`if random.random() > (train_item.get("cond_dropout", self.conditional_dropout)):`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00			`example["tokens"] = self.tokenizer(example["caption"],`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`truncation=True,`
			`padding="max_length",`
			`max_length=self.tokenizer.model_max_length,`
			`).input_ids`
			`else:`
			`example["tokens"] = self.tokenizer(" ",`
			`truncation=True,`
			`padding="max_length",`
			`max_length=self.tokenizer.model_max_length,`
			`).input_ids`
shuffle tags arg 2023-01-06 17:12:52 -07:00
fix some quality issues 2022-12-20 01:30:42 -07:00			`example["tokens"] = torch.tensor(example["tokens"])`
Support more control regarding caption tag shuffeling using yaml files 2023-01-07 09:29:09 -07:00
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`example["runt_size"] = train_item["runt_size"]`
hey look ed2 2022-12-17 20:32:48 -07:00
			`return example`

revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`def __get_image_for_trainer(self, image_train_item: ImageTrainItem, debug_level=0):`
hey look ed2 2022-12-17 20:32:48 -07:00			`example = {}`
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`save = debug_level > 2`
hey look ed2 2022-12-17 20:32:48 -07:00
revert multiline txt for now due to bug 2023-02-28 19:14:19 -07:00			`image_train_tmp = image_train_item.hydrate(crop=False, save=save, crop_jitter=self.crop_jitter)`
hey look ed2 2022-12-17 20:32:48 -07:00
fix mem leak on huge data, rework optimizer to separate json, add lion optimizer 2023-02-25 13:05:22 -07:00			`example["image"] = image_train_tmp.image.copy() # hack for now to avoid memory leak`
			`image_train_tmp.image = None # hack for now to avoid memory leak`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`example["caption"] = image_train_tmp.caption`
Fix cond_dropout and rating handling 2023-03-12 17:36:59 -06:00			`if image_train_tmp.cond_dropout is not None:`
			`example["cond_dropout"] = image_train_tmp.cond_dropout`
bunch of updates, grad ckpting, no drop bucket, shuffle every epoch 2023-01-01 08:45:18 -07:00			`example["runt_size"] = image_train_tmp.runt_size`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00
hey look ed2 2022-12-17 20:32:48 -07:00			`return example`
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00
			`def __update_image_train_items(self, dropout_fraction: float):`
			`self.image_train_items = self.data_loader.get_shuffled_image_buckets(dropout_fraction)`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00
update EveryDreamValidator for noprompt's changes 2023-02-07 09:32:54 -07:00			`def build_torch_dataloader(dataset, batch_size) -> torch.utils.data.DataLoader:`
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00			`dataloader = torch.utils.data.DataLoader(`
update EveryDreamValidator for noprompt's changes 2023-02-07 09:32:54 -07:00			`dataset,`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00			`batch_size= batch_size,`
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00			`shuffle=False,`
set # of data loaders by the min of batch size or cpu count, do not do an rg b conversion when only loading image metadata 2023-04-14 13:59:28 -06:00			`num_workers=min(batch_size, os.cpu_count()),`
GH-36: Add support for validation split (WIP) Co-authored-by: Damian Stewart <office@damianstewart.com> 2023-02-06 23:10:34 -07:00			`collate_fn=collate_fn`
			`)`
			`return dataloader`


			`def collate_fn(batch):`
			`"""`
			`Collates batches`
			`"""`
			`images = [example["image"] for example in batch]`
			`captions = [example["caption"] for example in batch]`
			`tokens = [example["tokens"] for example in batch]`
			`runt_size = batch[0]["runt_size"]`

			`images = torch.stack(images)`
			`images = images.to(memory_format=torch.contiguous_format).float()`

			`ret = {`
			`"tokens": torch.stack(tuple(tokens)),`
			`"image": images,`
			`"captions": captions,`
			`"runt_size": runt_size,`
			`}`
			`del batch`
update EveryDreamValidator for noprompt's changes 2023-02-07 09:32:54 -07:00			`return ret`