Merge pull request #7 from JanGerritsen/yaml_caption_files

Support more control over caption tag shuffling using YAML files
2023-01-09 13:58:14 -08:00 · 2023-01-09 13:58:14 -08:00 · 99d8c6bc32
parent 89aa3aa53e f47ceadcc7
commit 99d8c6bc32
5 changed files with 257 additions and 59 deletions
--- a/README.md
+++ b/README.md
@ -34,3 +34,5 @@ Behind the scenes look at how the trainer handles multiaspect and crop jitter
 [Advanced Tweaking](doc/ATWEAKING.md)

 [Chaining training sessions](doc/CHAINING.md)
+
+[Shuffling Tags](doc/SHUFFLING_TAGS.md)
--- a/data/data_loader.py
+++ b/data/data_loader.py
@ -16,9 +16,11 @@ limitations under the License.

 import os
 import logging
+
+import yaml
 from PIL import Image
 import random
-from data.image_train_item import ImageTrainItem
+from data.image_train_item import ImageTrainItem, ImageCaption
 import data.aspects as aspects
 from colorama import Fore, Style
 import zipfile
@ -27,6 +29,8 @@ import PIL

 PIL.Image.MAX_IMAGE_PIXELS = 715827880*4 # increase decompression bomb error limit to 4x default

+DEFAULT_MAX_CAPTION_LENGTH = 2048
+
 class DataLoaderMultiAspect():
    """
    Data loader for multi-aspect-ratio training and bucketing
@ -76,17 +80,64 @@ class DataLoaderMultiAspect():
        return self.image_caption_pairs

    @staticmethod
-    def __read_caption_from_file(file_path, fallback_caption):
-        caption = fallback_caption
+    def __read_caption_from_file(file_path, fallback_caption: ImageCaption) -> ImageCaption:
        try:
            with open(file_path, encoding='utf-8', mode='r') as caption_file:
-                caption = caption_file.read()
+                caption_text = caption_file.read()
+                caption = DataLoaderMultiAspect.__split_caption_into_tags(caption_text)
        except:
            logging.error(f" *** Error reading {file_path} to get caption, falling back to filename")
            caption = fallback_caption
            pass
        return caption

+    @staticmethod
+    def __read_caption_from_yaml(file_path: str, fallback_caption: ImageCaption) -> ImageCaption:
+        with open(file_path, "r") as stream:
+            try:
+                file_content = yaml.safe_load(stream)
+                main_prompt = file_content.get("main_prompt", "")
+                unparsed_tags = file_content.get("tags", [])
+
+                max_caption_length = file_content.get("max_caption_length", DEFAULT_MAX_CAPTION_LENGTH)
+
+                tags = []
+                tag_weights = []
+                last_weight = None
+                weights_differ = False
+                for unparsed_tag in unparsed_tags:
+                    tag = unparsed_tag.get("tag", "").strip()
+                    if len(tag) == 0:
+                        continue
+
+                    tags.append(tag)
+                    tag_weight = unparsed_tag.get("weight", 1.0)
+                    tag_weights.append(tag_weight)
+
+                    if last_weight is not None and weights_differ is False:
+                        weights_differ = last_weight != tag_weight
+
+                    last_weight = tag_weight
+
+                return ImageCaption(main_prompt, tags, tag_weights, max_caption_length, weights_differ)
+
+            except:
+                logging.error(f" *** Error reading {file_path} to get caption, falling back to filename")
+                return fallback_caption
+
+    @staticmethod
+    def __split_caption_into_tags(caption_string: str) -> ImageCaption:
+        """
+        Splits a string by "," into the main prompt and additional tags with equal weights
+        """
+        split_caption = caption_string.split(",")
+        main_prompt = split_caption.pop(0).strip()
+        tags = []
+        for tag in split_caption:
+            tags.append(tag.strip())
+
+        return ImageCaption(main_prompt, tags, [1.0] * len(tags), DEFAULT_MAX_CAPTION_LENGTH, False)
+
    def __prescan_images(self, image_paths: list, flip_p=0.0):
        """
        Create ImageTrainItem objects with metadata for hydration later
@ -95,16 +146,19 @@ class DataLoaderMultiAspect():

        for pathname in tqdm.tqdm(image_paths):
            caption_from_filename = os.path.splitext(os.path.basename(pathname))[0].split("_")[0]
+            caption = DataLoaderMultiAspect.__split_caption_into_tags(caption_from_filename)

-            txt_file_path = os.path.splitext(pathname)[0] + ".txt"
-            caption_file_path = os.path.splitext(pathname)[0] + ".caption"
+            file_path_without_ext = os.path.splitext(pathname)[0]
+            yaml_file_path = file_path_without_ext + ".yaml"
+            txt_file_path = file_path_without_ext + ".txt"
+            caption_file_path = file_path_without_ext + ".caption"

-            if os.path.exists(txt_file_path):
-                caption = self.__read_caption_from_file(txt_file_path, caption_from_filename)                
+            if os.path.exists(yaml_file_path):
+                caption = self.__read_caption_from_yaml(yaml_file_path, caption)
+            elif os.path.exists(txt_file_path):
+                caption = self.__read_caption_from_file(txt_file_path, caption)
            elif os.path.exists(caption_file_path):
-                caption = self.__read_caption_from_file(caption_file_path, caption_from_filename)                
-            else:
-                caption = caption_from_filename
+                caption = self.__read_caption_from_file(caption_file_path, caption)

            try:
                image = Image.open(pathname)
--- a/data/every_dream.py
+++ b/data/every_dream.py
@ -103,6 +103,7 @@ class EveryDreamBatch(Dataset):
        return dls.shared_dataloader.runts

    def shuffle(self, epoch_n):
+        self.seed += 1
        if dls.shared_dataloader:
            dls.shared_dataloader.shuffle()
            self.image_train_items = dls.shared_dataloader.get_all_images()
@ -134,16 +135,15 @@ class EveryDreamBatch(Dataset):
            ]
        )

-        if self.shuffle_tags and "," in train_item['caption']:
-            tags = train_item["caption"].split(",")
-            random.Random(self.seed).shuffle(tags)
-            self.seed += 1
-            train_item["caption"] = ", ".join(tags)
+        if self.shuffle_tags:
+            example["caption"] = train_item["caption"].get_shuffled_caption(self.seed)
+        else:
+            example["caption"] = train_item["caption"].get_caption()

        example["image"] = image_transforms(train_item["image"])

        if random.random() > self.conditional_dropout:
-            example["tokens"] = self.tokenizer(train_item["caption"],
+            example["tokens"] = self.tokenizer(example["caption"],
                                                truncation=True,
                                                padding="max_length",
                                                max_length=self.tokenizer.model_max_length,
@ -156,7 +156,7 @@ class EveryDreamBatch(Dataset):
                                              ).input_ids

        example["tokens"] = torch.tensor(example["tokens"])
-        example["caption"] = train_item["caption"] # for sampling if needed
+
        example["runt_size"] = train_item["runt_size"]

        return example
--- a/data/image_train_item.py
+++ b/data/image_train_item.py
@ -13,16 +13,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-import PIL
-import numpy as np
-from torchvision import transforms, utils
-import random
+import bisect
+import logging
 import math
 import os
-import logging
+import random
+
+import PIL
+import numpy as np
+from torchvision import transforms

 _RANDOM_TRIM = 0.04

+
+class ImageCaption:
+    """
+    Represents the various parts of an image caption
+    """
+
+    def __init__(self, main_prompt: str, tags: list[str], tag_weights: list[float], max_target_length: int, use_weights: bool):
+        """
+        :param main_prompt: The part of the caption which should always be included
+        :param tags: list of tags to pick from to fill the caption
+        :param tag_weights: weights to indicate which tags are more desired and should be picked preferably
+        :param max_target_length: The desired maximum length of a generated caption
+        :param use_weights: if ture, weights are considered when shuffling tags
+        """
+        self.__main_prompt = main_prompt
+        self.__tags = tags
+        self.__tag_weights = tag_weights
+        self.__max_target_length = max_target_length
+        self.__use_weights = use_weights
+        if use_weights and len(tags) > len(tag_weights):
+            self.__tag_weights.extend([1.0] * (len(tags) - len(tag_weights)))
+
+        if use_weights and len(tag_weights) > len(tags):
+            self.__tag_weights = tag_weights[:len(tags)]
+
+    def get_shuffled_caption(self, seed: int) -> str:
+        """
+        returns the caption a string with a random selection of the tags in random order
+        :param seed used to initialize the randomizer
+        :return: generated caption string
+        """
+        max_target_tag_length = self.__max_target_length - len(self.__main_prompt)
+
+        if self.__use_weights:
+            tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length)
+        else:
+            tags_caption = self.__get_shuffled_tags(seed, self.__tags)
+
+        return self.__main_prompt + ", " + tags_caption
+
+    def get_caption(self) -> str:
+        return self.__main_prompt + ", " + ", ".join(self.__tags)
+
+    @staticmethod
+    def __get_weighted_shuffled_tags(seed: int, tags: list[str], weights: list[float], max_target_tag_length: int) -> str:
+        picker = random.Random(seed)
+        tags_copy = tags.copy()
+        weights_copy = weights.copy()
+
+        caption = ""
+        while len(tags_copy) != 0 and len(caption) < max_target_tag_length:
+            cum_weights = []
+            weight_sum = 0.0
+            for weight in weights_copy:
+                weight_sum += weight
+                cum_weights.append(weight_sum)
+
+            point = picker.uniform(0, weight_sum)
+            pos = bisect.bisect_left(cum_weights, point)
+
+            weights_copy.pop(pos)
+            tag = tags_copy.pop(pos)
+            caption += ", " + tag
+
+        return caption
+
+    @staticmethod
+    def __get_shuffled_tags(seed: int, tags: list[str]) -> str:
+        random.Random(seed).shuffle(tags)
+        return ", ".join(tags)
+
+
 class ImageTrainItem():
    """
    image: PIL.Image
@ -31,7 +105,8 @@ class ImageTrainItem():
    pathname: path to image file
    flip_p: probability of flipping image (0.0 to 1.0)
    """
-    def __init__(self, image: PIL.Image, caption: str, target_wh: list, pathname: str, flip_p=0.0):
+
+    def __init__(self, image: PIL.Image, caption: ImageCaption, target_wh: list, pathname: str, flip_p=0.0):
        self.caption = caption
        self.target_wh = target_wh
        self.pathname = pathname
--- a/doc/SHUFFLING_TAGS.md
+++ b/doc/SHUFFLING_TAGS.md
@ -0,0 +1,67 @@
+# Shuffling tags randomly during training
+
+## General shuffling
+
+To help the model generalize better, EveryDream has an option to shuffle tags during the training.
+
+This behavior can be activated using the parameter _--shuffle_tags_. The default is off.
+
+The provided caption, extracted either from the file name or the provided caption file, 
+will be split at each "_,_" into separate chunks.     
+
+The first chunk will always be included in the caption provided during the training, 
+the additional chunks are shuffled into a random order. 
+
+Each epoch the order is reshuffled. _(Remember that each image is shown once per epoch to the model.)_
+
+
+## Weighted shuffling
+
+EveryDream can read caption definitions from YAML files, for fine-tuned definitions.
+
+EveryDream will check for each image if a file with the same name and the extension _.yaml_ is provided.
+
+The expected format of the YAML file:
+````yaml
+main_prompt: A portrait of Cloud Strife
+tags:
+  - tag: low angle shot
+  - tag: looking to the side
+  - tag: holding buster sword
+    weight: 1.5
+  - tag: clouds in background
+    weight: 0.5
+  - tag: smiling
+    weight: 0.8
+max_caption_length: 1024
+````
+
+The main prompt will always be the first part included in the caption.
+The main prompt is optional; you can omit it if you do not want a fixed part at the beginning of the caption.
+
+This is followed by a list of tags. The tags will be shuffled into a random order and added to the caption.
+The tags list is optional.   
+
+The default weight of each tag is _1.0_. A different weight can be optionally specified. 
+Tags with a higher weight have a higher chance of appearing at the front of the caption tag list.
+
+The optional parameter _max_caption_length_ allows the definition of a maximum length of the assembled caption.
+Only whole tags will be processed. If the addition of the next tag exceeds the _max_caption_length_, 
+it will not be added, and the caption will be provided without the other tags for this epoch.  
+
+This can be used to train the model that an image can include a certain aspect, even if it is not 
+explicitly mentioned in the caption. 
+
+
+## General notes regarding token length
+
+For SD, the current implementation of EveryDream can only process the first 75 tokens 
+provided in the caption during training.
+
+This is a base limitation of the SD models. Workarounds exist to extend this number but are currently not
+implemented in EveryDream.
+
+The effect of the limit is that the caption will always be truncated when the maximum number of tokens is
+exceeded. This process does not consider if the cutoff is in the middle of a tag or even in the middle of a
+word if it is translated into several tokens.
+