""" Copyright [2022] Victor C Hall Licensed under the GNU Affero General Public License; You may not use this code except in compliance with the License. You may obtain a copy of the License at https://www.gnu.org/licenses/agpl-3.0.en.html Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import bisect import logging import math import os import random import typing import yaml import PIL import PIL.Image as Image import PIL.ImageOps as ImageOps import numpy as np from torchvision import transforms OptionalImageCaption = typing.Optional['ImageCaption'] class ImageCaption: """ Represents the various parts of an image caption """ def __init__(self, main_prompt: str, rating: float, tags: list[str], tag_weights: list[float], max_target_length: int, use_weights: bool): """ :param main_prompt: The part of the caption which should always be included :param tags: list of tags to pick from to fill the caption :param tag_weights: weights to indicate which tags are more desired and should be picked preferably :param max_target_length: The desired maximum length of a generated caption :param use_weights: if ture, weights are considered when shuffling tags """ self.__main_prompt = main_prompt self.__rating = rating self.__tags = tags self.__tag_weights = tag_weights self.__max_target_length = max_target_length or 2048 self.__use_weights = use_weights if use_weights and len(tags) > len(tag_weights): self.__tag_weights.extend([1.0] * (len(tags) - len(tag_weights))) if use_weights and len(tag_weights) > len(tags): self.__tag_weights = tag_weights[:len(tags)] def rating(self) -> float: return self.__rating def get_shuffled_caption(self, seed: int) -> str: """ returns the caption a string with a random selection of the tags in random order :param seed used to initialize the randomizer :return: generated caption string """ if self.__tags: try: max_target_tag_length = self.__max_target_length - len(self.__main_prompt or 0) except Exception as e: print() logging.error(f"Error determining length for: {e} on {self.__main_prompt}") print() max_target_tag_length = 2048 if self.__use_weights: tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length) else: tags_caption = self.__get_shuffled_tags(seed, self.__tags) return self.__main_prompt + ", " + tags_caption return self.__main_prompt def get_caption(self) -> str: if self.__tags: return self.__main_prompt + ", " + ", ".join(self.__tags) return self.__main_prompt @staticmethod def __get_weighted_shuffled_tags(seed: int, tags: list[str], weights: list[float], max_target_tag_length: int) -> str: picker = random.Random(seed) tags_copy = tags.copy() weights_copy = weights.copy() caption = "" while len(tags_copy) != 0 and len(caption) < max_target_tag_length: cum_weights = [] weight_sum = 0.0 for weight in weights_copy: weight_sum += weight cum_weights.append(weight_sum) point = picker.uniform(0, weight_sum) pos = bisect.bisect_left(cum_weights, point) weights_copy.pop(pos) tag = tags_copy.pop(pos) if caption: caption += ", " caption += tag return caption @staticmethod def __get_shuffled_tags(seed: int, tags: list[str]) -> str: random.Random(seed).shuffle(tags) return ", ".join(tags) class ImageTrainItem: """ image: PIL.Image identifier: caption, target_aspect: (width, height), pathname: path to image file flip_p: probability of flipping image (0.0 to 1.0) rating: the relative rating of the images. The rating is measured in comparison to the other images. """ def __init__(self, image: PIL.Image, caption: ImageCaption, aspects: list[float], pathname: str, flip_p=0.0, multiplier: float=1.0, cond_dropout=None, shuffle_tags=False, batch_id: str=None ): self.caption = caption self.aspects = aspects self.pathname = pathname self.flip = transforms.RandomHorizontalFlip(p=flip_p) self.cropped_img = None self.runt_size = 0 self.multiplier = multiplier self.cond_dropout = cond_dropout self.shuffle_tags = shuffle_tags self.batch_id = batch_id or DEFAULT_BATCH_ID self.target_wh = None self.image_size = None if image is None or len(image) == 0: self.image = [] else: self.image = image self.image_size = image.size #self.target_size = None self.is_undersized = False self.error = None self.__compute_target_width_height() def load_image(self): try: image = PIL.Image.open(self.pathname).convert('RGB') image = self._try_transpose(image, print_error=False) except SyntaxError as e: pass return image def _try_transpose(self, image, print_error=False): try: image = ImageOps.exif_transpose(image) except Exception as e: logging.warning(F"Error rotating image: {e} on {self.pathname}, image will be loaded as is, EXIF may be corrupt") if print_error else None pass return image def _needs_transpose(self, image, print_error=False): try: exif = image.getexif() orientation = exif.get(0x0112) """ https://pillow.readthedocs.io/en/stable/_modules/PIL/ImageOps.html#exif_transpose method = { 2: Image.Transpose.FLIP_LEFT_RIGHT, 3: Image.Transpose.ROTATE_180, 4: Image.Transpose.FLIP_TOP_BOTTOM, 5: Image.Transpose.TRANSPOSE, 6: Image.Transpose.ROTATE_270, 7: Image.Transpose.TRANSVERSE, 8: Image.Transpose.ROTATE_90, }.get(orientation) """ return orientation in [5, 6, 7, 8] except Exception as e: logging.warning(F"Error rotating image: {e} on {self.pathname}, image will be loaded as is, EXIF may be corrupt") if print_error else None pass return False def _percent_random_crop(self, image, crop_jitter=0.02): """ randomly crops the image by a percentage of the image size on each of the four sides """ width, height = image.size max_crop_pixels = min(width, height) * crop_jitter left_crop_pixels = random.uniform(0, max_crop_pixels) right_crop_pixels = random.uniform(0, max_crop_pixels) top_crop_pixels = random.uniform(0, max_crop_pixels) bottom_crop_pixels = random.uniform(0, max_crop_pixels) left = left_crop_pixels right = width - right_crop_pixels top = top_crop_pixels bottom = height - bottom_crop_pixels #print(f"\n *** jitter l: {left}, t: {top}, r: {right}, b: {bottom}, orig w: {width}, h: {height}, max_crop_pixels: {max_crop_pixels}") cropped_image = image.crop((left, top, right, bottom)) cropped_width = width - int(left_crop_pixels + right_crop_pixels) cropped_height = height - int(top_crop_pixels + bottom_crop_pixels) cropped_aspect_ratio = cropped_width / cropped_height if cropped_aspect_ratio > 1: new_width = cropped_width new_height = int(cropped_width / cropped_aspect_ratio) else: new_width = int(cropped_height * cropped_aspect_ratio) new_height = cropped_height cropped_image = cropped_image.resize((new_width, new_height)) return cropped_image def _debug_save_image(self, image, folder=""): base_name = os.path.basename(self.pathname) target_dir = os.path.join('test/output', folder) target_file = os.path.join(target_dir, base_name) if not os.path.exists(target_dir): os.makedirs(target_dir) try: #print(f"saving to test/output: {os.path.join('test/output', folder, base_name)}") image.save(target_file) except Exception as e: print(f"error for debug saving image of {self.pathname}: {e}") pass def _trim_to_aspect(self, image, target_wh): try: width, height = image.size target_aspect = target_wh[0] / target_wh[1] # 0.60 image_aspect = width / height # 0.5865 #self._debug_save_image(image, "precrop") if image_aspect > target_aspect: target_width = int(height * target_aspect) overwidth = width - target_width l = random.triangular(0, overwidth) #print(f"l: {l}, overwidth: {overwidth}") l = max(0, l) l = int(min(l, overwidth)) r = width - overwidth + l #print(f"\n_trim_to_aspect actual ar: {image_aspect}, target ar:{target_aspect:.2f}, {image.size}, cropping with box: {l}, 0, {r}, {height}, {self.pathname}") image = image.crop((l, 0, r, height)) elif target_aspect > image_aspect: target_height = int(width / target_aspect) overheight = height - target_height t = random.triangular(0, overheight) #print(f"t: {t}, overheight: {overheight}") t = max(0, t) t = int(min(t, overheight)) b = height - overheight + t #print(f"\n_trim_to_aspect actual ar: {image_aspect}, target ar:{target_aspect:.2f}, {image.size}, cropping with box: 0, {t}, {width}, {b}, {self.pathname}") image = image.crop((0, t, width, b)) except Exception as e: logging.error(f"fatal error trimming image {self.pathname}: {e}") raise e return image def hydrate(self, save=False, crop_jitter=0.02): """ save: save the cropped image to disk, for manual inspection of resize/crop """ image = self.load_image() width, height = image.size img_jitter = min((width-self.target_wh[0])/self.target_wh[0], (height-self.target_wh[1])/self.target_wh[1]) img_jitter = min(img_jitter, crop_jitter) img_jitter = max(img_jitter, 0.0) if img_jitter > 0.0: image = self._percent_random_crop(image, img_jitter) image = self._trim_to_aspect(image, self.target_wh) self.image = image.resize(self.target_wh) self.image = self.flip(self.image) # Remove comment here to view image cropping outputs #self._debug_save_image(self.image, "final") self.image = np.array(self.image).astype(np.uint8) return self def __compute_target_width_height(self): self.target_wh = None try: with PIL.Image.open(self.pathname) as image: if self._needs_transpose(image): height, width = image.size else: width, height = image.size image_aspect = width / height target_wh = min(self.aspects, key=lambda aspects:abs(aspects[0]/aspects[1] - image_aspect)) self.is_undersized = (width * height) < (target_wh[0]*1.02 * target_wh[1]*1.02) self.target_wh = target_wh except Exception as e: self.error = e @staticmethod def __autocrop(image: PIL.Image, q=.404): """ crops image to a random square inside small axis using a truncated gaussian distribution across the long axis """ x, y = image.size if x != y: if (x > y): rand_x = x - y sigma = max(rand_x * q, 1) else: rand_y = y - x sigma = max(rand_y * q, 1) if (x > y): x_crop_gauss = abs(random.gauss(0, sigma)) x_crop = min(x_crop_gauss, (x - y) / 2) x_crop = math.trunc(x_crop) y_crop = 0 else: y_crop_gauss = abs(random.gauss(0, sigma)) x_crop = 0 y_crop = min(y_crop_gauss, (y - x) / 2) y_crop = math.trunc(y_crop) min_xy = min(x, y) image = image.crop((x_crop, y_crop, x_crop + min_xy, y_crop + min_xy)) return image DEFAULT_BATCH_ID = "default_batch"