360 lines
13 KiB
Python
360 lines
13 KiB
Python
"""
|
|
Copyright [2022] Victor C Hall
|
|
|
|
Licensed under the GNU Affero General Public License;
|
|
You may not use this code except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
https://www.gnu.org/licenses/agpl-3.0.en.html
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
"""
|
|
import bisect
|
|
import logging
|
|
import math
|
|
import os
|
|
import random
|
|
import typing
|
|
import yaml
|
|
|
|
import PIL
|
|
import PIL.Image as Image
|
|
import PIL.ImageOps as ImageOps
|
|
import numpy as np
|
|
from torchvision import transforms
|
|
|
|
# Type alias for "an ImageCaption or None". The class name is quoted (forward
# reference) because ImageCaption is defined further down in this module.
OptionalImageCaption = typing.Optional['ImageCaption']
|
|
|
|
class ImageCaption:
    """
    Represents the various parts of an image caption: a main prompt that is
    always emitted, plus a list of optional tags (with optional weights) that
    can be appended in original or shuffled order.
    """
    def __init__(self, main_prompt: str, rating: float, tags: list[str], tag_weights: list[float], max_target_length: int, use_weights: bool):
        """
        :param main_prompt: The part of the caption which should always be included
        :param rating: rating value stored as-is and returned by rating()
        :param tags: list of tags to pick from to fill the caption
        :param tag_weights: weights to indicate which tags are more desired and should be picked preferably
        :param max_target_length: The desired maximum length of a generated caption (falsy values fall back to 2048)
        :param use_weights: if true, weights are considered when shuffling tags
        """
        self.__main_prompt = main_prompt
        self.__rating = rating
        self.__tags = tags
        self.__tag_weights = tag_weights
        self.__max_target_length = max_target_length or 2048
        self.__use_weights = use_weights

        if use_weights:
            missing = len(tags) - len(tag_weights)
            if missing > 0:
                # Pad with neutral weights. A new list is built (instead of
                # extend()) so the caller's list is never mutated.
                self.__tag_weights = list(tag_weights) + [1.0] * missing
            elif missing < 0:
                # More weights than tags: drop the surplus.
                self.__tag_weights = tag_weights[:len(tags)]

    def rating(self) -> float:
        """
        :return: the rating passed to the constructor
        """
        return self.__rating

    def get_shuffled_caption(self, seed: int) -> str:
        """
        returns the caption as a string with a random selection of the tags in random order
        :param seed used to initialize the randomizer
        :return: generated caption string
        """
        if not self.__tags:
            return self.__main_prompt

        if self.__use_weights:
            # `or ""` (not `or 0`): an empty/None prompt must count as length 0
            # instead of raising inside len() and discarding the configured limit.
            max_target_tag_length = self.__max_target_length - len(self.__main_prompt or "")
            tags_caption = self.__get_weighted_shuffled_tags(seed, self.__tags, self.__tag_weights, max_target_tag_length)
        else:
            tags_caption = self.__get_shuffled_tags(seed, self.__tags)

        return self.__main_prompt + ", " + tags_caption

    def get_caption(self) -> str:
        """
        :return: the main prompt followed by all tags in their original order
        """
        if self.__tags:
            return self.__main_prompt + ", " + ", ".join(self.__tags)
        return self.__main_prompt

    @staticmethod
    def __get_weighted_shuffled_tags(seed: int, tags: list[str], weights: list[float], max_target_tag_length: int) -> str:
        """
        Draws tags without replacement, each draw weighted by the remaining
        weights, until the tags run out or the caption built so far has reached
        max_target_tag_length characters. The tag that crosses the limit is
        still included.
        """
        picker = random.Random(seed)
        # Work on copies: every pick removes the chosen entry.
        remaining_tags = tags.copy()
        remaining_weights = weights.copy()

        caption = ""
        while len(remaining_tags) != 0 and len(caption) < max_target_tag_length:
            # Running totals of the remaining weights; a uniform point in
            # [0, total] then selects an index via binary search.
            cum_weights = []
            weight_sum = 0.0
            for weight in remaining_weights:
                weight_sum += weight
                cum_weights.append(weight_sum)

            point = picker.uniform(0, weight_sum)
            # Clamp so an unexpected point past the last total can never index
            # out of range.
            pos = min(bisect.bisect_left(cum_weights, point), len(cum_weights) - 1)

            remaining_weights.pop(pos)
            tag = remaining_tags.pop(pos)

            if caption:
                caption += ", "
            caption += tag

        return caption

    @staticmethod
    def __get_shuffled_tags(seed: int, tags: list[str]) -> str:
        """
        Returns all tags joined by ", " in an order determined by seed.
        """
        # Shuffle a copy: shuffling in place would reorder the instance's tag
        # list, making repeated calls with the same seed return different
        # results and changing the output of get_caption().
        shuffled = list(tags)
        random.Random(seed).shuffle(shuffled)
        return ", ".join(shuffled)
|
|
|
|
class ImageTrainItem:
    """
    A single training image together with its caption and the target
    resolution it will be cropped/resized to.

    image: PIL.Image (or [] until hydrate() has been called)
    caption: the ImageCaption for this image
    aspects: candidate (width, height) targets; the one closest to the image's aspect ratio is chosen
    target_wh: the chosen (width, height), or None if it could not be determined
    pathname: path to image file
    flip_p: probability of flipping image (0.0 to 1.0)
    is_undersized: True when the image has fewer pixels than the target size enlarged by 2% per side
    error: exception caught while inspecting the image file, or None
    """
    def __init__(self,
                 image: PIL.Image,
                 caption: ImageCaption,
                 aspects: list[float],
                 pathname: str,
                 flip_p=0.0,
                 multiplier: float=1.0,
                 cond_dropout=None,
                 shuffle_tags=False,
                 batch_id: str=None
                 ):
        """
        :param image: already loaded image, or None to load lazily via hydrate()
        :param caption: caption object for this image
        :param aspects: candidate target sizes, each indexable as (width, height)
        :param pathname: path of the image file on disk
        :param flip_p: probability passed to the random horizontal flip
        :param multiplier: stored as-is; not used within this class
        :param cond_dropout: stored as-is; not used within this class
        :param shuffle_tags: stored as-is; not used within this class
        :param batch_id: batch identifier; falsy values fall back to DEFAULT_BATCH_ID
        """
        self.caption = caption
        self.aspects = aspects
        self.pathname = pathname
        self.flip = transforms.RandomHorizontalFlip(p=flip_p)
        self.cropped_img = None
        self.runt_size = 0
        self.multiplier = multiplier
        self.cond_dropout = cond_dropout
        self.shuffle_tags = shuffle_tags
        self.batch_id = batch_id or DEFAULT_BATCH_ID
        self.target_wh = None

        self.image_size = None
        # PIL images do not implement __len__, so only measure objects that do;
        # calling len() unconditionally raised TypeError for a real PIL image.
        if image is None or (hasattr(image, "__len__") and len(image) == 0):
            self.image = []
        else:
            self.image = image
            self.image_size = image.size

        self.is_undersized = False
        self.error = None
        self.__compute_target_width_height()

    def load_image(self):
        """
        Opens self.pathname, converts it to RGB and applies the EXIF
        orientation (best effort).

        :return: the loaded image
        :raises SyntaxError: re-raised (after logging) when PIL reports it for the file
        """
        try:
            # convert() returns an independent in-memory image, so the source
            # file handle can be closed as soon as the conversion is done.
            with PIL.Image.open(self.pathname) as source:
                image = source.convert('RGB')
        except SyntaxError as e:
            # Previously swallowed, which then failed with UnboundLocalError on
            # the return statement; surface the real cause instead.
            logging.error(f"Error loading image {self.pathname}: {e}")
            raise
        return self._try_transpose(image, print_error=False)

    def _try_transpose(self, image, print_error=False):
        """
        Applies the EXIF orientation to the image. On any failure the image is
        returned unchanged; a warning is logged only when print_error is set.
        """
        try:
            image = ImageOps.exif_transpose(image)
        except Exception as e:
            if print_error:
                logging.warning(F"Error rotating image: {e} on {self.pathname}, image will be loaded as is, EXIF may be corrupt")
        return image

    def _needs_transpose(self, image, print_error=False):
        """
        Reports whether the image's EXIF orientation tag (0x0112) is one that
        swaps width and height. Returns False when the tag cannot be read.
        """
        try:
            exif = image.getexif()
            orientation = exif.get(0x0112)
            # Orientation values, per
            # https://pillow.readthedocs.io/en/stable/_modules/PIL/ImageOps.html#exif_transpose
            #   2: FLIP_LEFT_RIGHT, 3: ROTATE_180, 4: FLIP_TOP_BOTTOM,
            #   5: TRANSPOSE, 6: ROTATE_270, 7: TRANSVERSE, 8: ROTATE_90
            # Only 5-8 exchange the two axes.
            return orientation in [5, 6, 7, 8]
        except Exception as e:
            if print_error:
                logging.warning(F"Error rotating image: {e} on {self.pathname}, image will be loaded as is, EXIF may be corrupt")
        return False

    def _percent_random_crop(self, image, crop_jitter=0.02):
        """
        randomly crops the image by a percentage of the image size on each of the four sides

        Each side is trimmed by an independent random amount between 0 and
        min(width, height) * crop_jitter pixels.
        """
        width, height = image.size
        max_crop_pixels = min(width, height) * crop_jitter

        # Draw order (left, right, top, bottom) is kept stable so seeded runs
        # reproduce the same crops.
        left_crop_pixels = random.uniform(0, max_crop_pixels)
        right_crop_pixels = random.uniform(0, max_crop_pixels)
        top_crop_pixels = random.uniform(0, max_crop_pixels)
        bottom_crop_pixels = random.uniform(0, max_crop_pixels)

        crop_box = (
            left_crop_pixels,
            top_crop_pixels,
            width - right_crop_pixels,
            height - bottom_crop_pixels,
        )
        cropped_image = image.crop(crop_box)

        # Recompute integer dimensions from the fractional trims and resize the
        # crop to them, keeping the longer side fixed.
        cropped_width = width - int(left_crop_pixels + right_crop_pixels)
        cropped_height = height - int(top_crop_pixels + bottom_crop_pixels)
        cropped_aspect_ratio = cropped_width / cropped_height

        if cropped_aspect_ratio > 1:
            new_width = cropped_width
            new_height = int(cropped_width / cropped_aspect_ratio)
        else:
            new_width = int(cropped_height * cropped_aspect_ratio)
            new_height = cropped_height

        return cropped_image.resize((new_width, new_height))

    def _debug_save_image(self, image, folder=""):
        """
        Saves the image under test/output/<folder>/ using the base name of
        self.pathname. Save errors are printed, not raised.
        """
        base_name = os.path.basename(self.pathname)
        target_dir = os.path.join('test/output', folder)
        target_file = os.path.join(target_dir, base_name)

        # exist_ok avoids the check-then-create race when several workers save at once
        os.makedirs(target_dir, exist_ok=True)

        try:
            image.save(target_file)
        except Exception as e:
            print(f"error for debug saving image of {self.pathname}: {e}")

    def _trim_to_aspect(self, image, target_wh):
        """
        Crops the image along its over-long axis so its aspect ratio matches
        target_wh (width, height). The crop offset is drawn from a triangular
        distribution over the surplus pixels.

        :return: the cropped image (unchanged when the aspect ratios are equal)
        :raises Exception: any error is logged and re-raised
        """
        try:
            width, height = image.size
            target_aspect = target_wh[0] / target_wh[1]
            image_aspect = width / height

            if image_aspect > target_aspect:
                # too wide: remove columns
                target_width = int(height * target_aspect)
                overwidth = width - target_width
                left = random.triangular(0, overwidth)
                left = int(min(max(0, left), overwidth))
                right = width - overwidth + left
                image = image.crop((left, 0, right, height))
            elif target_aspect > image_aspect:
                # too tall: remove rows
                target_height = int(width / target_aspect)
                overheight = height - target_height
                top = random.triangular(0, overheight)
                top = int(min(max(0, top), overheight))
                bottom = height - overheight + top
                image = image.crop((0, top, width, bottom))
        except Exception as e:
            logging.error(f"fatal error trimming image {self.pathname}: {e}")
            raise
        return image

    def hydrate(self, save=False, crop_jitter=0.02):
        """
        Loads the image from disk, optionally jitter-crops it, trims it to the
        target aspect ratio, resizes it to self.target_wh, applies the random
        horizontal flip and stores the result in self.image as a uint8 array.

        Requires self.target_wh to have been resolved (it is None when
        inspecting the file failed; see self.error).

        save: save the cropped image to disk, for manual inspection of resize/crop
              (currently unused; enable the debug call below instead)
        crop_jitter: upper bound for the random per-side crop fraction
        :return: self
        """
        image = self.load_image()

        width, height = image.size

        # Jitter is limited to the relative headroom the image has over the
        # target size (on its tighter axis), capped at crop_jitter and floored at 0.
        target_width, target_height = self.target_wh[0], self.target_wh[1]
        img_jitter = min((width - target_width) / target_width, (height - target_height) / target_height)
        img_jitter = max(min(img_jitter, crop_jitter), 0.0)

        if img_jitter > 0.0:
            image = self._percent_random_crop(image, img_jitter)

        image = self._trim_to_aspect(image, self.target_wh)

        self.image = image.resize(self.target_wh)

        self.image = self.flip(self.image)
        # Remove comment here to view image cropping outputs
        #self._debug_save_image(self.image, "final")

        self.image = np.array(self.image).astype(np.uint8)

        return self

    def __compute_target_width_height(self):
        """
        Reads the image size from disk (accounting for EXIF orientations that
        swap the axes), picks the entry of self.aspects with the closest aspect
        ratio as self.target_wh and sets self.is_undersized.

        Any exception is stored in self.error and leaves self.target_wh as None.
        """
        self.target_wh = None
        try:
            with PIL.Image.open(self.pathname) as image:
                if self._needs_transpose(image):
                    height, width = image.size
                else:
                    width, height = image.size
                image_aspect = width / height
                target_wh = min(self.aspects, key=lambda wh: abs(wh[0] / wh[1] - image_aspect))

                # undersized = fewer pixels than the target enlarged by 2% on each axis
                self.is_undersized = (width * height) < (target_wh[0]*1.02 * target_wh[1]*1.02)
                self.target_wh = target_wh
        except Exception as e:
            self.error = e

    @staticmethod
    def __autocrop(image: PIL.Image, q=.404):
        """
        crops image to a random square inside small axis using a truncated gaussian distribution across the long axis

        The offset along the long axis is |gauss(0, sigma)| with
        sigma = max(surplus * q, 1), truncated to at most half the surplus.
        Square images are returned unchanged.
        """
        x, y = image.size
        if x == y:
            return image

        surplus = abs(x - y)
        sigma = max(surplus * q, 1)
        offset = math.trunc(min(abs(random.gauss(0, sigma)), surplus / 2))

        # the offset applies to whichever axis is longer
        x_crop, y_crop = (offset, 0) if x > y else (0, offset)

        side = min(x, y)
        return image.crop((x_crop, y_crop, x_crop + side, y_crop + side))
|
|
|
|
|
|
# Batch id that ImageTrainItem falls back to when no (or a falsy) batch_id is supplied.
DEFAULT_BATCH_ID = "default_batch"
|