2023-01-22 23:46:04 -07:00
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import typing
|
|
|
|
import zipfile
|
2023-01-29 18:08:54 -07:00
|
|
|
import argparse
|
2023-03-08 07:02:14 -07:00
|
|
|
from data.dataset import Dataset
|
2023-01-22 23:46:04 -07:00
|
|
|
|
|
|
|
import tqdm
|
|
|
|
from colorama import Fore, Style
|
|
|
|
|
|
|
|
from data.image_train_item import ImageCaption, ImageTrainItem
|
|
|
|
|
|
|
|
class DataResolver:
    """
    Base class for objects that turn a data root into `ImageTrainItem` objects.

    Concrete resolvers override `image_train_items`; calling it on this base
    class raises `NotImplementedError`.
    """

    def __init__(self, args: argparse.Namespace):
        """
        Store the configuration values read from `args`.

        :param args: EveryDream configuration, an `argparse.Namespace` object.
                     Its `aspects` and `flip_p` attributes are copied onto the resolver.
        """
        self.aspects = args.aspects
        self.flip_p = args.flip_p

    def image_train_items(self, data_root: str) -> list[ImageTrainItem]:
        """
        Produce the `ImageTrainItem` list for a data root.

        :param data_root: The data root, a directory, a file, etc..
        :return: The list of `ImageTrainItem`.
        :raises NotImplementedError: Always, on the base class.
        """
        raise NotImplementedError
|
|
|
|
|
|
|
|
class JSONResolver(DataResolver):
    """
    Resolver that reads its training items from a JSON file.
    """

    def image_train_items(self, json_path: str) -> list[ImageTrainItem]:
        """
        Build `ImageTrainItem` objects from the dataset described by a JSON file.

        The file is loaded through `Dataset.from_json` and the resulting
        dataset is asked for its train items using this resolver's `aspects`.

        :param json_path: The path to the JSON file.
        :return: The list of `ImageTrainItem`.
        """
        dataset = Dataset.from_json(json_path)
        return dataset.image_train_items(self.aspects)
|
2023-01-22 23:46:04 -07:00
|
|
|
|
|
|
|
class DirectoryResolver(DataResolver):
    """
    Resolver that reads its training items from a directory tree.
    """

    def image_train_items(self, data_root: str) -> list[ImageTrainItem]:
        """
        Create `ImageTrainItem` objects with metadata for hydration later.
        Unzips all zip files in `data_root` and then builds the dataset from
        `data_root` via `Dataset.from_path`.

        :param data_root: The root directory to recurse through
        :return: The list of `ImageTrainItem`.
        """
        DirectoryResolver.unzip_all(data_root)
        return Dataset.from_path(data_root).image_train_items(self.aspects)

    @staticmethod
    def unzip_all(path):
        """
        Extract every `.zip` archive found anywhere under `path` into `path`.

        This is best-effort: any exception is logged and swallowed, so the
        method never raises. The first failure stops the remaining extraction.

        :param path: The directory to walk; also the extraction destination.
        """
        try:
            for root, _dirs, files in os.walk(path):
                for file in files:
                    if not file.endswith('.zip'):
                        continue
                    logging.info(f"Unzipping {file}")
                    # Open the archive itself. `path` is the directory being
                    # walked, so passing it to ZipFile can never succeed.
                    archive_path = os.path.join(root, file)
                    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                        zip_ref.extractall(path)
        except Exception as e:
            logging.error(f"Error unzipping files {e}")
|
|
|
|
|
2023-01-29 18:08:54 -07:00
|
|
|
def strategy(data_root: str) -> typing.Type[DataResolver]:
    """
    Pick the resolver class that matches what `data_root` points at.

    :param data_root: The root directory or JSON file to resolve.
    :return: `JSONResolver` for an existing file whose name ends in `.json`,
             `DirectoryResolver` for an existing directory.
    :raises ValueError: If `data_root` is neither of those.
    """
    is_json_file = os.path.isfile(data_root) and data_root.endswith('.json')
    if is_json_file:
        return JSONResolver

    if not os.path.isdir(data_root):
        raise ValueError(f"data_root '{data_root}' is not a valid directory or JSON file.")

    return DirectoryResolver
|
|
|
|
|
2023-01-29 18:08:54 -07:00
|
|
|
def resolve_root(path: str, args: argparse.Namespace) -> list[ImageTrainItem]:
    """
    Resolve the training data found at a single root path.

    The resolver class is chosen by `strategy(path)`, instantiated with
    `args`, and asked for the train items of `path`.

    :param path: The root path to resolve.
    :param args: EveryDream configuration, an `argparse.Namespace` object.
    :return: The list of `ImageTrainItem`.
    """
    resolver_class = strategy(path)
    instance = resolver_class(args)
    return instance.image_train_items(path)
|
2023-01-22 23:46:04 -07:00
|
|
|
|
2023-01-29 18:08:54 -07:00
|
|
|
def resolve(value: typing.Union[dict, str], args: argparse.Namespace) -> list[ImageTrainItem]:
|
2023-01-22 23:46:04 -07:00
|
|
|
"""
|
|
|
|
Resolve the training data from the value.
|
2023-01-29 18:08:54 -07:00
|
|
|
:param value: The value to resolve, either a dict, an array, or a string.
|
|
|
|
:param args: EveryDream configuration, an `argparse.Namespace` object.
|
2023-01-22 23:46:04 -07:00
|
|
|
"""
|
|
|
|
if isinstance(value, str):
|
2023-01-29 18:08:54 -07:00
|
|
|
return resolve_root(value, args)
|
2023-01-22 23:46:04 -07:00
|
|
|
|
|
|
|
if isinstance(value, dict):
|
|
|
|
resolver = value.get('resolver', None)
|
|
|
|
match resolver:
|
|
|
|
case 'directory' | 'json':
|
|
|
|
path = value.get('path', None)
|
2023-01-29 18:08:54 -07:00
|
|
|
return resolve_root(path, args)
|
2023-01-22 23:46:04 -07:00
|
|
|
case 'multi':
|
2023-01-29 18:08:54 -07:00
|
|
|
return resolve(value.get('resolvers', []), args)
|
2023-01-22 23:46:04 -07:00
|
|
|
case _:
|
2023-01-29 18:08:54 -07:00
|
|
|
raise ValueError(f"Cannot resolve training data for resolver value '{resolver}'")
|
|
|
|
|
|
|
|
if isinstance(value, list):
|
|
|
|
items = []
|
|
|
|
for item in value:
|
|
|
|
items += resolve(item, args)
|
|
|
|
return items
|