# auto_caption.py - batch BLIP image captioning script (EveryDream)
import argparse
import glob
import os
from PIL import Image
import sys
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
import torch
import aiohttp
import asyncio
import subprocess
import numpy as np
import io
import aiofiles
SIZE = 384
2022-11-01 18:02:54 -06:00
BLIP_MODEL_URL = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
def get_parser(**parser_kwargs):
parser = argparse.ArgumentParser(**parser_kwargs)
parser.add_argument(
"--img_dir",
type=str,
nargs="?",
const=True,
default="input",
help="directory with images to be captioned",
),
parser.add_argument(
"--out_dir",
type=str,
nargs="?",
const=True,
default="output",
help="directory to put captioned images",
),
parser.add_argument(
"--format",
type=str,
nargs="?",
const=True,
default="filename",
help="'filename', 'mrwho', 'txt', or 'caption'",
2022-10-30 19:59:26 -06:00
),
parser.add_argument(
"--nucleus",
type=bool,
nargs="?",
const=True,
default=False,
help="use nucleus sampling instead of beam",
),
parser.add_argument(
"--q_factor",
type=float,
nargs="?",
const=True,
default=1.0,
2022-10-30 19:59:26 -06:00
help="adjusts the likelihood of a word being repeated",
),
parser.add_argument(
"--min_length",
type=int,
nargs="?",
const=True,
default=22,
2022-10-30 19:59:26 -06:00
help="adjusts the likelihood of a word being repeated",
2022-10-30 22:02:10 -06:00
),
parser.add_argument(
"--torch_device",
type=str,
nargs="?",
const=False,
default="cuda",
help="specify a different torch device, e.g. 'cpu'",
),
2022-10-30 19:59:26 -06:00
return parser
def load_image(raw_image, device):
transform = transforms.Compose([
#transforms.CenterCrop(SIZE),
transforms.Resize((SIZE, SIZE), interpolation=InterpolationMode.BICUBIC),
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])
image = transform(raw_image).unsqueeze(0).to(device)
return image
def get_out_file_name(out_dir, base_name, ext):
    """Return the output path for *base_name* + *ext* inside *out_dir*."""
    file_name = f"{base_name}{ext}"
    return os.path.join(out_dir, file_name)
async def main(opt):
    """Caption every image in opt.img_dir with BLIP and write results to opt.out_dir.

    Depending on opt.format the caption is stored either as the output file
    name ("filename", "mrwho", "joepenna") or as a sidecar text file
    ("txt"/"text" -> .txt, "caption" -> .caption). The BLIP checkpoint is
    downloaded to a local .cache folder on first run.
    """
    print("starting")
    # Deferred import: scripts/BLIP is cloned and added to sys.path by the
    # __main__ bootstrap, so this module only resolves at run time.
    import models.blip

    input_dir = opt.img_dir
    print("input_dir: ", input_dir)

    config_path = "scripts/BLIP/configs/med_config.json"
    cache_folder = ".cache"
    # Derive the cache path from cache_folder instead of repeating ".cache".
    model_cache_path = os.path.join(cache_folder, "model_base_caption_capfilt_large.pth")

    os.makedirs(cache_folder, exist_ok=True)
    os.makedirs(opt.out_dir, exist_ok=True)

    if not os.path.exists(model_cache_path):
        print(f"Downloading model to {model_cache_path}... please wait")
        async with aiohttp.ClientSession() as session:
            async with session.get(BLIP_MODEL_URL) as res:
                # BUGFIX: fail loudly on a bad HTTP response instead of
                # silently caching an error page as the model checkpoint.
                res.raise_for_status()
                with open(model_cache_path, 'wb') as f:
                    async for chunk in res.content.iter_chunked(1024):
                        f.write(chunk)
        print(f"Model cached to: {model_cache_path}")
    else:
        print(f"Model already cached to: {model_cache_path}")

    blip_decoder = models.blip.blip_decoder(pretrained=model_cache_path, image_size=SIZE, vit='base', med_config=config_path)
    blip_decoder.eval()
    print(f"loading model to {opt.torch_device}")
    blip_decoder = blip_decoder.to(torch.device(opt.torch_device))

    ext = ('.jpg', '.jpeg', '.png', '.webp', '.tif', '.tga', '.tiff', '.bmp', '.gif')
    i = 0  # running index appended/prepended to captions for de-duplication
    for img_file_name in glob.iglob(os.path.join(opt.img_dir, "*.*")):
        file_ext = os.path.splitext(img_file_name)[1]
        # Single extension check (the original tested both endswith(ext)
        # and `file_ext in ext`, which are redundant).
        if file_ext not in ext:
            continue

        async with aiofiles.open(img_file_name, "rb") as input_file:
            print("working image: ", img_file_name)
            image_bin = await input_file.read()

        image = Image.open(io.BytesIO(image_bin))
        if image.mode != "RGB":
            image = image.convert("RGB")
        image = load_image(image, device=torch.device(opt.torch_device))

        if opt.nucleus:
            captions = blip_decoder.generate(image, sample=True, top_p=opt.q_factor)
        else:
            captions = blip_decoder.generate(image, sample=False, num_beams=16,
                                             min_length=opt.min_length,
                                             max_length=48, repetition_penalty=opt.q_factor)
        caption = captions[0]

        if opt.format in ["mrwho", "joepenna"]:
            caption = f"{i:05}@" + caption
            i += 1
        elif opt.format == "filename":
            caption = caption + f"_{i}"
            i += 1

        if opt.format in ["txt", "text", "caption"]:
            # Caption goes in a sidecar file next to nothing - only the text
            # is written; the image itself is not copied for these formats.
            out_base_name = os.path.splitext(os.path.basename(img_file_name))[0]
            suffix = ".txt" if opt.format in ["txt", "text"] else ".caption"
            out_path = get_out_file_name(opt.out_dir, out_base_name, suffix)
            print("writing caption to: ", out_path)
            async with aiofiles.open(out_path, "w") as caption_file:
                await caption_file.write(caption)
        elif opt.format in ["filename", "mrwho", "joepenna"]:
            # Caption becomes the image's new file name, so strip path separators.
            caption = caption.replace("/", "").replace("\\", "")  # must clean slashes using filename
            out_path = get_out_file_name(opt.out_dir, caption, file_ext)
            async with aiofiles.open(out_path, "wb") as image_out:
                await image_out.write(image_bin)
        elif opt.format == "json":
            raise NotImplementedError
        elif opt.format == "parquet":
            raise NotImplementedError
def isWindows():
    """Return True when running on a Windows platform ('win32', 'win64', ...)."""
    platform_name = sys.platform
    return platform_name[:3] == "win"
if __name__ == "__main__":
    # Parse CLI args and reject unsupported output formats up front.
    opt = get_parser().parse_args()
    valid_formats = ["filename", "mrwho", "joepenna", "txt", "text", "caption"]
    if opt.format not in valid_formats:
        raise ValueError("format must be 'filename', 'mrwho', 'txt', or 'caption'")

    # aiohttp needs the selector event loop policy on Windows.
    if isWindows():
        print("Windows detected, using asyncio.WindowsSelectorEventLoopPolicy")
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    else:
        print("Unix detected, using default asyncio event loop policy")

    # Ensure the BLIP repo is present and importable before main() runs
    # (main() does `import models.blip` from this path).
    blip_path = "scripts/BLIP"
    if not os.path.exists(blip_path):
        print("BLIP not found, cloning BLIP repo")
        subprocess.run(["git", "clone", "https://github.com/salesforce/BLIP", blip_path])
    sys.path.append(blip_path)

    asyncio.run(main(opt))