update autocaption to be able to create .caption and .txt sidecars instead of filename

Victor Hall 2022-11-10 11:22:20 -05:00
parent fad322cba3
commit b4a9d2b949
3 changed files with 43 additions and 16 deletions

View File

@ -1 +1 @@ (the notebook is stored as a single JSON line; only the --img_dir and --out_dir arguments in the auto_caption cell change)
-"source":["!python scripts/auto_caption.py \\\n","--img_dir EveryDream/input \\\n","--out_dir EveryDream/output \\\n","#--format mrwho \\\n","#--min_length 34 \\\n","#--q_factor 1.3 \\\n","#--nucleus \\"]
+"source":["!python scripts/auto_caption.py \\\n","--img_dir input \\\n","--out_dir output \\\n","#--format mrwho \\\n","#--min_length 34 \\\n","#--q_factor 1.3 \\\n","#--nucleus \\"]

View File

@ -46,6 +46,13 @@ ex output: *"a man in a blue suit and a woman in a black dress standing next to
python scripts/auto_caption.py --format "mrwho"
+"txt" or "caption" will create a ".txt" or ".caption" file instead of renaming the image. A ".txt" sidecar is another option for the EveryDream trainer, reading the caption from the sidecar instead of from the filename itself, and ".caption" is an option for other trainers.
+python scripts/auto_caption.py --format "txt"
+or
+python scripts/auto_caption.py --format "caption"
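As a sketch of how a trainer can pick these sidecars up (the read_caption helper below is illustrative, not EveryDream's actual loader): prefer a ".txt" or ".caption" file next to the image, and fall back to the filename otherwise.

```python
import os

def read_caption(image_path):
    """Illustrative only: prefer a .txt/.caption sidecar, else use the filename."""
    base, _ = os.path.splitext(image_path)
    for ext in (".txt", ".caption"):
        sidecar = base + ext
        if os.path.isfile(sidecar):
            with open(sidecar, encoding="utf-8") as f:
                return f.read().strip()
    # no sidecar: the caption is the image's own (renamed) filename
    return os.path.basename(base)

# hypothetical file names; with output/dog_photo.txt present this returns its contents
print(read_caption("output/dog_photo.jpg"))
```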
## Tweaks
You may find the following settings useful to deal with issues from bad auto-captioning. Start with the defaults, and if you have issues with captions that seem inaccurate or repetitious, try some of the following settings.
@ -67,7 +74,7 @@ nucleus q_factor 9999: *"a number of kites painted in different colors in a ceil
nucleus q_factor 200: *"a group of people waiting under art hanging from a ceiling"*
-nucleus q_factor 0.8: *"several people standing around with large colorful umbrellas"*
+nucleus q_factor 1: *"several people standing around with large colorful umbrellas"*
nucleus q_factor 0.01: *"people are standing in an open building with colorful paper decorations"*
@ -79,7 +86,7 @@ A tuning adjustment depending on the algorithm used.
For the default beam 16 algorithm it limits the ability of words and phrases to be repeated. A higher value reduces repeated words and phrases. 0.6-1.4 are sensible values for beam 16. Default is 0.8 and works well with the default min_length of 24. Consider using higher values if you use a min_length higher than 24 with beam 16.
-For nucleus (--nucleus), it simply changes the opinion on the prompt and does not impact repeats. Values ranging from 0.01 to 200 seem sensible and default of 0.8 usually works well.
+For nucleus (--nucleus), it simply changes the opinion on the prompt and does not impact repeats. Values ranging from 0.01 to 200 seem sensible and default of 1.0 usually works well.
![Beam vs Nucleus](../demo/beam_vs_nucleus_2.webp)
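For example, to compare the two modes on the same images you could run the default beam search with a higher q_factor and then nucleus sampling with a low one:

python scripts/auto_caption.py --q_factor 1.4

python scripts/auto_caption.py --nucleus --q_factor 0.01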
@ -87,7 +94,7 @@ For nucleus (--nucleus), it simply changes the opinion on the prompt and does no
Adjusts the minimum length of prompt, measured in tokens. **Only applies to beam 16.** Useful to adjust along with --q_factor to keep it from repeating.
-Default is 24. Sensible values are 15 to 30, max is 48. Larger values are much more prone to repeating phrases and should be accompanied by increasing --q_factor to avoid repeats.
+Default is 22. Sensible values are 15 to 30, max is 48. Larger values are much more prone to repeating phrases and should be accompanied by increasing --q_factor to avoid repeats.
python scripts/auto_caption.py --min_length 20
@ -97,7 +104,7 @@ Default is 24. Sensible values are 15 to 30, max is 48. Larger values are much
### Note
-If you continue to both increase min_length and q_factor you start to get oddly specific prompts. For example using the above image:
+If you continue to increase both min_length and q_factor with the default beam algorithm, trying to get a very long caption without repeats, it will generate oddly specific prompts. For example, using the above image:
--q_factor 1.9 --min_length 48:

View File

@ -9,6 +9,8 @@ import torch
import aiohttp
import asyncio
import subprocess
+import numpy as np
+import io
SIZE = 384
BLIP_MODEL_URL = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
@ -52,7 +54,7 @@ def get_parser(**parser_kwargs):
        type=float,
        nargs="?",
        const=True,
-        default=0.8,
+        default=1.0,
        help="adjusts the likelihood of a word being repeated",
    ),
    parser.add_argument(
@ -60,7 +62,7 @@ def get_parser(**parser_kwargs):
        type=int,
        nargs="?",
        const=True,
-        default=24,
+        default=22,
        help="adjusts the likelihood of a word being repeated",
    ),
@ -76,6 +78,10 @@ def load_image(raw_image, device):
    image = transform(raw_image).unsqueeze(0).to(device)
    return image

+@staticmethod
+def get_out_file_name(out_dir, base_name, ext):
+    return os.path.join(out_dir, f"{base_name}{ext}")
async def main(opt):
    print("starting")
    import models.blip
@ -131,6 +137,10 @@ async def main(opt):
        image = Image.open(input_file)

+        if not image.mode == "RGB":
+            print("converting to RGB")
+            image = image.convert("RGB")

        image = load_image(image, device=torch.device("cuda"))

        if opt.nucleus:
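The new conversion step matters because the BLIP preprocessing normalizes three RGB channels, so RGBA or palette images can fail or mis-normalize without it. A quick illustrative check of which inputs would hit that branch (assumes Pillow is installed and an input/ folder containing only images):

```python
import os
from PIL import Image

for name in os.listdir("input"):
    mode = Image.open(os.path.join("input", name)).mode
    if mode != "RGB":
        print(f"{name}: mode {mode} will be converted to RGB before captioning")
```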
@ -150,12 +160,22 @@ async def main(opt):
        i += 1
        caption = prefix+caption

-        out_file = os.path.join(opt.out_dir, f"{caption}{file_ext}")
-        print(" out_file:", out_file)
-        print()
-        if opt.format in ["filename","mrwho"]:
-            #out_file = os.path.join(out_file)
+        if opt.format in ["txt","text","caption"]:
+            out_base_name = os.path.splitext(os.path.basename(img_file_name))[0]
+            if opt.format in ["txt","text"]:
+                out_file = get_out_file_name(opt.out_dir, out_base_name, ".txt")
+            if opt.format in ["caption"]:
+                out_file = get_out_file_name(opt.out_dir, out_base_name, ".caption")
+        if opt.format in ["txt","text","caption"]:
+            print("writing caption to: ", out_file)
+            with open(out_file, "w") as out_file:
+                out_file.write(caption)
+        if opt.format in ["filename", "mrwho", "joepenna"]:
+            out_file = get_out_file_name(opt.out_dir, caption, file_ext)
            with open(out_file, "wb") as out_file:
                out_file.write(data)
        elif opt.format == "json":
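Taken together, the new output path behaves roughly like the standalone sketch below (names follow the script, but this is a simplified illustration rather than the literal code):

```python
import os

def write_output(out_dir, img_file_name, data, caption, fmt, file_ext):
    # Sidecar formats: write the caption into a .txt/.caption file named after the image.
    if fmt in ("txt", "text", "caption"):
        base = os.path.splitext(os.path.basename(img_file_name))[0]
        ext = ".caption" if fmt == "caption" else ".txt"
        with open(os.path.join(out_dir, base + ext), "w") as f:
            f.write(caption)
    # Filename formats: keep the old behavior and save the image bytes under the caption as its name.
    elif fmt in ("filename", "mrwho", "joepenna"):
        with open(os.path.join(out_dir, caption + file_ext), "wb") as f:
            f.write(data)
```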
@ -170,8 +190,8 @@ if __name__ == "__main__":
    parser = get_parser()
    opt = parser.parse_args()

-    if opt.format not in ["filename", "json", "mrwho", "joepenna", "parquet"]:
-        raise ValueError("format must be 'filename', 'json', or 'parquet'")
+    if opt.format not in ["filename", "mrwho", "joepenna", "txt", "text", "caption"]:
+        raise ValueError("format must be 'filename', 'mrwho', 'txt', or 'caption'")

    if (isWindows()):
        print("Windows detected, using asyncio.WindowsSelectorEventLoopPolicy")