update autocaption to be able to create .caption and .txt sidecar files instead of writing the caption into the filename
This commit is contained in:
parent
fad322cba3
commit
b4a9d2b949
@@ -1 +1 @@
{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Please read the documentation here before you start.\n","\n","I suggest reading this doc before you connect to your runtime to avoid using credits or being charged while you figure it out.\n","\n","[Auto Captioning Readme](doc/AUTO_CAPTION.md)\n","\n","This notebook requires an Nvidia GPU instance. Any will do, you don't need anything power. As low as 4GB should be fine.\n","\n","Only colab has automatic file transfers at this time. If you are using another platform, you will need to manually download your output files."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":929,"status":"ok","timestamp":1667184580032,"user":{"displayName":"Victor Hall","userId":"00029068894644207946"},"user_tz":240},"id":"lWGx2LuU8Q_I","outputId":"d0eb4d03-f16d-460b-981d-d5f88447e85e"},"outputs":[],"source":["#download repo\n","!git clone https://github.com/victorchall/EveryDream.git\n","# Set working directory\n","%cd EveryDream"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4944,"status":"ok","timestamp":1667184754992,"user":{"displayName":"Victor Hall","userId":"00029068894644207946"},"user_tz":240},"id":"RJxfSai-8pkD","outputId":"0ac1b805-62a0-48aa-e0da-ee19503bb3f1"},"outputs":[],"source":["# install requirements\n","!pip install torch=='1.12.1+cu113' 'torchvision==0.13.1+cu113' --extra-index-url https://download.pytorch.org/whl/cu113\n","!pip install pandas>='1.3.5'\n","!git clone https://github.com/salesforce/BLIP scripts/BLIP\n","!pip install timm\n","!pip install fairscale=='0.4.4'\n","!pip install transformers=='4.19.2'\n","!pip install timm"]},{"cell_type":"markdown","metadata":{"id":"sbeUIVXJ-EVf"},"source":["# Upload your input images into the EveryDream/input folder\n","\n","![upload to input](demo/upload_images_caption.png)"]},{"cell_type":"markdown","metadata":{},"source":["## Please read the documentation for information on the parameters\n","\n","[Auto Captioning](doc/AUTO_CAPTION.md)\n","\n","*You cannot have commented lines between uncommented lines. 
If you uncomment a line below, move it above any other commented lines.*\n","\n","*!python must remain the first line.*\n","\n","Default params should work fairly well."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":18221,"status":"ok","timestamp":1667185808005,"user":{"displayName":"Victor Hall","userId":"00029068894644207946"},"user_tz":240},"id":"4TAICahl-RPn","outputId":"da7fa1a8-0855-403a-c295-4da31658d1f6"},"outputs":[],"source":["!python scripts/auto_caption.py \\\n","--img_dir EveryDream/input \\\n","--out_dir EveryDream/output \\\n","#--format mrwho \\\n","#--min_length 34 \\\n","#--q_factor 1.3 \\\n","#--nucleus \\"]},{"cell_type":"markdown","metadata":{"id":"HBrWnu1C_lN9"},"source":["## Download your captioned images from EveryDream/output\n","\n","If you're on a colab you can use the cell below to push your output to your Gdrive."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')\n","\n","!mkdir /content/drive/MyDrive/AutoCaption\n","!cp output/*.* /content/drive/MyDrive/AutoCaption"]},{"cell_type":"markdown","metadata":{},"source":["## If not on colab/gdrive, the following will zip up your files for extraction\n","\n","You'll still need to use your runtime's own download feature to download the zip.\n","\n","![output zip](demo/output_zip.png)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["!pip install patool\n","\n","import patoolib\n","\n","!mkdir output/zip\n","\n","!zip -r output/zip/output.zip output"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyN9ZSr0RyOQKdfeVsl2uOiE","collapsed_sections":[],"provenance":[{"file_id":"16QrivRfoDFvE7fAa7eLeVlxj78Q573E0","timestamp":1667185879409}]},"kernelspec":{"display_name":"Python 3.10.5 ('.venv': venv)","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.10.5"},"vscode":{"interpreter":{"hash":"faf4a6abb601e3a9195ce3e9620411ceec233a951446de834cdf28542d2d93b4"}}},"nbformat":4,"nbformat_minor":0}
{"cells":[{"cell_type":"markdown","metadata":{},"source":["# Please read the documentation here before you start.\n","\n","I suggest reading this doc before you connect to your runtime to avoid using credits or being charged while you figure it out.\n","\n","[Auto Captioning Readme](doc/AUTO_CAPTION.md)\n","\n","This notebook requires an Nvidia GPU instance. Any will do, you don't need anything power. As low as 4GB should be fine.\n","\n","Only colab has automatic file transfers at this time. If you are using another platform, you will need to manually download your output files."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":929,"status":"ok","timestamp":1667184580032,"user":{"displayName":"Victor Hall","userId":"00029068894644207946"},"user_tz":240},"id":"lWGx2LuU8Q_I","outputId":"d0eb4d03-f16d-460b-981d-d5f88447e85e"},"outputs":[],"source":["#download repo\n","!git clone https://github.com/victorchall/EveryDream.git\n","# Set working directory\n","%cd EveryDream"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":4944,"status":"ok","timestamp":1667184754992,"user":{"displayName":"Victor Hall","userId":"00029068894644207946"},"user_tz":240},"id":"RJxfSai-8pkD","outputId":"0ac1b805-62a0-48aa-e0da-ee19503bb3f1"},"outputs":[],"source":["# install requirements\n","!pip install torch=='1.12.1+cu113' 'torchvision==0.13.1+cu113' --extra-index-url https://download.pytorch.org/whl/cu113\n","!pip install pandas>='1.3.5'\n","!git clone https://github.com/salesforce/BLIP scripts/BLIP\n","!pip install timm\n","!pip install fairscale=='0.4.4'\n","!pip install transformers=='4.19.2'\n","!pip install timm"]},{"cell_type":"markdown","metadata":{"id":"sbeUIVXJ-EVf"},"source":["# Upload your input images into the EveryDream/input folder\n","\n","![upload to input](demo/upload_images_caption.png)"]},{"cell_type":"markdown","metadata":{},"source":["## Please read the documentation for information on the parameters\n","\n","[Auto Captioning](doc/AUTO_CAPTION.md)\n","\n","*You cannot have commented lines between uncommented lines. 
If you uncomment a line below, move it above any other commented lines.*\n","\n","*!python must remain the first line.*\n","\n","Default params should work fairly well."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":18221,"status":"ok","timestamp":1667185808005,"user":{"displayName":"Victor Hall","userId":"00029068894644207946"},"user_tz":240},"id":"4TAICahl-RPn","outputId":"da7fa1a8-0855-403a-c295-4da31658d1f6"},"outputs":[],"source":["!python scripts/auto_caption.py \\\n","--img_dir input \\\n","--out_dir output \\\n","#--format mrwho \\\n","#--min_length 34 \\\n","#--q_factor 1.3 \\\n","#--nucleus \\"]},{"cell_type":"markdown","metadata":{"id":"HBrWnu1C_lN9"},"source":["## Download your captioned images from EveryDream/output\n","\n","If you're on a colab you can use the cell below to push your output to your Gdrive."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from google.colab import drive\n","drive.mount('/content/drive')\n","\n","!mkdir /content/drive/MyDrive/AutoCaption\n","!cp output/*.* /content/drive/MyDrive/AutoCaption"]},{"cell_type":"markdown","metadata":{},"source":["## If not on colab/gdrive, the following will zip up your files for extraction\n","\n","You'll still need to use your runtime's own download feature to download the zip.\n","\n","![output zip](demo/output_zip.png)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["!pip install patool\n","\n","import patoolib\n","\n","!mkdir output/zip\n","\n","!zip -r output/zip/output.zip output"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyN9ZSr0RyOQKdfeVsl2uOiE","collapsed_sections":[],"provenance":[{"file_id":"16QrivRfoDFvE7fAa7eLeVlxj78Q573E0","timestamp":1667185879409}]},"kernelspec":{"display_name":"Python 3.10.5 ('.venv': venv)","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.10.5"},"vscode":{"interpreter":{"hash":"faf4a6abb601e3a9195ce3e9620411ceec233a951446de834cdf28542d2d93b4"}}},"nbformat":4,"nbformat_minor":0}
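The change between the two versions of the notebook above is in the auto-caption cell: because an earlier cell already runs %cd EveryDream, the --img_dir and --out_dir paths are now given relative to that working directory (input and output rather than EveryDream/input and EveryDream/output). For reference, the updated cell reads:

    !python scripts/auto_caption.py \
    --img_dir input \
    --out_dir output \
    #--format mrwho \
    #--min_length 34 \
    #--q_factor 1.3 \
    #--nucleus \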
@@ -45,7 +45,14 @@ ex output: *"a man in a blue suit and a woman in a black dress standing next to
"mrwho" or "joepenna" will add \[number\]@ as a prefix for use with MrWho's captioning system (on the JoePenna dreambooth fork), which uses that naming standard to avoid file name collisions.

python scripts/auto_caption.py --format "mrwho"

+"txt" or "caption" will create a ".txt" or ".caption" file instead of renaming the image. A ".txt" sidecar is another way for the EveryDream trainer to get the caption instead of reading it from the filename itself, and ".caption" is an option for other trainers.
+
+python scripts/auto_caption.py --format "txt"
+
+or
+
+python scripts/auto_caption.py --format "caption"
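To make the pairing between an image and its sidecar concrete, here is a minimal sketch of how a caption could be resolved at training time, preferring a sidecar over the filename. The resolve_caption helper and the paths are purely illustrative and are not part of EveryDream or of this script:

    import os

    def resolve_caption(image_path):
        # Illustrative helper: prefer a .txt or .caption sidecar next to the image,
        # otherwise fall back to the file name itself (the "filename" format).
        base, _ = os.path.splitext(image_path)
        for ext in (".txt", ".caption"):
            sidecar = base + ext
            if os.path.exists(sidecar):
                with open(sidecar, encoding="utf-8") as f:
                    return f.read().strip()
        return os.path.basename(base)

    # e.g. resolve_caption("output/dog.webp") reads "output/dog.txt" if the txt format was used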
## Tweaks

You may find the following settings useful for dealing with issues with bad auto-captioning. Start with the defaults, and if you get captions that seem inaccurate or repetitious, try some of the following settings.
@@ -67,7 +74,7 @@ nucleus q_factor 9999: *"a number of kites painted in different colors in a ceil

nucleus q_factor 200: *"a group of people waiting under art hanging from a ceiling"*

-nucleus q_factor 0.8: *"several people standing around with large colorful umbrellas"*
+nucleus q_factor 1: *"several people standing around with large colorful umbrellas"*

nucleus q_factor 0.01: *"people are standing in an open building with colorful paper decorations"*
@@ -79,7 +86,7 @@ A tuning adjustment depending on the algorithm used.

For the default beam 16 algorithm it limits the ability of words and phrases to be repeated. A higher value reduces repeated words and phrases. 0.6-1.4 are sensible values for beam 16. Default is 0.8 and works well with the default min_length of 24. Consider using higher values if you use a min_length higher than 24 with beam 16.

-For nucleus (--nucleus), it simply changes the opinion on the prompt and does not impact repeats. Values ranging from 0.01 to 200 seem sensible and default of 0.8 usually works well.
+For nucleus (--nucleus), it simply changes the opinion on the prompt and does not impact repeats. Values ranging from 0.01 to 200 seem sensible and default of 1.0 usually works well.

![Beam vs Nucleus](../demo/beam_vs_nucleus_2.webp)
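For example, illustrative values picked from the sensible ranges above (not new defaults):

python scripts/auto_caption.py --q_factor 1.2

python scripts/auto_caption.py --nucleus --q_factor 1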
@@ -87,7 +94,7 @@ For nucleus (--nucleus), it simply changes the opinion on the prompt and does no

Adjusts the minimum length of prompt, measured in tokens. **Only applies to beam 16.** Useful to adjust along with --q_factor to keep it from repeating.

-Default is 24. Sensible values are 15 to 30, max is 48. Larger values are much more prone to repeating phrases and should be accompanied by increasing --q_factor to avoid repeats.
+Default is 22. Sensible values are 15 to 30, max is 48. Larger values are much more prone to repeating phrases and should be accompanied by increasing --q_factor to avoid repeats.

python scripts/auto_caption.py --min_length 20
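When raising min_length, bumping q_factor at the same time (as noted above) keeps repeats down; an illustrative combination within the stated sensible ranges:

python scripts/auto_caption.py --min_length 30 --q_factor 1.4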
@@ -97,7 +104,7 @@ Default is 24. Sensible values are 15 to 30, max is 48. Larger values are much

### Note

-If you continue to both increase min_length and q_factor you start to get oddly specific prompts. For example using the above image:
+If you continue to increase both min_length and q_factor with the default beam algorithm in an attempt to get a really long caption without repeats, it will generate oddly specific prompts. For example, using the above image:

--q_factor 1.9 --min_length 48:
@@ -9,6 +9,8 @@ import torch
import aiohttp
import asyncio
import subprocess
import numpy as np
import io

SIZE = 384
BLIP_MODEL_URL = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
@@ -52,7 +54,7 @@ def get_parser(**parser_kwargs):
        type=float,
        nargs="?",
        const=True,
-        default=0.8,
+        default=1.0,
        help="adjusts the likelihood of a word being repeated",
    ),
    parser.add_argument(
@@ -60,7 +62,7 @@ def get_parser(**parser_kwargs):
        type=int,
        nargs="?",
        const=True,
-        default=24,
+        default=22,
        help="adjusts the likelihood of a word being repeated",
    ),
@@ -76,6 +78,10 @@ def load_image(raw_image, device):
    image = transform(raw_image).unsqueeze(0).to(device)
    return image

+@staticmethod
+def get_out_file_name(out_dir, base_name, ext):
+    return os.path.join(out_dir, f"{base_name}{ext}")
+
async def main(opt):
    print("starting")
    import models.blip
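The new get_out_file_name helper simply joins the output directory, base name, and extension with os.path.join; for example (the values below are illustrative):

    get_out_file_name("output", "dog", ".txt")                # -> "output/dog.txt" (or "output\\dog.txt" on Windows)
    get_out_file_name("output", "a dog in a park", ".webp")   # the caption itself serves as the base name for the filename formats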
@@ -131,6 +137,10 @@ async def main(opt):
        image = Image.open(input_file)

+        if not image.mode == "RGB":
+            print("converting to RGB")
+            image = image.convert("RGB")
+
        image = load_image(image, device=torch.device("cuda"))

        if opt.nucleus:
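The mode check added above matters because the 3-channel normalization in BLIP's preprocessing expects RGB input; palette ("P") or RGBA images can otherwise fail in the tensor transform. A standalone equivalent with Pillow (the file name is just a placeholder):

    from PIL import Image

    image = Image.open("input/example.png")  # e.g. an RGBA or palette-mode PNG
    if image.mode != "RGB":
        image = image.convert("RGB")          # drop alpha / expand palette to 3 channels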
@@ -149,13 +159,23 @@
            prefix = f"{i:05}@"
            i += 1
        caption = prefix+caption

-        out_file = os.path.join(opt.out_dir, f"{caption}{file_ext}")
-        print(" out_file:", out_file)
-        print()
-
-        if opt.format in ["filename","mrwho"]:
-            #out_file = os.path.join(out_file)
+        if opt.format in ["txt","text","caption"]:
+            out_base_name = os.path.splitext(os.path.basename(img_file_name))[0]
+
+            if opt.format in ["txt","text"]:
+                out_file = get_out_file_name(opt.out_dir, out_base_name, ".txt")
+
+            if opt.format in ["caption"]:
+                out_file = get_out_file_name(opt.out_dir, out_base_name, ".caption")
+
+        if opt.format in ["txt","text","caption"]:
+            print("writing caption to: ", out_file)
+            with open(out_file, "w") as out_file:
+                out_file.write(caption)
+
+        if opt.format in ["filename", "mrwho", "joepenna"]:
+            out_file = get_out_file_name(opt.out_dir, caption, file_ext)
            with open(out_file, "wb") as out_file:
                out_file.write(data)
        elif opt.format == "json":
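Tracing those branches for a hypothetical input image input/dog.webp captioned "a dog in a park" (both the file name and the caption are made up for illustration), the script would write roughly:

    --format filename           -> output/a dog in a park.webp
    --format mrwho / joepenna   -> output/00001@a dog in a park.webp   (the number depends on the running counter)
    --format txt                -> output/dog.txt       containing "a dog in a park"
    --format caption            -> output/dog.caption   containing "a dog in a park"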
@@ -170,8 +190,8 @@ if __name__ == "__main__":
    parser = get_parser()
    opt = parser.parse_args()

-    if opt.format not in ["filename", "json", "mrwho", "joepenna", "parquet"]:
-        raise ValueError("format must be 'filename', 'json', or 'parquet'")
+    if opt.format not in ["filename", "mrwho", "joepenna", "txt", "text", "caption"]:
+        raise ValueError("format must be 'filename', 'mrwho', 'txt', or 'caption'")

    if (isWindows()):
        print("Windows detected, using asyncio.WindowsSelectorEventLoopPolicy")