diff --git a/doc/AUTO_CAPTION.md b/doc/AUTO_CAPTION.md index 324af8d..6510a71 100644 --- a/doc/AUTO_CAPTION.md +++ b/doc/AUTO_CAPTION.md @@ -2,11 +2,11 @@ Automatic captioning uses Salesforce's BLIP to automatically create a clean sentence structure for captioning input images before training. -This requires an Nvidia GPU, but is not terribly intensive work. It should run fine on something like a 1050 Ti 4GB. +By default this requires an Nvidia GPU, but is not terribly intensive work. It should run fine on something like a 1050 Ti 4GB. You can even run this on the CPU by specifying `--torch_device cpu` as an argument. This will be slower than running on an Nvidia GPU, but will work even on Apple Silicon Macs. [EveryDream trainer](https://github.com/victorchall/EveryDream-trainer) no longer requires cropped images. You only need to crop to exclude stuff you don't want trained, or to improve the portion of face close ups in your data. The EveryDream trainer now accepts multiple aspect ratios and can train on them natively. -But if you do wish to crop for other trainers, you can use [Birme](https://www.birme.net/?target_width=512&target_height=512&auto_focal=false&image_format=webp&quality_jpeg=95&quality_webp=99) to crop and resize first. There are various tools out there for this. +But if you do wish to crop for other trainers, you can use [Birme](https://www.birme.net/?target_width=512&target_height=512&auto_focal=false&image_format=webp&quality_jpeg=95&quality_webp=99) to crop and resize first. There are various tools out there for this. @@ -16,7 +16,7 @@ Place input files into the /input folder python scripts/auto_caption.py -Files will be **copied** and renamed to the caption as the file name and placed into /output. +Files will be **copied** and renamed to the caption as the file name and placed into /output. ## Colab notebook @@ -42,14 +42,14 @@ Changes the default output directory. 
Default is /output ### --format -The default behavior will simply name the file the caption .EXT and, if needed, add _n at the end to avoid collisions, for use with EveryDream trainer or Kane Wallmann's dream booth fork. +The default behavior will simply name the file the caption .EXT and, if needed, add _n at the end to avoid collisions, for use with EveryDream trainer or Kane Wallmann's dream booth fork. ex output: *"a man in a blue suit and a woman in a black dress standing next to each other in front of a table with a potted plant on it.jpg"* "mrwho" or "joepenna" will add \[number\]@ as a prefix for use with MrWho's captioning system (on JoePenna dream both fork) which uses that naming standard to avoid file name collisions. python scripts/auto_caption.py --format "mrwho" - + "txt" or "caption" will create a ".txt" or ".caption" file instead of renaming the image. ".txt" sidecar is another option for EveryDream trainer instead of getting the caption from the filename itself, and ".caption" is an option for other trainers. python scripts/auto_caption.py --format "txt" @@ -59,11 +59,11 @@ or python scripts/auto_caption.py --format "caption" ## Tweaks -You may find the following setting useful to deal with issues with bad auto-captioning. Start with defaults, and if you have issues with captions that seem inaccurate or reptitious try some of the following settings. +You may find the following settings useful to deal with issues with bad auto-captioning. Start with defaults, and if you have issues with captions that seem inaccurate or repetitious try some of the following settings. ### --nucleus -Uses an alternative "nucleus" algorithm instead of the default "beam 16" algorithm. Nucleus produces relatively short captions but reliably absent of repeated words and phrases, comparable to using beam 16 which can be adjusted further but may need more tweaking. +Uses an alternative "nucleus" algorithm instead of the default "beam 16" algorithm. 
Nucleus produces relatively short captions but reliably absent of repeated words and phrases, comparable to using beam 16 which can be adjusted further but may need more tweaking. python scripts/auto_caption.py --nucleus @@ -86,7 +86,7 @@ nucleus q_factor 0.00001: (same as above) ### --q_factor -An tuning adjustment depending the algorithm used. +A tuning adjustment depending on the algorithm used. For the default beam 16 algorithm it limits the ability of words and phrases to be repeated. Higher value reduces repeated words and phrases. 0.6-1.4 are sensible values for beam 16. Default is 1.0 and works well with the defaulted value min_length of 24. Consider using higher values if you use a min_length higher than 24 with beam 16. @@ -110,6 +110,6 @@ Default is 22. Sensible values are 15 to 30, max is 48. Larger values are much If you continue to both increase min_length and q_factor with default beam algorithm in an attempt to get a really long caption without repeats it will generate oddly specific prompts. For example using the above image: ---q_factor 1.9 --min_length 48: +--q_factor 1.9 --min_length 48: -*"a painting of a group of people sitting at a table in a room with red drapes on the walls and gold trimmings on the ceiling, while one person is holding a wine glass in front of the other hand"* \ No newline at end of file +*"a painting of a group of people sitting at a table in a room with red drapes on the walls and gold trimmings on the ceiling, while one person is holding a wine glass in front of the other hand"* diff --git a/scripts/auto_caption.py b/scripts/auto_caption.py index 015e99e..b5c2f8e 100644 --- a/scripts/auto_caption.py +++ b/scripts/auto_caption.py @@ -66,6 +66,14 @@ def get_parser(**parser_kwargs): default=22, help="adjusts the likelihood of a word being repeated", ), + parser.add_argument( + "--torch_device", + type=str, + nargs="?", + const="cpu", + default="cuda", + help="specify a different torch device, e.g. 
'cpu'", + ), return parser @@ -100,13 +108,13 @@ async def main(opt): if not os.path.exists(cache_folder): os.makedirs(cache_folder) - + if not os.path.exists(opt.out_dir): os.makedirs(opt.out_dir) if not os.path.exists(model_cache_path): print(f"Downloading model to {model_cache_path}... please wait") - + async with aiohttp.ClientSession() as session: async with session.get(BLIP_MODEL_URL) as res: with open(model_cache_path, 'wb') as f: @@ -119,9 +127,9 @@ async def main(opt): blip_decoder = models.blip.blip_decoder(pretrained=model_cache_path, image_size=SIZE, vit='base', med_config=config_path) blip_decoder.eval() - print("loading model to cuda") + print(f"loading model to {opt.torch_device}") - blip_decoder = blip_decoder.to(torch.device("cuda")) + blip_decoder = blip_decoder.to(torch.device(opt.torch_device)) ext = ('.jpg', '.jpeg', '.png', '.webp', '.tif', '.tga', '.tiff', '.bmp', '.gif') @@ -141,7 +149,7 @@ async def main(opt): if not image.mode == "RGB": image = image.convert("RGB") - image = load_image(image, device=torch.device("cuda")) + image = load_image(image, device=torch.device(opt.torch_device)) if opt.nucleus: captions = blip_decoder.generate(image, sample=True, top_p=opt.q_factor) @@ -193,8 +201,8 @@ if __name__ == "__main__": if opt.format not in ["filename", "mrwho", "joepenna", "txt", "text", "caption"]: raise ValueError("format must be 'filename', 'mrwho', 'txt', or 'caption'") - - if (isWindows()): + + if (isWindows()): print("Windows detected, using asyncio.WindowsSelectorEventLoopPolicy") asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) else: @@ -207,4 +215,3 @@ if __name__ == "__main__": sys.path.append(blip_path) asyncio.run(main(opt)) - \ No newline at end of file