From da15f0a745d6861f6b5a25f9aec5467d87cd4a72 Mon Sep 17 00:00:00 2001
From: Victor Hall <victor.charles.hall@gmail.com>
Date: Mon, 9 Jan 2023 22:46:40 -0500
Subject: [PATCH] Add torch_device option to scripts/auto_caption.py

This allows using auto_caption on Apple silicon Macs by specifying cpu
as an argument for now, and might allow using mps eventually, once more
operators are implemented.
---
 scripts/auto_caption.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/scripts/auto_caption.py b/scripts/auto_caption.py
index 015e99e..b5c2f8e 100644
--- a/scripts/auto_caption.py
+++ b/scripts/auto_caption.py
@@ -66,6 +66,14 @@ def get_parser(**parser_kwargs):
         default=22,
         help="adjusts the likelihood of a word being repeated",
     ),
+    parser.add_argument(
+        "--torch_device",
+        type=str,
+        nargs="?",
+        const=False,
+        default="cuda",
+        help="specify a different torch device, e.g. 'cpu'",
+    ),
 
     return parser
 
@@ -100,13 +108,13 @@ async def main(opt):
 
     if not os.path.exists(cache_folder):
         os.makedirs(cache_folder)
-    
+
     if not os.path.exists(opt.out_dir):
         os.makedirs(opt.out_dir)
 
     if not os.path.exists(model_cache_path):
         print(f"Downloading model to {model_cache_path}... please wait")
-        
+
         async with aiohttp.ClientSession() as session:
             async with session.get(BLIP_MODEL_URL) as res:
                 with open(model_cache_path, 'wb') as f:
@@ -119,9 +127,9 @@ async def main(opt):
     blip_decoder = models.blip.blip_decoder(pretrained=model_cache_path, image_size=SIZE, vit='base', med_config=config_path)
     blip_decoder.eval()
 
-    print("loading model to cuda")
+    print(f"loading model to {opt.torch_device}")
 
-    blip_decoder = blip_decoder.to(torch.device("cuda"))
+    blip_decoder = blip_decoder.to(torch.device(opt.torch_device))
 
     ext = ('.jpg', '.jpeg', '.png', '.webp', '.tif', '.tga', '.tiff', '.bmp', '.gif')
 
@@ -141,7 +149,7 @@ async def main(opt):
                     if not image.mode == "RGB":
                         image = image.convert("RGB")
 
-                    image = load_image(image, device=torch.device("cuda"))
+                    image = load_image(image, device=torch.device(opt.torch_device))
 
                     if opt.nucleus:
                         captions = blip_decoder.generate(image, sample=True, top_p=opt.q_factor)
@@ -193,8 +201,8 @@ if __name__ == "__main__":
 
     if opt.format not in ["filename", "mrwho", "joepenna", "txt", "text", "caption"]:
         raise ValueError("format must be 'filename', 'mrwho', 'txt', or 'caption'")
-    
-    if (isWindows()): 
+
+    if (isWindows()):
         print("Windows detected, using asyncio.WindowsSelectorEventLoopPolicy")
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
     else:
@@ -207,4 +215,3 @@ if __name__ == "__main__":
     sys.path.append(blip_path)
 
     asyncio.run(main(opt))
-  
\ No newline at end of file