stable-diffusion-webui/modules/interrogate.py

import os
import sys
import traceback
from collections import namedtuple
import re

import torch

from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

import modules.shared as shared
from modules import devices, paths, lowvram, modelloader

# Side length, in pixels, of the square image handed to the BLIP caption model;
# used both as the decoder's image_size and as the resize target.
blip_image_eval_size = 384
# Model name passed to clip.load().
clip_model_name = 'ViT-L/14'

# One category file: its file name, how many top matches to report for it,
# and its stripped lines (the candidate texts to rank).
Category = namedtuple("Category", ["name", "topn", "items"])

# Captures N from a ".topN." fragment in a category file name; file names
# without such a fragment get topn=1.
re_topn = re.compile(r"\.top(\d+)\.")


class InterrogateModels:
    """Describe an image with a BLIP caption plus CLIP-ranked text snippets.

    The BLIP and CLIP models are loaded lazily on the first call to
    ``interrogate`` and kept on the instance afterwards. Candidate texts for
    CLIP ranking come from the files found in ``content_dir`` at construction
    time.
    """

    # Caption model; None until load() has run.
    blip_model = None
    # CLIP model; None until load() has run.
    clip_model = None
    # Image preprocessing callable returned by clip.load() with the model.
    clip_preprocess = None
    # List of Category tuples read from content_dir in __init__.
    categories = None
    # dtype of the CLIP model's parameters, set by load(); inputs are cast to it.
    dtype = None
    # True when devices.device_interrogate is the CPU; half precision is then skipped.
    running_on_cpu = None

    def __init__(self, content_dir):
        """Read the category files from ``content_dir``.

        Each regular file becomes one ``Category`` whose items are the
        stripped lines of the file. A ``.topN.`` fragment in the file name
        sets how many matches are reported for it; otherwise one is reported.
        A missing ``content_dir`` yields no categories.
        """
        self.categories = []
        self.running_on_cpu = devices.device_interrogate == torch.device("cpu")

        if not os.path.isdir(content_dir):
            return

        for filename in os.listdir(content_dir):
            path = os.path.join(content_dir, filename)

            # Subdirectories cannot be opened as text files; skip them
            # instead of failing construction.
            if not os.path.isfile(path):
                continue

            m = re_topn.search(filename)
            topn = 1 if m is None else int(m.group(1))

            with open(path, "r", encoding="utf8") as file:
                lines = [x.strip() for x in file.readlines()]

            self.categories.append(Category(name=filename, topn=topn, items=lines))

    def load_blip_model(self):
        """Locate (or download) the BLIP checkpoint and return the model in eval mode."""
        import models.blip

        files = modelloader.load_models(
            model_path=os.path.join(paths.models_path, "BLIP"),
            model_url='https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth',
            ext_filter=[".pth"],
            download_name='model_base_caption_capfilt_large.pth',
        )

        blip_model = models.blip.blip_decoder(pretrained=files[0], image_size=blip_image_eval_size, vit='base', med_config=os.path.join(paths.paths["BLIP"], "configs", "med_config.json"))
        blip_model.eval()

        return blip_model

    def load_clip_model(self):
        """Load the CLIP model and return ``(model, preprocess)``.

        The model is put in eval mode and moved to
        ``devices.device_interrogate``.
        """
        import clip

        if self.running_on_cpu:
            # Ask clip.load for a CPU model explicitly rather than relying on
            # its default device.
            model, preprocess = clip.load(clip_model_name, device="cpu", download_root=shared.cmd_opts.clip_models_path)
        else:
            model, preprocess = clip.load(clip_model_name, download_root=shared.cmd_opts.clip_models_path)

        model.eval()
        model = model.to(devices.device_interrogate)

        return model, preprocess

    def load(self):
        """Ensure both models are loaded and on ``devices.device_interrogate``.

        On first load each model is converted to half precision unless
        ``shared.cmd_opts.no_half`` is set or interrogation runs on the CPU.
        ``self.dtype`` is refreshed from the CLIP model's parameters.
        """
        if self.blip_model is None:
            self.blip_model = self.load_blip_model()
            if not shared.cmd_opts.no_half and not self.running_on_cpu:
                self.blip_model = self.blip_model.half()

        # Models may have been moved to the CPU by a previous unload().
        self.blip_model = self.blip_model.to(devices.device_interrogate)

        if self.clip_model is None:
            self.clip_model, self.clip_preprocess = self.load_clip_model()
            if not shared.cmd_opts.no_half and not self.running_on_cpu:
                self.clip_model = self.clip_model.half()

        self.clip_model = self.clip_model.to(devices.device_interrogate)

        self.dtype = next(self.clip_model.parameters()).dtype

    def send_clip_to_ram(self):
        """Move the CLIP model to ``devices.cpu`` unless models are kept in memory."""
        if not shared.opts.interrogate_keep_models_in_memory:
            if self.clip_model is not None:
                self.clip_model = self.clip_model.to(devices.cpu)

    def send_blip_to_ram(self):
        """Move the BLIP model to ``devices.cpu`` unless models are kept in memory."""
        if not shared.opts.interrogate_keep_models_in_memory:
            if self.blip_model is not None:
                self.blip_model = self.blip_model.to(devices.cpu)

    def unload(self):
        """Send both models to RAM (subject to the keep-in-memory option) and run ``devices.torch_gc()``."""
        self.send_clip_to_ram()
        self.send_blip_to_ram()

        devices.torch_gc()

    def rank(self, image_features, text_array, top_count=1):
        """Return the ``top_count`` texts that best match ``image_features``.

        ``text_array`` is first truncated to
        ``shared.opts.interrogate_clip_dict_limit`` entries when that option
        is non-zero. Each text is scored by the softmax over
        ``100 * image_feature @ text_features.T``, averaged over the rows of
        ``image_features``.

        Returns a list of ``(text, score)`` tuples, best first, where
        ``score`` is the averaged probability multiplied by 100. The list is
        empty when there is nothing to rank.
        """
        import clip

        if shared.opts.interrogate_clip_dict_limit != 0:
            text_array = text_array[0:int(shared.opts.interrogate_clip_dict_limit)]

        top_count = min(top_count, len(text_array))
        if top_count <= 0:
            # Nothing to rank (empty candidate list or top_count of 0):
            # skip tokenizing and encoding altogether.
            return []

        text_tokens = clip.tokenize(list(text_array), truncate=True).to(devices.device_interrogate)
        text_features = self.clip_model.encode_text(text_tokens).type(self.dtype)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        similarity = torch.zeros((1, len(text_array)), device=devices.device_interrogate)
        for i in range(image_features.shape[0]):
            similarity += (100.0 * image_features[i].unsqueeze(0) @ text_features.T).softmax(dim=-1)
        similarity /= image_features.shape[0]

        top_probs, top_labels = similarity.cpu().topk(top_count, dim=-1)
        probs = top_probs[0]
        labels = top_labels[0]
        return [(text_array[int(labels[i])], (probs[i].numpy() * 100)) for i in range(top_count)]

    def generate_caption(self, pil_image):
        """Return the BLIP caption for ``pil_image``.

        The image is resized to a ``blip_image_eval_size`` square, converted
        to a normalized tensor, cast to ``self.dtype`` and moved to
        ``devices.device_interrogate`` before generation. ``load()`` must
        have been called first so that ``self.blip_model`` and ``self.dtype``
        are set.
        """
        gpu_image = transforms.Compose([
            transforms.Resize((blip_image_eval_size, blip_image_eval_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)

        with torch.no_grad():
            caption = self.blip_model.generate(gpu_image, sample=False, num_beams=shared.opts.interrogate_clip_num_beams, min_length=shared.opts.interrogate_clip_min_length, max_length=shared.opts.interrogate_clip_max_length)

        return caption[0]

    def interrogate(self, pil_image):
        """Build a text description of ``pil_image``.

        The result starts with the BLIP caption, followed by ``", "``-separated
        CLIP matches: optionally the best ``"by <artist>"`` entry, then the
        top matches of every category. With
        ``shared.opts.interrogate_return_ranks`` set, category matches are
        written as ``(match:score)`` with the score as a 0-1 fraction.

        Errors are printed to stderr rather than raised; whatever was built
        so far is returned with ``"<error>"`` appended. Models are unloaded
        before returning.
        """
        # Start from an empty string (not None) so the error handler can
        # always append to it, even when loading or captioning fails.
        res = ""

        try:

            if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:
                lowvram.send_everything_to_cpu()
                devices.torch_gc()

            self.load()

            caption = self.generate_caption(pil_image)
            # BLIP is no longer needed; release it before the CLIP pass.
            self.send_blip_to_ram()
            devices.torch_gc()

            res = caption

            clip_image = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)

            with torch.no_grad(), devices.autocast():
                image_features = self.clip_model.encode_image(clip_image).type(self.dtype)

                image_features /= image_features.norm(dim=-1, keepdim=True)

                if shared.opts.interrogate_use_builtin_artists:
                    artist_matches = self.rank(image_features, ["by " + artist.name for artist in shared.artist_db.artists])

                    # rank() returns nothing when there are no artists to rank.
                    if artist_matches:
                        res += ", " + artist_matches[0][0]

                for name, topn, items in self.categories:
                    matches = self.rank(image_features, items, top_count=topn)
                    for match, score in matches:
                        if shared.opts.interrogate_return_ranks:
                            # rank() reports percentages; show a 0-1 fraction.
                            res += f", ({match}:{score/100:.3f})"
                        else:
                            res += ", " + match

        except Exception:
            print("Error interrogating", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            res += "<error>"

        self.unload()

        return res
CLIP interrogator 2022-09-11 09:48:36 -06:00			`import os`
			`import sys`
			`import traceback`
			`from collections import namedtuple`
			`import re`

			`import torch`

			`from torchvision import transforms`
			`from torchvision.transforms.functional import InterpolationMode`

			`import modules.shared as shared`
use modelloader for #4956 2022-12-03 08:45:51 -07:00			`from modules import devices, paths, lowvram, modelloader`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`blip_image_eval_size = 384`
			`clip_model_name = 'ViT-L/14'`

			`Category = namedtuple("Category", ["name", "topn", "items"])`

			`re_topn = re.compile(r"\.top(\d+)\.")`

preprocessing for textual inversion added 2022-10-02 13:41:21 -06:00
CLIP interrogator 2022-09-11 09:48:36 -06:00			`class InterrogateModels:`
			`blip_model = None`
			`clip_model = None`
			`clip_preprocess = None`
			`categories = None`
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`dtype = None`
interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00			`running_on_cpu = None`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`def __init__(self, content_dir):`
			`self.categories = []`
interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00			`self.running_on_cpu = devices.device_interrogate == torch.device("cpu")`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`if os.path.exists(content_dir):`
			`for filename in os.listdir(content_dir):`
			`m = re_topn.search(filename)`
			`topn = 1 if m is None else int(m.group(1))`

			`with open(os.path.join(content_dir, filename), "r", encoding="utf8") as file:`
			`lines = [x.strip() for x in file.readlines()]`

			`self.categories.append(Category(name=filename, topn=topn, items=lines))`

			`def load_blip_model(self):`
			`import models.blip`

use modelloader for #4956 2022-12-03 08:45:51 -07:00			`files = modelloader.load_models(`
			`model_path=os.path.join(paths.models_path, "BLIP"),`
			`model_url='https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth',`
			`ext_filter=[".pth"],`
			`download_name='model_base_caption_capfilt_large.pth',`
			`)`
[interrogator] mkdir check 2022-11-24 06:04:45 -07:00
use modelloader for #4956 2022-12-03 08:45:51 -07:00			`blip_model = models.blip.blip_decoder(pretrained=files[0], image_size=blip_image_eval_size, vit='base', med_config=os.path.join(paths.paths["BLIP"], "configs", "med_config.json"))`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`blip_model.eval()`

			`return blip_model`

			`def load_clip_model(self):`
			`import clip`

interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00			`if self.running_on_cpu:`
Added "--clip-models-path" switch to avoid using default "~/.cache/clip" and enable to run under unprivileged user without homedir 2022-10-30 15:14:07 -06:00			`model, preprocess = clip.load(clip_model_name, device="cpu", download_root=shared.cmd_opts.clip_models_path)`
interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00			`else:`
Added "--clip-models-path" switch to avoid using default "~/.cache/clip" and enable to run under unprivileged user without homedir 2022-10-30 15:14:07 -06:00			`model, preprocess = clip.load(clip_model_name, download_root=shared.cmd_opts.clip_models_path)`
interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00
CLIP interrogator 2022-09-11 09:48:36 -06:00			`model.eval()`
Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`model = model.to(devices.device_interrogate)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`return model, preprocess`

			`def load(self):`
			`if self.blip_model is None:`
			`self.blip_model = self.load_blip_model()`
interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00			`if not shared.cmd_opts.no_half and not self.running_on_cpu:`
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`self.blip_model = self.blip_model.half()`
CLIP interrogator 2022-09-11 09:48:36 -06:00
Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`self.blip_model = self.blip_model.to(devices.device_interrogate)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`if self.clip_model is None:`
			`self.clip_model, self.clip_preprocess = self.load_clip_model()`
interrogate: Fix CLIP-interrogation on CPU Currently, trying to perform CLIP interrogation on a CPU fails, saying: ``` RuntimeError: "slow_conv2d_cpu" not implemented for 'Half' ``` This merge request fixes this issue by detecting whether the target device is CPU and, if so, force-enabling `--no-half` and passing `device="cpu"` to `clip.load()` (which then does some extra tricks to ensure it works correctly on CPU). 2022-10-20 11:22:59 -06:00			`if not shared.cmd_opts.no_half and not self.running_on_cpu:`
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`self.clip_model = self.clip_model.half()`
CLIP interrogator 2022-09-11 09:48:36 -06:00
Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`self.clip_model = self.clip_model.to(devices.device_interrogate)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`self.dtype = next(self.clip_model.parameters()).dtype`

memory optimization for CLIP interrogator changed default cfg_scale to a higher value 2022-09-12 02:55:27 -06:00			`def send_clip_to_ram(self):`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`if not shared.opts.interrogate_keep_models_in_memory:`
			`if self.clip_model is not None:`
			`self.clip_model = self.clip_model.to(devices.cpu)`

memory optimization for CLIP interrogator changed default cfg_scale to a higher value 2022-09-12 02:55:27 -06:00			`def send_blip_to_ram(self):`
			`if not shared.opts.interrogate_keep_models_in_memory:`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`if self.blip_model is not None:`
			`self.blip_model = self.blip_model.to(devices.cpu)`

memory optimization for CLIP interrogator changed default cfg_scale to a higher value 2022-09-12 02:55:27 -06:00			`def unload(self):`
			`self.send_clip_to_ram()`
			`self.send_blip_to_ram()`

			`devices.torch_gc()`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`def rank(self, image_features, text_array, top_count=1):`
			`import clip`

memory optimization for CLIP interrogator changed default cfg_scale to a higher value 2022-09-12 02:55:27 -06:00			`if shared.opts.interrogate_clip_dict_limit != 0:`
			`text_array = text_array[0:int(shared.opts.interrogate_clip_dict_limit)]`

CLIP interrogator 2022-09-11 09:48:36 -06:00			`top_count = min(top_count, len(text_array))`
Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`text_tokens = clip.tokenize([text for text in text_array], truncate=True).to(devices.device_interrogate)`
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`text_features = self.clip_model.encode_text(text_tokens).type(self.dtype)`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`text_features /= text_features.norm(dim=-1, keepdim=True)`

Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`similarity = torch.zeros((1, len(text_array))).to(devices.device_interrogate)`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`for i in range(image_features.shape[0]):`
			`similarity += (100.0 * image_features[i].unsqueeze(0) @ text_features.T).softmax(dim=-1)`
			`similarity /= image_features.shape[0]`

			`top_probs, top_labels = similarity.cpu().topk(top_count, dim=-1)`
			`return [(text_array[top_labels[0][i].numpy()], (top_probs[0][i].numpy()*100)) for i in range(top_count)]`

			`def generate_caption(self, pil_image):`
			`gpu_image = transforms.Compose([`
			`transforms.Resize((blip_image_eval_size, blip_image_eval_size), interpolation=InterpolationMode.BICUBIC),`
			`transforms.ToTensor(),`
			`transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))`
Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`])(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`with torch.no_grad():`
			`caption = self.blip_model.generate(gpu_image, sample=False, num_beams=shared.opts.interrogate_clip_num_beams, min_length=shared.opts.interrogate_clip_min_length, max_length=shared.opts.interrogate_clip_max_length)`

			`return caption[0]`

make CLIP interrogate ranks output sane values 2022-10-16 23:41:02 -06:00			`def interrogate(self, pil_image):`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`res = None`

			`try:`
memory optimization for CLIP interrogator changed default cfg_scale to a higher value 2022-09-12 02:55:27 -06:00
			`if shared.cmd_opts.lowvram or shared.cmd_opts.medvram:`
			`lowvram.send_everything_to_cpu()`
			`devices.torch_gc()`

CLIP interrogator 2022-09-11 09:48:36 -06:00			`self.load()`

			`caption = self.generate_caption(pil_image)`
memory optimization for CLIP interrogator changed default cfg_scale to a higher value 2022-09-12 02:55:27 -06:00			`self.send_blip_to_ram()`
			`devices.torch_gc()`

CLIP interrogator 2022-09-11 09:48:36 -06:00			`res = caption`

Add 'interrogate' and 'all' choices to --use-cpu * Add 'interrogate' and 'all' choices to --use-cpu * Change type for --use-cpu argument to str.lower, so that choices are case insensitive 2022-10-14 02:42:53 -06:00			`clip_image = self.clip_preprocess(pil_image).unsqueeze(0).type(self.dtype).to(devices.device_interrogate)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
Use devices.autocast instead of torch.autocast 2022-11-28 19:36:35 -07:00			`with torch.no_grad(), devices.autocast():`
chore: Fix typos 2022-10-08 13:12:24 -06:00			`image_features = self.clip_model.encode_image(clip_image).type(self.dtype)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`image_features /= image_features.norm(dim=-1, keepdim=True)`
CLIP interrogator 2022-09-11 09:48:36 -06:00
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`if shared.opts.interrogate_use_builtin_artists:`
			`artist = self.rank(image_features, ["by " + artist.name for artist in shared.artist_db.artists])[0]`
CLIP interrogator 2022-09-11 09:48:36 -06:00
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`res += ", " + artist[0]`
CLIP interrogator 2022-09-11 09:48:36 -06:00
add half() supporrt for CLIP interrogation 2022-09-11 14:24:24 -06:00			`for name, topn, items in self.categories:`
			`matches = self.rank(image_features, items, top_count=topn)`
			`for match, score in matches:`
make CLIP interrogate ranks output sane values 2022-10-16 23:41:02 -06:00			`if shared.opts.interrogate_return_ranks:`
			`res += f", ({match}:{score/100:.3f})"`
Fix CLIP Interrogator and disable ranks for it 2022-10-16 16:10:59 -06:00			`else:`
			`res += ", " + match`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`except Exception:`
fix F541 f-string without any placeholders 2022-12-24 12:35:29 -07:00			`print("Error interrogating", file=sys.stderr)`
CLIP interrogator 2022-09-11 09:48:36 -06:00			`print(traceback.format_exc(), file=sys.stderr)`
remove mistaken error message 2022-09-12 03:26:37 -06:00			`res += "<error>"`
CLIP interrogator 2022-09-11 09:48:36 -06:00
			`self.unload()`

			`return res`