From 3256f9e33c4288bdc33fb107ea6583b9b0b357c2 Mon Sep 17 00:00:00 2001
From: Victor Hall
Date: Sun, 3 Mar 2024 15:18:54 -0500
Subject: [PATCH 1/3] add phrase grounding to kosmos2 caption script

---
 caption_kosmos2.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/caption_kosmos2.py b/caption_kosmos2.py
index f5a5bab..2576004 100644
--- a/caption_kosmos2.py
+++ b/caption_kosmos2.py
@@ -76,7 +76,12 @@ def main(args):
                 full_file_path = os.path.join(root, file)
                 image = Image.open(full_file_path)
 
-                inputs = processor(text=GROUNDING+args.prompt, images=image, return_tensors="pt")
+                if args.phrase_mode:
+                    text = GROUNDING + "".join(["<phrase>" + x.strip() + "</phrase>" for x in args.prompt.split(",")])
+                else:
+                    text = GROUNDING + args.prompt
+
+                inputs = processor(text=text, images=image, return_tensors="pt")
 
                 with torch.cuda.amp.autocast(enabled=args.dtype != "fp32", dtype=dtype):
                     generated_ids = model.generate(
@@ -98,7 +103,7 @@ def main(args):
                 print(f"File: {full_file_path}, Generated caption: {processed_text}")
 
                 name = os.path.splitext(full_file_path)[0]
-                if not os.path.exists(f"{name}.txt") or args.overwrite:
+                if not os.path.exists(f"{name}.txt") or args.overwrite and not args.save_entities_only:
                     with open(f"{name}.txt", "w") as f:
                         f.write(processed_text)
 
@@ -114,15 +119,20 @@ if __name__ == "__main__":
     parser.description = "Kosmos-2 captioning script"
     parser.add_argument("--data_root", type=str, default="input", help="Path to folder of images to caption")
     parser.add_argument("--prompt", type=str, default="Describe this image in detail: ", help="Prompt for generating caption")
+    parser.add_argument("--phrase_mode", action="store_true", default=False, help="uses 'phrase mode' grounding, interprets prompt as csv list of phrases to ground.")
     parser.add_argument("--keep_prompt", action="store_true", default=False, help="will keep the prompt at the start of the caption when saved")
     parser.add_argument("--max_new_tokens", type=int, default=128, help="Maximum number of tokens to generate")
     parser.add_argument("--save_entities", action="store_true", default=False, help="Save coord box with entities to a separate .ent file")
+    parser.add_argument("--save_entities_only", action="store_true", default=False, help="Only save coord box with entities to a separate .ent file, do not write caption .txt")
     parser.add_argument("--overwrite", action="store_true", default=False, help="will overwrite .txt and .ent files if they exist")
     parser.add_argument("--cpu", action="store_true", default=False, help="use cpu instead of cuda")
     parser.add_argument("--dtype", type=str, default="fp16", help="force a different dtype if using GPU (fp16, bf16, fp32) (default: fp16)")
     args = parser.parse_args()
     parser.print_help()
 
+    if args.save_entities_only:
+        args.save_entities = True
+
     if not args.prompt.startswith(" "):
         args.prompt = " " + args.prompt

From aca186769706f7e5361036a2cd99d06265c56754 Mon Sep 17 00:00:00 2001
From: Victor Hall
Date: Sun, 3 Mar 2024 15:39:25 -0500
Subject: [PATCH 2/3] update caption docs and remove open flamingo script

---
 caption_fl.py      | 209 ---------------------------------------------
 doc/CAPTION.md     |  33 +++----
 doc/CAPTION_COG.md |   4 +-
 3 files changed, 15 insertions(+), 231 deletions(-)
 delete mode 100644 caption_fl.py

diff --git a/caption_fl.py b/caption_fl.py
deleted file mode 100644
index d528271..0000000
--- a/caption_fl.py
+++ /dev/null
@@ -1,209 +0,0 @@
-"""
-Copyright [2022-2023] Victor C Hall
-
-Licensed under the GNU Affero General Public License;
-You may not use this
code except in compliance with the License. -You may obtain a copy of the License at - - https://www.gnu.org/licenses/agpl-3.0.en.html - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import os - -from PIL import Image -import argparse -import requests -from transformers import Blip2Processor, Blip2ForConditionalGeneration, GitProcessor, GitForCausalLM, AutoModel, AutoProcessor -from huggingface_hub import hf_hub_download -from open_flamingo import create_model_and_transforms - -import torch -from pynvml import * - -import time -from colorama import Fore, Style - - -SUPPORTED_EXT = [".jpg", ".png", ".jpeg", ".bmp", ".jfif", ".webp"] - -def get_gpu_memory_map(): - nvmlInit() - handle = nvmlDeviceGetHandleByIndex(0) - info = nvmlDeviceGetMemoryInfo(handle) - nvmlShutdown() - return info.used/1024/1024 - -def remove_duplicates(string): - words = string.split(', ') # Split the string into individual words - unique_words = [] - - for word in words: - if word not in unique_words: - unique_words.append(word) - else: - break # Stop appending words once a duplicate is found - - return ', '.join(unique_words) - -def get_examples(example_root, image_processor): - examples = [] - for root, dirs, files in os.walk(example_root): - for file in files: - ext = os.path.splitext(file)[-1].lower() - if ext in SUPPORTED_EXT: - #get .txt file of same base name - txt_file = os.path.splitext(file)[0] + ".txt" - with open(os.path.join(root, txt_file), 'r') as f: - caption = f.read() - image = Image.open(os.path.join(root, file)) - vision_x = [image_processor(image).unsqueeze(0)] - #vision_x = torch.cat(vision_x, dim=0) - #vision_x = vision_x.unsqueeze(1).unsqueeze(0) - examples.append((caption, vision_x)) - for x in examples: - print(f" ** Example: {x[0]}") - return examples - -def get_dtype_for_cuda_device(device): - # check compute capability - compute_capability = torch.cuda.get_device_capability() - if compute_capability[0] >= 8: - dtype = torch.bfloat16 - else: - dtype = torch.float16 - return dtype - - -def main(args): - device = "cuda" if torch.cuda.is_available() and not args.force_cpu else "cpu" - dtype = get_dtype_for_cuda_device(device) if device == "cuda" else torch.float32 - - if args.prompt: - prompt = args.prompt - else: - prompt = ": " - print(f" using prompt: {prompt}") - - if "mpt7b" in args.model: - lang_encoder_path="anas-awadalla/mpt-7b" - tokenizer_path="anas-awadalla/mpt-7b" - elif "mpt1b" in args.model: - lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b" - tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b" - - model, image_processor, tokenizer = create_model_and_transforms( - clip_vision_encoder_path="ViT-L-14", - clip_vision_encoder_pretrained="openai", - lang_encoder_path=lang_encoder_path, - tokenizer_path=tokenizer_path, - cross_attn_every_n_layers=1, - ) - - tokenizer.padding_side = "left" - - checkpoint_path = hf_hub_download(args.model, "checkpoint.pt") - model.load_state_dict(torch.load(checkpoint_path), strict=False) - print(f"GPU memory used, before loading model: {get_gpu_memory_map()} MB") - model.to(0, dtype=dtype) - print(f"GPU memory used, after loading model: {get_gpu_memory_map()} MB") - - # examples give few shot learning for captioning the novel image - examples = get_examples(args.example_root, 
image_processor) - - prompt = "" - output_prompt = "Output:" - per_image_prompt = " " + output_prompt - - for example in iter(examples): - prompt += f"{per_image_prompt}{example[0]}<|endofchunk|>" - prompt += per_image_prompt # prepare for novel example - prompt = prompt.replace("\n", "") # in case captions had newlines - print(f" \n** Final full prompt with example pairs: {prompt}") - - # os.walk all files in args.data_root recursively - for root, dirs, files in os.walk(args.data_root): - for file in files: - #get file extension - ext = os.path.splitext(file)[1] - if ext.lower() in SUPPORTED_EXT: - start_time = time.time() - - full_file_path = os.path.join(root, file) - image = Image.open(full_file_path) - - vision_x = [vx[1][0] for vx in examples] - vision_x.append(image_processor(image).unsqueeze(0)) - vision_x = torch.cat(vision_x, dim=0) - vision_x = vision_x.unsqueeze(1).unsqueeze(0) - vision_x = vision_x.to(device, dtype=dtype) - - lang_x = tokenizer( - [prompt], # blank for image captioning - return_tensors="pt", - ) - lang_x.to(device) - - input_ids = lang_x["input_ids"].to(device) - - with torch.cuda.amp.autocast(dtype=dtype), torch.no_grad(): - generated_text = model.generate( - vision_x=vision_x, - lang_x=input_ids, - attention_mask=lang_x["attention_mask"], - max_new_tokens=args.max_new_tokens, - min_new_tokens=args.min_new_tokens, - num_beams=args.num_beams, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - repetition_penalty=args.repetition_penalty, - ) - del vision_x - del lang_x - - # trim and clean - generated_text = tokenizer.decode(generated_text[0][len(input_ids[0]):], skip_special_tokens=True) - generated_text = generated_text.split(output_prompt)[0] - generated_text = remove_duplicates(generated_text) - - exec_time = time.time() - start_time - print(f"* Caption: {generated_text}") - - print(f" Time for last caption: {exec_time} sec. 
GPU memory used: {get_gpu_memory_map()} MB") - - name = os.path.splitext(full_file_path)[0] - if not os.path.exists(name): - with open(f"{name}.txt", "w") as f: - f.write(generated_text) - print("Done!") - -if __name__ == "__main__": - print(f"Available models:") - print(f" openflamingo/OpenFlamingo-9B-vitl-mpt7b (default)") - print(f" openflamingo/OpenFlamingo-3B-vitl-mpt1b") - print(f" openflamingo/OpenFlamingo-4B-vitl-rpj3b") - parser = argparse.ArgumentParser() - parser.add_argument("--data_root", type=str, default="input", help="Path to images") - parser.add_argument("--example_root", type=str, default="examples", help="Path to 2-3 precaptioned images to guide generation") - parser.add_argument("--model", type=str, default="openflamingo/OpenFlamingo-9B-vitl-mpt7b", help="Model name or path") - parser.add_argument("--force_cpu", action="store_true", default=False, help="force using CPU even if GPU is available") - parser.add_argument("--min_new_tokens", type=int, default=20, help="minimum number of tokens to generate") - parser.add_argument("--max_new_tokens", type=int, default=50, help="maximum number of tokens to generate") - parser.add_argument("--num_beams", type=int, default=8, help="number of beams, more is more accurate but slower") - parser.add_argument("--prompt", type=str, default="Output: ", help="prompt to use for generation, default is 'Output: '") - parser.add_argument("--temperature", type=float, default=1.0, help="temperature for sampling, 1.0 is default") - parser.add_argument("--top_k", type=int, default=0, help="top_k sampling, 0 is default") - parser.add_argument("--top_p", type=float, default=1.0, help="top_p sampling, 1.0 is default") - parser.add_argument("--repetition_penalty", type=float, default=1.0, help="repetition penalty, 1.0 is default") - parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty, 1.0 is default") - args = parser.parse_args() - - print(f"** OPEN-FLAMINGO ** Captioning files in: {args.data_root}") - print(f"** Using model: {args.model}") - main(args) \ No newline at end of file diff --git a/doc/CAPTION.md b/doc/CAPTION.md index 840d3f9..9b46ec9 100644 --- a/doc/CAPTION.md +++ b/doc/CAPTION.md @@ -1,34 +1,27 @@ # Captioning tools -## Open-Flamingo +## CogVLM -#### Note: Open-Flamingo currently only works on Torch 2.0.1. If you want to use it, you will have to backdate your torch installation, which will break features in the trainer. I recommend making a separate environment for Open Flamingo captioning instead. You can run through normal install, then `pip install open-flamingo` in the separate envirment to back date torch and make that install open-flamingo only. +[CogVLM](https://github.com/THUDM/CogVLM) is, so far, the best model for generating synthetic captions. The script for Cog is enhanced, so read the [CogVLM README](CAPTION_COG.md) for more information. -`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"` +## Kosmos-2 -This script uses two example image/caption pairs located in the `/example` folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the caption in the same folder. +Microsoft's [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) is significantly lighter weight than Cog, using <5GB of VRAM and generating captions in under 1/21 second on a RTX 3090. 
-This script currently requires an AMPERE or newer GPU due to using bfloat16.
+It has the capability to output grounding bounding boxes.
 
-**Trying out different example image/caption pairs will influence how the system captions the input images.** Adding more examples slows processing.
+Run `python caption_kosmos2.py --help` to get a list of options.
 
-Supported models:
+### _Kosmos-2 grounding_
 
-* `openflamingo/OpenFlamingo-3B-vitl-mpt1b` Small model, requires 8 GB VRAM a num_beams 3, or 12 GB at num_beams 16
-* `openflamingo/OpenFlamingo-9B-vitl-mpt7b` Large model, requires 24 GB VRAM at num_beams 3, or 36.7gb at num_beams 32
+Kosmos-2 can generate bounding boxes for the "grounding" of the caption. This is useful for identifying specific objects in the image in 2D space, which can help in later pipelines.
 
-The small model with more beams (ex. 16) performs well with details and should not be immediately discounted.
+It's worth reading the documentation [here](https://huggingface.co/microsoft/kosmos-2-patch14-224) to understand the grounding output.
 
-The larger model is more accurate with proper names (i.e. identifying well-known celebrities, objects, or locations) and seems to exhibit a larger vocabulary.
+`--save_entities` outputs a '.ent' file with bounding box information. The entities identified are based on the caption that is produced.
 
-Primary params:
+`--phrase_mode` This modifies how the model is called, wrapping phrases in `<phrase>` tags. It also interprets your prompt as a CSV, wrapping each item in a phrase tag. You might use it with `--prompt "dog,cat,tree"`, for instance. *This is not a guarantee that your phrases will be found and written to the grounding output file.*
 
-* `--num_beams 3` increasing uses more VRAM and runs slower, may improve detail, but can increase hallicunations
-* `--min_new_tokens 20` and `--max_new_tokens 35` control the length of the caption
+`--save_entities_only` This will not attempt to write the caption into the .txt file at all. **This is recommended with `--phrase_mode`**. Using this option forces `--save_entities`.
 
-Other settings:
-
-* `--force_cpu` forces to use CPU even if a CUDA device is present
-* `--temperature 1.0` relates to randomness used for next token chosen
-* `--repetition_penalty 1.0` penalizes repeating tokens/words, can adjust up if you see repeated terms
-* `--length_penalty 1.0` penalizes longer captions
+There is a simple UI for viewing the grounding output in the scripts folder. Launch it with `python scripts/grounding_ui.py`; it will open a window that lets you select a directory, then displays the images with their bounding boxes.
diff --git a/doc/CAPTION_COG.md b/doc/CAPTION_COG.md
index 9929c73..0c17cde 100644
--- a/doc/CAPTION_COG.md
+++ b/doc/CAPTION_COG.md
@@ -120,7 +120,7 @@ I would recommend not setting any of these and leave the default values until yo
 
 `--no_repeat_ngram_size 3` prevents the same n-gram (successive token sequence) from being repeated in the output. Can help prevent the model from repeating itself.
 
-`--bad_words "foo,bar"` Attempts to prevent the model from using these words in the output caption. Comma-delimited.
+`--bad_words "foo,bar"` Attempts to prevent the model from using these words in the output caption. Comma-delimited. Very useful; consider trying `"depicts,poses,posing,showcases,appears,suggests"` to get more concise phrasing in captions. This is not a guarantee, due to [different tokenizations](https://github.com/huggingface/transformers/issues/17504) being possible for a given bad_word.
 
 `--force_word "photograph,Spain"` Attempts to force the model to include the words in the output caption. Comma-delimited.
 
@@ -128,7 +128,7 @@ I would recommend not setting any of these and leave the default values until yo
 
 `--max_new_tokens 120` Truncates output after n tokens. May cut off captions abruptly.
 
-`--no_repeat_ngram_size 3` prevents the same n-gram from being repeated in the output. Default is 0, which means no n-gram is prevented from repeating. Setting this to 2 or 3 can help prevent the model from repeating itself.
+`--no_repeat_ngram_size 3` prevents the same n-gram (sequence of n tokens) from being repeated in the output. Default is 0, which means no n-gram is prevented from repeating. Setting this to 2 or 3 can help prevent the model from repeating itself.
 
 `--min_new_tokens 5` Force the model to produce at least n tokens.

From 91a9b3d898d12e842125b668df5c5f69be0d6a4e Mon Sep 17 00:00:00 2001
From: Victor Hall
Date: Sun, 3 Mar 2024 15:47:44 -0500
Subject: [PATCH 3/3] bugfix and doc fix

---
 caption_kosmos2.py | 2 +-
 doc/CAPTION.md     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/caption_kosmos2.py b/caption_kosmos2.py
index 2576004..f55a1fc 100644
--- a/caption_kosmos2.py
+++ b/caption_kosmos2.py
@@ -103,7 +103,7 @@ def main(args):
                 print(f"File: {full_file_path}, Generated caption: {processed_text}")
 
                 name = os.path.splitext(full_file_path)[0]
-                if not os.path.exists(f"{name}.txt") or args.overwrite and not args.save_entities_only:
+                if (not os.path.exists(f"{name}.txt") or args.overwrite) and not args.save_entities_only:
                     with open(f"{name}.txt", "w") as f:
                         f.write(processed_text)
 
diff --git a/doc/CAPTION.md b/doc/CAPTION.md
index 9b46ec9..8b1896d 100644
--- a/doc/CAPTION.md
+++ b/doc/CAPTION.md
@@ -6,7 +6,7 @@
 
 ## Kosmos-2
 
-Microsoft's [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) is significantly lighter weight than Cog, using <5GB of VRAM and generating captions in under 1/21 second on a RTX 3090.
+Microsoft's [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) is significantly lighter weight than Cog, using <5GB of VRAM and generating captions in under a second on a RTX 3090.
 
 It has the capability to output grounding bounding boxes.
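
---

For reference, a minimal sketch of what this series changes, outside the full script: PATCH 1's phrase-mode prompt construction and PATCH 3's operator-precedence fix on the save condition. The `GROUNDING` value and the helper names below are assumptions for illustration (the constant is defined elsewhere in caption_kosmos2.py, not in this diff); the `<phrase>` tag format follows the Kosmos-2 model card.

```python
import os

# Assumption: GROUNDING is the Kosmos-2 grounding prefix defined elsewhere in caption_kosmos2.py.
GROUNDING = "<grounding>"

def build_prompt(prompt: str, phrase_mode: bool) -> str:
    """Build the text passed to the Kosmos-2 processor (mirrors PATCH 1)."""
    if phrase_mode:
        # --phrase_mode: treat the prompt as a CSV of phrases, each wrapped in <phrase> tags
        return GROUNDING + "".join("<phrase>" + p.strip() + "</phrase>" for p in prompt.split(","))
    # default: plain caption prompt with the grounding prefix
    return GROUNDING + prompt

def should_write_caption(txt_path: str, overwrite: bool, save_entities_only: bool) -> bool:
    """Mirrors PATCH 3: parentheses matter because 'and' binds tighter than 'or'."""
    return (not os.path.exists(txt_path) or overwrite) and not save_entities_only

if __name__ == "__main__":
    print(build_prompt("dog,cat,tree", phrase_mode=True))
    # -> <grounding><phrase>dog</phrase><phrase>cat</phrase><phrase>tree</phrase>
    print(should_write_caption("example.txt", overwrite=True, save_entities_only=True))
    # -> False: with --save_entities_only, no .txt is written even when --overwrite is set
```

Without the parentheses (the PATCH 1 version), a missing .txt would still have been written when `--save_entities_only` was set, which is the bug PATCH 3 corrects.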