From acac3f154752eedd48cb0c226f8a0e034d497e2e Mon Sep 17 00:00:00 2001 From: Victor Hall Date: Fri, 30 Jun 2023 00:37:49 -0400 Subject: [PATCH] update caption doc --- caption_fl.py | 9 +++++---- doc/CAPTION.md | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/caption_fl.py b/caption_fl.py index baec83e..1376631 100644 --- a/caption_fl.py +++ b/caption_fl.py @@ -84,8 +84,8 @@ def main(args): lang_encoder_path="anas-awadalla/mpt-7b" tokenizer_path="anas-awadalla/mpt-7b" elif "mpt1b" in args.model: - lang_encoder_path="anas-awadalla/mpt-1b" - tokenizer_path="anas-awadalla/mpt-1b" + lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b" + tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b" model, image_processor, tokenizer = create_model_and_transforms( clip_vision_encoder_path="ViT-L-14", @@ -141,7 +141,7 @@ def main(args): input_ids = lang_x["input_ids"].to(device) - with torch.cuda.amp.autocast(dtype=dtype): + with torch.cuda.amp.autocast(dtype=dtype), torch.no_grad(): generated_text = model.generate( vision_x=vision_x, lang_x=input_ids, @@ -171,6 +171,7 @@ def main(args): if not os.path.exists(name): with open(f"{name}.txt", "w") as f: f.write(generated_text) + print("Done!") if __name__ == "__main__": print(f"Available models:") @@ -188,7 +189,7 @@ if __name__ == "__main__": parser.add_argument("--prompt", type=str, default="Output: ", help="prompt to use for generation, default is 'Output: '") parser.add_argument("--temperature", type=float, default=1.0, help="temperature for sampling, 1.0 is default") parser.add_argument("--top_k", type=int, default=0, help="top_k sampling, 0 is default") - parser.add_argument("--top_p", type=float, default=0.9, help="top_p sampling, 0.9 is default") + parser.add_argument("--top_p", type=float, default=1.0, help="top_p sampling, 1.0 is default") parser.add_argument("--repetition_penalty", type=float, default=1.0, help="repetition penalty, 1.0 is default") 
parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty, 1.0 is default") args = parser.parse_args() diff --git a/doc/CAPTION.md b/doc/CAPTION.md index d28a024..50ab754 100644 --- a/doc/CAPTION.md +++ b/doc/CAPTION.md @@ -4,6 +4,21 @@ `python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"` +This script uses two example image/caption pairs located in the example folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the caption in the same folder. + +This script currently requires an AMPERE or newer GPU due to using bfloat16. + +**Trying out different example image/caption pairs will influence how the system captions the input images.** + +Supported models: +`openflamingo/OpenFlamingo-3B-vitl-mpt1b` Small model, requires 8 GB VRAM at num_beams 3, or 12 GB at num_beams 16 +`openflamingo/OpenFlamingo-9B-vitl-mpt7b` Large model, requires 24 GB VRAM at num_beams 3 + +The small model with more beams (ex. 16) performs well with details and should not be immediately discounted. + +The larger model is more accurate with proper names (i.e. identifying well-known celebrities, objects, or locations) and seems to exhibit a larger vocabulary. + +Primary params: `--num_beams 3` increasing uses more VRAM but may improve detail, also can increase hallucinations `--min_new_tokens 20` and `--max_new_tokens 35` control the length of the caption