update caption doc

parent 41eaa09938
commit acac3f1547
@@ -84,8 +84,8 @@ def main(args):
        lang_encoder_path="anas-awadalla/mpt-7b"
        tokenizer_path="anas-awadalla/mpt-7b"
    elif "mpt1b" in args.model:
-       lang_encoder_path="anas-awadalla/mpt-1b"
-       tokenizer_path="anas-awadalla/mpt-1b"
+       lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b"
+       tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b"

    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
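For context, a minimal sketch of how these paths plug into model construction, assuming the published OpenFlamingo `create_model_and_transforms` API; the `clip_vision_encoder_pretrained` and `cross_attn_every_n_layers` values below are assumptions based on the OpenFlamingo-3B-vitl-mpt1b release, not taken from this diff:

```python
# Sketch only: building the 3B model with the updated language-encoder paths.
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",   # assumed, not in this diff
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,               # assumed 3B setting
)
```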

@@ -141,7 +141,7 @@ def main(args):
    input_ids = lang_x["input_ids"].to(device)

-   with torch.cuda.amp.autocast(dtype=dtype):
+   with torch.cuda.amp.autocast(dtype=dtype), torch.no_grad():
        generated_text = model.generate(
            vision_x=vision_x,
            lang_x=input_ids,
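The change above adds `torch.no_grad()` alongside autocast; a minimal sketch of that inference pattern (variable names are illustrative and `attention_mask` handling is omitted):

```python
import torch

# bfloat16 autocast keeps activations small; no_grad() skips building the
# autograd graph, which reduces VRAM during beam search.
with torch.cuda.amp.autocast(dtype=torch.bfloat16), torch.no_grad():
    generated_ids = model.generate(
        vision_x=vision_x,      # preprocessed image batch
        lang_x=input_ids,       # tokenized prompt
        max_new_tokens=30,
        num_beams=3,
    )
generated_text = tokenizer.decode(generated_ids[0])
```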

@@ -171,6 +171,7 @@ def main(args):
    if not os.path.exists(name):
        with open(f"{name}.txt", "w") as f:
            f.write(generated_text)
+   print("Done!")

if __name__ == "__main__":
    print(f"Available models:")

@@ -188,7 +189,7 @@ if __name__ == "__main__":
    parser.add_argument("--prompt", type=str, default="Output: ", help="prompt to use for generation, default is 'Output: '")
    parser.add_argument("--temperature", type=float, default=1.0, help="temperature for sampling, 1.0 is default")
    parser.add_argument("--top_k", type=int, default=0, help="top_k sampling, 0 is default")
-   parser.add_argument("--top_p", type=float, default=0.9, help="top_p sampling, 0.9 is default")
+   parser.add_argument("--top_p", type=float, default=1.0, help="top_p sampling, 1.0 is default")
    parser.add_argument("--repetition_penalty", type=float, default=1.0, help="repetition penalty, 1.0 is default")
    parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty, 1.0 is default")
    args = parser.parse_args()
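Since the `--top_p` default moves from 0.9 to 1.0 here, the previous sampling behaviour can still be requested on the command line; an illustrative invocation (flag names come from the argparse block above, values are placeholders):

`python caption_fl.py --data_root input --model "openflamingo/OpenFlamingo-3B-vitl-mpt1b" --top_p 0.9 --temperature 0.7`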

@@ -4,6 +4,21 @@

`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"`

This script uses two example image/caption pairs located in the example folder to prime the system to caption, then captions the images in the input folder. It saves a `.txt` file with the same base filename, containing the caption, in the same folder.
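The exact priming text is not part of this diff, but OpenFlamingo-style few-shot prompting typically interleaves the example pairs and the query roughly as below; the captions in this sketch are placeholders, and the trailing "Output: " corresponds to the `--prompt` default:

```python
# Hypothetical two-shot priming prompt; the real example captions come from
# the example folder, not from this string.
prompt = (
    "<image>Output: a dog running on a beach.<|endofchunk|>"
    "<image>Output: a bowl of fruit on a wooden table.<|endofchunk|>"
    "<image>Output: "
)
```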

This script currently requires an Ampere or newer GPU because it uses bfloat16.
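If you are unsure whether a GPU qualifies, a quick generic PyTorch check (not something the script itself runs) is:

```python
import torch

# Ampere (compute capability 8.x) and newer GPUs report native bfloat16 support.
print(torch.cuda.get_device_capability())
print(torch.cuda.is_bf16_supported())
```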

**Trying out different example image/caption pairs will influence how the system captions the input images.**

Supported models:
`openflamingo/OpenFlamingo-3B-vitl-mpt1b` Small model, requires 8 GB VRAM at num_beams 3, or 12 GB at num_beams 16
`openflamingo/OpenFlamingo-9B-vitl-mpt7b` Large model, requires 24 GB VRAM at num_beams 3

The small model with more beams (e.g. 16) performs well with details and should not be immediately discounted.

The larger model is more accurate with proper names (e.g. identifying well-known celebrities, objects, or locations) and seems to exhibit a larger vocabulary.

Primary params:
`--num_beams 3` increasing this uses more VRAM and may improve detail, but can also increase hallucinations
`--min_new_tokens 20` and `--max_new_tokens 35` control the length of the caption
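For example, a plausible run of the small model with more beams and a slightly longer caption budget (illustrative values, mirroring the command at the top of this document):

`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 35 --num_beams 16 --model "openflamingo/OpenFlamingo-3B-vitl-mpt1b"`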