From 777465529730b2cd52126c5372066dffb2b5cd5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 16 Sep 2024 12:39:18 +0200 Subject: [PATCH] Add tests for Mixtral (#2520) Disable by default because CI runners do not have enough GPUs. --- .../test_flash_mixtral.json | 114 +++++ .../test_flash_mixtral_all_params.json | 99 ++++ .../test_flash_mixtral_load.json | 458 ++++++++++++++++++ .../models/test_flash_mixtral.py | 75 +++ 4 files changed, 746 insertions(+) create mode 100644 integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral.json create mode 100644 integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_load.json create mode 100644 integration-tests/models/test_flash_mixtral.py diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral.json b/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral.json new file mode 100644 index 00000000..56419967 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral.json @@ -0,0 +1,114 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1824, + "logprob": -6.1445312, + "text": "What" + }, + { + "id": 349, + "logprob": -1.4648438, + "text": "is" + }, + { + "id": 21135, + "logprob": -13.6875, + "text": "gradient" + }, + { + "id": 24871, + "logprob": -1.6005859, + "text": "descent" + }, + { + "id": 28804, + "logprob": -0.39526367, + "text": "?" + }, + { + "id": 13, + "logprob": -0.640625, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.18774414, + "text": "\n" + } + ], + "seed": null, + "tokens": [ + { + "id": 20910, + "logprob": -0.96484375, + "special": false, + "text": "Grad" + }, + { + "id": 722, + "logprob": -0.003168106, + "special": false, + "text": "ient" + }, + { + "id": 24871, + "logprob": -0.16540527, + "special": false, + "text": " descent" + }, + { + "id": 349, + "logprob": -0.08886719, + "special": false, + "text": " is" + }, + { + "id": 396, + "logprob": -0.75878906, + "special": false, + "text": " an" + }, + { + "id": 18586, + "logprob": -0.5703125, + "special": false, + "text": " optimization" + }, + { + "id": 9464, + "logprob": -0.11242676, + "special": false, + "text": " algorithm" + }, + { + "id": 1307, + "logprob": -0.7939453, + "special": false, + "text": " used" + }, + { + "id": 298, + "logprob": -0.17102051, + "special": false, + "text": " to" + }, + { + "id": 26518, + "logprob": -0.34326172, + "special": false, + "text": " minimize" + } + ], + "top_tokens": null + }, + "generated_text": "Gradient descent is an optimization algorithm used to minimize" +} diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_all_params.json b/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_all_params.json new file mode 100644 index 00000000..00da1fed --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_all_params.json @@ -0,0 +1,99 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 24871, + "logprob": -17.234375, + "text": "descent" + }, + { + "id": 28804, + "logprob": -7.4335938, + "text": "?" 
+ }, + { + "id": 13, + "logprob": -0.8017578, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.32958984, + "text": "\n" + } + ], + "seed": 0, + "tokens": [ + { + "id": 1313, + "logprob": -2.3613281, + "special": false, + "text": "It" + }, + { + "id": 3969, + "logprob": -0.7285156, + "special": false, + "text": " seems" + }, + { + "id": 298, + "logprob": -1.3466797, + "special": false, + "text": " to" + }, + { + "id": 528, + "logprob": 0.0, + "special": false, + "text": " me" + }, + { + "id": 28725, + "logprob": -1.6757812, + "special": false, + "text": "," + }, + { + "id": 369, + "logprob": -0.06585693, + "special": false, + "text": " that" + }, + { + "id": 513, + "logprob": -1.1269531, + "special": false, + "text": " if" + }, + { + "id": 368, + "logprob": 0.0, + "special": false, + "text": " you" + }, + { + "id": 28742, + "logprob": -2.4921875, + "special": false, + "text": "'" + }, + { + "id": 267, + "logprob": 0.0, + "special": false, + "text": "re" + } + ], + "top_tokens": null + }, + "generated_text": "What is gradient descent?\n\nIt seems to me, that if you're" +} diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_load.json b/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_load.json new file mode 100644 index 00000000..55056cfd --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_mixtral/test_flash_mixtral_load.json @@ -0,0 +1,458 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1824, + "logprob": -6.1445312, + "text": "What" + }, + { + "id": 349, + "logprob": -1.4648438, + "text": "is" + }, + { + "id": 21135, + "logprob": -13.6875, + "text": "gradient" + }, + { + "id": 24871, + "logprob": -1.6005859, + "text": "descent" + }, + { + "id": 28804, + "logprob": -0.39526367, + "text": "?" 
+ }, + { + "id": 13, + "logprob": -0.640625, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.18774414, + "text": "\n" + } + ], + "seed": null, + "tokens": [ + { + "id": 20910, + "logprob": -0.96484375, + "special": false, + "text": "Grad" + }, + { + "id": 722, + "logprob": -0.003168106, + "special": false, + "text": "ient" + }, + { + "id": 24871, + "logprob": -0.16369629, + "special": false, + "text": " descent" + }, + { + "id": 349, + "logprob": -0.0881958, + "special": false, + "text": " is" + }, + { + "id": 396, + "logprob": -0.76708984, + "special": false, + "text": " an" + }, + { + "id": 18586, + "logprob": -0.57373047, + "special": false, + "text": " optimization" + }, + { + "id": 9464, + "logprob": -0.11291504, + "special": false, + "text": " algorithm" + }, + { + "id": 1307, + "logprob": -0.79589844, + "special": false, + "text": " used" + }, + { + "id": 298, + "logprob": -0.1694336, + "special": false, + "text": " to" + }, + { + "id": 26518, + "logprob": -0.34350586, + "special": false, + "text": " minimize" + } + ], + "top_tokens": null + }, + "generated_text": "Gradient descent is an optimization algorithm used to minimize" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1824, + "logprob": -6.1445312, + "text": "What" + }, + { + "id": 349, + "logprob": -1.4677734, + "text": "is" + }, + { + "id": 21135, + "logprob": -13.6875, + "text": "gradient" + }, + { + "id": 24871, + "logprob": -1.6015625, + "text": "descent" + }, + { + "id": 28804, + "logprob": -0.39453125, + "text": "?" + }, + { + "id": 13, + "logprob": -0.6435547, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.18713379, + "text": "\n" + } + ], + "seed": null, + "tokens": [ + { + "id": 20910, + "logprob": -0.9628906, + "special": false, + "text": "Grad" + }, + { + "id": 722, + "logprob": -0.0032176971, + "special": false, + "text": "ient" + }, + { + "id": 24871, + "logprob": -0.16540527, + "special": false, + "text": " descent" + }, + { + "id": 349, + "logprob": -0.08898926, + "special": false, + "text": " is" + }, + { + "id": 396, + "logprob": -0.765625, + "special": false, + "text": " an" + }, + { + "id": 18586, + "logprob": -0.5708008, + "special": false, + "text": " optimization" + }, + { + "id": 9464, + "logprob": -0.11401367, + "special": false, + "text": " algorithm" + }, + { + "id": 1307, + "logprob": -0.7963867, + "special": false, + "text": " used" + }, + { + "id": 298, + "logprob": -0.17028809, + "special": false, + "text": " to" + }, + { + "id": 26518, + "logprob": -0.34326172, + "special": false, + "text": " minimize" + } + ], + "top_tokens": null + }, + "generated_text": "Gradient descent is an optimization algorithm used to minimize" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1824, + "logprob": -6.140625, + "text": "What" + }, + { + "id": 349, + "logprob": -1.4658203, + "text": "is" + }, + { + "id": 21135, + "logprob": -13.6796875, + "text": "gradient" + }, + { + "id": 24871, + "logprob": -1.5898438, + "text": "descent" + }, + { + "id": 28804, + "logprob": -0.3955078, + "text": "?" 
+ }, + { + "id": 13, + "logprob": -0.64501953, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.18493652, + "text": "\n" + } + ], + "seed": null, + "tokens": [ + { + "id": 20910, + "logprob": -0.9580078, + "special": false, + "text": "Grad" + }, + { + "id": 722, + "logprob": -0.0032176971, + "special": false, + "text": "ient" + }, + { + "id": 24871, + "logprob": -0.16552734, + "special": false, + "text": " descent" + }, + { + "id": 349, + "logprob": -0.08874512, + "special": false, + "text": " is" + }, + { + "id": 396, + "logprob": -0.75878906, + "special": false, + "text": " an" + }, + { + "id": 18586, + "logprob": -0.5703125, + "special": false, + "text": " optimization" + }, + { + "id": 9464, + "logprob": -0.11236572, + "special": false, + "text": " algorithm" + }, + { + "id": 1307, + "logprob": -0.79541016, + "special": false, + "text": " used" + }, + { + "id": 298, + "logprob": -0.17102051, + "special": false, + "text": " to" + }, + { + "id": 26518, + "logprob": -0.34326172, + "special": false, + "text": " minimize" + } + ], + "top_tokens": null + }, + "generated_text": "Gradient descent is an optimization algorithm used to minimize" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1824, + "logprob": -6.1328125, + "text": "What" + }, + { + "id": 349, + "logprob": -1.4658203, + "text": "is" + }, + { + "id": 21135, + "logprob": -13.6796875, + "text": "gradient" + }, + { + "id": 24871, + "logprob": -1.5947266, + "text": "descent" + }, + { + "id": 28804, + "logprob": -0.39648438, + "text": "?" + }, + { + "id": 13, + "logprob": -0.6464844, + "text": "\n" + }, + { + "id": 13, + "logprob": -0.18688965, + "text": "\n" + } + ], + "seed": null, + "tokens": [ + { + "id": 20910, + "logprob": -0.9609375, + "special": false, + "text": "Grad" + }, + { + "id": 722, + "logprob": -0.003168106, + "special": false, + "text": "ient" + }, + { + "id": 24871, + "logprob": -0.16601562, + "special": false, + "text": " descent" + }, + { + "id": 349, + "logprob": -0.088134766, + "special": false, + "text": " is" + }, + { + "id": 396, + "logprob": -0.7597656, + "special": false, + "text": " an" + }, + { + "id": 18586, + "logprob": -0.5708008, + "special": false, + "text": " optimization" + }, + { + "id": 9464, + "logprob": -0.11291504, + "special": false, + "text": " algorithm" + }, + { + "id": 1307, + "logprob": -0.7944336, + "special": false, + "text": " used" + }, + { + "id": 298, + "logprob": -0.17102051, + "special": false, + "text": " to" + }, + { + "id": 26518, + "logprob": -0.34399414, + "special": false, + "text": " minimize" + } + ], + "top_tokens": null + }, + "generated_text": "Gradient descent is an optimization algorithm used to minimize" + } +] diff --git a/integration-tests/models/test_flash_mixtral.py b/integration-tests/models/test_flash_mixtral.py new file mode 100644 index 00000000..24ae1f48 --- /dev/null +++ b/integration-tests/models/test_flash_mixtral.py @@ -0,0 +1,75 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_mixtral_handle(launcher): + with launcher("mistralai/Mixtral-8x7B-v0.1", num_shard=8) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_mixtral(flash_mixtral_handle): + await flash_mixtral_handle.health(300) + return flash_mixtral_handle.client + + +@pytest.mark.skip(reason="requires > 4 shards") +@pytest.mark.asyncio +async def test_flash_mixtral(flash_mixtral, response_snapshot): + response = 
await flash_mixtral.generate(
+        "What is gradient descent?\n\n", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "Gradient descent is an optimization algorithm used to minimize"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.skip(reason="requires > 4 shards")
+@pytest.mark.asyncio
+async def test_flash_mixtral_all_params(flash_mixtral, response_snapshot):
+    response = await flash_mixtral.generate(
+        "What is gradient descent?\n\n",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        seed=0,
+    )
+
+    assert response.details.generated_tokens == 10
+    assert (
+        response.generated_text
+        == "What is gradient descent?\n\nIt seems to me, that if you're"
+    )
+    assert response == response_snapshot
+
+
+@pytest.mark.skip(reason="requires > 4 shards")
+@pytest.mark.asyncio
+async def test_flash_mixtral_load(flash_mixtral, generate_load, response_snapshot):
+    responses = await generate_load(
+        flash_mixtral, "What is gradient descent?\n\n", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    assert responses[0].details.generated_tokens == 10
+    assert (
+        responses[0].generated_text
+        == "Gradient descent is an optimization algorithm used to minimize"
+    )
+    assert all(
+        [r.generated_text == responses[0].generated_text for r in responses]
+    ), f"{[r.generated_text for r in responses]}"
+
+    assert responses == response_snapshot