diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index 8f59d75a..0b2fabf5 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -232,9 +232,9 @@ def launcher(event_loop): if num_shard is not None: args.extend(["--num-shard", str(num_shard)]) - if quantize: + if quantize is not None: args.append("--quantize") - args.append("bitsandbytes") + args.append(quantize) if trust_remote_code: args.append("--trust-remote-code") diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json new file mode 100644 index 00000000..fe632aaf --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json @@ -0,0 +1,103 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -9.953125, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4121094, + "text": "is" + }, + { + "id": 6483, + "logprob": -9.9765625, + "text": "deep" + }, + { + "id": 6509, + "logprob": -1.6767578, + "text": "learning" + }, + { + "id": 1577, + "logprob": -4.5976562, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.21813965, + "special": false, + "text": "\n" + }, + { + "id": 2772, + "logprob": -1.4130859, + "special": false, + "text": "De" + }, + { + "id": 1022, + "logprob": -0.0028419495, + "special": false, + "text": "ep" + }, + { + "id": 6509, + "logprob": -0.3244629, + "special": false, + "text": " learning" + }, + { + "id": 338, + "logprob": -0.25439453, + "special": false, + "text": " is" + }, + { + "id": 263, + "logprob": -0.43774414, + "special": false, + "text": " a" + }, + { + "id": 4933, + "logprob": -1.8105469, + "special": false, + "text": " machine" + }, + { + "id": 6509, + "logprob": -0.07116699, + "special": false, + "text": " learning" + }, + { + "id": 11043, + "logprob": -0.87158203, + "special": false, + "text": " technique" + }, + { + "id": 393, + "logprob": -0.91015625, + "special": false, + "text": " that" + } + ] + }, + "generated_text": "\nDeep learning is a machine learning technique that" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json new file mode 100644 index 00000000..2b5d4257 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json @@ -0,0 +1,98 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 338, + "logprob": -10.8046875, + "text": "is" + }, + { + "id": 6483, + "logprob": -12.6640625, + "text": "deep" + }, + { + "id": 6509, + "logprob": -3.3398438, + "text": "learning" + }, + { + "id": 1577, + "logprob": -8.3828125, + "text": "?" 
+ } + ], + "seed": 0, + "tokens": [ + { + "id": 13, + "logprob": 0.0, + "special": false, + "text": "\n" + }, + { + "id": 4013, + "logprob": -2.6992188, + "special": false, + "text": "This" + }, + { + "id": 1139, + "logprob": -0.35668945, + "special": false, + "text": " question" + }, + { + "id": 756, + "logprob": -0.08251953, + "special": false, + "text": " has" + }, + { + "id": 1063, + "logprob": -0.39697266, + "special": false, + "text": " been" + }, + { + "id": 4433, + "logprob": 0.0, + "special": false, + "text": " asked" + }, + { + "id": 1784, + "logprob": -0.9248047, + "special": false, + "text": " many" + }, + { + "id": 3064, + "logprob": 0.0, + "special": false, + "text": " times" + }, + { + "id": 1434, + "logprob": -0.90625, + "special": false, + "text": " before" + }, + { + "id": 29892, + "logprob": -0.19580078, + "special": false, + "text": "," + } + ] + }, + "generated_text": "What is deep learning ?\nThis question has been asked many times before," +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json new file mode 100644 index 00000000..36c040fe --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json @@ -0,0 +1,414 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -9.953125, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4121094, + "text": "is" + }, + { + "id": 6483, + "logprob": -9.9765625, + "text": "deep" + }, + { + "id": 6509, + "logprob": -1.6767578, + "text": "learning" + }, + { + "id": 1577, + "logprob": -4.5976562, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.21813965, + "special": false, + "text": "\n" + }, + { + "id": 2772, + "logprob": -1.4189453, + "special": false, + "text": "De" + }, + { + "id": 1022, + "logprob": -0.0028419495, + "special": false, + "text": "ep" + }, + { + "id": 6509, + "logprob": -0.3244629, + "special": false, + "text": " learning" + }, + { + "id": 338, + "logprob": -0.25439453, + "special": false, + "text": " is" + }, + { + "id": 263, + "logprob": -0.4375, + "special": false, + "text": " a" + }, + { + "id": 4933, + "logprob": -1.8105469, + "special": false, + "text": " machine" + }, + { + "id": 6509, + "logprob": -0.07116699, + "special": false, + "text": " learning" + }, + { + "id": 11043, + "logprob": -0.87158203, + "special": false, + "text": " technique" + }, + { + "id": 393, + "logprob": -0.91015625, + "special": false, + "text": " that" + } + ] + }, + "generated_text": "\nDeep learning is a machine learning technique that" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -9.953125, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4121094, + "text": "is" + }, + { + "id": 6483, + "logprob": -9.9765625, + "text": "deep" + }, + { + "id": 6509, + "logprob": -1.6767578, + "text": "learning" + }, + { + "id": 1577, + "logprob": -4.5976562, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.21813965, + "special": false, + "text": "\n" + }, + { + "id": 2772, + "logprob": -1.4189453, + "special": false, + "text": "De" + }, + { + "id": 1022, + "logprob": -0.0028419495, + "special": false, + "text": "ep" + }, + { + "id": 6509, + "logprob": -0.3244629, + "special": false, + "text": " learning" + }, + { + "id": 338, + "logprob": -0.25439453, + "special": false, + "text": " is" + }, + { + "id": 263, + "logprob": -0.4375, + "special": false, + "text": " a" + }, + { + "id": 4933, + "logprob": -1.8105469, + "special": false, + "text": " machine" + }, + { + "id": 6509, + "logprob": -0.07116699, + "special": false, + "text": " learning" + }, + { + "id": 11043, + "logprob": -0.87158203, + "special": false, + "text": " technique" + }, + { + "id": 393, + "logprob": -0.91015625, + "special": false, + "text": " that" + } + ] + }, + "generated_text": "\nDeep learning is a machine learning technique that" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -9.953125, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4121094, + "text": "is" + }, + { + "id": 6483, + "logprob": -9.9765625, + "text": "deep" + }, + { + "id": 6509, + "logprob": -1.6767578, + "text": "learning" + }, + { + "id": 1577, + "logprob": -4.5976562, + "text": "?" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.21813965, + "special": false, + "text": "\n" + }, + { + "id": 2772, + "logprob": -1.4189453, + "special": false, + "text": "De" + }, + { + "id": 1022, + "logprob": -0.0028419495, + "special": false, + "text": "ep" + }, + { + "id": 6509, + "logprob": -0.3244629, + "special": false, + "text": " learning" + }, + { + "id": 338, + "logprob": -0.25439453, + "special": false, + "text": " is" + }, + { + "id": 263, + "logprob": -0.4375, + "special": false, + "text": " a" + }, + { + "id": 4933, + "logprob": -1.8105469, + "special": false, + "text": " machine" + }, + { + "id": 6509, + "logprob": -0.07116699, + "special": false, + "text": " learning" + }, + { + "id": 11043, + "logprob": -0.87158203, + "special": false, + "text": " technique" + }, + { + "id": 393, + "logprob": -0.91015625, + "special": false, + "text": " that" + } + ] + }, + "generated_text": "\nDeep learning is a machine learning technique that" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 1724, + "logprob": -9.953125, + "text": "What" + }, + { + "id": 338, + "logprob": -1.4121094, + "text": "is" + }, + { + "id": 6483, + "logprob": -9.9765625, + "text": "deep" + }, + { + "id": 6509, + "logprob": -1.6767578, + "text": "learning" + }, + { + "id": 1577, + "logprob": -4.5976562, + "text": "?" 
+ } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -0.21813965, + "special": false, + "text": "\n" + }, + { + "id": 2772, + "logprob": -1.4189453, + "special": false, + "text": "De" + }, + { + "id": 1022, + "logprob": -0.0028419495, + "special": false, + "text": "ep" + }, + { + "id": 6509, + "logprob": -0.3244629, + "special": false, + "text": " learning" + }, + { + "id": 338, + "logprob": -0.25439453, + "special": false, + "text": " is" + }, + { + "id": 263, + "logprob": -0.4375, + "special": false, + "text": " a" + }, + { + "id": 4933, + "logprob": -1.8105469, + "special": false, + "text": " machine" + }, + { + "id": 6509, + "logprob": -0.07116699, + "special": false, + "text": " learning" + }, + { + "id": 11043, + "logprob": -0.87158203, + "special": false, + "text": " technique" + }, + { + "id": 393, + "logprob": -0.91015625, + "special": false, + "text": " that" + } + ] + }, + "generated_text": "\nDeep learning is a machine learning technique that" + } +] diff --git a/integration-tests/models/test_flash_llama_gptq.py b/integration-tests/models/test_flash_llama_gptq.py new file mode 100644 index 00000000..577b94d9 --- /dev/null +++ b/integration-tests/models/test_flash_llama_gptq.py @@ -0,0 +1,58 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_llama_gptq_handle(launcher): + with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_llama_gptq(flash_llama_gptq_handle): + await flash_llama_gptq_handle.health(300) + return flash_llama_gptq_handle.client + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot): + response = await flash_llama_gptq.generate( + "What is deep learning ?", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot): + response = await flash_llama_gptq.generate( + "What is deep learning ?", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + stop_sequences=["test"], + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + decoder_input_details=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_gptq_load(flash_llama_gptq, generate_load, response_snapshot): + responses = await generate_load(flash_llama_gptq, "What is deep learning ?", max_new_tokens=10, n=4) + + assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot
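
For reference, a minimal sketch (not part of the diff) of the behavior changed by the conftest.py hunk above: the quantization method handed to the launcher fixture is now forwarded verbatim to --quantize instead of always falling back to "bitsandbytes", which is what lets the new test request quantize="gptq". The standalone helper name, the launcher binary name, and the __main__ usage below are illustrative assumptions, not code from the repository.

from typing import List, Optional


def build_launcher_args(
    model_id: str,
    num_shard: Optional[int] = None,
    quantize: Optional[str] = None,
    trust_remote_code: bool = False,
) -> List[str]:
    # Mirrors the argument-building logic touched in integration-tests/conftest.py.
    args = ["text-generation-launcher", "--model-id", model_id]
    if num_shard is not None:
        args.extend(["--num-shard", str(num_shard)])
    # The quantization method ("gptq", "bitsandbytes", ...) is passed through as-is.
    if quantize is not None:
        args.append("--quantize")
        args.append(quantize)
    if trust_remote_code:
        args.append("--trust-remote-code")
    return args


if __name__ == "__main__":
    # Roughly what the GPTQ fixture requests:
    #   launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq")
    print(build_launcher_args("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq"))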