fix post merge

2024-07-01 12:20:29 +00:00 · 2024-07-01 12:20:29 +00:00 · 00cc73b7b7
parent 9fd395fae4
commit 00cc73b7b7
3 changed files with 2 additions and 70 deletions
--- a/integration-tests/clean_cache_and_download.py
+++ b/integration-tests/clean_cache_and_download.py
@@ -13,7 +13,7 @@ REQUIRED_MODELS = {
    "openai-community/gpt2": "main",
    "turboderp/Llama-3-8B-Instruct-exl2": "2.5bpw",
    "huggingface/llama-7b-gptq": "main",
-    "neuralmagic/llama-2-7b-chat-marlin": "main",
+    "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit": "main",
    "huggingface/llama-7b": "main",
    "FasterDecoding/medusa-vicuna-7b-v1.3": "refs/pr/1",
    "mistralai/Mistral-7B-Instruct-v0.1": "main",
--- a/integration-tests/models/test_flash_llama_gptq_marlin.py
+++ b/integration-tests/models/test_flash_llama_gptq_marlin.py
@@ -1,68 +0,0 @@
 import pytest
@pytest.fixture(scope="module")
 def flash_llama_gptq_marlin_handle(launcher):
    with launcher(
        "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", num_shard=2, quantize="marlin"
    ) as handle:
        yield handle
@pytest.fixture(scope="module")
 async def flash_llama_gptq_marlin(flash_llama_gptq_marlin_handle):
    await flash_llama_gptq_marlin_handle.health()
    return flash_llama_gptq_marlin_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_llama_gptq_marlin(flash_llama_gptq_marlin, response_snapshot):
    response = await flash_llama_gptq_marlin.generate(
        "Test request", max_new_tokens=10, decoder_input_details=True
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_llama_gptq_marlin_all_params(
    flash_llama_gptq_marlin, response_snapshot
 ):
    response = await flash_llama_gptq_marlin.generate(
        "Test request",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_llama_gptq_marlin_load(
    flash_llama_gptq_marlin, generate_load, response_snapshot
 ):
    responses = await generate_load(
        flash_llama_gptq_marlin, "Test request", max_new_tokens=10, n=4
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])
    assert responses == response_snapshot
--- a/integration-tests/models/test_flash_llama_marlin.py
+++ b/integration-tests/models/test_flash_llama_marlin.py
@@ -6,7 +6,7 @@ from testing_utils import SYSTEM
@pytest.fixture(scope="module")
 def flash_llama_marlin_handle(launcher):
    with launcher(
-        "neuralmagic/llama-2-7b-chat-marlin", num_shard=2, quantize="marlin"
+        "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", num_shard=2, quantize="marlin"
    ) as handle:
        yield handle