diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index b0049701..a665d9b0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -11,6 +11,11 @@ on: # - rocm # - intel required: true + release-tests: + description: "Run release integration tests" + required: true + default: false + type: boolean jobs: build-and-push: @@ -148,7 +153,7 @@ jobs: runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"] if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest' env: - PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main') && '--release' || '' }} + PYTEST_FLAGS: ${{ (startsWith(github.ref, 'refs/tags/') || github.ref == 'refs/heads/main' || inputs.release-tests == true) && '--release' || '' }} steps: - name: Checkout repository uses: actions/checkout@v4 diff --git a/.github/workflows/ci_build.yaml b/.github/workflows/ci_build.yaml index 754c4850..d62297e4 100644 --- a/.github/workflows/ci_build.yaml +++ b/.github/workflows/ci_build.yaml @@ -20,7 +20,14 @@ on: - "Dockerfile_amd" - "Dockerfile_intel" branches: - - 'main' + - "main" + workflow_dispatch: + inputs: + release-tests: + description: "Run release integration tests" + required: true + default: false + type: boolean jobs: build: @@ -33,4 +40,6 @@ jobs: uses: ./.github/workflows/build.yaml # calls the one above ^ with: hardware: ${{ matrix.hardware }} + # https://github.com/actions/runner/issues/2206 + release-tests: ${{ inputs.release-tests == true }} secrets: inherit diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json index 7797cc6c..0f99d259 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq.json @@ -5,85 +5,80 @@ 
"generated_tokens": 10, "prefill": [ { - "id": 1, + "id": 2323, "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -9.7890625, "text": "Test" }, { - "id": 2009, - "logprob": -9.625, - "text": "request" + "id": 1715, + "logprob": -11.34375, + "text": " request" } ], "seed": null, "tokens": [ { - "id": 13, - "logprob": -2.3359375, + "id": 198, + "logprob": -2.5742188, "special": false, "text": "\n" }, { - "id": 3057, - "logprob": -1.8779297, + "id": 262, + "logprob": -1.6230469, "special": false, - "text": "Test" + "text": " " }, { - "id": 2009, - "logprob": -1.2744141, + "id": 3270, + "logprob": -2.046875, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1425781, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.9238281, "special": false, "text": " request" }, { - "id": 13, - "logprob": -1.6933594, + "id": 13204, + "logprob": -0.076660156, "special": false, - "text": "\n" + "text": ".method" }, { - "id": 3057, - "logprob": -1.4648438, + "id": 624, + "logprob": -0.021987915, "special": false, - "text": "Test" + "text": " ==" }, { - "id": 2009, - "logprob": -0.15600586, + "id": 364, + "logprob": -0.39208984, "special": false, - "text": " request" + "text": " '" }, { - "id": 13, - "logprob": -0.8027344, + "id": 3019, + "logprob": -0.10821533, "special": false, - "text": "\n" - }, - { - "id": 3057, - "logprob": -0.23022461, - "special": false, - "text": "Test" - }, - { - "id": 2009, - "logprob": -0.0069885254, - "special": false, - "text": " request" - }, - { - "id": 13, - "logprob": -0.02218628, - "special": false, - "text": "\n" + "text": "POST" } ], "top_tokens": null }, - "generated_text": "\nTest request\nTest request\nTest request\n" + "generated_text": "\n \"\"\"\n if request.method == 'POST" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json 
b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json index fa2fd4a2..4152b5b3 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_all_params.json @@ -5,85 +5,80 @@ "generated_tokens": 10, "prefill": [ { - "id": 1, + "id": 2323, "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -9.84375, "text": "Test" }, { - "id": 2009, - "logprob": -9.6015625, - "text": "request" + "id": 1715, + "logprob": -11.34375, + "text": " request" } ], "seed": 0, "tokens": [ { - "id": 29899, - "logprob": -1.5625, + "id": 13, + "logprob": -2.2539062, "special": false, - "text": "-" + "text": "." }, { - "id": 1454, - "logprob": -0.20410156, + "id": 578, + "logprob": -0.15563965, "special": false, - "text": "for" + "text": " The" }, { - "id": 29899, + "id": 3622, + "logprob": -0.8203125, + "special": false, + "text": " server" + }, + { + "id": 706, "logprob": 0.0, "special": false, - "text": "-" + "text": " has" }, { - "id": 9342, + "id": 539, "logprob": 0.0, "special": false, - "text": "comment" + "text": " not" }, { - "id": 29901, + "id": 3686, "logprob": 0.0, "special": false, - "text": ":" + "text": " yet" }, { - "id": 396, - "logprob": -0.27685547, - "special": false, - "text": " #" - }, - { - "id": 29906, - "logprob": -0.4970703, - "special": false, - "text": "2" - }, - { - "id": 29900, - "logprob": -0.80615234, - "special": false, - "text": "0" - }, - { - "id": 29896, + "id": 3288, "logprob": 0.0, "special": false, - "text": "1" + "text": " sent" }, { - "id": 29955, - "logprob": -1.0751953, + "id": 904, + "logprob": 0.0, "special": false, - "text": "7" + "text": " any" + }, + { + "id": 828, + "logprob": 0.0, + "special": false, + "text": " data" + }, + { + "id": 382, + "logprob": -1.5517578, + "special": false, + "text": ".\n\n" } ], "top_tokens": null }, - "generated_text": 
"Test request-for-comment: #2017" + "generated_text": "Test request. The server has not yet sent any data.\n\n" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json index 594b7351..75e90303 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_gptq/test_flash_llama_gptq_load.json @@ -6,87 +6,82 @@ "generated_tokens": 10, "prefill": [ { - "id": 1, + "id": 2323, "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -9.828125, "text": "Test" }, { - "id": 2009, - "logprob": -9.609375, - "text": "request" + "id": 1715, + "logprob": -11.34375, + "text": " request" } ], "seed": null, "tokens": [ { - "id": 13, - "logprob": -2.3300781, + "id": 198, + "logprob": -2.5742188, "special": false, "text": "\n" }, { - "id": 3057, - "logprob": -1.8740234, + "id": 262, + "logprob": -1.6220703, "special": false, - "text": "Test" + "text": " " }, { - "id": 2009, - "logprob": -1.2646484, + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, "special": false, "text": " request" }, { - "id": 13, - "logprob": -1.7158203, + "id": 13204, + "logprob": -0.07672119, "special": false, - "text": "\n" + "text": ".method" }, { - "id": 3057, - "logprob": -1.4667969, + "id": 624, + "logprob": -0.021987915, "special": false, - "text": "Test" + "text": " ==" }, { - "id": 2009, - "logprob": -0.15344238, + "id": 364, + "logprob": -0.39208984, "special": false, - "text": " request" + "text": " '" }, { - "id": 13, - "logprob": -0.81591797, + "id": 3019, + "logprob": -0.10638428, "special": false, - 
"text": "\n" - }, - { - "id": 3057, - "logprob": -0.22973633, - "special": false, - "text": "Test" - }, - { - "id": 2009, - "logprob": -0.007045746, - "special": false, - "text": " request" - }, - { - "id": 13, - "logprob": -0.021957397, - "special": false, - "text": "\n" + "text": "POST" } ], "top_tokens": null }, - "generated_text": "\nTest request\nTest request\nTest request\n" + "generated_text": "\n \"\"\"\n if request.method == 'POST" }, { "details": { @@ -95,87 +90,82 @@ "generated_tokens": 10, "prefill": [ { - "id": 1, + "id": 2323, "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -9.84375, "text": "Test" }, { - "id": 2009, - "logprob": -9.59375, - "text": "request" + "id": 1715, + "logprob": -11.34375, + "text": " request" } ], "seed": null, "tokens": [ { - "id": 13, - "logprob": -2.3378906, + "id": 198, + "logprob": -2.5742188, "special": false, "text": "\n" }, { - "id": 3057, - "logprob": -1.8779297, + "id": 262, + "logprob": -1.6220703, "special": false, - "text": "Test" + "text": " " }, { - "id": 2009, - "logprob": -1.2636719, + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, "special": false, "text": " request" }, { - "id": 13, - "logprob": -1.6992188, + "id": 13204, + "logprob": -0.07672119, "special": false, - "text": "\n" + "text": ".method" }, { - "id": 3057, - "logprob": -1.4589844, + "id": 624, + "logprob": -0.021987915, "special": false, - "text": "Test" + "text": " ==" }, { - "id": 2009, - "logprob": -0.15344238, + "id": 364, + "logprob": -0.39208984, "special": false, - "text": " request" + "text": " '" }, { - "id": 13, - "logprob": -0.79052734, + "id": 3019, + "logprob": -0.10638428, "special": false, - "text": "\n" - }, - { - "id": 3057, - "logprob": -0.22937012, - "special": false, 
- "text": "Test" - }, - { - "id": 2009, - "logprob": -0.007041931, - "special": false, - "text": " request" - }, - { - "id": 13, - "logprob": -0.022140503, - "special": false, - "text": "\n" + "text": "POST" } ], "top_tokens": null }, - "generated_text": "\nTest request\nTest request\nTest request\n" + "generated_text": "\n \"\"\"\n if request.method == 'POST" }, { "details": { @@ -184,87 +174,82 @@ "generated_tokens": 10, "prefill": [ { - "id": 1, + "id": 2323, "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -9.84375, "text": "Test" }, { - "id": 2009, - "logprob": -9.609375, - "text": "request" + "id": 1715, + "logprob": -11.34375, + "text": " request" } ], "seed": null, "tokens": [ { - "id": 13, - "logprob": -2.3261719, + "id": 198, + "logprob": -2.5742188, "special": false, "text": "\n" }, { - "id": 3057, - "logprob": -1.8730469, + "id": 262, + "logprob": -1.6220703, "special": false, - "text": "Test" + "text": " " }, { - "id": 2009, - "logprob": -1.2587891, + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, "special": false, "text": " request" }, { - "id": 13, - "logprob": -1.6894531, + "id": 13204, + "logprob": -0.07672119, "special": false, - "text": "\n" + "text": ".method" }, { - "id": 3057, - "logprob": -1.46875, + "id": 624, + "logprob": -0.021987915, "special": false, - "text": "Test" + "text": " ==" }, { - "id": 2009, - "logprob": -0.1541748, + "id": 364, + "logprob": -0.39208984, "special": false, - "text": " request" + "text": " '" }, { - "id": 13, - "logprob": -0.80322266, + "id": 3019, + "logprob": -0.10638428, "special": false, - "text": "\n" - }, - { - "id": 3057, - "logprob": -0.22912598, - "special": false, - "text": "Test" - }, - { - "id": 2009, - "logprob": -0.0070495605, - "special": 
false, - "text": " request" - }, - { - "id": 13, - "logprob": -0.021606445, - "special": false, - "text": "\n" + "text": "POST" } ], "top_tokens": null }, - "generated_text": "\nTest request\nTest request\nTest request\n" + "generated_text": "\n \"\"\"\n if request.method == 'POST" }, { "details": { @@ -273,86 +258,81 @@ "generated_tokens": 10, "prefill": [ { - "id": 1, + "id": 2323, "logprob": null, - "text": "" - }, - { - "id": 4321, - "logprob": -9.84375, "text": "Test" }, { - "id": 2009, - "logprob": -9.6015625, - "text": "request" + "id": 1715, + "logprob": -11.34375, + "text": " request" } ], "seed": null, "tokens": [ { - "id": 13, - "logprob": -2.3320312, + "id": 198, + "logprob": -2.5742188, "special": false, "text": "\n" }, { - "id": 3057, - "logprob": -1.875, + "id": 262, + "logprob": -1.6220703, "special": false, - "text": "Test" + "text": " " }, { - "id": 2009, - "logprob": -1.2646484, + "id": 3270, + "logprob": -2.0410156, + "special": false, + "text": " \"\"\"\n" + }, + { + "id": 262, + "logprob": -0.015281677, + "special": false, + "text": " " + }, + { + "id": 422, + "logprob": -2.1445312, + "special": false, + "text": " if" + }, + { + "id": 1715, + "logprob": -0.92333984, "special": false, "text": " request" }, { - "id": 13, - "logprob": -1.6884766, + "id": 13204, + "logprob": -0.07672119, "special": false, - "text": "\n" + "text": ".method" }, { - "id": 3057, - "logprob": -1.4589844, + "id": 624, + "logprob": -0.021987915, "special": false, - "text": "Test" + "text": " ==" }, { - "id": 2009, - "logprob": -0.15185547, + "id": 364, + "logprob": -0.39208984, "special": false, - "text": " request" + "text": " '" }, { - "id": 13, - "logprob": -0.79833984, + "id": 3019, + "logprob": -0.10638428, "special": false, - "text": "\n" - }, - { - "id": 3057, - "logprob": -0.22827148, - "special": false, - "text": "Test" - }, - { - "id": 2009, - "logprob": -0.006996155, - "special": false, - "text": " request" - }, - { - "id": 13, - "logprob": -0.021560669, - 
"special": false, - "text": "\n" + "text": "POST" } ], "top_tokens": null }, - "generated_text": "\nTest request\nTest request\nTest request\n" + "generated_text": "\n \"\"\"\n if request.method == 'POST" } ] diff --git a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json new file mode 100644 index 00000000..69c1f47d --- /dev/null +++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized.json @@ -0,0 +1,89 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.8359375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -9.6171875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -2.3417969, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.8730469, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -1.2626953, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -1.7060547, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.4482422, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.15246582, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.796875, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -0.22766113, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.007045746, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.021759033, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": "\nTest request\nTest request\nTest request\n" +} diff --git 
a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json new file mode 100644 index 00000000..9b5ee9ee --- /dev/null +++ b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_all_params.json @@ -0,0 +1,89 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.7890625, + "text": "Test" + }, + { + "id": 2009, + "logprob": -9.625, + "text": "request" + } + ], + "seed": 0, + "tokens": [ + { + "id": 29899, + "logprob": -1.4980469, + "special": false, + "text": "-" + }, + { + "id": 1454, + "logprob": -0.19433594, + "special": false, + "text": "for" + }, + { + "id": 29899, + "logprob": 0.0, + "special": false, + "text": "-" + }, + { + "id": 9342, + "logprob": 0.0, + "special": false, + "text": "comment" + }, + { + "id": 29901, + "logprob": 0.0, + "special": false, + "text": ":" + }, + { + "id": 396, + "logprob": -0.27392578, + "special": false, + "text": " #" + }, + { + "id": 29906, + "logprob": -0.49389648, + "special": false, + "text": "2" + }, + { + "id": 29900, + "logprob": -0.81103516, + "special": false, + "text": "0" + }, + { + "id": 29896, + "logprob": 0.0, + "special": false, + "text": "1" + }, + { + "id": 29955, + "logprob": -1.0800781, + "special": false, + "text": "7" + } + ], + "top_tokens": null + }, + "generated_text": "Test request-for-comment: #2017" +} diff --git a/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json new file mode 100644 index 00000000..df975635 --- /dev/null +++ 
b/integration-tests/models/__snapshots__/test_server_gptq_quantized/test_server_gptq_quantized_load.json @@ -0,0 +1,358 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.8828125, + "text": "Test" + }, + { + "id": 2009, + "logprob": -9.5859375, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -2.3359375, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.8623047, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -1.2451172, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -1.6923828, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.4492188, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.15197754, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.8022461, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -0.22583008, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.007095337, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.021652222, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": "\nTest request\nTest request\nTest request\n" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.796875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -9.625, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -2.3476562, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.8789062, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -1.2734375, + "special": false, + "text": " request" 
+ }, + { + "id": 13, + "logprob": -1.703125, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.4677734, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.15454102, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.7973633, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -0.23278809, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.006980896, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.022033691, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": "\nTest request\nTest request\nTest request\n" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.9296875, + "text": "Test" + }, + { + "id": 2009, + "logprob": -9.5703125, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -2.3203125, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.8486328, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -1.2480469, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -1.7060547, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.4511719, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.1529541, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.81396484, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -0.22180176, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.007133484, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.021835327, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": "\nTest request\nTest request\nTest request\n" + }, + { + 
"details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.84375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -9.6171875, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 13, + "logprob": -2.3261719, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.8691406, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -1.2597656, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -1.7070312, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -1.4550781, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.1538086, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.79345703, + "special": false, + "text": "\n" + }, + { + "id": 3057, + "logprob": -0.22924805, + "special": false, + "text": "Test" + }, + { + "id": 2009, + "logprob": -0.0070266724, + "special": false, + "text": " request" + }, + { + "id": 13, + "logprob": -0.021942139, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": "\nTest request\nTest request\nTest request\n" + } +] diff --git a/integration-tests/models/test_flash_llama_gptq.py b/integration-tests/models/test_flash_llama_gptq.py index 135f4b05..94a48e49 100644 --- a/integration-tests/models/test_flash_llama_gptq.py +++ b/integration-tests/models/test_flash_llama_gptq.py @@ -3,7 +3,9 @@ import pytest @pytest.fixture(scope="module") def flash_llama_gptq_handle(launcher): - with launcher("huggingface/llama-7b-gptq", num_shard=2, quantize="gptq") as handle: + with launcher( + "astronomer/Llama-3-8B-Instruct-GPTQ-4-Bit", num_shard=2, quantize="gptq" + ) as handle: yield handle