2024-04-17 02:41:12 -06:00
|
|
|
import pytest
|
|
|
|
import requests
|
|
|
|
import json
|
|
|
|
from aiohttp import ClientSession
|
|
|
|
|
|
|
|
from text_generation.types import (
|
|
|
|
Completion,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def flash_llama_completion_handle(launcher):
    """Launch a TinyLlama TGI server once for the whole test module.

    Yields the launcher handle; the server is torn down when the
    `with` block exits at module scope teardown.
    """
    with launcher("TinyLlama/TinyLlama-1.1B-Chat-v1.0") as handle:
        yield handle
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
async def flash_llama_completion(flash_llama_completion_handle):
    """Return a client for the launched server once it reports healthy.

    Waits up to 300 seconds for the health check before handing the
    client to dependent tests.
    """
    handle = flash_llama_completion_handle
    await handle.health(300)
    return handle.client
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE: since `v1/completions` is a deprecated interface/endpoint we do not provide a convenience
# method for it. Instead, we use the `requests` library to make the HTTP request directly.
|
|
|
|
|
|
|
|
|
2024-06-25 08:53:20 -06:00
|
|
|
@pytest.mark.release
def test_flash_llama_completion_single_prompt(
    flash_llama_completion, response_snapshot
):
    """A single string prompt yields exactly one completion choice."""
    payload = {
        "model": "tgi",
        "prompt": "Say this is a test",
        "max_tokens": 5,
        "seed": 0,
    }
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json=payload,
        headers=flash_llama_completion.headers,
        stream=False,
    )
    body = response.json()
    assert len(body["choices"]) == 1

    assert body == response_snapshot
|
|
|
|
|
|
|
|
|
2024-06-25 08:53:20 -06:00
|
|
|
@pytest.mark.release
def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
    """A batch of four prompts comes back as four indexed choices."""
    payload = {
        "model": "tgi",
        "prompt": ["Say", "this", "is", "a"],
        "max_tokens": 10,
        "seed": 0,
    }
    response = requests.post(
        f"{flash_llama_completion.base_url}/v1/completions",
        json=payload,
        headers=flash_llama_completion.headers,
        stream=False,
    )
    body = response.json()
    assert len(body["choices"]) == 4

    # Every prompt in the batch must be represented exactly once.
    assert sorted(choice["index"] for choice in body["choices"]) == [0, 1, 2, 3]

    assert body == response_snapshot
|
|
|
|
|
|
|
|
|
2024-06-25 08:53:20 -06:00
|
|
|
@pytest.mark.release
async def test_flash_llama_completion_many_prompts_stream(
    flash_llama_completion, response_snapshot
):
    """Stream completions for a four-prompt batch and validate each SSE chunk.

    Posts to the deprecated `v1/completions` endpoint with `stream: True`,
    parses the server-sent-event payloads by hand, and checks every decoded
    chunk against the `Completion` schema before comparing the full stream
    to the snapshot.
    """
    request = {
        "model": "tgi",
        "prompt": [
            "What color is the sky?",
            "Is water wet?",
            "What is the capital of France?",
            "def mai",
        ],
        "max_tokens": 10,
        "seed": 0,
        "stream": True,
    }

    url = f"{flash_llama_completion.base_url}/v1/completions"

    # Collected, schema-validated chunks in arrival order.
    chunks = []
    async with ClientSession(headers=flash_llama_completion.headers) as session:
        async with session.post(url, json=request) as response:
            # iterate over the stream
            # NOTE(review): iter_any() yields whatever bytes are buffered; this
            # parse assumes each read contains only whole "\n\n"-terminated SSE
            # events, never an event split across reads — confirm this holds.
            async for chunk in response.content.iter_any():
                # remove "data:"
                chunk = chunk.decode().split("\n\n")
                # remove "data:" if present
                chunk = [c.replace("data:", "") for c in chunk]
                # remove empty strings
                chunk = [c for c in chunk if c]
                # remove completion marking chunk
                chunk = [c for c in chunk if c != " [DONE]"]
                # parse json
                chunk = [json.loads(c) for c in chunk]

                for c in chunk:
                    # Constructing Completion validates the payload schema.
                    chunks.append(Completion(**c))
                    assert "choices" in c
                    # NOTE(review): with four prompts the valid indices are
                    # 0..3, so the inclusive upper bound of 4 looks like an
                    # off-by-one — confirm whether 4 is ever emitted.
                    assert 0 <= c["choices"][0]["index"] <= 4

    assert response.status == 200
    assert chunks == response_snapshot
|