From d8dc8f1b0c97da009dd975c3622e49d01e2081b3 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Thu, 9 Mar 2023 16:05:33 +0100 Subject: [PATCH] feat(python-client): add new parameters (#118) --- clients/python/README.md | 18 +++ clients/python/pyproject.toml | 2 +- clients/python/tests/conftest.py | 5 - clients/python/tests/test_client.py | 46 ++++--- clients/python/tests/test_inference_api.py | 8 +- clients/python/tests/test_types.py | 47 ++++++- clients/python/text_generation/__init__.py | 2 +- clients/python/text_generation/client.py | 52 ++++++++ clients/python/text_generation/types.py | 138 +++++++++++++++++++-- 9 files changed, 278 insertions(+), 40 deletions(-) diff --git a/clients/python/README.md b/clients/python/README.md index 79b88377..f509e65c 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -133,6 +133,22 @@ class FinishReason(Enum): StopSequence = "stop_sequence" +# Additional sequences when using the `best_of` parameter +class BestOfSequence: + # Generated text + generated_text: str + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + # Prompt tokens + prefill: List[PrefillToken] + # Generated tokens + tokens: List[Token] + + # `generate` details class Details: # Generation finish reason @@ -145,6 +161,8 @@ class Details: prefill: List[PrefillToken] # Generated tokens tokens: List[Token] + # Additional sequences when using the `best_of` parameter + best_of_sequences: Optional[List[BestOfSequence]] # `generate` return value diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 0b8fa8c7..51ecce82 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation" -version = "0.2.1" +version = "0.3.0" description = "Hugging Face Text Generation Python Client" license = "Apache-2.0" authors = ["Olivier Dehaene "] diff --git a/clients/python/tests/conftest.py b/clients/python/tests/conftest.py index 4298623e..48734f0d 100644 --- a/clients/python/tests/conftest.py +++ b/clients/python/tests/conftest.py @@ -4,11 +4,6 @@ from text_generation import __version__ from huggingface_hub.utils import build_hf_headers -@pytest.fixture -def bloom_model(): - return "bigscience/bloom" - - @pytest.fixture def flan_t5_xxl(): return "google/flan-t5-xxl" diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py index 2f96aa87..c998de41 100644 --- a/clients/python/tests/test_client.py +++ b/clients/python/tests/test_client.py @@ -5,24 +5,32 @@ from text_generation.errors import NotFoundError, ValidationError from text_generation.types import FinishReason, PrefillToken, Token -def test_generate(bloom_url, hf_headers): - client = Client(bloom_url, hf_headers) +def test_generate(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) response = client.generate("test", max_new_tokens=1) - assert response.generated_text == "."
+ assert response.generated_text == "" assert response.details.finish_reason == FinishReason.Length assert response.details.generated_tokens == 1 assert response.details.seed is None assert len(response.details.prefill) == 1 - assert response.details.prefill[0] == PrefillToken( - id=9234, text="test", logprob=None - ) + assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None) assert len(response.details.tokens) == 1 assert response.details.tokens[0] == Token( - id=17, text=".", logprob=-1.75, special=False + id=3, text=" ", logprob=-1.984375, special=False ) +def test_generate_best_of(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) + response = client.generate("test", max_new_tokens=1, best_of=2, do_sample=True) + + assert response.details.seed is not None + assert response.details.best_of_sequences is not None + assert len(response.details.best_of_sequences) == 1 + assert response.details.best_of_sequences[0].seed is not None + + def test_generate_not_found(fake_url, hf_headers): client = Client(fake_url, hf_headers) with pytest.raises(NotFoundError): @@ -35,8 +43,8 @@ def test_generate_validation_error(flan_t5_xxl_url, hf_headers): client.generate("test", max_new_tokens=10_000) -def test_generate_stream(bloom_url, hf_headers): - client = Client(bloom_url, hf_headers) +def test_generate_stream(flan_t5_xxl_url, hf_headers): + client = Client(flan_t5_xxl_url, hf_headers) responses = [ response for response in client.generate_stream("test", max_new_tokens=1) ] @@ -44,7 +52,7 @@ def test_generate_stream(bloom_url, hf_headers): assert len(responses) == 1 response = responses[0] - assert response.generated_text == "." + assert response.generated_text == "" assert response.details.finish_reason == FinishReason.Length assert response.details.generated_tokens == 1 assert response.details.seed is None @@ -63,21 +71,19 @@ def test_generate_stream_validation_error(flan_t5_xxl_url, hf_headers): @pytest.mark.asyncio -async def test_generate_async(bloom_url, hf_headers): - client = AsyncClient(bloom_url, hf_headers) +async def test_generate_async(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) response = await client.generate("test", max_new_tokens=1) - assert response.generated_text == "." + assert response.generated_text == "" assert response.details.finish_reason == FinishReason.Length assert response.details.generated_tokens == 1 assert response.details.seed is None assert len(response.details.prefill) == 1 - assert response.details.prefill[0] == PrefillToken( - id=9234, text="test", logprob=None - ) + assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None) assert len(response.details.tokens) == 1 assert response.details.tokens[0] == Token( - id=17, text=".", logprob=-1.75, special=False + id=3, text=" ", logprob=-1.984375, special=False ) @@ -96,8 +102,8 @@ async def test_generate_async_validation_error(flan_t5_xxl_url, hf_headers): @pytest.mark.asyncio -async def test_generate_stream_async(bloom_url, hf_headers): - client = AsyncClient(bloom_url, hf_headers) +async def test_generate_stream_async(flan_t5_xxl_url, hf_headers): + client = AsyncClient(flan_t5_xxl_url, hf_headers) responses = [ response async for response in client.generate_stream("test", max_new_tokens=1) ] @@ -105,7 +111,7 @@ async def test_generate_stream_async(flan_t5_xxl_url, hf_headers): assert len(responses) == 1 response = responses[0] - assert response.generated_text == "."
+ assert response.generated_text == "" assert response.details.finish_reason == FinishReason.Length assert response.details.generated_tokens == 1 assert response.details.seed is None diff --git a/clients/python/tests/test_inference_api.py b/clients/python/tests/test_inference_api.py index dc744940..79e503a3 100644 --- a/clients/python/tests/test_inference_api.py +++ b/clients/python/tests/test_inference_api.py @@ -14,8 +14,8 @@ def test_get_supported_models(): assert isinstance(get_supported_models(), list) -def test_client(bloom_model): - client = InferenceAPIClient(bloom_model) +def test_client(flan_t5_xxl): + client = InferenceAPIClient(flan_t5_xxl) assert isinstance(client, Client) @@ -24,8 +24,8 @@ def test_client_unsupported_model(unsupported_model): InferenceAPIClient(unsupported_model) -def test_async_client(bloom_model): - client = InferenceAPIAsyncClient(bloom_model) +def test_async_client(flan_t5_xxl): + client = InferenceAPIAsyncClient(flan_t5_xxl) assert isinstance(client, AsyncClient) diff --git a/clients/python/tests/test_types.py b/clients/python/tests/test_types.py index d319b570..4c9d4c89 100644 --- a/clients/python/tests/test_types.py +++ b/clients/python/tests/test_types.py @@ -1,10 +1,20 @@ import pytest -from text_generation.types import Parameters +from text_generation.types import Parameters, Request from text_generation.errors import ValidationError def test_parameters_validation(): + # Test best_of + Parameters(best_of=1) + with pytest.raises(ValidationError): + Parameters(best_of=0) + with pytest.raises(ValidationError): + Parameters(best_of=-1) + Parameters(best_of=2, do_sample=True) + with pytest.raises(ValidationError): + Parameters(best_of=2) + # Test repetition_penalty Parameters(repetition_penalty=1) with pytest.raises(ValidationError): @@ -32,8 +42,41 @@ def test_parameters_validation(): Parameters(top_k=-1) # Test top_p - Parameters(top_p=1) + Parameters(top_p=0.5) with pytest.raises(ValidationError): Parameters(top_p=0) with pytest.raises(ValidationError): Parameters(top_p=-1) + with pytest.raises(ValidationError): + Parameters(top_p=1) + + # Test truncate + Parameters(truncate=1) + with pytest.raises(ValidationError): + Parameters(truncate=0) + with pytest.raises(ValidationError): + Parameters(truncate=-1) + + # Test typical_p + Parameters(typical_p=0.5) + with pytest.raises(ValidationError): + Parameters(typical_p=0) + with pytest.raises(ValidationError): + Parameters(typical_p=-1) + with pytest.raises(ValidationError): + Parameters(typical_p=1) + + +def test_request_validation(): + Request(inputs="test") + + with pytest.raises(ValidationError): + Request(inputs="") + + Request(inputs="test", stream=True) + Request(inputs="test", parameters=Parameters(best_of=2, do_sample=True)) + + with pytest.raises(ValidationError): + Request( + inputs="test", parameters=Parameters(best_of=2, do_sample=True), stream=True + ) diff --git a/clients/python/text_generation/__init__.py b/clients/python/text_generation/__init__.py index db09cdfa..46109833 100644 --- a/clients/python/text_generation/__init__.py +++ b/clients/python/text_generation/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
-__version__ = "0.2.1" +__version__ = "0.3.0" from text_generation.client import Client, AsyncClient from text_generation.inference_api import InferenceAPIClient, InferenceAPIAsyncClient diff --git a/clients/python/text_generation/client.py b/clients/python/text_generation/client.py index 23655240..e05a002e 100644 --- a/clients/python/text_generation/client.py +++ b/clients/python/text_generation/client.py @@ -56,6 +56,7 @@ class Client: prompt: str, do_sample: bool = False, max_new_tokens: int = 20, + best_of: Optional[int] = None, repetition_penalty: Optional[float] = None, return_full_text: bool = False, seed: Optional[int] = None, @@ -63,6 +64,8 @@ class Client: temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, watermark: bool = False, ) -> Response: """ @@ -75,6 +78,8 @@ class Client: Activate logits sampling max_new_tokens (`int`): Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one if the highest token logprobs repetition_penalty (`float`): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. @@ -91,6 +96,11 @@ class Client: top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. + truncate (`int`): + Truncate inputs tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information watermark (`bool`): Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) @@ -99,6 +109,7 @@ class Client: """ # Validate parameters parameters = Parameters( + best_of=best_of, details=True, do_sample=do_sample, max_new_tokens=max_new_tokens, @@ -109,6 +120,8 @@ class Client: temperature=temperature, top_k=top_k, top_p=top_p, + truncate=truncate, + typical_p=typical_p, watermark=watermark, ) request = Request(inputs=prompt, stream=False, parameters=parameters) @@ -129,6 +142,7 @@ class Client: prompt: str, do_sample: bool = False, max_new_tokens: int = 20, + best_of: Optional[int] = None, repetition_penalty: Optional[float] = None, return_full_text: bool = False, seed: Optional[int] = None, @@ -136,6 +150,8 @@ class Client: temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, watermark: bool = False, ) -> Iterator[StreamResponse]: """ @@ -148,6 +164,8 @@ class Client: Activate logits sampling max_new_tokens (`int`): Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one if the highest token logprobs repetition_penalty (`float`): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. @@ -164,6 +182,11 @@ class Client: top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. 
+ truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information watermark (`bool`): Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) @@ -172,6 +195,7 @@ class Client: """ # Validate parameters parameters = Parameters( + best_of=best_of, details=True, do_sample=do_sample, max_new_tokens=max_new_tokens, @@ -182,6 +206,8 @@ class Client: temperature=temperature, top_k=top_k, top_p=top_p, + truncate=truncate, + typical_p=typical_p, watermark=watermark, ) request = Request(inputs=prompt, stream=True, parameters=parameters) @@ -261,6 +287,7 @@ class AsyncClient: prompt: str, do_sample: bool = False, max_new_tokens: int = 20, + best_of: Optional[int] = None, repetition_penalty: Optional[float] = None, return_full_text: bool = False, seed: Optional[int] = None, @@ -268,6 +295,8 @@ class AsyncClient: temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, watermark: bool = False, ) -> Response: """ @@ -280,6 +309,8 @@ class AsyncClient: Activate logits sampling max_new_tokens (`int`): Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one with the highest token logprobs repetition_penalty (`float`): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. @@ -296,6 +327,11 @@ class AsyncClient: top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. + truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information watermark (`bool`): Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) @@ -304,6 +340,7 @@ class AsyncClient: """ # Validate parameters parameters = Parameters( + best_of=best_of, details=True, do_sample=do_sample, max_new_tokens=max_new_tokens, @@ -314,6 +351,8 @@ class AsyncClient: temperature=temperature, top_k=top_k, top_p=top_p, + truncate=truncate, + typical_p=typical_p, watermark=watermark, ) request = Request(inputs=prompt, stream=False, parameters=parameters) @@ -331,6 +370,7 @@ class AsyncClient: prompt: str, do_sample: bool = False, max_new_tokens: int = 20, + best_of: Optional[int] = None, repetition_penalty: Optional[float] = None, return_full_text: bool = False, seed: Optional[int] = None, @@ -338,6 +378,8 @@ class AsyncClient: temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + truncate: Optional[int] = None, + typical_p: Optional[float] = None, watermark: bool = False, ) -> AsyncIterator[StreamResponse]: """ @@ -350,6 +392,8 @@ class AsyncClient: Activate logits sampling max_new_tokens (`int`): Maximum number of generated tokens + best_of (`int`): + Generate best_of sequences and return the one with the highest token logprobs repetition_penalty (`float`): The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
@@ -366,6 +410,11 @@ class AsyncClient: top_p (`float`): If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. + truncate (`int`): + Truncate input tokens to the given size + typical_p (`float`): + Typical Decoding mass + See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information watermark (`bool`): Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) @@ -374,6 +423,7 @@ class AsyncClient: """ # Validate parameters parameters = Parameters( + best_of=best_of, details=True, do_sample=do_sample, max_new_tokens=max_new_tokens, @@ -384,6 +434,8 @@ class AsyncClient: temperature=temperature, top_k=top_k, top_p=top_p, + truncate=truncate, + typical_p=typical_p, watermark=watermark, ) request = Request(inputs=prompt, stream=True, parameters=parameters) diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py index d276b60e..ea2070b8 100644 --- a/clients/python/text_generation/types.py +++ b/clients/python/text_generation/types.py @@ -6,27 +6,64 @@ from text_generation.errors import ValidationError class Parameters(BaseModel): + # Activate logits sampling do_sample: bool = False + # Maximum number of generated tokens max_new_tokens: int = 20 + # The parameter for repetition penalty. 1.0 means no penalty. + # See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. repetition_penalty: Optional[float] = None + # Whether to prepend the prompt to the generated text return_full_text: bool = False + # Stop generating tokens if a member of `stop_sequences` is generated stop: List[str] = [] + # Random sampling seed seed: Optional[int] + # The value used to modulate the logits distribution. temperature: Optional[float] + # The number of highest probability vocabulary tokens to keep for top-k-filtering. top_k: Optional[int] + # If set to < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or + # higher are kept for generation.
top_p: Optional[float] + # Truncate input tokens to the given size + truncate: Optional[int] + # Typical Decoding mass + # See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information + typical_p: Optional[float] + # Generate best_of sequences and return the one with the highest token logprobs + best_of: Optional[int] + # Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) watermark: bool = False + # Get generation details details: bool = False + @validator("best_of") + def valid_best_of(cls, field_value, values): + if field_value is not None: + if field_value <= 0: + raise ValidationError("`best_of` must be strictly positive") + sampling = ( + values["do_sample"] + | (values["temperature"] is not None) + | (values["top_k"] is not None) + | (values["top_p"] is not None) + | (values["typical_p"] is not None) + ) + if field_value > 1 and not sampling: + raise ValidationError("you must use sampling when `best_of` is > 1") + + return field_value + @validator("repetition_penalty") def valid_repetition_penalty(cls, v): - if v is not None and v is v <= 0: + if v is not None and v <= 0: raise ValidationError("`repetition_penalty` must be strictly positive") return v @validator("seed") def valid_seed(cls, v): - if v is not None and v is v < 0: + if v is not None and v < 0: raise ValidationError("`seed` must be positive") return v @@ -44,56 +81,143 @@ class Parameters(BaseModel): @validator("top_p") def valid_top_p(cls, v): - if v is not None and (v <= 0 or v > 1.0): - raise ValidationError("`top_p` must be > 0.0 and <= 1.0") + if v is not None and (v <= 0 or v >= 1.0): + raise ValidationError("`top_p` must be > 0.0 and < 1.0") + return v + + @validator("truncate") + def valid_truncate(cls, v): + if v is not None and v <= 0: + raise ValidationError("`truncate` must be strictly positive") + return v + + @validator("typical_p") + def valid_typical_p(cls, v): + if v is not None and (v <= 0 or v >= 1.0): + raise ValidationError("`typical_p` must be > 0.0 and < 1.0") return v class Request(BaseModel): + # Prompt inputs: str - parameters: Parameters + # Generation parameters + parameters: Optional[Parameters] + # Whether to stream output tokens stream: bool = False + @validator("inputs") + def valid_input(cls, v): + if not v: + raise ValidationError("`inputs` cannot be empty") + return v + @validator("stream") + def valid_best_of_stream(cls, field_value, values): + parameters = values["parameters"] + if ( + parameters is not None + and parameters.best_of is not None + and parameters.best_of > 1 + and field_value + ): + raise ValidationError( + "`best_of` != 1 is not supported when `stream` == True" + ) + return field_value + + +# Prompt tokens class PrefillToken(BaseModel): + # Token ID from the model tokenizer id: int + # Token text text: str + # Logprob + # Optional since the logprob of the first token cannot be computed logprob: Optional[float] +# Generated tokens class Token(BaseModel): + # Token ID from the model tokenizer id: int + # Token text text: str + # Logprob logprob: float + # Is the token a special token + # Can be used to ignore tokens when concatenating special: bool +# Generation finish reason class FinishReason(Enum): + # number of generated tokens == `max_new_tokens` Length = "length" + # the model generated its end of sequence token EndOfSequenceToken = "eos_token" + # the model generated a text included in `stop_sequences` StopSequence = "stop_sequence" -class Details(BaseModel): +# Additional
sequences when using the `best_of` parameter +class BestOfSequence(BaseModel): + # Generated text + generated_text: str + # Generation finish reason finish_reason: FinishReason + # Number of generated tokens generated_tokens: int + # Sampling seed if sampling was activated seed: Optional[int] + # Prompt tokens prefill: List[PrefillToken] + # Generated tokens tokens: List[Token] -class StreamDetails(BaseModel): +# `generate` details +class Details(BaseModel): + # Generation finish reason finish_reason: FinishReason + # Number of generated tokens generated_tokens: int + # Sampling seed if sampling was activated seed: Optional[int] + # Prompt tokens + prefill: List[PrefillToken] + # Generated tokens + tokens: List[Token] + # Additional sequences when using the `best_of` parameter + best_of_sequences: Optional[List[BestOfSequence]] +# `generate` return value class Response(BaseModel): + # Generated text generated_text: str + # Generation details details: Details +# `generate_stream` details +class StreamDetails(BaseModel): + # Generation finish reason + finish_reason: FinishReason + # Number of generated tokens + generated_tokens: int + # Sampling seed if sampling was activated + seed: Optional[int] + + +# `generate_stream` return value class StreamResponse(BaseModel): + # Generated token token: Token + # Complete generated text + # Only available when the generation is finished generated_text: Optional[str] + # Generation details + # Only available when the generation is finished details: Optional[StreamDetails]
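A minimal usage sketch of the new `best_of`, `truncate`, and `typical_p` parameters added by this patch. The model id, prompt, and parameter values below are illustrative; the snippet assumes a model available through the hosted Inference API, such as the `google/flan-t5-xxl` id used in the tests.

```python
from text_generation import InferenceAPIClient

client = InferenceAPIClient("google/flan-t5-xxl")

# `best_of=2` generates two sequences server-side and returns the one with the
# highest token logprobs; it requires sampling, hence `do_sample=True`.
response = client.generate(
    "Translate to French: Hello",
    max_new_tokens=20,
    do_sample=True,
    best_of=2,
    truncate=512,    # keep at most 512 input tokens
    typical_p=0.95,  # typical decoding mass, must be > 0.0 and < 1.0
)

print(response.generated_text)
# Runner-up generations are exposed through `details.best_of_sequences`
if response.details.best_of_sequences is not None:
    for sequence in response.details.best_of_sequences:
        print(sequence.finish_reason, sequence.generated_text)
```

Per the validators added in `types.py`, `best_of > 1` is rejected unless sampling is active and is not supported together with `stream=True`, so `generate_stream` should be called without `best_of` (or with `best_of=1`).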