chore: add pre-commit (#1569)

OlivierDehaene 2024-02-16 11:58:58 +01:00 committed by GitHub
parent 142cdabed3
commit 9946165ee0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
78 changed files with 346 additions and 234 deletions

View File

@@ -5,14 +5,14 @@ body:
id: system-info
attributes:
label: System Info
description: |
Please share your system info with us (`text-generation-launcher --env` if installed locally).
The full command line used that causes issues:
OS version:
Rust version (if self-compiling, `cargo version`):
Model being used (`curl 127.0.0.1:8080/info | jq`):
If local model please explicit the kind of model and/or equivalents.
Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
The current version being used:
@@ -52,11 +52,11 @@ body:
placeholder: |
Steps to reproduce the behavior:
1.
2.
3.
- type: textarea
id: expected-behavior

View File

@@ -19,7 +19,7 @@ body:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
- type: textarea
id: contribution

View File

@@ -6,15 +6,15 @@ on:
jobs:
update_docs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Install Launcher
id: install-launcher
run: cargo install --git https://github.com/${{ github.repository }} --branch ${{ github.head_ref }} text-generation-launcher
- name: Check launcher Docs are up-to-date
run: |
echo text-generation-launcher --help

View File

@@ -16,4 +16,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: text-generation-inference
additional_args: --not_python_module

View File

@@ -71,12 +71,11 @@ jobs:
pip install pytest
export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
pytest -s -vv server/tests
- - name: Run Rust fmt
-   run: |
-     cargo fmt --check
- - name: Run Rust clippy
-   run: |
-     cargo clippy
+ - name: Pre-commit checks
+   run: |
+     pip install pre-commit
+     pre-commit install
+     pre-commit run --all-files
- name: Run Rust tests
run: |
cargo test

View File

@@ -13,4 +13,4 @@ jobs:
package_name: text-generation-inference
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}

1 .gitignore vendored
View File

@@ -11,4 +11,3 @@ server/exllama_kernels/exllama_kernels/hip_func/
*_hip.cuh
server/exllama_kernels/exllama_kernels/hip_buffers.cuh
server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp

18 .pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,18 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
        exclude: docs/source/basic_tutorials/launcher.md
  - repo: https://github.com/psf/black
    rev: 24.2.0
    hooks:
      - id: black
  - repo: https://github.com/doublify/pre-commit-rust
    rev: v1.0
    hooks:
      - id: fmt
      - id: cargo-check
      - id: clippy
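
The new file above is the configuration that both the revised CI step and a local `pre-commit install` pick up. As a minimal sketch (not part of this commit), the same checks can be driven from a short Python helper; it assumes `pip` and `pre-commit` are available on `PATH` and simply mirrors the commands added to the tests workflow:

```python
# Illustrative only: run the same commands the updated CI step uses.
import subprocess

commands = [
    ["pip", "install", "pre-commit"],      # install the tool
    ["pre-commit", "install"],             # register the git pre-commit hook
    ["pre-commit", "run", "--all-files"],  # run every configured hook once
]
for cmd in commands:
    subprocess.run(cmd, check=True)  # stop on the first failing command
```

Running the hooks once over the whole tree is what produces the trailing-whitespace and end-of-file fixes seen throughout this diff.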

View File

@@ -1,5 +1,5 @@
<div align="center">
<a href="https://www.youtube.com/watch?v=jlMAX2Oaht0">
<img width=560 width=315 alt="Making TGI deployment optimal" src="https://huggingface.co/datasets/Narsil/tgi_assets/resolve/main/thumbnail.png">
</a>
@@ -228,7 +228,7 @@ text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
```shell
text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
```
4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.

View File

@@ -29,4 +29,3 @@ tui = {package = "ratatui", version = "0.23", default-features = false, features
tracing = "0.1.37"
tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
hf-hub = "0.3.1"

View File

@@ -6,12 +6,12 @@
</div>
A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
and powered by [tui](https://github.com/tui-rs-revival/ratatui).
## Install
```shell
make install-benchmark
```
@@ -27,4 +27,4 @@ Then run the benchmarking tool:
```shell
text-generation-benchmark --tokenizer-name bigscience/bloom-560m
```

View File

@@ -155,4 +155,4 @@ dmypy.json
cython_debug/
transformers
safetensors

View File

@@ -3,4 +3,4 @@ unit-tests:
install:
pip install pip --upgrade
pip install -e .

View File

@@ -141,7 +141,7 @@ class Parameters:
# Get decoder input token logprobs and ids
decoder_input_details: bool
# Return the N most likely tokens at each step
top_n_tokens: Optional[int]
# Decoder input tokens
class InputToken:
@@ -192,7 +192,7 @@ class BestOfSequence:
# Generated tokens
tokens: List[Token]
# Most likely tokens
top_tokens: Optional[List[List[Token]]]
# `generate` details
@@ -236,7 +236,7 @@ class StreamResponse:
# Generated token
token: Token
# Most likely tokens
top_tokens: Optional[List[Token]]
# Complete generated text
# Only available when the generation is finished
generated_text: Optional[str]
@@ -248,4 +248,4 @@ class StreamResponse:
class DeployedModel:
model_id: str
sha: str
```

View File

@@ -134,6 +134,7 @@ class Parameters(BaseModel):
raise ValidationError("`value` cannot be empty for `json` grammar")
return v
class Request(BaseModel):
# Prompt
inputs: str

View File

@@ -27,4 +27,4 @@
}
</script>
</body>
</html>

View File

@@ -1290,4 +1290,4 @@
"description": "Hugging Face Text Generation Inference API"
}
]
}

View File

@@ -23,7 +23,7 @@ You can simply install `huggingface-hub` package with pip.
pip install huggingface-hub
```
Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
```python
from huggingface_hub import InferenceClient
@@ -83,8 +83,8 @@ Gradio is a Python library that helps you build web applications for your machin
pip install huggingface-hub gradio
```
Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
```python
import gradio as gr
from huggingface_hub import InferenceClient
@@ -110,30 +110,30 @@ gr.ChatInterface(
).queue().launch()
```
The UI looks like this 👇
<div class="flex justify-center">
<img
class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi.png"
/>
<img
class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/gradio-tgi-dark.png"
/>
</div>
You can try the demo directly here 👇
<div class="block dark:hidden">
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=light"
width="850"
height="750"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
src="https://merve-gradio-tgi-2.hf.space?__theme=dark"
width="850"
height="750"
@@ -152,4 +152,4 @@ You can read more about how to customize a `ChatInterface` [here](https://www.gr
## API documentation
You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route. The Swagger UI is also available [here](https://huggingface.github.io/text-generation-inference).

View File

@@ -2,19 +2,19 @@
TGI supports various LLM architectures (see full list [here](../supported_models)). If you wish to serve a model that is not one of the supported models, TGI will fallback to the `transformers` implementation of that model. This means you will be unable to use some of the features introduced by TGI, such as tensor-parallel sharding or flash attention. However, you can still get many benefits of TGI, such as continuous batching or streaming outputs.
You can serve these models using the same Docker command-line invocation as with fully supported models 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```
If the model you wish to serve is a custom transformers model, and its weights and implementation are available in the Hub, you can still serve the model by passing the `--trust-remote-code` flag to the `docker run` command like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```
Finally, if the model is not on Hugging Face Hub but on your local, you can pass the path to the folder that contains your model like below 👇
```bash
# Make sure your model is in the $volume directory

View File

@@ -1,6 +1,6 @@
# Preparing the Model
Text Generation Inference improves the model in several aspects.
## Quantization
@@ -9,7 +9,7 @@ TGI supports [bits-and-bytes](https://github.com/TimDettmers/bitsandbytes#bitsan
## RoPE Scaling
RoPE scaling can be used to increase the sequence length of the model during the inference time without necessarily fine-tuning it. To enable RoPE scaling, simply pass `--rope-scaling`, `--max-input-length` and `--rope-factors` flags when running through CLI. `--rope-scaling` can take the values `linear` or `dynamic`. If your model is not fine-tuned to a longer sequence length, use `dynamic`. `--rope-factor` is the ratio between the intended max sequence length and the model's original max sequence length. Make sure to pass `--max-input-length` to provide maximum input length for extension.
<Tip>
@@ -19,4 +19,4 @@ We recommend using `dynamic` RoPE scaling.
## Safetensors
[Safetensors](https://github.com/huggingface/safetensors) is a fast and safe persistence format for deep learning models, and is required for tensor parallelism. TGI supports `safetensors` model loading under the hood. By default, given a repository with `safetensors` and `pytorch` weights, TGI will always load `safetensors`. If there's no `pytorch` weights, TGI will convert the weights to `safetensors` format.

View File

@@ -2,29 +2,29 @@
You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters. To install the CLI, please refer to [the installation section](../installation#install-cli).
`text-generation-server` lets you download the model with `download-weights` command like below 👇
```bash
text-generation-server download-weights MODEL_HUB_ID
```
You can also use it to quantize models like below 👇
```bash
text-generation-server quantize MODEL_HUB_ID OUTPUT_DIR
```
You can use `text-generation-launcher` to serve models.
```bash
text-generation-launcher --model-id MODEL_HUB_ID --port 8080
```
There are many options and parameters you can pass to `text-generation-launcher`. The documentation for CLI is kept minimal and intended to rely on self-generating documentation, which can be found by running
```bash
text-generation-launcher --help
```
You can also find it hosted in this [Swagger UI](https://huggingface.github.io/text-generation-inference/).

View File

@@ -1,12 +1,11 @@
# Flash Attention
Scaling the transformer architecture is heavily bottlenecked by the self-attention mechanism, which has quadratic time and memory complexity. Recent developments in accelerator hardware mainly focus on enhancing compute capacities and not memory and transferring data between hardware. This results in attention operation having a memory bottleneck. **Flash Attention** is an attention algorithm used to reduce this problem and scale transformer-based models more efficiently, enabling faster training and inference.
Standard attention mechanism uses High Bandwidth Memory (HBM) to store, read and write keys, queries and values. HBM is large in memory, but slow in processing, meanwhile SRAM is smaller in memory, but faster in operations. In the standard attention implementation, the cost of loading and writing keys, queries, and values from HBM is high. It loads keys, queries, and values from HBM to GPU on-chip SRAM, performs a single step of the attention mechanism, writes it back to HBM, and repeats this for every single attention step. Instead, Flash Attention loads keys, queries, and values once, fuses the operations of the attention mechanism, and writes them back.
![Flash Attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png)
It is implemented for supported models. You can check out the complete list of models that support Flash Attention [here](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models), for models with flash prefix.
You can learn more about Flash Attention by reading the paper in this [link](https://arxiv.org/abs/2205.14135).

View File

@@ -4,20 +4,20 @@ TGI offers GPTQ and bits-and-bytes quantization to quantize large language model
## Quantization with GPTQ
GPTQ is a post-training quantization method to make the model smaller. It quantizes the layers by finding a compressed version of that weight, that will yield a minimum mean squared error like below 👇
Given a layer \\(l\\) with weight matrix \\(W_{l}\\) and layer input \\(X_{l}\\), find quantized weight \\(\\hat{W}_{l}\\):
$$({\hat{W}_{l}}^{*} = argmin_{\hat{W_{l}}} ||W_{l}X-\hat{W}_{l}X||^{2}_{2})$$
TGI allows you to both run an already GPTQ quantized model (see available models [here](https://huggingface.co/models?search=gptq)) or quantize a model of your choice using quantization script. You can run a quantized model by simply passing --quantize like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize gptq
```
Note that TGI's GPTQ implementation doesn't use [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) under the hood. However, models quantized using AutoGPTQ or Optimum can still be served by TGI.
To quantize a given model using GPTQ with a calibration dataset, simply run
@@ -41,7 +41,7 @@ You can learn more about GPTQ from the [paper](https://arxiv.org/pdf/2210.17323.
bitsandbytes is a library used to apply 8-bit and 4-bit quantization to models. Unlike GPTQ quantization, bitsandbytes doesn't require a calibration dataset or any post-processing weights are automatically quantized on load. However, inference with bitsandbytes is slower than GPTQ or FP16 precision.
8-bit quantization enables multi-billion parameter scale models to fit in smaller hardware without degrading performance too much.
In TGI, you can use 8-bit quantization by adding `--quantize bitsandbytes` like below 👇
```bash
@@ -50,7 +50,7 @@ docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingf
4-bit quantization is also possible with bitsandbytes. You can choose one of the following 4-bit data types: 4-bit float (`fp4`), or 4-bit `NormalFloat` (`nf4`). These data types were introduced in the context of parameter-efficient fine-tuning, but you can apply them for inference by automatically converting the model weights on load.
In TGI, you can use 4-bit quantization by adding `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` like below 👇
```bash
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --quantize bitsandbytes-nf4

View File

@@ -1,7 +1,7 @@
# Safetensors
Safetensors is a model serialization format for deep learning models. It is [faster](https://huggingface.co/docs/safetensors/speed) and safer compared to other serialization formats like pickle (which is used under the hood in many deep learning libraries).
TGI depends on safetensors format mainly to enable [tensor parallelism sharding](./tensor_parallelism). For a given model repository during serving, TGI looks for safetensors weights. If there are no safetensors weights, TGI converts the PyTorch weights to safetensors format.
You can learn more about safetensors by reading the [safetensors documentation](https://huggingface.co/docs/safetensors/index).

View File

@@ -5,12 +5,12 @@
Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.
<div class="flex justify-center">
<img
class="block dark:hidden"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual_360.gif"
/>
<img
class="hidden dark:block"
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual-dark_360.gif"
/>
</div>
@@ -25,14 +25,14 @@ With token streaming, the server can start returning the tokens one by one befor
For example, a system can generate 100 tokens per second. If the system generates 1000 tokens, with the non-streaming setup, users need to wait 10 seconds to get results. On the other hand, with the streaming setup, users get initial results immediately, and although end-to-end latency will be the same, they can see half of the generation after five seconds. Below you can see an interactive demo that shows non-streaming vs streaming side-by-side. Click **generate** below.
<div class="block dark:hidden">
<iframe
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light"
width="850"
height="350"
></iframe>
</div>
<div class="hidden dark:block">
<iframe
src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark"
width="850"
height="350"
@@ -43,7 +43,7 @@ For example, a system can generate 100 tokens per second. If the system generate
### Streaming with Python
To stream tokens with `InferenceClient`, simply pass `stream=True` and iterate over the response.
```python
from huggingface_hub import InferenceClient
@@ -116,7 +116,7 @@ curl -N 127.0.0.1:8080/generate_stream \
First, we need to install the `@huggingface/inference` library.
`npm install @huggingface/inference`
If you're using the free Inference API, you can use `HfInference`. If you're using inference endpoints, you can use `HfInferenceEndpoint`. Let's
We can create a `HfInferenceEndpoint` providing our endpoint URL and credential.
@@ -129,7 +129,7 @@ const hf = new HfInferenceEndpoint('https://YOUR_ENDPOINT.endpoints.huggingface.
const prompt = 'What can you do in Nuremberg, Germany? Give me 3 Tips'
const stream = hf.textGenerationStream({ inputs: prompt })
for await (const r of stream) {
// yield the generated token
process.stdout.write(r.token.text)
}

View File

@@ -1,6 +1,6 @@
# Tensor Parallelism
Tensor parallelism is a technique used to fit a large model in multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇
![Image courtesy of Anton Lozkhov](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png)

View File

@@ -4,7 +4,7 @@ This section explains how to install the CLI tool as well as installing TGI from
## Install CLI
You can use TGI command-line interface (CLI) to download weights, serve and quantize models, or get information on serving parameters.
To install the CLI, you need to first clone the TGI repository and then run `make`.
@@ -23,7 +23,7 @@ BUILD_EXTENSIONS=True make install
Before you start, you will need to setup your environment, and install Text Generation Inference. Text Generation Inference is tested on **Python 3.9+**.
Text Generation Inference is available on pypi, conda and GitHub.
To install and launch locally, first [install Rust](https://rustup.rs/) and create a Python virtual environment with at least
Python 3.9, e.g. using conda:

View File

@@ -92,7 +92,7 @@ print(chat_completion)
## Hugging Face Inference Endpoints
The Messages API is integrated with [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
Every endpoint that uses "Text Generation Inference" with an LLM, which has a chat template can now be used. Below is an example of how to use IE with TGI using OpenAI's Python client library:
> **Note:** Make sure to replace `base_url` with your endpoint URL and to include `v1/` at the end of the URL. The `api_key` should be replaced with your Hugging Face API key.

View File

@@ -53,7 +53,7 @@ print(response.json())
```js
async function query() {
const response = await fetch(
'http://127.0.0.1:8080/generate',
{
method: 'POST',
headers: { 'Content-Type': 'application/json'},

View File

@@ -54,7 +54,9 @@ async def test_mamba_all_params(fused_kernel_mamba, response_snapshot):
@pytest.mark.asyncio
@pytest.mark.private
- async def test_mamba_load(fused_kernel_mamba, generate_load, generous_response_snapshot):
+ async def test_mamba_load(
+     fused_kernel_mamba, generate_load, generous_response_snapshot
+ ):
responses = await generate_load(
fused_kernel_mamba, "What is Deep Learning?", max_new_tokens=10, n=4
)

View File

@@ -2,4 +2,4 @@
addopts = --snapshot-warn-unused
asyncio_mode = auto
markers =
private: marks tests as requiring an admin hf token (deselect with '-m "not private"')

View File

@@ -57,7 +57,7 @@ export function run(host, generate_payload, max_new_tokens) {
const duration = res.timings.duration;
if (res.status === 200) {
const body = res.json();
const n_tokens = body.details.tokens.length;
const latency_ms_per_token = duration / n_tokens;
timePerToken.add(latency_ms_per_token);

View File

@@ -60,4 +60,4 @@ export default function () {
inferenceTime.add(res.headers["X-Inference-Time"]);
timePerToken.add(res.headers["X-Time-Per-Token"]);
}
}

View File

@@ -1,5 +1,5 @@
import { get_options, run } from "./common.js";
const reference_latency_ms = 70;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;

View File

@@ -1,5 +1,5 @@
import { get_options, run } from "./common.js";
const reference_latency_ms = 22;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;

View File

@@ -28,7 +28,7 @@ this is controlled by the client, and therefore the amount of batching is decide
beforehand.
For text-generation, and LLMs which are memory bound we can try to be much more
efficient with the available compute, by having client sending us single queries,
and let the router mix&match queries into or out of batches to make the use the
compute the most efficiently. This is possible because for LLMs the total compute
for running the model is much bigger than doing mix&match of the batches themselves.
@@ -89,5 +89,5 @@ most critical perceived quality of an LLM API.
With token streaming, the server can start answering after the first `prefill` pass
directly, without waiting for all the generation to be done. For extremely long queries
this means clients can start to see something happening orders of magnitude before
the work is done. Seeing something in progress allows them to cut short if it's not
what's wanted but also it "feels" better.

View File

@@ -1 +1 @@
*.rs

View File

@@ -27,6 +27,7 @@ pub struct Validation {
}
impl Validation {
+ #[allow(clippy::too_many_arguments)]
pub(crate) fn new(
workers: usize,
tokenizer: Option<Tokenizer>,

View File

@@ -3,4 +3,4 @@
# Branched from master on: 10 November, 2023
# https://releases.rs/docs/1.75.0/
channel = "1.75.0"
components = ["rustfmt", "clippy"]

View File

@@ -2,7 +2,7 @@
# to make cuda graphs work.
awq_commit := bd1dc2d5254345cc76ab71894651fb821275bdd4
awq:
rm -rf llm-awq
git clone https://github.com/huggingface/llm-awq

View File

@@ -13,4 +13,4 @@ build-flash-attention: flash-attention
install-flash-attention: build-flash-attention
pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install

View File

@@ -13,7 +13,7 @@ install-causal-conv1d: build-causal-conv1d
cd causal-conv1d/ && pip install .
# selective-scan dependends on causal-conv1d
selective-scan:
rm -rf mamba
git clone https://github.com/state-spaces/mamba.git mamba
@@ -21,8 +21,8 @@ build-selective-scan: selective-scan
cd mamba/ && git fetch && git checkout $(selective_scan_commit)
cd mamba && python setup.py build
install-selective-scan: install-causal-conv1d build-selective-scan
pip uninstall selective-scan-cuda -y || true
cd mamba && pip install .
build-all: build-causal-conv1d build-selective-scan

View File

@@ -12,4 +12,4 @@ make install
```shell
make run-dev
```

View File

@@ -247,4 +247,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
&forward,
"Bloom attention mechanism forward (CUDA)"
);
}

View File

@@ -16,4 +16,4 @@ void column_remap_cuda
const uint32_t* x_map
);
#endif

View File

@@ -50,4 +50,4 @@ private:
void g_q4_keep_matrix(Q4Matrix* m);
void g_q4_free_matrices();
#endif

View File

@@ -48,4 +48,4 @@ __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t
#define rocblas_set_stream hipblasSetStream
#define rocblas_hgemm __compat_hipblasHgemm
#endif

View File

@@ -118,4 +118,4 @@ public:
}
};
#endif

View File

@@ -33,4 +33,4 @@ void clear_tensor_cuda
int size_n
);
#endif

View File

@@ -100,4 +100,4 @@ __forceinline__ __device__ void dequant_2bit_16
#endif
#endif

View File

@@ -224,4 +224,4 @@ __forceinline__ __device__ void dequant_4bit_8_gptq
#endif
#endif

View File

@@ -204,4 +204,4 @@ __forceinline__ __device__ void dequant_5bit_32
#endif
#endif

View File

@@ -40,5 +40,3 @@ __forceinline__ __device__ void dequant_6bit_16
#endif
#endif

View File

@@ -35,4 +35,4 @@ __forceinline__ __device__ void dequant_8bit_8
#endif
#endif

View File

@@ -51,4 +51,4 @@ inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=
void print_global_mem(const half* ptr, int rows, int columns, int stride);
#endif

View File

@@ -251,9 +251,9 @@ class LlamaMLP(nn.Module):
if "gelu" not in act
else lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
)
# Fuse gate and up proj

View File

@@ -255,9 +255,9 @@ class MistralMLP(nn.Module):
if "gelu" not in act
else lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
)
# Fuse gate and up proj

View File

@@ -344,9 +344,9 @@ class BlockSparseMoE(nn.Module):
if "gelu" in act:
self.act = lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
elif "silu" in act:
self.act = torch.nn.functional.silu
@@ -600,9 +600,9 @@ class DenseMoE(nn.Module):
if "gelu" in act:
self.act = lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
elif "silu" in act:
self.act = torch.nn.functional.silu

View File

@@ -187,9 +187,9 @@ class FlashMLP(nn.Module):
if "gelu" not in act
else lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
)

View File

@@ -225,9 +225,9 @@ class PhiMLP(nn.Module):
if "gelu" not in act
else lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
)

View File

@@ -69,7 +69,12 @@ def _load_multi_mqa_gptq(
qzeros = torch.cat([q_tensor, kv_tensor], dim=1)
qzeros = qzeros.to(device=weights.device)
- bits, groupsize, _, quant_method, = weights._get_gptq_params()
+ (
+     bits,
+     groupsize,
+     _,
+     quant_method,
+ ) = weights._get_gptq_params()
if quant_method == "gptq":
g_idx = weights.get_tensor(f"{prefix}.c_attn.g_idx")
g_idx = g_idx.to(device=weights.device)
@@ -306,9 +311,9 @@ class MLP(nn.Module):
if "gelu" not in act
else lambda x: torch.nn.functional.gelu(
x,
- approximate="tanh"
- if act in ["gelu_fast", "gelu_pytorch_tanh"]
- else "none",
+ approximate=(
+     "tanh" if act in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
+ ),
)
)
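
The repeated `approximate=` hunks above, and the matching ones in the other modeling files, come from Black 24 reformatting: the conditional expression is now wrapped in parentheses rather than split across bare continuation lines. A minimal standalone sketch, not taken from the repository (the `ACT` constant is illustrative), showing that the two spellings are equivalent at runtime:

```python
import torch

ACT = "gelu_fast"  # illustrative activation name

# Old formatting: the conditional expression spans several continuation lines.
act_old = lambda x: torch.nn.functional.gelu(
    x,
    approximate="tanh"
    if ACT in ["gelu_fast", "gelu_pytorch_tanh"]
    else "none",
)

# New (Black 24) formatting: the same expression wrapped in parentheses.
act_new = lambda x: torch.nn.functional.gelu(
    x,
    approximate=(
        "tanh" if ACT in ["gelu_fast", "gelu_pytorch_tanh"] else "none"
    ),
)

x = torch.randn(4)
assert torch.allclose(act_old(x), act_new(x))  # purely cosmetic change
```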

View File

@@ -66,6 +66,7 @@ class IdeficsVisionConfig(PretrainedConfig):
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
"""
model_type = "idefics"
attribute_map = {
"hidden_size": "embed_dim",
@@ -125,6 +126,7 @@ class IdeficsPerceiverConfig(PretrainedConfig):
qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
Whether or not to use qk layer norms in perceiver
"""
model_type = "idefics"
def __init__(
@@ -219,6 +221,7 @@ class IdeficsConfig(PretrainedConfig):
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "idefics"
is_composition = True

View File

@@ -123,10 +123,10 @@ def expand_inputs_for_generation(
raise ValueError(
"If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
)
- encoder_outputs[
-     "last_hidden_state"
- ] = encoder_outputs.last_hidden_state.index_select(
-     0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
- )
+ encoder_outputs["last_hidden_state"] = (
+     encoder_outputs.last_hidden_state.index_select(
+         0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device)
+     )
+ )
model_kwargs["encoder_outputs"] = encoder_outputs
return input_ids, model_kwargs

View File

@@ -133,6 +133,7 @@ class IdeficsProcessor(ProcessorMixin):
An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "IdeficsImageProcessor"
tokenizer_class = "LlamaTokenizerFast"

View File

@@ -19,10 +19,12 @@ from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
import math
from dataclasses import dataclass
@dataclass
class InferenceParams:
"""Inference parameters that are passed to the main model in order
to efficienly calculate and store the context during inference."""
max_seqlen: int
max_batch_size: int
conv_states: torch.Tensor
@@ -137,13 +139,28 @@ class MambaBlock(nn.Module):
def step(self, hidden_states, conv_state, ssm_state):
xz = self.in_proj(hidden_states.squeeze(1))
x, z = xz.chunk(2, dim=-1)  # (B D)
- x = causal_conv1d_update(x, conv_state, self.conv1d.weight.squeeze(1), self.conv1d.bias, self.activation)
+ x = causal_conv1d_update(
+     x,
+     conv_state,
+     self.conv1d.weight.squeeze(1),
+     self.conv1d.bias,
+     self.activation,
+ )
x_db = self.x_proj(x)  # (B dt_rank+2*d_state)
dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
dt = F.linear(dt, self.dt_proj.weight)
A = self.negA
y = selective_state_update(
-     ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
+     ssm_state,
+     x,
+     dt,
+     A,
+     B,
+     C,
+     self.D,
+     z=z,
+     dt_bias=self.dt_proj.bias,
+     dt_softplus=True,
)
out = self.out_proj(y)
return out.unsqueeze(1), conv_state.clone(), ssm_state.clone()

View File

@@ -2,6 +2,7 @@
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
"""
import math
import os
import warnings

View File

@@ -35,7 +35,6 @@ from text_generation_server.utils.dist import MEMORY_FRACTION
tracer = trace.get_tracer(__name__)
@dataclass
class FlashCausalLMBatch(Batch):
batch_id: int
@@ -1213,8 +1212,9 @@ class FlashCausalLM(Model):
# accept each new token for this specific request since we may
# have more than one new token per request with speculative decoding
for next_token_id in _next_token_ids:
- batch.next_token_chooser = batch.next_token_chooser.advance_grammar_single(i, next_token_id)
+ batch.next_token_chooser = (
+     batch.next_token_chooser.advance_grammar_single(i, next_token_id)
+ )
# Update values
batch.input_lengths[i] = input_length + n_accepted_ids

View File

@@ -92,7 +92,9 @@ class GalacticaCausalLMBatch(CausalLMBatch):
requests_idx_mapping[r.id] = i
# Add escape_custom_split_sequence to the CausalLMBatch logic
inputs.append(escape_custom_split_sequence(r.inputs))
- next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device, tokenizer))
+ next_token_choosers.append(
+     NextTokenChooser.from_pb(r.parameters, device, tokenizer)
+ )
stopping_criteria = StoppingCriteria.from_pb(
r.stopping_parameters, tokenizer
)

View File

@ -114,7 +114,9 @@ class IdeficsCausalLMBatch(Batch):
for i, r in enumerate(pb.requests): for i, r in enumerate(pb.requests):
requests_idx_mapping[r.id] = i requests_idx_mapping[r.id] = i
inputs.append(r.inputs) inputs.append(r.inputs)
next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device, tokenizer)) next_token_choosers.append(
NextTokenChooser.from_pb(r.parameters, device, tokenizer)
)
stopping_criteria = StoppingCriteria.from_pb( stopping_criteria = StoppingCriteria.from_pb(
r.stopping_parameters, tokenizer r.stopping_parameters, tokenizer
) )
@ -401,9 +403,9 @@ class IdeficsCausalLMBatch(Batch):
pixel_values = batch.pixel_values.new_zeros( pixel_values = batch.pixel_values.new_zeros(
(total_batch_size, max_num_images, 3, 224, 224) (total_batch_size, max_num_images, 3, 224, 224)
) )
pixel_values[ pixel_values[start_index:end_index, :curr_batch_max_num_images] = (
start_index:end_index, :curr_batch_max_num_images batch.pixel_values
] = batch.pixel_values )
if image_attention_mask is None: if image_attention_mask is None:
image_attention_mask = batch.image_attention_mask.new_zeros( image_attention_mask = batch.image_attention_mask.new_zeros(
@ -500,14 +502,14 @@ class IdeficsCausalLMBatch(Batch):
# We slice the keys to remove the padding from previous batches # We slice the keys to remove the padding from previous batches
past_seq_len = batch.max_input_length - 1 past_seq_len = batch.max_input_length - 1
if batch.keys_head_dim_last: if batch.keys_head_dim_last:
padded_past_keys[ padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = (
start_index:end_index, :, -past_seq_len:, : past_keys[:, :, -past_seq_len:, :]
] = past_keys[:, :, -past_seq_len:, :] )
else: else:
# BLOOM case # BLOOM case
padded_past_keys[ padded_past_keys[start_index:end_index, :, :, -past_seq_len:] = (
start_index:end_index, :, :, -past_seq_len: past_keys[:, :, :, -past_seq_len:]
] = past_keys[:, :, :, -past_seq_len:] )
del past_keys del past_keys
start_index = end_index start_index = end_index
@ -525,9 +527,9 @@ class IdeficsCausalLMBatch(Batch):
end_index = start_index + len(batch) end_index = start_index + len(batch)
# We slice the past values to remove the padding from previous batches # We slice the past values to remove the padding from previous batches
past_seq_len = batch.max_input_length - 1 past_seq_len = batch.max_input_length - 1
padded_past_values[ padded_past_values[start_index:end_index, :, -past_seq_len:, :] = (
start_index:end_index, :, -past_seq_len:, : past_values[:, :, -past_seq_len:, :]
] = past_values[:, :, -past_seq_len:, :] )
del past_values del past_values
# Update values # Update values
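Both padding hunks above follow the same pattern: allocate a zeroed tensor sized for the merged batch, then copy each sub-batch's past keys/values right-aligned into its slice so the padding ends up on the left, matching the attention-mask layout. A minimal sketch with toy shapes (batch, heads, past_len, head_dim):

import torch

num_heads, head_dim = 2, 4
past_a = torch.randn(1, num_heads, 3, head_dim)  # past_seq_len = 3
past_b = torch.randn(2, num_heads, 5, head_dim)  # past_seq_len = 5

total_bs, max_past = 3, 5
padded_past_keys = past_a.new_zeros((total_bs, num_heads, max_past, head_dim))

start_index = 0
for past_keys in (past_a, past_b):
    end_index = start_index + past_keys.shape[0]
    past_seq_len = past_keys.shape[2]
    # Right-align the kept context; the left side stays zero (padding)
    padded_past_keys[start_index:end_index, :, -past_seq_len:, :] = past_keys[:, :, -past_seq_len:, :]
    start_index = end_index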
@ -603,9 +605,11 @@ class IdeficsCausalLM(Model):
model_id, model_id,
revision=revision, revision=revision,
torch_dtype=dtype, torch_dtype=dtype,
device_map="auto" device_map=(
if torch.cuda.is_available() and torch.cuda.device_count() > 1 "auto"
else None, if torch.cuda.is_available() and torch.cuda.device_count() > 1
else None
),
load_in_8bit=quantize == "bitsandbytes", load_in_8bit=quantize == "bitsandbytes",
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
@ -836,9 +840,9 @@ class IdeficsCausalLM(Model):
# Update attention_mask as we added a new token to input_ids # Update attention_mask as we added a new token to input_ids
batch.attention_mask[:, -batch.padding_right_offset] = 1 batch.attention_mask[:, -batch.padding_right_offset] = 1
batch.image_attention_mask[ batch.image_attention_mask[:, -batch.padding_right_offset, :] = (
:, -batch.padding_right_offset, : batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :]
] = batch.image_attention_mask[:, -(batch.padding_right_offset + 1), :] )
# Decrease right offset # Decrease right offset
batch.padding_right_offset -= 1 batch.padding_right_offset -= 1

View File

@ -15,7 +15,10 @@ from text_generation_server.utils import (
) )
from text_generation_server.models.globals import ENABLE_CUDA_GRAPHS, MEM_POOL from text_generation_server.models.globals import ENABLE_CUDA_GRAPHS, MEM_POOL
import time import time
from text_generation_server.models.custom_modeling.mamba_modeling import MambaModel, InferenceParams from text_generation_server.models.custom_modeling.mamba_modeling import (
MambaModel,
InferenceParams,
)
from text_generation_server.models import Model from text_generation_server.models import Model
from typing import Any, List, Optional, Tuple, Type, Dict from typing import Any, List, Optional, Tuple, Type, Dict
from text_generation_server.models.types import ( from text_generation_server.models.types import (
@ -28,21 +31,35 @@ from text_generation_server.utils.tokens import batch_top_tokens, Sampling
from dataclasses import dataclass from dataclasses import dataclass
from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling from text_generation_server.utils import NextTokenChooser, StoppingCriteria, Sampling
def new_inference_params(n_blocks: int, batch_size: int, d_inner: int, d_conv: int, d_state: int, seqlen_offset: int, dtype: torch.dtype, device: torch.device):
def new_inference_params(
n_blocks: int,
batch_size: int,
d_inner: int,
d_conv: int,
d_state: int,
seqlen_offset: int,
dtype: torch.dtype,
device: torch.device,
):
max_seqlen = 0 max_seqlen = 0
conv_states = torch.zeros( conv_states = torch.zeros(
(n_blocks, (
batch_size, n_blocks,
d_inner, batch_size,
d_conv,), d_inner,
d_conv,
),
device=device, device=device,
dtype=dtype, dtype=dtype,
) )
ssm_states = torch.zeros( ssm_states = torch.zeros(
(n_blocks, (
batch_size, n_blocks,
d_inner, batch_size,
d_state,), d_inner,
d_state,
),
device=device, device=device,
dtype=dtype, dtype=dtype,
) )
@ -52,7 +69,6 @@ def new_inference_params(n_blocks: int, batch_size: int, d_inner: int, d_conv: i
seqlen_offset=seqlen_offset, seqlen_offset=seqlen_offset,
conv_states=conv_states, conv_states=conv_states,
ssm_states=ssm_states, ssm_states=ssm_states,
) )
return inference_params return inference_params
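A usage sketch of the helper above, assuming it is in scope and using made-up dimensions (the real values come from the model config); it just allocates zeroed per-block conv/SSM state buffers and wraps them in InferenceParams:

import torch

params = new_inference_params(
    n_blocks=24,
    batch_size=4,
    d_inner=2048,
    d_conv=4,
    d_state=16,
    seqlen_offset=0,  # 0 selects the prefill path; a non-zero offset selects the single-step update path
    dtype=torch.float16,
    device=torch.device("cpu"),  # or torch.device("cuda") when serving
)
# params.conv_states: (n_blocks, batch_size, d_inner, d_conv)
# params.ssm_states:  (n_blocks, batch_size, d_inner, d_state)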
@ -124,7 +140,9 @@ class MambaBatch(Batch):
for i, r in enumerate(pb.requests): for i, r in enumerate(pb.requests):
requests_idx_mapping[r.id] = i requests_idx_mapping[r.id] = i
inputs.append(r.inputs) inputs.append(r.inputs)
next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device, tokenizer)) next_token_choosers.append(
NextTokenChooser.from_pb(r.parameters, device, tokenizer)
)
stopping_criteria = StoppingCriteria.from_pb( stopping_criteria = StoppingCriteria.from_pb(
r.stopping_parameters, tokenizer r.stopping_parameters, tokenizer
) )
@ -251,7 +269,9 @@ class MambaBatch(Batch):
# TODO # TODO
# Kept it simple by just updating the state; updating the other CPU values may be necessary. # Kept it simple by just updating the state; updating the other CPU values may be necessary.
self.inference_params.conv_states = self.inference_params.conv_states[:, indices] self.inference_params.conv_states = self.inference_params.conv_states[
:, indices
]
self.inference_params.ssm_states = self.inference_params.ssm_states[:, indices] self.inference_params.ssm_states = self.inference_params.ssm_states[:, indices]
return self return self
@ -280,13 +300,20 @@ class MambaBatch(Batch):
max_seqlen = 0 max_seqlen = 0
seqlen_offset = 0 seqlen_offset = 0
(n_blocks, _, d_inner, d_conv) = ( (n_blocks, _, d_inner, d_conv) = batches[0].inference_params.conv_states.shape
batches[0].inference_params.conv_states.shape
)
(_, _, _, d_state) = batches[0].inference_params.ssm_states.shape (_, _, _, d_state) = batches[0].inference_params.ssm_states.shape
dtype = batches[0].inference_params.conv_states.dtype dtype = batches[0].inference_params.conv_states.dtype
device = batches[0].inference_params.conv_states.device device = batches[0].inference_params.conv_states.device
inference_params = new_inference_params(n_blocks=n_blocks, batch_size=total_batch_size, d_state=d_state, d_conv=d_conv, d_inner=d_inner, seqlen_offset=seqlen_offset, device=device, dtype=dtype) inference_params = new_inference_params(
n_blocks=n_blocks,
batch_size=total_batch_size,
d_state=d_state,
d_conv=d_conv,
d_inner=d_inner,
seqlen_offset=seqlen_offset,
device=device,
dtype=dtype,
)
# Batch tensors # Batch tensors
input_ids = None input_ids = None
@ -334,13 +361,20 @@ class MambaBatch(Batch):
max_input_length - batch.max_input_length max_input_length - batch.max_input_length
) * len(batch) ) * len(batch)
inference_params.max_seqlen = max(inference_params.max_seqlen, batch.inference_params.max_seqlen) inference_params.max_seqlen = max(
inference_params.max_seqlen, batch.inference_params.max_seqlen
)
assert batch.inference_params.seqlen_offset != 0, "Invalid seqlen offset" assert batch.inference_params.seqlen_offset != 0, "Invalid seqlen offset"
inference_params.seqlen_offset = max(inference_params.seqlen_offset, batch.inference_params.seqlen_offset) inference_params.seqlen_offset = max(
inference_params.seqlen_offset, batch.inference_params.seqlen_offset
)
inference_params.conv_states[:, start_index:end_index] = (
inference_params.conv_states[:, start_index:end_index] = batch.inference_params.conv_states batch.inference_params.conv_states
inference_params.ssm_states[:, start_index:end_index] = batch.inference_params.ssm_states )
inference_params.ssm_states[:, start_index:end_index] = (
batch.inference_params.ssm_states
)
start_index = end_index start_index = end_index
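During concatenation the per-block states are copied into slices of the merged buffers along the batch dimension (dim 1), and filtering a batch works the same way via index selection, as in the hunk further above. A toy version on plain tensors:

import torch

n_blocks, d_inner, d_conv = 3, 8, 4
conv_a = torch.randn(n_blocks, 2, d_inner, d_conv)  # sub-batch with 2 requests
conv_b = torch.randn(n_blocks, 1, d_inner, d_conv)  # sub-batch with 1 request

merged = torch.zeros(n_blocks, conv_a.shape[1] + conv_b.shape[1], d_inner, d_conv)
start_index = 0
for conv_states in (conv_a, conv_b):
    end_index = start_index + conv_states.shape[1]
    merged[:, start_index:end_index] = conv_states  # batch dim is dim 1 for Mamba states
    start_index = end_index

kept = merged[:, [0, 2]]  # filter: keep only requests 0 and 2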
@ -452,36 +486,39 @@ class Mamba(Model):
# Important: seqlen_offset must be non-zero to go through the update mechanism with the state # Important: seqlen_offset must be non-zero to go through the update mechanism with the state
seqlen_offset = 1 seqlen_offset = 1
inference_params = new_inference_params(n_blocks=n_blocks, batch_size=batch_size, d_state=d_state, d_conv=d_conv, d_inner=d_inner, seqlen_offset=seqlen_offset, device=self.device, dtype=self.dtype) inference_params = new_inference_params(
n_blocks=n_blocks,
batch_size=batch_size,
d_state=d_state,
d_conv=d_conv,
d_inner=d_inner,
seqlen_offset=seqlen_offset,
device=self.device,
dtype=self.dtype,
)
graph = torch.cuda.CUDAGraph() graph = torch.cuda.CUDAGraph()
torch.cuda.synchronize() torch.cuda.synchronize()
# Run once outside to warmup # Run once outside to warmup
self.model.forward( self.model.forward(input_ids=input_ids, inference_params=inference_params)
input_ids=input_ids,
inference_params=inference_params
)
torch.cuda.synchronize() torch.cuda.synchronize()
with torch.cuda.graph(graph, pool=MEM_POOL): with torch.cuda.graph(graph, pool=MEM_POOL):
logits = self.model.forward( logits = self.model.forward(
input_ids=input_ids, input_ids=input_ids, inference_params=inference_params
inference_params=inference_params
) )
torch.cuda.synchronize() torch.cuda.synchronize()
graph_dict = { graph_dict = {
"input_ids": input_ids, "input_ids": input_ids,
"inference_params": inference_params, "inference_params": inference_params,
"graph": graph, "graph": graph,
"logits": logits "logits": logits,
} }
self.cuda_graphs[batch_size] = graph_dict self.cuda_graphs[batch_size] = graph_dict
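The warmup above follows the usual CUDA-graph recipe: run the forward eagerly once so lazy initialization happens outside capture, synchronize, record a single forward into a graph, and keep the input/state tensors alive as static buffers. A generic, hedged sketch of that recipe (not the TGI code itself; assumes a CUDA device and any callable `model`):

import torch

def capture_forward(model, static_input, pool=None):
    model(static_input)  # warmup: triggers lazy init (handles, autotuning) outside the graph
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, pool=pool):  # kernels are recorded rather than executed eagerly
        static_output = model(static_input)
    torch.cuda.synchronize()

    # Keep references to the static tensors: replay reuses exactly these buffers.
    return {"input": static_input, "output": static_output, "graph": graph}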
def forward( def forward(
self, self, input_ids: torch.Tensor, inference_params: Any
input_ids: torch.Tensor,
inference_params: Any
) -> Tuple[torch.Tensor, torch.Tensor]: ) -> Tuple[torch.Tensor, torch.Tensor]:
bs = input_ids.shape[0] bs = input_ids.shape[0]
padded_bs = bs padded_bs = bs
@ -504,15 +541,21 @@ class Mamba(Model):
# Copy inputs to the static inputs of the cuda graph # Copy inputs to the static inputs of the cuda graph
# Static inputs are potentially padded # Static inputs are potentially padded
cuda_graph["input_ids"][: bs] = input_ids cuda_graph["input_ids"][:bs] = input_ids
cuda_graph["inference_params"].conv_states[:, : bs] = inference_params.conv_states cuda_graph["inference_params"].conv_states[
cuda_graph["inference_params"].ssm_states[:, : bs] = inference_params.ssm_states :, :bs
] = inference_params.conv_states
cuda_graph["inference_params"].ssm_states[:, :bs] = inference_params.ssm_states
# Replay the graph # Replay the graph
cuda_graph["graph"].replay() cuda_graph["graph"].replay()
inference_params.conv_states.copy_(cuda_graph["inference_params"].conv_states[:, :bs]) inference_params.conv_states.copy_(
inference_params.ssm_states.copy_(cuda_graph["inference_params"].ssm_states[:, :bs]) cuda_graph["inference_params"].conv_states[:, :bs]
)
inference_params.ssm_states.copy_(
cuda_graph["inference_params"].ssm_states[:, :bs]
)
# Slice output to the correct shape # Slice output to the correct shape
return cuda_graph["logits"][:bs] return cuda_graph["logits"][:bs]
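Replay re-launches the recorded kernels on the captured buffers, which is why the code above copies the live inputs and states into the (possibly larger, padded) static tensors, replays, copies the updated states back, and slices the logits to the real batch size. A condensed sketch of that copy-in / replay / copy-out pattern, assuming a dict shaped like `graph_dict` above:

def replay_padded(cuda_graph, input_ids, conv_states, ssm_states):
    bs = input_ids.shape[0]  # real batch size, at most the captured (padded) size
    cuda_graph["input_ids"][:bs] = input_ids
    cuda_graph["inference_params"].conv_states[:, :bs] = conv_states
    cuda_graph["inference_params"].ssm_states[:, :bs] = ssm_states

    cuda_graph["graph"].replay()  # launch the recorded kernels on the static buffers

    conv_states.copy_(cuda_graph["inference_params"].conv_states[:, :bs])
    ssm_states.copy_(cuda_graph["inference_params"].ssm_states[:, :bs])
    return cuda_graph["logits"][:bs]  # drop the padded rows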
@ -528,19 +571,25 @@ class Mamba(Model):
if batch.inference_params is None: if batch.inference_params is None:
# 0 is important here # 0 is important here
seqlen_offset = 0 seqlen_offset = 0
n_blocks = len(self.model.blocks) n_blocks = len(self.model.blocks)
d_state = self.model.config.d_state d_state = self.model.config.d_state
d_conv = self.model.config.d_conv d_conv = self.model.config.d_conv
d_inner = self.model.config.d_inner d_inner = self.model.config.d_inner
inference_params = new_inference_params(n_blocks=n_blocks, batch_size=batch_size, d_state=d_state, d_conv=d_conv, d_inner=d_inner, seqlen_offset=seqlen_offset, device=self.device, dtype=self.dtype) inference_params = new_inference_params(
n_blocks=n_blocks,
batch_size=batch_size,
d_state=d_state,
d_conv=d_conv,
d_inner=d_inner,
seqlen_offset=seqlen_offset,
device=self.device,
dtype=self.dtype,
)
batch.inference_params = inference_params batch.inference_params = inference_params
# Forward pass # Forward pass
logits = self.forward( logits = self.forward(input_ids, inference_params=batch.inference_params)
input_ids, inference_params=batch.inference_params
)
# batch.inference_params = new_inference_params # batch.inference_params = new_inference_params
# Results # Results
@ -694,9 +743,9 @@ class Mamba(Model):
generations.append(generation) generations.append(generation)
# Update values # Update values
batch.next_token_choosers[i] = batch.next_token_choosers[i].advance_grammar( batch.next_token_choosers[i] = batch.next_token_choosers[
next_token_id_squeezed.item() i
) ].advance_grammar(next_token_id_squeezed.item())
batch.input_ids[i, 0] = next_token_id batch.input_ids[i, 0] = next_token_id
batch.all_input_ids[i] = all_input_ids batch.all_input_ids[i] = all_input_ids
batch.input_lengths[i] = new_input_length batch.input_lengths[i] = new_input_length

View File

@ -36,9 +36,11 @@ class RW(CausalLM):
model_id, model_id,
revision=revision, revision=revision,
torch_dtype=dtype, torch_dtype=dtype,
device_map="auto" device_map=(
if torch.cuda.is_available() and torch.cuda.device_count() > 1 "auto"
else None, if torch.cuda.is_available() and torch.cuda.device_count() > 1
else None
),
load_in_8bit=quantize == "bitsandbytes", load_in_8bit=quantize == "bitsandbytes",
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )

View File

@ -96,7 +96,9 @@ class Seq2SeqLMBatch(Batch):
inputs.append(r.inputs) inputs.append(r.inputs)
requests_idx_mapping[r.id] = i requests_idx_mapping[r.id] = i
decoder_input_lengths.append(1) decoder_input_lengths.append(1)
next_token_choosers.append(NextTokenChooser.from_pb(r.parameters, device, tokenizer)) next_token_choosers.append(
NextTokenChooser.from_pb(r.parameters, device, tokenizer)
)
stopping_criteria = StoppingCriteria.from_pb( stopping_criteria = StoppingCriteria.from_pb(
r.stopping_parameters, tokenizer r.stopping_parameters, tokenizer
) )
@ -351,9 +353,9 @@ class Seq2SeqLMBatch(Batch):
(total_batch_size, max_input_length), (total_batch_size, max_input_length),
) )
# Copy to correct indices # Copy to correct indices
attention_mask[ attention_mask[start_index:end_index, -batch.max_input_length :] = (
start_index:end_index, -batch.max_input_length : batch.attention_mask[:, -batch.max_input_length :]
] = batch.attention_mask[:, -batch.max_input_length :] )
# Create padded tensor # Create padded tensor
if decoder_input_ids is None: if decoder_input_ids is None:
@ -547,9 +549,11 @@ class Seq2SeqLM(Model):
model_id, model_id,
revision=revision, revision=revision,
torch_dtype=dtype, torch_dtype=dtype,
device_map="auto" device_map=(
if torch.cuda.is_available() and torch.cuda.device_count() > 1 "auto"
else None, if torch.cuda.is_available() and torch.cuda.device_count() > 1
else None
),
load_in_8bit=quantize == "bitsandbytes", load_in_8bit=quantize == "bitsandbytes",
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
@ -750,7 +754,7 @@ class Seq2SeqLM(Model):
if top_n_tokens > 0: if top_n_tokens > 0:
all_top_tokens = [] all_top_tokens = []
for (top_token_ids, top_token_logprobs) in zip( for top_token_ids, top_token_logprobs in zip(
top_token_ids, top_token_logprobs top_token_ids, top_token_logprobs
): ):
toptoken_texts = self.tokenizer.batch_decode( toptoken_texts = self.tokenizer.batch_decode(

View File

@ -88,14 +88,16 @@ class Generation:
def to_pb(self) -> generate_pb2.Generation: def to_pb(self) -> generate_pb2.Generation:
return generate_pb2.Generation( return generate_pb2.Generation(
request_id=self.request_id, request_id=self.request_id,
prefill_tokens=self.prefill_tokens.to_pb() prefill_tokens=(
if self.prefill_tokens is not None self.prefill_tokens.to_pb() if self.prefill_tokens is not None else None
else None, ),
tokens=self.tokens.to_pb(), tokens=self.tokens.to_pb(),
generated_text=self.generated_text.to_pb() generated_text=(
if self.generated_text is not None self.generated_text.to_pb() if self.generated_text is not None else None
else None, ),
top_tokens=[top_tokens.to_pb() for top_tokens in self.top_tokens] top_tokens=(
if self.top_tokens is not None [top_tokens.to_pb() for top_tokens in self.top_tokens]
else None, if self.top_tokens is not None
else None
),
) )

View File

@ -1,3 +1,3 @@
*.py *.py
*.pyi *.pyi
*.py-e *.py-e

View File

@ -182,7 +182,7 @@ try:
) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) ) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
zeros = (zeros >> zeros_shifter[None, :]) & maxq zeros = (zeros >> zeros_shifter[None, :]) & maxq
zeros = (zeros + 1) & maxq # avoid potential overflow zeros = (zeros + 1) & maxq # avoid potential overflow
a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K) a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)
b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
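For context on the shift-and-mask lines above: with 4-bit GPTQ, eight zero-points are packed into each int32, the kernel unpacks them with a per-column shift, and the `(zeros + 1) & maxq` step adjusts the stored values while keeping them in range. A small torch sketch of the unpacking on toy data (not the Triton kernel):

import torch

bits = 4
maxq = (1 << bits) - 1  # 0b1111
packed = torch.tensor([0x76543210], dtype=torch.int32)  # eight 4-bit values in one int32

zeros_shifter = torch.arange(8, dtype=torch.int32) * bits  # 0, 4, ..., 28
zeros = (packed[:, None] >> zeros_shifter[None, :]) & maxq  # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
zeros = (zeros + 1) & maxq  # same adjustment as in the kernel above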

View File

@ -355,7 +355,9 @@ def get_linear(weight, bias, quantize):
"to use Exllama/GPTQ kernels for AWQ inference." "to use Exllama/GPTQ kernels for AWQ inference."
) )
if not HAS_AWQ: if not HAS_AWQ:
raise NotImplementedError("You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly") raise NotImplementedError(
"You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly"
)
linear = WQLinear( linear = WQLinear(
w_bit=bits, w_bit=bits,
group_size=groupsize, group_size=groupsize,

View File

@ -516,7 +516,7 @@ class GrammarLogitProcessor(LogitsProcessor):
if grammar_type == GrammarType.GRAMMAR_TYPE_JSON: if grammar_type == GrammarType.GRAMMAR_TYPE_JSON:
schema = build_regex_from_object(schema) schema = build_regex_from_object(schema)
elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX: elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX:
pass # schema is already a regex just here for clarity pass # schema is already a regex just here for clarity
fsm = RegexFSM(schema, tokenizer) fsm = RegexFSM(schema, tokenizer)
logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s") logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s")
return fsm return fsm

View File

@ -409,8 +409,12 @@ class HeterogeneousNextTokenChooser:
def advance_grammar_single(self, grammar_state_index: int, next_id: int): def advance_grammar_single(self, grammar_state_index: int, next_id: int):
if self.grammar_processor is not None: if self.grammar_processor is not None:
self.fsm_grammar_states[grammar_state_index] = self.grammar_processor.advance_at_index( self.fsm_grammar_states[grammar_state_index] = (
next_id, self.fsm_grammar_states[grammar_state_index], grammar_state_index self.grammar_processor.advance_at_index(
next_id,
self.fsm_grammar_states[grammar_state_index],
grammar_state_index,
)
) )
return self return self