From 9946165ee022bf31578e1dde3124cc61ded2b664 Mon Sep 17 00:00:00 2001
From: OlivierDehaene
Date: Fri, 16 Feb 2024 11:58:58 +0100
Subject: [PATCH] chore: add pre-commit (#1569)

---
 .github/ISSUE_TEMPLATE/bug-report.yml | 10 +-
 .github/ISSUE_TEMPLATE/feature-request.yml | 2 +-
 .github/workflows/autodocs.yml | 6 +-
 .github/workflows/build_pr_documentation.yml | 2 +-
 .github/workflows/tests.yaml | 9 +-
 .github/workflows/upload_pr_documentation.yml | 2 +-
 .gitignore | 1 -
 .pre-commit-config.yaml | 18 +++
 README.md | 4 +-
 benchmark/Cargo.toml | 1 -
 benchmark/README.md | 8 +-
 clients/python/.gitignore | 2 +-
 clients/python/Makefile | 2 +-
 clients/python/README.md | 8 +-
 clients/python/text_generation/types.py | 1 +
 docs/index.html | 2 +-
 docs/openapi.json | 2 +-
 docs/source/basic_tutorials/consuming_tgi.md | 24 +--
 .../source/basic_tutorials/non_core_models.md | 6 +-
 .../source/basic_tutorials/preparing_model.md | 6 +-
 docs/source/basic_tutorials/using_cli.md | 12 +-
 docs/source/conceptual/flash_attention.md | 5 +-
 docs/source/conceptual/quantization.md | 10 +-
 docs/source/conceptual/safetensors.md | 6 +-
 docs/source/conceptual/streaming.md | 18 +--
 docs/source/conceptual/tensor_parallelism.md | 2 +-
 docs/source/installation.md | 4 +-
 docs/source/messages_api.md | 2 +-
 docs/source/quicktour.md | 2 +-
 integration-tests/models/test_mamba.py | 4 +-
 integration-tests/pytest.ini | 2 +-
 load_tests/common.js | 2 +-
 load_tests/starcoder_load.js | 2 +-
 load_tests/tgi.js | 2 +-
 load_tests/vllm.js | 2 +-
 router/README.md | 4 +-
 router/client/src/pb/.gitignore | 2 +-
 router/src/validation.rs | 1 +
 rust-toolchain.toml | 2 +-
 server/Makefile-awq | 2 +-
 server/Makefile-flash-att | 2 +-
 server/Makefile-selective-scan | 6 +-
 server/README.md | 2 +-
 .../fused_bloom_attention_cuda.cu | 2 +-
 .../cuda_func/column_remap.cuh | 2 +-
 .../exllama_kernels/cuda_func/q4_matrix.cuh | 2 +-
 .../exllama_kernels/hip_compat.cuh | 2 +-
 .../exllamav2_kernels/cuda/matrix_view.cuh | 2 +-
 .../exllamav2_kernels/cuda/q_gemm.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_2.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_4.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_5.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_6.cuh | 2 -
 .../exllamav2_kernels/cuda/quant/qdq_8.cuh | 2 +-
 .../exllamav2_kernels/cuda/util.cuh | 2 +-
 .../custom_modeling/flash_llama_modeling.py | 6 +-
 .../custom_modeling/flash_mistral_modeling.py | 6 +-
 .../custom_modeling/flash_mixtral_modeling.py | 12 +-
 .../custom_modeling/flash_neox_modeling.py | 6 +-
 .../custom_modeling/flash_phi_modeling.py | 6 +-
 .../flash_santacoder_modeling.py | 13 +-
 .../models/custom_modeling/idefics_config.py | 3 +
 .../custom_modeling/idefics_modeling.py | 8 +-
 .../custom_modeling/idefics_processing.py | 1 +
 .../models/custom_modeling/mamba_modeling.py | 21 ++-
 .../models/custom_modeling/mpt_modeling.py | 1 +
 .../models/flash_causal_lm.py | 6 +-
 .../models/galactica.py | 4 +-
 .../models/idefics_causal_lm.py | 42 ++---
 server/text_generation_server/models/mamba.py | 143 ++++++++++++------
 server/text_generation_server/models/rw.py | 8 +-
 .../models/seq2seq_lm.py | 20 ++-
 server/text_generation_server/models/types.py | 20 +--
 server/text_generation_server/pb/.gitignore | 2 +-
 .../utils/gptq/quant_linear.py | 2 +-
 server/text_generation_server/utils/layers.py | 4 +-
 .../utils/logits_process.py | 2 +-
 server/text_generation_server/utils/tokens.py | 8 +-
 78 files changed, 346 insertions(+), 234 deletions(-)
 create mode 100644 .pre-commit-config.yaml
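Note (not part of the original patch): the new `.pre-commit-config.yaml` is enforced in CI by the `Pre-commit checks` step added to `tests.yaml` below. As a sketch of how a contributor could run the same hooks locally (assuming `pip` and a git checkout are available), the commands simply mirror what the updated workflow runs:

```shell
# Sketch of running the new hooks locally; mirrors the CI "Pre-commit checks" step.
pip install pre-commit          # hook runner used by the new .pre-commit-config.yaml
pre-commit install              # register the git pre-commit hook in this clone
pre-commit run --all-files      # run check-yaml, whitespace fixers, black, fmt, cargo-check and clippy once
```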
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index 12c93b9e..24ac3cbe 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,14 +5,14 @@ body:
     id: system-info
     attributes:
       label: System Info
-      description: |
+      description: |
        Please share your system info with us (`text-generation-launcher --env` if installed locally).
-        The full command line used that causes issues:
+        The full command line used that causes issues:
        OS version:
        Rust version (if self-compiling, `cargo version`):
        Model being used (`curl 127.0.0.1:8080/info | jq`):
        If local model please explicit the kind of model and/or equivalents.
-        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
+        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
        Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
        The current version being used:
@@ -52,11 +52,11 @@ body:
       placeholder: |
         Steps to reproduce the behavior:
-
+
         1.
         2.
         3.
-
+
 
   - type: textarea
     id: expected-behavior
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
index 5abc1565..f1a9135c 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -19,7 +19,7 @@ body:
       label: Motivation
       description: |
         Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
-
+
 
   - type: textarea
     id: contribution
diff --git a/.github/workflows/autodocs.yml b/.github/workflows/autodocs.yml
index a981c09c..7c5c6eca 100644
--- a/.github/workflows/autodocs.yml
+++ b/.github/workflows/autodocs.yml
@@ -6,15 +6,15 @@ on:
 jobs:
   update_docs:
     runs-on: ubuntu-latest
-
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v2
-
+
       - name: Install Launcher
        id: install-launcher
        run: cargo install --git https://github.com/${{ github.repository }} --branch ${{ github.head_ref }} text-generation-launcher
-
+
      - name: Check launcher Docs are up-to-date
        run: |
          echo text-generation-launcher --help
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index b46216ec..a5ce39a5 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -16,4 +16,4 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: text-generation-inference
-      additional_args: --not_python_module
+      additional_args: --not_python_module
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index ecc8eb4d..5b19eb8c 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -71,12 +71,11 @@ jobs:
           pip install pytest
           export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           pytest -s -vv server/tests
-      - name: Run Rust fmt
+      - name: Pre-commit checks
         run: |
-          cargo fmt --check
-      - name: Run Rust clippy
-        run: |
-          cargo clippy
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
       - name: Run Rust tests
         run: |
           cargo test
diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml
index b984ead2..ae00bb51 100644
--- a/.github/workflows/upload_pr_documentation.yml
+++ b/.github/workflows/upload_pr_documentation.yml
@@ -13,4 +13,4 @@ jobs:
       package_name: text-generation-inference
     secrets:
       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
\ No newline at end of file
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 1f9ba162..b3ca772b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,3 @@ server/exllama_kernels/exllama_kernels/hip_func/
 *_hip.cuh
 server/exllama_kernels/exllama_kernels/hip_buffers.cuh
 server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..45bc07a5
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.5.0
+  hooks:
+  - id: check-yaml
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+    exclude: docs/source/basic_tutorials/launcher.md
+- repo: https://github.com/psf/black
+  rev: 24.2.0
+  hooks:
+  - id: black
+- repo: https://github.com/doublify/pre-commit-rust
+  rev: v1.0
+  hooks:
+  - id: fmt
+  - id: cargo-check
+  - id: clippy
diff --git a/README.md b/README.md
index c4d84efa..7589a3a6 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+
 Making TGI deployment optimal
@@ -228,7 +228,7 @@ text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
 
 ```shell
-text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
 ```
 
 4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml
index 2dd2e64d..40738c4d 100644
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@@ -29,4 +29,3 @@ tui = {package = "ratatui", version = "0.23", default-features = false, features
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
 hf-hub = "0.3.1"
-
diff --git a/benchmark/README.md b/benchmark/README.md
index 7f51a731..17a02a30 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -6,12 +6,12 @@
-A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
+A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
 and powered by [tui](https://github.com/tui-rs-revival/ratatui).
 
-## Install
+## Install
 
-```shell
+```shell
 make install-benchmark
 ```
 
@@ -27,4 +27,4 @@ Then run the benchmarking tool:
 
 ```shell
 text-generation-benchmark --tokenizer-name bigscience/bloom-560m
-```
\ No newline at end of file
+```
diff --git a/clients/python/.gitignore b/clients/python/.gitignore
index 5758ba92..5a8ecaa7 100644
--- a/clients/python/.gitignore
+++ b/clients/python/.gitignore
@@ -155,4 +155,4 @@ dmypy.json
 cython_debug/
 
 transformers
-safetensors
\ No newline at end of file
+safetensors
diff --git a/clients/python/Makefile b/clients/python/Makefile
index 8b4334bd..42720875 100644
--- a/clients/python/Makefile
+++ b/clients/python/Makefile
@@ -3,4 +3,4 @@ unit-tests:
 
 install:
 	pip install pip --upgrade
-	pip install -e .
\ No newline at end of file
+	pip install -e .
diff --git a/clients/python/README.md b/clients/python/README.md
index 82f3ee0c..20243f4a 100644
--- a/clients/python/README.md
+++ b/clients/python/README.md
@@ -141,7 +141,7 @@ class Parameters:
     # Get decoder input token logprobs and ids
     decoder_input_details: bool
     # Return the N most likely tokens at each step
-    top_n_tokens: Optional[int]
+    top_n_tokens: Optional[int]
 
 # Decoder input tokens
 class InputToken:
@@ -192,7 +192,7 @@ class BestOfSequence:
     # Generated tokens
     tokens: List[Token]
     # Most likely tokens
-    top_tokens: Optional[List[List[Token]]]
+    top_tokens: Optional[List[List[Token]]]
 
 
 # `generate` details
@@ -236,7 +236,7 @@ class StreamResponse:
     # Generated token
     token: Token
     # Most likely tokens
-    top_tokens: Optional[List[Token]]
+    top_tokens: Optional[List[Token]]
     # Complete generated text
     # Only available when the generation is finished
     generated_text: Optional[str]
@@ -248,4 +248,4 @@ class StreamResponse:
 class DeployedModel:
     model_id: str
     sha: str
-```
\ No newline at end of file
+```
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 3426411b..911114ee 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -134,6 +134,7 @@ class Parameters(BaseModel):
             raise ValidationError("`value` cannot be empty for `json` grammar")
         return v
 
+
 class Request(BaseModel):
     # Prompt
     inputs: str
diff --git a/docs/index.html b/docs/index.html
index 16d143d8..f582d3ce 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -27,4 +27,4 @@
     }
 
-
\ No newline at end of file
+
diff --git a/docs/openapi.json b/docs/openapi.json
index da3969df..fd7cca7b 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -1290,4 +1290,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/docs/source/basic_tutorials/consuming_tgi.md b/docs/source/basic_tutorials/consuming_tgi.md
index 540f4b13..4829ec7c 100644
--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
@@ -23,7 +23,7 @@ You can simply install `huggingface-hub` package with pip.
 pip install huggingface-hub
 ```
 
-Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
+Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
 
 ```python
 from huggingface_hub import InferenceClient
@@ -83,8 +83,8 @@ Gradio is a Python library that helps you build web applications for your machin
 pip install huggingface-hub gradio
 ```
 
-Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
-
+Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
+
 ```python
 import gradio as gr
 from huggingface_hub import InferenceClient
@@ -110,30 +110,30 @@ gr.ChatInterface(
 ).queue().launch()
 ```
 
-The UI looks like this 👇
+The UI looks like this 👇
-
-
-You can try the demo directly here 👇
+You can try the demo directly here 👇
-