From 9946165ee022bf31578e1dde3124cc61ded2b664 Mon Sep 17 00:00:00 2001
From: OlivierDehaene
Date: Fri, 16 Feb 2024 11:58:58 +0100
Subject: [PATCH] chore: add pre-commit (#1569)

---
 .github/ISSUE_TEMPLATE/bug-report.yml | 10 +-
 .github/ISSUE_TEMPLATE/feature-request.yml | 2 +-
 .github/workflows/autodocs.yml | 6 +-
 .github/workflows/build_pr_documentation.yml | 2 +-
 .github/workflows/tests.yaml | 9 +-
 .github/workflows/upload_pr_documentation.yml | 2 +-
 .gitignore | 1 -
 .pre-commit-config.yaml | 18 +++
 README.md | 4 +-
 benchmark/Cargo.toml | 1 -
 benchmark/README.md | 8 +-
 clients/python/.gitignore | 2 +-
 clients/python/Makefile | 2 +-
 clients/python/README.md | 8 +-
 clients/python/text_generation/types.py | 1 +
 docs/index.html | 2 +-
 docs/openapi.json | 2 +-
 docs/source/basic_tutorials/consuming_tgi.md | 24 +--
 .../source/basic_tutorials/non_core_models.md | 6 +-
 .../source/basic_tutorials/preparing_model.md | 6 +-
 docs/source/basic_tutorials/using_cli.md | 12 +-
 docs/source/conceptual/flash_attention.md | 5 +-
 docs/source/conceptual/quantization.md | 10 +-
 docs/source/conceptual/safetensors.md | 6 +-
 docs/source/conceptual/streaming.md | 18 +--
 docs/source/conceptual/tensor_parallelism.md | 2 +-
 docs/source/installation.md | 4 +-
 docs/source/messages_api.md | 2 +-
 docs/source/quicktour.md | 2 +-
 integration-tests/models/test_mamba.py | 4 +-
 integration-tests/pytest.ini | 2 +-
 load_tests/common.js | 2 +-
 load_tests/starcoder_load.js | 2 +-
 load_tests/tgi.js | 2 +-
 load_tests/vllm.js | 2 +-
 router/README.md | 4 +-
 router/client/src/pb/.gitignore | 2 +-
 router/src/validation.rs | 1 +
 rust-toolchain.toml | 2 +-
 server/Makefile-awq | 2 +-
 server/Makefile-flash-att | 2 +-
 server/Makefile-selective-scan | 6 +-
 server/README.md | 2 +-
 .../fused_bloom_attention_cuda.cu | 2 +-
 .../cuda_func/column_remap.cuh | 2 +-
 .../exllama_kernels/cuda_func/q4_matrix.cuh | 2 +-
 .../exllama_kernels/hip_compat.cuh | 2 +-
 .../exllamav2_kernels/cuda/matrix_view.cuh | 2 +-
 .../exllamav2_kernels/cuda/q_gemm.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_2.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_4.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_5.cuh | 2 +-
 .../exllamav2_kernels/cuda/quant/qdq_6.cuh | 2 -
 .../exllamav2_kernels/cuda/quant/qdq_8.cuh | 2 +-
 .../exllamav2_kernels/cuda/util.cuh | 2 +-
 .../custom_modeling/flash_llama_modeling.py | 6 +-
 .../custom_modeling/flash_mistral_modeling.py | 6 +-
 .../custom_modeling/flash_mixtral_modeling.py | 12 +-
 .../custom_modeling/flash_neox_modeling.py | 6 +-
 .../custom_modeling/flash_phi_modeling.py | 6 +-
 .../flash_santacoder_modeling.py | 13 +-
 .../models/custom_modeling/idefics_config.py | 3 +
 .../custom_modeling/idefics_modeling.py | 8 +-
 .../custom_modeling/idefics_processing.py | 1 +
 .../models/custom_modeling/mamba_modeling.py | 21 ++-
 .../models/custom_modeling/mpt_modeling.py | 1 +
 .../models/flash_causal_lm.py | 6 +-
 .../models/galactica.py | 4 +-
 .../models/idefics_causal_lm.py | 42 ++---
 server/text_generation_server/models/mamba.py | 143 ++++++++++++------
 server/text_generation_server/models/rw.py | 8 +-
 .../models/seq2seq_lm.py | 20 ++-
 server/text_generation_server/models/types.py | 20 +--
 server/text_generation_server/pb/.gitignore | 2 +-
 .../utils/gptq/quant_linear.py | 2 +-
 server/text_generation_server/utils/layers.py | 4 +-
 .../utils/logits_process.py | 2 +-
 server/text_generation_server/utils/tokens.py | 8 +-
 78 files changed, 346 insertions(+), 234 deletions(-)
 create mode 100644 .pre-commit-config.yaml
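Note (not part of the original patch): the new `.pre-commit-config.yaml` is enforced in CI by the `Pre-commit checks` step added to `tests.yaml` below. As a sketch of how a contributor could run the same hooks locally (assuming `pip` and a git checkout are available), the commands simply mirror what the updated workflow runs:

```shell
# Sketch of running the new hooks locally; mirrors the CI "Pre-commit checks" step.
pip install pre-commit          # hook runner used by the new .pre-commit-config.yaml
pre-commit install              # register the git pre-commit hook in this clone
pre-commit run --all-files      # run check-yaml, whitespace fixers, black, fmt, cargo-check and clippy once
```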
diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index 12c93b9e..24ac3cbe 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,14 +5,14 @@ body:
     id: system-info
     attributes:
       label: System Info
-      description: |
+      description: |
        Please share your system info with us (`text-generation-launcher --env` if installed locally).
-        The full command line used that causes issues:
+        The full command line used that causes issues:
        OS version:
        Rust version (if self-compiling, `cargo version`):
        Model being used (`curl 127.0.0.1:8080/info | jq`):
        If local model please explicit the kind of model and/or equivalents.
-        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
+        Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
        Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
        The current version being used:
@@ -52,11 +52,11 @@ body:
       placeholder: |
         Steps to reproduce the behavior:
-
+
         1.
         2.
         3.
-
+
 
   - type: textarea
     id: expected-behavior
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
index 5abc1565..f1a9135c 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -19,7 +19,7 @@ body:
       label: Motivation
       description: |
         Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
-
+
 
   - type: textarea
     id: contribution
diff --git a/.github/workflows/autodocs.yml b/.github/workflows/autodocs.yml
index a981c09c..7c5c6eca 100644
--- a/.github/workflows/autodocs.yml
+++ b/.github/workflows/autodocs.yml
@@ -6,15 +6,15 @@ on:
 jobs:
   update_docs:
     runs-on: ubuntu-latest
-
+
     steps:
       - name: Checkout code
         uses: actions/checkout@v2
-
+
       - name: Install Launcher
        id: install-launcher
        run: cargo install --git https://github.com/${{ github.repository }} --branch ${{ github.head_ref }} text-generation-launcher
-
+
      - name: Check launcher Docs are up-to-date
        run: |
          echo text-generation-launcher --help
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index b46216ec..a5ce39a5 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -16,4 +16,4 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: text-generation-inference
-      additional_args: --not_python_module
+      additional_args: --not_python_module
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index ecc8eb4d..5b19eb8c 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -71,12 +71,11 @@ jobs:
           pip install pytest
           export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           pytest -s -vv server/tests
-      - name: Run Rust fmt
+      - name: Pre-commit checks
         run: |
-          cargo fmt --check
-      - name: Run Rust clippy
-        run: |
-          cargo clippy
+          pip install pre-commit
+          pre-commit install
+          pre-commit run --all-files
       - name: Run Rust tests
         run: |
           cargo test
diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml
index b984ead2..ae00bb51 100644
--- a/.github/workflows/upload_pr_documentation.yml
+++ b/.github/workflows/upload_pr_documentation.yml
@@ -13,4 +13,4 @@ jobs:
       package_name: text-generation-inference
     secrets:
       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
\ No newline at end of file
+      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 1f9ba162..b3ca772b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,3 @@ server/exllama_kernels/exllama_kernels/hip_func/
 *_hip.cuh
 server/exllama_kernels/exllama_kernels/hip_buffers.cuh
 server/exllama_kernels/exllama_kernels/exllama_ext_hip.cpp
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..45bc07a5
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.5.0
+  hooks:
+  - id: check-yaml
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+    exclude: docs/source/basic_tutorials/launcher.md
+- repo: https://github.com/psf/black
+  rev: 24.2.0
+  hooks:
+  - id: black
+- repo: https://github.com/doublify/pre-commit-rust
+  rev: v1.0
+  hooks:
+  - id: fmt
+  - id: cargo-check
+  - id: clippy
diff --git a/README.md b/README.md
index c4d84efa..7589a3a6 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+
 Making TGI deployment optimal
@@ -228,7 +228,7 @@ text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2
 You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
 
 ```shell
-text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
+text-generation-launcher --model-id mistralai/Mistral-7B-Instruct-v0.2 --quantize
 ```
 
 4bit quantization is available using the [NF4 and FP4 data types from bitsandbytes](https://arxiv.org/pdf/2305.14314.pdf). It can be enabled by providing `--quantize bitsandbytes-nf4` or `--quantize bitsandbytes-fp4` as a command line argument to `text-generation-launcher`.
diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml
index 2dd2e64d..40738c4d 100644
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@@ -29,4 +29,3 @@ tui = {package = "ratatui", version = "0.23", default-features = false, features
 tracing = "0.1.37"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
 hf-hub = "0.3.1"
-
diff --git a/benchmark/README.md b/benchmark/README.md
index 7f51a731..17a02a30 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -6,12 +6,12 @@
-A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
+A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha)
 and powered by [tui](https://github.com/tui-rs-revival/ratatui).
 
-## Install
+## Install
 
-```shell
+```shell
 make install-benchmark
 ```
 
@@ -27,4 +27,4 @@ Then run the benchmarking tool:
 
 ```shell
 text-generation-benchmark --tokenizer-name bigscience/bloom-560m
-```
\ No newline at end of file
+```
diff --git a/clients/python/.gitignore b/clients/python/.gitignore
index 5758ba92..5a8ecaa7 100644
--- a/clients/python/.gitignore
+++ b/clients/python/.gitignore
@@ -155,4 +155,4 @@ dmypy.json
 cython_debug/
 
 transformers
-safetensors
\ No newline at end of file
+safetensors
diff --git a/clients/python/Makefile b/clients/python/Makefile
index 8b4334bd..42720875 100644
--- a/clients/python/Makefile
+++ b/clients/python/Makefile
@@ -3,4 +3,4 @@ unit-tests:
 
 install:
 	pip install pip --upgrade
-	pip install -e .
\ No newline at end of file
+	pip install -e .
diff --git a/clients/python/README.md b/clients/python/README.md
index 82f3ee0c..20243f4a 100644
--- a/clients/python/README.md
+++ b/clients/python/README.md
@@ -141,7 +141,7 @@ class Parameters:
     # Get decoder input token logprobs and ids
     decoder_input_details: bool
     # Return the N most likely tokens at each step
-    top_n_tokens: Optional[int]
+    top_n_tokens: Optional[int]
 
 # Decoder input tokens
 class InputToken:
@@ -192,7 +192,7 @@ class BestOfSequence:
     # Generated tokens
     tokens: List[Token]
     # Most likely tokens
-    top_tokens: Optional[List[List[Token]]]
+    top_tokens: Optional[List[List[Token]]]
 
 
 # `generate` details
@@ -236,7 +236,7 @@ class StreamResponse:
     # Generated token
     token: Token
     # Most likely tokens
-    top_tokens: Optional[List[Token]]
+    top_tokens: Optional[List[Token]]
     # Complete generated text
     # Only available when the generation is finished
     generated_text: Optional[str]
@@ -248,4 +248,4 @@ class StreamResponse:
 class DeployedModel:
     model_id: str
     sha: str
-```
\ No newline at end of file
+```
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index 3426411b..911114ee 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -134,6 +134,7 @@ class Parameters(BaseModel):
             raise ValidationError("`value` cannot be empty for `json` grammar")
         return v
 
+
 class Request(BaseModel):
     # Prompt
     inputs: str
diff --git a/docs/index.html b/docs/index.html
index 16d143d8..f582d3ce 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -27,4 +27,4 @@
     }
 
-
\ No newline at end of file
+
diff --git a/docs/openapi.json b/docs/openapi.json
index da3969df..fd7cca7b 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -1290,4 +1290,4 @@
       "description": "Hugging Face Text Generation Inference API"
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/docs/source/basic_tutorials/consuming_tgi.md b/docs/source/basic_tutorials/consuming_tgi.md
index 540f4b13..4829ec7c 100644
--- a/docs/source/basic_tutorials/consuming_tgi.md
+++ b/docs/source/basic_tutorials/consuming_tgi.md
@@ -23,7 +23,7 @@ You can simply install `huggingface-hub` package with pip.
 pip install huggingface-hub
 ```
 
-Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
+Once you start the TGI server, instantiate `InferenceClient()` with the URL to the endpoint serving the model. You can then call `text_generation()` to hit the endpoint through Python.
 
 ```python
 from huggingface_hub import InferenceClient
@@ -83,8 +83,8 @@ Gradio is a Python library that helps you build web applications for your machin
 pip install huggingface-hub gradio
 ```
 
-Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
-
+Assume you are serving your model on port 8080, we will query through [InferenceClient](consuming_tgi#inference-client).
+
 ```python
 import gradio as gr
 from huggingface_hub import InferenceClient
@@ -110,30 +110,30 @@ gr.ChatInterface(
 ).queue().launch()
 ```
 
-The UI looks like this 👇
+The UI looks like this 👇
-
-
-You can try the demo directly here 👇
+You can try the demo directly here 👇
-