From dae3bf1d87c13f1186eead4afbfb4a29e4a59f84 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 11 Sep 2024 22:41:56 +0200 Subject: [PATCH] Fix tokenization yi (#2507) * Fixing odd tokenization self modifications on the Rust side (load and resave in Python). * Fixing the builds ? * Fix the gh action? * Fixing the location ? * Validation is odd. * Try a faster runner * Upgrade python version. * Remove sccache * No sccache. * Getting libpython maybe ? * List stuff. * Monkey it up. * have no idea at this point * Tmp. * Shot in the dark. * Tmate the hell out of this. * Desperation. * WTF. * -y. * Apparently 3.10 is not available anymore. * Updating the dockerfile to make libpython discoverable at runtime too. * Put back rust tests. * Why do we want mkl on AMD ? * Forcing 3.11 ? --- .github/workflows/tests.yaml | 40 +----- Cargo.lock | 120 +++++++++++++++++- Cargo.toml | 2 +- Dockerfile | 40 +++--- Dockerfile_amd | 27 ++-- Dockerfile_intel | 5 +- backends/v3/src/queue.rs | 1 + router/Cargo.toml | 1 + router/src/server.rs | 113 +++++------------ server/tests/models/test_bloom.py | 2 +- server/tests/models/test_causal_lm.py | 2 +- server/tests/models/test_seq2seq_lm.py | 2 +- .../models/flash_causal_lm.py | 2 + 13 files changed, 205 insertions(+), 152 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 6faabe3b..5ad0fd6a 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -17,19 +17,15 @@ concurrency: jobs: run_tests: - runs-on: ubuntu-latest - - env: - SCCACHE_GHA_ENABLED: "on" - RUSTC_WRAPPER: /usr/local/bin/sccache - SCCACHE: 0.3.3 - + runs-on: + group: aws-highmemory-32-plus-priv steps: - uses: actions/checkout@v2 - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 + id: python with: - python-version: 3.9 + python-version: 3.11 - name: Install Rust uses: actions-rs/toolchain@v1 with: @@ -44,30 +40,9 @@ jobs: run: | sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET - - name: Install sccache - run: | - curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache - chmod +x /usr/local/bin/sccache - - name: configure sccache - uses: actions/github-script@v6 - with: - script: | - core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); - core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - core.exportVariable('SCCACHE_GHA_CACHE_TO', 'sccache-${{runner.os}}-${{github.ref_name}}'); - core.exportVariable('SCCACHE_GHA_CACHE_FROM', 'sccache-${{runner.os}}-main,sccache-${{runner.os}}-'); - - name: cargo registry cache - uses: actions/cache@v3 - with: - key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-${{ github.sha }} - restore-keys: | - cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}- - cargo-${{ runner.os }}- - path: | - ~/.cargo/registry - ~/.cargo/git - name: Install run: | + sudo apt install python3.11-dev -y make install-cpu - name: Run server tests run: | @@ -82,6 +57,3 @@ jobs: - name: Run Rust tests run: | cargo test - - name: sccache stats - run: | - /usr/local/bin/sccache --show-stats diff --git a/Cargo.lock b/Cargo.lock index 00c7f005..06a61853 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2118,6 +2118,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + [[package]] name = "metrics" version = "0.23.0" @@ -3112,6 +3121,69 @@ dependencies = [ "prost 0.12.6", ] +[[package]] +name = "pyo3" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "831e8e819a138c36e212f3af3fd9eeffed6bf1510a805af35b0edee5ffa59433" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e8730e591b14492a8945cdff32f089250b05f5accecf74aeddf9e8272ce1fa8" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e97e919d2df92eb88ca80a037969f44e5e70356559654962cbb3316d00300c6" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb57983022ad41f9e683a599f2fd13c3664d7063a3ac5714cae4b7bee7d3f206" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn 2.0.76", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec480c0c51ddec81019531705acac51bcdbeae563557c982aa8263bb96880372" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 2.0.76", +] + [[package]] name = "qoi" version = "0.4.1" @@ -4068,7 +4140,7 @@ dependencies = [ "pkg-config", "text-generation-router", "thiserror", - "tokenizers", + "tokenizers 0.19.1", "tokio", "tokio-stream", "tracing", @@ -4091,7 +4163,7 @@ dependencies = [ "tabled", "text-generation-client", "thiserror", - "tokenizers", + "tokenizers 0.20.0", "tokio", "tracing", "tracing-subscriber", @@ -4161,6 +4233,7 @@ dependencies = [ "once_cell", "opentelemetry 0.20.0", "opentelemetry-otlp", + "pyo3", "rand", "regex", "reqwest", @@ -4168,7 +4241,7 @@ dependencies = [ "serde_json", "sysinfo", "thiserror", - "tokenizers", + "tokenizers 0.20.0", "tokio", "tokio-stream", "tower-http", @@ -4219,7 +4292,7 @@ dependencies = [ "slotmap", "text-generation-router", "thiserror", - "tokenizers", + "tokenizers 0.20.0", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4374,6 +4447,39 @@ dependencies = [ "unicode_categories", ] +[[package]] +name = "tokenizers" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70" +dependencies = [ + "aho-corasick", + "derive_builder", + "esaxx-rs", + "getrandom", + "hf-hub", + "indicatif", + "itertools 0.12.1", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax 0.8.4", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.39.3" @@ -4839,6 +4945,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unindent" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + [[package]] name = "untrusted" version = "0.7.1" diff --git a/Cargo.toml b/Cargo.toml index 79fda15d..a50bba24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,7 @@ homepage = "https://github.com/huggingface/text-generation-inference" [workspace.dependencies] base64 = "0.22.0" -tokenizers = { version = "0.19.1", features = ["http"] } +tokenizers = { version = "0.20.0", features = ["http"] } hf-hub = { version = "0.3.1", features = ["tokio"] } metrics = { version = "0.23.0" } metrics-exporter-prometheus = { version = "0.15.1", features = [] } diff --git a/Dockerfile b/Dockerfile index 0d0e89b1..37ced781 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,10 +13,13 @@ COPY benchmark benchmark COPY router router COPY backends backends COPY launcher launcher + RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3.11-dev RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ @@ -37,6 +40,7 @@ COPY router router COPY backends backends COPY launcher launcher RUN cargo build --profile release-opt +RUN cargo build --profile release-opt # Python builder # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile @@ -45,7 +49,7 @@ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099 ARG PYTORCH_VERSION=2.4.0 -ARG PYTHON_VERSION=3.10 +ARG PYTHON_VERSION=3.11 # Keep in sync with `server/pyproject.toml ARG CUDA_VERSION=12.4 ARG MAMBA_VERSION=24.3.0-0 @@ -216,33 +220,33 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins COPY --from=pytorch-install /opt/conda /opt/conda # Copy build artifacts from flash attention builder -COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages +COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from flash attention v2 builder -COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages +COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages # Copy build artifacts from custom kernels builder -COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from awq kernels builder -COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from eetq kernels builder -COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from lorax punica kernels builder -COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from fbgemm builder -COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages +COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages # Copy build artifacts from vllm builder -COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from mamba builder -COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages -COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages -COPY --from=flashinfer-builder /opt/conda/lib/python3.10/site-packages/flashinfer/ /opt/conda/lib/python3.10/site-packages/flashinfer/ +COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages +COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages +COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/ # Install flash-attention dependencies RUN pip install einops --no-cache-dir @@ -257,7 +261,9 @@ RUN cd server && \ pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \ pip install nvidia-nccl-cu12==2.22.3 -ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2 +ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2 +# Required to find libpython within the rust binaries +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" # This is needed because exl2 tries to load flash-attn # And fails with our builds. ENV EXLLAMA_NO_FLASH_ATTN=1 diff --git a/Dockerfile_amd b/Dockerfile_amd index 8cb699dd..a79aae48 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -17,6 +17,8 @@ RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3.11-dev RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ @@ -64,14 +66,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins hipsolver-dev \ rccl-dev \ cmake \ - python3-dev && \ + python3.11-dev && \ rm -rf /var/lib/apt/lists/* # Keep in sync with `server/pyproject.toml ARG MAMBA_VERSION=23.1.0-1 ARG PYTORCH_VERSION='2.3.0' ARG ROCM_VERSION='6.0.2' -ARG PYTHON_VERSION='3.10.10' +ARG PYTHON_VERSION='3.11.10' # Automatically set by buildx ARG TARGETPLATFORM ENV PATH /opt/conda/bin:$PATH @@ -89,10 +91,18 @@ RUN chmod +x ~/mambaforge.sh && \ mamba init && \ rm ~/mambaforge.sh +# RUN conda install intel::mkl-static intel::mkl-include +# Install pytorch +# On arm64 we exit with an error code +RUN case ${TARGETPLATFORM} in \ + "linux/arm64") exit 1 ;; \ + *) /opt/conda/bin/conda update -y conda && \ + /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \ + esac && \ + /opt/conda/bin/conda clean -ya # Install flash-attention, torch dependencies RUN pip install numpy einops ninja --no-cache-dir -RUN conda install intel::mkl-static intel::mkl-include RUN pip uninstall -y triton && \ git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \ cd triton/python && \ @@ -172,19 +182,19 @@ ENV HF_HOME=/data \ PORT=80 # Copy builds artifacts from vllm builder -COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from flash attention v2 builder -COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from custom kernels builder -COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages # Install server COPY proto proto @@ -201,6 +211,7 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/l COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router # Install launcher COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" # AWS Sagemaker compatible image FROM base AS sagemaker diff --git a/Dockerfile_intel b/Dockerfile_intel index 0cda4d4b..d7ac09af 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -18,6 +18,8 @@ RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3.11-dev RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ @@ -114,7 +116,7 @@ ENV HUGGINGFACE_HUB_CACHE=/data \ PORT=80 ARG MAMBA_VERSION=23.1.0-1 -ARG PYTHON_VERSION='3.10.10' +ARG PYTHON_VERSION='3.11.10' # Automatically set by buildx ARG TARGETPLATFORM ENV PATH /opt/conda/bin:$PATH @@ -153,6 +155,7 @@ ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/" # Install server COPY proto proto diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index 978a495c..2bbb6753 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -357,6 +357,7 @@ impl State { let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) = (block_allocation, &self.block_allocator) { + tracing::debug!("Allocating {tokens} with {input_ids:?}"); match block_allocator.allocate(tokens, input_ids).await { None => { // Entry is over budget diff --git a/router/Cargo.toml b/router/Cargo.toml index 5c328e8a..6a752db6 100644 --- a/router/Cargo.toml +++ b/router/Cargo.toml @@ -61,6 +61,7 @@ uuid = { version = "1.9.1", default-features = false, features = [ ] } csv = "1.3.0" ureq = "=2.9" +pyo3 = { version = "0.22.2", features = ["auto-initialize"] } [build-dependencies] diff --git a/router/src/server.rs b/router/src/server.rs index 6a04ab00..8bd49b93 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -41,6 +41,7 @@ use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo}; use hf_hub::{Cache, Repo, RepoType}; use http::header::AUTHORIZATION; use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle}; +use pyo3::types::IntoPyDict; use serde_json::Value; use std::convert::Infallible; use std::fs::File; @@ -48,7 +49,6 @@ use std::io::BufReader; use std::net::{IpAddr, Ipv4Addr, SocketAddr}; use std::path::{Path, PathBuf}; use thiserror::Error; -use tokenizers::processors::template::TemplateProcessing; use tokenizers::Tokenizer; use tokio::select; use tokio::signal; @@ -1860,18 +1860,34 @@ pub async fn run( }); let tokenizer: Option = tokenizer_filename.and_then(|filename| { - let mut tokenizer = Tokenizer::from_file(filename).ok(); - if let Some(tokenizer) = &mut tokenizer { - if let Some(class) = &tokenizer_config.tokenizer_class { - if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{ - if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) { - tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205"); - tokenizer.with_post_processor(post_processor); - } - } - } - } - tokenizer + use pyo3::prelude::*; + let convert = pyo3::Python::with_gil(|py| -> PyResult<()> { + let transformers = py.import_bound("transformers")?; + let auto = transformers.getattr("AutoTokenizer")?; + let from_pretrained = auto.getattr("from_pretrained")?; + let args = (tokenizer_name.to_string(),); + let kwargs = [( + "revision", + revision.clone().unwrap_or_else(|| "main".to_string()), + )] + .into_py_dict_bound(py); + let tokenizer = from_pretrained.call(args, Some(&kwargs))?; + let save = tokenizer.getattr("save_pretrained")?; + let args = ("out".to_string(),); + save.call1(args)?; + Ok(()) + }) + .inspect_err(|err| { + tracing::error!("Failed to import python tokenizer {err}"); + }); + let filename = if convert.is_ok() { + // If we have correctly loaded and resaved with transformers + // We might have modified the tokenizer.json according to transformers + "out/tokenizer.json".into() + } else { + filename + }; + Tokenizer::from_file(filename).ok() }); let config: Option = config_filename.and_then(|filename| { @@ -2591,77 +2607,6 @@ pub enum WebServerError { Axum(#[from] axum::BoxError), } -/// Create a post_processor for the LlamaTokenizer -fn create_post_processor( - tokenizer: &Tokenizer, - tokenizer_config: &HubTokenizerConfig, -) -> Result { - let add_bos_token = tokenizer_config.add_bos_token.unwrap_or(true); - let add_eos_token = tokenizer_config.add_eos_token.unwrap_or(false); - - let bos_token = tokenizer_config.bos_token.as_ref(); - let eos_token = tokenizer_config.eos_token.as_ref(); - - if add_bos_token && bos_token.is_none() { - panic!("add_bos_token = true but bos_token is None"); - } - - if add_eos_token && eos_token.is_none() { - panic!("add_eos_token = true but eos_token is None"); - } - - let mut single = Vec::new(); - let mut pair = Vec::new(); - let mut special_tokens = Vec::new(); - - if add_bos_token { - if let Some(bos) = bos_token { - let bos_token_id = tokenizer - .token_to_id(bos.as_str()) - .expect("Should have found the bos token id"); - special_tokens.push((bos.as_str(), bos_token_id)); - single.push(format!("{}:0", bos.as_str())); - pair.push(format!("{}:0", bos.as_str())); - } - } - - single.push("$A:0".to_string()); - pair.push("$A:0".to_string()); - - if add_eos_token { - if let Some(eos) = eos_token { - let eos_token_id = tokenizer - .token_to_id(eos.as_str()) - .expect("Should have found the eos token id"); - special_tokens.push((eos.as_str(), eos_token_id)); - single.push(format!("{}:0", eos.as_str())); - pair.push(format!("{}:0", eos.as_str())); - } - } - - if add_bos_token { - if let Some(bos) = bos_token { - pair.push(format!("{}:1", bos.as_str())); - } - } - - pair.push("$B:1".to_string()); - - if add_eos_token { - if let Some(eos) = eos_token { - pair.push(format!("{}:1", eos.as_str())); - } - } - - let post_processor = TemplateProcessing::builder() - .try_single(single)? - .try_pair(pair)? - .special_tokens(special_tokens) - .build()?; - - Ok(post_processor) -} - type PreparedInput = (String, Option, bool); fn prepare_chat_input( diff --git a/server/tests/models/test_bloom.py b/server/tests/models/test_bloom.py index 08292920..dff5239b 100644 --- a/server/tests/models/test_bloom.py +++ b/server/tests/models/test_bloom.py @@ -267,7 +267,7 @@ def test_batch_concatenate( assert next_batch.max_input_length == 3 assert next_batch.requests[0] == next_batch_0.requests[0] - assert next_batch.requests[1:] == next_batch_1.requests + assert next_batch.requests[1:] == list(next_batch_1.requests) assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers diff --git a/server/tests/models/test_causal_lm.py b/server/tests/models/test_causal_lm.py index c000ef26..11ca5efe 100644 --- a/server/tests/models/test_causal_lm.py +++ b/server/tests/models/test_causal_lm.py @@ -262,7 +262,7 @@ def test_batch_concatenate( assert next_batch.max_input_length == 3 assert next_batch.requests[0] == next_batch_0.requests[0] - assert next_batch.requests[1:] == next_batch_1.requests + assert next_batch.requests[1:] == list(next_batch_1.requests) assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers diff --git a/server/tests/models/test_seq2seq_lm.py b/server/tests/models/test_seq2seq_lm.py index 02666042..5ba7c64c 100644 --- a/server/tests/models/test_seq2seq_lm.py +++ b/server/tests/models/test_seq2seq_lm.py @@ -281,7 +281,7 @@ def test_batch_concatenate( assert next_batch.max_decoder_input_length == 3 assert next_batch.requests[0] == next_batch_0.requests[0] - assert next_batch.requests[1:] == next_batch_1.requests + assert next_batch.requests[1:] == list(next_batch_1.requests) assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0] assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 65180499..834056aa 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -272,6 +272,8 @@ class FlashCausalLMBatch(Batch): assert prefix_len > 0 prefix_len -= 1 + # Commented as it's costly. + # log_master(logger.debug, "Tokenized input ids {tokenized_input}") prefix_ids.append(tokenized_input[:prefix_len]) tokenized_input = tokenized_input[prefix_len:]