Fix tokenization yi (#2507)
* Fixing odd tokenization self modifications on the Rust side (load and resave in Python). * Fixing the builds ? * Fix the gh action? * Fixing the location ? * Validation is odd. * Try a faster runner * Upgrade python version. * Remove sccache * No sccache. * Getting libpython maybe ? * List stuff. * Monkey it up. * have no idea at this point * Tmp. * Shot in the dark. * Tmate the hell out of this. * Desperation. * WTF. * -y. * Apparently 3.10 is not available anymore. * Updating the dockerfile to make libpython discoverable at runtime too. * Put back rust tests. * Why do we want mkl on AMD ? * Forcing 3.11 ?
This commit is contained in:
parent
a4e3e8c608
commit
dae3bf1d87
|
@ -17,19 +17,15 @@ concurrency:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
run_tests:
|
run_tests:
|
||||||
runs-on: ubuntu-latest
|
runs-on:
|
||||||
|
group: aws-highmemory-32-plus-priv
|
||||||
env:
|
|
||||||
SCCACHE_GHA_ENABLED: "on"
|
|
||||||
RUSTC_WRAPPER: /usr/local/bin/sccache
|
|
||||||
SCCACHE: 0.3.3
|
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v1
|
uses: actions/setup-python@v4
|
||||||
|
id: python
|
||||||
with:
|
with:
|
||||||
python-version: 3.9
|
python-version: 3.11
|
||||||
- name: Install Rust
|
- name: Install Rust
|
||||||
uses: actions-rs/toolchain@v1
|
uses: actions-rs/toolchain@v1
|
||||||
with:
|
with:
|
||||||
|
@ -44,30 +40,9 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
|
sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
|
||||||
sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
|
sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
|
||||||
- name: Install sccache
|
|
||||||
run: |
|
|
||||||
curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
|
|
||||||
chmod +x /usr/local/bin/sccache
|
|
||||||
- name: configure sccache
|
|
||||||
uses: actions/github-script@v6
|
|
||||||
with:
|
|
||||||
script: |
|
|
||||||
core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
|
|
||||||
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
|
|
||||||
core.exportVariable('SCCACHE_GHA_CACHE_TO', 'sccache-${{runner.os}}-${{github.ref_name}}');
|
|
||||||
core.exportVariable('SCCACHE_GHA_CACHE_FROM', 'sccache-${{runner.os}}-main,sccache-${{runner.os}}-');
|
|
||||||
- name: cargo registry cache
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-${{ github.sha }}
|
|
||||||
restore-keys: |
|
|
||||||
cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-
|
|
||||||
cargo-${{ runner.os }}-
|
|
||||||
path: |
|
|
||||||
~/.cargo/registry
|
|
||||||
~/.cargo/git
|
|
||||||
- name: Install
|
- name: Install
|
||||||
run: |
|
run: |
|
||||||
|
sudo apt install python3.11-dev -y
|
||||||
make install-cpu
|
make install-cpu
|
||||||
- name: Run server tests
|
- name: Run server tests
|
||||||
run: |
|
run: |
|
||||||
|
@ -82,6 +57,3 @@ jobs:
|
||||||
- name: Run Rust tests
|
- name: Run Rust tests
|
||||||
run: |
|
run: |
|
||||||
cargo test
|
cargo test
|
||||||
- name: sccache stats
|
|
||||||
run: |
|
|
||||||
/usr/local/bin/sccache --show-stats
|
|
||||||
|
|
|
@ -2118,6 +2118,15 @@ version = "2.7.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memoffset"
|
||||||
|
version = "0.9.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "metrics"
|
name = "metrics"
|
||||||
version = "0.23.0"
|
version = "0.23.0"
|
||||||
|
@ -3112,6 +3121,69 @@ dependencies = [
|
||||||
"prost 0.12.6",
|
"prost 0.12.6",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyo3"
|
||||||
|
version = "0.22.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "831e8e819a138c36e212f3af3fd9eeffed6bf1510a805af35b0edee5ffa59433"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"indoc",
|
||||||
|
"libc",
|
||||||
|
"memoffset",
|
||||||
|
"once_cell",
|
||||||
|
"portable-atomic",
|
||||||
|
"pyo3-build-config",
|
||||||
|
"pyo3-ffi",
|
||||||
|
"pyo3-macros",
|
||||||
|
"unindent",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyo3-build-config"
|
||||||
|
version = "0.22.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e8730e591b14492a8945cdff32f089250b05f5accecf74aeddf9e8272ce1fa8"
|
||||||
|
dependencies = [
|
||||||
|
"once_cell",
|
||||||
|
"target-lexicon",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyo3-ffi"
|
||||||
|
version = "0.22.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5e97e919d2df92eb88ca80a037969f44e5e70356559654962cbb3316d00300c6"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"pyo3-build-config",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyo3-macros"
|
||||||
|
version = "0.22.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "eb57983022ad41f9e683a599f2fd13c3664d7063a3ac5714cae4b7bee7d3f206"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"pyo3-macros-backend",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.76",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyo3-macros-backend"
|
||||||
|
version = "0.22.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ec480c0c51ddec81019531705acac51bcdbeae563557c982aa8263bb96880372"
|
||||||
|
dependencies = [
|
||||||
|
"heck 0.5.0",
|
||||||
|
"proc-macro2",
|
||||||
|
"pyo3-build-config",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.76",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "qoi"
|
name = "qoi"
|
||||||
version = "0.4.1"
|
version = "0.4.1"
|
||||||
|
@ -4068,7 +4140,7 @@ dependencies = [
|
||||||
"pkg-config",
|
"pkg-config",
|
||||||
"text-generation-router",
|
"text-generation-router",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers 0.19.1",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
@ -4091,7 +4163,7 @@ dependencies = [
|
||||||
"tabled",
|
"tabled",
|
||||||
"text-generation-client",
|
"text-generation-client",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers 0.20.0",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
|
@ -4161,6 +4233,7 @@ dependencies = [
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"opentelemetry 0.20.0",
|
"opentelemetry 0.20.0",
|
||||||
"opentelemetry-otlp",
|
"opentelemetry-otlp",
|
||||||
|
"pyo3",
|
||||||
"rand",
|
"rand",
|
||||||
"regex",
|
"regex",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
@ -4168,7 +4241,7 @@ dependencies = [
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"sysinfo",
|
"sysinfo",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers 0.20.0",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tower-http",
|
"tower-http",
|
||||||
|
@ -4219,7 +4292,7 @@ dependencies = [
|
||||||
"slotmap",
|
"slotmap",
|
||||||
"text-generation-router",
|
"text-generation-router",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers 0.20.0",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tonic 0.10.2",
|
"tonic 0.10.2",
|
||||||
|
@ -4374,6 +4447,39 @@ dependencies = [
|
||||||
"unicode_categories",
|
"unicode_categories",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokenizers"
|
||||||
|
version = "0.20.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"derive_builder",
|
||||||
|
"esaxx-rs",
|
||||||
|
"getrandom",
|
||||||
|
"hf-hub",
|
||||||
|
"indicatif",
|
||||||
|
"itertools 0.12.1",
|
||||||
|
"lazy_static",
|
||||||
|
"log",
|
||||||
|
"macro_rules_attribute",
|
||||||
|
"monostate",
|
||||||
|
"onig",
|
||||||
|
"paste",
|
||||||
|
"rand",
|
||||||
|
"rayon",
|
||||||
|
"rayon-cond",
|
||||||
|
"regex",
|
||||||
|
"regex-syntax 0.8.4",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"spm_precompiled",
|
||||||
|
"thiserror",
|
||||||
|
"unicode-normalization-alignments",
|
||||||
|
"unicode-segmentation",
|
||||||
|
"unicode_categories",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "1.39.3"
|
version = "1.39.3"
|
||||||
|
@ -4839,6 +4945,12 @@ version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unindent"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "untrusted"
|
name = "untrusted"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
|
|
|
@ -25,7 +25,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
|
||||||
|
|
||||||
[workspace.dependencies]
|
[workspace.dependencies]
|
||||||
base64 = "0.22.0"
|
base64 = "0.22.0"
|
||||||
tokenizers = { version = "0.19.1", features = ["http"] }
|
tokenizers = { version = "0.20.0", features = ["http"] }
|
||||||
hf-hub = { version = "0.3.1", features = ["tokio"] }
|
hf-hub = { version = "0.3.1", features = ["tokio"] }
|
||||||
metrics = { version = "0.23.0" }
|
metrics = { version = "0.23.0" }
|
||||||
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
|
metrics-exporter-prometheus = { version = "0.15.1", features = [] }
|
||||||
|
|
40
Dockerfile
40
Dockerfile
|
@ -13,10 +13,13 @@ COPY benchmark benchmark
|
||||||
COPY router router
|
COPY router router
|
||||||
COPY backends backends
|
COPY backends backends
|
||||||
COPY launcher launcher
|
COPY launcher launcher
|
||||||
|
|
||||||
RUN cargo chef prepare --recipe-path recipe.json
|
RUN cargo chef prepare --recipe-path recipe.json
|
||||||
|
|
||||||
FROM chef AS builder
|
FROM chef AS builder
|
||||||
|
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
python3.11-dev
|
||||||
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
||||||
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
||||||
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
||||||
|
@ -37,6 +40,7 @@ COPY router router
|
||||||
COPY backends backends
|
COPY backends backends
|
||||||
COPY launcher launcher
|
COPY launcher launcher
|
||||||
RUN cargo build --profile release-opt
|
RUN cargo build --profile release-opt
|
||||||
|
RUN cargo build --profile release-opt
|
||||||
|
|
||||||
# Python builder
|
# Python builder
|
||||||
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
|
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
|
||||||
|
@ -45,7 +49,7 @@ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
|
||||||
# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
|
# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
|
||||||
ARG PYTORCH_VERSION=2.4.0
|
ARG PYTORCH_VERSION=2.4.0
|
||||||
|
|
||||||
ARG PYTHON_VERSION=3.10
|
ARG PYTHON_VERSION=3.11
|
||||||
# Keep in sync with `server/pyproject.toml
|
# Keep in sync with `server/pyproject.toml
|
||||||
ARG CUDA_VERSION=12.4
|
ARG CUDA_VERSION=12.4
|
||||||
ARG MAMBA_VERSION=24.3.0-0
|
ARG MAMBA_VERSION=24.3.0-0
|
||||||
|
@ -216,33 +220,33 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
|
||||||
COPY --from=pytorch-install /opt/conda /opt/conda
|
COPY --from=pytorch-install /opt/conda /opt/conda
|
||||||
|
|
||||||
# Copy build artifacts from flash attention builder
|
# Copy build artifacts from flash attention builder
|
||||||
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Copy build artifacts from flash attention v2 builder
|
# Copy build artifacts from flash attention v2 builder
|
||||||
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
|
COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Copy build artifacts from custom kernels builder
|
# Copy build artifacts from custom kernels builder
|
||||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from exllama kernels builder
|
# Copy build artifacts from exllama kernels builder
|
||||||
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from exllamav2 kernels builder
|
# Copy build artifacts from exllamav2 kernels builder
|
||||||
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from awq kernels builder
|
# Copy build artifacts from awq kernels builder
|
||||||
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from eetq kernels builder
|
# Copy build artifacts from eetq kernels builder
|
||||||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from lorax punica kernels builder
|
# Copy build artifacts from lorax punica kernels builder
|
||||||
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from fbgemm builder
|
# Copy build artifacts from fbgemm builder
|
||||||
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
|
COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from vllm builder
|
# Copy build artifacts from vllm builder
|
||||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
# Copy build artifacts from mamba builder
|
# Copy build artifacts from mamba builder
|
||||||
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
|
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
|
||||||
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
|
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
|
||||||
COPY --from=flashinfer-builder /opt/conda/lib/python3.10/site-packages/flashinfer/ /opt/conda/lib/python3.10/site-packages/flashinfer/
|
COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
|
||||||
|
|
||||||
# Install flash-attention dependencies
|
# Install flash-attention dependencies
|
||||||
RUN pip install einops --no-cache-dir
|
RUN pip install einops --no-cache-dir
|
||||||
|
@ -257,7 +261,9 @@ RUN cd server && \
|
||||||
pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
|
pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
|
||||||
pip install nvidia-nccl-cu12==2.22.3
|
pip install nvidia-nccl-cu12==2.22.3
|
||||||
|
|
||||||
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
|
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
|
||||||
|
# Required to find libpython within the rust binaries
|
||||||
|
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
|
||||||
# This is needed because exl2 tries to load flash-attn
|
# This is needed because exl2 tries to load flash-attn
|
||||||
# And fails with our builds.
|
# And fails with our builds.
|
||||||
ENV EXLLAMA_NO_FLASH_ATTN=1
|
ENV EXLLAMA_NO_FLASH_ATTN=1
|
||||||
|
|
|
@ -17,6 +17,8 @@ RUN cargo chef prepare --recipe-path recipe.json
|
||||||
|
|
||||||
FROM chef AS builder
|
FROM chef AS builder
|
||||||
|
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
python3.11-dev
|
||||||
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
||||||
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
||||||
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
||||||
|
@ -64,14 +66,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
|
||||||
hipsolver-dev \
|
hipsolver-dev \
|
||||||
rccl-dev \
|
rccl-dev \
|
||||||
cmake \
|
cmake \
|
||||||
python3-dev && \
|
python3.11-dev && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Keep in sync with `server/pyproject.toml
|
# Keep in sync with `server/pyproject.toml
|
||||||
ARG MAMBA_VERSION=23.1.0-1
|
ARG MAMBA_VERSION=23.1.0-1
|
||||||
ARG PYTORCH_VERSION='2.3.0'
|
ARG PYTORCH_VERSION='2.3.0'
|
||||||
ARG ROCM_VERSION='6.0.2'
|
ARG ROCM_VERSION='6.0.2'
|
||||||
ARG PYTHON_VERSION='3.10.10'
|
ARG PYTHON_VERSION='3.11.10'
|
||||||
# Automatically set by buildx
|
# Automatically set by buildx
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
ENV PATH /opt/conda/bin:$PATH
|
ENV PATH /opt/conda/bin:$PATH
|
||||||
|
@ -89,10 +91,18 @@ RUN chmod +x ~/mambaforge.sh && \
|
||||||
mamba init && \
|
mamba init && \
|
||||||
rm ~/mambaforge.sh
|
rm ~/mambaforge.sh
|
||||||
|
|
||||||
|
# RUN conda install intel::mkl-static intel::mkl-include
|
||||||
|
# Install pytorch
|
||||||
|
# On arm64 we exit with an error code
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") exit 1 ;; \
|
||||||
|
*) /opt/conda/bin/conda update -y conda && \
|
||||||
|
/opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
|
||||||
|
esac && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
# Install flash-attention, torch dependencies
|
# Install flash-attention, torch dependencies
|
||||||
RUN pip install numpy einops ninja --no-cache-dir
|
RUN pip install numpy einops ninja --no-cache-dir
|
||||||
|
|
||||||
RUN conda install intel::mkl-static intel::mkl-include
|
|
||||||
RUN pip uninstall -y triton && \
|
RUN pip uninstall -y triton && \
|
||||||
git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
|
git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
|
||||||
cd triton/python && \
|
cd triton/python && \
|
||||||
|
@ -172,19 +182,19 @@ ENV HF_HOME=/data \
|
||||||
PORT=80
|
PORT=80
|
||||||
|
|
||||||
# Copy builds artifacts from vllm builder
|
# Copy builds artifacts from vllm builder
|
||||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Copy build artifacts from flash attention v2 builder
|
# Copy build artifacts from flash attention v2 builder
|
||||||
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Copy build artifacts from custom kernels builder
|
# Copy build artifacts from custom kernels builder
|
||||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Copy build artifacts from exllama kernels builder
|
# Copy build artifacts from exllama kernels builder
|
||||||
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Copy build artifacts from exllamav2 kernels builder
|
# Copy build artifacts from exllamav2 kernels builder
|
||||||
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
|
||||||
|
|
||||||
# Install server
|
# Install server
|
||||||
COPY proto proto
|
COPY proto proto
|
||||||
|
@ -201,6 +211,7 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/l
|
||||||
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
|
COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
|
||||||
# Install launcher
|
# Install launcher
|
||||||
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
|
COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
|
||||||
|
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
|
||||||
|
|
||||||
# AWS Sagemaker compatible image
|
# AWS Sagemaker compatible image
|
||||||
FROM base AS sagemaker
|
FROM base AS sagemaker
|
||||||
|
|
|
@ -18,6 +18,8 @@ RUN cargo chef prepare --recipe-path recipe.json
|
||||||
|
|
||||||
FROM chef AS builder
|
FROM chef AS builder
|
||||||
|
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
python3.11-dev
|
||||||
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
||||||
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
||||||
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
||||||
|
@ -114,7 +116,7 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
|
||||||
PORT=80
|
PORT=80
|
||||||
|
|
||||||
ARG MAMBA_VERSION=23.1.0-1
|
ARG MAMBA_VERSION=23.1.0-1
|
||||||
ARG PYTHON_VERSION='3.10.10'
|
ARG PYTHON_VERSION='3.11.10'
|
||||||
# Automatically set by buildx
|
# Automatically set by buildx
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
ENV PATH /opt/conda/bin:$PATH
|
ENV PATH /opt/conda/bin:$PATH
|
||||||
|
@ -153,6 +155,7 @@ ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
|
||||||
ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
|
ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
|
||||||
ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
|
ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
|
||||||
ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
|
ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
|
||||||
|
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
|
||||||
|
|
||||||
# Install server
|
# Install server
|
||||||
COPY proto proto
|
COPY proto proto
|
||||||
|
|
|
@ -357,6 +357,7 @@ impl State {
|
||||||
let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) =
|
let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) =
|
||||||
(block_allocation, &self.block_allocator)
|
(block_allocation, &self.block_allocator)
|
||||||
{
|
{
|
||||||
|
tracing::debug!("Allocating {tokens} with {input_ids:?}");
|
||||||
match block_allocator.allocate(tokens, input_ids).await {
|
match block_allocator.allocate(tokens, input_ids).await {
|
||||||
None => {
|
None => {
|
||||||
// Entry is over budget
|
// Entry is over budget
|
||||||
|
|
|
@ -61,6 +61,7 @@ uuid = { version = "1.9.1", default-features = false, features = [
|
||||||
] }
|
] }
|
||||||
csv = "1.3.0"
|
csv = "1.3.0"
|
||||||
ureq = "=2.9"
|
ureq = "=2.9"
|
||||||
|
pyo3 = { version = "0.22.2", features = ["auto-initialize"] }
|
||||||
|
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
|
|
|
@ -41,6 +41,7 @@ use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
|
||||||
use hf_hub::{Cache, Repo, RepoType};
|
use hf_hub::{Cache, Repo, RepoType};
|
||||||
use http::header::AUTHORIZATION;
|
use http::header::AUTHORIZATION;
|
||||||
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
|
use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
|
||||||
|
use pyo3::types::IntoPyDict;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use std::convert::Infallible;
|
use std::convert::Infallible;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
@ -48,7 +49,6 @@ use std::io::BufReader;
|
||||||
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
|
use std::net::{IpAddr, Ipv4Addr, SocketAddr};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokenizers::processors::template::TemplateProcessing;
|
|
||||||
use tokenizers::Tokenizer;
|
use tokenizers::Tokenizer;
|
||||||
use tokio::select;
|
use tokio::select;
|
||||||
use tokio::signal;
|
use tokio::signal;
|
||||||
|
@ -1860,18 +1860,34 @@ pub async fn run(
|
||||||
});
|
});
|
||||||
|
|
||||||
let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
|
let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
|
||||||
let mut tokenizer = Tokenizer::from_file(filename).ok();
|
use pyo3::prelude::*;
|
||||||
if let Some(tokenizer) = &mut tokenizer {
|
let convert = pyo3::Python::with_gil(|py| -> PyResult<()> {
|
||||||
if let Some(class) = &tokenizer_config.tokenizer_class {
|
let transformers = py.import_bound("transformers")?;
|
||||||
if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{
|
let auto = transformers.getattr("AutoTokenizer")?;
|
||||||
if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) {
|
let from_pretrained = auto.getattr("from_pretrained")?;
|
||||||
tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205");
|
let args = (tokenizer_name.to_string(),);
|
||||||
tokenizer.with_post_processor(post_processor);
|
let kwargs = [(
|
||||||
}
|
"revision",
|
||||||
}
|
revision.clone().unwrap_or_else(|| "main".to_string()),
|
||||||
}
|
)]
|
||||||
}
|
.into_py_dict_bound(py);
|
||||||
tokenizer
|
let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
|
||||||
|
let save = tokenizer.getattr("save_pretrained")?;
|
||||||
|
let args = ("out".to_string(),);
|
||||||
|
save.call1(args)?;
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.inspect_err(|err| {
|
||||||
|
tracing::error!("Failed to import python tokenizer {err}");
|
||||||
|
});
|
||||||
|
let filename = if convert.is_ok() {
|
||||||
|
// If we have correctly loaded and resaved with transformers
|
||||||
|
// We might have modified the tokenizer.json according to transformers
|
||||||
|
"out/tokenizer.json".into()
|
||||||
|
} else {
|
||||||
|
filename
|
||||||
|
};
|
||||||
|
Tokenizer::from_file(filename).ok()
|
||||||
});
|
});
|
||||||
|
|
||||||
let config: Option<Config> = config_filename.and_then(|filename| {
|
let config: Option<Config> = config_filename.and_then(|filename| {
|
||||||
|
@ -2591,77 +2607,6 @@ pub enum WebServerError {
|
||||||
Axum(#[from] axum::BoxError),
|
Axum(#[from] axum::BoxError),
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a post_processor for the LlamaTokenizer
|
|
||||||
fn create_post_processor(
|
|
||||||
tokenizer: &Tokenizer,
|
|
||||||
tokenizer_config: &HubTokenizerConfig,
|
|
||||||
) -> Result<TemplateProcessing, tokenizers::processors::template::TemplateProcessingBuilderError> {
|
|
||||||
let add_bos_token = tokenizer_config.add_bos_token.unwrap_or(true);
|
|
||||||
let add_eos_token = tokenizer_config.add_eos_token.unwrap_or(false);
|
|
||||||
|
|
||||||
let bos_token = tokenizer_config.bos_token.as_ref();
|
|
||||||
let eos_token = tokenizer_config.eos_token.as_ref();
|
|
||||||
|
|
||||||
if add_bos_token && bos_token.is_none() {
|
|
||||||
panic!("add_bos_token = true but bos_token is None");
|
|
||||||
}
|
|
||||||
|
|
||||||
if add_eos_token && eos_token.is_none() {
|
|
||||||
panic!("add_eos_token = true but eos_token is None");
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut single = Vec::new();
|
|
||||||
let mut pair = Vec::new();
|
|
||||||
let mut special_tokens = Vec::new();
|
|
||||||
|
|
||||||
if add_bos_token {
|
|
||||||
if let Some(bos) = bos_token {
|
|
||||||
let bos_token_id = tokenizer
|
|
||||||
.token_to_id(bos.as_str())
|
|
||||||
.expect("Should have found the bos token id");
|
|
||||||
special_tokens.push((bos.as_str(), bos_token_id));
|
|
||||||
single.push(format!("{}:0", bos.as_str()));
|
|
||||||
pair.push(format!("{}:0", bos.as_str()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
single.push("$A:0".to_string());
|
|
||||||
pair.push("$A:0".to_string());
|
|
||||||
|
|
||||||
if add_eos_token {
|
|
||||||
if let Some(eos) = eos_token {
|
|
||||||
let eos_token_id = tokenizer
|
|
||||||
.token_to_id(eos.as_str())
|
|
||||||
.expect("Should have found the eos token id");
|
|
||||||
special_tokens.push((eos.as_str(), eos_token_id));
|
|
||||||
single.push(format!("{}:0", eos.as_str()));
|
|
||||||
pair.push(format!("{}:0", eos.as_str()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if add_bos_token {
|
|
||||||
if let Some(bos) = bos_token {
|
|
||||||
pair.push(format!("{}:1", bos.as_str()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pair.push("$B:1".to_string());
|
|
||||||
|
|
||||||
if add_eos_token {
|
|
||||||
if let Some(eos) = eos_token {
|
|
||||||
pair.push(format!("{}:1", eos.as_str()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let post_processor = TemplateProcessing::builder()
|
|
||||||
.try_single(single)?
|
|
||||||
.try_pair(pair)?
|
|
||||||
.special_tokens(special_tokens)
|
|
||||||
.build()?;
|
|
||||||
|
|
||||||
Ok(post_processor)
|
|
||||||
}
|
|
||||||
|
|
||||||
type PreparedInput = (String, Option<GrammarType>, bool);
|
type PreparedInput = (String, Option<GrammarType>, bool);
|
||||||
|
|
||||||
fn prepare_chat_input(
|
fn prepare_chat_input(
|
||||||
|
|
|
@ -267,7 +267,7 @@ def test_batch_concatenate(
|
||||||
assert next_batch.max_input_length == 3
|
assert next_batch.max_input_length == 3
|
||||||
|
|
||||||
assert next_batch.requests[0] == next_batch_0.requests[0]
|
assert next_batch.requests[0] == next_batch_0.requests[0]
|
||||||
assert next_batch.requests[1:] == next_batch_1.requests
|
assert next_batch.requests[1:] == list(next_batch_1.requests)
|
||||||
|
|
||||||
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
|
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
|
||||||
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
|
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
|
||||||
|
|
|
@ -262,7 +262,7 @@ def test_batch_concatenate(
|
||||||
assert next_batch.max_input_length == 3
|
assert next_batch.max_input_length == 3
|
||||||
|
|
||||||
assert next_batch.requests[0] == next_batch_0.requests[0]
|
assert next_batch.requests[0] == next_batch_0.requests[0]
|
||||||
assert next_batch.requests[1:] == next_batch_1.requests
|
assert next_batch.requests[1:] == list(next_batch_1.requests)
|
||||||
|
|
||||||
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
|
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
|
||||||
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
|
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
|
||||||
|
|
|
@ -281,7 +281,7 @@ def test_batch_concatenate(
|
||||||
assert next_batch.max_decoder_input_length == 3
|
assert next_batch.max_decoder_input_length == 3
|
||||||
|
|
||||||
assert next_batch.requests[0] == next_batch_0.requests[0]
|
assert next_batch.requests[0] == next_batch_0.requests[0]
|
||||||
assert next_batch.requests[1:] == next_batch_1.requests
|
assert next_batch.requests[1:] == list(next_batch_1.requests)
|
||||||
|
|
||||||
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
|
assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
|
||||||
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
|
assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
|
||||||
|
|
|
@ -272,6 +272,8 @@ class FlashCausalLMBatch(Batch):
|
||||||
assert prefix_len > 0
|
assert prefix_len > 0
|
||||||
prefix_len -= 1
|
prefix_len -= 1
|
||||||
|
|
||||||
|
# Commented as it's costly.
|
||||||
|
# log_master(logger.debug, "Tokenized input ids {tokenized_input}")
|
||||||
prefix_ids.append(tokenized_input[:prefix_len])
|
prefix_ids.append(tokenized_input[:prefix_len])
|
||||||
tokenized_input = tokenized_input[prefix_len:]
|
tokenized_input = tokenized_input[prefix_len:]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue