Putting back the NCCL forced upgrade. (#2999)
* Putting back the NCCL forced upgrade.
* .
* ...
* Ignoring conda.
* Dropping conda from the build system + torch 2.6.
* Cache min.
* Rolling back torch version.
* Reverting the EETQ modification.
* Fix flash attention ?
* Actually stay on flash v1.
* Patching flash v1.
* Torch 2.6, fork of rotary, eetq updated.
* Put back nccl latest (override torch).
* Slightly more reproducible build and not as scary.
parent 8a211dc7fc
commit d6881c37ab
@@ -202,7 +202,7 @@ jobs:
           target: ${{ env.TARGET }}
           tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
-          cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
-          cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=max
+          cache-from: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=max,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
+          cache-to: type=s3,region=us-east-1,bucket=ci-docker-buildx-cache,name=text-generation-inference-cache${{ env.LABEL }},mode=min,access_key_id=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_ACCESS_KEY_ID }},secret_access_key=${{ secrets.S3_CI_DOCKER_BUILDX_CACHE_SECRET_ACCESS_KEY }},mode=min
       - name: Final
         id: final
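For context on the mode flags above: with Buildx cache exports, mode=min records only the layers of the final image, while mode=max also records the intermediate layers of a multi-stage build, which is far heavier to upload to S3. A minimal sketch of the same S3 cache backend used locally (bucket and tag names here are hypothetical, and AWS credentials are assumed to be in the environment):

$ docker buildx build \
    --cache-from type=s3,region=us-east-1,bucket=my-cache-bucket,name=tgi-cache \
    --cache-to type=s3,region=us-east-1,bucket=my-cache-bucket,name=tgi-cache,mode=min \
    -t tgi:dev .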
Dockerfile (110 changed lines)
@@ -45,21 +45,16 @@ RUN cargo build --profile release-opt --frozen

 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
 FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
 WORKDIR /usr/src/

-# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
-ARG PYTORCH_VERSION=2.5.1
+ARG PYTORCH_VERSION=2.6
 ARG PYTHON_VERSION=3.11

 # Keep in sync with `server/pyproject.toml
 ARG CUDA_VERSION=12.4
-ARG MAMBA_VERSION=24.3.0-0
-ARG CUDA_CHANNEL=nvidia
-ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
 ARG TARGETPLATFORM

-ENV PATH=/opt/conda/bin:$PATH

 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     build-essential \
     ca-certificates \
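A note on the loosened version pin: under PEP 440, `torch==2.6` is still an exact match, since 2.6 normalizes to 2.6.0; it is not a prefix match, which would need a `.*` suffix. So the build stays deterministic despite dropping the patch digit. For example:

$ uv pip install 'torch==2.6'     # normalizes to 2.6.0 under PEP 440
$ uv pip install 'torch==2.6.*'   # would float across 2.6.x patch releases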
@@ -67,26 +62,12 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     curl \
     git && \
     rm -rf /var/lib/apt/lists/*

-# Install conda
-# translating Docker's TARGETPLATFORM into mamba arches
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
-    *) MAMBA_ARCH=x86_64 ;; \
-    esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    rm ~/mambaforge.sh
-
-# Install pytorch
-# On arm64 we exit with an error code
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") exit 1 ;; \
-    *) /opt/conda/bin/conda update -y conda && \
-    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
-    esac && \
-    /opt/conda/bin/conda clean -ya
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+ENV PATH="$PATH:/root/.local/bin"
+RUN uv python install ${PYTHON_VERSION}
+RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} pip setuptools packaging
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"

 # CUDA kernels builder image
 FROM pytorch-install AS kernel-builder
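The replacement bootstrap, condensed: uv is copied in from its distroless image, installs a managed CPython, creates a virtualenv under the working directory (uv venv defaults to ./.venv, which is why VIRTUAL_ENV becomes /usr/src/.venv/), and pins torch into it. A rough sketch of what those layers execute, using the versions from the diff:

$ uv python install 3.11
$ uv venv --python 3.11                 # creates ./.venv in the current directory
$ uv pip install torch==2.6 pip setuptools packaging
$ .venv/bin/python -c 'import torch; print(torch.__version__)'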
@@ -106,7 +87,7 @@ WORKDIR /usr/src
 COPY server/Makefile-flash-att Makefile

 # Build specific version of flash attention
-RUN make build-flash-attention
+RUN . .venv/bin/activate && make build-flash-attention

 # Build Flash Attention v2 CUDA kernels
 FROM kernel-builder AS flash-att-v2-builder
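The `. .venv/bin/activate &&` prefix repeats across every kernel-builder stage below because each Dockerfile RUN starts a fresh shell; without activation, `python` and `make` would not resolve to the virtualenv that holds torch. In shell terms the prefix is just:

$ . .venv/bin/activate && python setup.py build   # venv's python first on PATH
$ /usr/src/.venv/bin/python setup.py build        # effectively the same, with the interpreter spelled out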
@@ -116,14 +97,14 @@ WORKDIR /usr/src
 COPY server/Makefile-flash-att-v2 Makefile

 # Build specific version of flash attention v2
-RUN make build-flash-attention-v2-cuda
+RUN . .venv/bin/activate && make build-flash-attention-v2-cuda

 # Build Transformers exllama kernels
 FROM kernel-builder AS exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .

-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build

 # Build Transformers exllama kernels
 FROM kernel-builder AS exllamav2-kernels-builder
@@ -131,54 +112,50 @@ WORKDIR /usr/src
 COPY server/Makefile-exllamav2/ Makefile

 # Build specific version of transformers
-RUN make build-exllamav2
+RUN . .venv/bin/activate && make build-exllamav2

 # Build Transformers awq kernels
 FROM kernel-builder AS awq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
 # Build specific version of transformers
-RUN make build-awq
+RUN . .venv/bin/activate && make build-awq

 # Build eetq kernels
 FROM kernel-builder AS eetq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-eetq Makefile
 # Build specific version of transformers
-RUN make build-eetq
+RUN . .venv/bin/activate && make build-eetq

 # Build Lorax Punica kernels
 FROM kernel-builder AS lorax-punica-builder
 WORKDIR /usr/src
 COPY server/Makefile-lorax-punica Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica

 # Build Transformers CUDA kernels
 FROM kernel-builder AS custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
 # Build specific version of transformers
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build

 # Build mamba kernels
 FROM kernel-builder AS mamba-builder
 WORKDIR /usr/src
 COPY server/Makefile-selective-scan Makefile
-RUN make build-all
+RUN . .venv/bin/activate && make build-all

 # Build flashinfer
 FROM kernel-builder AS flashinfer-builder
 WORKDIR /usr/src
 COPY server/Makefile-flashinfer Makefile
-RUN make install-flashinfer
+RUN . .venv/bin/activate && make install-flashinfer

 # Text Generation Inference base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS base
-
-# Conda env
-ENV PATH=/opt/conda/bin:$PATH \
-    CONDA_PREFIX=/opt/conda
+FROM nvidia/cuda:12.4.0-base-ubuntu22.04 AS base

 # Text Generation Inference base env
 ENV HF_HOME=/data \
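The runtime stage also moves from the CUDA 12.1 base image to 12.4, matching the 12.4.1-devel image used by the builders. The devel/base split is what keeps the final image small: devel flavors ship nvcc and headers for compiling the kernels, base flavors ship only the runtime essentials. A quick way to see the difference, assuming Docker and these NVIDIA images are available locally:

$ docker run --rm nvidia/cuda:12.4.1-devel-ubuntu22.04 nvcc --version   # compiler present
$ docker run --rm nvidia/cuda:12.4.0-base-ubuntu22.04 nvcc --version    # fails: base images carry no toolchain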
@@ -195,60 +172,62 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     git \
     && rm -rf /var/lib/apt/lists/*

+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="$PATH:/root/.local/bin"
 # Install flash-attention dependencies
 # RUN pip install einops --no-cache-dir

-# Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
+# Copy env with PyTorch installed
+COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv
+ENV PYTHON_VERSION=3.11
+RUN uv python install ${PYTHON_VERSION}
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"

 # Install server
 COPY proto proto
 COPY server server
 COPY server/Makefile server/Makefile
-ENV UV_SYSTEM_PYTHON=1
+ENV HF_KERNELS_CACHE=/kernels
 RUN cd server && \
-    pip install -U pip uv && \
-    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project && \
-    . ./.venv/bin/activate && \
+    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --no-install-project --active && \
     make gen-server-raw && \
     hf-kernels download .

 RUN cd server && \
-    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines && \
-    . ./.venv/bin/activate && \
+    uv sync --frozen --extra gen --extra attention --extra bnb --extra accelerate --extra compressed-tensors --extra marlin --extra moe --extra quantize --extra peft --extra outlines --active --python=${PYTHON_VERSION} && \
+    uv pip install nvidia-nccl-cu12==2.25.1 && \
     pwd && \
     text-generation-server --help

 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages

 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-v2-builder /usr/src/.venv/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /usr/src/.venv/lib/python3.11/site-packages

 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/server/.venv/lib/python3.11/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/server/.venv/lib/python3.11/site-packages
-COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /usr/src/server/.venv/lib/python3.11/site-packages/flashinfer/
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /usr/src/.venv/lib/python3.11/site-packages/flashinfer/ /usr/src/.venv/lib/python3.11/site-packages/flashinfer/


-# ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
 # Required to find libpython within the rust binaries
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 # This is needed because exl2 tries to load flash-attn
 # And fails with our builds.
 ENV EXLLAMA_NO_FLASH_ATTN=1
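The change the commit is named for is the `uv pip install nvidia-nccl-cu12==2.25.1` line: running it after `uv sync` overrides whatever nvidia-nccl-cu12 wheel the torch 2.6 dependency tree resolves ("Put back nccl latest (override torch)", per the commit message). A quick sanity check inside the final image (a sketch; the expected tuple assumes the override took effect):

$ python -c 'import torch; print(torch.cuda.nccl.version())'   # expect (2, 25, 1)
$ uv pip list | grep nvidia-nccl-cu12                          # shows the pinned wheel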
@@ -283,5 +262,6 @@ FROM base
 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
 RUN chmod +x /tgi-entrypoint.sh

+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/"
 ENTRYPOINT ["/tgi-entrypoint.sh"]
 # CMD ["--json-output"]
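This LD_LIBRARY_PATH entry takes over the job of the dropped /opt/conda/lib/ one: per the comment earlier in the Dockerfile, the Rust binaries need to locate libpython, which uv-managed interpreters keep under /root/.local/share/uv. Note the path hard-codes cpython-3.11.11, so it will need updating whenever uv ships a newer patch release. One way to check resolution inside the image (a hypothetical spot check, assuming the launcher links libpython dynamically):

$ ls /root/.local/share/uv/python/cpython-3.11.11-linux-x86_64-gnu/lib/ | grep libpython
$ ldd "$(command -v text-generation-launcher)" | grep -i python   # should resolve via LD_LIBRARY_PATH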
server/Makefile-eetq
@@ -1,4 +1,4 @@
-eetq_commit := 81e0b14d64088d58ef6acd2c8f3e788d59324407
+eetq_commit := 465e9726bf7ae30803a2d0dd9e5d4315aef17491

 eetq:
 	# Clone eetq
server/Makefile-flash-att
@@ -1,9 +1,9 @@
-flash_att_commit := 3a9bfd076f98746c73362328958dbc68d145fbec
+flash_att_commit := ceee0de88c037ee6eda5e75c813a8648e4bcb1c9

 build-flash-attention:
 	if [ ! -d 'flash-attention' ]; then \
 		pip install -U packaging ninja --no-cache-dir && \
-		git clone https://github.com/HazyResearch/flash-attention.git; \
+		git clone https://github.com/Narsil/flash-attention.git; \
 	fi
 	cd flash-attention && git fetch && git checkout $(flash_att_commit) && \
 	MAX_JOBS=8 python setup.py build && cd csrc/layer_norm && python setup.py build && cd ../rotary && python setup.py build
server/Makefile-flashinfer
@@ -1,6 +1,6 @@
 install-flashinfer:
 	# We need fsspec as an additional dependency, but
 	# `pip install flashinfer` cannot resolve it.
-	pip install fsspec sympy==1.13.1 numpy
-	pip install -U setuptools
+	uv pip install fsspec sympy==1.13.1 numpy
+	uv pip install -U setuptools
 	TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" FLASHINFER_ENABLE_AOT=1 pip install git+https://github.com/flashinfer-ai/flashinfer.git@v0.2.0.post1#egg=flashinfer --no-build-isolation
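Worth noting for the pip-to-uv-pip switch: environments created with `uv venv` ship without pip by default, which is why the pytorch-install stage installs pip into the venv explicitly; `uv pip` sidesteps the question by targeting the active virtualenv directly. A small demonstration (the path is hypothetical):

$ uv venv /tmp/demo-venv
$ /tmp/demo-venv/bin/python -m pip --version        # fails: uv venvs do not bundle pip
$ VIRTUAL_ENV=/tmp/demo-venv uv pip install fsspec  # uv pip installs into the venv regardless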
server/uv.lock (749 changed lines; diff suppressed because it is too large)
tgi-entrypoint.sh
@@ -2,5 +2,5 @@

 ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

-source ./server/.venv/bin/activate
+source ./.venv/bin/activate
 exec text-generation-launcher $@
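Since the venv now lives at the top level rather than under server/, the entrypoint activates ./.venv instead. The full resulting script, reconstructed from this hunk (line 1 sits outside the hunk, so the shebang is an assumption):

#!/bin/bash
ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases'

source ./.venv/bin/activate
exec text-generation-launcher $@

The exec replaces the shell, so text-generation-launcher runs as PID 1 and receives container stop signals directly.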