# Source path: local-llm-server/other/non-avx tgi docker/Dockerfile
# (372 lines, 13 KiB, Docker)

# Rust builder
# Common parent image for the Rust stages below (planner and builder both
# start FROM this stage).
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef

# Every Rust stage works out of /usr/src.
WORKDIR /usr/src

# Build-time setting: select cargo's sparse protocol for the crates.io registry.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
# Planner stage: copy the workspace manifests and crates, then let cargo-chef
# write recipe.json. The builder stage copies that recipe out of this stage.
FROM chef AS planner

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher

RUN cargo chef prepare --recipe-path recipe.json
# Builder stage: compiles the Rust workspace in release mode. The base stage
# later copies text-generation-benchmark, text-generation-router and
# text-generation-launcher out of /usr/src/target/release.
FROM chef AS builder

# Build metadata supplied with --build-arg. Not referenced by any instruction
# in this file; presumably consumed by the cargo build itself - confirm.
ARG GIT_SHA
ARG DOCKER_LABEL

# Install protoc 21.12 (binary plus bundled include files) into /usr/local.
# -f makes curl exit non-zero on an HTTP error instead of saving the error page
# under the archive name and deferring the failure to unzip.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL "https://github.com/protocolbuffers/protobuf/releases/download/v21.12/${PROTOC_ZIP}" && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local bin/protoc && \
    unzip -o "${PROTOC_ZIP}" -d /usr/local 'include/*' && \
    rm -f "${PROTOC_ZIP}"

# Pre-build dependencies from the planner's recipe so this layer stays cached
# until the recipe itself changes.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

# Bring in the real sources and build the workspace.
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release
# ==============================================================================
# Build PyTorch
# Python builder: compiles PyTorch from source with AVX2 and AVX-512 disabled
# and installs it into the conda environment at /opt/conda.
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
# FROM debian:bullseye-slim as pytorch-install
FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 AS pytorch-install

ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=11.8
ARG MAMBA_VERSION=23.1.0-1
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

# Put the conda environment first on PATH (set once, key=value form).
ENV PATH=/opt/conda/bin:$PATH

# Add the ubuntu-toolchain-r PPA (source of gcc 11 on Ubuntu 20.04) and install
# the build toolchain. Repository setup, index refresh and install share one
# layer so the install can never run against a stale cached package index.
RUN echo "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main" >> /etc/apt/sources.list && \
    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 60C317803A41BA51845E371A1E9377A2BA9EF27F && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        g++-11 \
        gcc-11 \
        git \
        libjpeg-dev \
        libpng-dev \
        ninja-build \
    && rm -rf /var/lib/apt/lists/*

# Route compiler invocations through ccache and keep its cache in /opt/ccache,
# the path the build step below mounts as a BuildKit cache.
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

# Set gcc path to new gcc version
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11

# Install conda
# Translate Docker's TARGETPLATFORM into a mamba arch, then download, run and
# delete the installer in a single layer so the script is not kept in the
# image history. Only -o is passed: the original also passed -O, which had no
# URL left to apply to.
RUN case ${TARGETPLATFORM} in \
        "linux/arm64") MAMBA_ARCH=aarch64 ;; \
        *)             MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Fetch the PyTorch sources at the requested release tag, including submodules.
RUN git clone --recursive https://github.com/pytorch/pytorch && \
    cd pytorch && \
    git checkout v${PYTORCH_VERSION} && \
    git submodule update --init --recursive

WORKDIR /pytorch

# Write the Pytorch version into the version.txt file because it isn't always the same as the tag we checked out
RUN echo $PYTORCH_VERSION > version.txt

RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake ninja conda-build pyyaml numpy ipython && \
    /opt/conda/bin/python -mpip install -r requirements.txt

# Install things for building PyTorch
# -y on both installs so neither depends on how conda handles a confirmation
# prompt when there is no terminal.
RUN /opt/conda/bin/conda install -y mkl mkl-include cudnn && \
    /opt/conda/bin/conda install -y -c pytorch magma-cuda118

# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/reqs/train-conda-build.requirements.txt
# TODO: remove what we don't need
RUN /opt/conda/bin/conda install -y \
        jemalloc \
        astunparse \
        ccache \
        cmake \
        expecttest \
        filelock \
        fsspec \
        git \
        hypothesis \
        jinja2 \
        libjpeg-turbo \
        libpng \
        networkx \
        ninja \
        numpy \
        psutil \
        pyyaml \
        requests \
        setuptools \
        sympy \
        types-dataclasses \
        typing-extensions

RUN /opt/conda/bin/conda clean -ya

# Use Intel OpenMP with optimizations. See the documentation for details.
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# Intel OpenMP thread blocking time in ms.
ENV KMP_BLOCKTIME=0
# Configure CPU thread affinity.
# ENV KMP_AFFINITY="granularity=fine,compact,1,0"
ENV LD_PRELOAD=/opt/conda/lib/libiomp5.so:${LD_PRELOAD}
# Use Jemalloc for efficient memory management.
ENV LD_PRELOAD=/opt/conda/lib/libjemalloc.so:${LD_PRELOAD}
ENV MALLOC_CONF="background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000"

# Install PyTorch without AVX2
# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/docker-compose.yaml#L124
# print(torch.__config__.show().split("\n"), sep="\n")
# BLAS_INFO and BUILD_TYPE are two separate assignments; they were previously
# fused into the single token "BLAS_INFO=mklBUILD_TYPE=Release", which left
# BUILD_TYPE unset. The source tree is removed in the same layer once the
# install has finished.
RUN --mount=type=cache,target=/opt/ccache \
    python setup.py clean && \
    BLAS_INFO=mkl \
    BUILD_TYPE=Release \
    CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow" \
    LAPACK_INFO=mkl \
    PERF_WITH_AVX=1 \
    PERF_WITH_AVX2=0 \
    PERF_WITH_AVX512=0 \
    TORCH_DISABLE_GPU_ASSERTS=ON \
    TORCH_VERSION=${PYTORCH_VERSION} \
    USE_CUDA=ON \
    USE_CUDNN=ON \
    USE_EXCEPTION_PTR=1 \
    USE_GFLAGS=OFF \
    USE_GLOG=OFF \
    USE_MKL=ON \
    USE_MKLDNN=ON \
    USE_MPI=OFF \
    USE_NCCL=1 \
    USE_NNPACK=ON \
    USE_OPENMP=ON \
    USE_ROCM=OFF \
    BUILD_TEST=0 \
    TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
    CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
    python setup.py install && \
    cd .. && \
    rm -rf pytorch

# Make sure we built everything properly. Build will fail if CUDA isn't available.
RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
# Diagnostic: print the dynamic symbol table of the installed libtorch.so to the build log.
RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
# ==============================================================================
# Set up the kernel-builder
# Shared parent of the kernel build stages below: the PyTorch build image plus
# ninja and the CUDA 11.8 packages from the nvidia conda channel.
FROM pytorch-install AS kernel-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build \
    && rm -rf /var/lib/apt/lists/*

# -y keeps the install explicitly non-interactive, matching the other conda
# invocations in this file; the package cache is cleaned in the same layer.
RUN /opt/conda/bin/conda install -y -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
    /opt/conda/bin/conda clean -ya
# ==============================================================================
# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

# The flash-attention build recipe lives in the server directory; it is copied
# in under the default Makefile name.
COPY server/Makefile-flash-att Makefile

# Run the build-flash-attention target with MAX_JOBS=5 in its environment
# (presumably a cap on parallel compile jobs - confirm against the Makefile).
RUN MAX_JOBS=5 make build-flash-attention
# ==============================================================================
# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

# The flash-attention v2 build recipe is copied in under the default Makefile name.
COPY server/Makefile-flash-att-v2 Makefile

# Run the build-flash-attention-v2 target with MAX_JOBS=10 in its environment
# (presumably a cap on parallel compile jobs - confirm against the Makefile).
RUN MAX_JOBS=10 make build-flash-attention-v2
# ==============================================================================
# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder

WORKDIR /usr/src

# Copy the exllama kernel sources (including their setup.py) into /usr/src.
COPY server/exllama_kernels/ .

# Build the extension with the same CUDA architecture list that the PyTorch
# build stage uses.
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
# ==============================================================================
# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder

WORKDIR /usr/src

# Copy the custom kernel sources (including their setup.py) into /usr/src.
COPY server/custom_kernels/ .

# Build the extension with MAX_JOBS=5 in the environment
# (presumably a cap on parallel compile jobs - confirm).
RUN MAX_JOBS=5 python setup.py build
# ==============================================================================
# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

# The vllm build recipe is copied in under the default Makefile name.
COPY server/Makefile-vllm Makefile

# Run the build-vllm target with MAX_JOBS=5 in its environment
# (presumably a cap on parallel compile jobs - confirm against the Makefile).
RUN MAX_JOBS=5 make build-vllm
# ==============================================================================
# Text Generation Inference base image
# Runtime image: receives the conda environment, the compiled kernels, the
# Python server and the Rust binaries from the earlier stages.
# nvidia/cuda:11.8.0-base-ubuntu20.04
FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80 \
    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.8/targets/x86_64-linux/lib:/opt/conda/lib/python3.9/site-packages/torch/lib

WORKDIR /usr/src

# All OS packages are installed in one layer. build-essential and g++ used to
# be installed by a second apt-get run at the end of this stage.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        g++ \
        libssl-dev \
        make \
    && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
# `COPY server server` already brings in server/Makefile, so the separate copy
# of that one file is not repeated here. Every pip call passes --no-cache-dir
# so no download cache is written into the image layers.
COPY proto proto
COPY server server
RUN cd server && \
    make gen-server && \
    pip install -r requirements.txt --no-cache-dir
RUN cd server && \
    pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \
    pip install optimum auto-gptq --no-cache-dir
RUN /opt/conda/bin/conda clean -ya

# Fix the error
# /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
# by overwriting the CPU library with the CUDA 11.8 build.
RUN cp /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so

# Diagnostics: print the exllama extension's shared-library dependencies and
# libtorch's dynamic symbol table to the build log.
RUN ldd /opt/conda/lib/python3.9/site-packages/exllama_kernels.cpython-39-x86_64-linux-gnu.so
RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so

# RUN ls /opt/conda/lib/python3.9/site-packages/bitsandbytes/
# RUN find / -name libcudart.so 2>/dev/null
# RUN find / -name "*libc10.so*"
# RUN find / -name libtorch.so

# Make sure our special dependencies were compiled and copied correctly.
# The checks run in a single layer and stop at the first failing import; the
# exllama import, previously run twice, is checked once. The
# torch.cuda.is_available() call only has to execute - its result is not tested.
RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)" && \
    python -c "import torch; torch.cuda.is_available()" && \
    python -c "import torch; import flash_attn_2_cuda" && \
    python -c "import torch; import flash_attn_cuda" && \
    python -c "import torch; import vllm_cache_ops" && \
    python -c "import torch; import vllm_attention_ops" && \
    python -c "import torch; import custom_kernels" && \
    python -c "import torch; import text_generation_server.utils.gptq.exllama"

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
# ==============================================================================
# AWS SageMaker compatible image
FROM base AS sagemaker

# Copy the entrypoint script with its executable bits set in the same layer,
# rather than copying it and then rewriting it with a separate `RUN chmod +x`.
# COPY --chmod needs BuildKit, which the cache mount in the PyTorch build stage
# already requires.
COPY --chmod=755 sagemaker-entrypoint.sh entrypoint.sh

# Resolved relative to the working directory inherited from the base stage.
ENTRYPOINT ["./entrypoint.sh"]
# ==============================================================================
# Final image
# Default build target: the base image with the launcher as its entrypoint.
FROM base

# Exec form, so the launcher is started directly rather than through a shell.
ENTRYPOINT ["text-generation-launcher"]

# Default argument for the launcher; replaced by any arguments given to `docker run`.
CMD ["--json-output"]