# syntax = docker/dockerfile:experimental
# DOCKER_BUILDKIT=1 DOCKER_CLI_EXPERIMENTAL=enabled

# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef as planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release

# ==============================================================================
# Build PyTorch
# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 as dev-base
# FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 as pytorch-build
# ubuntu:20.04

ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=11.8
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH /opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gnupg2
# Add new repo to install gcc 11 on Ubuntu 20.04
RUN echo "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main" >> /etc/apt/sources.list && \
    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 60C317803A41BA51845E371A1E9377A2BA9EF27F && \
    apt-get update
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        gcc-11 \
        g++-11 \
        build-essential \
        ca-certificates \
        ccache \
        cmake \
        curl \
        git \
        ninja-build \
        libjpeg-dev \
        libpng-dev && \
    rm -rf /var/lib/apt/lists/*
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

# Set gcc path to new gcc version
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11

# Install conda
FROM dev-base as pytorch-build
# ARGs do not cross stage boundaries, so re-declare the ones this stage uses
ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
# Automatically set by buildx
ARG TARGETPLATFORM
# Translate Docker's TARGETPLATFORM into miniconda arches
RUN case ${TARGETPLATFORM} in \
        "linux/arm64") MINICONDA_ARCH=aarch64 ;; \
        *)             MINICONDA_ARCH=x86_64  ;; \
    esac && \
    curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431
RUN chmod +x ~/miniconda.sh && \
    bash ~/miniconda.sh -b -p /opt/conda && \
    rm ~/miniconda.sh

RUN git clone --recursive https://github.com/pytorch/pytorch && \
    cd pytorch && \
    git checkout v${PYTORCH_VERSION}
WORKDIR /pytorch
RUN git submodule update --init --recursive

# Write the PyTorch version into the version.txt file because it isn't always the same as the tag we checked out
RUN echo $PYTORCH_VERSION > version.txt
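# Hypothetical sanity check (not part of the original build, kept commented out
# like the other debug probes in this file): confirm the checkout really sits
# on the requested tag before kicking off a long compile.
# RUN git describe --tags --exact-match | grep -q "v${PYTORCH_VERSION}"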
# Install things for building PyTorch
RUN /opt/conda/bin/conda install -y mkl mkl-include cudnn libgcc && \
    /opt/conda/bin/conda install -c conda-forge libstdcxx-ng=12 && \
    /opt/conda/bin/conda install -c pytorch magma-cuda118 && \
    /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8
RUN find / -name "libcudart_static*"
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
    /opt/conda/bin/python -mpip install -r requirements.txt

# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/reqs/train-conda-build.requirements.txt
# TODO: remove what we don't need
RUN /opt/conda/bin/conda install -y \
        jemalloc \
        ccache \
        cmake \
        expecttest \
        filelock \
        fsspec \
        git \
        hypothesis \
        jinja2 \
        libpng \
        networkx \
        ninja \
        numpy \
        psutil \
        pyyaml \
        requests \
        setuptools \
        sympy \
        typing-extensions
RUN /opt/conda/bin/conda clean -ya

# Use Intel OpenMP with optimizations. See the documentation for details.
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# Intel OpenMP thread blocking time in ms.
ENV KMP_BLOCKTIME=0
# Configure CPU thread affinity.
# ENV KMP_AFFINITY="granularity=fine,compact,1,0"
ENV LD_PRELOAD=/opt/conda/lib/libiomp5.so:${LD_PRELOAD}

# Use Jemalloc for efficient memory management.
ENV LD_PRELOAD=/opt/conda/lib/libjemalloc.so:${LD_PRELOAD}
ENV MALLOC_CONF="background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000"

RUN cat version.txt
RUN make triton

# Install PyTorch without AVX2
# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/docker-compose.yaml#L124
# print(torch.__config__.show().split("\n"), sep="\n")
RUN --mount=type=cache,target=/opt/ccache \
    python setup.py clean && \
    BLAS_INFO=mkl \
    BUILD_TYPE=Release \
    CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow" \
    LAPACK_INFO=mkl \
    PERF_WITH_AVX=1 \
    PERF_WITH_AVX2=0 \
    PERF_WITH_AVX512=0 \
    TORCH_DISABLE_GPU_ASSERTS=ON \
    TORCH_VERSION=${PYTORCH_VERSION} \
    USE_CUDA=ON \
    USE_CUDNN=ON \
    USE_EXCEPTION_PTR=1 \
    USE_GFLAGS=OFF \
    USE_GLOG=OFF \
    USE_MKL=ON \
    USE_MKLDNN=ON \
    USE_MPI=OFF \
    USE_NCCL=1 \
    USE_NNPACK=ON \
    USE_OPENMP=ON \
    USE_ROCM=OFF \
    BUILD_TEST=0 \
    CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
    TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
    python setup.py install && \
    cd .. && \
    rm -rf pytorch
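# Illustrative check (kept commented out, matching the other probes in this
# file): printing the build config makes it easy to confirm in the build log
# that the AVX2/AVX512 CMake flags above actually took effect.
# RUN python -c "import torch; print(torch.__config__.show())"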
# Make sure we built everything properly. Build will fail if CUDA isn't available.
# RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so

# ==============================================================================
# Build Flash Attention CUDA kernels
FROM pytorch-build as flash-att-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att Makefile
# Build specific version of flash attention
RUN MAX_JOBS=5 make build-flash-attention

# ==============================================================================
# Build Flash Attention v2 CUDA kernels
FROM pytorch-build as flash-att-v2-builder
WORKDIR /usr/src
COPY server/Makefile-flash-att-v2 Makefile
# Build specific version of flash attention v2
RUN MAX_JOBS=10 make build-flash-attention-v2

# ==============================================================================
# Build Transformers exllama kernels
FROM pytorch-build as exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ .
# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# ==============================================================================
# Build Transformers CUDA kernels
FROM pytorch-build as custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ .
# Build specific version of transformers
RUN MAX_JOBS=5 python setup.py build

# ==============================================================================
# Build vllm CUDA kernels
FROM pytorch-build as vllm-builder
WORKDIR /usr/src
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
RUN MAX_JOBS=5 make build-vllm

# ==============================================================================
# Text Generation Inference base image
# nvidia/cuda:11.8.0-base-ubuntu20.04
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80 \
    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.8/targets/x86_64-linux/lib:/opt/conda/lib/python3.9/site-packages/torch/lib

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
    && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-build /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
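# Note: the lib.linux-x86_64-cpython-39 directories copied above are
# setuptools' default build output layout (build/lib.<platform>-cpython-<ver>),
# so copying their contents straight into site-packages stands in for a
# `pip install` of each compiled extension.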
# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements.txt
RUN cd server && \
    pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \
    pip install optimum auto-gptq
RUN /opt/conda/bin/conda clean -ya

# Fix the error
# /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
RUN cp /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so

RUN ldd /opt/conda/lib/python3.9/site-packages/exllama_kernels.cpython-39-x86_64-linux-gnu.so
RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
RUN python3 -c "import torch; import text_generation_server.utils.gptq.exllama"
# RUN ls /opt/conda/lib/python3.9/site-packages/bitsandbytes/
# RUN find / -name libcudart.so 2>/dev/null
# RUN find / -name "*libc10.so*"
# RUN find / -name libtorch.so

# Make sure our special dependencies were compiled and copied correctly
# RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
# RUN python -c "import torch; torch.cuda.is_available()"
RUN python -c "import torch; import flash_attn_2_cuda"
RUN python -c "import torch; import flash_attn_cuda"
RUN python -c "import torch; import vllm_cache_ops"
RUN python -c "import torch; import vllm_attention_ops"
RUN python -c "import torch; import custom_kernels"
RUN python -c "import torch; import text_generation_server.utils.gptq.exllama"

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# ==============================================================================
# AWS Sagemaker compatible image
FROM base as sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# ==============================================================================
# Final image
FROM base

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]
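# Example invocation (illustrative; the image tag, host port, and model id are
# assumptions, not taken from this file — only /data and container port 80 come
# from the ENV settings above):
#   DOCKER_BUILDKIT=1 docker build -t tgi-noavx2 .
#   docker run --gpus all -p 8080:80 -v $PWD/data:/data tgi-noavx2 \
#       --model-id bigscience/bloom-560m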