# syntax=docker/dockerfile:1
# Text Generation Inference — non-AVX variant.
# Multi-stage build:
#   1. Rust stages (chef/planner/builder): compile the router, launcher and benchmark binaries.
#   2. pytorch-install: build PyTorch from source with AVX2/AVX512 explicitly disabled,
#      so the image runs on older CPUs lacking those instruction sets.
#   3. kernel-builder + per-kernel stages: compile flash-attention v1/v2, exllama,
#      custom transformer and vllm CUDA kernels.
#   4. base / sagemaker / final: slim CUDA runtime image assembling all artifacts.

# Rust builder
FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
WORKDIR /usr/src

ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

FROM chef AS planner
COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder

ARG GIT_SHA
ARG DOCKER_LABEL

# protoc is needed to compile the gRPC .proto definitions during the Rust build.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# cargo-chef cooks dependencies from the planner's recipe first, so the
# dependency layer is cached independently of application source changes.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher
RUN cargo build --release

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
FROM debian:bullseye-slim AS pytorch-install

ARG PYTORCH_VERSION=2.0.1
ARG PYTHON_VERSION=3.9
# Keep in sync with `server/pyproject.toml`
ARG CUDA_VERSION=11.8
ARG MAMBA_VERSION=23.1.0-1
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
# Automatically set by buildx
ARG TARGETPLATFORM

ENV PATH=/opt/conda/bin:$PATH

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        curl \
        git && \
    rm -rf /var/lib/apt/lists/*

# Install conda
# translating Docker's TARGETPLATFORM into mamba arches
RUN case ${TARGETPLATFORM} in \
        "linux/arm64") MAMBA_ARCH=aarch64 ;; \
        *) MAMBA_ARCH=x86_64 ;; \
    esac && \
    curl -fsSL -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
RUN chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    rm ~/mambaforge.sh

# Build PyTorch from source with AVX2/AVX512 disabled — the whole point of this
# image variant — instead of installing the prebuilt (AVX-using) conda package.
# NOTE(review): this stage has no CUDA toolkit installed (conda installs cuda
# only later, in kernel-builder), yet USE_CUDA=1 is set here — confirm the
# source build actually finds CUDA, or whether it silently falls back to CPU-only.
RUN git clone --recursive https://github.com/pytorch/pytorch && \
    cd pytorch && \
    git checkout v${PYTORCH_VERSION} && \
    python setup.py clean && \
    USE_CUDA=1 \
    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
    CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
    python setup.py install && \
    cd .. && \
    rm -rf pytorch


# CUDA kernels builder image
FROM pytorch-install AS kernel-builder

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ninja-build \
    && rm -rf /var/lib/apt/lists/*

RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
    /opt/conda/bin/conda clean -ya

# Build Flash Attention CUDA kernels
FROM kernel-builder AS flash-att-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att Makefile

# Build specific version of flash attention
RUN MAX_WORKERS=4 make build-flash-attention

# Build Flash Attention v2 CUDA kernels
FROM kernel-builder AS flash-att-v2-builder

WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 Makefile

# Build specific version of flash attention v2
RUN MAX_WORKERS=4 make build-flash-attention-v2

# Build Transformers exllama kernels
FROM kernel-builder AS exllama-kernels-builder

WORKDIR /usr/src

COPY server/exllama_kernels/ .

# Build specific version of transformers
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build

# Build Transformers CUDA kernels
FROM kernel-builder AS custom-kernels-builder

WORKDIR /usr/src

COPY server/custom_kernels/ .

# Build specific version of transformers
RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN MAX_WORKERS=4 make build-vllm

# Text Generation Inference base image
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 AS base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
    CONDA_PREFIX=/opt/conda

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libssl-dev \
        ca-certificates \
        make \
    && rm -rf /var/lib/apt/lists/*

# Copy conda with PyTorch installed
COPY --from=pytorch-install /opt/conda /opt/conda

# Copy build artifacts from flash attention builder
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Copy builds artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements.txt && \
    pip install ".[bnb, accelerate, quantize]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

# Compiler toolchain kept in the runtime image (some kernels JIT-compile at runtime).
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        g++ \
    && rm -rf /var/lib/apt/lists/*

# AWS Sagemaker compatible image
FROM base AS sagemaker

COPY sagemaker-entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
FROM base

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]