update dockerfile

fxmarty 2024-06-20 15:36:46 +00:00
parent dccab72549
commit 65506e19bf
5 changed files with 325 additions and 689 deletions

.dockerignore

@@ -2,3 +2,4 @@ aml
 target
 server/transformers
 server/flash-attention
+hf_cache/

Cargo.lock (generated)

Diff suppressed because it is too large.

Dockerfile

@@ -72,17 +72,18 @@ RUN chmod +x ~/mambaforge.sh && \
 # Install pytorch
 # On arm64 we exit with an error code
-RUN case ${TARGETPLATFORM} in \
-"linux/arm64") exit 1 ;; \
-*) /opt/conda/bin/conda update -y conda && \
-/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
-esac && \
-/opt/conda/bin/conda clean -ya
+# RUN case ${TARGETPLATFORM} in \
+# "linux/arm64") exit 1 ;; \
+# *) /opt/conda/bin/conda update -y conda && \
+# /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+# esac && \
+# /opt/conda/bin/conda clean -ya
+RUN pip install torch --index-url https://download.pytorch.org/whl/cu121
 # CUDA kernels builder image
 FROM pytorch-install as kernel-builder
-ARG MAX_JOBS=8
+# ARG MAX_JOBS=64
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
 ninja-build cmake \
@@ -106,64 +107,66 @@ WORKDIR /usr/src
 COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
+ENV TORCH_CUDA_ARCH_LIST="9.0+PTX"
 RUN make build-flash-attention-v2-cuda
-# Build Transformers exllama kernels
-FROM kernel-builder as exllama-kernels-builder
-WORKDIR /usr/src
-COPY server/exllama_kernels/ .
+# # Build Transformers exllama kernels
+# FROM kernel-builder as exllama-kernels-builder
+# WORKDIR /usr/src
+# COPY server/exllama_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 # Build Transformers exllama kernels
-FROM kernel-builder as exllamav2-kernels-builder
-WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
+# FROM kernel-builder as exllamav2-kernels-builder
+# WORKDIR /usr/src
+# COPY server/exllamav2_kernels/ .
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 # Build Transformers awq kernels
-FROM kernel-builder as awq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-awq Makefile
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+# FROM kernel-builder as awq-kernels-builder
+# WORKDIR /usr/src
+# COPY server/Makefile-awq Makefile
+# # Build specific version of transformers
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
-# Build eetq kernels
-FROM kernel-builder as eetq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-eetq Makefile
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+# # Build eetq kernels
+# FROM kernel-builder as eetq-kernels-builder
+# WORKDIR /usr/src
+# COPY server/Makefile-eetq Makefile
+# # Build specific version of transformers
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
-# Build Transformers CUDA kernels
-FROM kernel-builder as custom-kernels-builder
-WORKDIR /usr/src
-COPY server/custom_kernels/ .
-# Build specific version of transformers
-RUN python setup.py build
+# # Build Transformers CUDA kernels
+# FROM kernel-builder as custom-kernels-builder
+# WORKDIR /usr/src
+# COPY server/custom_kernels/ .
+# # Build specific version of transformers
+# RUN python setup.py build
 # Build vllm CUDA kernels
 FROM kernel-builder as vllm-builder
 WORKDIR /usr/src
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ENV TORCH_CUDA_ARCH_LIST="9.0+PTX"
 COPY server/Makefile-vllm Makefile
 # Build specific version of vllm
 RUN make build-vllm-cuda
-# Build mamba kernels
-FROM kernel-builder as mamba-builder
-WORKDIR /usr/src
-COPY server/Makefile-selective-scan Makefile
-RUN make build-all
+# # Build mamba kernels
+# FROM kernel-builder as mamba-builder
+# WORKDIR /usr/src
+# COPY server/Makefile-selective-scan Makefile
+# RUN make build-all
 # Text Generation Inference base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base
+FROM pytorch-install
 # Conda env
 ENV PATH=/opt/conda/bin:$PATH \
@@ -184,7 +187,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
 && rm -rf /var/lib/apt/lists/*
 # Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
+# COPY --from=pytorch-install /opt/conda /opt/conda
 # Copy build artifacts from flash attention builder
 COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
@@ -194,23 +197,23 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from flash attention v2 builder
 COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from custom kernels builder
+# COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from exllama kernels builder
+# COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from exllamav2 kernels builder
+# COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from awq kernels builder
+# COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from eetq kernels builder
+# COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from mamba builder
+# COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+# COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
@@ -237,18 +240,18 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
 && rm -rf /var/lib/apt/lists/*
 # AWS Sagemaker compatible image
-FROM base as sagemaker
+# FROM base as sagemaker
-COPY sagemaker-entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
+# COPY sagemaker-entrypoint.sh entrypoint.sh
+# RUN chmod +x entrypoint.sh
-ENTRYPOINT ["./entrypoint.sh"]
+# ENTRYPOINT ["./entrypoint.sh"]
-# Final image
-FROM base
+# # Final image
+# FROM base
-COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
-RUN chmod +x /tgi-entrypoint.sh
+# COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+# RUN chmod +x /tgi-entrypoint.sh
-ENTRYPOINT ["/tgi-entrypoint.sh"]
-CMD ["--json-output"]
+# ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
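Taken together, the Dockerfile hunks above swap the conda-based PyTorch install for a plain pip install from the cu121 wheel index and comment out every optional kernel-builder stage except the flash-attention and vllm builders. A quick sanity check for the resulting image (a hypothetical snippet, not part of the commit):

# Verify the pip-installed torch build matches the cu121 index used above.
import torch

print(torch.__version__)          # expect a +cu121-tagged build
print(torch.version.cuda)         # expect "12.1"
print(torch.cuda.is_available())  # expect True on a GPU host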

router/Cargo.toml

@@ -48,7 +48,7 @@ minijinja = { git = "https://github.com/mitsuhiko/minijinja.git", rev = "5cd4efb
 futures-util = "0.3.30"
 regex = "1.10.3"
 once_cell = "1.19.0"
-image = "0.25.1"
+image = "0.24.9"
 base64 = "0.22.0"
 
 [build-dependencies]

server/text_generation_server/utils/dist.py

@@ -55,7 +55,7 @@ def initialize_torch_distributed():
         backend = "nccl"
         options = ProcessGroupNCCL.Options()
         options.is_high_priority_stream = True
-        options._timeout = timedelta(seconds=60)
+        options._timeout = timedelta(seconds=30)
     else:
         try:
             import oneccl_bindings_for_pytorch
@@ -79,7 +79,7 @@ def initialize_torch_distributed():
                 backend=backend,
                 world_size=WORLD_SIZE,
                 rank=RANK,
-                timeout=timedelta(seconds=60),
+                timeout=timedelta(seconds=30),
                 pg_options=options,
             )
         else:
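Both hunks above lower the same distributed-init timeout from 60s to 30s, once on the NCCL ProcessGroup options and once on init_process_group itself. A minimal standalone sketch of that pattern (env:// rendezvous assumed; the values mirror the diff, but the script itself is not from the repo):

# Minimal NCCL process-group init with an explicit timeout.
# Assumes RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are set by the launcher.
import os
from datetime import timedelta

import torch.distributed as dist
from torch.distributed import ProcessGroupNCCL

options = ProcessGroupNCCL.Options()
options.is_high_priority_stream = True
options._timeout = timedelta(seconds=30)  # private field, set the same way the diff does

dist.init_process_group(
    backend="nccl",
    world_size=int(os.environ["WORLD_SIZE"]),
    rank=int(os.environ["RANK"]),
    timeout=timedelta(seconds=30),  # collectives stalling past 30s now fail fast
    pg_options=options,
)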