update dockerfile

fxmarty 2024-06-20 15:36:46 +00:00
parent dccab72549
commit 65506e19bf
5 changed files with 325 additions and 689 deletions

.dockerignore

@@ -2,3 +2,4 @@ aml
 target
 server/transformers
 server/flash-attention
+hf_cache/

Cargo.lock (generated)

Diff suppressed because it is too large.

Dockerfile

@@ -72,17 +72,18 @@ RUN chmod +x ~/mambaforge.sh && \
 # Install pytorch
 # On arm64 we exit with an error code
-RUN case ${TARGETPLATFORM} in \
-"linux/arm64") exit 1 ;; \
-*) /opt/conda/bin/conda update -y conda && \
-/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
-esac && \
-/opt/conda/bin/conda clean -ya
+# RUN case ${TARGETPLATFORM} in \
+# "linux/arm64") exit 1 ;; \
+# *) /opt/conda/bin/conda update -y conda && \
+# /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+# esac && \
+# /opt/conda/bin/conda clean -ya
+RUN pip install torch --index-url https://download.pytorch.org/whl/cu121
 # CUDA kernels builder image
 FROM pytorch-install as kernel-builder
-ARG MAX_JOBS=8
+# ARG MAX_JOBS=64
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
 ninja-build cmake \
@@ -106,64 +107,66 @@ WORKDIR /usr/src
 COPY server/Makefile-flash-att-v2 Makefile
 # Build specific version of flash attention v2
+ENV TORCH_CUDA_ARCH_LIST="9.0+PTX"
 RUN make build-flash-attention-v2-cuda
-# Build Transformers exllama kernels
-FROM kernel-builder as exllama-kernels-builder
-WORKDIR /usr/src
-COPY server/exllama_kernels/ .
+# # Build Transformers exllama kernels
+# FROM kernel-builder as exllama-kernels-builder
+# WORKDIR /usr/src
+# COPY server/exllama_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 # Build Transformers exllama kernels
-FROM kernel-builder as exllamav2-kernels-builder
-WORKDIR /usr/src
-COPY server/exllamav2_kernels/ .
+# FROM kernel-builder as exllamav2-kernels-builder
+# WORKDIR /usr/src
+# COPY server/exllamav2_kernels/ .
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 # Build Transformers awq kernels
-FROM kernel-builder as awq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-awq Makefile
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+# FROM kernel-builder as awq-kernels-builder
+# WORKDIR /usr/src
+# COPY server/Makefile-awq Makefile
+# # Build specific version of transformers
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
-# Build eetq kernels
-FROM kernel-builder as eetq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-eetq Makefile
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+# # Build eetq kernels
+# FROM kernel-builder as eetq-kernels-builder
+# WORKDIR /usr/src
+# COPY server/Makefile-eetq Makefile
+# # Build specific version of transformers
+# RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
-# Build Transformers CUDA kernels
-FROM kernel-builder as custom-kernels-builder
-WORKDIR /usr/src
-COPY server/custom_kernels/ .
-# Build specific version of transformers
-RUN python setup.py build
+# # Build Transformers CUDA kernels
+# FROM kernel-builder as custom-kernels-builder
+# WORKDIR /usr/src
+# COPY server/custom_kernels/ .
+# # Build specific version of transformers
+# RUN python setup.py build
 # Build vllm CUDA kernels
 FROM kernel-builder as vllm-builder
 WORKDIR /usr/src
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+ENV TORCH_CUDA_ARCH_LIST="9.0+PTX"
 COPY server/Makefile-vllm Makefile
 # Build specific version of vllm
 RUN make build-vllm-cuda
-# Build mamba kernels
-FROM kernel-builder as mamba-builder
-WORKDIR /usr/src
-COPY server/Makefile-selective-scan Makefile
-RUN make build-all
+# # Build mamba kernels
+# FROM kernel-builder as mamba-builder
+# WORKDIR /usr/src
+# COPY server/Makefile-selective-scan Makefile
+# RUN make build-all
 # Text Generation Inference base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base
+FROM pytorch-install
 # Conda env
 ENV PATH=/opt/conda/bin:$PATH \
@@ -184,7 +187,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
 && rm -rf /var/lib/apt/lists/*
 # Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
+# COPY --from=pytorch-install /opt/conda /opt/conda
 # Copy build artifacts from flash attention builder
 COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
@@ -194,23 +197,23 @@ COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy build artifacts from flash attention v2 builder
 COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from custom kernels builder
+# COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from exllama kernels builder
+# COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from exllamav2 kernels builder
+# COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from awq kernels builder
+# COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from eetq kernels builder
+# COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-# Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+# # Copy build artifacts from mamba builder
+# COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
+# COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
@@ -237,18 +240,18 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
 && rm -rf /var/lib/apt/lists/*
 # AWS Sagemaker compatible image
-FROM base as sagemaker
+# FROM base as sagemaker
-COPY sagemaker-entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
+# COPY sagemaker-entrypoint.sh entrypoint.sh
+# RUN chmod +x entrypoint.sh
-ENTRYPOINT ["./entrypoint.sh"]
+# ENTRYPOINT ["./entrypoint.sh"]
-# Final image
-FROM base
+# # Final image
+# FROM base
-COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
-RUN chmod +x /tgi-entrypoint.sh
+# COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+# RUN chmod +x /tgi-entrypoint.sh
-ENTRYPOINT ["/tgi-entrypoint.sh"]
-CMD ["--json-output"]
+# ENTRYPOINT ["/tgi-entrypoint.sh"]
+# CMD ["--json-output"]
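Taken together, the Dockerfile hunks above swap the conda-based PyTorch install for a plain pip install from the cu121 wheel index and comment out every optional kernel-builder stage except the flash-attention and vllm builders. A quick sanity check for the resulting image (a hypothetical snippet, not part of the commit):

# Verify the pip-installed torch build matches the cu121 index used above.
import torch

print(torch.__version__)          # expect a +cu121-tagged build
print(torch.version.cuda)         # expect "12.1"
print(torch.cuda.is_available())  # expect True on a GPU host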

router/Cargo.toml

@@ -48,7 +48,7 @@ minijinja = { git = "https://github.com/mitsuhiko/minijinja.git", rev = "5cd4efb
 futures-util = "0.3.30"
 regex = "1.10.3"
 once_cell = "1.19.0"
-image = "0.25.1"
+image = "0.24.9"
 base64 = "0.22.0"
 
 [build-dependencies]

server/text_generation_server/utils/dist.py

@@ -55,7 +55,7 @@ def initialize_torch_distributed():
         backend = "nccl"
         options = ProcessGroupNCCL.Options()
         options.is_high_priority_stream = True
-        options._timeout = timedelta(seconds=60)
+        options._timeout = timedelta(seconds=30)
     else:
         try:
             import oneccl_bindings_for_pytorch
@@ -79,7 +79,7 @@ def initialize_torch_distributed():
                 backend=backend,
                 world_size=WORLD_SIZE,
                 rank=RANK,
-                timeout=timedelta(seconds=60),
+                timeout=timedelta(seconds=30),
                 pg_options=options,
             )
         else:
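Both hunks above lower the same distributed-init timeout from 60s to 30s, once on the NCCL ProcessGroup options and once on init_process_group itself. A minimal standalone sketch of that pattern (env:// rendezvous assumed; the values mirror the diff, but the script itself is not from the repo):

# Minimal NCCL process-group init with an explicit timeout.
# Assumes RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are set by the launcher.
import os
from datetime import timedelta

import torch.distributed as dist
from torch.distributed import ProcessGroupNCCL

options = ProcessGroupNCCL.Options()
options.is_high_priority_stream = True
options._timeout = timedelta(seconds=30)  # private field, set the same way the diff does

dist.init_process_group(
    backend="nccl",
    world_size=int(os.environ["WORLD_SIZE"]),
    rank=int(os.environ["RANK"]),
    timeout=timedelta(seconds=30),  # collectives stalling past 30s now fail fast
    pg_options=options,
)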