hf_text-generation-inference/Dockerfile_amd

# Rust builder
# Shared parent of the Rust stages below: a cargo-chef image tagged for Rust 1.75,
# with /usr/src as the working directory.
FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
WORKDIR /usr/src

# NOTE(review): presumably meant to make cargo use the sparse crates.io index
# protocol; confirm the value is actually visible in the stages that run cargo.
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse

# Planner stage: copies the workspace manifest, toolchain file and crate
# directories, then writes recipe.json, which the builder stage later passes to
# `cargo chef cook`.
FROM chef AS planner
COPY Cargo.toml rust-toolchain.toml ./
COPY proto ./proto
COPY benchmark ./benchmark
COPY router ./router
COPY launcher ./launcher
RUN cargo chef prepare --recipe-path ./recipe.json

# Builder stage: compiles the workspace (benchmark, router, launcher) in release mode.
FROM chef AS builder

# Install protoc 21.12 (binary plus bundled include files) under /usr/local.
# -f makes curl fail on an HTTP error instead of saving the error page as the zip.
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
    curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
    rm -f $PROTOC_ZIP

# Cook the dependencies from the planner's recipe in their own layer so the layer
# can be reused while only the workspace sources change.
COPY --from=planner /usr/src/recipe.json recipe.json
RUN cargo chef cook --release --recipe-path recipe.json

COPY Cargo.toml Cargo.toml
COPY rust-toolchain.toml rust-toolchain.toml
COPY proto proto
COPY benchmark benchmark
COPY router router
COPY launcher launcher

# Declared as late as possible: a build arg is implicitly part of the environment
# of every RUN that follows its declaration, so a GIT_SHA that changes on each
# build would otherwise invalidate the cached protoc and dependency layers above.
# None of the earlier steps in this stage reference either value.
ARG GIT_SHA
ARG DOCKER_LABEL

RUN cargo build --release

# Text Generation Inference base image for ROCm
FROM rocm/dev-ubuntu-22.04:5.7 AS base

# Build toolchain and ROCm development packages shared by every later stage.
# rocthrust-dev, hipsparse-dev and hipblas-dev are needed to build vLLM and flash-attention.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ccache \
    curl \
    git \
    make \
    libssl-dev \
    g++ \
    rocthrust-dev \
    hipsparse-dev \
    hipblas-dev && \
    rm -rf /var/lib/apt/lists/*

# Keep in sync with `server/pyproject.toml`
# NOTE(review): PYTORCH_VERSION, ROCM_VERSION and PYTHON_VERSION are declared but
# not referenced by any instruction in this stage.
ARG MAMBA_VERSION=23.1.0-1
ARG PYTORCH_VERSION='2.2.0.dev0'
ARG ROCM_VERSION='5.7'
ARG PYTHON_VERSION='3.10.10'
# Automatically set by buildx
ARG TARGETPLATFORM
ENV PATH=/opt/conda/bin:$PATH

# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.
# NOTE(review): the sentence above contradicts the ubuntu-22.04 base image used by
# this stage and looks stale; confirm and reword.
# Install mamba
# translating Docker's TARGETPLATFORM into mamba arches
# Download, install and delete the installer in a single layer so the installer
# script does not remain in the image. curl gets exactly one output option (-o).
RUN case ${TARGETPLATFORM} in \
         "linux/arm64")  MAMBA_ARCH=aarch64  ;; \
         *)              MAMBA_ARCH=x86_64   ;; \
    esac && \
    curl -fsSL -v -o ~/mambaforge.sh "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh" && \
    chmod +x ~/mambaforge.sh && \
    bash ~/mambaforge.sh -b -p /opt/conda && \
    mamba init && \
    rm ~/mambaforge.sh

# Install PyTorch 2.2 RC compiled against ROCm 5.7, as vLLM can not be compiled with ROCm 5.6.
# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/test/rocm5.7/

# Common parent of the kernel build stages below; adds nothing on top of `base`.
FROM base AS kernel-builder

# vLLM kernels: built in /usr/src via the `build-vllm-rocm` target of
# server/Makefile-vllm, copied in under the name Makefile.
FROM kernel-builder AS vllm-builder
WORKDIR /usr/src

COPY server/Makefile-vllm ./Makefile

# The vLLM version to build is selected by the Makefile target.
RUN make build-vllm-rocm

# Flash Attention v2 kernels: built in /usr/src via the
# `build-flash-attention-v2-rocm` target of server/Makefile-flash-att-v2.
FROM kernel-builder AS flash-att-v2-builder
WORKDIR /usr/src

COPY server/Makefile-flash-att-v2 ./Makefile

# The flash attention version to build is selected by the Makefile target.
RUN make build-flash-attention-v2-rocm

# Custom Transformers kernels (gpt-neox and bloom), built from
# server/custom_kernels with PYTORCH_ROCM_ARCH set to gfx90a.
FROM kernel-builder AS custom-kernels-builder
WORKDIR /usr/src
COPY server/custom_kernels/ ./
RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

# exllama kernels, built from server/exllama_kernels with PYTORCH_ROCM_ARCH set
# to gfx90a.
FROM kernel-builder AS exllama-kernels-builder
WORKDIR /usr/src
COPY server/exllama_kernels/ ./

RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

# exllama v2 kernels, built from server/exllamav2_kernels with PYTORCH_ROCM_ARCH
# set to gfx90a.
FROM kernel-builder AS exllamav2-kernels-builder
WORKDIR /usr/src
COPY server/exllamav2_kernels/ ./

RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build

# Runtime stage: starts from `base`, collects the kernels built in the builder
# stages, installs the Python server and adds the Rust binaries.
FROM base AS base-copy

# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    PORT=80

# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from flash attention v2 builder
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from custom kernels builder
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllama kernels builder
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Copy build artifacts from exllamav2 kernels builder
COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

# Install flash-attention dependencies
RUN pip install einops --no-cache-dir

# Install server
COPY proto proto
COPY server server
# NOTE(review): server/Makefile appears to be covered by `COPY server server`
# already; confirm whether this extra copy is still needed.
COPY server/Makefile server/Makefile
# `cd` is used instead of WORKDIR so the working directory inherited by later
# instructions and stages is left unchanged. Every pip call passes
# --no-cache-dir so no download cache is stored in the runtime image.
RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt --no-cache-dir && \
    pip install ".[accelerate, peft]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
# Install router
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
# Install launcher
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher

# AWS SageMaker variant: the base-copy stage plus sagemaker-entrypoint.sh,
# installed as entrypoint.sh in the working directory and made executable.
FROM base-copy AS sagemaker
COPY sagemaker-entrypoint.sh ./entrypoint.sh
RUN chmod +x ./entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]

# Final image
# Last stage in the file, so it is what gets built when no --target is given:
# the base-copy stage with the launcher as entrypoint.
FROM base-copy

# Runs the launcher copied to /usr/local/bin in base-copy. CMD supplies the
# default argument and is replaced by any arguments passed to `docker run`.
ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]
Add RoCm support (#1243) This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: Your Name <you@example.com> 2023-11-27 06:08:12 -07:00			`# Rust builder`
feat: experimental support for cuda graphs (#1428) Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2024-02-12 02:09:29 -07:00			`FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef`
Add RoCm support (#1243) This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: Your Name <you@example.com> 2023-11-27 06:08:12 -07:00			`WORKDIR /usr/src`

			`ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse`

			`FROM chef as planner`
			`COPY Cargo.toml Cargo.toml`
			`COPY rust-toolchain.toml rust-toolchain.toml`
			`COPY proto proto`
			`COPY benchmark benchmark`
			`COPY router router`
			`COPY launcher launcher`
			`RUN cargo chef prepare --recipe-path recipe.json`

			`FROM chef AS builder`

			`ARG GIT_SHA`
			`ARG DOCKER_LABEL`

			`RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \`
			`curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \`
			`unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \`
			`unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \`
			`rm -f $PROTOC_ZIP`

			`COPY --from=planner /usr/src/recipe.json recipe.json`
			`RUN cargo chef cook --release --recipe-path recipe.json`

			`COPY Cargo.toml Cargo.toml`
			`COPY rust-toolchain.toml rust-toolchain.toml`
			`COPY proto proto`
			`COPY benchmark benchmark`
			`COPY router router`
			`COPY launcher launcher`
			`RUN cargo build --release`

			`# Text Generation Inference base image for RoCm`
Fixing glibc version in the runtime. (#1556) # What does this PR do? <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> 2024-02-13 09:43:47 -07:00			`FROM rocm/dev-ubuntu-22.04:5.7 as base`
Add RoCm support (#1243) This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: Your Name <you@example.com> 2023-11-27 06:08:12 -07:00
			`RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \`
			`build-essential \`
			`ca-certificates \`
			`ccache \`
			`curl \`
			`git \`
			`make \`
			`libssl-dev \`
			`g++ \`
			`# Needed to build VLLM & flash.`
			`rocthrust-dev \`
			`hipsparse-dev \`
			`hipblas-dev && \`
			`rm -rf /var/lib/apt/lists/*`

			# Keep in sync with `server/pyproject.toml
			`ARG MAMBA_VERSION=23.1.0-1`
			`ARG PYTORCH_VERSION='2.2.0.dev0'`
			`ARG ROCM_VERSION='5.7'`
			`ARG PYTHON_VERSION='3.10.10'`
			`# Automatically set by buildx`
			`ARG TARGETPLATFORM`
			`ENV PATH /opt/conda/bin:$PATH`

			`# TGI seem to require libssl.so.1.1 instead of libssl.so.3 so we can't use ubuntu 22.04. Ubuntu 20.04 has python==3.8, and TGI requires python>=3.9, hence the need for miniconda.`
			`# Install mamba`
			`# translating Docker's TARGETPLATFORM into mamba arches`
			`RUN case ${TARGETPLATFORM} in \`
			`"linux/arm64") MAMBA_ARCH=aarch64 ;; \`
			`*) MAMBA_ARCH=x86_64 ;; \`
			`esac && \`
			`curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"`
			`RUN chmod +x ~/mambaforge.sh && \`
			`bash ~/mambaforge.sh -b -p /opt/conda && \`
			`mamba init && \`
			`rm ~/mambaforge.sh`

GPTQ support on ROCm (#1489) Tested with ``` CUDA_VISIBLE_DEVICES=0 text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq EXLLAMA_VERSION=1 CUDA_VISIBLE_DEVICES=0 text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq CUDA_VISIBLE_DEVICES="0,1" text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq ``` all with good and identical results on MI210. --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2024-01-26 08:27:44 -07:00			`# Install PyTorch 2.2 RC compiled against RoCm 5.7, as VLLM can not be compiled with RoCm 5.6.`
			`RUN pip install torch --index-url https://download.pytorch.org/whl/test/rocm5.7/`
Add RoCm support (#1243) This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: Your Name <you@example.com> 2023-11-27 06:08:12 -07:00
			`FROM base AS kernel-builder`

			`# Build vllm kernels`
			`FROM kernel-builder AS vllm-builder`
			`WORKDIR /usr/src`

			`COPY server/Makefile-vllm Makefile`

			`# Build specific version of vllm`
			`RUN make build-vllm-rocm`

			`# Build Flash Attention v2 kernels`
			`FROM kernel-builder AS flash-att-v2-builder`
			`WORKDIR /usr/src`

			`COPY server/Makefile-flash-att-v2 Makefile`

			`# Build specific version of flash attention v2`
			`RUN make build-flash-attention-v2-rocm`

			`# Build Transformers CUDA kernels (gpt-neox and bloom)`
			`FROM kernel-builder as custom-kernels-builder`
			`WORKDIR /usr/src`
			`COPY server/custom_kernels/ .`
			`RUN PYTORCH_ROCM_ARCH=gfx90a python setup.py build`

GPTQ support on ROCm (#1489) Tested with ``` CUDA_VISIBLE_DEVICES=0 text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq EXLLAMA_VERSION=1 CUDA_VISIBLE_DEVICES=0 text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq CUDA_VISIBLE_DEVICES="0,1" text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq ``` all with good and identical results on MI210. --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2024-01-26 08:27:44 -07:00			`# Build exllama kernels`
			`FROM kernel-builder as exllama-kernels-builder`
			`WORKDIR /usr/src`
			`COPY server/exllama_kernels/ .`

			`RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build`

			`# Build exllama v2 kernels`
			`FROM kernel-builder as exllamav2-kernels-builder`
			`WORKDIR /usr/src`
			`COPY server/exllamav2_kernels/ .`

			`RUN PYTORCH_ROCM_ARCH="gfx90a" python setup.py build`

Add RoCm support (#1243) This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: Your Name <you@example.com> 2023-11-27 06:08:12 -07:00			`FROM base as base-copy`

			`# Text Generation Inference base env`
			`ENV HUGGINGFACE_HUB_CACHE=/data \`
			`HF_HUB_ENABLE_HF_TRANSFER=1 \`
			`PORT=80`

			`# Copy builds artifacts from vllm builder`
			`COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages`

			`# Copy build artifacts from flash attention v2 builder`
			`COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages`

			`# Copy build artifacts from custom kernels builder`
			`COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages`
GPTQ support on ROCm (#1489) Tested with ``` CUDA_VISIBLE_DEVICES=0 text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq EXLLAMA_VERSION=1 CUDA_VISIBLE_DEVICES=0 text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq CUDA_VISIBLE_DEVICES="0,1" text-generation-launcher --model-id TheBloke/Llama-2-7B-Chat-GPTQ --quantize gptq ``` all with good and identical results on MI210. --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> 2024-01-26 08:27:44 -07:00
			`# Copy build artifacts from exllama kernels builder`
			`COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages`

			`# Copy build artifacts from exllamav2 kernels builder`
			`COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages`
Add RoCm support (#1243) This PR adds support for AMD Instinct MI210 & MI250 GPUs, with paged attention and FAv2 support. Remaining items to discuss, on top of possible others: * Should we have a `ghcr.io/huggingface/text-generation-inference:1.1.0+rocm` hosted image, or is it too early? * Should we set up a CI on MI210/MI250? I don't have access to the runners of TGI though. * Are we comfortable with those changes being directly in TGI, or do we need a fork? --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: OlivierDehaene <olivier@huggingface.co> Co-authored-by: Your Name <you@example.com> 2023-11-27 06:08:12 -07:00
			`# Install flash-attention dependencies`
			`RUN pip install einops --no-cache-dir`

			`# Install server`
			`COPY proto proto`
			`COPY server server`
			`COPY server/Makefile server/Makefile`
			`RUN cd server && \`
			`make gen-server && \`
			`pip install -r requirements_rocm.txt && \`
			`pip install ".[accelerate, peft]" --no-cache-dir`

			`# Install benchmarker`
			`COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark`
			`# Install router`
			`COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router`
			`# Install launcher`
			`COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher`

			`# AWS Sagemaker compatible image`
			`FROM base-copy as sagemaker`
			`COPY sagemaker-entrypoint.sh entrypoint.sh`
			`RUN chmod +x entrypoint.sh`

			`ENTRYPOINT ["./entrypoint.sh"]`

			`# Final image`
			`FROM base-copy`

			`ENTRYPOINT ["text-generation-launcher"]`
			`CMD ["--json-output"]`