diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 7e988992..f82ddd7e 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -26,7 +26,7 @@ concurrency:
 
 jobs:
   build-and-push-image:
-    runs-on: large
+    runs-on: ubuntu-latest
     permissions:
       contents: write
       packages: write
@@ -45,9 +45,7 @@ jobs:
         uses: rlespinasse/github-slug-action@v4.4.1
       - name: Install cosign
        if: github.event_name != 'pull_request'
-        uses: sigstore/cosign-installer@f3c664df7af409cb4873aa5068053ba9d61a57b6 #v2.6.0
-        with:
-          cosign-release: 'v1.13.1'
+        uses: sigstore/cosign-installer@v3.0.2
      - name: Tailscale
        uses: tailscale/github-action@v1
        with:
@@ -66,7 +64,7 @@ jobs:
          password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
          registry: registry.internal.huggingface.tech
      - name: Login to Azure Container Registry
-        if: github.event_name != 'pull_request'
+#        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2.1.0
        with:
          username: ${{ secrets.AZURE_DOCKER_USERNAME }}
@@ -136,7 +134,7 @@ jobs:
   build-and-push-sagemaker-image:
     needs:
       - build-and-push-image
-    runs-on: large
+    runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
diff --git a/.github/workflows/client-tests.yaml b/.github/workflows/client-tests.yaml
index 07eeb270..7ccef3b0 100644
--- a/.github/workflows/client-tests.yaml
+++ b/.github/workflows/client-tests.yaml
@@ -8,7 +8,7 @@ on:
 
 jobs:
   run_tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index f96c53fb..e82e8b20 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -17,7 +17,7 @@ concurrency:
 
 jobs:
   run_tests:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
 
     env:
       SCCACHE_GHA_ENABLED: "on"
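
The workflow changes above move CI off the self-hosted `large` runner and the pinned `ubuntu-20.04` image onto GitHub-hosted `ubuntu-latest`, comment out the `pull_request` guard on the Azure Container Registry login (so PR builds will now attempt that login too), and bump `cosign-installer` from a SHA-pinned v2.6.0 installing cosign v1.13.1 to the floating `v3.0.2` tag, which installs cosign 2.x. A hedged sketch of what the cosign major-version jump means for any signing steps later in the workflow (the image reference is a placeholder; this reflects cosign's documented 1.x to 2.x behavior change, not anything shown in the diff):

```sh
# cosign 1.x (the old pin) needed an opt-in for keyless signing:
#   COSIGN_EXPERIMENTAL=1 cosign sign <registry>/<image>@<digest>

# cosign 2.x (installed by cosign-installer@v3) does keyless by default,
# but prompts for confirmation unless --yes is passed in CI:
cosign sign --yes <registry>/<image>@<digest>

# verification in 2.x requires an explicit identity and OIDC issuer:
cosign verify <registry>/<image>@<digest> \
    --certificate-identity-regexp '.*' \
    --certificate-oidc-issuer https://token.actions.githubusercontent.com
```
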
diff --git a/Dockerfile b/Dockerfile
index 9fe0b49b..f027be14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,3 +1,4 @@
+# Rust builder
 FROM lukemathwalker/cargo-chef:latest-rust-1.67 AS chef
 WORKDIR /usr/src
 
@@ -27,51 +28,135 @@ COPY router router
 COPY launcher launcher
 RUN cargo build --release
 
-FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as base
+# Python builder
+# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
+FROM ubuntu:22.04 as pytorch-install
 
-ENV LANG=C.UTF-8 \
-    LC_ALL=C.UTF-8 \
-    DEBIAN_FRONTEND=noninteractive \
-    HUGGINGFACE_HUB_CACHE=/data \
+ARG PYTORCH_VERSION=2.0.0
+ARG PYTHON_VERSION=3.9
+ARG CUDA_VERSION=11.8
+ARG MAMBA_VERSION=23.1.0-1
+ARG CUDA_CHANNEL=nvidia
+ARG INSTALL_CHANNEL=pytorch
+# Automatically set by buildx
+ARG TARGETPLATFORM
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        ccache \
+        cmake \
+        curl \
+        git && \
+        rm -rf /var/lib/apt/lists/*
+RUN /usr/sbin/update-ccache-symlinks && \
+    mkdir /opt/ccache && \
+    ccache --set-config=cache_dir=/opt/ccache
+ENV PATH /opt/conda/bin:$PATH
+
+# Install conda
+# translating Docker's TARGETPLATFORM into mamba arches
+RUN case ${TARGETPLATFORM} in \
+        "linux/arm64") MAMBA_ARCH=aarch64 ;; \
+        *) MAMBA_ARCH=x86_64 ;; \
+    esac && \
+    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
+RUN chmod +x ~/mambaforge.sh && \
+    bash ~/mambaforge.sh -b -p /opt/conda && \
+    rm ~/mambaforge.sh
+
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+        "linux/arm64") exit 1 ;; \
+        *) /opt/conda/bin/conda update -y conda && \
+           /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
+
+# CUDA kernels builder image
+FROM pytorch-install as kernel-builder
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ninja-build \
+        && rm -rf /var/lib/apt/lists/*
+
+RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
+    /opt/conda/bin/conda clean -ya
+
+
+# Build Flash Attention CUDA kernels
+FROM kernel-builder as flash-att-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-flash-att Makefile
+
+# Build specific version of flash attention
+RUN make build-flash-attention
+
+# Build Transformers CUDA kernels
+FROM kernel-builder as transformers-builder
+
+WORKDIR /usr/src
+
+COPY server/Makefile-transformers Makefile
+
+# Build specific version of transformers
+RUN BUILD_EXTENSIONS="True" make build-transformers
+
+# Text Generation Inference base image
+FROM ubuntu:22.04 as base
+
+ARG TARGETPLATFORM
+ARG PYTORCH_VERSION=2.0.0
+ARG CUDA_VERSION=11.8
+
+# Conda and CUDA env
+ENV PATH=/opt/conda/bin:$PATH \
+    NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+# Text Generation Inference base env
+ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
     MODEL_ID=bigscience/bloom-560m \
     QUANTIZE=false \
     NUM_SHARD=1 \
-    PORT=80 \
-    CUDA_HOME=/usr/local/cuda \
-    LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
-    CONDA_DEFAULT_ENV=text-generation \
-    PATH=$PATH:/opt/miniconda/envs/text-generation/bin:/opt/miniconda/bin:/usr/local/cuda/bin
+    PORT=80
 
-RUN apt-get update && apt-get install -y git curl libssl-dev && rm -rf /var/lib/apt/lists/*
-
-RUN cd ~ && \
-    curl -L -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    chmod +x Miniconda3-latest-Linux-x86_64.sh && \
-    bash ./Miniconda3-latest-Linux-x86_64.sh -bf -p /opt/miniconda && \
-    conda create -n text-generation python=3.9 -y
+LABEL com.nvidia.volumes.needed="nvidia_driver"
 
 WORKDIR /usr/src
 
-# Install torch
-RUN pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --no-cache-dir
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        libssl-dev \
+        make \
+        && rm -rf /var/lib/apt/lists/*
 
-# Install specific version of flash attention
-COPY server/Makefile-flash-att server/Makefile
-RUN cd server && make install-flash-attention
+# Copy conda with PyTorch installed
+COPY --from=pytorch-install /opt/conda /opt/conda
 
-# Install specific version of transformers
-COPY server/Makefile-transformers server/Makefile
-RUN cd server && BUILD_EXTENSIONS="True" make install-transformers
+# Copy build artifacts from flash attention builder
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
-COPY server/Makefile server/Makefile
+# Copy build artifacts from transformers builder
+COPY --from=transformers-builder /usr/src/transformers /usr/src/transformers
+COPY --from=transformers-builder /usr/src/transformers/build/lib.linux-x86_64-cpython-39/transformers /usr/src/transformers/src/transformers
+
+# Install transformers dependencies
+RUN cd /usr/src/transformers && pip install -e . --no-cache-dir
 
 # Install server
 COPY proto proto
 COPY server server
+COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    /opt/miniconda/envs/text-generation/bin/pip install ".[bnb]" --no-cache-dir
+    pip install ".[bnb]" --no-cache-dir
 
 # Install router
 COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
@@ -86,7 +171,7 @@ RUN chmod +x entrypoint.sh
 
 ENTRYPOINT ["./entrypoint.sh"]
 
-# Original image
+# Final image
 FROM base
 
 ENTRYPOINT ["text-generation-launcher"]
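
The Dockerfile change replaces a single `nvidia/cuda` devel image, which compiled flash-attention and the transformers fork inside the final image, with a multi-stage build: the Rust `chef`/`builder` stages produce the router and launcher, `pytorch-install` sets up conda and PyTorch, `kernel-builder` adds the CUDA toolchain, `flash-att-builder` and `transformers-builder` compile the kernels, and a plain `ubuntu:22.04` base receives only the built artifacts. Each stage caches independently and can be built on its own; a sketch with illustrative tags (only the stage names come from the diff):

```sh
# Build a single stage for inspection; TARGETPLATFORM is set by buildx
docker buildx build --target pytorch-install -t tgi-debug:pytorch .
docker buildx build --target flash-att-builder -t tgi-debug:flash-att .

# Full build of the final image
docker buildx build --platform linux/amd64 -t text-generation-inference:dev .
```

Because the runtime image no longer derives from a CUDA devel image, the `LABEL com.nvidia.volumes.needed="nvidia_driver"` and the `NVIDIA_VISIBLE_DEVICES`/`NVIDIA_DRIVER_CAPABILITIES`/`LD_LIBRARY_PATH` settings are what let the NVIDIA container runtime mount the driver libraries at run time.
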
diff --git a/clients/python/tests/test_client.py b/clients/python/tests/test_client.py
index c998de41..d53c2a4d 100644
--- a/clients/python/tests/test_client.py
+++ b/clients/python/tests/test_client.py
@@ -17,7 +17,7 @@ def test_generate(flan_t5_xxl_url, hf_headers):
     assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None)
     assert len(response.details.tokens) == 1
     assert response.details.tokens[0] == Token(
-        id=3, text=" ", logprob=-1.984375, special=False
+        id=3, text="", logprob=-1.984375, special=False
     )
 
 
@@ -83,7 +83,7 @@ async def test_generate_async(flan_t5_xxl_url, hf_headers):
     assert response.details.prefill[0] == PrefillToken(id=0, text="", logprob=None)
     assert len(response.details.tokens) == 1
     assert response.details.tokens[0] == Token(
-        id=3, text=" ", logprob=-1.984375, special=False
+        id=3, text="", logprob=-1.984375, special=False
     )
 
 
diff --git a/server/Makefile-flash-att b/server/Makefile-flash-att
index 297fd9d0..ad894bfa 100644
--- a/server/Makefile-flash-att
+++ b/server/Makefile-flash-att
@@ -1,10 +1,16 @@
 flash_att_commit := d478eeec8f16c7939c54e4617dbd36f59b8eeed7
 
-install-flash-attention:
-	# Install specific version of flash attention
+flash-attention:
+	# Clone flash attention
 	pip install packaging
-	pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
-	rm -rf flash-attention || true
 	git clone https://github.com/HazyResearch/flash-attention.git
-	cd flash-attention && git checkout $(flash_att_commit)
+
+build-flash-attention: flash-attention
+	cd flash-attention && git fetch && git checkout $(flash_att_commit)
+	cd flash-attention && python setup.py build
+	cd flash-attention/csrc/rotary && python setup.py build
+	cd flash-attention/csrc/layer_norm && python setup.py build
+
+install-flash-attention: build-flash-attention
+	pip uninstall flash_attn rotary_emb dropout_layer_norm -y || true
 	cd flash-attention && python setup.py install && cd csrc/layer_norm && python setup.py install && cd ../rotary && python setup.py install
\ No newline at end of file
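
Splitting the old monolithic `install-flash-attention` target into `flash-attention` (clone), `build-flash-attention` (compile), and `install-flash-attention` (install) is what lets the `flash-att-builder` Docker stage stop after `make build-flash-attention`: the final image then copies the `build/` output straight into site-packages instead of running `setup.py install`. A sketch of the artifact layout those `COPY --from` lines depend on (directory names assume CPython 3.9 on x86_64, matching the conda env pinned above):

```sh
# As run in the flash-att-builder stage, where the file is COPYed in as "Makefile":
make build-flash-attention

# setuptools leaves the compiled extensions under build/:
#   flash-attention/build/lib.linux-x86_64-cpython-39/
#   flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39/
#   flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39/
# Copying these into /opt/conda/lib/python3.9/site-packages puts the
# modules on the import path without a separate install step.
```
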
diff --git a/server/Makefile-transformers b/server/Makefile-transformers
index 1e081336..692a9374 100644
--- a/server/Makefile-transformers
+++ b/server/Makefile-transformers
@@ -1,10 +1,13 @@
 transformers_commit := b8d969ff47c6a9d40538a6ea33df021953363afc
 
-install-transformers:
-	# Install specific version of transformers with custom cuda kernels
+transformers:
+	# Clone fork of transformers with custom CUDA kernels and sharding logic
 	pip install --upgrade setuptools
-	pip uninstall transformers -y || true
-	rm -rf transformers || true
 	git clone https://github.com/OlivierDehaene/transformers.git
-	cd transformers && git checkout $(transformers_commit)
+
+build-transformers: transformers
+	cd transformers && git fetch && git checkout $(transformers_commit) && python setup.py build
+
+install-transformers: build-transformers
+	pip uninstall transformers -y || true
 	cd transformers && python setup.py install
\ No newline at end of file
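
The transformers fork follows the same clone/build/install split. Note that neither Makefile removes a stale checkout any more (`rm -rf` was dropped); `git fetch && git checkout` re-pins an existing clone instead, so repeated builds reuse it. A sketch of the manual equivalent of what the `transformers-builder` stage and the final image do together (that `BUILD_EXTENSIONS` is consumed by the fork's `setup.py` is an assumption; the diff only shows the Dockerfile setting it):

```sh
# transformers-builder stage: clone the fork, pin the commit, compile kernels
git clone https://github.com/OlivierDehaene/transformers.git
cd transformers && git fetch && git checkout b8d969ff47c6a9d40538a6ea33df021953363afc
BUILD_EXTENSIONS="True" python setup.py build

# final image: the build output is copied over src/transformers, then the
# package is installed editable so the compiled extensions are importable
pip install -e . --no-cache-dir
```
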