Patch summary:
  * Dockerfile: raise the default PYTORCH_VERSION build argument from 2.1.1 to 2.3.0.
  * Dockerfile: set TORCH_CUDA_ARCH_LIST in the vllm-builder stage, ahead of copying the vllm Makefile.
  * Dockerfile: reword the einops install comment so it no longer mentions vllm.
  * Dockerfile: mark /tgi-entrypoint.sh executable right after it is copied into the final stage.
  * server/Makefile-vllm: check out vllm commit 3d4693536dcb69f036c26b016a35839b99ebed59
    in place of 4bec8cee87f6bb8cebaec297029713cd2082e0b2.

diff --git a/Dockerfile b/Dockerfile
index c2ae4c85..333f7c91 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -39,7 +39,7 @@ RUN cargo build --release
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install
 
-ARG PYTORCH_VERSION=2.1.1
+ARG PYTORCH_VERSION=2.3.0
 ARG PYTHON_VERSION=3.10
 # Keep in sync with `server/pyproject.toml
 ARG CUDA_VERSION=12.1
@@ -149,6 +149,8 @@ FROM kernel-builder as vllm-builder
 
 WORKDIR /usr/src
 
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
 COPY server/Makefile-vllm Makefile
 
 # Build specific version of vllm
@@ -210,7 +212,7 @@ COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/c
 COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
 COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
 
-# Install vllm/flash-attention dependencies
+# Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
 
 # Install server
@@ -246,6 +248,7 @@ ENTRYPOINT ["./entrypoint.sh"]
 FROM base
 
 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
+RUN chmod +x /tgi-entrypoint.sh
 
 ENTRYPOINT ["/tgi-entrypoint.sh"]
 CMD ["--json-output"]
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index ada484a6..9999ad89 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -4,7 +4,7 @@ vllm-cuda:
 	git clone https://github.com/OlivierDehaene/vllm.git vllm
 
 build-vllm-cuda: vllm-cuda
-	cd vllm && git fetch && git checkout 4bec8cee87f6bb8cebaec297029713cd2082e0b2
+	cd vllm && git fetch && git checkout 3d4693536dcb69f036c26b016a35839b99ebed59
 	cd vllm && python setup.py build
 
 install-vllm-cuda: build-vllm-cuda