parent
e12c34bd25
commit
35509ff5de
39
Dockerfile
39
Dockerfile
|
@ -37,13 +37,13 @@ RUN cargo build --release
|
|||
|
||||
# Python builder
|
||||
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
|
||||
FROM debian:bullseye-slim as pytorch-install
|
||||
FROM nvidia/cuda:12.1.0-devel-ubuntu20.04 as pytorch-install
|
||||
|
||||
ARG PYTORCH_VERSION=2.0.1
|
||||
ARG PYTHON_VERSION=3.9
|
||||
ARG PYTORCH_VERSION=2.1.1
|
||||
ARG PYTHON_VERSION=3.10
|
||||
# Keep in sync with `server/pyproject.toml`
|
||||
ARG CUDA_VERSION=11.8
|
||||
ARG MAMBA_VERSION=23.1.0-1
|
||||
ARG CUDA_VERSION=12.1
|
||||
ARG MAMBA_VERSION=23.3.1-1
|
||||
ARG CUDA_CHANNEL=nvidia
|
||||
ARG INSTALL_CHANNEL=pytorch
|
||||
# Automatically set by buildx
|
||||
|
@ -75,20 +75,19 @@ RUN chmod +x ~/mambaforge.sh && \
|
|||
RUN case ${TARGETPLATFORM} in \
|
||||
"linux/arm64") exit 1 ;; \
|
||||
*) /opt/conda/bin/conda update -y conda && \
|
||||
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" pytorch==$PYTORCH_VERSION "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
|
||||
/opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
|
||||
esac && \
|
||||
/opt/conda/bin/conda clean -ya
|
||||
|
||||
# CUDA kernels builder image
|
||||
FROM pytorch-install as kernel-builder
|
||||
|
||||
ARG MAX_JOBS=8
|
||||
|
||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||
ninja-build \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 && \
|
||||
/opt/conda/bin/conda clean -ya
|
||||
|
||||
# Build Flash Attention CUDA kernels
|
||||
FROM kernel-builder as flash-att-builder
|
||||
|
||||
|
@ -148,7 +147,7 @@ COPY server/Makefile-vllm Makefile
|
|||
RUN make build-vllm
|
||||
|
||||
# Text Generation Inference base image
|
||||
FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base
|
||||
FROM nvidia/cuda:12.1.0-base-ubuntu20.04 as base
|
||||
|
||||
# Conda env
|
||||
ENV PATH=/opt/conda/bin:$PATH \
|
||||
|
@ -172,24 +171,24 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
|
|||
COPY --from=pytorch-install /opt/conda /opt/conda
|
||||
|
||||
# Copy build artifacts from flash attention builder
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Copy build artifacts from flash attention v2 builder
|
||||
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Copy build artifacts from custom kernels builder
|
||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from exllama kernels builder
|
||||
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from awq kernels builder
|
||||
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from eetq kernels builder
|
||||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Copy build artifacts from vllm builder
|
||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Install flash-attention dependencies
|
||||
RUN pip install einops --no-cache-dir
|
||||
|
@ -201,7 +200,7 @@ COPY server/Makefile server/Makefile
|
|||
RUN cd server && \
|
||||
make gen-server && \
|
||||
pip install -r requirements.txt && \
|
||||
pip install ".[bnb, accelerate, quantize]" --no-cache-dir
|
||||
pip install ".[bnb, accelerate, quantize, peft]" --no-cache-dir
|
||||
|
||||
# Install benchmarker
|
||||
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
|
||||
|
|
|
@ -318,6 +318,7 @@ def launcher(event_loop):
|
|||
],
|
||||
volumes=volumes,
|
||||
ports={"80/tcp": port},
|
||||
shm_size="1G"
|
||||
)
|
||||
|
||||
yield ContainerLauncherHandle(client, container.name, port)
|
||||
|
|
|
@ -16,17 +16,13 @@ gen-server:
|
|||
find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
|
||||
touch text_generation_server/pb/__init__.py
|
||||
|
||||
install-torch:
|
||||
# Install specific version of torch
|
||||
pip install torch --extra-index-url https://download.pytorch.org/whl/cu118 --no-cache-dir
|
||||
|
||||
install: gen-server install-torch
|
||||
install: gen-server
|
||||
pip install pip --upgrade
|
||||
pip install -r requirements.txt
|
||||
pip install -e ".[bnb, accelerate]"
|
||||
pip install -e ".[bnb, accelerate, quantize, peft]"
|
||||
|
||||
run-dev:
|
||||
SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
|
||||
|
||||
export-requirements:
|
||||
poetry export -o requirements.txt -E bnb -E quantize --without-hashes
|
||||
poetry export -o requirements.txt -E bnb --without-hashes
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -30,14 +30,16 @@ transformers = "^4.32.1"
|
|||
einops = "^0.6.1"
|
||||
texttable = { version = "^1.6.7", optional = true }
|
||||
datasets = { version = "^2.14.0", optional = true }
|
||||
peft = "^0.4.0"
|
||||
torch = { version = "^2.0.1" }
|
||||
peft = { version = "^0.4.0", optional = true }
|
||||
torch = { version = "^2.1.1", optional = true }
|
||||
scipy = "^1.11.1"
|
||||
pillow = "^10.0.0"
|
||||
|
||||
[tool.poetry.extras]
|
||||
torch = ["torch"]
|
||||
accelerate = ["accelerate"]
|
||||
bnb = ["bitsandbytes"]
|
||||
peft = ["peft"]
|
||||
quantize = ["texttable", "datasets", "accelerate"]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
@ -47,7 +49,7 @@ pytest = "^7.3.0"
|
|||
|
||||
[[tool.poetry.source]]
|
||||
name = "pytorch-gpu-src"
|
||||
url = "https://download.pytorch.org/whl/cu118"
|
||||
url = "https://download.pytorch.org/whl/cu121"
|
||||
priority = "explicit"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
|
|
|
@ -1,38 +1,23 @@
|
|||
accelerate==0.20.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
aiohttp==3.8.5 ; python_version >= "3.9" and python_version < "3.13"
|
||||
aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
async-timeout==4.0.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
attrs==23.1.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
backoff==2.2.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
bitsandbytes==0.41.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
certifi==2023.7.22 ; python_version >= "3.9" and python_version < "3.13"
|
||||
charset-normalizer==3.2.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
bitsandbytes==0.41.2.post2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
certifi==2023.11.17 ; python_version >= "3.9" and python_version < "3.13"
|
||||
charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
click==8.1.7 ; python_version >= "3.9" and python_version < "3.13"
|
||||
colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows")
|
||||
datasets==2.14.5 ; python_version >= "3.9" and python_version < "3.13"
|
||||
deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13"
|
||||
dill==0.3.7 ; python_version >= "3.9" and python_version < "3.13"
|
||||
einops==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
filelock==3.12.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
googleapis-common-protos==1.60.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpc-interceptor==0.15.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpcio-reflection==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpcio-status==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpcio==1.58.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
hf-transfer==0.1.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
filelock==3.13.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
fsspec==2023.10.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
googleapis-common-protos==1.61.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpcio-reflection==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpcio-status==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
grpcio==1.59.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
hf-transfer==0.1.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
huggingface-hub==0.16.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
idna==3.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
jinja2==3.1.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
loguru==0.6.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
markupsafe==2.1.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
mpmath==1.3.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
multidict==6.0.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "3.13"
|
||||
networkx==3.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
numpy==1.26.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
numpy==1.26.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
|
@ -42,34 +27,21 @@ opentelemetry-instrumentation==0.36b0 ; python_version >= "3.9" and python_versi
|
|||
opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
packaging==23.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
pandas==2.1.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
peft==0.4.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
pillow==10.0.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
protobuf==4.24.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
psutil==5.9.5 ; python_version >= "3.9" and python_version < "3.13"
|
||||
pyarrow==13.0.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
pytz==2023.3.post1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
packaging==23.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
pillow==10.1.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
protobuf==4.25.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
regex==2023.8.8 ; python_version >= "3.9" and python_version < "3.13"
|
||||
regex==2023.10.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
requests==2.31.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
safetensors==0.3.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
scipy==1.11.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
scipy==1.11.4 ; python_version >= "3.9" and python_version < "3.13"
|
||||
sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "3.13"
|
||||
setuptools==68.2.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
six==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
sympy==1.12 ; python_version >= "3.9" and python_version < "3.13"
|
||||
texttable==1.6.7 ; python_version >= "3.9" and python_version < "3.13"
|
||||
setuptools==69.0.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
torch==2.0.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
tqdm==4.66.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
transformers==4.33.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
transformers==4.33.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
|
||||
typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
tzdata==2023.3 ; python_version >= "3.9" and python_version < "3.13"
|
||||
urllib3==2.0.5 ; python_version >= "3.9" and python_version < "3.13"
|
||||
urllib3==2.1.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "3.13" and sys_platform == "win32"
|
||||
wrapt==1.15.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
xxhash==3.3.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
yarl==1.9.2 ; python_version >= "3.9" and python_version < "3.13"
|
||||
wrapt==1.16.0 ; python_version >= "3.9" and python_version < "3.13"
|
||||
|
|
Loading…
Reference in New Issue