Merge branch 'master' into cluster
parent 28c250385d
commit ee44371fdf

README.md (deleted):
@ -1,15 +0,0 @@
**A Docker container for running VLLM on Paperspace Gradient notebooks.**

1. Run `jupyter server --generate-config` and `jupyter server password` on your local machine, then copy Jupyter's config directory to `./jupyter`
2. Place your Rathole client config at `./rathole-client.toml`
3. `docker build . -t "paperspace-vllm"`

To test on your local machine, run this command:

```bash
docker run --shm-size 14g --gpus all \
    -v /storage/models/awq/MythoMax-L2-13B-AWQ:/models/MythoMax-L2-13B-AWQ \
    -p 7000:7000 -p 8888:8888 \
    -e API_SERVER_ARGS="--model /models/MythoMax-L2-13B-AWQ --quantization awq --max-num-batched-tokens 99999 --gpu-memory-utilization 1" \
    vllm-cloud
```

Dockerfile:
@ -1,87 +1,50 @@
-FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as build
+FROM cyberes/vllm-paperspace-base as runtime
-
-RUN apt-get update && \
-    apt-get install -y git python3-pip python3-venv wget unzip && \
-    rm -rf /var/lib/apt/lists/*
-RUN pip3 install --upgrade pip setuptools wheel
-
-RUN git clone https://git.evulid.cc/cyberes/local-llm-server.git /local-llm-server
-
-WORKDIR /local-llm-server
-
-RUN python3 -m venv /venv
-RUN /venv/bin/pip install git+https://github.com/vllm-project/vllm
-
-RUN python3 -m venv /jupyterlab
-RUN /jupyterlab/bin/pip install jupyterlab
-RUN /jupyterlab/bin/jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
-
-RUN mkdir -p /app
-RUN wget https://github.com/rapiz1/rathole/releases/download/v0.4.8/rathole-x86_64-unknown-linux-gnu.zip -O /tmp/rathole.zip
-RUN unzip -j /tmp/rathole.zip -d /tmp
-RUN rm /tmp/rathole.zip
-RUN cp /tmp/rathole /app
-
-# The local local-llm-server repo may be cached, so we will fetch and reset to the remote every time.
-# Also, make sure there weren't any pip deps added.
-ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
-RUN git fetch; git reset --hard origin/master
-RUN /venv/bin/pip install -r requirements.txt
-
-FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as runtime
-
-RUN apt-get update && apt-get install -y supervisor && rm -rf /var/lib/apt/lists/*

 RUN useradd -ms /bin/bash apiserver
 RUN usermod -s /bin/bash root

+# Required packages
 RUN apt-get update && \
-    apt-get install -y python3 python3-pip wget aria2 git-lfs git openssh-server openssh-client nano tmux file && \
+    apt-get install -y python3 python3-pip supervisor && \
     rm -rf /var/lib/apt/lists/*
-RUN pip3 install --upgrade pip setuptools wheel
-RUN pip3 install glances
+RUN pip3 install --upgrade pip setuptools wheel
+
+# Useful Python packages
+RUN pip3 install glances
+
+# Useful tools
+RUN apt-get update && \
+    apt-get install -y wget aria2 git-lfs git openssh-server openssh-client nano tmux file && \
+    rm -rf /var/lib/apt/lists/*
+
+# Update the git repo
+RUN cd /local-llm-server && git reset --hard && git pull

 # Enable root SSH login
 RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config

 # Disable password SSH login
 RUN sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config

-# Create the necessary directory for SSH
+# Create the necessary directory for sshd
 RUN mkdir /var/run/sshd

-ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
-COPY --from=build /local-llm-server /local-llm-server
-COPY --from=build /venv /venv
-COPY --from=build /app /app
-COPY --from=build /jupyterlab /jupyterlab
-
-RUN cp /local-llm-server/other/vllm/Docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
-RUN cp /local-llm-server/other/vllm/Docker/start-vllm.sh /app/start-vllm.sh
-RUN cp /local-llm-server/other/vllm/Docker/start-container.sh /app/start.sh
-
-# Copy your secrets in
-# COPY ./jupyter /app/jupyter
+COPY supervisord.conf /etc/supervisor/supervisord.conf
+COPY start-vllm.sh /app/start-vllm.sh
+COPY init-container.sh /app/init.sh
+COPY start-container.sh /app/start.sh

 RUN mkdir -p /var/log/app/

 RUN chown -R apiserver:apiserver /local-llm-server && \
     chown -R apiserver:apiserver /app && \
     chown -R apiserver:apiserver /var/log/app/
+RUN git config --global --add safe.directory /local-llm-server

+RUN chmod +x /app/init.sh
 RUN chmod +x /app/start.sh

 ENV SHELL="/bin/bash"

-# SSH
-EXPOSE 22
-
-# VLLM
-EXPOSE 7000
-
-# Jupyter
+# Expose Jupyter. We don't need to expose VLLM or SSH since rathole will tunnel those.
 EXPOSE 8888

 CMD /app/start.sh
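
One change worth noting: the deleted build stage used `ADD` of a random.org URL as a cache-buster. Because the fetched bytes differ on every build, that `ADD` layer never hits Docker's cache, so the `git fetch`/`git reset` after it always re-runs. A sketch of the same idiom using a build arg instead of a network fetch (the `CACHEBUST` name is illustrative, not from this repo):

```bash
# In the Dockerfile:
#   ARG CACHEBUST=0
#   RUN echo "$CACHEBUST" && git fetch && git reset --hard origin/master
# Passing a fresh value invalidates the cache for that layer and everything after it:
sudo docker build . -t paperspace-vllm --build-arg CACHEBUST="$(date +%s)"
```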

Dockerfile.base (new file):
@ -0,0 +1,43 @@
# This container builds and assembles the Python parts of the Docker container.
# It is used as the base for the resulting container, which avoids having to re-push
# the large PyTorch parts every time the application is rebuilt.

FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as build

RUN apt-get update && \
    apt-get install -y git python3-pip python3-venv wget unzip && \
    rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip setuptools wheel

RUN git clone https://git.evulid.cc/cyberes/local-llm-server.git /local-llm-server

RUN python3 -m venv /jupyterlab
RUN /jupyterlab/bin/pip install jupyterlab
RUN /jupyterlab/bin/jupyter labextension disable "@jupyterlab/apputils-extension:announcements"

RUN mkdir -p /app
RUN wget https://github.com/rapiz1/rathole/releases/download/v0.4.8/rathole-x86_64-unknown-linux-gnu.zip -O /tmp/rathole.zip
RUN unzip -j /tmp/rathole.zip -d /tmp
RUN rm /tmp/rathole.zip
RUN cp /tmp/rathole /app

RUN python3 -m venv /venv
RUN /venv/bin/pip3 install --upgrade pip setuptools wheel

# Install PyTorch before installing VLLM to ensure we use the right version for our CUDA install.
RUN wget -q -O - https://raw.githubusercontent.com/vllm-project/vllm/main/requirements.txt | grep -E '^torch' > /tmp/torch_version
RUN /venv/bin/pip3 install "$(cat /tmp/torch_version)" --index-url https://download.pytorch.org/whl/cu118

# WORKDIR /local-llm-server

# Don't build VLLM because we don't do that on the inference server. Just install from pip.
# RUN /venv/bin/pip install git+https://github.com/vllm-project/vllm
RUN /venv/bin/pip install vllm

FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as base

COPY --from=build /local-llm-server /local-llm-server
COPY --from=build /venv /venv
COPY --from=build /app /app
COPY --from=build /jupyterlab /jupyterlab
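
The torch pin above matters because installing `vllm` from PyPI would otherwise pull in whatever torch wheel pip prefers, which may not match the image's CUDA 11.8. A quick way to see what will be pinned, assuming vllm's `requirements.txt` still lists torch on its own line (the version shown is only an example):

```bash
# Extract the torch requirement exactly as the Dockerfile does...
wget -q -O - https://raw.githubusercontent.com/vllm-project/vllm/main/requirements.txt | grep -E '^torch'
# ...then pip resolves that version against the CUDA 11.8 wheel index, e.g.:
#   /venv/bin/pip3 install "torch==2.0.1" --index-url https://download.pytorch.org/whl/cu118
```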

README.md (new):
@ -0,0 +1,47 @@
**A Docker container for running VLLM on Paperspace Gradient notebooks.**

### Running

1. In Paperspace, create a new notebook.
2. Click `Start from Scratch`.
3. Select your GPU and set the auto-shutdown timeout to 6 hours.
4. Click the `View Advanced Options` button at the bottom of the page and enter these details in the form that appears:
    - Container Name: `cyberes/vllm-paperspace:latest`
    - Container Command: `/app/start.sh`
5. Start the notebook. It may take up to five minutes for Paperspace to pull and start the custom image.
6. Once the container has started, open the log viewer by clicking the icon in the bottom left of the screen. You should see errors from rathole and VLLM as a result of the blank config files. The container will create a new directory in your mounted storage: `/storage/vllm/`.
7. Enter your rathole client config in `/storage/vllm/rathole-client.toml` (a sample sketch follows this list). If you need a visual text editor, first link the directory back to the Jupyter home: `ln -s /storage/vllm /notebooks`
8. Restart rathole with `supervisorctl restart rathole` and then view the log: `tail -f /var/log/app/rathole.log`. If you see lines that start with `INFO` and end with `Control channel established`, rathole has connected and is working. Error messages will begin with `ERROR`.
9. Download an AWQ quantization from [TheBloke](https://huggingface.co/TheBloke) to `/storage/vllm/models/`.
10. Enter your VLLM commandline args in `/storage/vllm/cmd.txt` (see the example after this list). You need to set `--model` to the path of the model you want to load.
11. Restart VLLM with `supervisorctl restart vllm` and then view the log: `tail -f /var/log/app/vllm.log`. It may take up to three minutes to load. When you see the line:

    ```
    INFO: Uvicorn running on http://0.0.0.0:7000 (Press CTRL+C to quit)
    ```

    VLLM is running and ready for queries.
12. In `/notebooks` (the home directory of Jupyter), the notebook `idle.ipynb` will automatically be created. Run this notebook so Paperspace does not shut down your machine due to "inactivity". You **must** keep the running notebook open in a browser tab.
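
A minimal sketch of a rathole client config for step 7. The server address, service name, and token below are placeholders and must match your rathole server's own config:

```bash
# Hypothetical values -- adjust remote_addr and token to your rathole server.
cat > /storage/vllm/rathole-client.toml <<'EOF'
[client]
remote_addr = "tunnel.example.com:2333"

[client.services.vllm]
token = "replace_with_a_long_random_token"
local_addr = "127.0.0.1:7000"
EOF
```

And an example `cmd.txt` for step 10, modeled on the default the init script writes (the model path is whatever you downloaded in step 9):

```bash
echo "--model /storage/vllm/models/MythoMax-L2-13B-AWQ --quantization awq --max-num-batched-tokens 4098" > /storage/vllm/cmd.txt
```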

### Building

You **must** have a GPU attached to your system when building the container (required for building VLLM).

1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and CUDA 11.8.
2. `bash build-docker.sh`

To run the container on your local machine:

```bash
sudo docker run -it --shm-size 14g --gpus all -v /home/user/testing123/notebooks:/notebooks -v /home/user/testing123/storage:/storage -p 8888:8888 cyberes/vllm-paperspace:latest
```

You will need to create a directory to mount inside the container (for example: `/home/user/testing123/`). Within it should be a `models` folder holding the model to load, plus `rathole-client.toml` and `cmd.txt`; a sketch of the layout follows.
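
A sketch of that layout on the host, assuming the files live under `storage/vllm/` where the init script looks for them (the model name is illustrative):

```bash
mkdir -p /home/user/testing123/notebooks
mkdir -p /home/user/testing123/storage/vllm/models      # put your AWQ model dir here
touch /home/user/testing123/storage/vllm/rathole-client.toml
echo "--model /storage/vllm/models/your-model --quantization awq" \
    > /home/user/testing123/storage/vllm/cmd.txt        # model path as seen in-container
```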

If you need to debug something, you can start a shell inside the container:

```bash
sudo docker run -it --shm-size 14g --gpus all -v /home/user/testing123/notebooks:/notebooks -v /home/user/testing123/storage:/storage -p 8888:8888 --entrypoint bash cyberes/vllm-paperspace:latest
```

build-docker.sh (new file):
@ -0,0 +1,7 @@
#!/bin/bash

# Build and push the container.

git pull || exit
sudo docker build . -f Dockerfile.base -t cyberes/vllm-paperspace-base --no-cache && sudo docker push cyberes/vllm-paperspace-base:latest || exit
sudo docker build . -t cyberes/vllm-paperspace && sudo docker push cyberes/vllm-paperspace:latest
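
The two builds are the point of the base/app split described in Dockerfile.base: once the base image (torch, the venv, rathole) has been pushed, day-to-day rebuilds only need the second, fast build. A sketch of that fast path when only the app files changed:

```bash
# Rebuild and push just the app image; the heavy base layers are reused as-is.
sudo docker build . -t cyberes/vllm-paperspace
sudo docker push cyberes/vllm-paperspace:latest
```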

idle.ipynb (new file):
@ -0,0 +1,40 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49ae6555-572b-4463-ba01-cc4331932a6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "i = 0\n",
    "while True:\n",
    "    print(i)\n",
    "    i += 1\n",
    "    time.sleep(1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

init-container.sh (new file):
@ -0,0 +1,22 @@
#!/bin/bash

# Create the required directories and files.
echo "SETTING UP FILE SYSTEM..."
mkdir -p /storage/vllm/
chown -R apiserver:apiserver /storage/vllm
touch /storage/vllm/rathole-client.toml

# The user can store SSH auth and authorized_keys to streamline SSH login.
if [ -e /storage/vllm/ssh ]; then    # -e, not -f: the saved ssh data is a directory
    cp -r /storage/vllm/ssh /root/.ssh
    echo "Copied ssh from /storage"
fi

# If the user has not created the VLLM commandline arg file, create the default.
# (Write it only when missing -- touching it first would stop this check from ever firing.)
if [ ! -f /storage/vllm/cmd.txt ]; then
    echo "--max-num-batched-tokens 4098 --quantization awq --model /storage/vllm/models/model-path" >/storage/vllm/cmd.txt
fi

# Copy the idling notebook to storage. This will create a blank notebook every time the container is started.
cp /local-llm-server/other/vllm/Docker/idle.ipynb /notebooks/idle.ipynb

start-container.sh:
@ -1,13 +1,4 @@
 #!/bin/bash

-mkdir -p /storage/vllm/
-chown -R apiserver:apiserver /storage/vllm
-touch /storage/vllm/cmd.txt
-touch /storage/vllm/rathole-client.toml
-
-if [ -f /storage/vllm/ssh ]; then
-    cp -r /storage/vllm/ssh /root/.ssh
-    echo "Copied ssh from /storage"
-fi
-
-/usr/bin/supervisord
+# Start the services and launch the container.
+/usr/bin/supervisord -c /etc/supervisor/supervisord.conf

start-vllm.sh:
@ -6,9 +6,4 @@ for pid in $vllm_pid; do
     kill -9 $pid
 done

-cd /local-llm-server
-git fetch
-git reset --hard origin/master
-/venv/bin/pip install -r requirements.txt
-
 /venv/bin/python /local-llm-server/other/vllm/vllm_api_server.py --host 0.0.0.0 --port 7000 --max-log-len 100 $(cat /storage/vllm/cmd.txt)

supervisord.conf:
@ -1,5 +1,25 @@
 [supervisord]
-nodaemon=true
+nodaemon = true
+user=root
+pidfile = /var/run/supervisord.pid
+logfile = /var/log/app/supervisord.log
+directory = /tmp
+
+[unix_http_server]
+file=/var/run/supervisor.sock
+chmod=0770
+
+[rpcinterface:supervisor]
+supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
+
+[supervisorctl]
+serverurl=unix:///var/run/supervisor.sock
+
+[program:startup]
+command=/app/init.sh
+autostart=true
+autorestart=false
+startsecs=0

 [program:vllm]
 command=/bin/bash -c 'bash /app/start-vllm.sh 2>&1 | tee -a /var/log/app/vllm.log'

@ -24,9 +44,20 @@ user=apiserver
 environment=HOME="/home/apiserver",USER="apiserver"

 [program:jupyter]
-command=/jupyterlab/bin/jupyter lab --allow-root --ip=0.0.0.0 --no-browser --ServerApp.trust_xheaders=True --ServerApp.disable_check_xsrf=False --ServerApp.allow_remote_access=True --ServerApp.allow_origin='*' --ServerApp.allow_credentials=True
+command=/jupyterlab/bin/jupyter lab --allow-root --ip=0.0.0.0 --no-browser --ServerApp.trust_xheaders=True --ServerApp.disable_check_xsrf=False --ServerApp.allow_remote_access=True --ServerApp.allow_origin='*' --ServerApp.allow_credentials=True --notebook-dir /notebooks
 environment=SHELL="/bin/bash"
-; JUPYTER_CONFIG_DIR="/app/jupyter"
+autostart=true
+autorestart=true
+stdout_logfile=/dev/fd/1
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/fd/2
+stderr_logfile_maxbytes=0

 [program:ssh]
 command=/usr/sbin/sshd -D
+autostart=true
+autorestart=true
+stdout_logfile=/dev/fd/1
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/fd/2
+stderr_logfile_maxbytes=0
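
The new `[unix_http_server]`, `[rpcinterface:supervisor]`, and `[supervisorctl]` sections are what make the `supervisorctl restart ...` commands in the README work: they expose supervisord's control API over the local socket declared above. A sketch of the resulting workflow inside the container:

```bash
supervisorctl status            # list the configured programs and their states
supervisorctl restart vllm      # start-vllm.sh re-reads /storage/vllm/cmd.txt
supervisorctl tail -f vllm      # or: tail -f /var/log/app/vllm.log
```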

update script (new file):
@ -0,0 +1,11 @@
#!/bin/bash

# Run this script to update the container.
# It will restart VLLM as well.

cd /local-llm-server || exit

git fetch
git reset --hard origin/master

supervisorctl restart vllm