Merge branch 'master' into cluster

Cyberes 2023-10-27 19:05:27 -06:00
parent 28c250385d
commit ee44371fdf
11 changed files with 228 additions and 93 deletions

View File

@@ -1,15 +0,0 @@
**A Docker container for running VLLM on Paperspace Gradient notebooks.**
1. Run `jupyter server --generate-config` and `jupyter server password` on your local machine, then copy Jupyter's config directory to `./jupyter`
2. Place your Rathole client config at `./rathole-client.toml`
3. `docker build . -t "paperspace-vllm"`
To test on your local machine, run this command:
```bash
docker run --shm-size 14g --gpus all \
-v /storage/models/awq/MythoMax-L2-13B-AWQ:/models/MythoMax-L2-13B-AWQ \
-p 7000:7000 -p 8888:8888 \
-e API_SERVER_ARGS="--model /models/MythoMax-L2-13B-AWQ --quantization awq --max-num-batched-tokens 99999 --gpu-memory-utilization 1" \
vllm-cloud
```

View File

@@ -1,87 +1,50 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as build
RUN apt-get update && \
apt-get install -y git python3-pip python3-venv wget unzip && \
rm -rf /var/lib/apt/lists/*
RUN pip3 install --upgrade pip setuptools wheel
RUN git clone https://git.evulid.cc/cyberes/local-llm-server.git /local-llm-server
WORKDIR /local-llm-server
RUN python3 -m venv /venv
RUN /venv/bin/pip install git+https://github.com/vllm-project/vllm
RUN python3 -m venv /jupyterlab
RUN /jupyterlab/bin/pip install jupyterlab
RUN /jupyterlab/bin/jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
RUN mkdir -p /app
RUN wget https://github.com/rapiz1/rathole/releases/download/v0.4.8/rathole-x86_64-unknown-linux-gnu.zip -O /tmp/rathole.zip
RUN unzip -j /tmp/rathole.zip -d /tmp
RUN rm /tmp/rathole.zip
RUN cp /tmp/rathole /app
# The local clone of local-llm-server may be cached, so bust the cache below and fetch and reset to the remote every time.
# Also reinstall requirements in case any pip dependencies were added.
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
RUN git fetch; git reset --hard origin/master
RUN /venv/bin/pip install -r requirements.txt
FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as runtime
RUN apt-get update && apt-get install -y supervisor && rm -rf /var/lib/apt/lists/*
FROM cyberes/vllm-paperspace-base as runtime
RUN useradd -ms /bin/bash apiserver
RUN usermod -s /bin/bash root
# Required packages
RUN apt-get update && \
apt-get install -y python3 python3-pip wget aria2 git-lfs git openssh-server openssh-client nano tmux file && \
apt-get install -y python3 python3-pip supervisor && \
rm -rf /var/lib/apt/lists/*
RUN pip3 install --upgrade pip setuptools wheel
# Useful Python packages
RUN pip3 install glances
# Useful tools
RUN apt-get update && \
apt-get install -y wget aria2 git-lfs git openssh-server openssh-client nano tmux file && \
rm -rf /var/lib/apt/lists/*
RUN pip3 install --upgrade pip setuptools wheel
RUN pip3 install glances
# Update the git repo
RUN cd /local-llm-server && git reset --hard && git pull
# Enable root SSH login
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
# Disable password SSH login
RUN sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
# Create the necessary directory for SSH
# Create the necessary directory for sshd
RUN mkdir /var/run/sshd
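# Cache-bust so the COPY layers below always pick up freshly built stages.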
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
COPY --from=build /local-llm-server /local-llm-server
COPY --from=build /venv /venv
COPY --from=build /app /app
COPY --from=build /jupyterlab /jupyterlab
RUN cp /local-llm-server/other/vllm/Docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
RUN cp /local-llm-server/other/vllm/Docker/start-vllm.sh /app/start-vllm.sh
RUN cp /local-llm-server/other/vllm/Docker/start-container.sh /app/start.sh
# Copy your secrets in
# COPY ./jupyter /app/jupyter
COPY supervisord.conf /etc/supervisor/supervisord.conf
COPY start-vllm.sh /app/start-vllm.sh
COPY init-container.sh /app/init.sh
COPY start-container.sh /app/start.sh
RUN mkdir -p /var/log/app/
RUN chown -R apiserver:apiserver /local-llm-server && \
chown -R apiserver:apiserver /app && \
chown -R apiserver:apiserver /var/log/app/
RUN git config --global --add safe.directory /local-llm-server
RUN chmod +x /app/init.sh
RUN chmod +x /app/start.sh
ENV SHELL="/bin/bash"
# SSH
EXPOSE 22
# VLLM
EXPOSE 7000
# Jupyter
# Expose Jupyter. We don't need to expose VLLM or SSH since rathole will tunnel those.
EXPOSE 8888
CMD /app/start.sh

View File

@@ -0,0 +1,43 @@
# This image builds and assembles the Python parts of the Docker container.
# It is used as the base for the resulting container, which avoids having to re-push
# the large PyTorch parts every time the application is rebuilt.
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as build
RUN apt-get update && \
apt-get install -y git python3-pip python3-venv wget unzip && \
rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip setuptools wheel
RUN git clone https://git.evulid.cc/cyberes/local-llm-server.git /local-llm-server
RUN python3 -m venv /jupyterlab
RUN /jupyterlab/bin/pip install jupyterlab
RUN /jupyterlab/bin/jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
RUN mkdir -p /app
RUN wget https://github.com/rapiz1/rathole/releases/download/v0.4.8/rathole-x86_64-unknown-linux-gnu.zip -O /tmp/rathole.zip
RUN unzip -j /tmp/rathole.zip -d /tmp
RUN rm /tmp/rathole.zip
RUN cp /tmp/rathole /app
RUN python3 -m venv /venv
RUN /venv/bin/pip3 install --upgrade pip setuptools wheel
# Install PyTorch before installing VLLM to ensure we use the right version for our CUDA install.
RUN wget -q -O - https://raw.githubusercontent.com/vllm-project/vllm/main/requirements.txt | grep -E '^torch' > /tmp/torch_version
RUN /venv/bin/pip3 install "$(cat /tmp/torch_version)" --index-url https://download.pytorch.org/whl/cu118
# WORKDIR /local-llm-server
# Don't build VLLM from source since we don't do that on the inference server. Just install it from pip.
# RUN /venv/bin/pip install git+https://github.com/vllm-project/vllm
RUN /venv/bin/pip install vllm
FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as base
COPY --from=build /local-llm-server /local-llm-server
COPY --from=build /venv /venv
COPY --from=build /app /app
COPY --from=build /jupyterlab /jupyterlab

View File

@@ -0,0 +1,47 @@
**A Docker container for running VLLM on Paperspace Gradient notebooks.**
### Running
1. In Paperspace, create a new notebook.
2. Click `Start from Scratch`.
3. Select your GPU and set the auto-shutdown timeout to 6 hours.
4. Click the `View Advanced Options` button at the bottom of the page. Enter these details in the form that appears:
- Container Name: `cyberes/vllm-paperspace:latest`
- Container Command: `/app/start.sh`
5. Start the notebook. It may take up to five minutes for Paperspace to pull and start the custom image.
6. Once the container has started, open the log viewer by clicking the icon in the bottom left of the screen. You should see errors from rathole and VLLM as a result of the blank config files. The container will create a new directory in your mounted storage: `/storage/vllm/`.
7. Enter your rathole client config in `/storage/vllm/rathole-client.toml` (a sample config is sketched after this list). If you need a visual text editor, first link the directory back to the Jupyter home: `ln -s /storage/vllm /notebooks`
8. Restart rathole with `supervisorctl restart rathole` and then view the log: `tail -f /var/log/app/rathole.log`. If you see lines that start with `INFO` and end with `Control channel established`, rathole has connected and is working. Error messages will begin with `ERROR`.
9. Download an AWQ quantization from [TheBloke](https://huggingface.co/TheBloke) to `/storage/vllm/models/`.
10. Enter your VLLM commandline args in `/storage/vllm/cmd.txt`. You need to set `--model` to the path of the model you want to load (steps 9 and 10 are combined in an example after this list).
11. Restart VLLM with `supervisorctl restart vllm` and then view the log: `tail -f /var/log/app/vllm.log`. It may take up to three minutes to load. When you see the line:
```
INFO: Uvicorn running on http://0.0.0.0:7000 (Press CTRL+C to quit)
```
       VLLM is running and ready for queries.
12. In `/notebooks` (the home directory of Jupyter), the notebook `idle.ipynb` will automatically be created. Run this notebook so Paperspace does not shut down your machine due to "inactivity". You **must** keep the running notebook open in a browser tab.
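For step 7, a minimal sketch of a rathole client config, written via a heredoc so it can be pasted straight into a terminal. The server address, token, and service names here are hypothetical; they must match whatever your rathole server defines:

```bash
# Minimal sketch, assuming a hypothetical tunnel server and token.
cat > /storage/vllm/rathole-client.toml <<'EOF'
[client]
remote_addr = "tunnel.example.com:2333"

[client.services.vllm]
token = "use_a_secret_only_you_know"
local_addr = "127.0.0.1:7000"

[client.services.ssh]
token = "use_a_secret_only_you_know"
local_addr = "127.0.0.1:22"
EOF
```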
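For steps 9 and 10, a sketch using git-lfs (installed in the container), with MythoMax-L2-13B-AWQ standing in for whichever model you actually want; the flags mirror the default that the init script writes:

```bash
# Fetch an AWQ quantization into the models directory.
cd /storage/vllm/models
git lfs install
git clone https://huggingface.co/TheBloke/MythoMax-L2-13B-AWQ
# Point VLLM at it.
echo "--max-num-batched-tokens 4098 --quantization awq --model /storage/vllm/models/MythoMax-L2-13B-AWQ" > /storage/vllm/cmd.txt
```

Afterwards, `supervisorctl restart vllm` picks up the new arguments (step 11).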
### Building
You **must** have a GPU attached to your system when building the container (required for building VLLM).
1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and CUDA 11.8 (a quick sanity check follows this list).
2. `bash build-docker.sh`
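Before building, you can confirm the toolkit and GPU passthrough work by running `nvidia-smi` in the same CUDA 11.8 base image the Dockerfiles use:

```bash
sudo docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi
```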
To run the container on your local machine:
```bash
sudo docker run -it --shm-size 14g --gpus all -v /home/user/testing123/notebooks:/notebooks -v /home/user/testing123/storage:/storage -p 8888:8888 cyberes/vllm-paperspace:latest
```
You will need to create a directory to mount inside the container (for example: `/home/user/testing123/`). Its `storage/vllm/` subdirectory should contain the `models` folder that holds the model to load, plus `rathole-client.toml` and `cmd.txt`, as sketched below.
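A sketch of that layout, assuming the example path above (the container's init script will also create any missing files on first start):

```bash
mkdir -p /home/user/testing123/notebooks
mkdir -p /home/user/testing123/storage/vllm/models
touch /home/user/testing123/storage/vllm/rathole-client.toml
touch /home/user/testing123/storage/vllm/cmd.txt
```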
If you need to debug something, you can start a shell inside the container:
```bash
sudo docker run -it --shm-size 14g --gpus all -v /home/user/testing123/notebooks:/notebooks -v /home/user/testing123/storage:/storage -p 8888:8888 --entrypoint bash cyberes/vllm-paperspace:latest
```

View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Build and push the container.
git pull || exit
sudo docker build . -f Dockerfile.base -t cyberes/vllm-paperspace-base --no-cache && sudo docker push cyberes/vllm-paperspace-base:latest || exit
sudo docker build . -t cyberes/vllm-paperspace && sudo docker push cyberes/vllm-paperspace:latest

View File

@@ -0,0 +1,40 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "49ae6555-572b-4463-ba01-cc4331932a6c",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"i = 0\n",
"while True:\n",
" print(i)\n",
" i += 1\n",
" time.sleep(1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,22 @@
#!/bin/bash
# Create the required directories and files.
echo "SETTING UP FILE SYSTEM..."
mkdir -p /storage/vllm/
chown -R apiserver:apiserver /storage/vllm
touch /storage/vllm/rathole-client.toml
# The user can store SSH auth and authorized_keys to streamline SSH login.
# /storage/vllm/ssh is copied recursively, so test for a directory.
if [ -d /storage/vllm/ssh ]; then
    cp -r /storage/vllm/ssh /root/.ssh
    echo "Copied ssh from /storage"
fi
# If the user has not created the VLLM commandline arg file, create the default.
# Don't touch the file beforehand or this check would never fire.
if [ ! -f /storage/vllm/cmd.txt ]; then
    echo "--max-num-batched-tokens 4098 --quantization awq --model /storage/vllm/models/model-path" >/storage/vllm/cmd.txt
fi
# Copy the idle notebook into /notebooks (the Jupyter home). This writes a fresh copy every time the container is started.
cp /local-llm-server/other/vllm/Docker/idle.ipynb /notebooks/idle.ipynb

View File

@@ -1,13 +1,4 @@
#!/bin/bash
mkdir -p /storage/vllm/
chown -R apiserver:apiserver /storage/vllm
touch /storage/vllm/cmd.txt
touch /storage/vllm/rathole-client.toml
if [ -f /storage/vllm/ssh ]; then
cp -r /storage/vllm/ssh /root/.ssh
echo "Copied ssh from /storage"
fi
/usr/bin/supervisord
# Start the services and launch the container.
/usr/bin/supervisord -c /etc/supervisor/supervisord.conf

View File

@@ -6,9 +6,4 @@ for pid in $vllm_pid; do
kill -9 $pid
done
cd /local-llm-server
git fetch
git reset --hard origin/master
/venv/bin/pip install -r requirements.txt
/venv/bin/python /local-llm-server/other/vllm/vllm_api_server.py --host 0.0.0.0 --port 7000 --max-log-len 100 $(cat /storage/vllm/cmd.txt)

View File

@@ -1,5 +1,25 @@
[supervisord]
nodaemon = true
user=root
pidfile = /var/run/supervisord.pid
logfile = /var/log/app/supervisord.log
directory = /tmp
[unix_http_server]
file=/var/run/supervisor.sock
chmod=0770
[rpcinterface:supervisor]
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
[supervisorctl]
serverurl=unix:///var/run/supervisor.sock
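; One-shot init: run /app/init.sh once at startup and don't restart it when it exits.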
[program:startup]
command=/app/init.sh
autostart=true
autorestart=false
startsecs=0
[program:vllm]
command=/bin/bash -c 'bash /app/start-vllm.sh 2>&1 | tee -a /var/log/app/vllm.log'
@@ -24,9 +44,20 @@ user=apiserver
environment=HOME="/home/apiserver",USER="apiserver"
[program:jupyter]
command=/jupyterlab/bin/jupyter lab --allow-root --ip=0.0.0.0 --no-browser --ServerApp.trust_xheaders=True --ServerApp.disable_check_xsrf=False --ServerApp.allow_remote_access=True --ServerApp.allow_origin='*' --ServerApp.allow_credentials=True
command=/jupyterlab/bin/jupyter lab --allow-root --ip=0.0.0.0 --no-browser --ServerApp.trust_xheaders=True --ServerApp.disable_check_xsrf=False --ServerApp.allow_remote_access=True --ServerApp.allow_origin='*' --ServerApp.allow_credentials=True --notebook-dir /notebooks
environment=SHELL="/bin/bash"
; JUPYTER_CONFIG_DIR="/app/jupyter"
autostart=true
autorestart=true
stdout_logfile=/dev/fd/1
stdout_logfile_maxbytes=0
stderr_logfile=/dev/fd/2
stderr_logfile_maxbytes=0
[program:ssh]
command=/usr/sbin/sshd -D
autostart=true
autorestart=true
stdout_logfile=/dev/fd/1
stdout_logfile_maxbytes=0
stderr_logfile=/dev/fd/2
stderr_logfile_maxbytes=0

View File

@@ -0,0 +1,11 @@
#!/bin/bash
# Run this script to update the container. It will restart VLLM as well.
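# Usage (assuming the script ships at the same repo path as the other
# Docker helpers, which is an assumption):
#   bash /local-llm-server/other/vllm/Docker/update-container.sh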
cd /local-llm-server || exit
git fetch
git reset --hard origin/master
supervisorctl restart vllm