Merge cluster to master #3
|
@ -1,15 +0,0 @@
|
|||
**A Docker container for running VLLM on Paperspace Gradient notebooks.**
|
||||
|
||||
1. Run `jupyter server --generate-config` and `jupyter server password` on your local machine, then copy Jupyter's config directory to `./jupyter`
|
||||
2. Place your Rathole client config at `./rathole-client.toml`
|
||||
3. `docker build . -t "paperspace-vllm"`
|
||||
|
||||
To test on your local machine, run this command:
|
||||
|
||||
```bash
|
||||
docker run --shm-size 14g --gpus all \
|
||||
-v /storage/models/awq/MythoMax-L2-13B-AWQ:/models/MythoMax-L2-13B-AWQ \
|
||||
-p 7000:7000 -p 8888:8888 \
|
||||
-e API_SERVER_ARGS="--model /models/MythoMax-L2-13B-AWQ --quantization awq --max-num-batched-tokens 99999 --gpu-memory-utilization 1" \
|
||||
paperspace-vllm
|
||||
```
|
|
@ -1,87 +1,50 @@
|
|||
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git python3-pip python3-venv wget unzip && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
RUN pip3 install --upgrade pip setuptools wheel
|
||||
|
||||
RUN git clone https://git.evulid.cc/cyberes/local-llm-server.git /local-llm-server
|
||||
|
||||
WORKDIR /local-llm-server
|
||||
|
||||
RUN python3 -m venv /venv
|
||||
RUN /venv/bin/pip install git+https://github.com/vllm-project/vllm
|
||||
|
||||
RUN python3 -m venv /jupyterlab
|
||||
RUN /jupyterlab/bin/pip install jupyterlab
|
||||
RUN /jupyterlab/bin/jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
|
||||
|
||||
RUN mkdir -p /app
|
||||
RUN wget https://github.com/rapiz1/rathole/releases/download/v0.4.8/rathole-x86_64-unknown-linux-gnu.zip -O /tmp/rathole.zip
|
||||
RUN unzip -j /tmp/rathole.zip -d /tmp
|
||||
RUN rm /tmp/rathole.zip
|
||||
RUN cp /tmp/rathole /app
|
||||
|
||||
# The local local-llm-server repo may be cached, so we will fetch and reset to the remote every time.
|
||||
# Also, make sure there weren't any pip deps added.
|
||||
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
|
||||
RUN git fetch; git reset --hard origin/master
|
||||
RUN /venv/bin/pip install -r requirements.txt
|
||||
|
||||
FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as runtime
|
||||
|
||||
RUN apt-get update && apt-get install -y supervisor && rm -rf /var/lib/apt/lists/*
|
||||
FROM cyberes/vllm-paperspace-base as runtime
|
||||
|
||||
RUN useradd -ms /bin/bash apiserver
|
||||
RUN usermod -s /bin/bash root
|
||||
|
||||
# Required packages
|
||||
RUN apt-get update && \
|
||||
apt-get install -y python3 python3-pip wget aria2 git-lfs git openssh-server openssh-client nano tmux file && \
|
||||
apt-get install -y python3 python3-pip supervisor && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
RUN pip3 install --upgrade pip setuptools wheel
|
||||
|
||||
# Useful Python packages
|
||||
RUN pip3 install glances
|
||||
|
||||
# Useful tools
|
||||
RUN apt-get update && \
|
||||
apt-get install -y wget aria2 git-lfs git openssh-server openssh-client nano tmux file && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN pip3 install --upgrade pip setuptools wheel
|
||||
RUN pip3 install glances
|
||||
# Update the git repo
|
||||
# Discard any local changes in the cloned repository, then pull the latest commits.
RUN git -C /local-llm-server reset --hard && git -C /local-llm-server pull
|
||||
|
||||
# Enable root SSH login
|
||||
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
|
||||
|
||||
# Disable password SSH login
|
||||
RUN sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
|
||||
|
||||
# Create the necessary directory for SSH
|
||||
# Create the necessary directory for sshd
|
||||
RUN mkdir /var/run/sshd
|
||||
|
||||
ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
|
||||
|
||||
COPY --from=build /local-llm-server /local-llm-server
|
||||
COPY --from=build /venv /venv
|
||||
COPY --from=build /app /app
|
||||
COPY --from=build /jupyterlab /jupyterlab
|
||||
|
||||
RUN cp /local-llm-server/other/vllm/Docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
RUN cp /local-llm-server/other/vllm/Docker/start-vllm.sh /app/start-vllm.sh
|
||||
RUN cp /local-llm-server/other/vllm/Docker/start-container.sh /app/start.sh
|
||||
|
||||
# Copy your secrets in
|
||||
# COPY ./jupyter /app/jupyter
|
||||
COPY supervisord.conf /etc/supervisor/supervisord.conf
|
||||
COPY start-vllm.sh /app/start-vllm.sh
|
||||
COPY init-container.sh /app/init.sh
|
||||
COPY start-container.sh /app/start.sh
|
||||
|
||||
RUN mkdir -p /var/log/app/
|
||||
|
||||
RUN chown -R apiserver:apiserver /local-llm-server && \
|
||||
chown -R apiserver:apiserver /app && \
|
||||
chown -R apiserver:apiserver /var/log/app/
|
||||
RUN git config --global --add safe.directory /local-llm-server
|
||||
|
||||
RUN chmod +x /app/init.sh
|
||||
RUN chmod +x /app/start.sh
|
||||
|
||||
ENV SHELL="/bin/bash"
|
||||
|
||||
# SSH
|
||||
EXPOSE 22
|
||||
|
||||
# VLLM
|
||||
EXPOSE 7000
|
||||
|
||||
# Jupyter
|
||||
# Expose Jupyter. We don't need to expose VLLM or SSH since rathole will tunnel those.
|
||||
EXPOSE 8888
|
||||
|
||||
# Exec form: run the start script directly instead of wrapping it in `/bin/sh -c`,
# so the script (not an intermediate shell) is the container's main process.
CMD ["/app/start.sh"]
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
# This container builds and assembles the Python parts of the Docker container.
|
||||
# It is used as the base for the resulting container, which avoids having to re-push
|
||||
# the large PyTorch parts every time the application is rebuilt.
|
||||
|
||||
FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git python3-pip python3-venv wget unzip && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
RUN pip install --upgrade pip setuptools wheel
|
||||
|
||||
RUN git clone https://git.evulid.cc/cyberes/local-llm-server.git /local-llm-server
|
||||
|
||||
RUN python3 -m venv /jupyterlab
|
||||
RUN /jupyterlab/bin/pip install jupyterlab
|
||||
RUN /jupyterlab/bin/jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
|
||||
|
||||
RUN mkdir -p /app
|
||||
RUN wget https://github.com/rapiz1/rathole/releases/download/v0.4.8/rathole-x86_64-unknown-linux-gnu.zip -O /tmp/rathole.zip
|
||||
RUN unzip -j /tmp/rathole.zip -d /tmp
|
||||
RUN rm /tmp/rathole.zip
|
||||
RUN cp /tmp/rathole /app
|
||||
|
||||
RUN python3 -m venv /venv
|
||||
RUN /venv/bin/pip3 install --upgrade pip setuptools wheel
|
||||
|
||||
# Install PyTorch before installing VLLM to ensure we use the right version for our CUDA install.
|
||||
# Extract only the `torch` requirement line from VLLM's requirements.txt.
# The pattern is anchored to the start of the line and requires a non-name character
# (or end of line) after "torch", so other packages whose names merely contain or
# begin with "torch" are not written to /tmp/torch_version.
RUN wget -q -O - https://raw.githubusercontent.com/vllm-project/vllm/main/requirements.txt | grep -E '^torch([^A-Za-z0-9_-]|$)' > /tmp/torch_version
|
||||
RUN /venv/bin/pip3 install "$(cat /tmp/torch_version)" --index-url https://download.pytorch.org/whl/cu118
|
||||
|
||||
# WORKDIR /local-llm-server
|
||||
|
||||
# Don't build VLLM because we don't do that on the inference server. Just install from pip.
|
||||
# RUN /venv/bin/pip install git+https://github.com/vllm-project/vllm
|
||||
|
||||
RUN /venv/bin/pip install vllm
|
||||
|
||||
FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as base
|
||||
|
||||
COPY --from=build /local-llm-server /local-llm-server
|
||||
COPY --from=build /venv /venv
|
||||
COPY --from=build /app /app
|
||||
COPY --from=build /jupyterlab /jupyterlab
|
|
@ -0,0 +1,47 @@
|
|||
**A Docker container for running VLLM on Paperspace Gradient notebooks.**
|
||||
|
||||
### Running
|
||||
|
||||
1. In Paperspace, create a new notebook.
|
||||
2. Click `Start from Scratch`.
|
||||
3. Select your GPU and set the auto-shutdown timeout to 6 hours.
|
||||
4. Click the `View Advanced Options` button at the bottom of the page. Enter these details in the form that appears:
|
||||
- Container Name: `cyberes/vllm-paperspace:latest`
|
||||
- Container Command: `/app/start.sh`
|
||||
5. Start the notebook. It may take up to five minutes for them to pull and start the custom image.
|
||||
6. Once the container is started, open the log viewer by clicking the icon in the bottom left of the screen. You should see errors from rathole and VLLM as a result of the blank config files. The container will create a new directory in your mounted
|
||||
storage: `/storage/vllm/`.
|
||||
7. Enter your rathole client config in `/storage/vllm/rathole-client.toml`. If you need a visual text editor, first link the directory back to the Jupyter home: `ln -s /storage/vllm /notebooks`
|
||||
8. Restart rathole with `supervisorctl restart rathole` and then view the log: `tail -f /var/log/app/rathole.log`. If you see lines that start with `INFO` and end with `Control channel established`, rathole has connected and is working. Error messages will begin
|
||||
with `ERROR`.
|
||||
9. Download an AWQ quantization from [TheBloke](https://huggingface.co/TheBloke) to `/storage/vllm/models/`.
|
||||
10. Enter your VLLM commandline args in `/storage/vllm/cmd.txt`. You need to set `--model` to the path of the model you want to load.
|
||||
11. Restart VLLM with `supervisorctl restart vllm` and then view the log: `tail -f /var/log/app/vllm.log`. It may take up to three minutes to load. When you see the line:
|
||||
```
|
||||
INFO: Uvicorn running on http://0.0.0.0:7000 (Press CTRL+C to quit)
|
||||
```
|
||||
VLLM is running and ready for queries.
|
||||
|
||||
12. In `/notebooks` (the home directory of Jupyter), the notebook `idle.ipynb` will automatically be created. Run this notebook so Paperspace does not shut down your machine due to "inactivity". You **must** keep the running notebook open in a
|
||||
browser tab.
|
||||
|
||||
### Building
|
||||
|
||||
You **must** have a GPU attached to your system when building the container (required for building VLLM).
|
||||
|
||||
1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) and CUDA 11.8.
|
||||
2. `bash build-docker.sh`
|
||||
|
||||
To run the container on your local machine:
|
||||
|
||||
```bash
|
||||
sudo docker run -it --shm-size 14g --gpus all -v /home/user/testing123/notebooks:/notebooks -v /home/user/testing123/storage:/storage -p 8888:8888 cyberes/vllm-paperspace:latest
|
||||
```
|
||||
|
||||
You will need to create a directory to mount inside the container (for example: `/home/user/testing123/`). Within this should be the folder `models` that holds the model to load, `rathole-client.toml`, and `cmd.txt`.
|
||||
|
||||
If you need to debug something, you can start a shell inside the container:
|
||||
|
||||
```bash
|
||||
sudo docker run -it --shm-size 14g --gpus all -v /home/user/testing123/notebooks:/notebooks -v /home/user/testing123/storage:/storage -p 8888:8888 --entrypoint bash cyberes/vllm-paperspace:latest
|
||||
```
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Build and push the container.
|
||||
|
||||
git pull || exit
|
||||
sudo docker build . -f Dockerfile.base -t cyberes/vllm-paperspace-base --no-cache && sudo docker push cyberes/vllm-paperspace-base:latest || exit
|
||||
sudo docker build . -t cyberes/vllm-paperspace && sudo docker push cyberes/vllm-paperspace:latest
|
|
@ -0,0 +1,40 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "49ae6555-572b-4463-ba01-cc4331932a6c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"i = 0\n",
|
||||
"while True:\n",
|
||||
" print(i)\n",
|
||||
" i += 1\n",
|
||||
" time.sleep(1)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Create the required directories and files.
|
||||
echo "SETTING UP FILE SYSTEM..."
|
||||
mkdir -p /storage/vllm/
|
||||
chown -R apiserver:apiserver /storage/vllm
|
||||
touch /storage/vllm/cmd.txt
|
||||
touch /storage/vllm/rathole-client.toml
|
||||
|
||||
# The user can store SSH auth and authorized_keys to streamline SSH login.
|
||||
# /storage/vllm/ssh is copied recursively below, i.e. it is treated as a directory.
# Test with -d: -f only succeeds for regular files, so a directory would never be copied.
if [ -d /storage/vllm/ssh ]; then
|
||||
cp -r /storage/vllm/ssh /root/.ssh
|
||||
echo "Copied ssh from /storage"
|
||||
fi
|
||||
|
||||
# If the user has not created the VLLM commandline arg file, create the default.
|
||||
# cmd.txt is always created by the `touch` earlier in this script, so a "file is missing"
# test can never be true. Check for an empty file (-s is true only for size > 0) instead,
# so the default arguments are written when the user has not supplied any.
if [ ! -s /storage/vllm/cmd.txt ]; then
|
||||
echo "--max-num-batched-tokens 4098 --quantization awq --model /storage/vllm/models/model-path" >/storage/vllm/cmd.txt
|
||||
fi
|
||||
|
||||
# Copy the idling notebook to storage. This will create a blank notebook every time the container is started.
|
||||
cp /local-llm-server/other/vllm/Docker/idle.ipynb /notebooks/idle.ipynb
|
|
@ -1,13 +1,4 @@
|
|||
#!/bin/bash
|
||||
|
||||
mkdir -p /storage/vllm/
|
||||
chown -R apiserver:apiserver /storage/vllm
|
||||
touch /storage/vllm/cmd.txt
|
||||
touch /storage/vllm/rathole-client.toml
|
||||
|
||||
if [ -f /storage/vllm/ssh ]; then
|
||||
cp -r /storage/vllm/ssh /root/.ssh
|
||||
echo "Copied ssh from /storage"
|
||||
fi
|
||||
|
||||
/usr/bin/supervisord
|
||||
# Start the services and launch the container.
|
||||
# `exec` replaces this shell with supervisord, so supervisord receives the container's
# stop signals directly instead of running as a child of the wrapper script.
exec /usr/bin/supervisord -c /etc/supervisor/supervisord.conf
|
||||
|
|
|
@ -6,9 +6,4 @@ for pid in $vllm_pid; do
|
|||
kill -9 $pid
|
||||
done
|
||||
|
||||
cd /local-llm-server
|
||||
git fetch
|
||||
git reset --hard origin/master
|
||||
/venv/bin/pip install -r requirements.txt
|
||||
|
||||
/venv/bin/python /local-llm-server/other/vllm/vllm_api_server.py --host 0.0.0.0 --port 7000 --max-log-len 100 $(cat /storage/vllm/cmd.txt)
|
||||
|
|
|
@ -1,5 +1,25 @@
|
|||
[supervisord]
|
||||
nodaemon = true
|
||||
user=root
|
||||
pidfile = /var/run/supervisord.pid
|
||||
logfile = /var/log/app/supervisord.log
|
||||
directory = /tmp
|
||||
|
||||
[unix_http_server]
|
||||
file=/var/run/supervisor.sock
|
||||
chmod=0770
|
||||
|
||||
[rpcinterface:supervisor]
|
||||
supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
|
||||
|
||||
[supervisorctl]
|
||||
serverurl=unix:///var/run/supervisor.sock
|
||||
|
||||
[program:startup]
|
||||
command=/app/init.sh
|
||||
autostart=true
|
||||
autorestart=false
|
||||
startsecs=0
|
||||
|
||||
[program:vllm]
|
||||
command=/bin/bash -c 'bash /app/start-vllm.sh 2>&1 | tee -a /var/log/app/vllm.log'
|
||||
|
@ -24,9 +44,20 @@ user=apiserver
|
|||
environment=HOME="/home/apiserver",USER="apiserver"
|
||||
|
||||
[program:jupyter]
|
||||
command=/jupyterlab/bin/jupyter lab --allow-root --ip=0.0.0.0 --no-browser --ServerApp.trust_xheaders=True --ServerApp.disable_check_xsrf=False --ServerApp.allow_remote_access=True --ServerApp.allow_origin='*' --ServerApp.allow_credentials=True
|
||||
command=/jupyterlab/bin/jupyter lab --allow-root --ip=0.0.0.0 --no-browser --ServerApp.trust_xheaders=True --ServerApp.disable_check_xsrf=False --ServerApp.allow_remote_access=True --ServerApp.allow_origin='*' --ServerApp.allow_credentials=True --notebook-dir /notebooks
|
||||
environment=SHELL="/bin/bash"
|
||||
; JUPYTER_CONFIG_DIR="/app/jupyter"
|
||||
autostart=true
|
||||
autorestart=true
|
||||
stdout_logfile=/dev/fd/1
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/fd/2
|
||||
stderr_logfile_maxbytes=0
|
||||
|
||||
[program:ssh]
|
||||
command=/usr/sbin/sshd -D
|
||||
autostart=true
|
||||
autorestart=true
|
||||
stdout_logfile=/dev/fd/1
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/fd/2
|
||||
stderr_logfile_maxbytes=0
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Run this script to update the container.
|
||||
# Will restart VLLM as well.
|
||||
|
||||
cd /local-llm-server || exit
|
||||
|
||||
git fetch
|
||||
git reset --hard origin/master
|
||||
|
||||
supervisorctl restart vllm
|
Reference in New Issue