update dockerfile

Cyberes 2023-09-27 16:12:36 -06:00
parent eade509947
commit 74f16afa67
3 changed files with 14 additions and 9 deletions


@@ -72,6 +72,8 @@ RUN chown -R apiserver:apiserver /local-llm-server && \
 chown -R apiserver:apiserver /app && \
 chown -R apiserver:apiserver /var/log/app/
+RUN chmod +x /app/start.sh
 ENV SHELL="/bin/bash"
 # SSH
@@ -83,4 +85,4 @@ EXPOSE 7000
 # Jupyter
 EXPOSE 8888
 CMD /app/start.sh


@@ -1,7 +1,7 @@
 [supervisord]
 nodaemon=true
-[program:api_server]
+[program:vllm_server]
 command=bash /app/start-vllm.sh 2>&1 | tee /var/log/app/vllm.log
 autostart=true
 autorestart=true
@@ -12,7 +12,7 @@ stderr_logfile_maxbytes=0
 user=apiserver
 environment=HOME="/home/apiserver",USER="apiserver"
-[program:proxy]
+[program:rathole]
 command=/app/rathole -c /app/client.toml 2>&1 | tee /var/log/app/rathole.log
 autostart=true
 autorestart=true


@@ -26,18 +26,21 @@ from llm_server.routes.v1 import bp
 from llm_server.stream import init_socketio
 # TODO: have the workers handle streaming too
-# TODO: send extra headers when ratelimited?
-# TODO: return 200 when returning formatted sillytavern error
-# TODO: add some sort of loadbalancer to send requests to a group of backends
+# TODO: add backend fallbacks. Backends at the bottom of the list are higher priority and are fallbacks if the upper ones fail
+# TODO: implement background thread to test backends via sending test prompts
+# TODO: if backend fails request, mark it as down
 # TODO: allow setting concurrent gens per-backend
-# TODO: use first backend as default backend
+# TODO: set the max tokens to that of the lowest backend
+# TODO: implement RRD backend loadbalancer option
 # TODO: simulate OpenAI error messages regardless of endpoint
-# TODO: allow setting specific simoltaneous IPs allowed per token
+# TODO: send extra headers when ratelimited?
 # TODO: make sure log_prompt() is used everywhere, including errors and invalid requests
 # TODO: unify logging thread in a function and use async/await instead
-# TODO: add more excluding to SYSTEM__ tokens
+# Done, but need to verify
+# TODO: add more excluding to SYSTEM__ tokens
+# TODO: return 200 when returning formatted sillytavern error
 try:
     import vllm
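
The new TODO block outlines a backend fallback / round-robin ("RRD") loadbalancer that marks a backend as down when a request fails. As a rough illustration only, a minimal Python sketch of that selection logic; the Backend and BackendPool names and the /generate endpoint are hypothetical and not taken from this repository:

import requests

class Backend:
    def __init__(self, url):
        self.url = url
        self.online = True

class BackendPool:
    """Round-robin over the configured backends; later entries act as fallbacks
    when an earlier backend fails, and failing backends are marked as down."""
    def __init__(self, backends):
        self.backends = backends
        self._next = 0

    def send(self, payload, timeout=30):
        n = len(self.backends)
        for i in range(n):
            backend = self.backends[(self._next + i) % n]
            if not backend.online:
                continue  # skip backends already marked as down
            try:
                r = requests.post(f'{backend.url}/generate', json=payload, timeout=timeout)
                r.raise_for_status()
                self._next = (self._next + i + 1) % n  # advance the round-robin pointer
                return r.json()
            except requests.RequestException:
                backend.online = False  # TODO item: "if backend fails request, mark it as down"
        raise RuntimeError('no backends are online')

A request handler would then only need a shared pool, e.g. pool = BackendPool([Backend('http://127.0.0.1:7000')]), and the background health-check thread mentioned in the TODOs could flip Backend.online back to True once a test prompt succeeds.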