diff --git a/other/vllm/Docker/Dockerfile b/other/vllm/Docker/Dockerfile
index 55111a7..da878ce 100644
--- a/other/vllm/Docker/Dockerfile
+++ b/other/vllm/Docker/Dockerfile
@@ -72,6 +72,8 @@ RUN chown -R apiserver:apiserver /local-llm-server && \
     chown -R apiserver:apiserver /app && \
     chown -R apiserver:apiserver /var/log/app/
 
+RUN chmod +x /app/start.sh
+
 ENV SHELL="/bin/bash"
 
 # SSH
@@ -83,4 +85,4 @@ EXPOSE 7000
 
 # Jupyter
 EXPOSE 8888
-CMD /app/start.sh
\ No newline at end of file
+CMD /app/start.sh
diff --git a/other/vllm/Docker/supervisord.conf b/other/vllm/Docker/supervisord.conf
index bf3c093..c0fc170 100644
--- a/other/vllm/Docker/supervisord.conf
+++ b/other/vllm/Docker/supervisord.conf
@@ -1,7 +1,7 @@
 [supervisord]
 nodaemon=true
 
-[program:api_server]
+[program:vllm_server]
 command=bash /app/start-vllm.sh 2>&1 | tee /var/log/app/vllm.log
 autostart=true
 autorestart=true
@@ -12,7 +12,7 @@ stderr_logfile_maxbytes=0
 user=apiserver
 environment=HOME="/home/apiserver",USER="apiserver"
 
-[program:proxy]
+[program:rathole]
 command=/app/rathole -c /app/client.toml 2>&1 | tee /var/log/app/rathole.log
 autostart=true
 autorestart=true
diff --git a/server.py b/server.py
index 6a9949f..98aaffa 100644
--- a/server.py
+++ b/server.py
@@ -26,18 +26,21 @@ from llm_server.routes.v1 import bp
 from llm_server.stream import init_socketio
 
 # TODO: have the workers handle streaming too
-# TODO: send extra headers when ratelimited?
-# TODO: return 200 when returning formatted sillytavern error
-# TODO: add some sort of loadbalancer to send requests to a group of backends
+# TODO: add backend fallbacks. Backends at the bottom of the list are higher priority and are fallbacks if the upper ones fail
+# TODO: implement background thread to test backends via sending test prompts
+# TODO: if backend fails request, mark it as down
 # TODO: allow setting concurrent gens per-backend
-# TODO: use first backend as default backend
+# TODO: set the max tokens to that of the lowest backend
+# TODO: implement RRD backend loadbalancer option
 # TODO: simulate OpenAI error messages regardless of endpoint
-# TODO: allow setting specific simoltaneous IPs allowed per token
+# TODO: send extra headers when ratelimited?
 # TODO: make sure log_prompt() is used everywhere, including errors and invalid requests
 # TODO: unify logging thread in a function and use async/await instead
 
-# TODO: add more excluding to SYSTEM__ tokens
+# Done, but need to verify
+# TODO: add more excluding to SYSTEM__ tokens
+# TODO: return 200 when returning formatted sillytavern error
 
 try:
     import vllm
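
A minimal sketch of the backend health-check and failover idea described by the new TODOs in server.py ("implement background thread to test backends via sending test prompts", "if backend fails request, mark it as down", backend fallbacks). Everything below is hypothetical and not part of this repository: the names BACKENDS, TEST_PROMPT, check_backends and pick_backend, the endpoint path and the payload fields are assumptions used only to illustrate one possible shape of the feature.

import threading
import time
from typing import Optional

import requests  # assumed to be available; not confirmed by the diff

# Hypothetical backend list; entries further down the list act as
# fallbacks when the ones above them fail (per the new TODO).
BACKENDS = [
    {"url": "http://backend-a:7000/api/v1/generate", "online": True},
    {"url": "http://backend-b:7000/api/v1/generate", "online": True},
]

# Tiny test prompt; the payload shape is an assumption, not the project's real API.
TEST_PROMPT = {"prompt": "ping", "max_new_tokens": 1}


def check_backends(interval: int = 60) -> None:
    """Background loop: send a test prompt to every backend and mark it
    as down when the request fails."""
    while True:
        for backend in BACKENDS:
            try:
                r = requests.post(backend["url"], json=TEST_PROMPT, timeout=10)
                backend["online"] = r.ok
            except requests.RequestException:
                backend["online"] = False
        time.sleep(interval)


def pick_backend() -> Optional[dict]:
    """Return the first online backend, or None if every backend is down."""
    return next((b for b in BACKENDS if b["online"]), None)


# Started once at server startup (e.g. from server.py) as a daemon thread.
threading.Thread(target=check_backends, daemon=True).start()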