update dockerfile
parent eade509947
commit 74f16afa67
Dockerfile
@@ -72,6 +72,8 @@ RUN chown -R apiserver:apiserver /local-llm-server && \
     chown -R apiserver:apiserver /app && \
     chown -R apiserver:apiserver /var/log/app/
 
+RUN chmod +x /app/start.sh
+
 ENV SHELL="/bin/bash"
 
 # SSH
@@ -83,4 +85,4 @@ EXPOSE 7000
 # Jupyter
 EXPOSE 8888
 
 CMD /app/start.sh
(supervisord configuration; filename not shown in this excerpt)
@@ -1,7 +1,7 @@
 [supervisord]
 nodaemon=true
 
-[program:api_server]
+[program:vllm_server]
 command=bash /app/start-vllm.sh 2>&1 | tee /var/log/app/vllm.log
 autostart=true
 autorestart=true
@@ -12,7 +12,7 @@ stderr_logfile_maxbytes=0
 user=apiserver
 environment=HOME="/home/apiserver",USER="apiserver"
 
-[program:proxy]
+[program:rathole]
 command=/app/rathole -c /app/client.toml 2>&1 | tee /var/log/app/rathole.log
 autostart=true
 autorestart=true
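Pieced together from the context lines visible in these two hunks, the renamed vLLM program section would read roughly as follows after this commit. The lines between the hunks (file lines 8-10) are not shown in the diff and are left out here, and stderr_logfile_maxbytes=0 comes only from the second hunk's header, so treat this as a sketch rather than the exact file contents:

    [program:vllm_server]
    command=bash /app/start-vllm.sh 2>&1 | tee /var/log/app/vllm.log
    autostart=true
    autorestart=true
    ; ... file lines 8-10 are not visible in this diff ...
    stderr_logfile_maxbytes=0
    user=apiserver
    environment=HOME="/home/apiserver",USER="apiserver"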
server.py (15 lines changed)
@@ -26,18 +26,21 @@ from llm_server.routes.v1 import bp
 from llm_server.stream import init_socketio
 
 # TODO: have the workers handle streaming too
-# TODO: send extra headers when ratelimited?
-# TODO: return 200 when returning formatted sillytavern error
-# TODO: add some sort of loadbalancer to send requests to a group of backends
+# TODO: add backend fallbacks. Backends at the bottom of the list are higher priority and are fallbacks if the upper ones fail
+# TODO: implement background thread to test backends via sending test prompts
+# TODO: if backend fails request, mark it as down
 # TODO: allow setting concurrent gens per-backend
-# TODO: use first backend as default backend
+# TODO: set the max tokens to that of the lowest backend
+# TODO: implement RRD backend loadbalancer option
 
 # TODO: simulate OpenAI error messages regardless of endpoint
-# TODO: allow setting specific simoltaneous IPs allowed per token
+# TODO: send extra headers when ratelimited?
 # TODO: make sure log_prompt() is used everywhere, including errors and invalid requests
 # TODO: unify logging thread in a function and use async/await instead
-# TODO: add more excluding to SYSTEM__ tokens
 
+# Done, but need to verify
+# TODO: add more excluding to SYSTEM__ tokens
+# TODO: return 200 when returning formatted sillytavern error
 
 try:
     import vllm
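Two of the new TODOs (an "RRD backend loadbalancer option" and "if backend fails request, mark it as down") describe a load-balancing layer that does not exist in this commit. The sketch below is purely illustrative: every name in it (BackendPool, pick, mark_down) is hypothetical, and it assumes "RRD" means round-robin distribution of requests across backends.

# Purely hypothetical sketch; none of these names exist in the repository.
# It illustrates one way the new TODOs could be approached, assuming "RRD"
# in the TODO list means round-robin distribution across backends.
import itertools
import threading
import time


class BackendPool:
    """Round-robin selection over backend URLs, skipping backends marked down."""

    def __init__(self, backends, down_cooldown=60.0):
        self._backends = list(backends)
        self._cycle = itertools.cycle(self._backends)
        self._down_until = {}  # backend URL -> time after which it may be retried
        self._down_cooldown = down_cooldown
        self._lock = threading.Lock()

    def pick(self):
        """Return the next backend that is not currently marked down."""
        with self._lock:
            for _ in range(len(self._backends)):
                backend = next(self._cycle)
                if time.time() >= self._down_until.get(backend, 0.0):
                    return backend
            raise RuntimeError("no backends are currently up")

    def mark_down(self, backend):
        """Mark a backend as down for the cooldown period after a failed request."""
        with self._lock:
            self._down_until[backend] = time.time() + self._down_cooldown


if __name__ == "__main__":
    pool = BackendPool(["http://10.0.0.1:7000", "http://10.0.0.2:7000"])
    first = pool.pick()
    pool.mark_down(first)  # pretend the request to this backend failed
    print("next request goes to:", pool.pick())

A real implementation would also need the background health-check thread mentioned in the TODOs so that down markers are cleared proactively instead of only expiring after the cooldown.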