clean some stuff up, bump VLLM version
This commit is contained in:
parent 0e7f04ab2d
commit 4b3e0671c6
@@ -7,6 +7,7 @@ The purpose of this server is to abstract your LLM backend from your frontend API
 ### Install
 
 also need to create /var/log/localllm
+chown -R server:adm /var/log/localllm/
 
 1. `sudo apt install redis`
 2. `python3 -m venv venv`
@@ -1,7 +1,7 @@
 import numpy as np
 
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
+from llm_server.cluster.cluster_config import get_a_cluster_backend, cluster_config
 from llm_server.cluster.stores import redis_running_models
 from llm_server.custom_redis import redis
 from llm_server.llm.generator import generator
@@ -12,26 +12,38 @@ from llm_server.routes.stats import calculate_wait_time, get_active_gen_workers_
 
 
 def get_backends_from_model(model_name: str):
+    """
+    Get the backends that are running a specific model. This is the inverse of `get_model_choices()`.
+    :param model_name:
+    :return:
+    """
     return [x.decode('utf-8') for x in redis_running_models.smembers(model_name)]
 
 
 def get_running_models():
-    return redis_running_models.keys()
+    """
+    Get all the models that are in the cluster.
+    :return:
+    """
+    return list(redis_running_models.keys())
 
 
-def purge_backend_from_running_models(backend_url: str):
-    keys = redis_running_models.keys()
-    pipeline = redis_running_models.pipeline()
-    for model in keys:
-        pipeline.srem(model, backend_url)
-    pipeline.execute()
-
-
-def is_valid_model(model_name: str):
+def is_valid_model(model_name: str) -> bool:
+    """
+    Is this a model that is being hosted in the cluster?
+    :param model_name:
+    :return:
+    """
     return redis_running_models.exists(model_name)
 
 
 def test_backend(backend_url: str, test_prompt: bool = False):
+    """
+    Test (using a test prompt) a backend to check if it is online.
+    :param backend_url:
+    :param test_prompt:
+    :return:
+    """
     backend_info = cluster_config.get_backend(backend_url)
     if test_prompt:
         handler = VLLMBackend(backend_url)
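For context on the data structure these helpers share: each model name is a Redis set whose members are the URLs of the backends serving it (the `running_models` namespace comes from `redis_running_models = RedisCustom('running_models')` later in this diff). A minimal sketch of that inverse index, assuming a local Redis; the backend URLs and model name are invented:

```python
# Sketch only: illustrates the model -> backends inverse index, not the
# project's exact code. URLs and the model name are made up.
from redis import Redis

r = Redis()

# A backend that reports model 'llama-13b' gets added to that model's set.
r.sadd('running_models:llama-13b', 'http://10.0.0.1:7000', 'http://10.0.0.2:7000')

# get_backends_from_model() is then SMEMBERS plus a UTF-8 decode:
backends = [x.decode('utf-8') for x in r.smembers('running_models:llama-13b')]

# ...and is_valid_model() reduces to an EXISTS check on the same key:
print(bool(r.exists('running_models:llama-13b')))  # True while any backend hosts it
```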
@@ -56,7 +68,12 @@ def test_backend(backend_url: str, test_prompt: bool = False):
     return True, i
 
 
-def get_model_choices(regen: bool = False):
+def get_model_choices(regen: bool = False) -> tuple[dict, dict]:
+    """
+    Get the info and stats of the models hosted in the cluster.
+    :param regen:
+    :return:
+    """
     if not regen:
         c = redis.getp('model_choices')
         if c:
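The body continues with a cache-unless-regen pattern. A sketch of how the pieces likely fit together, assuming `getp()`/`setp()` are the pickle-aware get/set helpers of this codebase's Redis wrapper (only `getp` appears in this hunk; `setp` and `_build_model_choices` are hypothetical names):

```python
# Sketch under stated assumptions; `setp` and `_build_model_choices` are
# illustrative, not confirmed by this diff.
def get_model_choices(regen: bool = False) -> tuple[dict, dict]:
    if not regen:
        c = redis.getp('model_choices')      # shown in the hunk above
        if c:
            return c                         # serve the cached value
    result = _build_model_choices()          # hypothetical: recompute from cluster state
    redis.setp('model_choices', result)      # hypothetical pickle-aware setter
    return result
```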
@@ -9,7 +9,13 @@ from llm_server.custom_redis import RedisCustom
 from llm_server.routes.helpers.model import estimate_model_size
 
 
+# Don't try to reorganize this file or else you'll run into circular imports.
+
 class RedisClusterStore:
+    """
+    A class used to store the cluster state in Redis.
+    """
+
     def __init__(self, name: str, **kwargs):
         self.name = name
         self.config_redis = RedisCustom(name, **kwargs)
@@ -52,7 +58,8 @@ class RedisClusterStore:
 
     def validate_backend(self, backend_url: str):
         """
-        Returns the backend URL that was given, or a new one if that was offline.
+        Returns the backend URL that was given.
+        If that backend is offline, it will select a new one. This fallback behavior does NOT take the selected model into account.
         :param backend_url:
         :return:
         """
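A sketch of the fallback the amended docstring describes, assuming the stored backend info dict carries an `online` flag (not shown in this diff) and reusing `get_a_cluster_backend()` from the same module:

```python
# Illustrative only: the 'online' key is an assumption about the backend
# info dicts; the docstring's warning is the point being demonstrated.
def validate_backend(self, backend_url: str):
    backend_info = self.get_backend(backend_url)
    if backend_info.get('online'):
        return backend_url
    # Fallback: pick any online backend. Note this does NOT consider
    # which model the caller originally wanted.
    return get_a_cluster_backend()
```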
@@ -68,6 +75,11 @@ cluster_config = RedisClusterStore('cluster_config')
 
 
 def get_backends():
+    """
+    Get all the backends in the cluster, sorted by priority.
+    The first element of the returned tuple is the online backends, the second is the offline ones.
+    :return:
+    """
     backends = cluster_config.all()
     result = {}
     for k, v in backends.items():
@@ -103,13 +115,13 @@ def get_backends():
 def get_a_cluster_backend(model=None):
     """
     Get a backend from Redis. If there are no online backends, return None.
-    If `model` is not supplied, we will pick one ourself.
+    If `model` is not supplied, we will pick one ourselves.
     """
     if model:
         # First, determine if there are multiple backends hosting the same model.
         backends_hosting_model = [i.decode('utf-8') for i in redis_running_models.smembers(model)]
 
-        # If so, create an iterator for those backends
+        # If so, create a Redis "cycle" iterator for those backends.
         if len(backends_hosting_model):
             add_backend_cycler(model, backends_hosting_model)
             cycled = redis_cycle(model)
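When several backends host the requested model, selection round-robins through them. A sketch of the semantics `redis_cycle()` appears to provide (rotate a Redis list and return the new ordering); this is not necessarily the project's exact implementation, and the client setup is illustrative:

```python
# Round-robin over a Redis list: pop the head, push it back on the tail,
# return the rotated list.
from redis import Redis

redis_cycler_db = Redis()

def redis_cycle(list_name: str) -> list:
    item = redis_cycler_db.lpop(list_name)    # current head of the cycle
    if item is None:
        return []
    redis_cycler_db.rpush(list_name, item)    # rotate it to the back
    return [i.decode('utf-8') for i in redis_cycler_db.lrange(list_name, 0, -1)]
```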
@@ -22,6 +22,12 @@ def redis_cycle(list_name):
 
 
 def add_backend_cycler(list_name: str, new_elements: list):
+    """
+    Create a `redis_cycle()` iterator in Redis.
+    :param list_name:
+    :param new_elements:
+    :return:
+    """
     existing_elements = [i.decode('utf-8') for i in redis_cycler_db.lrange(list_name, 0, -1)]
     existing_set = set(existing_elements)
 
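The function presumably goes on to queue only the backends that are not already in the cycle list. One plausible continuation of the two lines shown above; the append step is an assumption, not the project's confirmed code:

```python
# Hypothetical continuation: append unseen backends to the cycle list.
def add_backend_cycler(list_name: str, new_elements: list):
    existing_elements = [i.decode('utf-8') for i in redis_cycler_db.lrange(list_name, 0, -1)]
    existing_set = set(existing_elements)
    to_add = [el for el in new_elements if el not in existing_set]
    if to_add:
        redis_cycler_db.rpush(list_name, *to_add)  # keep the current rotation order
```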
@@ -1,3 +1,7 @@
 from llm_server.custom_redis import RedisCustom
 
+"""
+Global variables for accessing Redis stores.
+"""
+
 redis_running_models = RedisCustom('running_models')
@@ -2,8 +2,12 @@ import time
 from threading import Thread
 
 from llm_server.cluster.backend import test_backend
-from llm_server.cluster.cluster_config import cluster_config
 from llm_server.cluster.stores import redis_running_models
+from llm_server.cluster.cluster_config import cluster_config
 
+"""
+The definition for the cluster worker used to test the backends.
+"""
+
 
 def cluster_worker():
@@ -25,6 +29,13 @@ def cluster_worker():
 
 
 def check_backend(n, v, test_prompt):
+    """
+    The function run by the worker to test a backend.
+    :param n: I don't remember.
+    :param v: I don't remember.
+    :param test_prompt:
+    :return:
+    """
     online, backend_info = test_backend(v['backend_url'], test_prompt=test_prompt)
     if online:
         running_model = backend_info['model']
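For context, `cluster_worker()` presumably fans out one thread per configured backend, each calling `check_backend()`. A sketch of that loop, assuming `cluster_config.all()` (shown earlier in this diff) yields the backend dicts; the poll interval is an assumption:

```python
# Sketch of the fan-out; the 15-second interval is illustrative.
import time
from threading import Thread

def cluster_worker():
    while True:
        threads = []
        for n, v in cluster_config.all().items():
            t = Thread(target=check_backend, args=(n, v, True))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        time.sleep(15)
```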
@@ -7,7 +7,7 @@ import redis as redis_pkg
 import simplejson as json
 from flask_caching import Cache
 from redis import Redis
-from redis.typing import AnyKeyT, EncodableT, ExpiryT, FieldT, KeyT, PatternT, ZScoreBoundT
+from redis.typing import AnyKeyT, EncodableT, ExpiryT, FieldT, KeyT, PatternT, ZScoreBoundT, AbsExpiryT
 
 flask_cache = Cache(config={'CACHE_TYPE': 'RedisCache', 'CACHE_REDIS_URL': 'redis://localhost:6379/15', 'CACHE_KEY_PREFIX': 'local_llm_flask'})
 
@@ -17,9 +17,11 @@ ONE_MONTH_SECONDS = 2678000
 class RedisCustom(Redis):
     """
     A simple wrapper class for Redis to create a "namespace" within a DB,
-    which simplyifies key management.
+    which simplifies key management.
     """
 
+    # TODO: is there a better way to do this instead of overriding every single method?
+
     def __init__(self, prefix, **kwargs):
         super().__init__()
         self.redis = Redis(**kwargs)
@@ -34,7 +36,17 @@ class RedisCustom(Redis):
     def _key(self, key):
         return f"{self.prefix}:{key}"
 
-    def set(self, key, value, ex: Union[ExpiryT, None] = None):
+    def set(self, key: KeyT,
+            value: EncodableT,
+            ex: Union[ExpiryT, None] = None,
+            px: Union[ExpiryT, None] = None,
+            nx: bool = False,
+            xx: bool = False,
+            keepttl: bool = False,
+            get: bool = False,
+            exat: Union[AbsExpiryT, None] = None,
+            pxat: Union[AbsExpiryT, None] = None
+            ):
         return self.redis.set(self._key(key), value, ex=ex)
 
     def get(self, key, default=None, dtype=None):
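The wrapper's namespacing boils down to prefixing every key before delegating to a real `Redis` client. A self-contained sketch of the pattern; `NamespacedRedis` is an illustrative name, not the project's class:

```python
# Minimal sketch of the namespace-by-prefix pattern.
from redis import Redis

class NamespacedRedis:
    def __init__(self, prefix: str, **kwargs):
        self.prefix = prefix
        self.redis = Redis(**kwargs)

    def _key(self, key) -> str:
        return f"{self.prefix}:{key}"      # e.g. 'running_models:llama-13b'

    def set(self, key, value, ex=None):
        return self.redis.set(self._key(key), value, ex=ex)

    def get(self, key):
        return self.redis.get(self._key(key))
```

Worth noting: the widened `set()` signature in the hunk above accepts `px`, `nx`, `keepttl`, and the rest to match `redis.Redis.set()`, but the body still forwards only `ex`; the extra parameters appear to be accepted for signature compatibility rather than implemented.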
@@ -24,12 +24,19 @@ class VLLMBackend(LLMBackend):
         return jsonify({'results': [{'text': backend_response}]}), 200
 
     def get_parameters(self, parameters) -> Tuple[dict | None, str | None]:
+        """
+        Convert the Oobabooga parameters to VLLM and validate them.
+        :param parameters:
+        :return:
+        """
         try:
             # top_k == -1 means disabled
             top_k = parameters.get('top_k', self._default_params['top_k'])
             if top_k <= 0:
                 top_k = -1
 
+            # We call the internal VLLM `SamplingParams` class to validate the input parameters.
+            # Parameters from Oobabooga don't line up here exactly, so we have to shuffle some things around.
             # TODO: support more params
             sampling_params = SamplingParams(
                 temperature=parameters.get('temperature', self._default_params['temperature']),
@@ -45,9 +52,10 @@ class VLLMBackend(LLMBackend):
                 early_stopping=parameters.get('early_stopping', self._default_params['early_stopping'])
             )
         except ValueError as e:
+            # `SamplingParams` will return a pretty error message. Send that back to the caller.
             return None, str(e).strip('.')
 
-        # We use max_new_tokens throughout the server.
+        # We use max_new_tokens throughout this program, so rename the variable.
         result = vars(sampling_params)
         result['max_new_tokens'] = result.pop('max_tokens')
 
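The validate-then-rename flow above leans on vllm's `SamplingParams` raising a readable `ValueError` on bad input, then exposes its attributes via `vars()` (both shown in the hunks). A condensed sketch of the same pattern, assuming vllm 0.2.x is installed; the parameter list is trimmed and the defaults are illustrative:

```python
# Condensed sketch: SamplingParams validates on construction, and vars()
# returns its attributes as a plain dict.
from vllm import SamplingParams

def to_server_params(parameters: dict):
    try:
        sampling_params = SamplingParams(
            temperature=parameters.get('temperature', 1.0),
            top_p=parameters.get('top_p', 1.0),
            max_tokens=parameters.get('max_new_tokens', 16),
        )
    except ValueError as e:
        return None, str(e).strip('.')
    result = vars(sampling_params)
    result['max_new_tokens'] = result.pop('max_tokens')  # server-side name
    return result, None
```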
@@ -1,6 +1,6 @@
 import logging
-from pathlib import Path
 import sys
+from pathlib import Path
 
 import coloredlogs
 
@@ -33,9 +33,10 @@ logging_info = LoggingInfo()
 LOG_DIRECTORY = None
 
 
-def init_logging(filepath:Path=None):
+def init_logging(filepath: Path = None):
     """
-    Set up the parent logger.
+    Set up the parent logger. Ensures this logger and all its children log to a file.
+    This is only called by `server.py` since there is weirdness with Gunicorn. The daemon doesn't need this.
     :return:
     """
     logger = logging.getLogger('llm_server')
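The arrangement relies on the stdlib logger hierarchy: configure the parent `llm_server` logger once, and every child created via `getChild()` inherits its handlers. A minimal stdlib-only sketch; the level and format string are illustrative:

```python
# Stdlib-only sketch of the parent/child handler inheritance.
import logging
from pathlib import Path

def init_logging(filepath: Path = None):
    logger = logging.getLogger('llm_server')
    logger.setLevel(logging.INFO)
    if filepath is not None:
        handler = logging.FileHandler(filepath)
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)

def create_logger(name):
    # Children propagate records up to 'llm_server', so they share its handlers.
    return logging.getLogger('llm_server').getChild(name)
```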
@@ -53,7 +54,6 @@ def init_logging(filepath:Path=None):
     logger.addHandler(handler)
 
 
-
 def create_logger(name):
     logger = logging.getLogger('llm_server').getChild(name)
     logger.setLevel(logging_info.level)
@@ -7,7 +7,7 @@ from llm_server.custom_redis import ONE_MONTH_SECONDS, flask_cache, redis
 from . import openai_bp
 from ..stats import server_start_time
 from ... import opts
-from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
+from ...cluster.cluster_config import get_a_cluster_backend, cluster_config
 from ...helpers import jsonify_pretty
 from ...llm.openai.transform import generate_oai_string
 
@@ -5,7 +5,7 @@ import flask
 from flask import Response, request
 
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
+from llm_server.cluster.cluster_config import get_a_cluster_backend, cluster_config
 from llm_server.custom_redis import redis
 from llm_server.database.database import get_token_ratelimit
 from llm_server.database.log_to_db import log_to_db
@@ -6,7 +6,7 @@ from llm_server.custom_redis import flask_cache
 from . import bp
 from ... import opts
 from ...cluster.backend import get_backends_from_model, is_valid_model
-from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
+from ...cluster.cluster_config import get_a_cluster_backend, cluster_config
 
 
 @bp.route('/v1/model', methods=['GET'])
@@ -4,7 +4,7 @@ from llm_server.custom_redis import flask_cache
 from . import bp
 from .generate_stats import generate_stats
 from ..auth import requires_auth
-from ...cluster.cluster_config import cluster_config, get_backends
+from ...cluster.cluster_config import get_backends, cluster_config
 from ...helpers import jsonify_pretty
 
 
@@ -3,7 +3,7 @@ import time
 import requests
 
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config, get_backends
+from llm_server.cluster.cluster_config import get_backends, cluster_config
 from llm_server.custom_redis import redis
 from llm_server.database.database import weighted_average_column_for_model
 from llm_server.llm.info import get_info
@@ -13,6 +13,6 @@ flask-sock==0.6.0
 gunicorn==21.2.0
 redis==5.0.1
 ujson==5.8.0
-vllm==0.2.1.post1
+vllm==0.2.7
 gradio~=3.46.1
 coloredlogs~=15.0.1