clean some stuff up, bump VLLM version
This commit is contained in:
parent 0e7f04ab2d
commit 4b3e0671c6
@@ -7,6 +7,7 @@ The purpose of this server is to abstract your LLM backend from your frontend API
 ### Install
 
 also need to create /var/log/localllm
+chown -R server:adm /var/log/localllm/
 
 1. `sudo apt install redis`
 2. `python3 -m venv venv`
@@ -1,7 +1,7 @@
 import numpy as np
 
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
+from llm_server.cluster.cluster_config import get_a_cluster_backend, cluster_config
 from llm_server.cluster.stores import redis_running_models
 from llm_server.custom_redis import redis
 from llm_server.llm.generator import generator
@@ -12,26 +12,38 @@ from llm_server.routes.stats import calculate_wait_time, get_active_gen_workers_
 
 
 def get_backends_from_model(model_name: str):
+    """
+    Get the backends that are running a specific model. This is the inverse of `get_model_choices()`.
+    :param model_name:
+    :return:
+    """
     return [x.decode('utf-8') for x in redis_running_models.smembers(model_name)]
 
 
 def get_running_models():
-    return redis_running_models.keys()
+    """
+    Get all the models that are in the cluster.
+    :return:
+    """
+    return list(redis_running_models.keys())
 
 
-def purge_backend_from_running_models(backend_url: str):
-    keys = redis_running_models.keys()
-    pipeline = redis_running_models.pipeline()
-    for model in keys:
-        pipeline.srem(model, backend_url)
-    pipeline.execute()
-
-
-def is_valid_model(model_name: str):
+def is_valid_model(model_name: str) -> bool:
+    """
+    Is this a model that is being hosted in the cluster?
+    :param model_name:
+    :return:
+    """
     return redis_running_models.exists(model_name)
 
 
 def test_backend(backend_url: str, test_prompt: bool = False):
+    """
+    Test (using a test prompt) a backend to check if it is online.
+    :param backend_url:
+    :param test_prompt:
+    :return:
+    """
     backend_info = cluster_config.get_backend(backend_url)
     if test_prompt:
         handler = VLLMBackend(backend_url)
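For context on the data structure these helpers share: each model name is a Redis set whose members are the URLs of the backends serving it (the `running_models` namespace comes from `redis_running_models = RedisCustom('running_models')` later in this diff). A minimal sketch of that inverse index, assuming a local Redis; the backend URLs and model name are invented:

```python
# Sketch only: illustrates the model -> backends inverse index, not the
# project's exact code. URLs and the model name are made up.
from redis import Redis

r = Redis()

# A backend that reports model 'llama-13b' gets added to that model's set.
r.sadd('running_models:llama-13b', 'http://10.0.0.1:7000', 'http://10.0.0.2:7000')

# get_backends_from_model() is then SMEMBERS plus a UTF-8 decode:
backends = [x.decode('utf-8') for x in r.smembers('running_models:llama-13b')]

# ...and is_valid_model() reduces to an EXISTS check on the same key:
print(bool(r.exists('running_models:llama-13b')))  # True while any backend hosts it
```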
@@ -56,7 +68,12 @@ def test_backend(backend_url: str, test_prompt: bool = False):
     return True, i
 
 
-def get_model_choices(regen: bool = False):
+def get_model_choices(regen: bool = False) -> tuple[dict, dict]:
+    """
+    Get the info and stats of the models hosted in the cluster.
+    :param regen:
+    :return:
+    """
     if not regen:
         c = redis.getp('model_choices')
         if c:
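The body continues with a cache-unless-regen pattern. A sketch of how the pieces likely fit together, assuming `getp()`/`setp()` are the pickle-aware get/set helpers of this codebase's Redis wrapper (only `getp` appears in this hunk; `setp` and `_build_model_choices` are hypothetical names):

```python
# Sketch under stated assumptions; `setp` and `_build_model_choices` are
# illustrative, not confirmed by this diff.
def get_model_choices(regen: bool = False) -> tuple[dict, dict]:
    if not regen:
        c = redis.getp('model_choices')      # shown in the hunk above
        if c:
            return c                         # serve the cached value
    result = _build_model_choices()          # hypothetical: recompute from cluster state
    redis.setp('model_choices', result)      # hypothetical pickle-aware setter
    return result
```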
@@ -9,7 +9,13 @@ from llm_server.custom_redis import RedisCustom
 from llm_server.routes.helpers.model import estimate_model_size
 
 
+# Don't try to reorganize this file or else you'll run into circular imports.
+
 class RedisClusterStore:
+    """
+    A class used to store the cluster state in Redis.
+    """
+
     def __init__(self, name: str, **kwargs):
         self.name = name
         self.config_redis = RedisCustom(name, **kwargs)
@@ -52,7 +58,8 @@ class RedisClusterStore:
 
     def validate_backend(self, backend_url: str):
         """
-        Returns the backend URL that was given, or a new one if that was offline.
+        Returns the backend URL that was given.
+        If that backend is offline, it will select a new one. This fallback behavior does NOT take the selected model into account.
         :param backend_url:
         :return:
         """
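A sketch of the fallback the amended docstring describes, assuming the stored backend info dict carries an `online` flag (not shown in this diff) and reusing `get_a_cluster_backend()` from the same module:

```python
# Illustrative only: the 'online' key is an assumption about the backend
# info dicts; the docstring's warning is the point being demonstrated.
def validate_backend(self, backend_url: str):
    backend_info = self.get_backend(backend_url)
    if backend_info.get('online'):
        return backend_url
    # Fallback: pick any online backend. Note this does NOT consider
    # which model the caller originally wanted.
    return get_a_cluster_backend()
```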
@@ -68,6 +75,11 @@ cluster_config = RedisClusterStore('cluster_config')
 
 
 def get_backends():
+    """
+    Get all the backends in the cluster, sorted by priority.
+    The first element of the returned tuple is the online backends, the second is the offline ones.
+    :return:
+    """
     backends = cluster_config.all()
     result = {}
     for k, v in backends.items():
@@ -103,13 +115,13 @@ def get_backends():
 def get_a_cluster_backend(model=None):
     """
     Get a backend from Redis. If there are no online backends, return None.
-    If `model` is not supplied, we will pick one ourself.
+    If `model` is not supplied, we will pick one ourselves.
     """
     if model:
         # First, determine if there are multiple backends hosting the same model.
         backends_hosting_model = [i.decode('utf-8') for i in redis_running_models.smembers(model)]
 
-        # If so, create an iterator for those backends
+        # If so, create a Redis "cycle" iterator for those backends.
         if len(backends_hosting_model):
             add_backend_cycler(model, backends_hosting_model)
             cycled = redis_cycle(model)
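When several backends host the requested model, selection round-robins through them. A sketch of the semantics `redis_cycle()` appears to provide (rotate a Redis list and return the new ordering); this is not necessarily the project's exact implementation, and the client setup is illustrative:

```python
# Round-robin over a Redis list: pop the head, push it back on the tail,
# return the rotated list.
from redis import Redis

redis_cycler_db = Redis()

def redis_cycle(list_name: str) -> list:
    item = redis_cycler_db.lpop(list_name)    # current head of the cycle
    if item is None:
        return []
    redis_cycler_db.rpush(list_name, item)    # rotate it to the back
    return [i.decode('utf-8') for i in redis_cycler_db.lrange(list_name, 0, -1)]
```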
@@ -22,6 +22,12 @@ def redis_cycle(list_name):
 
 
 def add_backend_cycler(list_name: str, new_elements: list):
+    """
+    Create a `redis_cycle()` iterator in Redis.
+    :param list_name:
+    :param new_elements:
+    :return:
+    """
     existing_elements = [i.decode('utf-8') for i in redis_cycler_db.lrange(list_name, 0, -1)]
     existing_set = set(existing_elements)
 
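The function presumably goes on to queue only the backends that are not already in the cycle list. One plausible continuation of the two lines shown above; the append step is an assumption, not the project's confirmed code:

```python
# Hypothetical continuation: append unseen backends to the cycle list.
def add_backend_cycler(list_name: str, new_elements: list):
    existing_elements = [i.decode('utf-8') for i in redis_cycler_db.lrange(list_name, 0, -1)]
    existing_set = set(existing_elements)
    to_add = [el for el in new_elements if el not in existing_set]
    if to_add:
        redis_cycler_db.rpush(list_name, *to_add)  # keep the current rotation order
```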
@@ -1,3 +1,7 @@
 from llm_server.custom_redis import RedisCustom
 
+"""
+Global variables for accessing Redis stores.
+"""
+
 redis_running_models = RedisCustom('running_models')
@@ -2,8 +2,12 @@ import time
 from threading import Thread
 
 from llm_server.cluster.backend import test_backend
-from llm_server.cluster.cluster_config import cluster_config
 from llm_server.cluster.stores import redis_running_models
+from llm_server.cluster.cluster_config import cluster_config
 
+"""
+The definition for the cluster worker used to test the backends.
+"""
+
 
 def cluster_worker():
@@ -25,6 +29,13 @@ def cluster_worker():
 
 
 def check_backend(n, v, test_prompt):
+    """
+    The function run by the worker to test a backend.
+    :param n: I don't remember.
+    :param v: I don't remember.
+    :param test_prompt:
+    :return:
+    """
     online, backend_info = test_backend(v['backend_url'], test_prompt=test_prompt)
     if online:
         running_model = backend_info['model']
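For context, `cluster_worker()` presumably fans out one thread per configured backend, each calling `check_backend()`. A sketch of that loop, assuming `cluster_config.all()` (shown earlier in this diff) yields the backend dicts; the poll interval is an assumption:

```python
# Sketch of the fan-out; the 15-second interval is illustrative.
import time
from threading import Thread

def cluster_worker():
    while True:
        threads = []
        for n, v in cluster_config.all().items():
            t = Thread(target=check_backend, args=(n, v, True))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()
        time.sleep(15)
```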
@@ -7,7 +7,7 @@ import redis as redis_pkg
 import simplejson as json
 from flask_caching import Cache
 from redis import Redis
-from redis.typing import AnyKeyT, EncodableT, ExpiryT, FieldT, KeyT, PatternT, ZScoreBoundT
+from redis.typing import AnyKeyT, EncodableT, ExpiryT, FieldT, KeyT, PatternT, ZScoreBoundT, AbsExpiryT
 
 flask_cache = Cache(config={'CACHE_TYPE': 'RedisCache', 'CACHE_REDIS_URL': 'redis://localhost:6379/15', 'CACHE_KEY_PREFIX': 'local_llm_flask'})
 
@@ -17,9 +17,11 @@ ONE_MONTH_SECONDS = 2678000
 class RedisCustom(Redis):
     """
     A simple wrapper class for Redis to create a "namespace" within a DB,
-    which simplyifies key management.
+    which simplifies key management.
     """
 
+    # TODO: is there a better way to do this instead of overriding every single method?
+
     def __init__(self, prefix, **kwargs):
         super().__init__()
         self.redis = Redis(**kwargs)
@@ -34,7 +36,17 @@ class RedisCustom(Redis):
     def _key(self, key):
         return f"{self.prefix}:{key}"
 
-    def set(self, key, value, ex: Union[ExpiryT, None] = None):
+    def set(self, key: KeyT,
+            value: EncodableT,
+            ex: Union[ExpiryT, None] = None,
+            px: Union[ExpiryT, None] = None,
+            nx: bool = False,
+            xx: bool = False,
+            keepttl: bool = False,
+            get: bool = False,
+            exat: Union[AbsExpiryT, None] = None,
+            pxat: Union[AbsExpiryT, None] = None
+            ):
         return self.redis.set(self._key(key), value, ex=ex)
 
     def get(self, key, default=None, dtype=None):
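The wrapper's namespacing boils down to prefixing every key before delegating to a real `Redis` client. A self-contained sketch of the pattern; `NamespacedRedis` is an illustrative name, not the project's class:

```python
# Minimal sketch of the namespace-by-prefix pattern.
from redis import Redis

class NamespacedRedis:
    def __init__(self, prefix: str, **kwargs):
        self.prefix = prefix
        self.redis = Redis(**kwargs)

    def _key(self, key) -> str:
        return f"{self.prefix}:{key}"      # e.g. 'running_models:llama-13b'

    def set(self, key, value, ex=None):
        return self.redis.set(self._key(key), value, ex=ex)

    def get(self, key):
        return self.redis.get(self._key(key))
```

Worth noting: the widened `set()` signature in the hunk above accepts `px`, `nx`, `keepttl`, and the rest to match `redis.Redis.set()`, but the body still forwards only `ex`; the extra parameters appear to be accepted for signature compatibility rather than implemented.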
@@ -24,12 +24,19 @@ class VLLMBackend(LLMBackend):
         return jsonify({'results': [{'text': backend_response}]}), 200
 
     def get_parameters(self, parameters) -> Tuple[dict | None, str | None]:
+        """
+        Convert the Oobabooga parameters to VLLM and validate them.
+        :param parameters:
+        :return:
+        """
         try:
             # top_k == -1 means disabled
             top_k = parameters.get('top_k', self._default_params['top_k'])
             if top_k <= 0:
                 top_k = -1
 
+            # We call the internal VLLM `SamplingParams` class to validate the input parameters.
+            # Parameters from Oobabooga don't line up here exactly, so we have to shuffle some things around.
             # TODO: support more params
             sampling_params = SamplingParams(
                 temperature=parameters.get('temperature', self._default_params['temperature']),
@@ -45,9 +52,10 @@ class VLLMBackend(LLMBackend):
                 early_stopping=parameters.get('early_stopping', self._default_params['early_stopping'])
             )
         except ValueError as e:
+            # `SamplingParams` will return a pretty error message. Send that back to the caller.
             return None, str(e).strip('.')
 
-        # We use max_new_tokens throughout the server.
+        # We use max_new_tokens throughout this program, so rename the variable.
         result = vars(sampling_params)
         result['max_new_tokens'] = result.pop('max_tokens')
 
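The validate-then-rename flow above leans on vllm's `SamplingParams` raising a readable `ValueError` on bad input, then exposes its attributes via `vars()` (both shown in the hunks). A condensed sketch of the same pattern, assuming vllm 0.2.x is installed; the parameter list is trimmed and the defaults are illustrative:

```python
# Condensed sketch: SamplingParams validates on construction, and vars()
# returns its attributes as a plain dict.
from vllm import SamplingParams

def to_server_params(parameters: dict):
    try:
        sampling_params = SamplingParams(
            temperature=parameters.get('temperature', 1.0),
            top_p=parameters.get('top_p', 1.0),
            max_tokens=parameters.get('max_new_tokens', 16),
        )
    except ValueError as e:
        return None, str(e).strip('.')
    result = vars(sampling_params)
    result['max_new_tokens'] = result.pop('max_tokens')  # server-side name
    return result, None
```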
@@ -1,6 +1,6 @@
 import logging
-from pathlib import Path
 import sys
+from pathlib import Path
 
 import coloredlogs
 
@@ -33,9 +33,10 @@ logging_info = LoggingInfo()
 LOG_DIRECTORY = None
 
 
-def init_logging(filepath:Path=None):
+def init_logging(filepath: Path = None):
     """
-    Set up the parent logger.
+    Set up the parent logger. Ensures this logger and all its children log to a file.
+    This is only called by `server.py` since there is weirdness with Gunicorn. The daemon doesn't need this.
     :return:
     """
     logger = logging.getLogger('llm_server')
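The arrangement relies on the stdlib logger hierarchy: configure the parent `llm_server` logger once, and every child created via `getChild()` inherits its handlers. A minimal stdlib-only sketch; the level and format string are illustrative:

```python
# Stdlib-only sketch of the parent/child handler inheritance.
import logging
from pathlib import Path

def init_logging(filepath: Path = None):
    logger = logging.getLogger('llm_server')
    logger.setLevel(logging.INFO)
    if filepath is not None:
        handler = logging.FileHandler(filepath)
        handler.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)

def create_logger(name):
    # Children propagate records up to 'llm_server', so they share its handlers.
    return logging.getLogger('llm_server').getChild(name)
```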
@@ -53,7 +54,6 @@ def init_logging(filepath:Path=None):
     logger.addHandler(handler)
 
 
-
 def create_logger(name):
     logger = logging.getLogger('llm_server').getChild(name)
     logger.setLevel(logging_info.level)
@@ -7,7 +7,7 @@ from llm_server.custom_redis import ONE_MONTH_SECONDS, flask_cache, redis
 from . import openai_bp
 from ..stats import server_start_time
 from ... import opts
-from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
+from ...cluster.cluster_config import get_a_cluster_backend, cluster_config
 from ...helpers import jsonify_pretty
 from ...llm.openai.transform import generate_oai_string
 
@@ -5,7 +5,7 @@ import flask
 from flask import Response, request
 
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
+from llm_server.cluster.cluster_config import get_a_cluster_backend, cluster_config
 from llm_server.custom_redis import redis
 from llm_server.database.database import get_token_ratelimit
 from llm_server.database.log_to_db import log_to_db
@@ -6,7 +6,7 @@ from llm_server.custom_redis import flask_cache
 from . import bp
 from ... import opts
 from ...cluster.backend import get_backends_from_model, is_valid_model
-from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
+from ...cluster.cluster_config import get_a_cluster_backend, cluster_config
 
 
 @bp.route('/v1/model', methods=['GET'])
@@ -4,7 +4,7 @@ from llm_server.custom_redis import flask_cache
 from . import bp
 from .generate_stats import generate_stats
 from ..auth import requires_auth
-from ...cluster.cluster_config import cluster_config, get_backends
+from ...cluster.cluster_config import get_backends, cluster_config
 from ...helpers import jsonify_pretty
 
 
@@ -3,7 +3,7 @@ import time
 import requests
 
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config, get_backends
+from llm_server.cluster.cluster_config import get_backends, cluster_config
 from llm_server.custom_redis import redis
 from llm_server.database.database import weighted_average_column_for_model
 from llm_server.llm.info import get_info
@@ -13,6 +13,6 @@ flask-sock==0.6.0
 gunicorn==21.2.0
 redis==5.0.1
 ujson==5.8.0
-vllm==0.2.1.post1
+vllm==0.2.7
 gradio~=3.46.1
 coloredlogs~=15.0.1