From 167059490897fed585db72e956de215cf3377d9e Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Wed, 4 Oct 2023 16:29:19 -0600
Subject: [PATCH] fix import error

---
 llm_server/cluster/backend.py               | 150 ++++++++++----------
 llm_server/cluster/cluster_config.py        |  57 ++++++++
 llm_server/cluster/model_choices.py         |  69 ---------
 llm_server/cluster/redis_config_cache.py    |  15 +-
 llm_server/cluster/worker.py                |   2 +-
 llm_server/llm/vllm/tokenize.py             |   9 +-
 llm_server/routes/openai/models.py          |   3 +-
 llm_server/routes/openai_request_handler.py |   2 +-
 llm_server/routes/request_handler.py        |   3 +-
 llm_server/routes/v1/generate_stats.py      |   2 +-
 llm_server/routes/v1/info.py                |   4 +-
 llm_server/routes/v1/proxy.py               |   3 +-
 llm_server/workers/inferencer.py            |   3 +-
 llm_server/workers/mainer.py                |   3 +-
 server.py                                   |   2 +-
 15 files changed, 160 insertions(+), 167 deletions(-)

diff --git a/llm_server/cluster/backend.py b/llm_server/cluster/backend.py
index cadf86e..c301f93 100644
--- a/llm_server/cluster/backend.py
+++ b/llm_server/cluster/backend.py
@@ -1,10 +1,33 @@
+import numpy as np
+
 from llm_server import opts
-from llm_server.cluster.cluster_config import cluster_config
-from llm_server.cluster.redis_cycle import add_backend_cycler, redis_cycle
+from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
 from llm_server.cluster.stores import redis_running_models
+from llm_server.custom_redis import redis
 from llm_server.llm.generator import generator
 from llm_server.llm.info import get_info
-from llm_server.routes.helpers.model import estimate_model_size
+from llm_server.routes.queue import priority_queue
+from llm_server.routes.stats import get_active_gen_workers_model, calculate_wait_time
+
+
+def get_backends_from_model(model_name: str):
+    return [x.decode('utf-8') for x in redis_running_models.smembers(model_name)]
+
+
+def get_running_models():
+    return redis_running_models.keys()
+
+
+def purge_backend_from_running_models(backend_url: str):
+    keys = redis_running_models.keys()
+    pipeline = redis_running_models.pipeline()
+    for model in keys:
+        pipeline.srem(model, backend_url)
+    pipeline.execute()
+
+
+def is_valid_model(model_name: str):
+    return redis_running_models.exists(model_name)
 
 
 def test_backend(backend_url: str, test_prompt: bool = False):
@@ -28,81 +51,64 @@
     return True, i
 
 
-def get_backends():
-    backends = cluster_config.all()
-    result = {}
-    for k, v in backends.items():
-        b = cluster_config.get_backend(k)
-        status = b.get('online', False)
-        priority = b['priority']
-        result[k] = {'status': status, 'priority': priority}
-
-    if not opts.prioritize_by_size:
-        online_backends = sorted(
-            ((url, info) for url, info in backends.items() if info['online']),
-            key=lambda kv: -kv[1]['priority'],
-            reverse=True
-        )
-    else:
-        online_backends = sorted(
-            ((url, info) for url, info in backends.items() if info['online']),
-            key=lambda kv: estimate_model_size(kv[1]['model_config']),
-            reverse=True
-        )
-    offline_backends = sorted(
-        ((url, info) for url, info in backends.items() if not info['online']),
-        key=lambda kv: -kv[1]['priority'],
-        reverse=True
-    )
-    return [url for url, info in online_backends], [url for url, info in offline_backends]
-
-
-def get_a_cluster_backend(model=None):
-    """
-    Get a backend from Redis. If there are no online backends, return None.
-    If `model` is not supplied, we will pick one ourself.
-    """
-    if model:
-        # First, determine if there are multiple backends hosting the same model.
-        backends_hosting_model = [i.decode('utf-8') for i in redis_running_models.smembers(model)]
+def get_model_choices(regen: bool = False):
+    if not regen:
+        c = redis.getp('model_choices')
+        if c:
+            return c
 
-        # If so, create an iterator for those backends
-        if len(backends_hosting_model):
-            add_backend_cycler(model, backends_hosting_model)
-            cycled = redis_cycle(model)
-            if len(cycled):
-                return cycled[0]
+    base_client_api = redis.get('base_client_api', dtype=str)
+    running_models = get_running_models()
+    model_choices = {}
+    for model in running_models:
+        b = get_backends_from_model(model)
+
+        context_size = []
+        avg_gen_per_worker = []
+        concurrent_gens = 0
+        for backend_url in b:
+            backend_info = cluster_config.get_backend(backend_url)
+            if backend_info.get('model_config'):
+                context_size.append(backend_info['model_config']['max_position_embeddings'])
+            if backend_info.get('average_generation_elapsed_sec'):
+                avg_gen_per_worker.append(backend_info['average_generation_elapsed_sec'])
+            concurrent_gens += backend_info['concurrent_gens']
+
+        active_gen_workers = get_active_gen_workers_model(model)
+        proompters_in_queue = priority_queue.len(model)
+
+        if len(avg_gen_per_worker):
+            average_generation_elapsed_sec = np.average(avg_gen_per_worker)
         else:
-            # No backend hosting that model
-            return None
-    else:
-        online, _ = get_backends()
-        if len(online):
-            return online[0]
+            average_generation_elapsed_sec = 0
+        estimated_wait_sec = calculate_wait_time(average_generation_elapsed_sec, proompters_in_queue, concurrent_gens, active_gen_workers)
 
+        model_choices[model] = {
+            'model': model,
+            'client_api': f'https://{base_client_api}/{model}',
+            'ws_client_api': f'wss://{base_client_api}/{model}/v1/stream' if opts.enable_streaming else None,
+            'openai_client_api': f'https://{base_client_api}/openai/{model}' if opts.enable_openi_compatible_backend else 'disabled',
+            'backend_count': len(b),
+            'estimated_wait': estimated_wait_sec,
+            'queued': proompters_in_queue,
+            'processing': active_gen_workers,
+            'avg_generation_time': average_generation_elapsed_sec,
+            'concurrent_gens': concurrent_gens
+        }
 
-def get_backends_from_model(model_name: str):
-    return [x.decode('utf-8') for x in redis_running_models.smembers(model_name)]
+        if len(context_size):
+            model_choices[model]['context_size'] = min(context_size)
 
+    # Python wants to sort lowercase vs. uppercase letters differently.
+    model_choices = dict(sorted(model_choices.items(), key=lambda item: item[0].upper()))
 
-# def verify_context_size(model_name:str):
-#     b = get_backends_from_model(model_name)
-#     for backend_url in b:
-#         backend_info = cluster_config.get_backend(backend_url)
-#         backend_info.get()
+    default_backend_url = get_a_cluster_backend()
+    default_backend_info = cluster_config.get_backend(default_backend_url)
+    if not default_backend_info.get('model'):
+        return None, None
+    default_model = default_backend_info['model']
 
-
-def get_running_models():
-    return redis_running_models.keys()
-
-
-def purge_backend_from_running_models(backend_url: str):
-    keys = redis_running_models.keys()
-    pipeline = redis_running_models.pipeline()
-    for model in keys:
-        pipeline.srem(model, backend_url)
-    pipeline.execute()
-
-
-def is_valid_model(model_name: str):
-    return redis_running_models.exists(model_name)
+    redis.setp('model_choices', (model_choices, default_model))
+    return model_choices, default_model
diff --git a/llm_server/cluster/cluster_config.py b/llm_server/cluster/cluster_config.py
index 14a6cb0..17e9b05 100644
--- a/llm_server/cluster/cluster_config.py
+++ b/llm_server/cluster/cluster_config.py
@@ -1,3 +1,60 @@
+from llm_server import opts
 from llm_server.cluster.redis_config_cache import RedisClusterStore
+from llm_server.cluster.redis_cycle import add_backend_cycler, redis_cycle
+from llm_server.cluster.stores import redis_running_models
+from llm_server.routes.helpers.model import estimate_model_size
 
 cluster_config = RedisClusterStore('cluster_config')
+
+
+def get_backends():
+    backends = cluster_config.all()
+    result = {}
+    for k, v in backends.items():
+        b = cluster_config.get_backend(k)
+        status = b.get('online', False)
+        priority = b['priority']
+        result[k] = {'status': status, 'priority': priority}
+
+    if not opts.prioritize_by_size:
+        online_backends = sorted(
+            ((url, info) for url, info in backends.items() if info['online']),
+            key=lambda kv: -kv[1]['priority'],
+            reverse=True
+        )
+    else:
+        online_backends = sorted(
+            ((url, info) for url, info in backends.items() if info['online']),
+            key=lambda kv: estimate_model_size(kv[1]['model_config']),
+            reverse=True
+        )
+    offline_backends = sorted(
+        ((url, info) for url, info in backends.items() if not info['online']),
+        key=lambda kv: -kv[1]['priority'],
+        reverse=True
+    )
+    return [url for url, info in online_backends], [url for url, info in offline_backends]
+
+
+def get_a_cluster_backend(model=None):
+    """
+    Get a backend from Redis. If there are no online backends, return None.
+    If `model` is not supplied, we will pick one ourself.
+    """
+    if model:
+        # First, determine if there are multiple backends hosting the same model.
+        backends_hosting_model = [i.decode('utf-8') for i in redis_running_models.smembers(model)]
+
+        # If so, create an iterator for those backends
+        if len(backends_hosting_model):
+            add_backend_cycler(model, backends_hosting_model)
+            cycled = redis_cycle(model)
+            if len(cycled):
+                return cycled[0]
+        else:
+            # No backend hosting that model
+            return None
+    else:
+        online, _ = get_backends()
+        if len(online):
+            return online[0]
diff --git a/llm_server/cluster/model_choices.py b/llm_server/cluster/model_choices.py
index 4333dde..ef93bba 100644
--- a/llm_server/cluster/model_choices.py
+++ b/llm_server/cluster/model_choices.py
@@ -1,70 +1 @@
-import numpy as np
-
-from llm_server import opts
-from llm_server.cluster.backend import get_a_cluster_backend, get_backends_from_model, get_running_models
-from llm_server.cluster.cluster_config import cluster_config
-from llm_server.custom_redis import redis
-from llm_server.routes.queue import priority_queue
-from llm_server.routes.stats import calculate_wait_time, get_active_gen_workers_model
-
-
 # TODO: give this a better name!
-def get_model_choices(regen: bool = False):
-    if not regen:
-        c = redis.getp('model_choices')
-        if c:
-            return c
-
-    base_client_api = redis.get('base_client_api', dtype=str)
-    running_models = get_running_models()
-    model_choices = {}
-    for model in running_models:
-        b = get_backends_from_model(model)
-
-        context_size = []
-        avg_gen_per_worker = []
-        concurrent_gens = 0
-        for backend_url in b:
-            backend_info = cluster_config.get_backend(backend_url)
-            if backend_info.get('model_config'):
-                context_size.append(backend_info['model_config']['max_position_embeddings'])
-            if backend_info.get('average_generation_elapsed_sec'):
-                avg_gen_per_worker.append(backend_info['average_generation_elapsed_sec'])
-            concurrent_gens += backend_info['concurrent_gens']
-
-        active_gen_workers = get_active_gen_workers_model(model)
-        proompters_in_queue = priority_queue.len(model)
-
-        if len(avg_gen_per_worker):
-            average_generation_elapsed_sec = np.average(avg_gen_per_worker)
-        else:
-            average_generation_elapsed_sec = 0
-        estimated_wait_sec = calculate_wait_time(average_generation_elapsed_sec, proompters_in_queue, concurrent_gens, active_gen_workers)
-
-        model_choices[model] = {
-            'model': model,
-            'client_api': f'https://{base_client_api}/{model}',
-            'ws_client_api': f'wss://{base_client_api}/{model}/v1/stream' if opts.enable_streaming else None,
-            'openai_client_api': f'https://{base_client_api}/openai/{model}' if opts.enable_openi_compatible_backend else 'disabled',
-            'backend_count': len(b),
-            'estimated_wait': estimated_wait_sec,
-            'queued': proompters_in_queue,
-            'processing': active_gen_workers,
-            'avg_generation_time': average_generation_elapsed_sec,
-            'concurrent_gens': concurrent_gens
-        }
-
-        if len(context_size):
-            model_choices[model]['context_size'] = min(context_size)
-
-    # Python wants to sort lowercase vs. uppercase letters differently.
-    model_choices = dict(sorted(model_choices.items(), key=lambda item: item[0].upper()))
-
-    default_backend_url = get_a_cluster_backend()
-    default_backend_info = cluster_config.get_backend(default_backend_url)
-    if not default_backend_info.get('model'):
-        return None, None
-    default_model = default_backend_info['model']
-
-    redis.setp('model_choices', (model_choices, default_model))
-    return model_choices, default_model
diff --git a/llm_server/cluster/redis_config_cache.py b/llm_server/cluster/redis_config_cache.py
index 3bab915..da0b581 100644
--- a/llm_server/cluster/redis_config_cache.py
+++ b/llm_server/cluster/redis_config_cache.py
@@ -45,5 +45,16 @@ class RedisClusterStore:
         else:
             return {}
 
-    # def get(self, name: str):
-    #     return self.all().get(name)
+    def validate_backend(self, backend_url: str):
+        """
+        Returns the backend URL that was given, or a new one if that was offline.
+        :param backend_url:
+        :return:
+        """
+        backend_info = self.get_backend(backend_url)
+        if not backend_info['online']:
+            old = backend_url
+            backend_url = get_a_cluster_backend()
+            print(f'Backend {old} offline. Request was redirected to {backend_url}')
+        return backend_url
+
diff --git a/llm_server/cluster/worker.py b/llm_server/cluster/worker.py
index 7956198..951f18d 100644
--- a/llm_server/cluster/worker.py
+++ b/llm_server/cluster/worker.py
@@ -1,8 +1,8 @@
 import time
 from threading import Thread
 
-from llm_server.cluster.backend import test_backend
 from llm_server.cluster.cluster_config import cluster_config
+from llm_server.cluster.backend import test_backend
 from llm_server.cluster.stores import redis_running_models
 
 
diff --git a/llm_server/llm/vllm/tokenize.py b/llm_server/llm/vllm/tokenize.py
index 2df7789..db2c49d 100644
--- a/llm_server/llm/vllm/tokenize.py
+++ b/llm_server/llm/vllm/tokenize.py
@@ -4,7 +4,6 @@ import requests
 import tiktoken
 
 from llm_server import opts
-from llm_server.cluster.backend import get_a_cluster_backend
 from llm_server.cluster.cluster_config import cluster_config
 
 
@@ -13,15 +12,9 @@ def tokenize(prompt: str, backend_url: str) -> int:
     assert isinstance(prompt, str)
     assert isinstance(backend_url, str)
 
-    # TODO: put this in a shared function
     # The backend could have died between when the request was
     # submitted and now, so let's double check it's still online.
-    backend_info = cluster_config.get_backend(backend_url)
-    if not backend_info['online']:
-        old = backend_url
-        backend_url = get_a_cluster_backend()
-        print(f'Backend {old} offline. Request was redirected to {backend_url}')
-        del old  # gc
+    backend_url = cluster_config.validate_backend(backend_url)
 
     if not prompt:
         # The tokenizers have issues when the prompt is None.
diff --git a/llm_server/routes/openai/models.py b/llm_server/routes/openai/models.py
index 9f2845d..2ff0629 100644
--- a/llm_server/routes/openai/models.py
+++ b/llm_server/routes/openai/models.py
@@ -7,8 +7,7 @@ from llm_server.custom_redis import ONE_MONTH_SECONDS, flask_cache, redis
 from . import openai_bp
 from ..stats import server_start_time
 from ... import opts
-from ...cluster.backend import get_a_cluster_backend
-from ...cluster.cluster_config import cluster_config
+from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
 from ...helpers import jsonify_pretty
 from ...llm.openai.transform import generate_oai_string
 
diff --git a/llm_server/routes/openai_request_handler.py b/llm_server/routes/openai_request_handler.py
index 69029be..84b2c76 100644
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@@ -9,7 +9,7 @@ import flask
 from flask import Response, jsonify, make_response
 
 from llm_server import opts
-from llm_server.cluster.model_choices import get_model_choices
+from llm_server.cluster.backend import get_model_choices
 from llm_server.custom_redis import redis
 from llm_server.database.database import is_api_key_moderated, log_prompt
 from llm_server.llm import get_token_count
diff --git a/llm_server/routes/request_handler.py b/llm_server/routes/request_handler.py
index d5d7175..d981be8 100644
--- a/llm_server/routes/request_handler.py
+++ b/llm_server/routes/request_handler.py
@@ -5,8 +5,7 @@ import flask
 from flask import Response, request
 
 from llm_server import opts
-from llm_server.cluster.backend import get_a_cluster_backend
-from llm_server.cluster.cluster_config import cluster_config
+from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
 from llm_server.custom_redis import redis
 from llm_server.database.database import get_token_ratelimit, log_prompt
 from llm_server.helpers import auto_set_base_client_api
diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index 23902b7..3bfbca6 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -3,7 +3,7 @@ from datetime import datetime
 
 from llm_server import opts
 from llm_server.cluster.cluster_config import cluster_config
-from llm_server.cluster.model_choices import get_model_choices
+from llm_server.cluster.backend import get_model_choices
 from llm_server.custom_redis import redis
 from llm_server.database.database import get_distinct_ips_24h, sum_column
 from llm_server.helpers import deep_sort
diff --git a/llm_server/routes/v1/info.py b/llm_server/routes/v1/info.py
index 6e37720..342921e 100644
--- a/llm_server/routes/v1/info.py
+++ b/llm_server/routes/v1/info.py
@@ -5,8 +5,8 @@ from flask import jsonify, request
 from llm_server.custom_redis import flask_cache
 from . import bp
 from ... import opts
-from ...cluster.backend import get_a_cluster_backend, get_backends_from_model, is_valid_model
-from ...cluster.cluster_config import cluster_config
+from ...cluster.backend import get_backends_from_model, is_valid_model
+from ...cluster.cluster_config import cluster_config, get_a_cluster_backend
 
 
 @bp.route('/v1/model', methods=['GET'])
diff --git a/llm_server/routes/v1/proxy.py b/llm_server/routes/v1/proxy.py
index e5ff5d3..6e3708e 100644
--- a/llm_server/routes/v1/proxy.py
+++ b/llm_server/routes/v1/proxy.py
@@ -4,8 +4,7 @@ from llm_server.custom_redis import flask_cache
 from . import bp
 from .generate_stats import generate_stats
 from ..auth import requires_auth
-from ...cluster.backend import get_backends
-from ...cluster.cluster_config import cluster_config
+from ...cluster.cluster_config import cluster_config, get_backends
 from ...helpers import jsonify_pretty
 
 
diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py
index c5eb12a..84faceb 100644
--- a/llm_server/workers/inferencer.py
+++ b/llm_server/workers/inferencer.py
@@ -1,8 +1,7 @@
 import threading
 import time
 
-from llm_server.cluster.backend import get_a_cluster_backend
-from llm_server.cluster.cluster_config import cluster_config
+from llm_server.cluster.cluster_config import cluster_config, get_a_cluster_backend
 from llm_server.custom_redis import redis
 from llm_server.llm.generator import generator
 from llm_server.routes.queue import DataEvent, decr_active_workers, decrement_ip_count, incr_active_workers, increment_ip_count, priority_queue
diff --git a/llm_server/workers/mainer.py b/llm_server/workers/mainer.py
index 580060d..37c1178 100644
--- a/llm_server/workers/mainer.py
+++ b/llm_server/workers/mainer.py
@@ -3,8 +3,7 @@ import time
 import requests
 
 from llm_server import opts
-from llm_server.cluster.backend import get_backends
-from llm_server.cluster.cluster_config import cluster_config
+from llm_server.cluster.cluster_config import cluster_config, get_backends
 from llm_server.custom_redis import redis
 from llm_server.database.database import weighted_average_column_for_model
 from llm_server.llm.info import get_info
diff --git a/server.py b/server.py
index 92d768d..8d723ab 100644
--- a/server.py
+++ b/server.py
@@ -13,7 +13,7 @@ import simplejson as json
 from flask import Flask, jsonify, render_template, request
 
 from llm_server.cluster.cluster_config import cluster_config
-from llm_server.cluster.model_choices import get_model_choices
+from llm_server.cluster.backend import get_model_choices
 from llm_server.config.config import mode_ui_names
 from llm_server.config.load import load_config
 from llm_server.database.conn import database