diff --git a/llm_server/cluster/model_choices.py b/llm_server/cluster/model_choices.py
index c9a94fd..ec78e2f 100644
--- a/llm_server/cluster/model_choices.py
+++ b/llm_server/cluster/model_choices.py
@@ -54,7 +54,7 @@ def get_model_choices(regen: bool = False):
             'estimated_wait': estimated_wait_sec,
             'queued': proompters_in_queue,
             'processing': active_gen_workers,
-            'avg_generation_time': average_generation_elapsed_sec
+            'avg_generation_time': average_generation_elapsed_sec,
         }
 
         if len(context_size):
@@ -63,25 +63,28 @@ def get_model_choices(regen: bool = False):
     model_choices = dict(sorted(model_choices.items()))
 
     default_backend = get_a_cluster_backend()
-    default_backend_info = cluster_config.get_backend(default_backend)
-    default_context_size = default_backend_info['model_config']['max_position_embeddings']
-    default_average_generation_elapsed_sec = default_backend_info.get('average_generation_elapsed_sec')
-    default_active_gen_workers = redis.get(f'active_gen_workers:{default_backend}', dtype=int, default=0)
-    default_proompters_in_queue = priority_queue.len(default_backend_info['model'])
-    default_estimated_wait_sec = calculate_wait_time(default_average_generation_elapsed_sec, default_proompters_in_queue, default_backend_info['concurrent_gens'], default_active_gen_workers)
+    default_backend_dict = {}
+    if default_backend:
+        default_backend_info = cluster_config.get_backend(default_backend)
+        default_context_size = default_backend_info['model_config']['max_position_embeddings']
+        default_average_generation_elapsed_sec = default_backend_info.get('average_generation_elapsed_sec')
+        default_active_gen_workers = redis.get(f'active_gen_workers:{default_backend}', dtype=int, default=0)
+        default_proompters_in_queue = priority_queue.len(default_backend_info['model'])
+        default_estimated_wait_sec = calculate_wait_time(default_average_generation_elapsed_sec, default_proompters_in_queue, default_backend_info['concurrent_gens'], default_active_gen_workers)
 
-    default_backend_dict = {
-        'client_api': f'https://{base_client_api}/v2',
-        'ws_client_api': f'wss://{base_client_api}/v2' if opts.enable_streaming else None,
-        'openai_client_api': f'https://{base_client_api}/openai/v2' if opts.enable_openi_compatible_backend else 'disabled',
-        'estimated_wait': default_estimated_wait_sec,
-        'queued': default_proompters_in_queue,
-        'processing': default_active_gen_workers,
-        'context_size': default_context_size,
-        'hash': default_backend_info['hash'],
-        'model': default_backend_info['model'],
-        'avg_generation_time': default_average_generation_elapsed_sec
-    }
+        default_backend_dict = {
+            'client_api': f'https://{base_client_api}/v2',
+            'ws_client_api': f'wss://{base_client_api}/v2' if opts.enable_streaming else None,
+            'openai_client_api': f'https://{base_client_api}/openai/v2' if opts.enable_openi_compatible_backend else 'disabled',
+            'estimated_wait': default_estimated_wait_sec,
+            'queued': default_proompters_in_queue,
+            'processing': default_active_gen_workers,
+            'context_size': default_context_size,
+            'hash': default_backend_info['hash'],
+            'model': default_backend_info['model'],
+            'avg_generation_time': default_average_generation_elapsed_sec,
+            'online': True
+        }
 
     redis.setp('model_choices', (model_choices, default_backend_dict))
diff --git a/llm_server/routes/request_handler.py b/llm_server/routes/request_handler.py
index 83f510a..0dd862a 100644
--- a/llm_server/routes/request_handler.py
+++ b/llm_server/routes/request_handler.py
@@ -39,6 +39,10 @@ class RequestHandler:
         self.token_priority, self.token_simultaneous_ip = self.get_token_ratelimit()
         self.backend_url = get_a_cluster_backend(selected_model)
         self.cluster_backend_info = cluster_config.get_backend(self.backend_url)
+
+        if not self.cluster_backend_info.get('mode'):
+            print(self.backend_url, self.cluster_backend_info)
+
         self.backend = get_backend_handler(self.cluster_backend_info['mode'], self.backend_url)
         self.parameters = None
         self.used = False
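For context on the `model_choices.py` hunk: `get_a_cluster_backend()` can come back empty when no backend is online, so the rewritten block only queries backend details inside `if default_backend:` and otherwise publishes an empty dict. Below is a minimal standalone sketch of that guard pattern; the helper name and fields are hypothetical and not part of this repo's API.

```python
from typing import Optional


def summarize_default_backend(default_backend: Optional[str], backend_info: Optional[dict] = None) -> dict:
    """Hypothetical helper mirroring the guarded block added to get_model_choices()."""
    default_backend_dict = {}  # falsy fallback when no default backend is online
    if default_backend:
        backend_info = backend_info or {}
        default_backend_dict = {
            'model': backend_info.get('model', default_backend),
            'hash': backend_info.get('hash'),
            'online': True,  # lets consumers distinguish "no default" from a populated entry
        }
    return default_backend_dict


# Consumers branch on the dict instead of hitting KeyErrors when nothing is online:
if not summarize_default_backend(None):
    print('no default backend online')
```

Falling back to `{}` rather than `None` means anything reading the cached `model_choices` value can keep using the same `.get()` lookups whether or not a default backend was selected.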