@@ -13,6 +13,7 @@ from llm_server.helpers import auto_set_base_client_api
from llm_server.llm.oobabooga.ooba_backend import OobaboogaBackend
from llm_server.llm.vllm.vllm_backend import VLLMBackend
from llm_server.routes.auth import parse_token
from llm_server.routes.helpers.client import format_sillytavern_err
from llm_server.routes.helpers.http import require_api_key, validate_json
from llm_server.routes.queue import priority_queue
@@ -42,6 +43,11 @@ class RequestHandler:
        if not self.cluster_backend_info.get('model'):
            print('keyerror: mode -', selected_model, self.backend_url, self.cluster_backend_info)

        if not self.cluster_backend_info.get('mode') or not self.cluster_backend_info.get('model'):
            self.offline = True
        else:
            self.offline = False

        self.selected_model = self.cluster_backend_info['model']
        self.backend = get_backend_handler(self.cluster_backend_info['mode'], self.backend_url)
        self.parameters = None
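get_backend_handler itself is not shown in this diff. Given the OobaboogaBackend and VLLMBackend imports in the first hunk, it presumably maps the cluster's reported mode string to one of those backend classes; the sketch below is an assumption about that dispatch, not the project's actual implementation (the mode strings and the no-argument constructors are guesses).

from llm_server.llm.oobabooga.ooba_backend import OobaboogaBackend
from llm_server.llm.vllm.vllm_backend import VLLMBackend


def get_backend_handler(mode: str, backend_url: str):
    # Assumed mode strings; the real mapping is not shown in this diff.
    # backend_url is accepted to match the call site above, but how the
    # real implementation uses it (if at all) is not visible here.
    if mode == 'oobabooga':
        return OobaboogaBackend()
    if mode == 'vllm':
        return VLLMBackend()
    raise Exception(f'Unknown backend mode: {mode}')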
@@ -215,8 +221,11 @@ class RequestHandler:

    def handle_request(self) -> Tuple[flask.Response, int]:
        # Must include this in your child.
        # if self.used:
        #     raise Exception('Can only use a RequestHandler object once.')
        # assert not self.used
        # if self.offline:
        #     msg = f'{self.selected_model} is not a valid model choice.'
        #     print(msg)
        #     return format_sillytavern_err(msg)
        raise NotImplementedError

    def handle_ratelimited(self, do_log: bool = True) -> Tuple[flask.Response, int]:
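The commented block in handle_request spells out what a subclass is expected to do before its own handling: enforce single use and reject requests when the constructor marked the handler offline. A minimal sketch of a hypothetical subclass following that guidance; the ExampleRequestHandler name, the RequestHandler import path, and the success-path response shape are placeholders, not code from this diff.

from typing import Tuple

import flask

from llm_server.routes.helpers.client import format_sillytavern_err
from llm_server.routes.request_handler import RequestHandler  # assumed module path


class ExampleRequestHandler(RequestHandler):
    def handle_request(self) -> Tuple[flask.Response, int]:
        # A RequestHandler object may only be used once.
        assert not self.used

        # The constructor sets self.offline when cluster_backend_info is
        # missing 'mode' or 'model'; reject such requests up front, following
        # the commented guidance above.
        if self.offline:
            msg = f'{self.selected_model} is not a valid model choice.'
            print(msg)
            return format_sillytavern_err(msg)

        # ... backend-specific request handling would go here ...
        return flask.jsonify({'results': [{'text': ''}]}), 200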