From 9a1d41a9b73896190f9bed8e8716f4c386c72966 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Sun, 7 Jul 2024 15:05:35 -0600
Subject: [PATCH] get functional again

---
 config/config.yml.sample                     |  6 +++++
 llm_server/config/model.py                   |  1 +
 llm_server/config/scheme.py                  |  3 ++-
 llm_server/database/database.py              |  2 +-
 llm_server/globals.py                        |  1 -
 llm_server/llm/openai/oai_to_vllm.py         |  3 ++-
 llm_server/routes/ooba_request_handler.py    |  1 -
 llm_server/routes/openai/__init__.py         |  3 +--
 llm_server/routes/openai/chat_completions.py |  6 ++---
 llm_server/routes/openai/completions.py      |  4 ++--
 llm_server/routes/openai_request_handler.py  |  2 +-
 llm_server/routes/v1/generate_stream.py      |  4 ++--
 other/tests/config.sh                        |  5 +----
 other/tests/generate.sh                      |  2 +-
 other/tests/oai-chat-completion.sh           |  2 +-
 other/tests/oai-completion.sh                |  2 +-
 other/tests/stream.py                        | 23 +++++++++++++-------
 server.py                                    |  3 ++-
 18 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/config/config.yml.sample b/config/config.yml.sample
index 9da7506..80906f9 100644
--- a/config/config.yml.sample
+++ b/config/config.yml.sample
@@ -155,3 +155,9 @@ show_uptime: true
 
 # Display the total number of tokens generated.
 show_total_output_tokens: true
+
+# seconds
+backend_generate_request_timeout: 95
+
+# ms
+redis_stream_timeout: 25000
\ No newline at end of file
diff --git a/llm_server/config/model.py b/llm_server/config/model.py
index 3133e0b..b791d9a 100644
--- a/llm_server/config/model.py
+++ b/llm_server/config/model.py
@@ -73,3 +73,4 @@ class ConfigModel(BaseModel):
     manual_model_name: Union[str, None]
     backend_request_timeout: int
     backend_generate_request_timeout: int
+    redis_stream_timeout: int
diff --git a/llm_server/config/scheme.py b/llm_server/config/scheme.py
index 3468765..bfbd459 100644
--- a/llm_server/config/scheme.py
+++ b/llm_server/config/scheme.py
@@ -55,5 +55,6 @@ config_scheme = bison.Scheme(
     bison.Option('load_num_prompts', default=True, field_type=bool),
     bison.Option('manual_model_name', default=None, field_type=Union[str, None]),
     bison.Option('backend_request_timeout', default=30, field_type=int),
-    bison.Option('backend_generate_request_timeout', default=95, field_type=int)
+    bison.Option('backend_generate_request_timeout', default=95, field_type=int),
+    bison.Option('redis_stream_timeout', default=25000, field_type=int)
 )
diff --git a/llm_server/database/database.py b/llm_server/database/database.py
index 272b141..99ac89b 100644
--- a/llm_server/database/database.py
+++ b/llm_server/database/database.py
@@ -30,7 +30,7 @@ def do_db_log(ip: str, token: str, prompt: str, response: Union[str, None], gen_
         if not response_tokens:
             response_tokens = get_token_count(response, backend_url)
     else:
-        response_tokens = None
+        response_tokens = 0
 
     # Sometimes we may want to insert null into the DB, but
     # usually we want to insert a float.
diff --git a/llm_server/globals.py b/llm_server/globals.py
index 4c99563..819455e 100644
--- a/llm_server/globals.py
+++ b/llm_server/globals.py
@@ -5,6 +5,5 @@ DEFAULT_OPENAI_SYSTEM_PROMPT = ("You are an assistant chatbot. Your main functio
                                 "apologize and suggest the user seek help elsewhere.")
 OPENAI_FORMATTING_PROMPT = """Lines that start with "### ASSISTANT" were messages you sent previously.\nLines that start with "### USER" were messages sent by the user you are chatting with.\nYou will respond to the "### RESPONSE:" prompt as the assistant and follow the instructions given by the user."""
 
-REDIS_STREAM_TIMEOUT = 25000
 LOGGING_FORMAT = "%(asctime)s: %(levelname)s:%(name)s - %(message)s"
 BACKEND_OFFLINE = 'The model you requested is not a valid choice. Please retry your query.'
diff --git a/llm_server/llm/openai/oai_to_vllm.py b/llm_server/llm/openai/oai_to_vllm.py
index ed3ba51..dcaf75f 100644
--- a/llm_server/llm/openai/oai_to_vllm.py
+++ b/llm_server/llm/openai/oai_to_vllm.py
@@ -93,7 +93,8 @@ def return_invalid_model_err(requested_model: str):
     return_oai_invalid_request_error(msg)
 
 
-def return_oai_internal_server_error():
+def return_oai_internal_server_error(error: str):
+    _logger.error(f'OAI Error: {error}')
     return jsonify({
         "error": {
             "message": "Internal server error",
diff --git a/llm_server/routes/ooba_request_handler.py b/llm_server/routes/ooba_request_handler.py
index e848d90..5a76105 100644
--- a/llm_server/routes/ooba_request_handler.py
+++ b/llm_server/routes/ooba_request_handler.py
@@ -20,7 +20,6 @@ class OobaRequestHandler(RequestHandler):
     def handle_request(self, return_ok: bool = True):
         assert not self.used
         if self.offline:
-            # _logger.debug(f'This backend is offline.')
             return self.handle_error(llm_server.globals.BACKEND_OFFLINE)
 
         request_valid, invalid_response = self.validate_request()
diff --git a/llm_server/routes/openai/__init__.py b/llm_server/routes/openai/__init__.py
index 7fae26d..a000c42 100644
--- a/llm_server/routes/openai/__init__.py
+++ b/llm_server/routes/openai/__init__.py
@@ -27,8 +27,7 @@ def handle_error(e):
     "auth_subrequest_error"
 
     """
-    _logger.error(f'OAI returning error: {e}')
-    return return_oai_internal_server_error()
+    return return_oai_internal_server_error(e)
 
 
 from .models import openai_list_models
diff --git a/llm_server/routes/openai/chat_completions.py b/llm_server/routes/openai/chat_completions.py
index f675370..f35a518 100644
--- a/llm_server/routes/openai/chat_completions.py
+++ b/llm_server/routes/openai/chat_completions.py
@@ -32,7 +32,7 @@ def openai_chat_completions(model_name=None):
     else:
         handler = OpenAIRequestHandler(incoming_request=request, incoming_json=request_json_body, selected_model=model_name)
         if handler.offline:
-            return return_oai_internal_server_error()
+            return return_oai_internal_server_error(f'backend {handler.backend_url} is offline')
 
         if not request_json_body.get('stream'):
             try:
@@ -112,9 +112,9 @@ def openai_chat_completions(model_name=None):
         try:
             last_id = '0-0'
             while True:
-                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
+                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
                 if not stream_data:
-                    _logger.debug(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
+                    _logger.debug(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
                     yield 'data: [DONE]\n\n'
                 else:
                     for stream_index, item in stream_data[0][1]:
diff --git a/llm_server/routes/openai/completions.py b/llm_server/routes/openai/completions.py
index 04fcacd..1a521c0 100644
--- a/llm_server/routes/openai/completions.py
+++ b/llm_server/routes/openai/completions.py
@@ -157,9 +157,9 @@ def openai_completions(model_name=None):
         try:
             last_id = '0-0'
             while True:
-                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
+                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
                 if not stream_data:
-                    _logger.debug(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
+                    _logger.debug(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
                     yield 'data: [DONE]\n\n'
                 else:
                     for stream_index, item in stream_data[0][1]:
diff --git a/llm_server/routes/openai_request_handler.py b/llm_server/routes/openai_request_handler.py
index 062204e..ca97074 100644
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@@ -32,7 +32,7 @@ class OpenAIRequestHandler(RequestHandler):
     def handle_request(self) -> Tuple[flask.Response, int]:
         assert not self.used
         if self.offline:
-            return return_oai_internal_server_error()
+            return return_oai_internal_server_error(f'backend {self.backend_url} is offline.')
 
         disable_openai_handling = request.headers.get('Llm-Disable-Openai', False) == 'true' \
                                   and is_valid_api_key(parse_token(request.headers.get('Authorization', ''))) \
diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index ad1b50b..3ed2d51 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -144,9 +144,9 @@ def do_stream(ws, model_name):
         try:
             last_id = '0-0'  # The ID of the last entry we read.
             while True:
-                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
+                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
                 if not stream_data:
-                    _logger.error(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
+                    _logger.error(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
                     return
                 else:
                     for stream_index, item in stream_data[0][1]:
diff --git a/other/tests/config.sh b/other/tests/config.sh
index 64bea46..920438c 100644
--- a/other/tests/config.sh
+++ b/other/tests/config.sh
@@ -1,11 +1,8 @@
-HOST="proxy.chub-archive.evulid.cc"
+HOST="http://localhost:5000"
 
 AUTH_KEY="TEST_1df979f0-6df1-41bd-814a-e99b1680e727"
 
 PROXY_SERVERS=(
   "http://172.0.4.7:3128"
   "http://172.0.4.8:3128"
-  "http://172.0.4.10:3128"
-  "http://172.0.4.12:3128"
-  "http://172.0.4.13:3128"
 )
diff --git a/other/tests/generate.sh b/other/tests/generate.sh
index b1443c0..b88f2ea 100755
--- a/other/tests/generate.sh
+++ b/other/tests/generate.sh
@@ -49,7 +49,7 @@ while true; do
 EOF
 )
 
-  curl "https://$HOST/api/v1/generate" -m 100 -x "$our_proxy_server" \
+  curl "$HOST/api/v1/generate" -m 100 -x "$our_proxy_server" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_KEY" \
    -d "$DATA"
diff --git a/other/tests/oai-chat-completion.sh b/other/tests/oai-chat-completion.sh
index 5355a8a..e5c58af 100755
--- a/other/tests/oai-chat-completion.sh
+++ b/other/tests/oai-chat-completion.sh
@@ -43,7 +43,7 @@ while true; do
 EOF
 )
 
-  curl "https://$HOST/api/openai/v1/chat/completions" -m 100 -x "$our_proxy_server" \
+  curl "$HOST/api/openai/v1/chat/completions" -m 100 -x "$our_proxy_server" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_KEY" \
    -d "$DATA"
diff --git a/other/tests/oai-completion.sh b/other/tests/oai-completion.sh
index cc0f9f0..b95bcf5 100755
--- a/other/tests/oai-completion.sh
+++ b/other/tests/oai-completion.sh
@@ -43,7 +43,7 @@ while true; do
 EOF
 )
 
-  curl "https://$HOST/api/openai/v1/completions" -m 100 -x "$our_proxy_server" \
+  curl "$HOST/api/openai/v1/completions" -m 100 -x "$our_proxy_server" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_KEY" \
    -d "$DATA"
diff --git a/other/tests/stream.py b/other/tests/stream.py
index 75d403b..619325a 100755
--- a/other/tests/stream.py
+++ b/other/tests/stream.py
@@ -1,7 +1,7 @@
 import asyncio
 import json
-import sys
 import os
+import sys
 import time
 from pathlib import Path
 
@@ -70,7 +70,14 @@ async def run(context):
         'stopping_strings': []
     }
 
-    async with websockets.connect(f'wss://{config["HOST"]}/api/v1/stream', ping_interval=None) as websocket:
+    socket_type = 'ws://'
+    if config['HOST'].startswith('https://'):
+        socket_type = 'wss://'
+    config['HOST'] = config['HOST'].strip('http://')
+    config['HOST'] = config['HOST'].strip('https://')
+
+    print('Connecting to', f'{socket_type}{config["HOST"]}/api/v1/stream')
+    async with websockets.connect(f'{socket_type}{config["HOST"]}/api/v1/stream', ping_interval=None) as websocket:
         await websocket.send(json.dumps(request))
         yield context  # Remove this if you just want to see the reply
 
@@ -89,12 +96,12 @@ async def run(context):
 
 
 async def print_response_stream(prompt):
-    try:
-        async for response in run(prompt):
-            print(response, end='')
-            sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
-    except Exception as e:
-        print(e)
+    # try:
+    async for response in run(prompt):
+        print(response, end='')
+        sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
+    # except Exception as e:
+    #     print(e)
 
 
 if __name__ == '__main__':
diff --git a/server.py b/server.py
index e9b358e..70c7333 100644
--- a/server.py
+++ b/server.py
@@ -14,12 +14,13 @@ from llm_server.routes.v1 import bp
 from llm_server.routes.v1.generate_stats import generate_stats
 from llm_server.sock import init_wssocket
 
-# TODO: seperate queue item timeout for websockets (make longer, like 5 minutes)
+# TODO: detect blocking disconnect
 # TODO: return an `error: True`, error code, and error message rather than just a formatted message
 # TODO: what happens when all backends are offline? What about the "online" key in the stats page?
 # TODO: redis SCAN vs KEYS??
 # TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
+# TODO: insert pydantic object into database
 
 # Lower priority
 # TODO: if a backend is at its limit of concurrent requests, choose a different one