From 9a1d41a9b73896190f9bed8e8716f4c386c72966 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Sun, 7 Jul 2024 15:05:35 -0600
Subject: [PATCH] get functional again

---
 config/config.yml.sample                     |  6 +++++
 llm_server/config/model.py                   |  1 +
 llm_server/config/scheme.py                  |  3 ++-
 llm_server/database/database.py              |  2 +-
 llm_server/globals.py                        |  1 -
 llm_server/llm/openai/oai_to_vllm.py         |  3 ++-
 llm_server/routes/ooba_request_handler.py    |  1 -
 llm_server/routes/openai/__init__.py         |  3 +--
 llm_server/routes/openai/chat_completions.py |  6 ++---
 llm_server/routes/openai/completions.py      |  4 ++--
 llm_server/routes/openai_request_handler.py  |  2 +-
 llm_server/routes/v1/generate_stream.py      |  4 ++--
 other/tests/config.sh                        |  5 +----
 other/tests/generate.sh                      |  2 +-
 other/tests/oai-chat-completion.sh           |  2 +-
 other/tests/oai-completion.sh                |  2 +-
 other/tests/stream.py                        | 23 +++++++++++++-------
 server.py                                    |  3 ++-
 18 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/config/config.yml.sample b/config/config.yml.sample
index 9da7506..80906f9 100644
--- a/config/config.yml.sample
+++ b/config/config.yml.sample
@@ -155,3 +155,9 @@ show_uptime: true
 
 # Display the total number of tokens generated.
 show_total_output_tokens: true
+
+# seconds
+backend_generate_request_timeout: 95
+
+# ms
+redis_stream_timeout: 25000
\ No newline at end of file
diff --git a/llm_server/config/model.py b/llm_server/config/model.py
index 3133e0b..b791d9a 100644
--- a/llm_server/config/model.py
+++ b/llm_server/config/model.py
@@ -73,3 +73,4 @@ class ConfigModel(BaseModel):
     manual_model_name: Union[str, None]
     backend_request_timeout: int
     backend_generate_request_timeout: int
+    redis_stream_timeout: int
diff --git a/llm_server/config/scheme.py b/llm_server/config/scheme.py
index 3468765..bfbd459 100644
--- a/llm_server/config/scheme.py
+++ b/llm_server/config/scheme.py
@@ -55,5 +55,6 @@ config_scheme = bison.Scheme(
     bison.Option('load_num_prompts', default=True, field_type=bool),
     bison.Option('manual_model_name', default=None, field_type=Union[str, None]),
     bison.Option('backend_request_timeout', default=30, field_type=int),
-    bison.Option('backend_generate_request_timeout', default=95, field_type=int)
+    bison.Option('backend_generate_request_timeout', default=95, field_type=int),
+    bison.Option('redis_stream_timeout', default=25000, field_type=int)
 )
diff --git a/llm_server/database/database.py b/llm_server/database/database.py
index 272b141..99ac89b 100644
--- a/llm_server/database/database.py
+++ b/llm_server/database/database.py
@@ -30,7 +30,7 @@ def do_db_log(ip: str, token: str, prompt: str, response: Union[str, None], gen_
         if not response_tokens:
             response_tokens = get_token_count(response, backend_url)
     else:
-        response_tokens = None
+        response_tokens = 0
 
     # Sometimes we may want to insert null into the DB, but
     # usually we want to insert a float.
diff --git a/llm_server/globals.py b/llm_server/globals.py
index 4c99563..819455e 100644
--- a/llm_server/globals.py
+++ b/llm_server/globals.py
@@ -5,6 +5,5 @@ DEFAULT_OPENAI_SYSTEM_PROMPT = ("You are an assistant chatbot. Your main functio
                                 "apologize and suggest the user seek help elsewhere.")
 OPENAI_FORMATTING_PROMPT = """Lines that start with "### ASSISTANT" were messages you sent previously.\nLines that start with "### USER" were messages sent by the user you are chatting with.\nYou will respond to the "### RESPONSE:" prompt as the assistant and follow the instructions given by the user."""
 
-REDIS_STREAM_TIMEOUT = 25000
 LOGGING_FORMAT = "%(asctime)s: %(levelname)s:%(name)s - %(message)s"
 BACKEND_OFFLINE = 'The model you requested is not a valid choice. Please retry your query.'
diff --git a/llm_server/llm/openai/oai_to_vllm.py b/llm_server/llm/openai/oai_to_vllm.py
index ed3ba51..dcaf75f 100644
--- a/llm_server/llm/openai/oai_to_vllm.py
+++ b/llm_server/llm/openai/oai_to_vllm.py
@@ -93,7 +93,8 @@ def return_invalid_model_err(requested_model: str):
     return_oai_invalid_request_error(msg)
 
 
-def return_oai_internal_server_error():
+def return_oai_internal_server_error(error: str):
+    _logger.error(f'OAI Error: {error}')
     return jsonify({
         "error": {
             "message": "Internal server error",
diff --git a/llm_server/routes/ooba_request_handler.py b/llm_server/routes/ooba_request_handler.py
index e848d90..5a76105 100644
--- a/llm_server/routes/ooba_request_handler.py
+++ b/llm_server/routes/ooba_request_handler.py
@@ -20,7 +20,6 @@ class OobaRequestHandler(RequestHandler):
     def handle_request(self, return_ok: bool = True):
         assert not self.used
         if self.offline:
-            # _logger.debug(f'This backend is offline.')
             return self.handle_error(llm_server.globals.BACKEND_OFFLINE)
 
         request_valid, invalid_response = self.validate_request()
diff --git a/llm_server/routes/openai/__init__.py b/llm_server/routes/openai/__init__.py
index 7fae26d..a000c42 100644
--- a/llm_server/routes/openai/__init__.py
+++ b/llm_server/routes/openai/__init__.py
@@ -27,8 +27,7 @@ def handle_error(e):
     "auth_subrequest_error"
 
     """
-    _logger.error(f'OAI returning error: {e}')
-    return return_oai_internal_server_error()
+    return return_oai_internal_server_error(e)
 
 
 from .models import openai_list_models
diff --git a/llm_server/routes/openai/chat_completions.py b/llm_server/routes/openai/chat_completions.py
index f675370..f35a518 100644
--- a/llm_server/routes/openai/chat_completions.py
+++ b/llm_server/routes/openai/chat_completions.py
@@ -32,7 +32,7 @@ def openai_chat_completions(model_name=None):
     else:
         handler = OpenAIRequestHandler(incoming_request=request, incoming_json=request_json_body, selected_model=model_name)
         if handler.offline:
-            return return_oai_internal_server_error()
+            return return_oai_internal_server_error(f'backend {handler.backend_url} is offline')
 
         if not request_json_body.get('stream'):
             try:
@@ -112,9 +112,9 @@ def openai_chat_completions(model_name=None):
         try:
             last_id = '0-0'
             while True:
-                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
+                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
                 if not stream_data:
-                    _logger.debug(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
+                    _logger.debug(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
                     yield 'data: [DONE]\n\n'
                 else:
                     for stream_index, item in stream_data[0][1]:
diff --git a/llm_server/routes/openai/completions.py b/llm_server/routes/openai/completions.py
index 04fcacd..1a521c0 100644
--- a/llm_server/routes/openai/completions.py
+++ b/llm_server/routes/openai/completions.py
@@ -157,9 +157,9 @@ def openai_completions(model_name=None):
         try:
             last_id = '0-0'
             while True:
-                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
+                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
                 if not stream_data:
-                    _logger.debug(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
+                    _logger.debug(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
                     yield 'data: [DONE]\n\n'
                 else:
                     for stream_index, item in stream_data[0][1]:
diff --git a/llm_server/routes/openai_request_handler.py b/llm_server/routes/openai_request_handler.py
index 062204e..ca97074 100644
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@@ -32,7 +32,7 @@ class OpenAIRequestHandler(RequestHandler):
     def handle_request(self) -> Tuple[flask.Response, int]:
         assert not self.used
         if self.offline:
-            return return_oai_internal_server_error()
+            return return_oai_internal_server_error(f'backend {self.backend_url} is offline.')
 
         disable_openai_handling = request.headers.get('Llm-Disable-Openai', False) == 'true' \
                                   and is_valid_api_key(parse_token(request.headers.get('Authorization', ''))) \
diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index ad1b50b..3ed2d51 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -144,9 +144,9 @@ def do_stream(ws, model_name):
         try:
             last_id = '0-0'  # The ID of the last entry we read.
             while True:
-                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().REDIS_STREAM_TIMEOUT)
+                stream_data = stream_redis.xread({stream_name: last_id}, block=GlobalConfig.get().redis_stream_timeout)
                 if not stream_data:
-                    _logger.error(f"No message received in {GlobalConfig.get().REDIS_STREAM_TIMEOUT / 1000} seconds, closing stream.")
+                    _logger.error(f"No message received in {GlobalConfig.get().redis_stream_timeout / 1000} seconds, closing stream.")
                     return
                 else:
                     for stream_index, item in stream_data[0][1]:
diff --git a/other/tests/config.sh b/other/tests/config.sh
index 64bea46..920438c 100644
--- a/other/tests/config.sh
+++ b/other/tests/config.sh
@@ -1,11 +1,8 @@
-HOST="proxy.chub-archive.evulid.cc"
+HOST="http://localhost:5000"
 
 AUTH_KEY="TEST_1df979f0-6df1-41bd-814a-e99b1680e727"
 
 PROXY_SERVERS=(
   "http://172.0.4.7:3128"
   "http://172.0.4.8:3128"
-  "http://172.0.4.10:3128"
-  "http://172.0.4.12:3128"
-  "http://172.0.4.13:3128"
 )
diff --git a/other/tests/generate.sh b/other/tests/generate.sh
index b1443c0..b88f2ea 100755
--- a/other/tests/generate.sh
+++ b/other/tests/generate.sh
@@ -49,7 +49,7 @@ while true; do
 EOF
 )
 
-  curl "https://$HOST/api/v1/generate" -m 100 -x "$our_proxy_server" \
+  curl "$HOST/api/v1/generate" -m 100 -x "$our_proxy_server" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_KEY" \
    -d "$DATA"
diff --git a/other/tests/oai-chat-completion.sh b/other/tests/oai-chat-completion.sh
index 5355a8a..e5c58af 100755
--- a/other/tests/oai-chat-completion.sh
+++ b/other/tests/oai-chat-completion.sh
@@ -43,7 +43,7 @@ while true; do
 EOF
 )
 
-  curl "https://$HOST/api/openai/v1/chat/completions" -m 100 -x "$our_proxy_server" \
+  curl "$HOST/api/openai/v1/chat/completions" -m 100 -x "$our_proxy_server" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_KEY" \
    -d "$DATA"
diff --git a/other/tests/oai-completion.sh b/other/tests/oai-completion.sh
index cc0f9f0..b95bcf5 100755
--- a/other/tests/oai-completion.sh
+++ b/other/tests/oai-completion.sh
@@ -43,7 +43,7 @@ while true; do
 EOF
 )
 
-  curl "https://$HOST/api/openai/v1/completions" -m 100 -x "$our_proxy_server" \
+  curl "$HOST/api/openai/v1/completions" -m 100 -x "$our_proxy_server" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_KEY" \
    -d "$DATA"
diff --git a/other/tests/stream.py b/other/tests/stream.py
index 75d403b..619325a 100755
--- a/other/tests/stream.py
+++ b/other/tests/stream.py
@@ -1,7 +1,7 @@
 import asyncio
 import json
-import sys
 import os
+import sys
 import time
 from pathlib import Path
 
@@ -70,7 +70,14 @@ async def run(context):
         'stopping_strings': []
     }
 
-    async with websockets.connect(f'wss://{config["HOST"]}/api/v1/stream', ping_interval=None) as websocket:
+    socket_type = 'ws://'
+    if config['HOST'].startswith('https://'):
+        socket_type = 'wss://'
+    config['HOST'] = config['HOST'].strip('http://')
+    config['HOST'] = config['HOST'].strip('https://')
+
+    print('Connecting to', f'{socket_type}{config["HOST"]}/api/v1/stream')
+    async with websockets.connect(f'{socket_type}{config["HOST"]}/api/v1/stream', ping_interval=None) as websocket:
         await websocket.send(json.dumps(request))
         yield context  # Remove this if you just want to see the reply
 
@@ -89,12 +96,12 @@ async def run(context):
 
 
 async def print_response_stream(prompt):
-    try:
-        async for response in run(prompt):
-            print(response, end='')
-            sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
-    except Exception as e:
-        print(e)
+    # try:
+    async for response in run(prompt):
+        print(response, end='')
+        sys.stdout.flush()  # If we don't flush, we won't see tokens in realtime.
+    # except Exception as e:
+    #     print(e)
 
 
 if __name__ == '__main__':
diff --git a/server.py b/server.py
index e9b358e..70c7333 100644
--- a/server.py
+++ b/server.py
@@ -14,12 +14,13 @@ from llm_server.routes.v1 import bp
 from llm_server.routes.v1.generate_stats import generate_stats
 from llm_server.sock import init_wssocket
 
-# TODO: seperate queue item timeout for websockets (make longer, like 5 minutes)
+# TODO: detect blocking disconnect
 # TODO: return an `error: True`, error code, and error message rather than just a formatted message
 # TODO: what happens when all backends are offline? What about the "online" key in the stats page?
 # TODO: redis SCAN vs KEYS??
 # TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
+# TODO: insert pydantic object into database
 
 # Lower priority
 # TODO: if a backend is at its limit of concurrent requests, choose a different one