diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index 55fb6e4..6cd98c0 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -159,23 +159,10 @@ def do_stream(ws, model_name):
                     }))
                 except:
                     # The has client closed the stream.
-                    if request:
+                    if response:
                         # Cancel the backend?
-                        request.close()
-                    end_time = time.time()
-                    elapsed_time = end_time - start_time
-                    log_prompt(ip=handler.client_ip,
-                               token=handler.token,
-                               prompt=input_prompt,
-                               response=generated_text,
-                               gen_time=elapsed_time,
-                               parameters=handler.parameters,
-                               headers=r_headers,
-                               backend_response_code=response_status_code,
-                               request_url=r_url,
-                               backend_url=handler.backend_url,
-                               response_tokens=None
-                               )
+                        response.close()
+                    # used to log here
                     return
 
                 message_num += 1
@@ -184,22 +171,9 @@ def do_stream(ws, model_name):
                 # If there is no more data, break the loop
                 if not chunk:
                     break
-
-            end_time = time.time()
-            elapsed_time = end_time - start_time
-            log_prompt(ip=handler.client_ip,
-                       token=handler.token,
-                       prompt=input_prompt,
-                       response=generated_text,
-                       gen_time=elapsed_time,
-                       parameters=handler.parameters,
-                       headers=r_headers,
-                       backend_response_code=response_status_code,
-                       request_url=r_url,
-                       backend_url=handler.backend_url,
-                       response_tokens=None,
-                       is_error=not response
-                       )
+            if response:
+                response.close()
+            # used to log here
         except:
             traceback.print_exc()
             generated_text = generated_text + '\n\n' + handler.handle_error('Encountered error while streaming.', 'exception')[0].json['results'][0]['text']
@@ -208,33 +182,19 @@ def do_stream(ws, model_name):
                 'message_num': message_num,
                 'text': generated_text
             }))
-            if request:
-                request.close()
-            log_prompt(ip=handler.client_ip,
-                       token=handler.token,
-                       prompt=input_prompt,
-                       response=generated_text,
-                       gen_time=None,
-                       parameters=handler.parameters,
-                       headers=r_headers,
-                       backend_response_code=response_status_code,
-                       request_url=r_url,
-                       backend_url=handler.backend_url,
-                       response_tokens=None,
-                       is_error=True
-                       )
-            return
+            # used to log here
         finally:
             # The worker incremented it, we'll decrement it.
             decrement_ip_count(handler.client_ip, 'processing_ips')
             decr_active_workers(handler.selected_model, handler.backend_url)
-        try:
-            ws.send(json.dumps({
-                'event': 'stream_end',
-                'message_num': message_num
-            }))
-        except:
-            # The client closed the stream.
+            try:
+                ws.send(json.dumps({
+                    'event': 'stream_end',
+                    'message_num': message_num
+                }))
+            except:
+                # The client closed the stream.
+                pass
             end_time = time.time()
             elapsed_time = end_time - start_time
             log_prompt(ip=handler.client_ip,
@@ -246,8 +206,7 @@ def do_stream(ws, model_name):
                        headers=r_headers,
                        backend_response_code=response_status_code,
                        request_url=r_url,
-                       backend_url=handler.backend_url,
-                       response_tokens=None
+                       backend_url=handler.backend_url
                        )
     finally:
         try:
diff --git a/server.py b/server.py
index 0eb0f6c..040b8d9 100644
--- a/server.py
+++ b/server.py
@@ -24,7 +24,8 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.sock import init_socketio
 
-# TODO: make sure system tokens are excluded from 5/24 hr proompters
+
+# TODO: redis SCAN vs KEYS??
 # TODO: implement blind RRD controlled via header and only used when there is a queue on the primary backend(s)
 # TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
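
Note on the change above: the removed per-branch log_prompt calls (marked "# used to log here") leave the single log_prompt in the finally: block as the only logging path, so each stream is logged exactly once whether it completes, errors, or is abandoned by the client. A minimal sketch of that pattern follows; finish_stream and the stubbed log_prompt are hypothetical stand-ins for illustration, not helpers from this repo.

    import json
    import time

    def log_prompt(**kwargs):
        # Hypothetical stub; the real helper's keyword arguments are visible in the diff above.
        print('log_prompt:', kwargs)

    def finish_stream(ws, handler, response, generated_text, message_num, start_time):
        # Consolidated cleanup: close the backend response, signal the client,
        # then log exactly once -- success and failure paths both end up here.
        if response is not None:
            response.close()
        try:
            ws.send(json.dumps({'event': 'stream_end', 'message_num': message_num}))
        except Exception:
            # The client already closed the websocket; nothing more to send.
            pass
        log_prompt(ip=handler.client_ip,
                   token=handler.token,
                   response=generated_text,
                   gen_time=time.time() - start_time,
                   backend_url=handler.backend_url)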