diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index 2e2c188..7e97acc 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -104,10 +104,12 @@ def stream(ws):
             end_time = time.time()
             elapsed_time = end_time - start_time
 
+            r_headers = dict(request.headers)
+            r_url = request.url
             def background_task():
                 generated_tokens = tokenize(generated_text)
-                log_prompt(handler.client_ip, handler.token, input_prompt, generated_text, elapsed_time, handler.parameters, dict(request.headers), response_status_code, request.url, response_tokens=generated_tokens)
+                log_prompt(handler.client_ip, handler.token, input_prompt, generated_text, elapsed_time, handler.parameters, r_headers, response_status_code, r_url, response_tokens=generated_tokens)
 
             # TODO: use async/await instead of threads
             threading.Thread(target=background_task).start()
diff --git a/server.py b/server.py
index 27885c1..fab5727 100644
--- a/server.py
+++ b/server.py
@@ -20,7 +20,6 @@ from llm_server.routes.server_error import handle_server_error
 # TODO: add more excluding to SYSTEM__ tokens
 # TODO: make sure the OpenAI moderation endpoint scans the last n messages rather than only the last one (make that threaded)
 # TODO: support turbo-instruct on openai endpoint
-# TODO: show requested model (not actual LLM backend model) in OpenAI responses
 
 try:
     import vllm