fix double logging

Cyberes 2023-09-28 01:34:15 -06:00
parent ecdf819088
commit a4a1d6cce6
7 changed files with 22 additions and 7 deletions


@@ -10,6 +10,15 @@ from llm_server.routes.cache import redis
def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
if isinstance(response, dict) and response.get('results'):
response = response['results'][0]['text']
+try:
+    j = json.loads(response)
+    if j.get('results'):
+        response = j['results'][0]['text']
+except:
+    pass
prompt_tokens = llm_server.llm.get_token_count(prompt)
if not is_error:
if not response_tokens:
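
The added fallback means log_prompt() now accepts either an already-parsed dict or a raw JSON string from the backend. A self-contained sketch of that extraction logic (extract_text is a hypothetical helper name, not part of the project):

import json

def extract_text(response):
    # Already-parsed Oobabooga-style payload: {'results': [{'text': ...}]}
    if isinstance(response, dict) and response.get('results'):
        return response['results'][0]['text']
    # Otherwise the backend may have handed us the same payload as a JSON string.
    try:
        j = json.loads(response)
        if isinstance(j, dict) and j.get('results'):
            return j['results'][0]['text']
    except (TypeError, ValueError):
        pass
    return response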


@@ -27,9 +27,10 @@ class OobaRequestHandler(RequestHandler):
_, backend_response = self.generate_response(llm_request)
return backend_response
-def handle_ratelimited(self):
+def handle_ratelimited(self, do_log: bool = True):
msg = f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.'
backend_response = self.handle_error(msg)
+if do_log:
log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response[0].data.decode('utf-8'), None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
return backend_response[0], 200 # We only return the response from handle_error(), not the error code
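
A self-contained sketch (hypothetical names) of why the flag exists: the streaming route already logs the rate-limit error itself, so without do_log=False the same request would be written to the log twice:

logged = []

def log_prompt(entry):
    logged.append(entry)

def handle_ratelimited(do_log: bool = True):
    msg = 'Ratelimited: too many simultaneous requests.'
    if do_log:
        log_prompt(msg)
    return msg, 429

# Blocking route: the handler logs the error once.
handle_ratelimited()

# Streaming route: it logs the error itself, so it suppresses the handler's log.
msg, _ = handle_ratelimited(do_log=False)
log_prompt(msg)

assert len(logged) == 2  # one entry per request instead of three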


@@ -69,7 +69,7 @@ class OpenAIRequestHandler(RequestHandler):
else:
return backend_response, backend_response_status_code
-def handle_ratelimited(self):
+def handle_ratelimited(self, do_log: bool = True):
# TODO: return a simulated OpenAI error message
# Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.
return 'Ratelimited', 429
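
For the TODO above, a sketch of what a simulated OpenAI-style rate-limit response could look like; the envelope and field values are an assumption modeled on commonly seen OpenAI error bodies, not the project's actual implementation:

from flask import jsonify

def simulated_openai_ratelimit_error():
    # Hypothetical error body; field names mirror the usual {'error': {...}} envelope.
    body = {
        'error': {
            'message': 'Rate limit reached. Please complete your other requests before sending another.',
            'type': 'rate_limit_error',
            'param': None,
            'code': None,
        }
    }
    return jsonify(body), 429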


@@ -55,7 +55,7 @@ class RedisPriorityQueue:
client_ip = item[0][1]
self.decrement_ip_count(client_ip, 'queued_ip_count')
return item
-time.sleep(0.5)  # wait for something to be added to the queue
+time.sleep(0.1)  # wait for something to be added to the queue
def increment_ip_count(self, ip, key):
self.redis.hincrby(key, ip, 1)
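
The tighter 0.1 s poll interval shortens the worst-case wait before a queued request is picked up. A self-contained sketch of the polling pattern, assuming a pickled sorted-set queue (the real class also tracks per-IP counts):

import pickle
import time

from redis import Redis

def blocking_dequeue(r: Redis, key: str = 'priority_queue', poll_interval: float = 0.1):
    # Pop the lowest-scored (highest-priority) item, polling until one appears.
    while True:
        popped = r.zpopmin(key)
        if popped:
            member, _score = popped[0]
            return pickle.loads(member)
        time.sleep(poll_interval)  # wait for something to be added to the queue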


@@ -204,7 +204,7 @@ class RequestHandler:
# raise Exception('Can only use a RequestHandler object once.')
raise NotImplementedError
-def handle_ratelimited(self) -> Tuple[flask.Response, int]:
+def handle_ratelimited(self, do_log: bool = True) -> Tuple[flask.Response, int]:
raise NotImplementedError
def handle_error(self, error_msg: str, error_type: str = 'error') -> Tuple[flask.Response, int]:


@@ -36,6 +36,7 @@ def stream(ws):
log_in_bg(quitting_err_msg, is_error=True)
def log_in_bg(generated_text_bg, elapsed_time_bg: Union[int, float] = None, is_error: bool = False, status_code: int = None):
def background_task_exception():
generated_tokens = tokenize(generated_text_bg)
log_prompt(handler.client_ip, handler.token, input_prompt, generated_text_bg, elapsed_time_bg, handler.parameters, r_headers, status_code, r_url, response_tokens=generated_tokens, is_error=is_error)
@@ -73,7 +74,7 @@ def stream(ws):
err_msg = None
if handler.is_client_ratelimited():
-r, _ = handler.handle_ratelimited()
+r, _ = handler.handle_ratelimited(do_log=False)
err_msg = r.json['results'][0]['text']
else:
request_valid, invalid_response = handler.validate_request(prompt=input_prompt)
@@ -164,6 +165,8 @@ def stream(ws):
'text': generated_text
}))
log_in_bg(generated_text, is_error=True, status_code=response_status_code)
ws.close()
return
finally:
# The worker incremented it, we'll decrement it.
decrement_ip_count(handler.client_ip, 'processing_ips')
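
A self-contained sketch of the background-logging idea behind log_in_bg: push the slow token counting and database write off the websocket's hot path (plain threading here is an assumption; the project may use a different task mechanism):

import threading

def run_log_in_background(log_fn, *args, **kwargs):
    # Hypothetical helper: fire-and-forget the logging call so the stream
    # isn't blocked by tokenization or the database write.
    def task():
        try:
            log_fn(*args, **kwargs)
        except Exception as exc:
            print(f'background logging failed: {exc}')
    threading.Thread(target=task, daemon=True).start()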


@@ -35,9 +35,11 @@ from llm_server.stream import init_socketio
# TODO: have VLLM reject a request if it already has n == concurrent_gens running
# TODO: add a way to cancel VLLM gens. Maybe use websockets?
# TODO: use coloredlogs
# TODO: need to update opts. for workers
# Lower priority
# TODO: the processing stat showed -1 and I had to restart the server
# TODO: estimated wait time needs to account for full concurrent_gens but the queue is less than concurrent_gens
# TODO: the estimated wait time lags behind the stats
# TODO: simulate OpenAI error messages regardless of endpoint
# TODO: send extra headers when ratelimited?
# TODO: make sure log_prompt() is used everywhere, including errors and invalid requests