fix double logging

parent ecdf819088
commit a4a1d6cce6
@@ -10,6 +10,15 @@ from llm_server.routes.cache import redis
 def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
+    if isinstance(response, dict) and response.get('results'):
+        response = response['results'][0]['text']
+    try:
+        j = json.loads(response)
+        if j.get('results'):
+            response = j['results'][0]['text']
+    except:
+        pass
+
     prompt_tokens = llm_server.llm.get_token_count(prompt)
     if not is_error:
         if not response_tokens:

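The added block normalizes response before token counting and storage: whether it arrives as a dict or as a JSON string shaped like {'results': [{'text': ...}]}, only the generated text is kept, so the log row holds the text rather than the whole wrapper. A standalone sketch of the same normalization (the helper name extract_response_text is mine, not from the repo):

import json

def extract_response_text(response):
    """Reduce a backend response to plain generated text for logging.
    Accepts a dict, a JSON string, or already-plain text."""
    if isinstance(response, dict) and response.get('results'):
        return response['results'][0]['text']
    try:
        j = json.loads(response)
        if isinstance(j, dict) and j.get('results'):
            return j['results'][0]['text']
    except (TypeError, ValueError):
        # Not JSON (or not a string at all) -- keep it as-is.
        pass
    return response

# extract_response_text('{"results": [{"text": "Hello"}]}')  -> 'Hello'
# extract_response_text({'results': [{'text': 'Hello'}]})    -> 'Hello'
# extract_response_text('plain text')                        -> 'plain text'
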
@@ -27,10 +27,11 @@ class OobaRequestHandler(RequestHandler):
         _, backend_response = self.generate_response(llm_request)
         return backend_response
 
-    def handle_ratelimited(self):
+    def handle_ratelimited(self, do_log: bool = True):
         msg = f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.'
         backend_response = self.handle_error(msg)
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response[0].data.decode('utf-8'), None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
+        if do_log:
+            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response[0].data.decode('utf-8'), None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
         return backend_response[0], 200 # We only return the response from handle_error(), not the error code
 
     def handle_error(self, error_msg: str, error_type: str = 'error') -> Tuple[flask.Response, int]:

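The double log came from two writers: handle_ratelimited() called log_prompt() itself, and the websocket route logged the same event again. The do_log flag keeps the default behaviour for plain HTTP callers while letting the streaming path opt out. A self-contained toy of the pattern (ToyHandler and the logged list stand in for the real handler and database):

logged = []   # stands in for the database that log_prompt() writes to

class ToyHandler:
    def handle_ratelimited(self, do_log: bool = True):
        msg = 'Ratelimited: too many simultaneous requests.'
        if do_log:
            logged.append(msg)      # default path: normal HTTP callers log here
        return {'results': [{'text': msg}]}, 200

# Streaming caller: it logs on its own, so it disables the handler-side log.
handler = ToyHandler()
r, _ = handler.handle_ratelimited(do_log=False)
logged.append(r['results'][0]['text'])       # logged exactly once, by the caller

assert logged == ['Ratelimited: too many simultaneous requests.']   # one entry, not two
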
@@ -69,7 +69,7 @@ class OpenAIRequestHandler(RequestHandler):
         else:
             return backend_response, backend_response_status_code
 
-    def handle_ratelimited(self):
+    def handle_ratelimited(self, do_log: bool = True):
         # TODO: return a simulated OpenAI error message
         # Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.
         return 'Ratelimited', 429

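The TODO notes that this handle_ratelimited() should eventually return a simulated OpenAI error object rather than the bare string 'Ratelimited'. OpenAI-compatible clients generally expect a JSON body with an error field; a hedged sketch of what that could look like (the helper and the field values are illustrative, not from this commit):

import flask

app = flask.Flask(__name__)

def simulated_openai_ratelimit_error():
    """Return a 429 response shaped like an OpenAI API error object."""
    body = {
        'error': {
            'message': 'You are sending requests too quickly. Please complete your other requests before sending another.',
            'type': 'rate_limit_error',
            'param': None,
            'code': None,
        }
    }
    return flask.jsonify(body), 429

# jsonify() needs an application/request context, e.g.:
# with app.app_context():
#     resp, status = simulated_openai_ratelimit_error()
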
@@ -55,7 +55,7 @@ class RedisPriorityQueue:
                 client_ip = item[0][1]
                 self.decrement_ip_count(client_ip, 'queued_ip_count')
                 return item
-            time.sleep(0.5) # wait for something to be added to the queue
+            time.sleep(0.1) # wait for something to be added to the queue
 
     def increment_ip_count(self, ip, key):
         self.redis.hincrby(key, ip, 1)

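Dropping the poll interval from 0.5 s to 0.1 s cuts the worst-case delay between a slot freeing up and a queued request being dequeued, at the price of more Redis round-trips while idle. If the queue is a plain sorted set, a server-side blocking pop could replace the polling loop entirely; a sketch under that assumption, using redis-py's bzpopmin (the key name and function are illustrative, not from the repo):

import redis

r = redis.Redis()

def get_next_item(queue_key: str = 'priority_queue', timeout: int = 5):
    """Block until an item is available instead of sleep-polling.
    Returns (member, score) or None if the timeout expires."""
    popped = r.bzpopmin(queue_key, timeout=timeout)  # blocks on the Redis server side
    if popped is None:
        return None
    _key, member, score = popped
    return member, score
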
@@ -204,7 +204,7 @@ class RequestHandler:
         # raise Exception('Can only use a RequestHandler object once.')
         raise NotImplementedError
 
-    def handle_ratelimited(self) -> Tuple[flask.Response, int]:
+    def handle_ratelimited(self, do_log: bool = True) -> Tuple[flask.Response, int]:
         raise NotImplementedError
 
     def handle_error(self, error_msg: str, error_type: str = 'error') -> Tuple[flask.Response, int]:

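The base RequestHandler only declares handle_ratelimited() and raises NotImplementedError, so its signature is updated here to stay in sync with the Ooba and OpenAI overrides and with any caller typed against the base class. The same contract could also be enforced with abc, which fails at instantiation instead of at call time; a small sketch of that alternative (not how this repo does it):

import abc
from typing import Tuple
import flask

class AbstractRequestHandler(abc.ABC):
    @abc.abstractmethod
    def handle_ratelimited(self, do_log: bool = True) -> Tuple[flask.Response, int]:
        """Subclasses must build the 429 response and decide whether to log it."""
        ...

# A subclass that forgets to override handle_ratelimited() raises TypeError
# as soon as it is instantiated, rather than NotImplementedError when called.
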
@@ -36,6 +36,7 @@ def stream(ws):
+        log_in_bg(quitting_err_msg, is_error=True)
 
     def log_in_bg(generated_text_bg, elapsed_time_bg: Union[int, float] = None, is_error: bool = False, status_code: int = None):
 
         def background_task_exception():
             generated_tokens = tokenize(generated_text_bg)
             log_prompt(handler.client_ip, handler.token, input_prompt, generated_text_bg, elapsed_time_bg, handler.parameters, r_headers, status_code, r_url, response_tokens=generated_tokens, is_error=is_error)

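log_in_bg() wraps log_prompt() in a background task so the websocket handler never blocks on the database write. A minimal standard-library sketch of the same idea (write_log stands in for the real log_prompt; the repo may well use a different task mechanism):

import threading

def log_in_background(write_log, *args, **kwargs):
    """Run a logging callable on a daemon thread so the caller never waits on it."""
    def _task():
        try:
            write_log(*args, **kwargs)
        except Exception as e:
            # A failed log write should never take the request path down with it.
            print(f'background log failed: {e}')
    threading.Thread(target=_task, daemon=True).start()

# Usage:
# log_in_background(log_prompt, client_ip, token, prompt, generated_text,
#                   elapsed, params, headers, 200, url, is_error=False)
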
@@ -73,7 +74,7 @@ def stream(ws):
 
         err_msg = None
         if handler.is_client_ratelimited():
-            r, _ = handler.handle_ratelimited()
+            r, _ = handler.handle_ratelimited(do_log=False)
             err_msg = r.json['results'][0]['text']
         else:
             request_valid, invalid_response = handler.validate_request(prompt=input_prompt)

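This is the caller side of the fix: the streaming route builds the 429 response without logging it, pulls the error text back out of the response's JSON body via r.json, and records the event once itself through log_in_bg(). Reading the text back works because the handler returns a flask.Response carrying a JSON body in the ooba results shape; a minimal, self-contained demonstration of that round trip (the app and payload here are illustrative):

import flask

app = flask.Flask(__name__)

with app.app_context():
    r = flask.jsonify({'results': [{'text': 'Ratelimited: too many simultaneous requests.'}]})
    # Same access pattern as the diff: the error text comes straight out of the body.
    assert r.json['results'][0]['text'].startswith('Ratelimited')
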
@@ -164,6 +165,8 @@ def stream(ws):
                     'text': generated_text
                 }))
+                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
                 ws.close()
                 return
     finally:
         # The worker incremented it, we'll decrement it.
         decrement_ip_count(handler.client_ip, 'processing_ips')

@@ -35,9 +35,11 @@ from llm_server.stream import init_socketio
 # TODO: have VLLM reject a request if it already has n == concurrent_gens running
 # TODO: add a way to cancel VLLM gens. Maybe use websockets?
 # TODO: use coloredlogs
 # TODO: need to update opts. for workers
 
 # Lower priority
 # TODO: the processing stat showed -1 and I had to restart the server
 # TODO: estiamted wait time needs to account for full concurrent_gens but the queue is less than concurrent_gens
 # TODO: the estiamted wait time lags behind the stats
 # TODO: simulate OpenAI error messages regardless of endpoint
+# TODO: send extra headers when ratelimited?
+# TODO: make sure log_prompt() is used everywhere, including errors and invalid requests

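The new "send extra headers when ratelimited?" TODO points at the usual way of telling clients how long to back off. A hedged sketch of what the 429 path could attach, using the standard Retry-After header plus the common but non-standard X-RateLimit-* family (the helper name and values are illustrative only):

import flask

def attach_ratelimit_headers(resp: flask.Response, retry_after_s: int = 30, limit: int = 3, remaining: int = 0) -> flask.Response:
    """Annotate a 429 response so well-behaved clients know when to retry."""
    resp.headers['Retry-After'] = str(retry_after_s)        # standard HTTP header
    resp.headers['X-RateLimit-Limit'] = str(limit)          # e.g. simultaneous_requests_per_ip
    resp.headers['X-RateLimit-Remaining'] = str(remaining)
    return resp

# Usage (inside a handler that already built the 429 response):
#   return attach_ratelimit_headers(backend_response[0]), 429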