fix double logging

parent ecdf819088
commit a4a1d6cce6

@@ -10,6 +10,15 @@ from llm_server.routes.cache import redis
 def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, request_url, response_tokens: int = None, is_error: bool = False):
+    if isinstance(response, dict) and response.get('results'):
+        response = response['results'][0]['text']
+    try:
+        j = json.loads(response)
+        if j.get('results'):
+            response = j['results'][0]['text']
+    except:
+        pass
+
     prompt_tokens = llm_server.llm.get_token_count(prompt)
     if not is_error:
         if not response_tokens:
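The new lines at the top of log_prompt() normalize whatever the caller hands in, so the database stores only the generated text. A minimal standalone sketch of that normalization, with an illustrative helper name (_normalize_text is not in the codebase):

import json

def _normalize_text(response):
    # Illustrative helper, not part of the repo: mirrors the unwrapping added
    # to log_prompt(). A dict of the form {'results': [{'text': ...}]} or a
    # JSON string of the same shape is reduced to the bare generated text.
    if isinstance(response, dict) and response.get('results'):
        return response['results'][0]['text']
    try:
        j = json.loads(response)
        if j.get('results'):
            return j['results'][0]['text']
    except (AttributeError, TypeError, ValueError):
        # Not a JSON object with results: fall through and keep the raw value.
        pass
    return response

print(_normalize_text({'results': [{'text': 'hi'}]}))        # hi
print(_normalize_text('{"results": [{"text": "hi"}]}'))      # hi
print(_normalize_text('plain backend error string'))         # plain backend error string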
@@ -27,10 +27,11 @@ class OobaRequestHandler(RequestHandler):
         _, backend_response = self.generate_response(llm_request)
         return backend_response
 
-    def handle_ratelimited(self):
+    def handle_ratelimited(self, do_log: bool = True):
         msg = f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.'
         backend_response = self.handle_error(msg)
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response[0].data.decode('utf-8'), None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
+        if do_log:
+            log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response[0].data.decode('utf-8'), None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
         return backend_response[0], 200 # We only return the response from handle_error(), not the error code
 
     def handle_error(self, error_msg: str, error_type: str = 'error') -> Tuple[flask.Response, int]:
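The do_log flag exists because the websocket route (see the stream(ws) hunk further down) already logs the ratelimit error through log_in_bg(), so calling log_prompt() inside handle_ratelimited() as well wrote two rows per rejected request. A toy sketch of the pattern, using stub names rather than the real classes:

logged = []

def log_prompt_stub(msg):
    # Stand-in for log_prompt(...): just records that a log row would be written.
    logged.append(msg)

class HandlerStub:
    # Stub, not the real OobaRequestHandler: shows why handle_ratelimited() grew a do_log flag.
    def handle_ratelimited(self, do_log: bool = True):
        msg = 'Ratelimited'
        if do_log:                          # plain HTTP path: the handler logs the 429 itself
            log_prompt_stub(msg)
        return msg, 200

handler = HandlerStub()

# HTTP request path: one log entry, written by the handler.
handler.handle_ratelimited()

# Websocket path: the stream code logs the error itself, so it suppresses the handler's log.
msg, _ = handler.handle_ratelimited(do_log=False)
log_prompt_stub(msg)

assert len(logged) == 2                     # one row per rejected request, no duplicates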
@@ -69,7 +69,7 @@ class OpenAIRequestHandler(RequestHandler):
         else:
             return backend_response, backend_response_status_code
 
-    def handle_ratelimited(self):
+    def handle_ratelimited(self, do_log: bool = True):
         # TODO: return a simulated OpenAI error message
         # Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.
         return 'Ratelimited', 429
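The TODO above is about replacing the bare 'Ratelimited' string with something shaped like an OpenAI API error, so OpenAI client libraries can surface it cleanly. That payload is not part of this commit; the sketch below only approximates the familiar error envelope, and every field value is a placeholder:

import json

def simulated_openai_ratelimit_error():
    # Approximation of an OpenAI-style error body; the values here are
    # placeholders, not what this project actually returns.
    body = {
        'error': {
            'message': 'You are only allowed a limited number of simultaneous requests. '
                       'Please complete your other requests before sending another.',
            'type': 'rate_limit_error',
            'param': None,
            'code': None,
        }
    }
    return json.dumps(body), 429

print(simulated_openai_ratelimit_error())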
@@ -55,7 +55,7 @@ class RedisPriorityQueue:
                 client_ip = item[0][1]
                 self.decrement_ip_count(client_ip, 'queued_ip_count')
                 return item
-            time.sleep(0.5) # wait for something to be added to the queue
+            time.sleep(0.1) # wait for something to be added to the queue
 
     def increment_ip_count(self, ip, key):
         self.redis.hincrby(key, ip, 1)
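The only change here is the polling interval: the queue is drained by polling rather than blocking, so the sleep bounds how stale a dequeue can be. A rough sketch of that loop shape (not the real RedisPriorityQueue.get()):

import time

def poll_queue(pop_item, timeout: float = 5.0):
    # Rough sketch of a poll-based dequeue: with a 0.1s sleep the worst-case
    # gap between enqueue and dequeue is about 100ms instead of about 500ms.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        item = pop_item()          # stand-in for the Redis pop in the real class
        if item is not None:
            return item
        time.sleep(0.1)            # wait for something to be added to the queue
    return None

# Usage with an in-memory stand-in for Redis:
backlog = ['request-1']
print(poll_queue(lambda: backlog.pop() if backlog else None))   # request-1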
@@ -204,7 +204,7 @@ class RequestHandler:
         # raise Exception('Can only use a RequestHandler object once.')
         raise NotImplementedError
 
-    def handle_ratelimited(self) -> Tuple[flask.Response, int]:
+    def handle_ratelimited(self, do_log: bool = True) -> Tuple[flask.Response, int]:
         raise NotImplementedError
 
     def handle_error(self, error_msg: str, error_type: str = 'error') -> Tuple[flask.Response, int]:
@@ -36,6 +36,7 @@ def stream(ws):
             log_in_bg(quitting_err_msg, is_error=True)
 
     def log_in_bg(generated_text_bg, elapsed_time_bg: Union[int, float] = None, is_error: bool = False, status_code: int = None):
         def background_task_exception():
             generated_tokens = tokenize(generated_text_bg)
             log_prompt(handler.client_ip, handler.token, input_prompt, generated_text_bg, elapsed_time_bg, handler.parameters, r_headers, status_code, r_url, response_tokens=generated_tokens, is_error=is_error)
@@ -73,7 +74,7 @@ def stream(ws):
 
         err_msg = None
         if handler.is_client_ratelimited():
-            r, _ = handler.handle_ratelimited()
+            r, _ = handler.handle_ratelimited(do_log=False)
             err_msg = r.json['results'][0]['text']
         else:
             request_valid, invalid_response = handler.validate_request(prompt=input_prompt)
@@ -164,6 +165,8 @@ def stream(ws):
                         'text': generated_text
                     }))
                     log_in_bg(generated_text, is_error=True, status_code=response_status_code)
+                    ws.close()
+                    return
         finally:
             # The worker incremented it, we'll decrement it.
             decrement_ip_count(handler.client_ip, 'processing_ips')
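The two added lines end the websocket handler as soon as the error frame has been sent; the finally: block still runs, so the processing counter the worker incremented is released. A toy demonstration of that control flow:

def stream_stub(counters):
    # Toy stand-in for the stream(ws) error path: returning from inside the try
    # block still executes finally, so processing_ips is decremented even when
    # the handler bails out right after sending the error frame.
    counters['processing_ips'] += 1        # done by the worker in the real code
    try:
        # ... error frame sent and the socket closed here ...
        return 'closed early'
    finally:
        counters['processing_ips'] -= 1    # "The worker incremented it, we'll decrement it."

counters = {'processing_ips': 0}
print(stream_stub(counters))   # closed early
print(counters)                # {'processing_ips': 0}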
@@ -35,9 +35,11 @@ from llm_server.stream import init_socketio
 # TODO: have VLLM reject a request if it already has n == concurrent_gens running
 # TODO: add a way to cancel VLLM gens. Maybe use websockets?
 # TODO: use coloredlogs
+# TODO: need to update opts. for workers
 
 # Lower priority
-# TODO: the processing stat showed -1 and I had to restart the server
+# TODO: estiamted wait time needs to account for full concurrent_gens but the queue is less than concurrent_gens
+# TODO: the estiamted wait time lags behind the stats
 # TODO: simulate OpenAI error messages regardless of endpoint
 # TODO: send extra headers when ratelimited?
 # TODO: make sure log_prompt() is used everywhere, including errors and invalid requests