From 11e84db59c8f7117af7e70a1dfd6bbcc46e08e95 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Mon, 25 Sep 2023 22:32:48 -0600
Subject: [PATCH] update database, tokenizer handle null prompt, convert top_p
 to vllm on openai, actually validate prompt on streaming,

---
 llm_server/database/create.py               | 8 ++++----
 llm_server/llm/vllm/tokenize.py             | 3 +++
 llm_server/routes/ooba_request_handler.py   | 3 +--
 llm_server/routes/openai_request_handler.py | 3 +++
 llm_server/routes/v1/generate_stream.py     | 7 ++++---
 server.py                                   | 8 +++++---
 6 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/llm_server/database/create.py b/llm_server/database/create.py
index 99ca91d..08ad8a2 100644
--- a/llm_server/database/create.py
+++ b/llm_server/database/create.py
@@ -13,15 +13,15 @@ def create_db():
         backend_url TEXT,
         request_url TEXT,
         generation_time FLOAT,
-        prompt TEXT,
+        prompt LONGTEXT,
         prompt_tokens INTEGER,
-        response TEXT,
+        response LONGTEXT,
         response_tokens INTEGER,
         response_status INTEGER,
         parameters TEXT,
-        CHECK (parameters IS NULL OR JSON_VALID(parameters)),
+        # CHECK (parameters IS NULL OR JSON_VALID(parameters)),
         headers TEXT,
-        CHECK (headers IS NULL OR JSON_VALID(headers)),
+        # CHECK (headers IS NULL OR JSON_VALID(headers)),
         timestamp INTEGER
     )
     ''')
diff --git a/llm_server/llm/vllm/tokenize.py b/llm_server/llm/vllm/tokenize.py
index 8d7bffc..3476dcb 100644
--- a/llm_server/llm/vllm/tokenize.py
+++ b/llm_server/llm/vllm/tokenize.py
@@ -9,6 +9,9 @@ tokenizer = tiktoken.get_encoding("cl100k_base")
 
 
 def tokenize(prompt: str) -> int:
+    if not prompt:
+        # The tokenizers have issues when the prompt is None.
+        return 0
     try:
         r = requests.post(f'{opts.backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
         j = r.json()
diff --git a/llm_server/routes/ooba_request_handler.py b/llm_server/routes/ooba_request_handler.py
index 3f021b5..b04d473 100644
--- a/llm_server/routes/ooba_request_handler.py
+++ b/llm_server/routes/ooba_request_handler.py
@@ -14,8 +14,7 @@ class OobaRequestHandler(RequestHandler):
         super().__init__(*args, **kwargs)
 
     def handle_request(self):
-        if self.used:
-            raise Exception('Can only use a RequestHandler object once.')
+        assert not self.used
 
         request_valid, invalid_response = self.validate_request()
         if not request_valid:
diff --git a/llm_server/routes/openai_request_handler.py b/llm_server/routes/openai_request_handler.py
index 88c7b07..68c02eb 100644
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@@ -70,6 +70,9 @@ class OpenAIRequestHandler(RequestHandler):
 
         if opts.openai_force_no_hashes:
             self.parameters['stop'].append('### ')
+        if opts.mode == 'vllm' and self.request_json_body.get('top_p') == 0:
+            self.request_json_body['top_p'] = 0.01
+
         llm_request = {**self.parameters, 'prompt': self.prompt}
         (success, _, _, _), (backend_response, backend_response_status_code) = self.generate_response(llm_request)
 
diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index 488107c..65758f2 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -49,10 +49,10 @@ def stream(ws):
 
         handler = OobaRequestHandler(request, request_json_body)
         generated_text = ''
-        input_prompt = None
+        input_prompt = request_json_body['prompt']
        response_status_code = 0
         start_time = time.time()
-        request_valid, invalid_response = handler.validate_request()
+        request_valid, invalid_response = handler.validate_request(prompt=input_prompt)
         if not request_valid:
             err_msg = invalid_response[0].json['results'][0]['text']
             ws.send(json.dumps({
@@ -69,7 +69,6 @@ def stream(ws):
                 thread.start()
                 thread.join()
             else:
-                input_prompt = request_json_body['prompt']
                 msg_to_backend = {
                     **handler.parameters,
                     'prompt': input_prompt,
@@ -142,7 +141,9 @@ def stream(ws):
         thread = threading.Thread(target=background_task_exception)
         thread.start()
         thread.join()
+
     ws.send(json.dumps({
         'event': 'stream_end',
         'message_num': message_num
     }))
+    ws.close()  # this is important if we encountered and error and exited early.
diff --git a/server.py b/server.py
index 0a485a8..594afed 100644
--- a/server.py
+++ b/server.py
@@ -16,14 +16,16 @@ from llm_server.llm import get_token_count
 from llm_server.routes.openai import openai_bp
 from llm_server.routes.server_error import handle_server_error
 
-# TODO: allow setting more custom ratelimits per-token
-# TODO: add more excluding to SYSTEM__ tokens
 # TODO: make sure the OpenAI moderation endpoint scans the last n messages rather than only the last one (make that threaded)
 # TODO: support turbo-instruct on openai endpoint
 # TODO: option to trim context in openai mode so that we silently fit the model's context
 # TODO: validate system tokens before excluding them
-# TODO: unify logging thread in a function and use async/await instead
+# TODO: make sure prompts are logged even when the user cancels generation
+
 # TODO: make sure log_prompt() is used everywhere, including errors and invalid requests
+# TODO: unify logging thread in a function and use async/await instead
+# TODO: add more excluding to SYSTEM__ tokens
+
 try:
     import vllm