diff --git a/llm_server/llm/openai/transform.py b/llm_server/llm/openai/transform.py
index 4cf2951..39f942a 100644
--- a/llm_server/llm/openai/transform.py
+++ b/llm_server/llm/openai/transform.py
@@ -80,7 +80,6 @@ def trim_string_to_fit(prompt: str, context_token_limit: int, backend_url: str)
             token_count = get_token_count(prompt, backend_url)
         else:
             break
-    print(token_count)
     return prompt
 
 
diff --git a/llm_server/llm/vllm/tokenize.py b/llm_server/llm/vllm/tokenize.py
index d51b1de..006842e 100644
--- a/llm_server/llm/vllm/tokenize.py
+++ b/llm_server/llm/vllm/tokenize.py
@@ -1,6 +1,6 @@
-import asyncio
+import concurrent.futures
 
-import aiohttp
+import requests
 import tiktoken
 
 from llm_server import opts
@@ -9,27 +9,31 @@ from llm_server import opts
 def tokenize(prompt: str, backend_url: str) -> int:
     assert backend_url
     if not prompt:
+        # The tokenizers have issues when the prompt is None.
         return 0
+    tokenizer = tiktoken.get_encoding("cl100k_base")
 
-    async def run():
-        tokenizer = tiktoken.get_encoding("cl100k_base")
+    # Split the prompt into 300 character chunks
+    chunk_size = 300
+    chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
 
-        async def send_chunk(chunk):
+    # Define a function to send a chunk to the server
+    def send_chunk(chunk):
+        try:
+            r = requests.post(f'{backend_url}/tokenize', json={'input': chunk}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
+            j = r.json()
+            return j['length']
+        except Exception as e:
+            print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
+            return len(tokenizer.encode(chunk)) + 10
+
+    # Use a ThreadPoolExecutor to send all chunks to the server at once
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_chunk = {executor.submit(send_chunk, chunk): chunk for chunk in chunks}
+        for future in concurrent.futures.as_completed(future_to_chunk):
+            chunk = future_to_chunk[future]
             try:
-                async with session.post(f'{backend_url}/tokenize', json={'input': chunk}, verify_ssl=opts.verify_ssl, timeout=opts.backend_generate_request_timeout) as response:
-                    j = await response.json()
-                    return j['length']
-            except Exception as e:
-                print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
-                return len(tokenizer.encode(chunk)) + 10
-
-        chunk_size = 300
-        chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
-
-        async with aiohttp.ClientSession() as session:
-            tasks = [send_chunk(chunk) for chunk in chunks]
-            lengths = await asyncio.gather(*tasks)
-
-            return sum(lengths)
-
-    return asyncio.run(run())
+                data = future.result()
+            except Exception as exc:
+                print('%r generated an exception: %s' % (chunk, exc))
+    return sum(future.result() for future in future_to_chunk)
diff --git a/server.py b/server.py
index 4191a84..1d89ca2 100644
--- a/server.py
+++ b/server.py
@@ -24,7 +24,9 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.sock import init_socketio
 
+# TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
+# TODO: if a backend is at its limit of concurrent requests, choose a different one
 
 # Lower priority
 # TODO: support logit_bias on OpenAI and Ooba endpoints.