fix tokenizer

Cyberes 2023-10-01 17:19:34 -06:00
parent f7e9687527
commit 51881ae39d
3 changed files with 28 additions and 23 deletions

View File

@@ -80,7 +80,6 @@ def trim_string_to_fit(prompt: str, context_token_limit: int, backend_url: str)
             token_count = get_token_count(prompt, backend_url)
         else:
             break
-    print(token_count)
     return prompt
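
For orientation, the hunk above only removes a leftover debug print from trim_string_to_fit. Below is a rough, self-contained sketch of that kind of trim loop; the count_tokens callable and the 100-character trim step are illustrative assumptions standing in for the repo's get_token_count(prompt, backend_url), not the actual implementation.

from typing import Callable

def trim_to_fit(prompt: str, context_token_limit: int,
                count_tokens: Callable[[str], int]) -> str:
    # Re-count after each cut; stop once the prompt fits or nothing is left to cut.
    token_count = count_tokens(prompt)
    while token_count > context_token_limit:
        if len(prompt) > 100:
            prompt = prompt[100:]  # drop the oldest 100 characters (assumed step size)
            token_count = count_tokens(prompt)
        else:
            break
    return prompt

# Crude stand-in tokenizer: one token per whitespace-separated word.
print(len(trim_to_fit('word ' * 500, 100, lambda s: len(s.split())).split()))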

View File

@@ -1,6 +1,6 @@
-import asyncio
-import aiohttp
+import concurrent.futures
+import requests
 import tiktoken
 
 from llm_server import opts
@@ -9,27 +9,31 @@ from llm_server import opts
 def tokenize(prompt: str, backend_url: str) -> int:
     assert backend_url
     if not prompt:
+        # The tokenizers have issues when the prompt is None.
         return 0
-
-    async def run():
-        tokenizer = tiktoken.get_encoding("cl100k_base")
+    tokenizer = tiktoken.get_encoding("cl100k_base")
 
-        async def send_chunk(chunk):
-            try:
-                async with session.post(f'{backend_url}/tokenize', json={'input': chunk}, verify_ssl=opts.verify_ssl, timeout=opts.backend_generate_request_timeout) as response:
-                    j = await response.json()
-                    return j['length']
-            except Exception as e:
-                print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
-                return len(tokenizer.encode(chunk)) + 10
+    # Split the prompt into 300 character chunks
+    chunk_size = 300
+    chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
 
-        chunk_size = 300
-        chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
+    # Define a function to send a chunk to the server
+    def send_chunk(chunk):
+        try:
+            r = requests.post(f'{backend_url}/tokenize', json={'input': chunk}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
+            j = r.json()
+            return j['length']
+        except Exception as e:
+            print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
+            return len(tokenizer.encode(chunk)) + 10
 
-        async with aiohttp.ClientSession() as session:
-            tasks = [send_chunk(chunk) for chunk in chunks]
-            lengths = await asyncio.gather(*tasks)
-
-        return sum(lengths)
-
-    return asyncio.run(run())
+    # Use a ThreadPoolExecutor to send all chunks to the server at once
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_chunk = {executor.submit(send_chunk, chunk): chunk for chunk in chunks}
+        for future in concurrent.futures.as_completed(future_to_chunk):
+            chunk = future_to_chunk[future]
+            try:
+                data = future.result()
+            except Exception as exc:
+                print('%r generated an exception: %s' % (chunk, exc))
+        return sum(future.result() for future in future_to_chunk)
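
The rewritten tokenize() replaces the aiohttp/asyncio version with plain requests calls fanned out over a thread pool: the prompt is split into 300-character chunks, each chunk is sent to the backend's /tokenize route, and the reported lengths are summed, with a local tiktoken count (+10 padding) as the fallback when a request fails. A standalone sketch of that pattern follows; the {'input': ...} payload and 'length' field come from the diff, while the localhost URL, the 10-second timeout, and the use of executor.map instead of the committed as_completed loop are assumptions made so the sketch runs on its own.

import concurrent.futures

import requests
import tiktoken


def count_tokens(prompt: str, backend_url: str, chunk_size: int = 300) -> int:
    tokenizer = tiktoken.get_encoding('cl100k_base')
    chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]

    def send_chunk(chunk: str) -> int:
        try:
            r = requests.post(f'{backend_url}/tokenize', json={'input': chunk}, timeout=10)
            return r.json()['length']
        except Exception:
            # Backend unreachable or response malformed: estimate locally, padded like the diff.
            return len(tokenizer.encode(chunk)) + 10

    # Each chunk is an independent HTTP request, so a thread pool overlaps the network waits.
    # executor.map preserves chunk order and yields the per-chunk lengths directly.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return sum(executor.map(send_chunk, chunks))


# Falls back to the local tiktoken estimate if nothing is listening on this port.
print(count_tokens('Hello world, ' * 200, 'http://localhost:8000'))

Compared with the committed code, executor.map is only a tighter way to collect the same per-chunk results; it is not what the commit itself uses.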

View File

@@ -24,7 +24,9 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.sock import init_socketio
 
+# TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
+# TODO: if a backend is at its limit of concurrent requests, choose a different one
 
 # Lower priority
 # TODO: support logit_bias on OpenAI and Ooba endpoints.