fix tokenizer
This commit is contained in:
parent
f7e9687527
commit
51881ae39d
|
@@ -80,7 +80,6 @@ def trim_string_to_fit(prompt: str, context_token_limit: int, backend_url: str)
|
||||||
token_count = get_token_count(prompt, backend_url)
|
token_count = get_token_count(prompt, backend_url)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
print(token_count)
|
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -1,6 +1,6 @@
|
||||||
import asyncio
|
import concurrent.futures
|
||||||
|
|
||||||
import aiohttp
|
import requests
|
||||||
import tiktoken
|
import tiktoken
|
||||||
|
|
||||||
from llm_server import opts
|
from llm_server import opts
|
||||||
|
@@ -9,27 +9,31 @@ from llm_server import opts
|
||||||
def tokenize(prompt: str, backend_url: str) -> int:
|
def tokenize(prompt: str, backend_url: str) -> int:
|
||||||
assert backend_url
|
assert backend_url
|
||||||
if not prompt:
|
if not prompt:
|
||||||
|
# The tokenizers have issues when the prompt is None.
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
async def run():
|
|
||||||
tokenizer = tiktoken.get_encoding("cl100k_base")
|
tokenizer = tiktoken.get_encoding("cl100k_base")
|
||||||
|
|
||||||
async def send_chunk(chunk):
|
# Split the prompt into 300 character chunks
|
||||||
|
chunk_size = 300
|
||||||
|
chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
|
||||||
|
|
||||||
|
# Define a function to send a chunk to the server
|
||||||
|
def send_chunk(chunk):
|
||||||
try:
|
try:
|
||||||
async with session.post(f'{backend_url}/tokenize', json={'input': chunk}, verify_ssl=opts.verify_ssl, timeout=opts.backend_generate_request_timeout) as response:
|
r = requests.post(f'{backend_url}/tokenize', json={'input': chunk}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
|
||||||
j = await response.json()
|
j = r.json()
|
||||||
return j['length']
|
return j['length']
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
|
print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
|
||||||
return len(tokenizer.encode(chunk)) + 10
|
return len(tokenizer.encode(chunk)) + 10
|
||||||
|
|
||||||
chunk_size = 300
|
# Use a ThreadPoolExecutor to send all chunks to the server at once
|
||||||
chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
|
future_to_chunk = {executor.submit(send_chunk, chunk): chunk for chunk in chunks}
|
||||||
async with aiohttp.ClientSession() as session:
|
for future in concurrent.futures.as_completed(future_to_chunk):
|
||||||
tasks = [send_chunk(chunk) for chunk in chunks]
|
chunk = future_to_chunk[future]
|
||||||
lengths = await asyncio.gather(*tasks)
|
try:
|
||||||
|
data = future.result()
|
||||||
return sum(lengths)
|
except Exception as exc:
|
||||||
|
print('%r generated an exception: %s' % (chunk, exc))
|
||||||
return asyncio.run(run())
|
return sum(future.result() for future in future_to_chunk)
|
||||||
|
|
|
@@ -24,7 +24,9 @@ from llm_server.routes.server_error import handle_server_error
|
||||||
from llm_server.routes.v1 import bp
|
from llm_server.routes.v1 import bp
|
||||||
from llm_server.sock import init_socketio
|
from llm_server.sock import init_socketio
|
||||||
|
|
||||||
|
# TODO: is frequency penalty the same as ooba repetition penalty???
|
||||||
# TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
|
# TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
|
||||||
|
# TODO: if a backend is at its limit of concurrent requests, choose a different one
|
||||||
|
|
||||||
# Lower priority
|
# Lower priority
|
||||||
# TODO: support logit_bias on OpenAI and Ooba endpoints.
|
# TODO: support logit_bias on OpenAI and Ooba endpoints.
|
||||||
|
|
Reference in New Issue