This repository has been archived on 2024-10-27. You can view files and clone it, but cannot push or open issues or pull requests.
local-llm-server/llm_server/llm/vllm/tokenize.py

44 lines
1.6 KiB
Python
Raw Normal View History

2023-10-01 17:19:34 -06:00
import concurrent.futures
2023-10-01 16:04:53 -06:00
2023-10-01 17:19:34 -06:00
import requests
import tiktoken
from llm_server import opts
2023-09-27 14:36:49 -06:00
2023-09-30 19:41:50 -06:00
def tokenize(prompt: str, backend_url: str) -> int:
    """Return the token count of *prompt* as reported by the VLLM backend.

    The prompt is cut into 300-character chunks, each chunk is POSTed to
    ``{backend_url}/tokenize`` from a thread pool, and the per-chunk
    ``length`` values from the JSON responses are summed. If the request
    for a chunk fails for any reason, that chunk is counted locally with
    tiktoken's ``cl100k_base`` encoding plus a margin of 10.

    Args:
        prompt: Text to count tokens for. A falsy value (``None`` or ``""``)
            yields 0 without contacting the backend.
        backend_url: Base URL of the backend; must be a non-empty string.

    Returns:
        The sum of the token counts of all chunks.

    Raises:
        AssertionError: If ``backend_url`` is empty or not a ``str``, or if
            a non-empty ``prompt`` is not a ``str``.
    """
    assert backend_url
    assert isinstance(backend_url, str)

    if not prompt:
        # The tokenizers have issues when the prompt is None, so bail out
        # before the type assertion below would reject it.
        return 0
    assert isinstance(prompt, str)

    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Split the prompt into 300 character chunks.
    chunk_size = 300
    chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]

    def send_chunk(chunk: str) -> int:
        """Count one chunk via the backend, falling back to tiktoken."""
        try:
            r = requests.post(
                f'{backend_url}/tokenize',
                json={'input': chunk},
                verify=opts.verify_ssl,
                timeout=opts.backend_generate_request_timeout,
            )
            j = r.json()
            return j['length']
        except Exception as e:
            # Best-effort: any network, JSON, or schema failure degrades to
            # a local count instead of failing the whole call.
            print(f'Failed to tokenize using VLLM - {e.__class__.__name__}: {e}')
            # NOTE(review): the +10 is presumably padding so the local
            # estimate errs on the high side -- confirm.
            return len(tokenizer.encode(chunk)) + 10

    # Use a ThreadPoolExecutor to send all chunks to the server at once.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return sum(executor.map(send_chunk, chunks))