29 lines
1.2 KiB
Python
29 lines
1.2 KiB
Python
import requests
|
|
import tiktoken
|
|
|
|
from llm_server import opts
|
|
from llm_server.cluster.cluster_config import cluster_config
|
|
|
|
|
|
def tokenize(prompt: str, backend_url: str) -> int:
    """Return the number of tokens in *prompt* for the given backend.

    The prompt is first counted locally with tiktoken's ``cl100k_base``
    encoding. If that estimate is within the backend's
    ``max_position_embeddings`` plus a 200-token margin, the backend's
    ``/tokenize`` endpoint is asked for the count and its ``length`` field
    is returned. Otherwise the local estimate is returned directly.

    Args:
        prompt: The text to count tokens for. Falsy values (``None`` or an
            empty string) short-circuit to ``0``.
        backend_url: Base URL of the backend; also used as the key passed to
            ``cluster_config.get_backend`` to look up the model config.

    Returns:
        The ``length`` value reported by the backend, or a local estimate
        when the prompt is too large or the backend request fails.
    """
    if not prompt:
        # The tokenizers have issues when the prompt is None.
        return 0

    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_limit = cluster_config.get_backend(backend_url)['model_config']['max_position_embeddings']

    # First we tokenize it locally to determine if it's worth sending it to the backend.
    initial_estimate = len(tokenizer.encode(prompt))

    # NOTE(review): the 200-token margin presumably allows for differences
    # between the local encoding and the backend's tokenizer -- confirm.
    if initial_estimate > token_limit + 200:
        # If the result was greater than our context size, return the estimate.
        # We won't be sending it through the backend so it doesn't need to be accurate.
        return initial_estimate

    try:
        r = requests.post(
            f'{backend_url}/tokenize',
            json={'input': prompt},
            verify=opts.verify_ssl,
            timeout=opts.backend_generate_request_timeout,
        )
        return r.json()['length']
    except Exception as e:
        # Deliberately broad: any failure (network error, invalid JSON,
        # missing key) falls back to the local estimate instead of raising.
        print('Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
        # Reuse the count computed above instead of encoding the prompt again.
        # NOTE(review): the extra 10 tokens look like a safety margin -- confirm.
        return initial_estimate + 10