local-llm-server/llm_server/llm/vllm/tokenize.py

import requests
import tiktoken
from llm_server import opts


def tokenize(prompt: str) -> int:
    if not prompt:
        # The tokenizers have issues when the prompt is None.
        return 0
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # First, tokenize locally to decide whether it's worth sending to the backend.
    initial_estimate = len(tokenizer.encode(prompt))
    if initial_estimate <= opts.context_size + 200:
        try:
            r = requests.post(f'{opts.backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
            j = r.json()
            return j['length']
        except Exception as e:
            print(f'Failed to tokenize using vLLM - {e.__class__.__name__}: {e}')
            # Fall back to the local estimate, padded to stay on the safe side.
            return initial_estimate + 10
    else:
        # The result is already greater than our context size, so we won't be sending it
        # to the backend and the local estimate doesn't need to be accurate.
        return initial_estimate
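
A minimal usage sketch (hypothetical caller, not part of the repo) showing how the returned count might be checked against the configured context window before dispatching a generation request:

# Hypothetical example: gate a request on the configured context size.
from llm_server import opts
from llm_server.llm.vllm.tokenize import tokenize

def prompt_fits_context(prompt: str) -> bool:
    # True if the prompt (backend count or local estimate) fits the model's context window.
    return tokenize(prompt) <= opts.context_size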