import requests
import tiktoken

from llm_server import opts


def tokenize(prompt: str) -> int:
    if not prompt:
        # The tokenizers have issues when the prompt is None.
        return 0

    tokenizer = tiktoken.get_encoding("cl100k_base")

    # First, tokenize locally to determine whether it's worth sending to the backend.
    initial_estimate = len(tokenizer.encode(prompt))
    if initial_estimate <= opts.context_size + 200:
        try:
            r = requests.post(
                f'{opts.backend_url}/tokenize',
                json={'input': prompt},
                verify=opts.verify_ssl,
                timeout=opts.backend_generate_request_timeout,
            )
            j = r.json()
            return j['length']
        except Exception as e:
            print(f'Failed to tokenize using VLLM - {e.__class__.__name__}: {e}')
            # Fall back to the local estimate, padded to stay on the safe side.
            return initial_estimate + 10
    else:
        # If the estimate exceeds our context size, return it as-is.
        # We won't be sending the prompt to the backend, so it doesn't need to be accurate.
        return initial_estimate
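

# Minimal usage sketch (not part of the original module): assumes `llm_server.opts`
# has been configured elsewhere with `backend_url`, `verify_ssl`, `context_size`,
# and `backend_generate_request_timeout` before this is called.
if __name__ == '__main__':
    print(tokenize('Hello, world!'))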