local-llm-server/llm_server/llm/vllm/tokenize.py

import requests
import tiktoken

from llm_server import opts
from llm_server.cluster.cluster_config import cluster_config


def tokenize(prompt: str, backend_url: str) -> int:
    if not prompt:
        # The tokenizers have issues when the prompt is None.
        return 0
    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_limit = cluster_config.get_backend(backend_url)['model_config']['max_position_embeddings']

    # First we tokenize it locally to determine if it's worth sending it to the backend.
    initial_estimate = len(tokenizer.encode(prompt))
    if initial_estimate <= token_limit + 200:
        try:
            r = requests.post(f'{backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
            j = r.json()
            return j['length']
        except Exception as e:
            print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
            return len(tokenizer.encode(prompt)) + 10
    else:
        # If the result was greater than our context size, return the estimate.
        # We won't be sending it through the backend so it does't need to be accurage.
        return initial_estimate
port to mysql, use vllm tokenizer endpoint 2023-09-20 20:30:31 -06:00			`import requests`
			`import tiktoken`

			`from llm_server import opts`
functional 2023-09-30 19:41:50 -06:00			`from llm_server.cluster.cluster_config import cluster_config`
port to mysql, use vllm tokenizer endpoint 2023-09-20 20:30:31 -06:00
fix error handling 2023-09-27 14:36:49 -06:00
functional 2023-09-30 19:41:50 -06:00			`def tokenize(prompt: str, backend_url: str) -> int:`
update database, tokenizer handle null prompt, convert top_p to vllm on openai, actually validate prompt on streaming, 2023-09-25 22:32:48 -06:00			`if not prompt:`
			`# The tokenizers have issues when the prompt is None.`
			`return 0`
avoid sending to backend to tokenize if it's greater than our specified context size 2023-09-28 03:54:20 -06:00			`tokenizer = tiktoken.get_encoding("cl100k_base")`
functional 2023-09-30 19:41:50 -06:00			`token_limit = cluster_config.get_backend(backend_url)['model_config']['max_position_embeddings']`
avoid sending to backend to tokenize if it's greater than our specified context size 2023-09-28 03:54:20 -06:00
			`# First we tokenize it locally to determine if it's worth sending it to the backend.`
			`initial_estimate = len(tokenizer.encode(prompt))`
functional 2023-09-30 19:41:50 -06:00			`if initial_estimate <= token_limit + 200:`
avoid sending to backend to tokenize if it's greater than our specified context size 2023-09-28 03:54:20 -06:00			`try:`
functional 2023-09-30 19:41:50 -06:00			`r = requests.post(f'{backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)`
avoid sending to backend to tokenize if it's greater than our specified context size 2023-09-28 03:54:20 -06:00			`j = r.json()`
			`return j['length']`
			`except Exception as e:`
			`print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')`
			`return len(tokenizer.encode(prompt)) + 10`
			`else:`
			`# If the result was greater than our context size, return the estimate.`
			`# We won't be sending it through the backend so it does't need to be accurage.`
			`return initial_estimate`