avoid sending the prompt to the backend for tokenization if it's larger than our specified context size

This commit is contained in:
Cyberes 2023-09-28 03:54:20 -06:00
parent 467b804ad7
commit 347a82b7e1
2 changed files with 17 additions and 9 deletions


@@ -5,10 +5,14 @@ from llm_server import opts
def tokenize(prompt: str) -> int:
    if not prompt:
        # The tokenizers have issues when the prompt is None.
        return 0
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # First we tokenize it locally to determine if it's worth sending it to the backend.
    initial_estimate = len(tokenizer.encode(prompt))
    if initial_estimate <= opts.context_size + 200:
        try:
            r = requests.post(f'{opts.backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
            j = r.json()
@@ -16,3 +20,7 @@ def tokenize(prompt: str) -> int:
        except Exception as e:
            print(f'Failed to tokenize using VLLM - {e.__class__.__name__}: {e}')
            return len(tokenizer.encode(prompt)) + 10
    else:
        # If the estimate exceeded our context size, just return it.
        # We won't be sending the prompt through the backend, so the count doesn't need to be accurate.
        return initial_estimate
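The success path between these two hunks is unchanged context and elided from the diff. For illustration, a minimal caller-side sketch of why an over-limit count only needs to be a rough estimate; the import path and the validate_prompt helper are assumptions, not part of this commit:

from llm_server import opts
from llm_server.llm.vllm.tokenize import tokenize  # import path assumed

def validate_prompt(prompt: str) -> bool:
    # Prompts over the context window are rejected outright, so the fast
    # local estimate that tokenize() returns for them is accurate enough.
    return tokenize(prompt) <= opts.context_size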


@@ -89,7 +89,6 @@ def generate_stats(regen: bool = False):
            'average_generation_elapsed_sec': int(average_generation_time),
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
        },
        'online': online,
        'endpoints': {
@@ -115,6 +114,7 @@ def generate_stats(regen: bool = False):
            'anthropicKeys': '',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
        'nvidia': netdata_stats
    }
    result = deep_sort(output)
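This hunk moves 'nvidia' from the nested stats block to the top level of the output dict. Because the finished structure is passed through deep_sort, the key's position in the literal doesn't affect the serialized ordering. A minimal sketch of what such a helper typically looks like; the project's actual implementation is not shown in this diff:

def deep_sort(obj):
    # Recursively sort dict keys so the stats output serializes deterministically.
    if isinstance(obj, dict):
        return {k: deep_sort(obj[k]) for k in sorted(obj)}
    if isinstance(obj, list):
        return [deep_sort(v) for v in obj]
    return obj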