avoid sending the prompt to the backend to tokenize if it's longer than our specified context size
parent 467b804ad7
commit 347a82b7e1
@@ -5,14 +5,22 @@ from llm_server import opts
 
 
 def tokenize(prompt: str) -> int:
-    tokenizer = tiktoken.get_encoding("cl100k_base")
     if not prompt:
         # The tokenizers have issues when the prompt is None.
         return 0
-    try:
-        r = requests.post(f'{opts.backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
-        j = r.json()
-        return j['length']
-    except Exception as e:
-        print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
-        return len(tokenizer.encode(prompt)) + 10
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    # First we tokenize it locally to determine if it's worth sending it to the backend.
+    initial_estimate = len(tokenizer.encode(prompt))
+    if initial_estimate <= opts.context_size + 200:
+        try:
+            r = requests.post(f'{opts.backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
+            j = r.json()
+            return j['length']
+        except Exception as e:
+            print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
+            return len(tokenizer.encode(prompt)) + 10
+    else:
+        # If the estimate was greater than our context size, return the estimate.
+        # We won't be sending it through the backend, so it doesn't need to be accurate.
+        return initial_estimate
+
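For readability, here is the patched tokenize() gathered into a single listing. This is a sketch assembled from the hunk above, not the full module: the requests, tiktoken, and opts imports are assumed to come from the unchanged top of the file, exactly as the existing code already uses them.

import requests
import tiktoken

from llm_server import opts


def tokenize(prompt: str) -> int:
    if not prompt:
        # The tokenizers have issues when the prompt is None.
        return 0
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # First tokenize locally to decide whether it's worth asking the backend at all.
    initial_estimate = len(tokenizer.encode(prompt))
    if initial_estimate <= opts.context_size + 200:
        # The prompt fits (with some slack), so get an exact count from the backend.
        try:
            r = requests.post(f'{opts.backend_url}/tokenize', json={'input': prompt}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
            j = r.json()
            return j['length']
        except Exception as e:
            # Fall back to the local estimate plus a small margin if the backend call fails.
            print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
            return len(tokenizer.encode(prompt)) + 10
    else:
        # The prompt already exceeds the context size, so it won't be sent to the
        # backend and the count doesn't need to be accurate.
        return initial_estimate

The 200-token slack presumably absorbs small differences between the local cl100k_base estimate and the backend model's own tokenizer, so borderline prompts still get an exact count before being rejected.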
@@ -89,7 +89,6 @@ def generate_stats(regen: bool = False):
             'average_generation_elapsed_sec': int(average_generation_time),
             # 'estimated_avg_tps': estimated_avg_tps,
             'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
-            'nvidia': netdata_stats
         },
         'online': online,
         'endpoints': {
@@ -115,6 +114,7 @@ def generate_stats(regen: bool = False):
             'anthropicKeys': '∞',
         },
         'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
+        'nvidia': netdata_stats
     }
     result = deep_sort(output)