local-llm-server/llm_server/llm/vllm/tokenize.py

import concurrent.futures

import requests
import tiktoken

from llm_server import opts


def tokenize(prompt: str, backend_url: str) -> int:
    """Count the tokens in ``prompt`` using the backend's ``/tokenize`` endpoint.

    The prompt is cut into fixed-size character chunks, every chunk is POSTed
    to ``{backend_url}/tokenize`` from a thread pool, and the ``length``
    values returned for the chunks are summed. When the request for a chunk
    fails for any reason (network error, non-JSON reply, missing ``length``
    key), that chunk is counted locally with tiktoken's ``cl100k_base``
    encoding plus a margin of 10 instead of aborting the whole count.

    Because chunks are cut on character boundaries, the total may differ
    slightly from tokenizing the whole prompt in one request.

    Args:
        prompt: Text to measure. An empty (or ``None``) prompt counts as 0
            tokens and no request is made.
        backend_url: Base URL of the backend; ``/tokenize`` is appended.

    Returns:
        The summed token count of all chunks.

    Raises:
        AssertionError: If ``backend_url`` is empty or not a string, or if a
            non-empty ``prompt`` is not a string.
    """
    assert backend_url
    assert isinstance(backend_url, str)

    if not prompt:
        # The tokenizers have issues when the prompt is None, so this guard
        # must run before the type assertion below.
        return 0
    assert isinstance(prompt, str)

    # Loaded once up front so the worker threads share a single encoder.
    tokenizer = tiktoken.get_encoding("cl100k_base")

    # Split the prompt into 300 character chunks
    chunk_size = 300
    chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]

    def send_chunk(chunk: str) -> int:
        """Return the token count for one chunk, falling back to tiktoken."""
        try:
            r = requests.post(
                f'{backend_url}/tokenize',
                json={'input': chunk},
                verify=opts.verify_ssl,
                timeout=opts.backend_generate_request_timeout,
            )
            return r.json()['length']
        except Exception as e:
            # Deliberately broad: any failure talking to the backend should
            # degrade to a local estimate rather than fail the caller.
            print(f'Failed to tokenize using VLLM - {e.__class__.__name__}: {e}')
            # NOTE(review): the +10 looks like a safety margin because
            # cl100k_base is presumably not the backend model's tokenizer.
            return len(tokenizer.encode(chunk)) + 10

    # Use a ThreadPoolExecutor to send all chunks to the server at once.
    # send_chunk handles its own request failures, so the results can simply
    # be summed.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return sum(executor.map(send_chunk, chunks))
fix tokenizer 2023-10-01 17:19:34 -06:00			`import concurrent.futures`
finish openai endpoints 2023-10-01 16:04:53 -06:00
fix tokenizer 2023-10-01 17:19:34 -06:00			`import requests`
port to mysql, use vllm tokenizer endpoint 2023-09-20 20:30:31 -06:00			`import tiktoken`

			`from llm_server import opts`

fix error handling 2023-09-27 14:36:49 -06:00
functional 2023-09-30 19:41:50 -06:00			`def tokenize(prompt: str, backend_url: str) -> int:`
get streaming working, remove /v2/ 2023-10-01 00:20:00 -06:00			`assert backend_url`
fix processing not being decremented on streaming, fix confusion over queue, adjust stop sequences 2023-10-02 20:53:08 -06:00			`assert isinstance(prompt, str)`
			`assert isinstance(backend_url, str)`

update database, tokenizer handle null prompt, convert top_p to vllm on openai, actually validate prompt on streaming, 2023-09-25 22:32:48 -06:00			`if not prompt:`
fix tokenizer 2023-10-01 17:19:34 -06:00			`# The tokenizers have issues when the prompt is None.`
update database, tokenizer handle null prompt, convert top_p to vllm on openai, actually validate prompt on streaming, 2023-09-25 22:32:48 -06:00			`return 0`
fix tokenizer 2023-10-01 17:19:34 -06:00			`tokenizer = tiktoken.get_encoding("cl100k_base")`

			`# Split the prompt into 300 character chunks`
			`chunk_size = 300`
			`chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]`

			`# Define a function to send a chunk to the server`
			`def send_chunk(chunk):`
			`try:`
			`r = requests.post(f'{backend_url}/tokenize', json={'input': chunk}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)`
			`j = r.json()`
			`return j['length']`
			`except Exception as e:`
			`print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')`
fix ratelimiting 2023-10-02 02:05:15 -06:00			`raise Exception`
fix tokenizer 2023-10-01 17:19:34 -06:00			`return len(tokenizer.encode(chunk)) + 10`

			`# Use a ThreadPoolExecutor to send all chunks to the server at once`
			`with concurrent.futures.ThreadPoolExecutor() as executor:`
			`future_to_chunk = {executor.submit(send_chunk, chunk): chunk for chunk in chunks}`
			`for future in concurrent.futures.as_completed(future_to_chunk):`
			`chunk = future_to_chunk[future]`
finish openai endpoints 2023-10-01 16:04:53 -06:00			`try:`
fix tokenizer 2023-10-01 17:19:34 -06:00			`data = future.result()`
			`except Exception as exc:`
			`print('%r generated an exception: %s' % (chunk, exc))`
			`return sum(future.result() for future in future_to_chunk)`