diff --git a/llm_server/llm/openai/transform.py b/llm_server/llm/openai/transform.py
index 4cf2951..39f942a 100644
--- a/llm_server/llm/openai/transform.py
+++ b/llm_server/llm/openai/transform.py
@@ -80,7 +80,6 @@ def trim_string_to_fit(prompt: str, context_token_limit: int, backend_url: str)
             token_count = get_token_count(prompt, backend_url)
         else:
             break
-    print(token_count)
     return prompt
 
 
diff --git a/llm_server/llm/vllm/tokenize.py b/llm_server/llm/vllm/tokenize.py
index d51b1de..006842e 100644
--- a/llm_server/llm/vllm/tokenize.py
+++ b/llm_server/llm/vllm/tokenize.py
@@ -1,6 +1,6 @@
-import asyncio
+import concurrent.futures
 
-import aiohttp
+import requests
 import tiktoken
 
 from llm_server import opts
@@ -9,27 +9,31 @@ from llm_server import opts
 def tokenize(prompt: str, backend_url: str) -> int:
     assert backend_url
     if not prompt:
+        # The tokenizers have issues when the prompt is None.
         return 0
+    tokenizer = tiktoken.get_encoding("cl100k_base")
 
-    async def run():
-        tokenizer = tiktoken.get_encoding("cl100k_base")
+    # Split the prompt into 300 character chunks
+    chunk_size = 300
+    chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
 
-        async def send_chunk(chunk):
+    # Define a function to send a chunk to the server
+    def send_chunk(chunk):
+        try:
+            r = requests.post(f'{backend_url}/tokenize', json={'input': chunk}, verify=opts.verify_ssl, timeout=opts.backend_generate_request_timeout)
+            j = r.json()
+            return j['length']
+        except Exception as e:
+            print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
+            return len(tokenizer.encode(chunk)) + 10
+
+    # Use a ThreadPoolExecutor to send all chunks to the server at once
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_chunk = {executor.submit(send_chunk, chunk): chunk for chunk in chunks}
+        for future in concurrent.futures.as_completed(future_to_chunk):
+            chunk = future_to_chunk[future]
             try:
-                async with session.post(f'{backend_url}/tokenize', json={'input': chunk}, verify_ssl=opts.verify_ssl, timeout=opts.backend_generate_request_timeout) as response:
-                    j = await response.json()
-                    return j['length']
-            except Exception as e:
-                print(f'Failed to tokenize using VLLM -', f'{e.__class__.__name__}: {e}')
-                return len(tokenizer.encode(chunk)) + 10
-
-        chunk_size = 300
-        chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]
-
-        async with aiohttp.ClientSession() as session:
-            tasks = [send_chunk(chunk) for chunk in chunks]
-            lengths = await asyncio.gather(*tasks)
-
-            return sum(lengths)
-
-    return asyncio.run(run())
+                data = future.result()
+            except Exception as exc:
+                print('%r generated an exception: %s' % (chunk, exc))
+    return sum(future.result() for future in future_to_chunk)
diff --git a/server.py b/server.py
index 4191a84..1d89ca2 100644
--- a/server.py
+++ b/server.py
@@ -24,7 +24,9 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.sock import init_socketio
 
+# TODO: is frequency penalty the same as ooba repetition penalty???
 # TODO: make sure openai_moderation_enabled works on websockets, completions, and chat completions
+# TODO: if a backend is at its limit of concurrent requests, choose a different one
 
 # Lower priority
 # TODO: support logit_bias on OpenAI and Ooba endpoints.