increases tokenizer failsafe to 500000 characters

2023-07-27 15:21:06 -05:00 · 2023-07-27 15:21:06 -05:00 · d29c304d5a
parent addfa7c57b
commit d29c304d5a
2 changed files with 8 additions and 5 deletions
--- a/src/tokenization/claude.ts
+++ b/src/tokenization/claude.ts
@@ -20,7 +20,9 @@ export function init() {
 }

 export function getTokenCount(prompt: string, _model: string) {
-  if (prompt.length > 250000) {
+  // Don't try tokenizing if the prompt is massive to prevent DoS.
+  // 500k characters should be sufficient for all supported models.
+  if (prompt.length > 500000) {
    return {
      tokenizer: "tiktoken (prompt length limit exceeded)",
      token_count: 100000,
--- a/src/tokenization/openai.ts
+++ b/src/tokenization/openai.ts
@@ -28,10 +28,11 @@ export function getTokenCount(messages: any[], model: string) {
    for (const key of Object.keys(message)) {
      {
        const value = message[key];
-        // Break if we get a huge message or exceed the token limit to prevent DoS
-        // 100k tokens allows for future 100k GPT-4 models and 250k characters is
-        // just a sanity check
-        if (value.length > 250000 || numTokens > 100000) {
+        // Break if we get a huge message or exceed the token limit to prevent
+        // DoS.
+        // 100k tokens allows for future 100k GPT-4 models and 500k characters
+        // is just a sanity check
+        if (value.length > 500000 || numTokens > 100000) {
          numTokens = 100000;
          return {
            tokenizer: "tiktoken (prompt length limit exceeded)",