41 lines
1009 B
TypeScript
41 lines
1009 B
TypeScript
import * as tokenizer from "./mistral-tokenizer-js";
|
|
import { MistralAIChatMessage } from "../api-support";
|
|
|
|
/**
 * Prepares the Mistral tokenizer for use by calling
 * `tokenizer.initializemistralTokenizer()`.
 *
 * @returns Always `true` once the initialization call has returned.
 */
export function init(): boolean {
  tokenizer.initializemistralTokenizer();
  return true;
}
|
|
|
|
/**
 * Counts tokens for either a raw string or a list of chat messages.
 *
 * A string is passed to `getTextTokenCount` unchanged. A message list is first
 * flattened into one string, with each message rendered according to its role:
 * - `system`: the content as-is
 * - `assistant`: the content followed by `</s>`
 * - `user`: the content wrapped as `[INST] ... [/INST]`
 *
 * Messages with any other role contribute nothing. The rendered pieces are
 * joined with a single space before being counted.
 *
 * @param prompt Raw text, or the chat messages to flatten and count.
 * @returns Whatever `getTextTokenCount` returns for the resulting text.
 */
export function getTokenCount(prompt: MistralAIChatMessage[] | string) {
  if (typeof prompt === "string") {
    return getTextTokenCount(prompt);
  }

  const parts = [];
  for (const { role, content } of prompt) {
    if (role === "system") {
      parts.push(content);
    } else if (role === "assistant") {
      parts.push(content + "</s>");
    } else if (role === "user") {
      parts.push("[INST] " + content + " [/INST]");
    }
  }

  return getTextTokenCount(parts.join(" "));
}
|
|
|
|
/**
 * Counts the tokens in a single string using the Mistral tokenizer.
 *
 * The text is NFKC-normalized before it is handed to `tokenizer.encode`.
 *
 * @param prompt Text to tokenize.
 * @returns The tokenizer identifier and the number of tokens produced.
 * @throws Error if `prompt.length` exceeds 800000, or if the tokenizer returns
 *   no result (presumably because it has not been initialized — confirm
 *   against the tokenizer implementation).
 */
function getTextTokenCount(prompt: string) {
  // Refuse very large inputs up front instead of handing them to the encoder.
  const maxPromptLength = 800000;
  if (prompt.length > maxPromptLength) {
    throw new Error("Content is too large to tokenize.");
  }

  const tokens = tokenizer.encode(prompt.normalize("NFKC"));
  // Explicit guard instead of a non-null assertion: a missing result now
  // surfaces as a descriptive error rather than an opaque TypeError on `.length`.
  if (tokens == null) {
    throw new Error(
      "Mistral tokenizer returned no tokens; has the tokenizer been initialized?"
    );
  }

  return {
    tokenizer: "mistral-tokenizer-js",
    token_count: tokens.length,
  };
}
|