mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-03-13 05:32:24 +00:00
* Add tokenizer improvements via Singleton class linting * dev build * Estimation fallback when string exceeds a fixed byte size * Add notice to tiktoken on backend
66 lines
1.8 KiB
JavaScript
66 lines
1.8 KiB
JavaScript
const { getEncoding } = require("js-tiktoken");
|
|
|
|
/**
 * Singleton wrapper around the `cl100k_base` encoder, used to produce rough
 * token counts for strings.
 *
 * The first `new TikTokenTokenizer()` builds the encoder and stores itself on
 * `TikTokenTokenizer.instance`; every later construction returns that same
 * stored instance instead of building another encoder.
 */
class TikTokenTokenizer {
  /** Shared instance, populated by the first construction. */
  static instance = null;

  /** Inputs whose estimated size reaches this many KiB are estimated, not encoded. */
  static MAX_KB_ESTIMATE = 10;

  /** Divisor applied to the character count when falling back to an estimate. */
  static DIVISOR = 8;

  constructor() {
    if (TikTokenTokenizer.instance) {
      this.log(
        "Singleton instance already exists. Returning existing instance."
      );
      // Returning an object from a constructor makes `new` yield that object.
      return TikTokenTokenizer.instance;
    }

    this.encoder = getEncoding("cl100k_base");
    TikTokenTokenizer.instance = this;
    this.log("Initialized new TikTokenTokenizer instance.");
  }

  /**
   * Write a message to the console with a colored `[TikTokenTokenizer]` prefix.
   * @param {string} text
   * @param {...any} args - extra values forwarded to `console.log`
   */
  log(text, ...args) {
    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
  }

  /**
   * Check if the input is too long to encode.
   * This is a rough estimate and a sanity check to prevent CPU issues from
   * encoding strings that are too large.
   * Assumes 1 character = 2 bytes in JS.
   * @param {string} input
   * @returns {boolean} true when the estimated size, floored to whole KiB,
   * is at least `MAX_KB_ESTIMATE`
   */
  #isTooLong(input) {
    const bytesEstimate = input.length * 2;
    const kbEstimate = Math.floor(bytesEstimate / 1024);
    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
  }

  /**
   * Encode a string into tokens for rough token count estimation.
   *
   * Inputs flagged by the size check are not encoded; their count is estimated
   * as `ceil(length / DIVISOR)`. The same estimate (or 0 when it is not a
   * number) is returned if anything throws while tokenizing.
   * @param {string} input
   * @returns {number}
   */
  tokenizeString(input = "") {
    // The default parameter only covers `undefined`. Without this guard a
    // `null` input throws inside the size check and is reported as a
    // tokenization failure with a full stack trace, only to return 0 anyway.
    if (input === null) return 0;

    try {
      if (this.#isTooLong(input)) {
        this.log("Input will take too long to encode - estimating");
        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
      }

      return this.encoder.encode(input).length;
    } catch (e) {
      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
      // `|| 0` turns a NaN estimate (input without a numeric length) into 0.
      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
    }
  }
}
|
|
|
|
// Module-wide tokenizer; the class constructor hands back the shared instance.
const tokenizer = new TikTokenTokenizer();

/**
 * Roughly count the tokens in a string by delegating to the shared tokenizer.
 * @param {string} input
 * @returns {number}
 */
const tokenizeString = (input) => tokenizer.tokenizeString(input);

module.exports = { tokenizeString };
|