anything-llm/server/utils/helpers/chat/LLMPerformanceMonitor.js
const { TokenManager } = require("../tiktoken");

/**
 * @typedef {import("openai/streaming").Stream<import("openai").OpenAI.ChatCompletionChunk>} OpenAICompatibleStream
 * @typedef {(reportedUsage: {[key: string]: number, completion_tokens?: number, prompt_tokens?: number}) => StreamMetrics} EndMeasurementFunction
 * @typedef {Array<{content: string}>} Messages
 */

/**
 * @typedef {Object} StreamMetrics
 * @property {number} prompt_tokens - the number of tokens in the prompt
 * @property {number} completion_tokens - the number of tokens in the completion
 * @property {number} total_tokens - the total number of tokens (prompt + completion)
 * @property {number} outputTps - the output tokens per second
 * @property {number} duration - the duration of the stream in seconds
 */

/**
 * @typedef {Object} MonitoredStream
 * @property {number} start - the start time of the stream (ms since epoch)
 * @property {number} duration - the duration of the stream in seconds
 * @property {StreamMetrics} metrics - the metrics of the stream
 * @property {EndMeasurementFunction} endMeasurement - ends the measurement and calculates the final metrics
 */
class LLMPerformanceMonitor {
  static tokenManager = new TokenManager();

  /**
   * Counts the tokens in the messages.
   * @param {Messages} messages - the messages sent to the LLM, so we can estimate the prompt tokens since most providers do not return them on stream
   * @returns {number}
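   *
   * A minimal sketch of a direct call (illustrative message contents only):
   * @example
   * const promptTokens = LLMPerformanceMonitor.countTokens([
   *   { content: "You are a helpful assistant." },
   *   { content: "How many tokens is this?" },
   * ]);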
   */
  static countTokens(messages = []) {
    try {
      return this.tokenManager.statsFrom(messages);
    } catch (e) {
      return 0;
    }
  }

  /**
   * Awaits a promise (eg: a non-streaming completion call) and measures the
   * duration (in seconds) until it resolves, returning both the output and the duration.
   * @param {Promise<any>} func - the promise to await and time
   * @returns {Promise<{output: any, duration: number}>}
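   *
   * A minimal usage sketch; `client.chat.completions.create` stands in for any
   * OpenAI-compatible SDK call and is not part of this module:
   * @example
   * const { output, duration } = await LLMPerformanceMonitor.measureAsyncFunction(
   *   client.chat.completions.create({ model, messages })
   * );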
   */
  static measureAsyncFunction(func) {
    return (async () => {
      const start = Date.now();
      const output = await func; // `func` is a pending promise, not a function reference
      const end = Date.now();
      return { output, duration: (end - start) / 1000 };
    })();
  }

  /**
   * Wraps a completion stream and attaches start time and duration properties to it.
   * Also attaches an `endMeasurement` method that ends the measurement and calculates the stream's metrics.
   * @param {Promise<OpenAICompatibleStream>} func - the pending stream request to await and monitor
   * @param {Messages} messages - the messages sent to the LLM, so we can estimate the prompt tokens since most providers do not return them on stream
   * @param {boolean} runPromptTokenCalculation - whether to run the prompt token calculation to estimate the `prompt_tokens` metric. Useful for providers that do not report this on stream.
   * @returns {Promise<MonitoredStream>}
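   *
   * A minimal usage sketch; `client.chat.completions.create` stands in for any
   * OpenAI-compatible SDK call and is not part of this module:
   * @example
   * const stream = await LLMPerformanceMonitor.measureStream(
   *   client.chat.completions.create({ model, messages, stream: true }),
   *   messages
   * );
   * for await (const chunk of stream) {
   *   // ...forward each chunk to the client...
   * }
   * // Merge in provider-reported usage if available; omitted fields keep
   * // their pre-seeded values (eg: the estimated prompt_tokens).
   * const metrics = stream.endMeasurement({ completion_tokens: 123 });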
   */
  static async measureStream(
    func,
    messages = [],
    runPromptTokenCalculation = true
  ) {
    const stream = await func;
    stream.start = Date.now();
    stream.duration = 0;
    stream.metrics = {
      completion_tokens: 0,
      prompt_tokens: runPromptTokenCalculation ? this.countTokens(messages) : 0,
      total_tokens: 0,
      outputTps: 0,
      duration: 0,
    };

    stream.endMeasurement = (reportedUsage = {}) => {
      const end = Date.now();
      const duration = (end - stream.start) / 1000;

      // Merge the reported usage over the estimated metrics so the math below
      // stays correct when a provider reports only partial (or no) usage data.
      stream.metrics = {
        ...stream.metrics,
        ...reportedUsage,
      };

      stream.metrics.total_tokens =
        stream.metrics.prompt_tokens + (stream.metrics.completion_tokens || 0);
      stream.metrics.outputTps = stream.metrics.completion_tokens / duration;
      stream.metrics.duration = duration;
      return stream.metrics;
    };
    return stream;
  }
}

module.exports = {
  LLMPerformanceMonitor,
};