anything-llm/server/utils/chats/openaiCompatible.js

const { v4: uuidv4 } = require("uuid");
const { DocumentManager } = require("../DocumentManager");
const { WorkspaceChats } = require("../../models/workspaceChats");
const { getVectorDbClass, getLLMProvider } = require("../helpers");
const { writeResponseChunk } = require("../helpers/chat/responses");
const { chatPrompt, sourceIdentifier } = require("./index");
const { PassThrough } = require("stream");

async function chatSync({
workspace,
systemPrompt = null,
history = [],
prompt = null,
temperature = null,
}) {
const uuid = uuidv4();
const chatMode = workspace?.chatMode ?? "chat";
const LLMConnector = getLLMProvider({
provider: workspace?.chatProvider,
model: workspace?.chatModel,
});
const VectorDb = getVectorDbClass();
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
// The user is trying to query-mode chat a workspace that has no data in it, so we should
// exit early since no information can be found under these conditions.
if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
const textResponse =
workspace?.queryRefusalResponse ??
"There is no relevant information in this workspace to answer your query.";
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: String(prompt),
response: {
text: textResponse,
sources: [],
type: chatMode,
},
include: false,
});
return formatJSON(
{
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: null,
textResponse,
},
{ model: workspace.slug, finish_reason: "abort" }
);
}
// If we are here we know that we are in a workspace that is:
// 1. Chatting in "chat" mode and may or may _not_ have embeddings
// 2. Chatting in "query" mode and has at least 1 embedding
let contextTexts = [];
let sources = [];
let pinnedDocIdentifiers = [];
await new DocumentManager({
workspace,
maxTokens: LLMConnector.promptWindowLimit(),
})
.pinnedDocs()
.then((pinnedDocs) => {
pinnedDocs.forEach((doc) => {
const { pageContent, ...metadata } = doc;
pinnedDocIdentifiers.push(sourceIdentifier(doc));
contextTexts.push(doc.pageContent);
sources.push({
text:
pageContent.slice(0, 1_000) +
"...continued on in source document...",
...metadata,
});
});
});
const vectorSearchResults =
embeddingsCount !== 0
? await VectorDb.performSimilaritySearch({
namespace: workspace.slug,
input: prompt,
LLMConnector,
similarityThreshold: workspace?.similarityThreshold,
topN: workspace?.topN,
filterIdentifiers: pinnedDocIdentifiers,
})
: {
contextTexts: [],
sources: [],
message: null,
};
// Abort if the similarity search was run and it failed.
if (!!vectorSearchResults.message) {
return formatJSON(
{
id: uuid,
type: "abort",
textResponse: null,
sources: [],
close: true,
error: vectorSearchResults.message,
},
{ model: workspace.slug, finish_reason: "abort" }
);
}
// For OpenAI Compatible chats, we cannot do backfilling so we simply aggregate results here.
contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
sources = [...sources, ...vectorSearchResults.sources];
// If in query mode and no context chunks were found from search or pins, exit early rather
// than letting the LLM hallucinate a response or fall back to general knowledge.
if (chatMode === "query" && contextTexts.length === 0) {
const textResponse =
workspace?.queryRefusalResponse ??
"There is no relevant information in this workspace to answer your query.";
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: prompt,
response: {
text: textResponse,
sources: [],
type: chatMode,
},
include: false,
});
return formatJSON(
{
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: null,
textResponse,
},
{ model: workspace.slug, finish_reason: "no_content" }
);
}
// Compress & assemble the messages to ensure the prompt fits the token limit with room for a
// response, and build the system message from the given inputs and chat history.
const messages = await LLMConnector.compressMessages({
systemPrompt: systemPrompt ?? chatPrompt(workspace),
userPrompt: prompt,
contextTexts,
chatHistory: history,
});
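// Illustrative only: the exact output of `compressMessages` depends on the connector, but
// OpenAI-compatible connectors are expected to yield an OpenAI-style messages array, roughly:
//   [
//     { role: "system", content: "<system prompt + context texts>" },
//     ...history,
//     { role: "user", content: "<prompt>" },
//   ]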
// Send the text completion.
const { textResponse, metrics } = await LLMConnector.getChatCompletion(
messages,
{
temperature:
temperature ?? workspace?.openAiTemp ?? LLMConnector.defaultTemp,
}
);
if (!textResponse) {
return formatJSON(
{
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: "No text completion could be completed with this input.",
textResponse: null,
},
{ model: workspace.slug, finish_reason: "no_content", usage: metrics }
);
}
const { chat } = await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: prompt,
response: { text: textResponse, sources, type: chatMode, metrics },
});
return formatJSON(
{
id: uuid,
type: "textResponse",
close: true,
error: null,
chatId: chat.id,
textResponse,
sources,
},
{ model: workspace.slug, finish_reason: "stop", usage: metrics }
);
}

async function streamChat({
workspace,
response,
systemPrompt = null,
history = [],
prompt = null,
temperature = null,
}) {
const uuid = uuidv4();
const chatMode = workspace?.chatMode ?? "chat";
const LLMConnector = getLLMProvider({
provider: workspace?.chatProvider,
model: workspace?.chatModel,
});
const VectorDb = getVectorDbClass();
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
// We don't want to write a new method for every LLM to support OpenAI-compatible calls
// via the `handleStreamResponseV2` method handler, so here we create a passthrough that,
// on writes to the main response, transforms the chunk into OpenAI format.
// Each chunk arrives from `writeResponseChunk` in the AnythingLLM response chunk schema,
// so we mutate it before forwarding.
const responseInterceptor = new PassThrough({});
responseInterceptor.on("data", (chunk) => {
try {
const originalData = JSON.parse(chunk.toString().split("data: ")[1]);
const modified = formatJSON(originalData, {
chunked: true,
model: workspace.slug,
}); // rewrite to OpenAI format
response.write(`data: ${JSON.stringify(modified)}\n\n`);
} catch (e) {
console.error(e);
}
});
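// Illustrative only: a chunk written by `writeResponseChunk` arrives here roughly as
//   data: {"uuid":"<uuid>","sources":[],"type":"textResponseChunk","textResponse":"Hello","close":false,"error":false}
// and `formatJSON` reshapes it into an OpenAI-style payload along the lines of
//   data: {"id":"<uuid>","object":"chat.completion","created":<timestamp>,"model":"<workspace-slug>",
//          "choices":[{"delta":{"role":"assistant","content":"Hello"},"logprobs":null,"finish_reason":null}],"usage":{}}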
// The user is trying to query-mode chat a workspace that has no data in it, so we should
// exit early since no information can be found under these conditions.
if ((!hasVectorizedSpace || embeddingsCount === 0) && chatMode === "query") {
const textResponse =
workspace?.queryRefusalResponse ??
"There is no relevant information in this workspace to answer your query.";
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: String(prompt),
response: {
text: textResponse,
sources: [],
type: chatMode,
},
include: false,
});
writeResponseChunk(
response,
formatJSON(
{
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: null,
textResponse,
},
{ chunked: true, model: workspace.slug, finish_reason: "abort" }
)
);
return;
}
// If we are here we know that we are in a workspace that is:
// 1. Chatting in "chat" mode and may or may _not_ have embeddings
// 2. Chatting in "query" mode and has at least 1 embedding
let contextTexts = [];
let sources = [];
let pinnedDocIdentifiers = [];
await new DocumentManager({
workspace,
maxTokens: LLMConnector.promptWindowLimit(),
})
.pinnedDocs()
.then((pinnedDocs) => {
pinnedDocs.forEach((doc) => {
const { pageContent, ...metadata } = doc;
pinnedDocIdentifiers.push(sourceIdentifier(doc));
contextTexts.push(doc.pageContent);
sources.push({
text:
pageContent.slice(0, 1_000) +
"...continued on in source document...",
...metadata,
});
});
});
const vectorSearchResults =
embeddingsCount !== 0
? await VectorDb.performSimilaritySearch({
namespace: workspace.slug,
input: prompt,
LLMConnector,
similarityThreshold: workspace?.similarityThreshold,
topN: workspace?.topN,
filterIdentifiers: pinnedDocIdentifiers,
})
: {
contextTexts: [],
sources: [],
message: null,
};
// Abort if the similarity search was run and it failed.
if (!!vectorSearchResults.message) {
writeResponseChunk(
response,
formatJSON(
{
id: uuid,
type: "abort",
textResponse: null,
sources: [],
close: true,
error: vectorSearchResults.message,
},
{ chunked: true, model: workspace.slug, finish_reason: "abort" }
)
);
return;
}
// For OpenAI Compatible chats, we cannot do backfilling so we simply aggregate results here.
contextTexts = [...contextTexts, ...vectorSearchResults.contextTexts];
sources = [...sources, ...vectorSearchResults.sources];
// If in query mode and no context chunks were found from search or pins, exit early rather
// than letting the LLM hallucinate a response or fall back to general knowledge.
if (chatMode === "query" && contextTexts.length === 0) {
const textResponse =
workspace?.queryRefusalResponse ??
"There is no relevant information in this workspace to answer your query.";
await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: prompt,
response: {
text: textResponse,
sources: [],
type: chatMode,
},
include: false,
});
writeResponseChunk(
response,
formatJSON(
{
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: null,
textResponse,
},
{ chunked: true, model: workspace.slug, finish_reason: "no_content" }
)
);
return;
}
// Compress & assemble the messages to ensure the prompt fits the token limit with room for a
// response, and build the system message from the given inputs and chat history.
const messages = await LLMConnector.compressMessages({
systemPrompt: systemPrompt ?? chatPrompt(workspace),
userPrompt: prompt,
contextTexts,
chatHistory: history,
});
if (!LLMConnector.streamingEnabled()) {
writeResponseChunk(
response,
formatJSON(
{
id: uuid,
type: "textResponse",
sources: [],
close: true,
error: "Streaming is not available for the connected LLM Provider",
textResponse: null,
},
{
chunked: true,
model: workspace.slug,
finish_reason: "streaming_disabled",
}
)
);
return;
}
const stream = await LLMConnector.streamGetChatCompletion(messages, {
temperature:
temperature ?? workspace?.openAiTemp ?? LLMConnector.defaultTemp,
});
const completeText = await LLMConnector.handleStream(
responseInterceptor,
stream,
{
uuid,
sources,
}
);
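// Note: `stream.metrics` is attached by the connector's stream handler. The assumption below is
// that it mirrors OpenAI usage reporting (prompt_tokens, completion_tokens, total_tokens) where
// the provider exposes those counts, and may be empty when it does not.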
if (completeText?.length > 0) {
const { chat } = await WorkspaceChats.new({
workspaceId: workspace.id,
prompt: prompt,
response: {
text: completeText,
sources,
type: chatMode,
metrics: stream.metrics,
},
});
writeResponseChunk(
response,
formatJSON(
{
uuid,
type: "finalizeResponseStream",
close: true,
error: false,
chatId: chat.id,
textResponse: "",
},
{
chunked: true,
model: workspace.slug,
finish_reason: "stop",
usage: stream.metrics,
}
)
);
return;
}
writeResponseChunk(
response,
formatJSON(
{
uuid,
type: "finalizeResponseStream",
close: true,
error: false,
textResponse: "",
},
{
chunked: true,
model: workspace.slug,
finish_reason: "stop",
usage: stream.metrics,
}
)
);
return;
}

function formatJSON(
chat,
{ chunked = false, model, finish_reason = null, usage = {} }
) {
const data = {
id: chat.uuid ?? chat.id,
object: "chat.completion",
created: Number(new Date()),
model: model,
choices: [
{
[chunked ? "delta" : "message"]: {
role: "assistant",
content: chat.textResponse,
},
logprobs: null,
finish_reason: finish_reason,
},
],
usage,
};
return data;
}
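
// Sketch of the non-chunked shape `formatJSON` produces, assuming `usage` is the metrics object
// reported by the connector (the usage field names shown are illustrative, not guaranteed):
//   {
//     id: "<uuid>",
//     object: "chat.completion",
//     created: 1734380000000, // Number(new Date()) - milliseconds
//     model: "<workspace-slug>",
//     choices: [
//       {
//         message: { role: "assistant", content: "<textResponse>" },
//         logprobs: null,
//         finish_reason: "stop",
//       },
//     ],
//     usage: { prompt_tokens: 123, completion_tokens: 45, total_tokens: 168 },
//   }
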
module.exports.OpenAICompatibleChat = {
chatSync,
streamChat,
};
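
/*
  Hypothetical usage sketch - `app`, `validWorkspace`, and the route path below are assumptions
  for illustration only, not part of this module or its callers. It shows how an Express handler
  could proxy an OpenAI-style chat completions request through these helpers:

  const { OpenAICompatibleChat } = require("./openaiCompatible");

  app.post("/v1/openai/chat/completions", async (request, response) => {
    const workspace = await validWorkspace(request); // assumed workspace lookup helper
    const { messages, temperature = null, stream = false } = request.body;
    const prompt = messages?.[messages.length - 1]?.content ?? null;

    if (!stream) {
      const result = await OpenAICompatibleChat.chatSync({ workspace, prompt, temperature });
      return response.json(result);
    }

    response.setHeader("Content-Type", "text/event-stream");
    await OpenAICompatibleChat.streamChat({ workspace, response, prompt, temperature });
    response.end();
  });
*/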