Mirror of https://github.com/Mintplex-Labs/anything-llm.git
Patch PPLX streaming for timeouts (#3130)
Add in-text citations for PPLX token streaming; handle timeouts when the stream/buffer hangs.
Parent: df8d34d31e
Commit: 1bfd461719
3 changed files with 145 additions and 2 deletions
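The timeout portion of the patch works as a watchdog: every streamed chunk updates a last-seen timestamp, and a polling interval closes the client response if no chunk has arrived within a threshold (800 ms in the diff below). The following is a minimal standalone sketch of that pattern; the helper name createStreamWatchdog and the usage at the bottom are illustrative and not part of the patch.

// Minimal sketch of a stale-stream watchdog: record progress on every chunk,
// and fire a callback once no chunk has arrived within `thresholdMs`.
function createStreamWatchdog(thresholdMs, onStale) {
  let lastChunkTime = null;
  const timer = setInterval(() => {
    if (lastChunkTime === null) return; // nothing received yet
    if (Date.now() - lastChunkTime >= thresholdMs) {
      clearInterval(timer);
      onStale();
    }
  }, 500);

  return {
    markChunk: () => {
      lastChunkTime = Date.now();
    },
    stop: () => clearInterval(timer), // call when the stream ends normally
  };
}

// Illustrative usage: mark each chunk as it arrives; if the provider stops
// sending chunks without closing the stream, onStale can flush and close.
const watchdog = createStreamWatchdog(800, () => {
  console.log("Stream went stale; closing the response with what we have.");
});
watchdog.markChunk();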
Changed files (under server/utils/AiProviders/perplexity):
@@ -1,6 +1,8 @@
+const { v4: uuidv4 } = require("uuid");
 const { NativeEmbedder } = require("../../EmbeddingEngines/native");
 const {
-  handleDefaultStreamResponseV2,
+  writeResponseChunk,
+  clientAbortedHandler,
 } = require("../../helpers/chat/responses");
 const {
   LLMPerformanceMonitor,
@@ -137,8 +139,137 @@ class PerplexityLLM {
     return measuredStreamRequest;
   }

+  enrichToken(token, citations) {
+    if (Array.isArray(citations) && citations.length !== 0) {
+      return token.replace(/\[(\d+)\]/g, (match, index) => {
+        const citationIndex = parseInt(index) - 1;
+        return citations[citationIndex]
+          ? `[[${index}](${citations[citationIndex]})]`
+          : match;
+      });
+    }
+    return token;
+  }
+
   handleStream(response, stream, responseProps) {
-    return handleDefaultStreamResponseV2(response, stream, responseProps);
+    const timeoutThresholdMs = 800;
+    const { uuid = uuidv4(), sources = [] } = responseProps;
+    let hasUsageMetrics = false;
+    let pplxCitations = []; // Array of links
+    let usage = {
+      completion_tokens: 0,
+    };
+
+    return new Promise(async (resolve) => {
+      let fullText = "";
+      let lastChunkTime = null;
+
+      const handleAbort = () => {
+        stream?.endMeasurement(usage);
+        clientAbortedHandler(resolve, fullText);
+      };
+      response.on("close", handleAbort);
+
+      const timeoutCheck = setInterval(() => {
+        if (lastChunkTime === null) return;
+
+        const now = Number(new Date());
+        const diffMs = now - lastChunkTime;
+        if (diffMs >= timeoutThresholdMs) {
+          console.log(
+            `Perplexity stream did not self-close and has been stale for >${timeoutThresholdMs}ms. Closing response stream.`
+          );
+          writeResponseChunk(response, {
+            uuid,
+            sources,
+            type: "textResponseChunk",
+            textResponse: "",
+            close: true,
+            error: false,
+          });
+          clearInterval(timeoutCheck);
+          response.removeListener("close", handleAbort);
+          stream?.endMeasurement(usage);
+          resolve(fullText);
+        }
+      }, 500);
+
+      // Now handle the chunks from the streamed response and append to fullText.
+      try {
+        for await (const chunk of stream) {
+          lastChunkTime = Number(new Date());
+          const message = chunk?.choices?.[0];
+          const token = message?.delta?.content;
+
+          if (Array.isArray(chunk.citations) && chunk.citations.length !== 0) {
+            pplxCitations = chunk.citations;
+          }
+
+          // If we see usage metrics in the chunk, we can use them directly
+          // instead of estimating them, but we only want to assign values if
+          // the response object is the exact same key:value pair we expect.
+          if (
+            chunk.hasOwnProperty("usage") && // exists
+            !!chunk.usage && // is not null
+            Object.values(chunk.usage).length > 0 // has values
+          ) {
+            if (chunk.usage.hasOwnProperty("prompt_tokens")) {
+              usage.prompt_tokens = Number(chunk.usage.prompt_tokens);
+            }
+
+            if (chunk.usage.hasOwnProperty("completion_tokens")) {
+              hasUsageMetrics = true; // to stop estimating counter
+              usage.completion_tokens = Number(chunk.usage.completion_tokens);
+            }
+          }
+
+          if (token) {
+            let enrichedToken = this.enrichToken(token, pplxCitations);
+            fullText += enrichedToken;
+            if (!hasUsageMetrics) usage.completion_tokens++;
+
+            writeResponseChunk(response, {
+              uuid,
+              sources: [],
+              type: "textResponseChunk",
+              textResponse: enrichedToken,
+              close: false,
+              error: false,
+            });
+          }
+
+          if (message?.finish_reason) {
+            console.log("closing");
+            writeResponseChunk(response, {
+              uuid,
+              sources,
+              type: "textResponseChunk",
+              textResponse: "",
+              close: true,
+              error: false,
+            });
+            response.removeListener("close", handleAbort);
+            stream?.endMeasurement(usage);
+            clearInterval(timeoutCheck);
+            resolve(fullText);
+            break; // Break streaming when a valid finish_reason is first encountered
+          }
+        }
+      } catch (e) {
+        console.log(`\x1b[43m\x1b[34m[STREAMING ERROR]\x1b[0m ${e.message}`);
+        writeResponseChunk(response, {
+          uuid,
+          type: "abort",
+          textResponse: null,
+          sources: [],
+          close: true,
+          error: e.message,
+        });
+        stream?.endMeasurement(usage);
+        clearInterval(timeoutCheck);
+        resolve(fullText); // Return what we currently have - if anything.
+      }
+    });
   }

   // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
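As a reading aid for the enrichToken change above, here is a standalone restatement of the same rewrite rule with a made-up token and citation list; only the function body mirrors the patch, and the sample values are illustrative.

// Replace "[n]" markers with "[[n](url)]" when the n-th citation is known;
// markers without a matching citation are left untouched.
function enrichToken(token, citations) {
  if (Array.isArray(citations) && citations.length !== 0) {
    return token.replace(/\[(\d+)\]/g, (match, index) => {
      const citationIndex = parseInt(index) - 1;
      return citations[citationIndex]
        ? `[[${index}](${citations[citationIndex]})]`
        : match;
    });
  }
  return token;
}

const citations = ["https://example.com/a", "https://example.com/b"];
console.log(enrichToken("Boiling point is 100C [1] at sea level [3].", citations));
// -> "Boiling point is 100C [[1](https://example.com/a)] at sea level [3]."
// "[3]" survives unchanged because there is no third citation URL.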
@@ -1,4 +1,14 @@
 const MODELS = {
+  "sonar-reasoning-pro": {
+    id: "sonar-reasoning-pro",
+    name: "sonar-reasoning-pro",
+    maxLength: 127072,
+  },
+  "sonar-reasoning": {
+    id: "sonar-reasoning",
+    name: "sonar-reasoning",
+    maxLength: 127072,
+  },
   "sonar-pro": {
     id: "sonar-pro",
     name: "sonar-pro",
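The hunk above extends the provider's model map with the two sonar-reasoning variants. A minimal lookup sketch, assuming the MODELS map is available as shown, illustrates how a caller could resolve a context window from these entries; the helper name contextWindowFor and the fallback value are illustrative, not part of the patch.

// Hypothetical helper: resolve a model's context window from the MODELS map,
// falling back to a conservative default for unknown model ids.
const MODELS = {
  "sonar-reasoning-pro": { id: "sonar-reasoning-pro", name: "sonar-reasoning-pro", maxLength: 127072 },
  "sonar-reasoning": { id: "sonar-reasoning", name: "sonar-reasoning", maxLength: 127072 },
};

function contextWindowFor(modelName, fallback = 4096) {
  return MODELS[modelName]?.maxLength ?? fallback;
}

console.log(contextWindowFor("sonar-reasoning")); // 127072
console.log(contextWindowFor("some-unknown-model")); // 4096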
@@ -1,4 +1,6 @@
 | Model                                | Parameter Count | Context Length | Model Type      |
 | :---------------------------------- | :-------------- | :------------- | :-------------- |
+| `sonar-reasoning-pro`                | 8B              | 127,072        | Chat Completion |
+| `sonar-reasoning`                    | 8B              | 127,072        | Chat Completion |
 | `sonar-pro`                          | 8B              | 200,000        | Chat Completion |
 | `sonar`                              | 8B              | 127,072        | Chat Completion |