const { v4: uuidv4 } = require("uuid");
const {
  writeResponseChunk,
  clientAbortedHandler,
  formatChatHistory,
} = require("../../helpers/chat/responses");
const { NativeEmbedder } = require("../../EmbeddingEngines/native");
const {
  LLMPerformanceMonitor,
} = require("../../helpers/chat/LLMPerformanceMonitor");
const { Ollama } = require("ollama");

// Docs: https://github.com/jmorganca/ollama/blob/main/docs/api.md
class OllamaAILLM {
  constructor(embedder = null, modelPreference = null) {
    if (!process.env.OLLAMA_BASE_PATH)
      throw new Error("No Ollama Base Path was set.");

    this.basePath = process.env.OLLAMA_BASE_PATH;
    this.model = modelPreference || process.env.OLLAMA_MODEL_PREF;
    this.performanceMode = process.env.OLLAMA_PERFORMANCE_MODE || "base";
    this.keepAlive = process.env.OLLAMA_KEEP_ALIVE_TIMEOUT
      ? Number(process.env.OLLAMA_KEEP_ALIVE_TIMEOUT)
      : 300; // Default 5-minute timeout for Ollama model loading.

    this.limits = {
      history: this.promptWindowLimit() * 0.15,
      system: this.promptWindowLimit() * 0.15,
      user: this.promptWindowLimit() * 0.7,
    };

    this.client = new Ollama({ host: this.basePath });
    this.embedder = embedder ?? new NativeEmbedder();
    this.defaultTemp = 0.7;
    this.#log(
      `OllamaAILLM initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}\nn_ctx: ${this.promptWindowLimit()}`
    );
  }

  #log(text, ...args) {
    console.log(`\x1b[32m[Ollama]\x1b[0m ${text}`, ...args);
  }

  #appendContext(contextTexts = []) {
    if (!contextTexts || !contextTexts.length) return "";
    return (
      "\nContext:\n" +
      contextTexts
        .map((text, i) => {
          return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
        })
        .join("")
    );
  }

  streamingEnabled() {
    return "streamGetChatCompletion" in this;
  }

  static promptWindowLimit(_modelName) {
    const limit = process.env.OLLAMA_MODEL_TOKEN_LIMIT || 4096;
    if (!limit || isNaN(Number(limit)))
      throw new Error("No Ollama token context limit was set.");
    return Number(limit);
  }

  // Ensure the user set a value for the token limit
  // and if undefined - assume a 4096-token window.
  promptWindowLimit() {
    const limit = process.env.OLLAMA_MODEL_TOKEN_LIMIT || 4096;
    if (!limit || isNaN(Number(limit)))
      throw new Error("No Ollama token context limit was set.");
    return Number(limit);
  }

  async isValidChatCompletionModel(_ = "") {
    return true;
  }

  /**
   * Generates appropriate content array for a message + attachments.
   * @param {{userPrompt: string, attachments: import("../../helpers").Attachment[]}}
   * @returns {{content: string, images: string[]}}
   */
  #generateContent({ userPrompt, attachments = [] }) {
    if (!attachments.length) return { content: userPrompt };
    const images = attachments.map(
      (attachment) => attachment.contentString.split("base64,").slice(-1)[0]
    );
    return { content: userPrompt, images };
  }

  /**
   * Handles errors from the Ollama API to make them more user friendly.
   * @param {Error} e
   */
  #errorHandler(e) {
    switch (e.message) {
      case "fetch failed":
        throw new Error(
          "Your Ollama instance could not be reached or is not responding. Please make sure it is running the API server and your connection information is correct in AnythingLLM."
        );
      default:
        return e;
    }
  }

  /**
   * Construct the chat messages array (system prompt + context, history, latest user turn) for this model.
   * @param {{systemPrompt: string, contextTexts: string[], chatHistory: object[], userPrompt: string, attachments: import("../../helpers").Attachment[]}} param0
   * @returns {object[]} The messages array to send to Ollama.
   */
  constructPrompt({
    systemPrompt = "",
    contextTexts = [],
    chatHistory = [],
    userPrompt = "",
    attachments = [],
  }) {
    const prompt = {
      role: "system",
      content: `${systemPrompt}${this.#appendContext(contextTexts)}`,
    };

    return [
      prompt,
      ...formatChatHistory(chatHistory, this.#generateContent, "spread"),
      {
        role: "user",
        ...this.#generateContent({ userPrompt, attachments }),
      },
    ];
  }
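
  // Illustrative sketch of the array constructPrompt() returns (values are placeholders,
  // not taken from a real request): a system message with appended context, the formatted
  // chat history, then the latest user turn with any attachments spread into `images`.
  //
  //   [
  //     { role: "system", content: "<systemPrompt>\nContext:\n[CONTEXT 0]:\n<text>\n[END CONTEXT 0]\n\n" },
  //     { role: "user", content: "<prior user message>" },
  //     { role: "assistant", content: "<prior assistant reply>" },
  //     { role: "user", content: "<userPrompt>", images: ["<base64 payload>"] },
  //   ]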

  async getChatCompletion(messages = null, { temperature = 0.7 }) {
    const result = await LLMPerformanceMonitor.measureAsyncFunction(
      this.client
        .chat({
          model: this.model,
          stream: false,
          messages,
          keep_alive: this.keepAlive,
          options: {
            temperature,
            use_mlock: true,
            // There are currently only two performance settings so if it's not "base" - it's max context.
            ...(this.performanceMode === "base"
              ? {}
              : { num_ctx: this.promptWindowLimit() }),
          },
        })
        .then((res) => {
          return {
            content: res.message.content,
            usage: {
              prompt_tokens: res.prompt_eval_count,
              completion_tokens: res.eval_count,
              total_tokens: res.prompt_eval_count + res.eval_count,
            },
          };
        })
        .catch((e) => {
          throw new Error(
            `Ollama::getChatCompletion failed to communicate with Ollama. ${this.#errorHandler(e).message}`
          );
        })
    );

    if (!result.output.content || !result.output.content.length)
      throw new Error(`Ollama::getChatCompletion text response was empty.`);

    return {
      textResponse: result.output.content,
      metrics: {
        prompt_tokens: result.output.usage.prompt_tokens,
        completion_tokens: result.output.usage.completion_tokens,
        total_tokens: result.output.usage.total_tokens,
        outputTps: result.output.usage.completion_tokens / result.duration,
        duration: result.duration,
      },
    };
  }

  async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
    const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
      this.client.chat({
        model: this.model,
        stream: true,
        messages,
        keep_alive: this.keepAlive,
        options: {
          temperature,
          use_mlock: true,
          // There are currently only two performance settings so if it's not "base" - it's max context.
          ...(this.performanceMode === "base"
            ? {}
            : { num_ctx: this.promptWindowLimit() }),
        },
      }),
      messages,
      false
    ).catch((e) => {
      throw this.#errorHandler(e);
    });
    return measuredStreamRequest;
  }
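
  // The measured stream returned above is consumed by handleStream() below, which forwards
  // each chunk to the HTTP response and closes out the measurement with the final usage counts.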

  /**
   * Handles streaming responses from Ollama.
   * @param {import("express").Response} response
   * @param {import("../../helpers/chat/LLMPerformanceMonitor").MonitoredStream} stream
   * @param {Object} responseProps
   * @returns {Promise<string>}
   */
  handleStream(response, stream, responseProps) {
    const { uuid = uuidv4(), sources = [] } = responseProps;
    return new Promise(async (resolve) => {
      let fullText = "";
      let usage = {
        prompt_tokens: 0,
        completion_tokens: 0,
      };

      // Establish listener to early-abort a streaming response
      // in case things go sideways or the user does not like the response.
      // We preserve the generated text but continue as if chat was completed
      // to preserve previously generated content.
      const handleAbort = () => {
        stream?.endMeasurement(usage);
        clientAbortedHandler(resolve, fullText);
      };
      response.on("close", handleAbort);

      try {
        for await (const chunk of stream) {
          if (chunk === undefined)
            throw new Error(
              "Stream returned undefined chunk. Aborting reply - check model provider logs."
            );

          if (chunk.done) {
            usage.prompt_tokens = chunk.prompt_eval_count;
            usage.completion_tokens = chunk.eval_count;
            writeResponseChunk(response, {
              uuid,
              sources,
              type: "textResponseChunk",
              textResponse: "",
              close: true,
              error: false,
            });
            response.removeListener("close", handleAbort);
            stream?.endMeasurement(usage);
            resolve(fullText);
            break;
          }

          if (chunk.hasOwnProperty("message")) {
            const content = chunk.message.content;
            fullText += content;
            writeResponseChunk(response, {
              uuid,
              sources,
              type: "textResponseChunk",
              textResponse: content,
              close: false,
              error: false,
            });
          }
        }
      } catch (error) {
        writeResponseChunk(response, {
          uuid,
          sources: [],
          type: "textResponseChunk",
          textResponse: "",
          close: true,
          error: `Ollama:streaming - could not stream chat. ${
            error?.cause ?? error.message
          }`,
        });
        response.removeListener("close", handleAbort);
        stream?.endMeasurement(usage);
        resolve(fullText);
      }
    });
  }

  // Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
  async embedTextInput(textInput) {
    return await this.embedder.embedTextInput(textInput);
  }
  async embedChunks(textChunks = []) {
    return await this.embedder.embedChunks(textChunks);
  }

  async compressMessages(promptArgs = {}, rawHistory = []) {
    const { messageArrayCompressor } = require("../../helpers/chat");
    const messageArray = this.constructPrompt(promptArgs);
    return await messageArrayCompressor(this, messageArray, rawHistory);
  }
}

module.exports = {
  OllamaAILLM,
};
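
// Illustrative usage sketch (not part of the module). Assumes OLLAMA_BASE_PATH and
// OLLAMA_MODEL_PREF are set in the environment and that this runs inside an async
// function; the require path and the "llama3:8b" model tag are placeholders.
//
//   const { OllamaAILLM } = require("./index");
//   const llm = new OllamaAILLM(null, "llama3:8b");
//   const messages = llm.constructPrompt({
//     systemPrompt: "You are a helpful assistant.",
//     contextTexts: [],
//     chatHistory: [],
//     userPrompt: "Hello!",
//   });
//   const { textResponse, metrics } = await llm.getChatCompletion(messages, {
//     temperature: llm.defaultTemp,
//   });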