diff --git a/server/utils/EmbeddingEngines/ollama/index.js b/server/utils/EmbeddingEngines/ollama/index.js
index 8f7239c7e..7e2f636e2 100644
--- a/server/utils/EmbeddingEngines/ollama/index.js
+++ b/server/utils/EmbeddingEngines/ollama/index.js
@@ -1,4 +1,5 @@
 const { maximumChunkLength } = require("../../helpers");
+const { Ollama } = require("ollama");
 
 class OllamaEmbedder {
   constructor() {
@@ -7,21 +8,27 @@ class OllamaEmbedder {
     if (!process.env.EMBEDDING_MODEL_PREF)
       throw new Error("No embedding model was set.");
 
-    this.basePath = `${process.env.EMBEDDING_BASE_PATH}/api/embeddings`;
+    this.basePath = process.env.EMBEDDING_BASE_PATH;
     this.model = process.env.EMBEDDING_MODEL_PREF;
     // Limit of how many strings we can process in a single pass to stay with resource or network limits
     this.maxConcurrentChunks = 1;
     this.embeddingMaxChunkLength = maximumChunkLength();
+    this.client = new Ollama({ host: this.basePath });
+    this.log(
+      `initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
+    );
   }
 
   log(text, ...args) {
    console.log(`\x1b[36m[${this.constructor.name}]\x1b[0m ${text}`, ...args);
   }
 
+  /**
+   * Checks if the Ollama service is alive by pinging the base path.
+   * @returns {Promise<boolean>} - A promise that resolves to true if the service is alive, false otherwise.
+   */
   async #isAlive() {
-    return await fetch(process.env.EMBEDDING_BASE_PATH, {
-      method: "HEAD",
-    })
+    return await fetch(this.basePath)
       .then((res) => res.ok)
       .catch((e) => {
         this.log(e.message);
@@ -40,6 +47,13 @@ class OllamaEmbedder {
    * This function takes an array of text chunks and embeds them using the Ollama API.
    * chunks are processed sequentially to avoid overwhelming the API with too many requests
    * or running out of resources on the endpoint running the ollama instance.
+   *
+   * We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
+   * so that the maximum context window is used and content is not truncated.
+   *
+   * We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
+   * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
+   * constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
    * @param {string[]} textChunks - An array of text chunks to embed.
    * @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
    */
@@ -48,7 +62,6 @@ class OllamaEmbedder {
       throw new Error(
         `Ollama service could not be reached. Is Ollama running?`
       );
-
     this.log(
       `Embedding ${textChunks.length} chunks of text with ${this.model}.`
     );
@@ -58,15 +71,17 @@ class OllamaEmbedder {
 
     for (const chunk of textChunks) {
       try {
-        const res = await fetch(this.basePath, {
-          method: "POST",
-          body: JSON.stringify({
-            model: this.model,
-            prompt: chunk,
-          }),
+        const res = await this.client.embeddings({
+          model: this.model,
+          prompt: chunk,
+          options: {
+            // Always set the num_ctx to the max chunk length defined by the user in the settings
+            // so that the maximum context window is used and content is not truncated.
+            num_ctx: this.embeddingMaxChunkLength,
+          },
         });
-        const { embedding } = await res.json();
+        const { embedding } = res;
 
         if (!Array.isArray(embedding) || embedding.length === 0)
           throw new Error("Ollama returned an empty embedding for chunk!");
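
For context, here is a minimal standalone sketch of the `ollama` client usage this patch adopts (the `embeddings({ model, prompt, options })` call with `num_ctx`). The host URL, model name, and `num_ctx` value below are illustrative placeholders, not values taken from the patch, and it assumes the `ollama` npm package is installed and an Ollama server is reachable locally:

```js
// Sketch only: mirrors the client call introduced in the diff above.
// Host, model, and num_ctx are placeholder assumptions for illustration.
const { Ollama } = require("ollama");

async function embedOne(text) {
  const client = new Ollama({ host: "http://127.0.0.1:11434" });
  const res = await client.embeddings({
    model: "nomic-embed-text", // placeholder embedding model
    prompt: text,
    options: {
      // As in the patch: widen the context window so long chunks are not truncated.
      num_ctx: 8192,
    },
  });
  return res.embedding; // number[]
}

embedOne("Hello, world!")
  .then((vector) => console.log(`Got a ${vector.length}-dimension embedding`))
  .catch((err) => console.error(err.message));
```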