diff --git a/frontend/src/components/LLMSelection/OllamaLLMOptions/index.jsx b/frontend/src/components/LLMSelection/OllamaLLMOptions/index.jsx
index 16855b359..9bd95bca7 100644
--- a/frontend/src/components/LLMSelection/OllamaLLMOptions/index.jsx
+++ b/frontend/src/components/LLMSelection/OllamaLLMOptions/index.jsx
@@ -2,8 +2,9 @@ import React, { useEffect, useState } from "react";
 import System from "@/models/system";
 import PreLoader from "@/components/Preloader";
 import { OLLAMA_COMMON_URLS } from "@/utils/constants";
-import { CaretDown, CaretUp } from "@phosphor-icons/react";
+import { CaretDown, CaretUp, Info } from "@phosphor-icons/react";
 import useProviderEndpointAutoDiscovery from "@/hooks/useProviderEndpointAutoDiscovery";
+import { Tooltip } from "react-tooltip";
 
 export default function OllamaLLMOptions({ settings }) {
   const {
@@ -18,15 +19,13 @@ export default function OllamaLLMOptions({ settings }) {
     initialBasePath: settings?.OllamaLLMBasePath,
     ENDPOINTS: OLLAMA_COMMON_URLS,
   });
-
+  const [performanceMode, setPerformanceMode] = useState(
+    settings?.OllamaLLMPerformanceMode || "base"
+  );
   const [maxTokens, setMaxTokens] = useState(
     settings?.OllamaLLMTokenLimit || 4096
   );
 
-  const handleMaxTokensChange = (e) => {
-    setMaxTokens(Number(e.target.value));
-  };
-
   return (
     <div className="w-full flex flex-col gap-y-7">
       <div className="w-full flex items-start gap-[36px] mt-1.5">
@@ -46,7 +45,7 @@ export default function OllamaLLMOptions({ settings }) {
             defaultChecked="4096"
             min={1}
             value={maxTokens}
-            onChange={handleMaxTokensChange}
+            onChange={(e) => setMaxTokens(Number(e.target.value))}
             onScroll={(e) => e.target.blur()}
             required={true}
             autoComplete="off"
@@ -64,7 +63,7 @@ export default function OllamaLLMOptions({ settings }) {
           }}
           className="text-white hover:text-white/70 flex items-center text-sm"
         >
-          {showAdvancedControls ? "Hide" : "Show"} Manual Endpoint Input
+          {showAdvancedControls ? "Hide" : "Show"} advanced settings
           {showAdvancedControls ? (
             <CaretUp size={14} className="ml-1" />
           ) : (
@@ -134,12 +133,57 @@ export default function OllamaLLMOptions({ settings }) {
                 className="underline text-blue-300"
                 href="https://github.com/ollama/ollama/blob/main/docs/faq.md#how-do-i-keep-a-model-loaded-in-memory-or-make-it-unload-immediately"
                 target="_blank"
+                rel="noreferrer"
               >
                 {" "}
                 Learn more →
               </a>
             </p>
           </div>
+
+          <div className="flex flex-col w-60">
+            <label className="text-white text-sm font-semibold mb-2 flex items-center">
+              Performance Mode
+              <Info
+                size={16}
+                className="ml-2 text-white"
+                data-tooltip-id="performance-mode-tooltip"
+              />
+            </label>
+            <select
+              name="OllamaLLMPerformanceMode"
+              required={true}
+              className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+              value={performanceMode}
+              onChange={(e) => setPerformanceMode(e.target.value)}
+            >
+              <option value="base">Base (Default)</option>
+              <option value="maximum">Maximum</option>
+            </select>
+            <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+              Choose the performance mode for the Ollama model.
+            </p>
+            <Tooltip
+              id="performance-mode-tooltip"
+              place="bottom"
+              className="tooltip !text-xs max-w-xs"
+            >
+              <p className="text-red-500">
+                <strong>Note:</strong> Only change this setting if you
+                understand its implications on performance and resource usage.
+              </p>
+              <br />
+              <p>
+                <strong>Base:</strong> Ollama automatically limits the context
+                to 2048 tokens, reducing VRAM usage. Suitable for most users.
+              </p>
+              <br />
+              <p>
+                <strong>Maximum:</strong> Uses the full context window (up to
+                Max Tokens). May increase VRAM usage significantly.
+              </p>
+            </Tooltip>
+          </div>
         </div>
       </div>
     </div>
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index 485837506..216f63ad5 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -412,6 +412,7 @@
     OllamaLLMModelPref: process.env.OLLAMA_MODEL_PREF,
     OllamaLLMTokenLimit: process.env.OLLAMA_MODEL_TOKEN_LIMIT,
     OllamaLLMKeepAliveSeconds: process.env.OLLAMA_KEEP_ALIVE_TIMEOUT ?? 300,
+    OllamaLLMPerformanceMode: process.env.OLLAMA_PERFORMANCE_MODE ?? "base",
 
     // TogetherAI Keys
     TogetherAiApiKey: !!process.env.TOGETHER_AI_API_KEY,
diff --git a/server/utils/AiProviders/ollama/index.js b/server/utils/AiProviders/ollama/index.js
index a4e99f788..02e780777 100644
--- a/server/utils/AiProviders/ollama/index.js
+++ b/server/utils/AiProviders/ollama/index.js
@@ -13,6 +13,7 @@ class OllamaAILLM {
 
     this.basePath = process.env.OLLAMA_BASE_PATH;
     this.model = modelPreference || process.env.OLLAMA_MODEL_PREF;
+    this.performanceMode = process.env.OLLAMA_PERFORMANCE_MODE || "base";
     this.keepAlive = process.env.OLLAMA_KEEP_ALIVE_TIMEOUT
       ? Number(process.env.OLLAMA_KEEP_ALIVE_TIMEOUT)
       : 300; // Default 5-minute timeout for Ollama model loading.
@@ -33,6 +34,10 @@ class OllamaAILLM {
       model: this.model,
       keepAlive: this.keepAlive,
       useMLock: true,
+      // There are currently only two performance settings so if its not "base" - its max context.
+      ...(this.performanceMode === "base"
+        ? {}
+        : { numCtx: this.promptWindowLimit() }),
       temperature,
     });
   }
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index f1cedb707..85981994d 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -101,6 +101,10 @@
     envKey: "OLLAMA_MODEL_TOKEN_LIMIT",
     checks: [nonZero],
   },
+  OllamaLLMPerformanceMode: {
+    envKey: "OLLAMA_PERFORMANCE_MODE",
+    checks: [],
+  },
   OllamaLLMKeepAliveSeconds: {
     envKey: "OLLAMA_KEEP_ALIVE_TIMEOUT",
     checks: [isInteger],
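
A minimal sketch (not part of the patch above) of how the new OLLAMA_PERFORMANCE_MODE value flows into the Ollama client options, assuming promptWindowLimit() resolves to the configured OLLAMA_MODEL_TOKEN_LIMIT; the standalone variable names below are illustrative only:

// Illustrative sketch mirroring the option spread added in server/utils/AiProviders/ollama/index.js.
// "base" leaves numCtx unset, so Ollama falls back to its own 2048-token context default;
// any other value (currently only "maximum") pins numCtx to the configured token limit.
const performanceMode = process.env.OLLAMA_PERFORMANCE_MODE || "base";
const maxTokens = Number(process.env.OLLAMA_MODEL_TOKEN_LIMIT ?? 4096); // 4096 matches the frontend default
const ollamaOptions = {
  model: process.env.OLLAMA_MODEL_PREF,
  keepAlive: Number(process.env.OLLAMA_KEEP_ALIVE_TIMEOUT ?? 300),
  useMLock: true,
  ...(performanceMode === "base" ? {} : { numCtx: maxTokens }),
};

In practice, leaving OLLAMA_PERFORMANCE_MODE unset (or set to "base") keeps Ollama's lower default VRAM footprint, while OLLAMA_PERFORMANCE_MODE=maximum requests the full configured context window at the cost of potentially much higher VRAM usage, matching the warning in the new Performance Mode tooltip.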