Mirror of https://github.com/Mintplex-Labs/anything-llm.git (synced 2025-04-23 21:18:12 +00:00)

Enable customization of chunk length and overlap (#1059)

* Enable customization of chunk length and overlap
* Fix onboarding link; show the max limit in the UI and prevent overlap >= chunk size

Parent: 1c11a47f93
Commit: ce98ff4653

19 changed files with 455 additions and 47 deletions
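Taken together, the hunks below store two admin-tunable values (text_splitter_chunk_size and text_splitter_chunk_overlap) as system settings and feed them into a new TextSplitter wrapper at embed time, clamping the preferred size to whatever the selected embedder can handle. A minimal standalone sketch of that reconciliation, assuming illustrative names and values (this is not the project's module; the real logic lives in determineMaxChunkSize further down):

// Sketch: how a saved preference is reconciled with an embedder's hard limit.
// `preferredSize` would come from system settings; `embedderLimit` from the
// selected embedder (e.g. 8191 for OpenAI embedding models per the diff below).
function resolveChunkSize(preferredSize, embedderLimit = 1000) {
  const preferred = Number(preferredSize);
  if (preferredSize === null || Number.isNaN(preferred)) return embedderLimit;
  return Math.min(preferred, embedderLimit);
}

console.log(resolveChunkSize(500, 8191));   // 500  - preference respected
console.log(resolveChunkSize(20000, 8191)); // 8191 - clamped to the model max
console.log(resolveChunkSize(null, 8191));  // 8191 - no preference, use the limit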
@@ -35,6 +35,9 @@ const GeneralTranscriptionPreference = lazy(
 const GeneralEmbeddingPreference = lazy(
   () => import("@/pages/GeneralSettings/EmbeddingPreference")
 );
+const EmbeddingTextSplitterPreference = lazy(
+  () => import("@/pages/GeneralSettings/EmbeddingTextSplitterPreference")
+);
 const GeneralVectorDatabase = lazy(
   () => import("@/pages/GeneralSettings/VectorDatabase")
 );
@@ -86,6 +89,12 @@ export default function App() {
             path="/settings/embedding-preference"
             element={<AdminRoute Component={GeneralEmbeddingPreference} />}
           />
+          <Route
+            path="/settings/text-splitter-preference"
+            element={
+              <AdminRoute Component={EmbeddingTextSplitterPreference} />
+            }
+          />
           <Route
             path="/settings/vector-database"
             element={<AdminRoute Component={GeneralVectorDatabase} />}
@@ -21,7 +21,7 @@ export default function LMStudioOptions({ settings, showAlert = false }) {
             </p>
           </div>
           <a
-            href={paths.settings.embeddingPreference()}
+            href={paths.settings.embedder.modelPreference()}
             className="text-sm md:text-base my-2 underline"
           >
             Manage embedding →
@@ -21,7 +21,7 @@ export default function LocalAiOptions({ settings, showAlert = false }) {
             </p>
          </div>
           <a
-            href={paths.settings.embeddingPreference()}
+            href={paths.settings.embedder.modelPreference()}
             className="text-sm md:text-base my-2 underline"
           >
             Manage embedding →
@@ -20,6 +20,7 @@ import {
   Barcode,
   ClosedCaptioning,
   EyeSlash,
+  SplitVertical,
 } from "@phosphor-icons/react";
 import useUser from "@/hooks/useUser";
 import { USER_BACKGROUND_COLOR } from "@/utils/constants";
@@ -288,12 +289,25 @@ const SidebarOptions = ({ user = null }) => (
       allowedRole={["admin"]}
     />
     <Option
-      href={paths.settings.embeddingPreference()}
-      btnText="Embedding Model"
+      href={paths.settings.embedder.modelPreference()}
+      childLinks={[paths.settings.embedder.chunkingPreference()]}
+      btnText="Embedder Preferences"
       icon={<FileCode className="h-5 w-5 flex-shrink-0" />}
       user={user}
       flex={true}
       allowedRole={["admin"]}
+      subOptions={
+        <>
+          <Option
+            href={paths.settings.embedder.chunkingPreference()}
+            btnText="Text Splitter & Chunking"
+            icon={<SplitVertical className="h-5 w-5 flex-shrink-0" />}
+            user={user}
+            flex={true}
+            allowedRole={["admin"]}
+          />
+        </>
+      }
     />
     <Option
       href={paths.settings.vectorDatabase()}
@@ -0,0 +1,180 @@
+import React, { useEffect, useState } from "react";
+import Sidebar from "@/components/SettingsSidebar";
+import { isMobile } from "react-device-detect";
+import PreLoader from "@/components/Preloader";
+import CTAButton from "@/components/lib/CTAButton";
+import Admin from "@/models/admin";
+import showToast from "@/utils/toast";
+import { nFormatter, numberWithCommas } from "@/utils/numbers";
+
+function isNullOrNaN(value) {
+  if (value === null) return true;
+  return isNaN(value);
+}
+
+export default function EmbeddingTextSplitterPreference() {
+  const [settings, setSettings] = useState({});
+  const [loading, setLoading] = useState(true);
+  const [saving, setSaving] = useState(false);
+  const [hasChanges, setHasChanges] = useState(false);
+
+  const handleSubmit = async (e) => {
+    e.preventDefault();
+    const form = new FormData(e.target);
+
+    if (
+      Number(form.get("text_splitter_chunk_overlap")) >=
+      Number(form.get("text_splitter_chunk_size"))
+    ) {
+      showToast(
+        "Chunk overlap cannot be larger than or equal to chunk size.",
+        "error"
+      );
+      return;
+    }
+
+    setSaving(true);
+    await Admin.updateSystemPreferences({
+      text_splitter_chunk_size: isNullOrNaN(
+        form.get("text_splitter_chunk_size")
+      )
+        ? 1000
+        : Number(form.get("text_splitter_chunk_size")),
+      text_splitter_chunk_overlap: isNullOrNaN(
+        form.get("text_splitter_chunk_overlap")
+      )
+        ? 1000
+        : Number(form.get("text_splitter_chunk_overlap")),
+    });
+    setSaving(false);
+    setHasChanges(false);
+    showToast("Text chunking strategy settings saved.", "success");
+  };
+
+  useEffect(() => {
+    async function fetchSettings() {
+      const _settings = (await Admin.systemPreferences())?.settings;
+      setSettings(_settings ?? {});
+      setLoading(false);
+    }
+    fetchSettings();
+  }, []);
+
+  return (
+    <div className="w-screen h-screen overflow-hidden bg-sidebar flex">
+      <Sidebar />
+      {loading ? (
+        <div
+          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
+          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
+        >
+          <div className="w-full h-full flex justify-center items-center">
+            <PreLoader />
+          </div>
+        </div>
+      ) : (
+        <div
+          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
+          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
+        >
+          <form
+            onSubmit={handleSubmit}
+            onChange={() => setHasChanges(true)}
+            className="flex w-full"
+          >
+            <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
+              <div className="w-full flex flex-col gap-y-1 pb-4 border-white border-b-2 border-opacity-10">
+                <div className="flex gap-x-4 items-center">
+                  <p className="text-lg leading-6 font-bold text-white">
+                    Text splitting & Chunking Preferences
+                  </p>
+                </div>
+                <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
+                  Sometimes, you may want to change the default way that new
+                  documents are split and chunked before being inserted into
+                  your vector database. <br />
+                  You should only modify this setting if you understand how text
+                  splitting works and its side effects.
+                </p>
+                <p className="text-xs leading-[18px] font-semibold text-white/80">
+                  Changes here will only apply to{" "}
+                  <i>newly embedded documents</i>, not existing documents.
+                </p>
+              </div>
+              <div className="w-full justify-end flex">
+                {hasChanges && (
+                  <CTAButton className="mt-3 mr-0 -mb-14 z-10">
+                    {saving ? "Saving..." : "Save changes"}
+                  </CTAButton>
+                )}
+              </div>
+
+              <div className="flex flex-col gap-y-4 mt-8">
+                <div className="flex flex-col max-w-[300px]">
+                  <div className="flex flex-col gap-y-2 mb-4">
+                    <label className="text-white text-sm font-semibold block">
+                      Text Chunk Size
+                    </label>
+                    <p className="text-xs text-white/60">
+                      This is the maximum length of characters that can be
+                      present in a single vector.
+                    </p>
+                  </div>
+                  <input
+                    type="number"
+                    name="text_splitter_chunk_size"
+                    min={1}
+                    max={settings?.max_embed_chunk_size || 1000}
+                    onWheel={(e) => e?.currentTarget?.blur()}
+                    className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                    placeholder="maximum length of vectorized text"
+                    defaultValue={
+                      isNullOrNaN(settings?.text_splitter_chunk_size)
+                        ? 1000
+                        : Number(settings?.text_splitter_chunk_size)
+                    }
+                    required={true}
+                    autoComplete="off"
+                  />
+                  <p className="text-xs text-white/40">
+                    Embed model maximum length is{" "}
+                    {numberWithCommas(settings?.max_embed_chunk_size || 1000)}.
+                  </p>
+                </div>
+              </div>
+
+              <div className="flex flex-col gap-y-4 mt-8">
+                <div className="flex flex-col max-w-[300px]">
+                  <div className="flex flex-col gap-y-2 mb-4">
+                    <label className="text-white text-sm font-semibold block">
+                      Text Chunk Overlap
+                    </label>
+                    <p className="text-xs text-white/60">
+                      This is the maximum overlap of characters that occurs
+                      during chunking between two adjacent text chunks.
+                    </p>
+                  </div>
+                  <input
+                    type="number"
+                    name="text_splitter_chunk_overlap"
+                    min={0}
+                    onWheel={(e) => e?.currentTarget?.blur()}
+                    className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                    placeholder="maximum length of vectorized text"
+                    defaultValue={
+                      isNullOrNaN(settings?.text_splitter_chunk_overlap)
+                        ? 20
+                        : Number(settings?.text_splitter_chunk_overlap)
+                    }
+                    required={true}
+                    autoComplete="off"
+                  />
+                </div>
+              </div>
+            </div>
+          </form>
+        </div>
+      )}
+    </div>
+  );
+}
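The submit handler in this new page enforces one invariant client-side: overlap must be strictly smaller than chunk size. A small standalone sketch of that guard, under the assumption that the validation is pulled out into a hypothetical helper (the field names match the diff; the helper itself is illustrative, not part of the commit):

// Illustrative validation mirroring handleSubmit's guard above.
function validateChunkSettings({ chunkSize, chunkOverlap }) {
  const size = Number(chunkSize);
  const overlap = Number(chunkOverlap);
  if (Number.isNaN(size) || size <= 0)
    return { ok: false, error: "Chunk size must be a positive number." };
  if (Number.isNaN(overlap) || overlap < 0)
    return { ok: false, error: "Chunk overlap cannot be negative." };
  if (overlap >= size)
    return { ok: false, error: "Chunk overlap cannot be larger than or equal to chunk size." };
  return { ok: true };
}

console.log(validateChunkSettings({ chunkSize: 1000, chunkOverlap: 20 })); // { ok: true }
console.log(validateChunkSettings({ chunkSize: 500, chunkOverlap: 500 })); // error: overlap >= size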
@@ -98,6 +98,10 @@ export default {
   transcriptionPreference: () => {
     return "/settings/transcription-preference";
   },
+  embedder: {
+    modelPreference: () => "/settings/embedding-preference",
+    chunkingPreference: () => "/settings/text-splitter-preference",
+  },
   embeddingPreference: () => {
     return "/settings/embedding-preference";
   },
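The sidebar hunk above consumes these new helpers directly. Any other component could link to the chunking screen the same way; a sketch assuming react-router-dom and a "@/utils/paths" import path (the alias is inferred from the other frontend hunks, not shown in this diff):

import React from "react";
import { Link } from "react-router-dom";
import paths from "@/utils/paths"; // assumed import path for the helpers above

// Hypothetical shortcut link to the new Text Splitter & Chunking screen.
export default function ChunkingSettingsLink() {
  return (
    <Link to={paths.settings.embedder.chunkingPreference()}>
      Text Splitter & Chunking
    </Link>
  );
}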
@@ -8,7 +8,10 @@ const { User } = require("../models/user");
 const { DocumentVectors } = require("../models/vectors");
 const { Workspace } = require("../models/workspace");
 const { WorkspaceChats } = require("../models/workspaceChats");
-const { getVectorDbClass } = require("../utils/helpers");
+const {
+  getVectorDbClass,
+  getEmbeddingEngineSelection,
+} = require("../utils/helpers");
 const {
   validRoleSelection,
   canModifyAdmin,
@@ -311,6 +314,7 @@ function adminEndpoints(app) {
     }
   );

+  // TODO: Allow specification of which props to get instead of returning all of them all the time.
   app.get(
     "/admin/system-preferences",
     [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
@@ -333,6 +337,16 @@
         support_email:
           (await SystemSettings.get({ label: "support_email" }))?.value ||
           null,
+        text_splitter_chunk_size:
+          (await SystemSettings.get({ label: "text_splitter_chunk_size" }))
+            ?.value ||
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength ||
+          null,
+        text_splitter_chunk_overlap:
+          (await SystemSettings.get({ label: "text_splitter_chunk_overlap" }))
+            ?.value || null,
+        max_embed_chunk_size:
+          getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1000,
       };
       response.status(200).json({ settings });
     } catch (e) {
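With these additions, the system-preferences response that the new settings page fetches also carries the chunking values. An example of the shape only; the concrete numbers are illustrative and the fallbacks follow the logic in the hunk above:

// Example response shape — values depend on what the admin has saved and on
// the selected embedder's embeddingMaxChunkLength.
const exampleSystemPreferences = {
  settings: {
    // ...existing preferences (support_email, footer_data, etc.)
    text_splitter_chunk_size: 1000,  // saved value, else embedder max, else null
    text_splitter_chunk_overlap: 20, // saved value, else null
    max_embed_chunk_size: 8191,      // embedder max, else 1000
  },
};
console.log(exampleSystemPreferences.settings.max_embed_chunk_size); // 8191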
@@ -5,6 +5,11 @@ process.env.NODE_ENV === "development"
 const { isValidUrl } = require("../utils/http");
 const prisma = require("../utils/prisma");

+function isNullOrNaN(value) {
+  if (value === null) return true;
+  return isNaN(value);
+}
+
 const SystemSettings = {
   protectedFields: ["multi_user_mode"],
   supportedFields: [
@@ -15,6 +20,8 @@ const SystemSettings = {
     "telemetry_id",
     "footer_data",
     "support_email",
+    "text_splitter_chunk_size",
+    "text_splitter_chunk_overlap",
   ],
   validations: {
     footer_data: (updates) => {
@@ -28,6 +35,32 @@ const SystemSettings = {
         return JSON.stringify([]);
       }
     },
+    text_splitter_chunk_size: (update) => {
+      try {
+        if (isNullOrNaN(update)) throw new Error("Value is not a number.");
+        if (Number(update) <= 0) throw new Error("Value must be non-zero.");
+        return Number(update);
+      } catch (e) {
+        console.error(
+          `Failed to run validation function on text_splitter_chunk_size`,
+          e.message
+        );
+        return 1000;
+      }
+    },
+    text_splitter_chunk_overlap: (update) => {
+      try {
+        if (isNullOrNaN(update)) throw new Error("Value is not a number");
+        if (Number(update) < 0) throw new Error("Value cannot be less than 0.");
+        return Number(update);
+      } catch (e) {
+        console.error(
+          `Failed to run validation function on text_splitter_chunk_overlap`,
+          e.message
+        );
+        return 20;
+      }
+    },
   },
   currentSettings: async function () {
     const llmProvider = process.env.LLM_PROVIDER;
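These validators coerce whatever the client sends into a usable number, falling back to 1000 and 20 respectively. A quick standalone illustration of the same behavior (the `validations` object here is a simplified stand-in for the one defined above, not the model itself):

// Stand-in mirroring the validators above.
const validations = {
  text_splitter_chunk_size: (update) => {
    const n = Number(update);
    if (update === null || Number.isNaN(n) || n <= 0) return 1000;
    return n;
  },
  text_splitter_chunk_overlap: (update) => {
    const n = Number(update);
    if (update === null || Number.isNaN(n) || n < 0) return 20;
    return n;
  },
};

console.log(validations.text_splitter_chunk_size("512")); // 512
console.log(validations.text_splitter_chunk_size("abc")); // 1000 (fallback)
console.log(validations.text_splitter_chunk_overlap(-5)); // 20 (fallback)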
@@ -84,6 +117,15 @@ const SystemSettings = {
     }
   },

+  getValueOrFallback: async function (clause = {}, fallback = null) {
+    try {
+      return (await this.get(clause))?.value ?? fallback;
+    } catch (error) {
+      console.error(error.message);
+      return fallback;
+    }
+  },
+
   where: async function (clause = {}, limit) {
     try {
       const settings = await prisma.system_settings.findMany({
@@ -17,7 +17,9 @@ class AzureOpenAiEmbedder {
     // Limit of how many strings we can process in a single pass to stay with resource or network limits
     // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request
     this.maxConcurrentChunks = 16;
-    this.embeddingMaxChunkLength = 1_000;
+
+    // https://learn.microsoft.com/en-us/answers/questions/1188074/text-embedding-ada-002-token-context-length
+    this.embeddingMaxChunkLength = 2048;
   }

   async embedTextInput(textInput) {
@@ -13,7 +13,9 @@ class OpenAiEmbedder {

     // Limit of how many strings we can process in a single pass to stay with resource or network limits
     this.maxConcurrentChunks = 500;
-    this.embeddingMaxChunkLength = 1_000;
+
+    // https://platform.openai.com/docs/guides/embeddings/embedding-models
+    this.embeddingMaxChunkLength = 8_191;
   }

   async embedTextInput(textInput) {
server/utils/TextSplitter/index.js (new file, 84 lines)

@@ -0,0 +1,84 @@
+function isNullOrNaN(value) {
+  if (value === null) return true;
+  return isNaN(value);
+}
+
+class TextSplitter {
+  #splitter;
+  constructor(config = {}) {
+    /*
+      config can be a ton of things depending on what is required or optional by the specific splitter.
+      Non-splitter related keys
+      {
+        splitByFilename: string, // TODO
+      }
+      ------
+      Default: "RecursiveCharacterTextSplitter"
+      Config: {
+        chunkSize: number,
+        chunkOverlap: number,
+      }
+      ------
+    */
+    this.config = config;
+    this.#splitter = this.#setSplitter(config);
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
+  }
+
+  // Does a quick check to determine the text chunk length limit.
+  // Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
+  // so here we want to allow override of the default 1000, but up to the models maximum, which is
+  // sometimes user defined.
+  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
+    const prefValue = isNullOrNaN(preferred)
+      ? Number(embedderLimit)
+      : Number(preferred);
+    const limit = Number(embedderLimit);
+    if (prefValue > limit)
+      console.log(
+        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
+      );
+    return prefValue > limit ? limit : prefValue;
+  }
+
+  #setSplitter(config = {}) {
+    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
+    return new RecursiveSplitter({
+      chunkSize: isNaN(config?.chunkSize) ? 1_000 : Number(config?.chunkSize),
+      chunkOverlap: isNaN(config?.chunkOverlap)
+        ? 20
+        : Number(config?.chunkOverlap),
+    });
+  }
+
+  async splitText(documentText) {
+    return this.#splitter._splitText(documentText);
+  }
+}
+
+// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
+class RecursiveSplitter {
+  constructor({ chunkSize, chunkOverlap }) {
+    const {
+      RecursiveCharacterTextSplitter,
+    } = require("langchain/text_splitter");
+    this.log(`Will split with`, { chunkSize, chunkOverlap });
+    this.engine = new RecursiveCharacterTextSplitter({
+      chunkSize,
+      chunkOverlap,
+    });
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
+  }
+
+  async _splitText(documentText) {
+    return this.engine.splitText(documentText);
+  }
+}
+
+module.exports.TextSplitter = TextSplitter;
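Every vector-database provider updated below now builds its splitter the same way. A condensed sketch of that call pattern, using the same require paths and calls as the provider hunks that follow (`pageContent` is a placeholder for the document text being embedded):

const { TextSplitter } = require("../../TextSplitter");
const { SystemSettings } = require("../../../models/systemSettings");
const { getEmbeddingEngineSelection } = require("../../helpers");

async function splitForEmbedding(pageContent) {
  // Preferred size from system settings, clamped to the embedder's hard limit;
  // overlap falls back to 20 when nothing has been saved.
  const textSplitter = new TextSplitter({
    chunkSize: TextSplitter.determineMaxChunkSize(
      await SystemSettings.getValueOrFallback({ label: "text_splitter_chunk_size" }),
      getEmbeddingEngineSelection()?.embeddingMaxChunkLength
    ),
    chunkOverlap: await SystemSettings.getValueOrFallback(
      { label: "text_splitter_chunk_overlap" },
      20
    ),
  });
  return textSplitter.splitText(pageContent);
}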
@@ -1,5 +1,5 @@
 const { AstraDB: AstraClient } = require("@datastax/astra-db-ts");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {
@@ -147,10 +147,17 @@
       return { vectorized: true, error: null };
     }

-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { ChromaClient } = require("chromadb");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {
@@ -180,10 +180,17 @@
     // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -5,9 +5,10 @@
   getEmbeddingEngineSelection,
 } = require("../../helpers");
 const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
+const { SystemSettings } = require("../../../models/systemSettings");

 const LanceDb = {
   uri: `${
@@ -180,10 +181,17 @@
     // We have to do this manually as opposed to using LangChains `xyz.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -4,7 +4,7 @@
   IndexType,
   MilvusClient,
 } = require("@zilliz/milvus2-sdk-node");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { v4: uuidv4 } = require("uuid");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const {
@@ -182,10 +182,17 @@
       return { vectorized: true, error: null };
     }

-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { Pinecone } = require("@pinecone-database/pinecone");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {
@@ -125,10 +125,17 @@
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
     // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { QdrantClient } = require("@qdrant/js-client-rest");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {
@@ -198,10 +198,17 @@
     // We have to do this manually as opposed to using LangChains `Qdrant.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { default: weaviate } = require("weaviate-ts-client");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {
@@ -241,10 +241,17 @@
     // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -4,7 +4,7 @@
   IndexType,
   MilvusClient,
 } = require("@zilliz/milvus2-sdk-node");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { v4: uuidv4 } = require("uuid");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const {
@@ -183,10 +183,17 @@
       return { vectorized: true, error: null };
     }

-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);
