Enable customization of chunk length and overlap ()

* Enable customization of chunk length and overlap

* fix onboarding link
show max limit in UI and prevent overlap >= chunk size
This commit is contained in:
Timothy Carambat 2024-04-06 16:38:07 -07:00 committed by GitHub
parent 1c11a47f93
commit ce98ff4653
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 455 additions and 47 deletions
frontend/src
App.jsx
components
LLMSelection
LMStudioOptions
LocalAiOptions
SettingsSidebar
pages/GeneralSettings/EmbeddingTextSplitterPreference
utils
server
endpoints
models
utils
EmbeddingEngines
azureOpenAi
openAi
TextSplitter
vectorDbProviders
astra
chroma
lance
milvus
pinecone
qdrant
weaviate
zilliz

View file

@ -35,6 +35,9 @@ const GeneralTranscriptionPreference = lazy(
const GeneralEmbeddingPreference = lazy( const GeneralEmbeddingPreference = lazy(
() => import("@/pages/GeneralSettings/EmbeddingPreference") () => import("@/pages/GeneralSettings/EmbeddingPreference")
); );
const EmbeddingTextSplitterPreference = lazy(
() => import("@/pages/GeneralSettings/EmbeddingTextSplitterPreference")
);
const GeneralVectorDatabase = lazy( const GeneralVectorDatabase = lazy(
() => import("@/pages/GeneralSettings/VectorDatabase") () => import("@/pages/GeneralSettings/VectorDatabase")
); );
@ -86,6 +89,12 @@ export default function App() {
path="/settings/embedding-preference" path="/settings/embedding-preference"
element={<AdminRoute Component={GeneralEmbeddingPreference} />} element={<AdminRoute Component={GeneralEmbeddingPreference} />}
/> />
<Route
path="/settings/text-splitter-preference"
element={
<AdminRoute Component={EmbeddingTextSplitterPreference} />
}
/>
<Route <Route
path="/settings/vector-database" path="/settings/vector-database"
element={<AdminRoute Component={GeneralVectorDatabase} />} element={<AdminRoute Component={GeneralVectorDatabase} />}

View file

@ -21,7 +21,7 @@ export default function LMStudioOptions({ settings, showAlert = false }) {
</p> </p>
</div> </div>
<a <a
href={paths.settings.embeddingPreference()} href={paths.settings.embedder.modelPreference()}
className="text-sm md:text-base my-2 underline" className="text-sm md:text-base my-2 underline"
> >
Manage embedding &rarr; Manage embedding &rarr;

View file

@ -21,7 +21,7 @@ export default function LocalAiOptions({ settings, showAlert = false }) {
</p> </p>
</div> </div>
<a <a
href={paths.settings.embeddingPreference()} href={paths.settings.embedder.modelPreference()}
className="text-sm md:text-base my-2 underline" className="text-sm md:text-base my-2 underline"
> >
Manage embedding &rarr; Manage embedding &rarr;

View file

@ -20,6 +20,7 @@ import {
Barcode, Barcode,
ClosedCaptioning, ClosedCaptioning,
EyeSlash, EyeSlash,
SplitVertical,
} from "@phosphor-icons/react"; } from "@phosphor-icons/react";
import useUser from "@/hooks/useUser"; import useUser from "@/hooks/useUser";
import { USER_BACKGROUND_COLOR } from "@/utils/constants"; import { USER_BACKGROUND_COLOR } from "@/utils/constants";
@ -288,12 +289,25 @@ const SidebarOptions = ({ user = null }) => (
allowedRole={["admin"]} allowedRole={["admin"]}
/> />
<Option <Option
href={paths.settings.embeddingPreference()} href={paths.settings.embedder.modelPreference()}
btnText="Embedding Model" childLinks={[paths.settings.embedder.chunkingPreference()]}
btnText="Embedder Preferences"
icon={<FileCode className="h-5 w-5 flex-shrink-0" />} icon={<FileCode className="h-5 w-5 flex-shrink-0" />}
user={user} user={user}
flex={true} flex={true}
allowedRole={["admin"]} allowedRole={["admin"]}
subOptions={
<>
<Option
href={paths.settings.embedder.chunkingPreference()}
btnText="Text Splitter & Chunking"
icon={<SplitVertical className="h-5 w-5 flex-shrink-0" />}
user={user}
flex={true}
allowedRole={["admin"]}
/>
</>
}
/> />
<Option <Option
href={paths.settings.vectorDatabase()} href={paths.settings.vectorDatabase()}

View file

@ -0,0 +1,180 @@
import React, { useEffect, useState } from "react";
import Sidebar from "@/components/SettingsSidebar";
import { isMobile } from "react-device-detect";
import PreLoader from "@/components/Preloader";
import CTAButton from "@/components/lib/CTAButton";
import Admin from "@/models/admin";
import showToast from "@/utils/toast";
import { nFormatter, numberWithCommas } from "@/utils/numbers";
// True when `value` is null or does not coerce to a valid number.
// (Bare isNaN(null) is false because Number(null) === 0, hence the explicit null check.)
function isNullOrNaN(value) {
  return value === null ? true : isNaN(value);
}
export default function EmbeddingTextSplitterPreference() {
const [settings, setSettings] = useState({});
const [loading, setLoading] = useState(true);
const [saving, setSaving] = useState(false);
const [hasChanges, setHasChanges] = useState(false);
const handleSubmit = async (e) => {
e.preventDefault();
const form = new FormData(e.target);
if (
Number(form.get("text_splitter_chunk_overlap")) >=
Number(form.get("text_splitter_chunk_size"))
) {
showToast(
"Chunk overlap cannot be larger or equal to chunk size.",
"error"
);
return;
}
setSaving(true);
await Admin.updateSystemPreferences({
text_splitter_chunk_size: isNullOrNaN(
form.get("text_splitter_chunk_size")
)
? 1000
: Number(form.get("text_splitter_chunk_size")),
text_splitter_chunk_overlap: isNullOrNaN(
form.get("text_splitter_chunk_overlap")
)
? 1000
: Number(form.get("text_splitter_chunk_overlap")),
});
setSaving(false);
setHasChanges(false);
showToast("Text chunking strategy settings saved.", "success");
};
useEffect(() => {
async function fetchSettings() {
const _settings = (await Admin.systemPreferences())?.settings;
setSettings(_settings ?? {});
setLoading(false);
}
fetchSettings();
}, []);
return (
<div className="w-screen h-screen overflow-hidden bg-sidebar flex">
<Sidebar />
{loading ? (
<div
style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
>
<div className="w-full h-full flex justify-center items-center">
<PreLoader />
</div>
</div>
) : (
<div
style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
>
<form
onSubmit={handleSubmit}
onChange={() => setHasChanges(true)}
className="flex w-full"
>
<div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
<div className="w-full flex flex-col gap-y-1 pb-4 border-white border-b-2 border-opacity-10">
<div className="flex gap-x-4 items-center">
<p className="text-lg leading-6 font-bold text-white">
Text splitting & Chunking Preferences
</p>
</div>
<p className="text-xs leading-[18px] font-base text-white text-opacity-60">
Sometimes, you may want to change the default way that new
documents are split and chunked before being inserted into
your vector database. <br />
You should only modify this setting if you understand how text
splitting works and it's side effects.
</p>
<p className="text-xs leading-[18px] font-semibold text-white/80">
Changes here will only apply to{" "}
<i>newly embedded documents</i>, not existing documents.
</p>
</div>
<div className="w-full justify-end flex">
{hasChanges && (
<CTAButton className="mt-3 mr-0 -mb-14 z-10">
{saving ? "Saving..." : "Save changes"}
</CTAButton>
)}
</div>
<div className="flex flex-col gap-y-4 mt-8">
<div className="flex flex-col max-w-[300px]">
<div className="flex flex-col gap-y-2 mb-4">
<label className="text-white text-sm font-semibold block">
Text Chunk Size
</label>
<p className="text-xs text-white/60">
This is the maximum length of characters that can be
present in a single vector.
</p>
</div>
<input
type="number"
name="text_splitter_chunk_size"
min={1}
max={settings?.max_embed_chunk_size || 1000}
onWheel={(e) => e?.currentTarget?.blur()}
className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="maximum length of vectorized text"
defaultValue={
isNullOrNaN(settings?.text_splitter_chunk_size)
? 1000
: Number(settings?.text_splitter_chunk_size)
}
required={true}
autoComplete="off"
/>
<p className="text-xs text-white/40">
Embed model maximum length is{" "}
{numberWithCommas(settings?.max_embed_chunk_size || 1000)}.
</p>
</div>
</div>
<div className="flex flex-col gap-y-4 mt-8">
<div className="flex flex-col max-w-[300px]">
<div className="flex flex-col gap-y-2 mb-4">
<label className="text-white text-sm font-semibold block">
Text Chunk Overlap
</label>
<p className="text-xs text-white/60">
This is the maximum overlap of characters that occurs
during chunking between two adjacent text chunks.
</p>
</div>
<input
type="number"
name="text_splitter_chunk_overlap"
min={0}
onWheel={(e) => e?.currentTarget?.blur()}
className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
placeholder="maximum length of vectorized text"
defaultValue={
isNullOrNaN(settings?.text_splitter_chunk_overlap)
? 20
: Number(settings?.text_splitter_chunk_overlap)
}
required={true}
autoComplete="off"
/>
</div>
</div>
</div>
</form>
</div>
)}
</div>
);
}

View file

@ -98,6 +98,10 @@ export default {
transcriptionPreference: () => { transcriptionPreference: () => {
return "/settings/transcription-preference"; return "/settings/transcription-preference";
}, },
embedder: {
modelPreference: () => "/settings/embedding-preference",
chunkingPreference: () => "/settings/text-splitter-preference",
},
embeddingPreference: () => { embeddingPreference: () => {
return "/settings/embedding-preference"; return "/settings/embedding-preference";
}, },

View file

@ -8,7 +8,10 @@ const { User } = require("../models/user");
const { DocumentVectors } = require("../models/vectors"); const { DocumentVectors } = require("../models/vectors");
const { Workspace } = require("../models/workspace"); const { Workspace } = require("../models/workspace");
const { WorkspaceChats } = require("../models/workspaceChats"); const { WorkspaceChats } = require("../models/workspaceChats");
const { getVectorDbClass } = require("../utils/helpers"); const {
getVectorDbClass,
getEmbeddingEngineSelection,
} = require("../utils/helpers");
const { const {
validRoleSelection, validRoleSelection,
canModifyAdmin, canModifyAdmin,
@ -311,6 +314,7 @@ function adminEndpoints(app) {
} }
); );
// TODO: Allow specification of which props to get instead of returning all of them all the time.
app.get( app.get(
"/admin/system-preferences", "/admin/system-preferences",
[validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])], [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
@ -333,6 +337,16 @@ function adminEndpoints(app) {
support_email: support_email:
(await SystemSettings.get({ label: "support_email" }))?.value || (await SystemSettings.get({ label: "support_email" }))?.value ||
null, null,
text_splitter_chunk_size:
(await SystemSettings.get({ label: "text_splitter_chunk_size" }))
?.value ||
getEmbeddingEngineSelection()?.embeddingMaxChunkLength ||
null,
text_splitter_chunk_overlap:
(await SystemSettings.get({ label: "text_splitter_chunk_overlap" }))
?.value || null,
max_embed_chunk_size:
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1000,
}; };
response.status(200).json({ settings }); response.status(200).json({ settings });
} catch (e) { } catch (e) {

View file

@ -5,6 +5,11 @@ process.env.NODE_ENV === "development"
const { isValidUrl } = require("../utils/http"); const { isValidUrl } = require("../utils/http");
const prisma = require("../utils/prisma"); const prisma = require("../utils/prisma");
function isNullOrNaN(value) {
if (value === null) return true;
return isNaN(value);
}
const SystemSettings = { const SystemSettings = {
protectedFields: ["multi_user_mode"], protectedFields: ["multi_user_mode"],
supportedFields: [ supportedFields: [
@ -15,6 +20,8 @@ const SystemSettings = {
"telemetry_id", "telemetry_id",
"footer_data", "footer_data",
"support_email", "support_email",
"text_splitter_chunk_size",
"text_splitter_chunk_overlap",
], ],
validations: { validations: {
footer_data: (updates) => { footer_data: (updates) => {
@ -28,6 +35,32 @@ const SystemSettings = {
return JSON.stringify([]); return JSON.stringify([]);
} }
}, },
text_splitter_chunk_size: (update) => {
try {
if (isNullOrNaN(update)) throw new Error("Value is not a number.");
if (Number(update) <= 0) throw new Error("Value must be non-zero.");
return Number(update);
} catch (e) {
console.error(
`Failed to run validation function on text_splitter_chunk_size`,
e.message
);
return 1000;
}
},
text_splitter_chunk_overlap: (update) => {
try {
if (isNullOrNaN(update)) throw new Error("Value is not a number");
if (Number(update) < 0) throw new Error("Value cannot be less than 0.");
return Number(update);
} catch (e) {
console.error(
`Failed to run validation function on text_splitter_chunk_overlap`,
e.message
);
return 20;
}
},
}, },
currentSettings: async function () { currentSettings: async function () {
const llmProvider = process.env.LLM_PROVIDER; const llmProvider = process.env.LLM_PROVIDER;
@ -84,6 +117,15 @@ const SystemSettings = {
} }
}, },
getValueOrFallback: async function (clause = {}, fallback = null) {
try {
return (await this.get(clause))?.value ?? fallback;
} catch (error) {
console.error(error.message);
return fallback;
}
},
where: async function (clause = {}, limit) { where: async function (clause = {}, limit) {
try { try {
const settings = await prisma.system_settings.findMany({ const settings = await prisma.system_settings.findMany({

View file

@ -17,7 +17,9 @@ class AzureOpenAiEmbedder {
// Limit of how many strings we can process in a single pass to stay with resource or network limits // Limit of how many strings we can process in a single pass to stay with resource or network limits
// https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request
this.maxConcurrentChunks = 16; this.maxConcurrentChunks = 16;
this.embeddingMaxChunkLength = 1_000;
// https://learn.microsoft.com/en-us/answers/questions/1188074/text-embedding-ada-002-token-context-length
this.embeddingMaxChunkLength = 2048;
} }
async embedTextInput(textInput) { async embedTextInput(textInput) {

View file

@ -13,7 +13,9 @@ class OpenAiEmbedder {
// Limit of how many strings we can process in a single pass to stay with resource or network limits // Limit of how many strings we can process in a single pass to stay with resource or network limits
this.maxConcurrentChunks = 500; this.maxConcurrentChunks = 500;
this.embeddingMaxChunkLength = 1_000;
// https://platform.openai.com/docs/guides/embeddings/embedding-models
this.embeddingMaxChunkLength = 8_191;
} }
async embedTextInput(textInput) { async embedTextInput(textInput) {

View file

@ -0,0 +1,84 @@
// True when `value` is null or does not coerce to a valid number.
// (Bare isNaN(null) is false because Number(null) === 0, hence the explicit check.)
function isNullOrNaN(value) {
  if (value === null) return true;
  return isNaN(value);
}

// Facade over the concrete text-splitting implementation. Currently always
// delegates to RecursiveSplitter (Langchain's RecursiveCharacterTextSplitter).
class TextSplitter {
  #splitter;

  constructor(config = {}) {
    /*
      config can be a ton of things depending on what is required or optional by the specific splitter.
      Non-splitter related keys
      {
        splitByFilename: string, // TODO
      }
      ------
      Default: "RecursiveCharacterTextSplitter"
      Config: {
        chunkSize: number,
        chunkOverlap: number,
      }
      ------
    */
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }

  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  // Does a quick check to determine the text chunk length limit.
  // Embedder models have hard-set limits that cannot be exceeded, just like an LLM context,
  // so here we want to allow override of the default 1000, but only up to the model's maximum.
  // `embedderLimit` may itself be null/NaN when no engine reports one (callers pass
  // `getEmbeddingEngineSelection()?.embeddingMaxChunkLength`), so it also falls back to 1000
  // instead of coercing null to 0 and clamping everything to zero.
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    const limit = isNullOrNaN(embedderLimit) ? 1000 : Number(embedderLimit);
    const prefValue = isNullOrNaN(preferred) ? limit : Number(preferred);
    if (prefValue > limit)
      console.log(
        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
      );
    return prefValue > limit ? limit : prefValue;
  }

  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
    // Use isNullOrNaN (not bare isNaN) so an explicit `null` chunkSize/chunkOverlap
    // falls back to the defaults instead of coercing to 0 — matching the frontend
    // and the admin endpoint, which treat null as "unset".
    return new RecursiveSplitter({
      chunkSize: isNullOrNaN(config?.chunkSize)
        ? 1_000
        : Number(config?.chunkSize),
      chunkOverlap: isNullOrNaN(config?.chunkOverlap)
        ? 20
        : Number(config?.chunkOverlap),
    });
  }

  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}
// Wrapper for Langchain's default RecursiveCharacterTextSplitter class.
class RecursiveSplitter {
  constructor({ chunkSize, chunkOverlap }) {
    // Required at construction time so the dependency is only loaded when a
    // splitter is actually built.
    const splitter = require("langchain/text_splitter");
    this.log(`Will split with`, { chunkSize, chunkOverlap });
    this.engine = new splitter.RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap,
    });
  }

  log(text, ...args) {
    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
  }

  // Delegates splitting of the raw document text to the Langchain engine.
  async _splitText(documentText) {
    return this.engine.splitText(documentText);
  }
}

module.exports.TextSplitter = TextSplitter;

View file

@ -1,5 +1,5 @@
const { AstraDB: AstraClient } = require("@datastax/astra-db-ts"); const { AstraDB: AstraClient } = require("@datastax/astra-db-ts");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { const {
@ -147,10 +147,17 @@ const AstraDB = {
return { vectorized: true, error: null }; return { vectorized: true, error: null };
} }
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -1,5 +1,5 @@
const { ChromaClient } = require("chromadb"); const { ChromaClient } = require("chromadb");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { const {
@ -180,10 +180,17 @@ const Chroma = {
// We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
// because we then cannot atomically control our namespace to granularly find/remove documents // because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb. // from vectordb.
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -5,9 +5,10 @@ const {
getEmbeddingEngineSelection, getEmbeddingEngineSelection,
} = require("../../helpers"); } = require("../../helpers");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai"); const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { SystemSettings } = require("../../../models/systemSettings");
const LanceDb = { const LanceDb = {
uri: `${ uri: `${
@ -180,10 +181,17 @@ const LanceDb = {
// We have to do this manually as opposed to using LangChains `xyz.fromDocuments` // We have to do this manually as opposed to using LangChains `xyz.fromDocuments`
// because we then cannot atomically control our namespace to granularly find/remove documents // because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb. // from vectordb.
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -4,7 +4,7 @@ const {
IndexType, IndexType,
MilvusClient, MilvusClient,
} = require("@zilliz/milvus2-sdk-node"); } = require("@zilliz/milvus2-sdk-node");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { const {
@ -182,10 +182,17 @@ const Milvus = {
return { vectorized: true, error: null }; return { vectorized: true, error: null };
} }
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -1,5 +1,5 @@
const { Pinecone } = require("@pinecone-database/pinecone"); const { Pinecone } = require("@pinecone-database/pinecone");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { const {
@ -125,10 +125,17 @@ const PineconeDB = {
// because we then cannot atomically control our namespace to granularly find/remove documents // because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb. // from vectordb.
// https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167 // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -1,5 +1,5 @@
const { QdrantClient } = require("@qdrant/js-client-rest"); const { QdrantClient } = require("@qdrant/js-client-rest");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { const {
@ -198,10 +198,17 @@ const QDrant = {
// We have to do this manually as opposed to using LangChains `Qdrant.fromDocuments` // We have to do this manually as opposed to using LangChains `Qdrant.fromDocuments`
// because we then cannot atomically control our namespace to granularly find/remove documents // because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb. // from vectordb.
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -1,5 +1,5 @@
const { default: weaviate } = require("weaviate-ts-client"); const { default: weaviate } = require("weaviate-ts-client");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { const {
@ -241,10 +241,17 @@ const Weaviate = {
// We have to do this manually as opposed to using LangChains `Chroma.fromDocuments` // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
// because we then cannot atomically control our namespace to granularly find/remove documents // because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb. // from vectordb.
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);

View file

@ -4,7 +4,7 @@ const {
IndexType, IndexType,
MilvusClient, MilvusClient,
} = require("@zilliz/milvus2-sdk-node"); } = require("@zilliz/milvus2-sdk-node");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter"); const { TextSplitter } = require("../../TextSplitter");
const { v4: uuidv4 } = require("uuid"); const { v4: uuidv4 } = require("uuid");
const { storeVectorResult, cachedVectorInformation } = require("../../files"); const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { const {
@ -183,10 +183,17 @@ const Zilliz = {
return { vectorized: true, error: null }; return { vectorized: true, error: null };
} }
const textSplitter = new RecursiveCharacterTextSplitter({ const textSplitter = new TextSplitter({
chunkSize: chunkSize: TextSplitter.determineMaxChunkSize(
getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000, await SystemSettings.getValueOrFallback({
chunkOverlap: 20, label: "text_splitter_chunk_size",
}),
getEmbeddingEngineSelection()?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
}); });
const textChunks = await textSplitter.splitText(pageContent); const textChunks = await textSplitter.splitText(pageContent);