Mirror of https://github.com/Mintplex-Labs/anything-llm.git (synced 2025-04-23 13:08:11 +00:00)

Enable customization of chunk length and overlap (#1059)

* Enable customization of chunk length and overlap
* fix onboarding link
  show max limit in UI and prevent overlap >= chunk size
parent 1c11a47f93
commit ce98ff4653
19 changed files with 455 additions and 47 deletions

@@ -35,6 +35,9 @@ const GeneralTranscriptionPreference = lazy(
 const GeneralEmbeddingPreference = lazy(
   () => import("@/pages/GeneralSettings/EmbeddingPreference")
 );
+const EmbeddingTextSplitterPreference = lazy(
+  () => import("@/pages/GeneralSettings/EmbeddingTextSplitterPreference")
+);
 const GeneralVectorDatabase = lazy(
   () => import("@/pages/GeneralSettings/VectorDatabase")
 );
@@ -86,6 +89,12 @@ export default function App() {
             path="/settings/embedding-preference"
             element={<AdminRoute Component={GeneralEmbeddingPreference} />}
           />
+          <Route
+            path="/settings/text-splitter-preference"
+            element={
+              <AdminRoute Component={EmbeddingTextSplitterPreference} />
+            }
+          />
           <Route
             path="/settings/vector-database"
             element={<AdminRoute Component={GeneralVectorDatabase} />}

@@ -21,7 +21,7 @@ export default function LMStudioOptions({ settings, showAlert = false }) {
           </p>
         </div>
         <a
-          href={paths.settings.embeddingPreference()}
+          href={paths.settings.embedder.modelPreference()}
           className="text-sm md:text-base my-2 underline"
         >
           Manage embedding →

@@ -21,7 +21,7 @@ export default function LocalAiOptions({ settings, showAlert = false }) {
           </p>
         </div>
         <a
-          href={paths.settings.embeddingPreference()}
+          href={paths.settings.embedder.modelPreference()}
           className="text-sm md:text-base my-2 underline"
         >
           Manage embedding →

@@ -20,6 +20,7 @@ import {
   Barcode,
   ClosedCaptioning,
   EyeSlash,
+  SplitVertical,
 } from "@phosphor-icons/react";
 import useUser from "@/hooks/useUser";
 import { USER_BACKGROUND_COLOR } from "@/utils/constants";

@@ -288,12 +289,25 @@ const SidebarOptions = ({ user = null }) => (
       allowedRole={["admin"]}
     />
     <Option
-      href={paths.settings.embeddingPreference()}
-      btnText="Embedding Model"
+      href={paths.settings.embedder.modelPreference()}
+      childLinks={[paths.settings.embedder.chunkingPreference()]}
+      btnText="Embedder Preferences"
       icon={<FileCode className="h-5 w-5 flex-shrink-0" />}
       user={user}
       flex={true}
       allowedRole={["admin"]}
+      subOptions={
+        <>
+          <Option
+            href={paths.settings.embedder.chunkingPreference()}
+            btnText="Text Splitter & Chunking"
+            icon={<SplitVertical className="h-5 w-5 flex-shrink-0" />}
+            user={user}
+            flex={true}
+            allowedRole={["admin"]}
+          />
+        </>
+      }
     />
     <Option
       href={paths.settings.vectorDatabase()}

New file: EmbeddingTextSplitterPreference settings page (@/pages/GeneralSettings/EmbeddingTextSplitterPreference)
@@ -0,0 +1,180 @@
import React, { useEffect, useState } from "react";
import Sidebar from "@/components/SettingsSidebar";
import { isMobile } from "react-device-detect";
import PreLoader from "@/components/Preloader";
import CTAButton from "@/components/lib/CTAButton";
import Admin from "@/models/admin";
import showToast from "@/utils/toast";
import { nFormatter, numberWithCommas } from "@/utils/numbers";

function isNullOrNaN(value) {
  if (value === null) return true;
  return isNaN(value);
}

export default function EmbeddingTextSplitterPreference() {
  const [settings, setSettings] = useState({});
  const [loading, setLoading] = useState(true);
  const [saving, setSaving] = useState(false);
  const [hasChanges, setHasChanges] = useState(false);

  const handleSubmit = async (e) => {
    e.preventDefault();
    const form = new FormData(e.target);

    if (
      Number(form.get("text_splitter_chunk_overlap")) >=
      Number(form.get("text_splitter_chunk_size"))
    ) {
      showToast(
        "Chunk overlap cannot be larger or equal to chunk size.",
        "error"
      );
      return;
    }

    setSaving(true);
    await Admin.updateSystemPreferences({
      text_splitter_chunk_size: isNullOrNaN(
        form.get("text_splitter_chunk_size")
      )
        ? 1000
        : Number(form.get("text_splitter_chunk_size")),
      text_splitter_chunk_overlap: isNullOrNaN(
        form.get("text_splitter_chunk_overlap")
      )
        ? 1000
        : Number(form.get("text_splitter_chunk_overlap")),
    });
    setSaving(false);
    setHasChanges(false);
    showToast("Text chunking strategy settings saved.", "success");
  };

  useEffect(() => {
    async function fetchSettings() {
      const _settings = (await Admin.systemPreferences())?.settings;
      setSettings(_settings ?? {});
      setLoading(false);
    }
    fetchSettings();
  }, []);

  return (
    <div className="w-screen h-screen overflow-hidden bg-sidebar flex">
      <Sidebar />
      {loading ? (
        <div
          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
        >
          <div className="w-full h-full flex justify-center items-center">
            <PreLoader />
          </div>
        </div>
      ) : (
        <div
          style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
          className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[16px] bg-main-gradient w-full h-full overflow-y-scroll"
        >
          <form
            onSubmit={handleSubmit}
            onChange={() => setHasChanges(true)}
            className="flex w-full"
          >
            <div className="flex flex-col w-full px-1 md:pl-6 md:pr-[50px] md:py-6 py-16">
              <div className="w-full flex flex-col gap-y-1 pb-4 border-white border-b-2 border-opacity-10">
                <div className="flex gap-x-4 items-center">
                  <p className="text-lg leading-6 font-bold text-white">
                    Text splitting & Chunking Preferences
                  </p>
                </div>
                <p className="text-xs leading-[18px] font-base text-white text-opacity-60">
                  Sometimes, you may want to change the default way that new
                  documents are split and chunked before being inserted into
                  your vector database. <br />
                  You should only modify this setting if you understand how text
                  splitting works and it's side effects.
                </p>
                <p className="text-xs leading-[18px] font-semibold text-white/80">
                  Changes here will only apply to{" "}
                  <i>newly embedded documents</i>, not existing documents.
                </p>
              </div>
              <div className="w-full justify-end flex">
                {hasChanges && (
                  <CTAButton className="mt-3 mr-0 -mb-14 z-10">
                    {saving ? "Saving..." : "Save changes"}
                  </CTAButton>
                )}
              </div>

              <div className="flex flex-col gap-y-4 mt-8">
                <div className="flex flex-col max-w-[300px]">
                  <div className="flex flex-col gap-y-2 mb-4">
                    <label className="text-white text-sm font-semibold block">
                      Text Chunk Size
                    </label>
                    <p className="text-xs text-white/60">
                      This is the maximum length of characters that can be
                      present in a single vector.
                    </p>
                  </div>
                  <input
                    type="number"
                    name="text_splitter_chunk_size"
                    min={1}
                    max={settings?.max_embed_chunk_size || 1000}
                    onWheel={(e) => e?.currentTarget?.blur()}
                    className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
                    placeholder="maximum length of vectorized text"
                    defaultValue={
                      isNullOrNaN(settings?.text_splitter_chunk_size)
                        ? 1000
                        : Number(settings?.text_splitter_chunk_size)
                    }
                    required={true}
                    autoComplete="off"
                  />
                  <p className="text-xs text-white/40">
                    Embed model maximum length is{" "}
                    {numberWithCommas(settings?.max_embed_chunk_size || 1000)}.
                  </p>
                </div>
              </div>

              <div className="flex flex-col gap-y-4 mt-8">
                <div className="flex flex-col max-w-[300px]">
                  <div className="flex flex-col gap-y-2 mb-4">
                    <label className="text-white text-sm font-semibold block">
                      Text Chunk Overlap
                    </label>
                    <p className="text-xs text-white/60">
                      This is the maximum overlap of characters that occurs
                      during chunking between two adjacent text chunks.
                    </p>
                  </div>
                  <input
                    type="number"
                    name="text_splitter_chunk_overlap"
                    min={0}
                    onWheel={(e) => e?.currentTarget?.blur()}
                    className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
                    placeholder="maximum length of vectorized text"
                    defaultValue={
                      isNullOrNaN(settings?.text_splitter_chunk_overlap)
                        ? 20
                        : Number(settings?.text_splitter_chunk_overlap)
                    }
                    required={true}
                    autoComplete="off"
                  />
                </div>
              </div>
            </div>
          </form>
        </div>
      )}
    </div>
  );
}
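
The submit handler's only explicit guard is that overlap must be strictly smaller than chunk size (the inputs additionally cap values through their min/max attributes). For illustration, a minimal standalone sketch of that check as a pure function; the function name is illustrative and not part of the commit:

// Sketch of the submit guard above: saving is refused when the requested
// overlap is greater than or equal to the requested chunk size.
function chunkSettingsError(chunkSize, chunkOverlap) {
  if (Number(chunkOverlap) >= Number(chunkSize)) {
    return "Chunk overlap cannot be larger or equal to chunk size.";
  }
  return null;
}

console.log(chunkSettingsError(1000, 20)); // null (valid)
console.log(chunkSettingsError(500, 500)); // error string (rejected)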

@@ -98,6 +98,10 @@ export default {
   transcriptionPreference: () => {
     return "/settings/transcription-preference";
   },
+  embedder: {
+    modelPreference: () => "/settings/embedding-preference",
+    chunkingPreference: () => "/settings/text-splitter-preference",
+  },
   embeddingPreference: () => {
     return "/settings/embedding-preference";
   },
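
The flat `embeddingPreference()` helper now sits alongside a nested `embedder` group, which the LMStudio/LocalAI options and the settings sidebar above switch to. A small standalone sketch of the resulting shape, trimmed to just these helpers:

// Sketch of the reorganized settings paths after this change.
const paths = {
  settings: {
    embedder: {
      modelPreference: () => "/settings/embedding-preference",
      chunkingPreference: () => "/settings/text-splitter-preference",
    },
    // The original flat helper is kept and still resolves to the same route.
    embeddingPreference: () => "/settings/embedding-preference",
  },
};

console.log(paths.settings.embedder.modelPreference()); // "/settings/embedding-preference"
console.log(paths.settings.embedder.chunkingPreference()); // "/settings/text-splitter-preference"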

@@ -8,7 +8,10 @@ const { User } = require("../models/user");
 const { DocumentVectors } = require("../models/vectors");
 const { Workspace } = require("../models/workspace");
 const { WorkspaceChats } = require("../models/workspaceChats");
-const { getVectorDbClass } = require("../utils/helpers");
+const {
+  getVectorDbClass,
+  getEmbeddingEngineSelection,
+} = require("../utils/helpers");
 const {
   validRoleSelection,
   canModifyAdmin,

@@ -311,6 +314,7 @@ function adminEndpoints(app) {
     }
   );

+  // TODO: Allow specification of which props to get instead of returning all of them all the time.
   app.get(
     "/admin/system-preferences",
     [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],

@@ -333,6 +337,16 @@ function adminEndpoints(app) {
           support_email:
             (await SystemSettings.get({ label: "support_email" }))?.value ||
             null,
+          text_splitter_chunk_size:
+            (await SystemSettings.get({ label: "text_splitter_chunk_size" }))
+              ?.value ||
+            getEmbeddingEngineSelection()?.embeddingMaxChunkLength ||
+            null,
+          text_splitter_chunk_overlap:
+            (await SystemSettings.get({ label: "text_splitter_chunk_overlap" }))
+              ?.value || null,
+          max_embed_chunk_size:
+            getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1000,
         };
         response.status(200).json({ settings });
       } catch (e) {
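
Reading the new fields off this endpoint: chunk size resolves to the stored setting, else the active embedder's max chunk length, else null; overlap resolves to the stored setting or null; and `max_embed_chunk_size` always reports the embedder max (default 1000) so the UI can cap its input. A standalone sketch of that fallback chain, with illustrative function and parameter names standing in for the calls above:

// Illustrative restatement of the fallback chain used for the new settings.
// storedSize/storedOverlap stand in for (await SystemSettings.get({ label }))?.value,
// embedderMax for getEmbeddingEngineSelection()?.embeddingMaxChunkLength.
function resolveChunkSettings({ storedSize, storedOverlap, embedderMax }) {
  return {
    text_splitter_chunk_size: storedSize || embedderMax || null,
    text_splitter_chunk_overlap: storedOverlap || null,
    max_embed_chunk_size: embedderMax || 1000,
  };
}

// Nothing saved yet and an OpenAI embedder active: size falls back to the model max.
console.log(resolveChunkSettings({ storedSize: null, storedOverlap: null, embedderMax: 8191 }));
// -> { text_splitter_chunk_size: 8191, text_splitter_chunk_overlap: null, max_embed_chunk_size: 8191 }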

@@ -5,6 +5,11 @@ process.env.NODE_ENV === "development"
 const { isValidUrl } = require("../utils/http");
 const prisma = require("../utils/prisma");

+function isNullOrNaN(value) {
+  if (value === null) return true;
+  return isNaN(value);
+}
+
 const SystemSettings = {
   protectedFields: ["multi_user_mode"],
   supportedFields: [

@@ -15,6 +20,8 @@ const SystemSettings = {
     "telemetry_id",
     "footer_data",
     "support_email",
+    "text_splitter_chunk_size",
+    "text_splitter_chunk_overlap",
   ],
   validations: {
     footer_data: (updates) => {

@@ -28,6 +35,32 @@ const SystemSettings = {
         return JSON.stringify([]);
       }
     },
+    text_splitter_chunk_size: (update) => {
+      try {
+        if (isNullOrNaN(update)) throw new Error("Value is not a number.");
+        if (Number(update) <= 0) throw new Error("Value must be non-zero.");
+        return Number(update);
+      } catch (e) {
+        console.error(
+          `Failed to run validation function on text_splitter_chunk_size`,
+          e.message
+        );
+        return 1000;
+      }
+    },
+    text_splitter_chunk_overlap: (update) => {
+      try {
+        if (isNullOrNaN(update)) throw new Error("Value is not a number");
+        if (Number(update) < 0) throw new Error("Value cannot be less than 0.");
+        return Number(update);
+      } catch (e) {
+        console.error(
+          `Failed to run validation function on text_splitter_chunk_overlap`,
+          e.message
+        );
+        return 20;
+      }
+    },
   },
   currentSettings: async function () {
     const llmProvider = process.env.LLM_PROVIDER;
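
These validators coerce whatever the admin endpoint receives into a safe number and fall back to the defaults (1000 for size, 20 for overlap) instead of throwing. A quick standalone sketch of the chunk-size validator's behavior, copied from the diff above with the logging omitted:

// Copy of the chunk-size validation behavior, minus the console.error call.
function isNullOrNaN(value) {
  if (value === null) return true;
  return isNaN(value);
}

function validateChunkSize(update) {
  try {
    if (isNullOrNaN(update)) throw new Error("Value is not a number.");
    if (Number(update) <= 0) throw new Error("Value must be non-zero.");
    return Number(update);
  } catch {
    return 1000; // default chunk size
  }
}

console.log(validateChunkSize("512")); // 512
console.log(validateChunkSize("abc")); // 1000 (not a number)
console.log(validateChunkSize(-5)); // 1000 (must be a positive, non-zero number)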

@@ -84,6 +117,15 @@ const SystemSettings = {
       }
     },
+
+  getValueOrFallback: async function (clause = {}, fallback = null) {
+    try {
+      return (await this.get(clause))?.value ?? fallback;
+    } catch (error) {
+      console.error(error.message);
+      return fallback;
+    }
+  },

   where: async function (clause = {}, limit) {
     try {
       const settings = await prisma.system_settings.findMany({
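
getValueOrFallback lets the vector DB providers later in this diff request a stored setting and supply an inline default in one call. A minimal standalone sketch of its semantics, with the database lookup replaced by an async stub (the stub is an assumption for illustration only):

// Sketch of the helper's semantics; `lookup` stands in for this.get(clause),
// which resolves to the stored system_settings row or null.
async function getValueOrFallback(lookup, fallback = null) {
  try {
    return (await lookup())?.value ?? fallback;
  } catch (error) {
    console.error(error.message);
    return fallback;
  }
}

getValueOrFallback(async () => null, 20).then(console.log); // 20 (nothing stored yet)
getValueOrFallback(async () => ({ value: 512 }), 20).then(console.log); // 512 (stored value wins)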

@@ -17,7 +17,9 @@ class AzureOpenAiEmbedder {
     // Limit of how many strings we can process in a single pass to stay with resource or network limits
     // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request
     this.maxConcurrentChunks = 16;
-    this.embeddingMaxChunkLength = 1_000;
+
+    // https://learn.microsoft.com/en-us/answers/questions/1188074/text-embedding-ada-002-token-context-length
+    this.embeddingMaxChunkLength = 2048;
   }

   async embedTextInput(textInput) {

@@ -13,7 +13,9 @@ class OpenAiEmbedder {

     // Limit of how many strings we can process in a single pass to stay with resource or network limits
     this.maxConcurrentChunks = 500;
-    this.embeddingMaxChunkLength = 1_000;
+
+    // https://platform.openai.com/docs/guides/embeddings/embedding-models
+    this.embeddingMaxChunkLength = 8_191;
   }

   async embedTextInput(textInput) {
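
With the corrected limits (2048 for the Azure ada-002 deployment, 8,191 for OpenAI), a user-preferred chunk size is later clamped by `TextSplitter.determineMaxChunkSize`, added further down in this diff. A simplified sketch of that arithmetic for the numeric case (the real method also handles null/NaN preferences):

// Simplified sketch of the clamp applied when a numeric preference is given.
const clampChunkSize = (preferred, embedderLimit) =>
  Math.min(Number(preferred), Number(embedderLimit));

console.log(clampChunkSize(10000, 8191)); // 8191 (preference exceeds the OpenAI max)
console.log(clampChunkSize(1000, 2048)); // 1000 (preference fits within the Azure max)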

New file: server/utils/TextSplitter/index.js (84 lines)
@@ -0,0 +1,84 @@
function isNullOrNaN(value) {
  if (value === null) return true;
  return isNaN(value);
}

class TextSplitter {
  #splitter;
  constructor(config = {}) {
    /*
      config can be a ton of things depending on what is required or optional by the specific splitter.
      Non-splitter related keys
      {
        splitByFilename: string, // TODO
      }
      ------
      Default: "RecursiveCharacterTextSplitter"
      Config: {
        chunkSize: number,
        chunkOverlap: number,
      }
      ------
    */
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }

  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  // Does a quick check to determine the text chunk length limit.
  // Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
  // so here we want to allow override of the default 1000, but up to the models maximum, which is
  // sometimes user defined.
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    const prefValue = isNullOrNaN(preferred)
      ? Number(embedderLimit)
      : Number(preferred);
    const limit = Number(embedderLimit);
    if (prefValue > limit)
      console.log(
        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
      );
    return prefValue > limit ? limit : prefValue;
  }

  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
    return new RecursiveSplitter({
      chunkSize: isNaN(config?.chunkSize) ? 1_000 : Number(config?.chunkSize),
      chunkOverlap: isNaN(config?.chunkOverlap)
        ? 20
        : Number(config?.chunkOverlap),
    });
  }

  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}

// Wrapper for Langchain default RecursiveCharacterTextSplitter class.
class RecursiveSplitter {
  constructor({ chunkSize, chunkOverlap }) {
    const {
      RecursiveCharacterTextSplitter,
    } = require("langchain/text_splitter");
    this.log(`Will split with`, { chunkSize, chunkOverlap });
    this.engine = new RecursiveCharacterTextSplitter({
      chunkSize,
      chunkOverlap,
    });
  }

  log(text, ...args) {
    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
  }

  async _splitText(documentText) {
    return this.engine.splitText(documentText);
  }
}

module.exports.TextSplitter = TextSplitter;
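
For reference, a minimal usage sketch of the new class (it assumes the snippet runs from inside `server/` so the relative require resolves, and that the `langchain` package is installed since the wrapper requires it):

// Usage sketch for the TextSplitter added above; the require path is an
// assumption about where this snippet lives relative to utils/TextSplitter.
const { TextSplitter } = require("./utils/TextSplitter");

async function demo() {
  // A preferred size above the embedder limit is clamped (with a WARN log).
  const chunkSize = TextSplitter.determineMaxChunkSize(1500, 1000); // -> 1000
  const splitter = new TextSplitter({ chunkSize, chunkOverlap: 20 });
  const chunks = await splitter.splitText("lorem ipsum ".repeat(500));
  console.log(`Produced ${chunks.length} chunks of at most ${chunkSize} characters.`);
}

demo();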

@@ -1,5 +1,5 @@
 const { AstraDB: AstraClient } = require("@datastax/astra-db-ts");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {

@@ -147,10 +147,17 @@ const AstraDB = {
       return { vectorized: true, error: null };
     }

-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);
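
The same replacement is repeated for every provider that follows (Chroma, LanceDB, Milvus, Pinecone, Qdrant, Weaviate, Zilliz): the splitter is now built from the saved system settings, with the chunk size clamped to the embedder's max and the overlap defaulting to 20. The shared construction, extracted into one helper for readability (the require paths assume a file under the provider directories, as in the hunks shown here):

// The splitter construction now shared by the vector DB providers in this commit.
const { TextSplitter } = require("../../TextSplitter");
const { getEmbeddingEngineSelection } = require("../../helpers");
const { SystemSettings } = require("../../../models/systemSettings");

async function buildTextSplitter() {
  return new TextSplitter({
    chunkSize: TextSplitter.determineMaxChunkSize(
      await SystemSettings.getValueOrFallback({ label: "text_splitter_chunk_size" }),
      getEmbeddingEngineSelection()?.embeddingMaxChunkLength
    ),
    chunkOverlap: await SystemSettings.getValueOrFallback(
      { label: "text_splitter_chunk_overlap" },
      20
    ),
  });
}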

@@ -1,5 +1,5 @@
 const { ChromaClient } = require("chromadb");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {

@@ -180,10 +180,17 @@ const Chroma = {
     // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -5,9 +5,10 @@ const {
   getEmbeddingEngineSelection,
 } = require("../../helpers");
 const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
+const { SystemSettings } = require("../../../models/systemSettings");

 const LanceDb = {
   uri: `${

@@ -180,10 +181,17 @@ const LanceDb = {
     // We have to do this manually as opposed to using LangChains `xyz.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -4,7 +4,7 @@ const {
   IndexType,
   MilvusClient,
 } = require("@zilliz/milvus2-sdk-node");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { v4: uuidv4 } = require("uuid");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const {

@@ -182,10 +182,17 @@ const Milvus = {
       return { vectorized: true, error: null };
     }

-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { Pinecone } = require("@pinecone-database/pinecone");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {

@@ -125,10 +125,17 @@ const PineconeDB = {
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
     // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { QdrantClient } = require("@qdrant/js-client-rest");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {

@@ -198,10 +198,17 @@ const QDrant = {
     // We have to do this manually as opposed to using LangChains `Qdrant.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -1,5 +1,5 @@
 const { default: weaviate } = require("weaviate-ts-client");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
 const {

@@ -241,10 +241,17 @@ const Weaviate = {
     // We have to do this manually as opposed to using LangChains `Chroma.fromDocuments`
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -4,7 +4,7 @@ const {
   IndexType,
   MilvusClient,
 } = require("@zilliz/milvus2-sdk-node");
-const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
+const { TextSplitter } = require("../../TextSplitter");
 const { v4: uuidv4 } = require("uuid");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const {

@@ -183,10 +183,17 @@ const Zilliz = {
       return { vectorized: true, error: null };
     }

-    const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize:
-        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
-      chunkOverlap: 20,
+    const textSplitter = new TextSplitter({
+      chunkSize: TextSplitter.determineMaxChunkSize(
+        await SystemSettings.getValueOrFallback({
+          label: "text_splitter_chunk_size",
+        }),
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength
+      ),
+      chunkOverlap: await SystemSettings.getValueOrFallback(
+        { label: "text_splitter_chunk_overlap" },
+        20
+      ),
     });
     const textChunks = await textSplitter.splitText(pageContent);