Mirror of https://github.com/Mintplex-Labs/anything-llm.git (synced 2025-04-17 18:18:11 +00:00)
feat: add support for variable chunk length (#415)
fix: clean up code for embedding-length clarity; resolves #388
commit 8cc1455b72 (parent 48dd99b028)
15 changed files with 89 additions and 24 deletions

@@ -47,6 +47,7 @@ GID='1000'
 # EMBEDDING_ENGINE='localai'
 # EMBEDDING_BASE_PATH='https://localhost:8080/v1'
 # EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max chunk size in chars a string to embed can be

 ###########################################
 ######## Vector Database Selection ########

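For reference, the new variable is opt-in in both the Docker and server .env examples: leaving it commented out keeps the default of 1,000 characters per embedded string. A minimal sketch of enabling it, assuming (for illustration only) a LocalAI embedding model that can encode at most roughly 512 characters per input:

EMBEDDING_ENGINE='localai'
EMBEDDING_BASE_PATH='https://localhost:8080/v1'
EMBEDDING_MODEL_PREF='text-embedding-ada-002'
# Cap each string sent to the embedder at 512 characters (assumed model limit).
EMBEDDING_MODEL_MAX_CHUNK_LENGTH=512
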
@@ -30,6 +30,22 @@ export default function LocalAiOptions({ settings }) {
         />
       </div>
       <LocalAIModelSelection settings={settings} basePath={basePath} />
+      <div className="flex flex-col w-60">
+        <label className="text-white text-sm font-semibold block mb-4">
+          Max embedding chunk length
+        </label>
+        <input
+          type="number"
+          name="EmbeddingModelMaxChunkLength"
+          className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
+          placeholder="1000"
+          min={1}
+          onScroll={(e) => e.target.blur()}
+          defaultValue={settings?.EmbeddingModelMaxChunkLength}
+          required={false}
+          autoComplete="off"
+        />
+      </div>
     </>
   );
 }

@@ -44,6 +44,7 @@ JWT_SECRET="my-random-string-for-seeding" # Please generate random string at lea
 # EMBEDDING_ENGINE='localai'
 # EMBEDDING_BASE_PATH='https://localhost:8080/v1'
 # EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_MODEL_MAX_CHUNK_LENGTH=1000 # The max chunk size in chars a string to embed can be

 ###########################################
 ######## Vector Database Selection ########

@@ -27,6 +27,8 @@ const SystemSettings = {
       EmbeddingEngine: process.env.EMBEDDING_ENGINE,
       EmbeddingBasePath: process.env.EMBEDDING_BASE_PATH,
       EmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
+      EmbeddingModelMaxChunkLength:
+        process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
       ...(vectorDB === "pinecone"
         ? {
             PineConeEnvironment: process.env.PINECONE_ENVIRONMENT,

@@ -16,7 +16,7 @@ class AzureOpenAiEmbedder {

     // The maximum amount of "inputs" that OpenAI API can process in a single call.
     // https://learn.microsoft.com/en-us/azure/ai-services/openai/faq#i-am-trying-to-use-embeddings-and-received-the-error--invalidrequesterror--too-many-inputs--the-max-number-of-inputs-is-1---how-do-i-fix-this-:~:text=consisting%20of%20up%20to%2016%20inputs%20per%20API%20request
-    this.embeddingChunkLimit = 16;
+    this.embeddingMaxChunkLength = 16;
   }

   async embedTextInput(textInput) {

@@ -34,9 +34,9 @@ class AzureOpenAiEmbedder {

     // Because there is a limit on how many chunks can be sent at once to Azure OpenAI
     // we concurrently execute each max batch of text chunks possible.
-    // Refer to constructor embeddingChunkLimit for more info.
+    // Refer to constructor embeddingMaxChunkLength for more info.
     const embeddingRequests = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       embeddingRequests.push(
         new Promise((resolve) => {
           this.openai

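To make the batching concrete, here is a small illustrative sketch (not part of the commit) that reuses the toChunks helper shown further down in this diff. With the Azure limit of 16 inputs per request, 40 text chunks split into batches of 16, 16, and 8, i.e. three concurrent calls to the embeddings endpoint.

// Illustrative sketch only; toChunks is copied verbatim from the helpers file in this diff.
const toChunks = (arr, size) =>
  Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) =>
    arr.slice(i * size, i * size + size)
  );

const textChunks = Array.from({ length: 40 }, (_, i) => `text chunk ${i}`);
const embeddingMaxChunkLength = 16; // Azure's per-request input limit

// Batch sizes: [16, 16, 8] -> three concurrent embedding requests.
console.log(toChunks(textChunks, embeddingMaxChunkLength).map((batch) => batch.length));
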
@@ -1,4 +1,4 @@
-const { toChunks } = require("../../helpers");
+const { toChunks, maximumChunkLength } = require("../../helpers");

 class LocalAiEmbedder {
   constructor() {

@@ -12,8 +12,8 @@ class LocalAiEmbedder {
     });
     this.openai = new OpenAIApi(config);

-    // Arbitrary limit to ensure we stay within reasonable POST request size.
-    this.embeddingChunkLimit = 1_000;
+    // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size.
+    this.embeddingMaxChunkLength = maximumChunkLength();
   }

   async embedTextInput(textInput) {

@@ -23,7 +23,7 @@ class LocalAiEmbedder {

   async embedChunks(textChunks = []) {
     const embeddingRequests = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       embeddingRequests.push(
         new Promise((resolve) => {
           this.openai

@@ -4,6 +4,7 @@ const { toChunks } = require("../../helpers");

 class NativeEmbedder {
   constructor() {
+    // Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
     this.model = "Xenova/all-MiniLM-L6-v2";
     this.cacheDir = path.resolve(
       process.env.STORAGE_DIR

@@ -12,8 +13,8 @@ class NativeEmbedder {
     );
     this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");

-    // Limit the number of chunks to send per loop to not overload compute.
-    this.embeddingChunkLimit = 16;
+    // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size.
+    this.embeddingMaxChunkLength = 1_000;

     // Make directory when it does not exist in existing installations
     if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);

@@ -62,7 +63,7 @@ class NativeEmbedder {
   async embedChunks(textChunks = []) {
     const Embedder = await this.embedderClient();
     const embeddingResults = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       const output = await Embedder(chunk, {
         pooling: "mean",
         normalize: true,

@@ -10,8 +10,8 @@ class OpenAiEmbedder {
     const openai = new OpenAIApi(config);
     this.openai = openai;

-    // Arbitrary limit to ensure we stay within reasonable POST request size.
-    this.embeddingChunkLimit = 1_000;
+    // Arbitrary limit of string size in chars to ensure we stay within reasonable POST request size.
+    this.embeddingMaxChunkLength = 1_000;
   }

   async embedTextInput(textInput) {

@@ -22,9 +22,9 @@ class OpenAiEmbedder {
   async embedChunks(textChunks = []) {
     // Because there is a hard POST limit on how many chunks can be sent at once to OpenAI (~8mb)
     // we concurrently execute each max batch of text chunks possible.
-    // Refer to constructor embeddingChunkLimit for more info.
+    // Refer to constructor embeddingMaxChunkLength for more info.
     const embeddingRequests = [];
-    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+    for (const chunk of toChunks(textChunks, this.embeddingMaxChunkLength)) {
       embeddingRequests.push(
         new Promise((resolve) => {
           this.openai

@@ -70,6 +70,20 @@ function getEmbeddingEngineSelection() {
   }
 }

+// Some models have lower restrictions on chars that can be encoded in a single pass
+// and by default we assume it can handle 1,000 chars, but some models use work with smaller
+// chars so here we can override that value when embedding information.
+function maximumChunkLength() {
+  if (
+    !!process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH &&
+    !isNaN(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) &&
+    Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) > 1
+  )
+    return Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH);
+
+  return 1_000;
+}
+
 function toChunks(arr, size) {
   return Array.from({ length: Math.ceil(arr.length / size) }, (_v, i) =>
     arr.slice(i * size, i * size + size)

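A quick, self-contained sketch of how maximumChunkLength resolves (illustrative, not part of the commit): values that are missing, non-numeric, or not greater than 1 fall back to the 1,000-character default.

// The function body is copied from the helper added above.
function maximumChunkLength() {
  if (
    !!process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH &&
    !isNaN(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) &&
    Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH) > 1
  )
    return Number(process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH);

  return 1_000;
}

process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH = "512";
console.log(maximumChunkLength()); // 512

process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH = "not-a-number";
console.log(maximumChunkLength()); // 1000 (non-numeric, falls back to the default)

process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH = "1";
console.log(maximumChunkLength()); // 1000 (must be greater than 1)
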
@@ -78,6 +92,7 @@ function toChunks(arr, size) {

 module.exports = {
   getEmbeddingEngineSelection,
+  maximumChunkLength,
   getVectorDbClass,
   getLLMProvider,
   toChunks,

@@ -90,6 +90,10 @@ const KEY_MAPPING = {
     envKey: "EMBEDDING_MODEL_PREF",
     checks: [isNotEmpty],
   },
+  EmbeddingModelMaxChunkLength: {
+    envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
+    checks: [nonZero],
+  },

   // Vector Database Selection Settings
   VectorDB: {

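The nonZero validator itself is not part of this diff. A plausible sketch of what such a check might look like, assuming (unconfirmed by this commit) that KEY_MAPPING checks return an error string on failure and null when the value is acceptable:

// Hypothetical implementation; the real nonZero helper is not shown in this diff.
const nonZero = (input) => {
  if (Number(input) <= 0) return "Value must be greater than zero.";
  return null;
};

nonZero("1000"); // null -> accepted
nonZero("0");    // "Value must be greater than zero."
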
@@ -2,7 +2,11 @@ const { ChromaClient } = require("chromadb");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");

 const Chroma = {
   name: "Chroma",

@@ -175,7 +179,8 @@ const Chroma = {
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
     const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize: 1000,
+      chunkSize:
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
       chunkOverlap: 20,
     });
     const textChunks = await textSplitter.splitText(pageContent);

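The same chunkSize change is repeated verbatim in the LanceDB, Pinecone, QDrant, and Weaviate providers below. A rough sketch of the effect (illustrative only; the embedder object stands in for whatever getEmbeddingEngineSelection() returns, e.g. an embedder whose embeddingMaxChunkLength now comes from EMBEDDING_MODEL_MAX_CHUNK_LENGTH):

const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");

// Stand-in for getEmbeddingEngineSelection(); assumed to expose embeddingMaxChunkLength.
const embedder = { embeddingMaxChunkLength: 512 };

const textSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: embedder?.embeddingMaxChunkLength || 1_000, // 512 here; 1,000 when no embedder is selected
  chunkOverlap: 20,
});

(async () => {
  const pageContent = "Some long document text. ".repeat(500);
  const textChunks = await textSplitter.splitText(pageContent);
  // Every chunk is at most ~512 characters, so each string sent to the embedder
  // stays within the configured limit.
  console.log(textChunks.length, Math.max(...textChunks.map((c) => c.length)));
})();
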
@@ -1,5 +1,9 @@
 const lancedb = require("vectordb");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");

@@ -176,7 +180,8 @@ const LanceDb = {
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
     const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize: 1000,
+      chunkSize:
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
       chunkOverlap: 20,
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -2,7 +2,11 @@ const { PineconeClient } = require("@pinecone-database/pinecone");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");

 const Pinecone = {
   name: "Pinecone",

@@ -130,7 +134,8 @@ const Pinecone = {
     // from vectordb.
     // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L167
     const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize: 1000,
+      chunkSize:
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
       chunkOverlap: 20,
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -2,7 +2,11 @@ const { QdrantClient } = require("@qdrant/js-client-rest");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");

 const QDrant = {
   name: "QDrant",

@@ -174,7 +178,8 @@ const QDrant = {
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
     const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize: 1000,
+      chunkSize:
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
       chunkOverlap: 20,
     });
     const textChunks = await textSplitter.splitText(pageContent);

@@ -2,7 +2,11 @@ const { default: weaviate } = require("weaviate-ts-client");
 const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
 const { storeVectorResult, cachedVectorInformation } = require("../../files");
 const { v4: uuidv4 } = require("uuid");
-const { toChunks, getLLMProvider } = require("../../helpers");
+const {
+  toChunks,
+  getLLMProvider,
+  getEmbeddingEngineSelection,
+} = require("../../helpers");
 const { camelCase } = require("../../helpers/camelcase");

 const Weaviate = {

@@ -237,7 +241,8 @@ const Weaviate = {
     // because we then cannot atomically control our namespace to granularly find/remove documents
     // from vectordb.
     const textSplitter = new RecursiveCharacterTextSplitter({
-      chunkSize: 1000,
+      chunkSize:
+        getEmbeddingEngineSelection()?.embeddingMaxChunkLength || 1_000,
      chunkOverlap: 20,
     });
     const textChunks = await textSplitter.splitText(pageContent);