From b23cb1a90fd5ebe062a4567f1e68cd7200fbcac7 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Tue, 21 May 2024 14:43:39 -0500
Subject: [PATCH] Improve RAG results via chunkHeader append (#1473)

---
 .../ChatHistory/Citation/index.jsx            |  7 ++++-
 server/utils/TextSplitter/index.js            | 26 +++++++++++++++++--
 server/utils/vectorDbProviders/astra/index.js |  4 +++
 .../utils/vectorDbProviders/chroma/index.js   |  4 +++
 server/utils/vectorDbProviders/lance/index.js |  4 +++
 .../utils/vectorDbProviders/milvus/index.js   |  4 +++
 .../utils/vectorDbProviders/pinecone/index.js |  4 +++
 .../utils/vectorDbProviders/qdrant/index.js   |  4 +++
 .../utils/vectorDbProviders/weaviate/index.js |  4 +++
 .../utils/vectorDbProviders/zilliz/index.js   |  4 +++
 10 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx
index a3a579c95..de4c4f72d 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx
@@ -115,6 +115,11 @@ function SkeletonLine() {
   );
 }
 
+function omitChunkHeader(text) { // Strips one leading <document_metadata>...</document_metadata> header.
+  const match = /^<document_metadata>[\s\S]*?<\/document_metadata>([\s\S]*)$/.exec(text);
+  return match ? match[1].trim() : text; // no complete leading header: return the text untouched
+}
+
 function CitationDetailModal({ source, onClose }) {
   const { references, title, chunks } = source;
   const { isUrl, text: webpageUrl, href: linkTo } = parseChunkSource(source);
@@ -167,7 +172,7 @@ function CitationDetailModal({ source, onClose }) {
               <div key={idx} className="pt-6 text-white">
                 <div className="flex flex-col w-full justify-start pb-6 gap-y-1">
                   <p className="text-white whitespace-pre-line">
-                    {HTMLDecode(text)}
+                    {HTMLDecode(omitChunkHeader(text))}
                   </p>
 
                   {!!score && (
diff --git a/server/utils/TextSplitter/index.js b/server/utils/TextSplitter/index.js
index f79fb87fa..4162fa74e 100644
--- a/server/utils/TextSplitter/index.js
+++ b/server/utils/TextSplitter/index.js
@@ -17,6 +17,7 @@ class TextSplitter {
       Config: {
         chunkSize: number,
         chunkOverlap: number,
+        chunkHeaderMeta: object | null, // Gets prepended to each chunk as a metadata header
       }
       ------
     */
@@ -44,6 +45,18 @@ class TextSplitter {
     return prefValue > limit ? limit : prefValue;
   }
 
+  // Builds the <document_metadata> header from config.chunkHeaderMeta; null when nothing usable is set.
+  stringifyHeader() {
+    const meta = this.config.chunkHeaderMeta;
+    if (!meta) return null;
+    const lines = Object.entries(meta)
+      .filter(([key, value]) => key && value) // falsy keys/values are skipped, never printed
+      .map(([key, value]) => `${key}: ${value}\n`);
+
+    if (lines.length === 0) return null;
+    return `<document_metadata>\n${lines.join("")}</document_metadata>\n\n`;
+  }
+
   #setSplitter(config = {}) {
     // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
     return new RecursiveSplitter({
@@ -51,6 +64,7 @@ class TextSplitter {
       chunkOverlap: isNaN(config?.chunkOverlap)
         ? 20
         : Number(config?.chunkOverlap),
+      chunkHeader: this.stringifyHeader(),
     });
   }
 
@@ -61,11 +75,12 @@ class TextSplitter {
 
 // Wrapper for Langchain default RecursiveCharacterTextSplitter class.
 class RecursiveSplitter {
-  constructor({ chunkSize, chunkOverlap }) {
+  constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
     const {
       RecursiveCharacterTextSplitter,
     } = require("@langchain/textsplitters");
     this.log(`Will split with`, { chunkSize, chunkOverlap });
+    this.chunkHeader = chunkHeader;
     this.engine = new RecursiveCharacterTextSplitter({
       chunkSize,
       chunkOverlap,
@@ -77,7 +92,14 @@ class RecursiveSplitter {
   }
 
   async _splitText(documentText) {
-    return this.engine.splitText(documentText);
+    if (!this.chunkHeader) return this.engine.splitText(documentText); // no header configured: return the plain split
+    const strings = await this.engine.splitText(documentText);
+    const documents = await this.engine.createDocuments(strings, [], {
+      chunkHeader: this.chunkHeader, // handed to createDocuments as its chunkHeader option
+    });
+    return documents
+      .filter((doc) => !!doc.pageContent) // drop documents whose pageContent is empty
+      .map((doc) => doc.pageContent); // callers receive plain strings in both branches
   }
 }
 
diff --git a/server/utils/vectorDbProviders/astra/index.js b/server/utils/vectorDbProviders/astra/index.js
index 50e8ba34c..30ff2bbff 100644
--- a/server/utils/vectorDbProviders/astra/index.js
+++ b/server/utils/vectorDbProviders/astra/index.js
@@ -157,6 +157,10 @@ const AstraDB = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
index d17883b7e..90956a949 100644
--- a/server/utils/vectorDbProviders/chroma/index.js
+++ b/server/utils/vectorDbProviders/chroma/index.js
@@ -200,6 +200,10 @@ const Chroma = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
index db2662954..54c12c045 100644
--- a/server/utils/vectorDbProviders/lance/index.js
+++ b/server/utils/vectorDbProviders/lance/index.js
@@ -198,6 +198,10 @@ const LanceDb = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js
index 273092331..d720c2657 100644
--- a/server/utils/vectorDbProviders/milvus/index.js
+++ b/server/utils/vectorDbProviders/milvus/index.js
@@ -192,6 +192,10 @@ const Milvus = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
index 9b68ef1b5..d1aeb2f64 100644
--- a/server/utils/vectorDbProviders/pinecone/index.js
+++ b/server/utils/vectorDbProviders/pinecone/index.js
@@ -143,6 +143,10 @@ const PineconeDB = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js
index e8511d0b5..ff55c06f6 100644
--- a/server/utils/vectorDbProviders/qdrant/index.js
+++ b/server/utils/vectorDbProviders/qdrant/index.js
@@ -217,6 +217,10 @@ const QDrant = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js
index f19329a45..978e2557a 100644
--- a/server/utils/vectorDbProviders/weaviate/index.js
+++ b/server/utils/vectorDbProviders/weaviate/index.js
@@ -259,6 +259,10 @@ const Weaviate = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);
 
diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js
index a7ee04388..ebb59157d 100644
--- a/server/utils/vectorDbProviders/zilliz/index.js
+++ b/server/utils/vectorDbProviders/zilliz/index.js
@@ -193,6 +193,10 @@ const Zilliz = {
           { label: "text_splitter_chunk_overlap" },
           20
         ),
+        chunkHeaderMeta: {
+          sourceDocument: metadata?.title,
+          published: metadata?.published || "unknown",
+        },
       });
       const textChunks = await textSplitter.splitText(pageContent);