LanceDB support ()

* add start of LanceDB support

* lancedb initial support

* add null method for deletion of documents from namespace, since LanceDB does not support it; show warning modal on frontend for this

* update .env.example and lancedb methods for sourcing

* change export method

* update readme
Timothy Carambat 2023-06-08 18:40:29 -07:00 committed by GitHub
parent 93ee4349ba
commit ad15e1f9b6
8 changed files with 360 additions and 23 deletions
README.md
frontend/src/components/Modals
server


@@ -2,7 +2,8 @@
[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/tim.svg?style=social&label=Follow%20%40Timothy%20Carambat)](https://twitter.com/tcarambat) [![](https://dcbadge.vercel.app/api/server/6UyHPeGZAC?compact=true&style=flat)](https://discord.gg/6UyHPeGZAC)
A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports Pinecone & ChromaDB for vector storage and OpenAI for chatting.
A full-stack application and tool suite that enables you to turn any document, resource, or piece of content into a piece of data that any LLM can use as reference during chatting. This application runs with very minimal overhead, as by default the LLM and vectorDB are hosted remotely, but can be swapped for local instances. Currently this project supports [Pinecone](https://pinecone.io), [ChromaDB](https://trychroma.com) & more for vector storage and [OpenAI](https://openai.com) for LLM/chatting.
![Chatting](/images/screenshots/chat.png)
[view more screenshots](/images/screenshots/SCREENSHOTS.md)
@@ -38,8 +39,8 @@ This monorepo consists of three main sections:
- `yarn` and `node` on your machine
- `python` 3.8+ for running scripts in `collector/`.
- access to an LLM like `GPT-3.5`, `GPT-4`*.
- a [Pinecone.io](https://pinecone.io) free account* **or** Local Chroma instance running.
*you can use drop-in replacements for these. This is just the easiest to get up and running fast.
- a [Pinecone.io](https://pinecone.io) free account*.
*you can use drop-in replacements for these. This is just the easiest to get up and running fast. We support multiple vector database providers.
### How to get started
- `yarn setup` from the project root directory.


@@ -14,6 +14,7 @@ import { nFormatter } from "../../utils/numbers";
import { dollarFormat } from "../../utils/numbers";
import paths from "../../utils/paths";
import { useParams } from "react-router-dom";
import { titleCase } from "text-case";
const noop = () => false;
export default function ManageWorkspace({ hideModal = noop, workspace }) {
@@ -24,15 +25,19 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) {
const [directories, setDirectories] = useState(null);
const [originalDocuments, setOriginalDocuments] = useState([]);
const [selectedFiles, setSelectFiles] = useState([]);
const [vectordb, setVectorDB] = useState(null);
const [showingNoRemovalModal, setShowingNoRemovalModal] = useState(false);
useEffect(() => {
async function fetchKeys() {
const _workspace = await Workspace.bySlug(workspace.slug);
const localFiles = await System.localFiles();
const settings = await System.keys();
const originalDocs = _workspace.documents.map((doc) => doc.docpath) || [];
setDirectories(localFiles);
setOriginalDocuments([...originalDocs]);
setSelectFiles([...originalDocs]);
setVectorDB(settings?.VectorDB);
setLoading(false);
}
fetchKeys();
@@ -97,11 +102,25 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) {
: selectedFiles.some((doc) => doc.includes(filepath));
};
const isOriginalDoc = (filepath) => {
const isFolder = !filepath.includes("/");
return isFolder
? originalDocuments.some((doc) => doc.includes(filepath.split("/")[0]))
: originalDocuments.some((doc) => doc.includes(filepath));
};
const toggleSelection = (filepath) => {
const isFolder = !filepath.includes("/");
const parent = isFolder ? filepath : filepath.split("/")[0];
if (isSelected(filepath)) {
// Certain vector DBs do not contain the ability to delete vectors
// so we cannot remove from these. The user will have to clear the entire workspace.
if (["lancedb"].includes(vectordb) && isOriginalDoc(filepath)) {
setShowingNoRemovalModal(true);
return false;
}
const updatedDocs = isFolder
? selectedFiles.filter((doc) => !doc.includes(parent))
: selectedFiles.filter((doc) => !doc.includes(filepath));
@@ -168,6 +187,12 @@ export default function ManageWorkspace({ hideModal = noop, workspace }) {
updateWorkspace={updateWorkspace}
/>
)}
{showingNoRemovalModal && (
<CannotRemoveModal
hideModal={() => setShowingNoRemovalModal(false)}
vectordb={vectordb}
/>
)}
<div className="fixed top-0 left-0 right-0 z-50 w-full p-4 overflow-x-hidden overflow-y-auto md:inset-0 h-[calc(100%-1rem)] h-full bg-black bg-opacity-50 flex items-center justify-center">
<div
className="flex fixed top-0 left-0 right-0 w-full h-full"
@@ -463,6 +488,42 @@
);
}
function CannotRemoveModal({ hideModal, vectordb }) {
return (
<dialog
open={true}
style={{ zIndex: 100 }}
className="fixed top-0 flex bg-black bg-opacity-50 w-[100vw] h-full items-center justify-center "
>
<div className="px-10 p-4 w-1/2 rounded-lg bg-white shadow dark:bg-stone-700 text-black dark:text-slate-200">
<div className="flex flex-col w-full">
<p className="text-lg font-semibold text-red-500">
You cannot remove this document!
</p>
<div className="flex flex-col gap-y-1">
<p className="text-base mt-4">
{titleCase(vectordb)} does not support atomic removal of
documents.
<br />
Unfortunately, you will have to delete the entire workspace to stop this
document from being referenced.
</p>
</div>
<div className="flex w-full justify-center items-center mt-4">
<button
onClick={hideModal}
className="text-gray-800 hover:bg-gray-100 px-4 py-1 rounded-lg dark:text-slate-200 dark:hover:bg-stone-900"
>
I Understand
</button>
</div>
</div>
</div>
</dialog>
);
}
export function useManageWorkspaceModal() {
const [showing, setShowing] = useState(false);
const showModal = () => {


@@ -13,6 +13,9 @@ PINECONE_ENVIRONMENT=
PINECONE_API_KEY=
PINECONE_INDEX=
# Enable all below if you are using vector database: LanceDB.
# VECTOR_DB="lancedb"
# CLOUD DEPLOYMENT VARIABLES ONLY
# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
# STORAGE_DIR= # absolute filesystem path with no trailing slash
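For reference, a minimal sketch of a LanceDB configuration. Variable names are taken from this file and the provider code; the key value is a placeholder:

VECTOR_DB="lancedb"
OPEN_AI_KEY="sk-..." # used for both embeddings and chat completions
STORAGE_DIR= # optional; vectors are written to <STORAGE_DIR>/lancedb, otherwise ./lancedb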

server/.gitignore

@@ -4,4 +4,5 @@ documents/*
vector-cache/*.json
!documents/DOCUMENTS.md
logs/server.log
*.db
lancedb


@@ -29,7 +29,8 @@
"slugify": "^1.6.6",
"sqlite": "^4.2.1",
"sqlite3": "^5.1.6",
"uuid": "^9.0.0"
"uuid": "^9.0.0",
"vectordb": "0.1.5-beta"
},
"devDependencies": {
"nodemon": "^2.0.22",


@@ -65,14 +65,6 @@ const Chroma = {
modelName: model,
});
},
chatLLM: function () {
const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
return new ChatOpenAI({
openAIApiKey: process.env.OPEN_AI_KEY,
temperature: 0.7,
modelName: model,
});
},
embedChunk: async function (openai, textChunk) {
const {
data: { data },
@@ -274,16 +266,6 @@ const Chroma = {
};
}
// const collection = await client.getCollection({ name: namespace, embeddingFunction: this.embeddingFunc() })
// const results = await collection.get({
// where: {
// description: 'a custom file uploaded by the user.'
// },
// includes: ['ids']
// })
// console.log(results)
// return { response: null, sources: [], }
const vectorStore = await ChromaStore.fromExistingCollection(
this.embedder(),
{ collectionName: namespace, url: process.env.CHROMA_ENDPOINT }


@@ -1,6 +1,7 @@
function getVectorDbClass() {
const { Pinecone } = require("../pinecone");
const { Chroma } = require("../chroma");
const { LanceDb } = require("../lancedb");
const vectorSelection = process.env.VECTOR_DB || "pinecone";
switch (vectorSelection) {
@@ -8,6 +9,8 @@ function getVectorDbClass() {
return Pinecone;
case "chroma":
return Chroma;
case "lancedb":
return LanceDb;
default:
throw new Error("ENV: No VECTOR_DB value found in environment!");
}
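Every provider returned here is expected to expose the same method surface (connect, addDocumentToNamespace, query, and so on), so callers never branch on the backend. A minimal, illustrative sketch of a caller; the workspace slug and question are hypothetical:

const VectorDb = getVectorDbClass();
// With VECTOR_DB="lancedb" this resolves to LanceDb; the call shape is the
// same for Pinecone and Chroma.
const { response, sources } = await VectorDb.query({
  namespace: "my-workspace", // hypothetical workspace slug
  input: "What does this document cover?",
});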


@@ -0,0 +1,285 @@
const lancedb = require("vectordb");
const { toChunks } = require("../helpers");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { storeVectorResult, cachedVectorInformation } = require("../files");
const { Configuration, OpenAIApi } = require("openai");
const { v4: uuidv4 } = require("uuid");
// Since we roll our own results for prompting we
// have to manually curate sources as well.
function curateLanceSources(sources = []) {
const knownDocs = [];
const documents = [];
for (const source of sources) {
const { text: _t, vector: _v, score: _s, ...metadata } = source;
if (
Object.keys(metadata).length > 0 &&
!knownDocs.includes(metadata.title)
) {
documents.push({ ...metadata });
knownDocs.push(metadata.title);
}
}
return documents;
}
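// Illustrative example (not part of the commit): two hits from the same
// document collapse into one curated source, with the search-only fields
// (text, vector, score) stripped away.
//   curateLanceSources([
//     { text: "chunk A", vector: [0.1], score: 0.91, title: "intro.txt", url: "file://intro.txt" },
//     { text: "chunk B", vector: [0.2], score: 0.88, title: "intro.txt", url: "file://intro.txt" },
//   ]);
//   // => [{ title: "intro.txt", url: "file://intro.txt" }]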
const LanceDb = {
uri: `${!!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./"}lancedb`,
name: "LanceDb",
connect: async function () {
if (process.env.VECTOR_DB !== "lancedb")
throw new Error("LanceDB::Invalid ENV settings");
const client = await lancedb.connect(this.uri);
return { client };
},
heartbeat: async function () {
await this.connect();
return { heartbeat: Number(new Date()) };
},
totalIndicies: async function () {
return 0; // Unsupported for LanceDB - so always zero
},
embeddingFunc: function () {
return new lancedb.OpenAIEmbeddingFunction(
"context",
process.env.OPEN_AI_KEY
);
},
embedder: function () {
return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
},
openai: function () {
const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
const openai = new OpenAIApi(config);
return openai;
},
embedChunk: async function (openai, textChunk) {
const {
data: { data },
} = await openai.createEmbedding({
model: "text-embedding-ada-002",
input: textChunk,
});
return data.length > 0 && data[0].hasOwnProperty("embedding")
? data[0].embedding
: null;
},
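// e.g. `await this.embedChunk(this.openai(), "hello world")` resolves to a
// 1536-dimension float array (the output size of text-embedding-ada-002),
// or null when the API response carries no embedding.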
getChatCompletion: async function (openai, messages = []) {
const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
const { data } = await openai.createChatCompletion({
model,
messages,
});
if (!data.hasOwnProperty("choices")) return null;
return data.choices[0].message.content;
},
namespace: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.openTable(namespace).catch(() => false);
if (!collection) return null;
return {
...collection,
};
},
updateOrCreateCollection: async function (client, data = [], namespace) {
if (await this.hasNamespace(namespace)) {
const collection = await client.openTable(namespace);
const result = await collection.add(data);
console.log({ result });
return true;
}
const result = await client.createTable(namespace, data);
console.log({ result });
return true;
},
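// Note (assumption about the vectordb client): when the table does not yet
// exist, createTable(namespace, data) derives the schema from the first batch
// of records (id, vector, plus flattened metadata keys), so later adds are
// expected to match that shape.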
hasNamespace: async function (namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
const exists = await this.namespaceExists(client, namespace);
return exists;
},
namespaceExists: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collections = await client.tableNames();
return collections.includes(namespace);
},
deleteVectorsInNamespace: async function (client, namespace = null) {
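// LanceDB persists each table as a `<uri>/<table>.lance` directory on disk,
// so dropping a namespace is a recursive directory removal rather than a
// client API call.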
const fs = require("fs");
fs.rm(`${client.uri}/${namespace}.lance`, { recursive: true }, () => null);
return true;
},
deleteDocumentFromNamespace: async function (_namespace, _docId) {
console.error(
`LanceDB:deleteDocumentFromNamespace - unsupported operation. No changes made to vector db.`
);
return false;
},
addDocumentToNamespace: async function (
namespace,
documentData = {},
fullFilePath = null
) {
const { DocumentVectors } = require("../../models/vectors");
try {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
const { client } = await this.connect();
const { chunks } = cacheResult;
const documentVectors = [];
const submissions = [];
for (const chunk of chunks) {
chunk.forEach((chunk) => {
const id = uuidv4();
const { id: _id, ...metadata } = chunk.metadata;
documentVectors.push({ docId, vectorId: id });
submissions.push({ id: id, vector: chunk.values, ...metadata });
});
}
console.log(submissions);
await this.updateOrCreateCollection(client, submissions, namespace);
await DocumentVectors.bulkInsert(documentVectors);
return true;
}
// If we are here then we are going to embed and store a novel document.
// We have to do this manually as opposed to using LangChains `xyz.fromDocuments`
// because we then cannot atomically control our namespace to granularly find/remove documents
// from vectordb.
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 20,
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Chunks created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const submissions = [];
const openai = this.openai();
for (const textChunk of textChunks) {
const vectorValues = await this.embedChunk(openai, textChunk);
if (!!vectorValues) {
const vectorRecord = {
id: uuidv4(),
values: vectorValues,
// [DO NOT REMOVE]
// LangChain will be unable to find your text if you embed manually and don't include the `text` key.
// https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
metadata: { ...metadata, text: textChunk },
};
vectors.push(vectorRecord);
submissions.push({
id: vectorRecord.id,
vector: vectorRecord.values,
...vectorRecord.metadata,
});
documentVectors.push({ docId, vectorId: vectorRecord.id });
} else {
console.error(
"Could not use OpenAI to embed document chunk! This document will not be recorded."
);
}
}
if (vectors.length > 0) {
const chunks = [];
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
console.log("Inserting vectorized chunks into LanceDB collection.");
const { client } = await this.connect();
await this.updateOrCreateCollection(client, submissions, namespace);
await storeVectorResult(chunks, fullFilePath);
}
await DocumentVectors.bulkInsert(documentVectors);
return true;
} catch (e) {
console.error("addDocumentToNamespace", e.message);
return false;
}
},
query: async function (reqBody = {}) {
const { namespace = null, input } = reqBody;
if (!namespace || !input) throw new Error("Invalid request body");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
response: null,
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
// LanceDB does not have langchainJS support so we roll our own here.
const queryVector = await this.embedChunk(this.openai(), input);
const collection = await client.openTable(namespace);
const relevantResults = await collection
.search(queryVector)
.metricType("cosine")
.limit(2)
.execute();
const messages = [
{
role: "system",
content: `The following is a friendly conversation between a human and an AI. The AI is very casual and talkative and responds with a friendly tone. If the AI does not know the answer to a question, it truthfully says it does not know.
Relevant pieces of information for context of the current query:
${relevantResults.map((result) => result.text).join("\n\n")}`,
},
{ role: "user", content: input },
];
const responseText = await this.getChatCompletion(this.openai(), messages);
return {
response: responseText,
sources: curateLanceSources(relevantResults),
message: false,
};
},
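// Illustrative call (the workspace slug is hypothetical):
//   await LanceDb.query({ namespace: "my-workspace", input: "What is this doc about?" });
//   // => { response: "<completion text>", sources: [{ title, ... }], message: false }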
"namespace-stats": async function (reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
throw new Error("Namespace by that name does not exist.");
const stats = await this.namespace(client, namespace);
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
throw new Error("Namespace by that name does not exist.");
await this.deleteVectorsInNamespace(client, namespace);
return {
message: `Namespace ${namespace} was deleted.`,
};
},
reset: async function () {
const { client } = await this.connect();
const fs = require("fs");
fs.rm(`${client.uri}`, { recursive: true }, () => null);
return { reset: true };
},
};
module.exports.LanceDb = LanceDb;
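
Taken together, a rough smoke test of the new provider might look like the sketch below. It assumes VECTOR_DB="lancedb" and OPEN_AI_KEY are set in the environment; the require path, workspace slug, document payload, and file path are all hypothetical, and addDocumentToNamespace handles chunking, embedding, and table creation internally.

const { LanceDb } = require("./utils/lancedb"); // assumed path to this file

async function smokeTest() {
  console.log(await LanceDb.heartbeat()); // connects and returns a timestamp

  // Hypothetical payload: pageContent gets chunked and embedded; the other
  // keys ride along as metadata (title feeds source curation).
  await LanceDb.addDocumentToNamespace(
    "demo-workspace",
    {
      pageContent: "AnythingLLM turns documents into chat context.",
      docId: "doc-1",
      title: "intro.txt",
    },
    "custom-documents/intro.txt"
  );

  const { response, sources } = await LanceDb.query({
    namespace: "demo-workspace",
    input: "What does AnythingLLM do?",
  });
  console.log({ response, sources });
}

smokeTest();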