anything-llm/server/utils/files/index.js
Sean Hatfield ae510619f0
Purge cached docs and remove docs from all workspaces on vectorDB/embedder changes ()
* wip remove all docs clear vector db on embedder/vector db change

* purge all cached docs and remove docs from workspaces on vectordb/embedder change

* lint

* remove unneeded console log

* remove reset vector stores endpoint and move to server side updateENV with postUpdate check

* reset embed module

* remove unused import

* simplify deletion process
rescoped document deletion to be more general for speed, everything needs to be reset anyway
fixed issue where unembedded docs not in any workspaces, but cached, were not removed

* add back missing readme file
update warning text modals

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
2024-12-16 12:16:20 -08:00

307 lines
10 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
const { Document } = require("../../models/documents");
const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
// Absolute path of the `documents` storage folder. Outside of development it
// is resolved from the STORAGE_DIR environment variable; in development it is
// resolved relative to this file's directory.
const documentsPath =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `documents`)
    : path.resolve(__dirname, `../../storage/documents`);

// Absolute path of the `vector-cache` storage folder, resolved with the same
// development / non-development rule as `documentsPath`.
const vectorCachePath =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `vector-cache`)
    : path.resolve(__dirname, `../../storage/vector-cache`);
/**
 * Reads a stored document from the documents folder and returns its parsed JSON.
 * The path is expected to be relative to the documents folder,
 * eg: youtube-subject/video-123.json
 * @param {string|null} filePath - path of the document relative to the documents folder
 * @returns {Promise<any|null>} the parsed JSON content, or null when the file does not
 * exist or resolves to a location outside of the documents folder
 * @throws {Error} when no path is provided
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const resolvedPath = path.resolve(documentsPath, normalizePath(filePath));
  const isReadable =
    fs.existsSync(resolvedPath) && isWithin(documentsPath, resolvedPath);
  if (!isReadable) return null;

  return JSON.parse(fs.readFileSync(resolvedPath, "utf8"));
}
/**
 * Builds the folder/file tree of the documents storage folder.
 * Creates the documents folder when it does not exist yet. Only directories
 * found directly inside it are listed (top-level `.md` entries and plain files
 * are skipped), and only the `.json` files inside each directory become items.
 * Each item carries the stored metadata (without `pageContent`) plus
 * `cached`, `canWatch`, `pinnedWorkspaces` and `watched`.
 * @returns {Promise<{name: string, type: string, items: object[]}>} the root
 * "documents" folder, with `custom-documents` listed first when present
 */
async function viewLocalFiles() {
  if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  const liveSyncAvailable = await DocumentSyncQueue.enabled();
  const folders = [];

  for (const entry of fs.readdirSync(documentsPath)) {
    if (path.extname(entry) === ".md") continue;

    const folderPath = path.resolve(documentsPath, entry);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const items = [];
    // Maps `folder/file.json` (the key used for cache and DB lookups) to the bare filename.
    const filenames = {};

    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;

      const cachefilename = `${entry}/${subfile}`;
      const rawData = fs.readFileSync(path.join(folderPath, subfile), "utf8");
      const { pageContent, ...metadata } = JSON.parse(rawData);

      items.push({
        name: subfile,
        type: "file",
        ...metadata,
        cached: await cachedVectorInformation(cachefilename, true),
        canWatch: liveSyncAvailable
          ? DocumentSyncQueue.canWatch(metadata)
          : false,
      });
      filenames[cachefilename] = subfile;
    }

    // Resolve pinned/watched state once per folder instead of querying once per file.
    const pinnedByFilename = await getPinnedWorkspacesByDocument(filenames);
    const watchedByFilename = await getWatchedDocumentFilenames(filenames);
    for (const item of items) {
      item.pinnedWorkspaces = pinnedByFilename[item.name] || [];
      item.watched = Object.prototype.hasOwnProperty.call(
        watchedByFilename,
        item.name
      );
    }

    folders.push({ name: entry, type: "folder", items });
  }

  // custom-documents must always be the first folder in the picker.
  const isCustomDocuments = (folder) => folder.name === "custom-documents";
  const customDocuments = folders.find(isCustomDocuments);
  const otherFolders = folders.filter((folder) => !isCustomDocuments(folder));

  return {
    name: "documents",
    type: "folder",
    items: customDocuments ? [customDocuments, ...otherFolders] : otherFolders,
  };
}
/**
 * Looks in the vector-cache folder for previously stored vector data of a document
 * so it does not have to be embedded again and can be pushed directly to the vector db.
 * @param {string|null} filename - the filename to check for cached vector information
 * @param {boolean} checkOnly - if true, only report whether a cache file exists and do not read it
 * @returns {Promise<boolean|{exists: boolean, chunks: any[]}>} - when `checkOnly` is true, a boolean
 * telling whether a cache file exists; otherwise an object with the existence flag and the cached chunks
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) {
    if (checkOnly) return false;
    return { exists: false, chunks: [] };
  }

  // Cache files are named after the UUIDv5 digest of the document filename.
  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  const isCached = fs.existsSync(cacheFile);
  if (checkOnly) return isCached;
  if (!isCached) return { exists: false, chunks: [] };

  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  return {
    exists: true,
    chunks: JSON.parse(fs.readFileSync(cacheFile, "utf8")),
  };
}
/**
 * Writes the vectorized data of a document to the vector-cache folder so the
 * same document does not need to be embedded again. The vector-cache folder is
 * created when it does not exist yet.
 * @param {any[]} vectorData - pre-chunked vectorized data for the file; it is stored as JSON as-is
 * @param {string|null} filename - full path to the doc; its UUIDv5 digest names the cache file,
 * so cached matches are found by filename. Nothing is written when it is empty.
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;

  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );
  if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);

  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  fs.writeFileSync(cacheFile, JSON.stringify(vectorData), "utf8");
}
/**
 * Removes a single file from the documents/ folder.
 * Nothing happens when the name is empty, or when the resolved path does not
 * exist, is not inside the documents folder, or is not a regular file.
 * @param {string|null} filename - path of the document relative to the documents folder
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;

  const target = path.resolve(documentsPath, normalizePath(filename));
  const isPurgeable =
    fs.existsSync(target) &&
    isWithin(documentsPath, target) &&
    fs.lstatSync(target).isFile();
  if (!isPurgeable) return;

  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(target);
}
/**
 * Removes the cache file of a single document from the vector-cache/ folder.
 * Nothing happens when the name is empty or when no regular cache file exists for it.
 * @param {string|null} filename - document filename whose UUIDv5 digest names the cache file
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;

  const cacheFile = path.resolve(
    vectorCachePath,
    `${uuidv5(filename, uuidv5.URL)}.json`
  );
  const isCacheFile =
    fs.existsSync(cacheFile) && fs.lstatSync(cacheFile).isFile();
  if (!isCacheFile) return;

  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(cacheFile);
}
/**
 * Looks for a document by its unique name across the whole `documents` folder by
 * visiting every sub-folder and checking whether the expected file exists in it.
 * The first match wins.
 * @param {string|null} documentName - file name of the document to look for
 * @returns {Promise<object|null>} the stored metadata (without `pageContent`) extended with
 * `name`, `type: "file"` and the `cached` flag, or null when no folder holds the document
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  for (const folder of fs.readdirSync(documentsPath)) {
    const folderPath = path.join(documentsPath, folder);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const targetFilename = normalizePath(documentName);
    const candidate = path.join(documentsPath, folder, targetFilename);
    const isUsable =
      fs.existsSync(candidate) && isWithin(documentsPath, candidate);
    if (!isUsable) continue;

    const rawDocument = fs.readFileSync(candidate, "utf8");
    const { pageContent, ...metadata } = JSON.parse(rawDocument);
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(
        `${folder}/${targetFilename}`,
        true
      ),
    };
  }

  return null;
}
/**
 * Checks if a given path is strictly within another path.
 * The comparison is purely lexical (no filesystem access, symlinks are not followed).
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 * The outer path itself is never considered to be within itself.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);

  // Both paths point at the same location (e.g. they only differ by a trailing separator).
  if (rel === "") return false;

  // On Windows a path on another drive yields an absolute "relative" path.
  if (path.isAbsolute(rel)) return false;

  // Compare against the platform separator: Windows yields `..\` rather than `../`,
  // which a hard-coded "../" check would let through.
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}
/**
 * Normalizes a user supplied path and strips every leading parent-directory
 * reference (`../` or `..\`) from it.
 * @param {string} filepath - the path to normalize
 * @returns {string} the normalized path without leading parent-directory references
 * @throws {Error} when the result is "..", "." or "/"
 */
function normalizePath(filepath = "") {
  const normalized = path.normalize(filepath.trim());
  const result = normalized.replace(/^(\.\.(\/|\\|$))+/, "").trim();

  const isInvalid = result === ".." || result === "." || result === "/";
  if (isInvalid) throw new Error("Invalid path.");

  return result;
}
/**
 * Tells whether the vector-cache folder currently holds any `.json` cache file.
 * Useful when the user is changing embedders, as this will break the previous cache.
 * @returns {boolean} true when at least one `.json` file is present; false when there
 * is none or the folder cannot be read
 */
function hasVectorCachedFiles() {
  try {
    return fs
      .readdirSync(vectorCachePath)
      .some((name) => name.endsWith(".json"));
  } catch {
    // A folder that cannot be read (e.g. it does not exist) counts as "no cached files".
    return false;
  }
}
/**
 * Looks up which workspaces have pinned the given documents.
 * @param {Record<string, string>} filenames - record mapping a document docpath
 * (`folder/file.json`) to its bare filename; the keys are used for the lookup and
 * the values become the keys of the result
 * @returns {Promise<Record<string, string[]>>} - a record of filenames and the distinct
 * workspaceIds that have pinned them
 */
async function getPinnedWorkspacesByDocument(filenames = []) {
  const pinnedRows = await Document.where(
    {
      docpath: { in: Object.keys(filenames) },
      pinned: true,
    },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );

  const workspacesByFilename = {};
  for (const { workspaceId, docpath } of pinnedRows) {
    const filename = filenames[docpath];
    if (!workspacesByFilename[filename]) workspacesByFilename[filename] = [];

    const workspaceIds = workspacesByFilename[filename];
    if (!workspaceIds.includes(workspaceId)) workspaceIds.push(workspaceId);
  }
  return workspacesByFilename;
}
/**
 * Looks up which of the given documents are watched in a workspace.
 * @param {Record<string, string>} filenames - record mapping a document docpath
 * (`folder/file.json`) to its bare filename; the keys are used for the lookup and
 * the values become the keys of the result
 * @returns {Promise<Record<string, string>>} - a record of watched filenames and the workspaceId
 * of a workspace watching them (when several rows match a file, the last one wins)
 */
async function getWatchedDocumentFilenames(filenames = []) {
  const watchedRows = await Document.where(
    {
      docpath: { in: Object.keys(filenames) },
      watched: true,
    },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );

  const workspaceByFilename = {};
  for (const { workspaceId, docpath } of watchedRows) {
    workspaceByFilename[filenames[docpath]] = workspaceId;
  }
  return workspaceByFilename;
}
/**
 * Deletes the vector-cache folder with everything inside it and then
 * creates it again as an empty folder.
 * @returns {void}
 */
function purgeEntireVectorCache() {
  // `force` keeps the removal from throwing when the folder does not exist.
  const removalOptions = { recursive: true, force: true };
  fs.rmSync(vectorCachePath, removalOptions);
  fs.mkdirSync(vectorCachePath);
}
// Public API of this module: document lookup/listing helpers, vector-cache
// helpers, the path-safety helpers and the resolved documents folder path.
// `vectorCachePath` and the pinned/watched lookup helpers are not exported.
module.exports = {
findDocumentInDocuments,
cachedVectorInformation,
viewLocalFiles,
purgeSourceDocument,
purgeVectorCache,
storeVectorResult,
fileData,
normalizePath,
isWithin,
documentsPath,
hasVectorCachedFiles,
purgeEntireVectorCache,
};