mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-04-17 18:18:11 +00:00
Employ strict validations on document pathing (#627)
* Employ strict validations on document pathing * add comment * update validSubfolder var
This commit is contained in:
parent
0db6c3b2aa
commit
8a7324d0e7
2 changed files with 96 additions and 88 deletions
server/utils/files
|
@ -1,37 +1,29 @@
|
|||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { v5: uuidv5 } = require("uuid");
|
||||
// Absolute path of the folder holding source documents.
// In development it resolves relative to this module (../../storage/documents);
// otherwise it resolves beneath process.env.STORAGE_DIR, which is passed to
// path.resolve directly with no fallback, so it must be set outside development.
const documentsPath =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/documents`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `documents`);
|
||||
// Absolute path of the folder holding cached vector results.
// Resolved with the same development / STORAGE_DIR rule as documentsPath.
const vectorCachePath =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/vector-cache`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `vector-cache`);
|
||||
|
||||
/**
 * Reads and parses a JSON document stored beneath the documents folder.
 *
 * @param {string|null} filePath - Path relative to the documents folder,
 *   eg: youtube-subject/video-123.json
 * @returns {Promise<object|null>} The parsed JSON content, or null when the
 *   resolved path escapes the documents folder, does not exist, or is not a
 *   regular file.
 * @throws {Error} When no path is provided. Errors raised by normalizePath
 *   or JSON.parse are not caught here and propagate to the caller.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");

  const fullFilePath = path.resolve(documentsPath, normalizePath(filePath));

  // Check containment first so nothing outside the documents folder is ever
  // probed on disk; the isFile() check keeps directories out of readFileSync.
  if (
    !isWithin(documentsPath, fullFilePath) ||
    !fs.existsSync(fullFilePath) ||
    !fs.lstatSync(fullFilePath).isFile()
  )
    return null;

  const data = fs.readFileSync(fullFilePath, "utf8");
  return JSON.parse(data);
}
|
||||
|
||||
async function viewLocalFiles() {
|
||||
const folder =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/documents`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `documents`);
|
||||
const dirExists = fs.existsSync(folder);
|
||||
if (!dirExists) fs.mkdirSync(folder);
|
||||
if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
|
||||
|
||||
const directory = {
|
||||
name: "documents",
|
||||
|
@ -39,14 +31,9 @@ async function viewLocalFiles() {
|
|||
items: [],
|
||||
};
|
||||
|
||||
for (const file of fs.readdirSync(folder)) {
|
||||
for (const file of fs.readdirSync(documentsPath)) {
|
||||
if (path.extname(file) === ".md") continue;
|
||||
|
||||
const folderPath =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/documents/${file}`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `documents/${file}`);
|
||||
|
||||
const folderPath = path.resolve(documentsPath, file);
|
||||
const isFolder = fs.lstatSync(folderPath).isDirectory();
|
||||
if (isFolder) {
|
||||
const subdocs = {
|
||||
|
@ -83,10 +70,7 @@ async function cachedVectorInformation(filename = null, checkOnly = false) {
|
|||
if (!filename) return checkOnly ? false : { exists: false, chunks: [] };
|
||||
|
||||
const digest = uuidv5(filename, uuidv5.URL);
|
||||
const file =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/vector-cache/${digest}.json`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `vector-cache/${digest}.json`);
|
||||
const file = path.resolve(vectorCachePath, `${digest}.json`);
|
||||
const exists = fs.existsSync(file);
|
||||
|
||||
if (checkOnly) return exists;
|
||||
|
@ -106,15 +90,10 @@ async function storeVectorResult(vectorData = [], filename = null) {
|
|||
console.log(
|
||||
`Caching vectorized results of ${filename} to prevent duplicated embedding.`
|
||||
);
|
||||
const folder =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/vector-cache`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `vector-cache`);
|
||||
|
||||
if (!fs.existsSync(folder)) fs.mkdirSync(folder);
|
||||
if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);
|
||||
|
||||
const digest = uuidv5(filename, uuidv5.URL);
|
||||
const writeTo = path.resolve(folder, `${digest}.json`);
|
||||
const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
|
||||
fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
|
||||
return;
|
||||
}
|
||||
|
@ -122,21 +101,16 @@ async function storeVectorResult(vectorData = [], filename = null) {
|
|||
/**
 * Purges a file from the documents/ folder.
 *
 * Does nothing when the name is empty, or when the resolved target is outside
 * the documents folder, does not exist, or is not a regular file.
 *
 * @param {string|null} filename - Path of the document relative to the
 *   documents folder.
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;

  const filePath = path.resolve(documentsPath, normalizePath(filename));

  // Containment is checked before any filesystem probe; lstat (not stat) is
  // used so a symlink is never treated as a removable regular file.
  if (
    !isWithin(documentsPath, filePath) ||
    !fs.existsSync(filePath) ||
    !fs.lstatSync(filePath).isFile()
  )
    return;

  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
|
||||
|
@ -144,15 +118,11 @@ async function purgeSourceDocument(filename = null) {
|
|||
/**
 * Purges a vector-cache file from the vector-cache/ folder.
 *
 * The cache file name is the UUIDv5 digest (URL namespace) of `filename`
 * with a `.json` extension, so the caller-supplied string never becomes part
 * of the path itself.
 *
 * @param {string|null} filename - Document name the cache entry was keyed by.
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;

  const digest = uuidv5(filename, uuidv5.URL);
  const filePath = path.resolve(vectorCachePath, `${digest}.json`);

  // Only log and delete when there is an actual regular file to remove.
  if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return;

  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
|
||||
|
@ -161,24 +131,20 @@ async function purgeVectorCache(filename = null) {
|
|||
// folder via iteration of all folders and checking if the expected file exists.
|
||||
async function findDocumentInDocuments(documentName = null) {
|
||||
if (!documentName) return null;
|
||||
const documentsFolder =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/documents`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `documents`);
|
||||
|
||||
for (const folder of fs.readdirSync(documentsFolder)) {
|
||||
for (const folder of fs.readdirSync(documentsPath)) {
|
||||
const isFolder = fs
|
||||
.lstatSync(path.join(documentsFolder, folder))
|
||||
.lstatSync(path.join(documentsPath, folder))
|
||||
.isDirectory();
|
||||
if (!isFolder) continue;
|
||||
|
||||
const targetFilename = normalizePath(documentName);
|
||||
const targetFileLocation = path.join(
|
||||
documentsFolder,
|
||||
folder,
|
||||
targetFilename
|
||||
);
|
||||
if (!fs.existsSync(targetFileLocation)) continue;
|
||||
const targetFileLocation = path.join(documentsPath, folder, targetFilename);
|
||||
|
||||
if (
|
||||
!fs.existsSync(targetFileLocation) ||
|
||||
!isWithin(documentsPath, targetFileLocation)
|
||||
)
|
||||
continue;
|
||||
|
||||
const fileData = fs.readFileSync(targetFileLocation, "utf8");
|
||||
const cachefilename = `${folder}/${targetFilename}`;
|
||||
|
@ -194,8 +160,25 @@ async function findDocumentInDocuments(documentName = null) {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
 * Checks if a given path is strictly within another path.
 *
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is a descendant of the
 *   outer path; false if it is the same location or lies outside of it.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;

  const rel = path.relative(outer, inner);

  // An empty result means both arguments point at the same location (for
  // example they differ only by a trailing separator). An absolute result
  // means there is no relative route at all (different drive on Windows).
  if (rel === "" || path.isAbsolute(rel)) return false;

  // Compare against the platform separator so `..\x` on Windows is caught
  // while a legitimate entry such as `..foo` is still accepted.
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}
|
||||
|
||||
/**
 * Normalizes a caller-supplied relative path and strips any leading
 * parent-directory (`../` or `..\`) segments from it.
 *
 * @param {string} filepath - The untrusted path to normalize.
 * @returns {string} The trimmed, normalized path without leading `..`
 *   segments. May be an empty string (for example for an input of `..`).
 * @throws {Error} "Invalid path." when the input is not a string or the
 *   result is `..`, `.` or `/`. An empty input normalizes to `.` and is
 *   therefore rejected as well.
 */
function normalizePath(filepath = "") {
  // Non-string input (null, numbers, objects) gets the same error as any
  // other invalid path instead of a TypeError from calling .trim() on it.
  if (typeof filepath !== "string") throw new Error("Invalid path.");

  const result = path
    .normalize(filepath.trim())
    .replace(/^(\.\.(\/|\\|$))+/, "")
    .trim();

  if (["..", ".", "/"].includes(result)) throw new Error("Invalid path.");
  return result;
}
|
||||
|
||||
module.exports = {
|
||||
|
@ -207,4 +190,6 @@ module.exports = {
|
|||
storeVectorResult,
|
||||
fileData,
|
||||
normalizePath,
|
||||
isWithin,
|
||||
documentsPath,
|
||||
};
|
||||
|
|
|
@ -1,30 +1,53 @@
|
|||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { purgeVectorCache, purgeSourceDocument, normalizePath } = require(".");
|
||||
const {
|
||||
purgeVectorCache,
|
||||
purgeSourceDocument,
|
||||
normalizePath,
|
||||
isWithin,
|
||||
documentsPath,
|
||||
} = require(".");
|
||||
const { Document } = require("../../models/documents");
|
||||
const { Workspace } = require("../../models/workspace");
|
||||
|
||||
/**
 * Removes a document everywhere it is referenced: its vector-cache entry,
 * its source file, and its record in every workspace.
 *
 * @param {string|null} filename - Path of the document relative to the
 *   documents folder.
 * @returns {Promise<void>} Resolves without doing anything when the name is
 *   empty or normalizes to an empty path.
 */
async function purgeDocument(filename = null) {
  // normalizePath may also reject the name outright by throwing; that error
  // is intentionally not caught here.
  if (!filename || !normalizePath(filename)) return;

  await purgeVectorCache(filename);
  await purgeSourceDocument(filename);

  const workspaces = await Workspace.where();
  for (const workspace of workspaces) {
    await Document.removeDocuments(workspace, [filename]);
  }
  return;
}
|
||||
|
||||
async function purgeFolder(folderName) {
|
||||
if (folderName === "custom-documents") return;
|
||||
const documentsFolder =
|
||||
process.env.NODE_ENV === "development"
|
||||
? path.resolve(__dirname, `../../storage/documents`)
|
||||
: path.resolve(process.env.STORAGE_DIR, `documents`);
|
||||
async function purgeFolder(folderName = null) {
|
||||
if (!folderName) return;
|
||||
const subFolder = normalizePath(folderName);
|
||||
const subFolderPath = path.resolve(documentsPath, subFolder);
|
||||
const validRemovableSubFolders = fs
|
||||
.readdirSync(documentsPath)
|
||||
.map((folder) => {
|
||||
// Filter out any results which are not folders or
|
||||
// are the protected custom-documents folder.
|
||||
if (folder === "custom-documents") return null;
|
||||
const subfolderPath = path.resolve(documentsPath, folder);
|
||||
if (!fs.lstatSync(subfolderPath).isDirectory()) return null;
|
||||
return folder;
|
||||
})
|
||||
.filter((subFolder) => !!subFolder);
|
||||
|
||||
if (
|
||||
!validRemovableSubFolders.includes(subFolder) ||
|
||||
!fs.existsSync(subFolderPath) ||
|
||||
!isWithin(documentsPath, subFolderPath)
|
||||
)
|
||||
return;
|
||||
|
||||
const folderPath = path.resolve(documentsFolder, normalizePath(folderName));
|
||||
const filenames = fs
|
||||
.readdirSync(folderPath)
|
||||
.map((file) => path.join(folderPath, file));
|
||||
.readdirSync(subFolderPath)
|
||||
.map((file) => path.join(subFolderPath, file));
|
||||
const workspaces = await Workspace.where();
|
||||
|
||||
const purgePromises = [];
|
||||
|
@ -47,7 +70,7 @@ async function purgeFolder(folderName) {
|
|||
}
|
||||
|
||||
await Promise.all(purgePromises.flat().map((f) => f()));
|
||||
fs.rmSync(folderPath, { recursive: true }); // Delete root document and source files.
|
||||
fs.rmSync(subFolderPath, { recursive: true }); // Delete target document-folder and source files.
|
||||
|
||||
return;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue