mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-04-17 18:18:11 +00:00
Support XLSX files (#2403)
* support xlsx files * lint * create separate docs for each xlsx sheet * lint * use node-xlsx pkg for parsing xlsx files * lint * update error handling --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
93d64642f3
commit
b658f5012d
4 changed files with 132 additions and 1 deletions
collector
|
@ -33,6 +33,7 @@
|
|||
"mime": "^3.0.0",
|
||||
"moment": "^2.29.4",
|
||||
"node-html-parser": "^6.1.13",
|
||||
"node-xlsx": "^0.24.0",
|
||||
"officeparser": "^4.0.5",
|
||||
"openai": "4.38.5",
|
||||
"pdf-parse": "^1.1.1",
|
||||
|
@ -48,4 +49,4 @@
|
|||
"nodemon": "^2.0.22",
|
||||
"prettier": "^2.4.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
113
collector/processSingleFile/convert/asXlsx.js
Normal file
113
collector/processSingleFile/convert/asXlsx.js
Normal file
|
@ -0,0 +1,113 @@
|
|||
const { v4 } = require("uuid");
|
||||
const xlsx = require("node-xlsx").default;
|
||||
const path = require("path");
|
||||
const fs = require("fs");
|
||||
const {
|
||||
createdDate,
|
||||
trashFile,
|
||||
writeToServerDocuments,
|
||||
} = require("../../utils/files");
|
||||
const { tokenizeString } = require("../../utils/tokenizer");
|
||||
const { default: slugify } = require("slugify");
|
||||
|
||||
/**
 * Serializes a two-dimensional array of cell values into CSV text.
 *
 * Rows are joined with "\n" and cells with ",". `null`/`undefined` cells become
 * empty fields. Any cell whose text contains a comma, a double quote, or a line
 * break is wrapped in double quotes with embedded quotes doubled (RFC 4180), so
 * a single cell can never spill into neighbouring columns or rows.
 *
 * @param {Array<Array<*>>} data - Rows of cell values.
 * @returns {string} CSV text; an empty string when `data` is not an array or has no rows.
 */
function convertToCSV(data) {
  // A missing/invalid data payload is treated as an empty sheet rather than a crash.
  if (!Array.isArray(data)) return "";

  const escapeCell = (cell) => {
    if (cell === null || cell === undefined) return "";
    const text = String(cell);
    // Quote when the value contains a delimiter, a quote, or a line break;
    // embedded quotes are escaped by doubling them.
    if (/[",\r\n]/.test(text)) return `"${text.replace(/"/g, '""')}"`;
    return text;
  };

  return data
    .map((row) => (Array.isArray(row) ? row : []).map(escapeCell).join(","))
    .join("\n");
}
|
||||
|
||||
/**
 * Converts an uploaded .xlsx workbook into one document per non-empty sheet.
 *
 * Each sheet is rendered to CSV text via `convertToCSV`, wrapped in a metadata
 * record and handed to `writeToServerDocuments`, targeting a per-upload folder
 * under the documents storage directory. A sheet that is empty or fails to
 * convert is logged and skipped; the remaining sheets are still processed.
 * `trashFile` is always invoked on the uploaded file, whether or not the
 * conversion succeeds.
 *
 * @param {Object} params
 * @param {string} [params.fullFilePath=""] - Path of the uploaded workbook on disk.
 * @param {string} [params.filename=""] - Original file name, used for titles and the folder slug.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success` is false (with a `reason`) when the workbook cannot be processed
 *   or no sheet produced a document.
 */
async function asXlsx({ fullFilePath = "", filename = "" }) {
  const documents = [];

  try {
    // Short random suffix keeps repeated uploads of the same file in separate folders.
    const folderName = slugify(
      `${path.basename(filename)}-${v4().slice(0, 4)}`,
      {
        lower: true,
        trim: true,
      }
    );

    // Resolved inside the try block: path.resolve throws when STORAGE_DIR is
    // unset, and that failure must still be reported to the caller and still
    // reach the `finally` cleanup below.
    const outFolderPath =
      process.env.NODE_ENV === "development"
        ? path.resolve(
            __dirname,
            `../../../server/storage/documents/${folderName}`
          )
        : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

    const workSheetsFromFile = xlsx.parse(fullFilePath);
    if (!fs.existsSync(outFolderPath))
      fs.mkdirSync(outFolderPath, { recursive: true });

    for (const sheet of workSheetsFromFile) {
      // Destructured outside the per-sheet try so `name` remains in scope for
      // the catch handler's log message.
      const { name, data } = sheet;

      try {
        const content = convertToCSV(data);

        if (!content?.length) {
          console.warn(`Sheet "${name}" is empty. Skipping.`);
          continue;
        }

        console.log(`-- Processing sheet: ${name} --`);
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
          title: `${filename} - Sheet:${name}`,
          docAuthor: "Unknown",
          description: `Spreadsheet data from sheet: ${name}`,
          docSource: "an xlsx file uploaded by the user.",
          chunkSource: "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
          token_count_estimate: tokenizeString(content).length,
        };

        const document = writeToServerDocuments(
          sheetData,
          `sheet-${slugify(name)}`,
          outFolderPath
        );
        documents.push(document);
        console.log(
          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
        );
      } catch (err) {
        // One bad sheet must not abort the rest of the workbook.
        console.error(`Error processing sheet "${name}":`, err);
        continue;
      }
    }
  } catch (err) {
    console.error("Could not process xlsx file!", err);
    return {
      success: false,
      reason: `Error processing ${filename}: ${err.message}`,
      documents: [],
    };
  } finally {
    // Runs on every exit path, including the early return in the catch above.
    trashFile(fullFilePath);
  }

  if (documents.length === 0) {
    console.error(`No valid sheets found in ${filename}.`);
    return {
      success: false,
      reason: `No valid sheets found in ${filename}.`,
      documents: [],
    };
  }

  console.log(
    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
  );
  return { success: true, reason: null, documents };
}
|
||||
|
||||
module.exports = asXlsx;
|
|
@ -11,6 +11,10 @@ const ACCEPTED_MIMES = {
|
|||
".pptx",
|
||||
],
|
||||
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [
|
||||
".xlsx",
|
||||
],
|
||||
|
||||
"application/vnd.oasis.opendocument.text": [".odt"],
|
||||
"application/vnd.oasis.opendocument.presentation": [".odp"],
|
||||
|
||||
|
@ -41,6 +45,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
|
|||
".odt": "./convert/asOfficeMime.js",
|
||||
".odp": "./convert/asOfficeMime.js",
|
||||
|
||||
".xlsx": "./convert/asXlsx.js",
|
||||
|
||||
".mbox": "./convert/asMbox.js",
|
||||
|
||||
".epub": "./convert/asEPub.js",
|
||||
|
|
|
@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13:
|
|||
css-select "^5.1.0"
|
||||
he "1.2.0"
|
||||
|
||||
node-xlsx@^0.24.0:
|
||||
version "0.24.0"
|
||||
resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f"
|
||||
integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg==
|
||||
dependencies:
|
||||
xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"
|
||||
|
||||
nodemailer@6.9.13:
|
||||
version "6.9.13"
|
||||
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6"
|
||||
|
@ -3528,6 +3535,10 @@ ws@8.14.2:
|
|||
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f"
|
||||
integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==
|
||||
|
||||
"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz":
|
||||
version "0.20.2"
|
||||
resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d"
|
||||
|
||||
xml2js@^0.6.2:
|
||||
version "0.6.2"
|
||||
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"
|
||||
|
|
Loading…
Add table
Reference in a new issue