mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-03-13 13:42:23 +00:00
191 lines
5.7 KiB
JavaScript
191 lines
5.7 KiB
JavaScript
|
const fs = require("fs");
|
||
|
const os = require("os");
|
||
|
const path = require("path");
|
||
|
const NodeCanvasFactory = require("./CanvasFactory");
|
||
|
|
||
|
/**
 * OCR loader for scanned PDFs.
 *
 * Each page is rendered to an image through pdf.js (using the project's
 * NodeCanvasFactory) and the image is handed to a pool of tesseract.js
 * workers for English text recognition.
 */
class OCRLoader {
  /**
   * Resolves the directory that is passed to tesseract.js as its `cachePath`.
   * Uses `$STORAGE_DIR/models/tesseract` when STORAGE_DIR is set, otherwise a
   * path relative to this file inside the server's storage folder.
   */
  constructor() {
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
        : path.resolve(__dirname, `../../../server/storage/models/tesseract`)
    );
  }

  /**
   * Writes a message to stdout prefixed with a cyan `[OCRLoader]` tag.
   * @param {string} text - Message to print.
   * @param {...any} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args);
  }

  /**
   * Loads a PDF file and returns an array of documents.
   * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function
   *
   * Pages are processed in batches; within a batch every worker pulls page
   * numbers from a shared queue. Errors raised during the OCR stage (including
   * the timeout) are logged and whatever batches were already completed are
   * returned. Errors raised while opening the PDF or creating the workers
   * are propagated to the caller after cleanup.
   *
   * @param {string} filePath - Path to the PDF on disk.
   * @param {Object} [options]
   * @param {number} [options.maxExecutionTime=300000] - Upper bound for the whole OCR stage, in milliseconds.
   * @param {number} [options.batchSize=10] - Number of pages per batch (values below 1 fall back to 10).
   * @param {number|null} [options.maxWorkers=null] - Worker count; defaults to min(CPU count, 4), never less than 1.
   * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
   */
  async ocrPDF(
    filePath,
    { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {}
  ) {
    if (
      !filePath ||
      !fs.existsSync(filePath) ||
      !fs.statSync(filePath).isFile()
    ) {
      this.log(`File ${filePath} does not exist. Skipping OCR.`);
      return [];
    }

    // Coerces an option to an integer >= 1; anything else becomes `fallback`.
    // Guards against batchSize <= 0 (which would loop forever) and against
    // a zero/fractional worker count.
    const toPositiveInt = (value, fallback) => {
      const n = Math.floor(Number(value));
      return Number.isFinite(n) && n >= 1 ? n : fallback;
    };

    const documentTitle = path.basename(filePath);
    this.log(`Starting OCR of ${documentTitle}`);
    const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
    let buffer = fs.readFileSync(filePath);
    const canvasFactory = new NodeCanvasFactory();
    await canvasFactory.init();
    // Exposed globally for the duration of the job; reset in `finally` below.
    global.Image = canvasFactory.Image;

    const documents = [];
    let workerPool = [];
    let timeoutHandle = null;
    // Flipped once the job is over (done, failed or timed out) so worker loops
    // that are still running in the background stop pulling pages.
    let cancelled = false;
    let totalPages = 0;
    let startTime = Date.now();

    try {
      const pdfDocument = await pdfjs.getDocument({
        data: new Uint8Array(buffer),
        canvasFactory,
      }).promise;
      buffer = null; // drop our reference to the raw file contents

      const meta = await pdfDocument.getMetadata().catch(() => null);
      const metadata = {
        source: filePath,
        pdf: {
          version: "v2.0.550",
          info: meta?.info,
          metadata: meta?.metadata,
          totalPages: pdfDocument.numPages,
        },
      };

      // Arrow function on purpose: `this.log` in the catch needs the lexical
      // `this` of the OCRLoader instance. Returns null when rendering fails.
      const getPageAsBuffer = async (pageNumber, scale = 1) => {
        try {
          const page = await pdfDocument.getPage(pageNumber);
          const viewport = page.getViewport(scale);
          const { canvas, context } = canvasFactory.create(
            viewport.width,
            viewport.height
          );
          await page.render({
            canvasFactory,
            canvasContext: context,
            viewport,
          }).promise;
          return canvas.toBuffer();
        } catch (e) {
          this.log(`Error getting page as buffer: ${e.message}`);
          return null;
        }
      };

      const { createWorker, OEM } = require("tesseract.js");
      const BATCH_SIZE = toPositiveInt(batchSize, 10);
      const MAX_EXECUTION_TIME = maxExecutionTime;
      const NUM_WORKERS = toPositiveInt(
        maxWorkers ?? Math.min(os.cpus().length, 4),
        1
      );
      totalPages = pdfDocument.numPages;

      // allSettled so that, if one worker fails to start, the ones that did
      // start are still recorded in `workerPool` and terminated in `finally`.
      const spawned = await Promise.allSettled(
        Array.from({ length: NUM_WORKERS }, () =>
          createWorker("eng", OEM.LSTM_ONLY, {
            cachePath: this.cacheDir,
          })
        )
      );
      workerPool = spawned
        .filter((outcome) => outcome.status === "fulfilled")
        .map((outcome) => outcome.value);
      const failedSpawn = spawned.find(
        (outcome) => outcome.status === "rejected"
      );
      if (failedSpawn) throw failedSpawn.reason;

      startTime = Date.now();
      try {
        this.log("Bootstrapping OCR completed successfully!", {
          MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME,
          BATCH_SIZE,
          MAX_CONCURRENT_WORKERS: NUM_WORKERS,
          TOTAL_PAGES: totalPages,
        });

        const timeoutPromise = new Promise((_, reject) => {
          timeoutHandle = setTimeout(() => {
            cancelled = true;
            reject(
              new Error(
                `OCR job took too long to complete (${
                  MAX_EXECUTION_TIME / 1000
                } seconds)`
              )
            );
          }, MAX_EXECUTION_TIME);
        });

        const processPages = async () => {
          for (
            let startPage = 1;
            startPage <= totalPages && !cancelled;
            startPage += BATCH_SIZE
          ) {
            const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages);
            const pageQueue = Array.from(
              { length: endPage - startPage + 1 },
              (_, i) => startPage + i
            );
            this.log(`Working on pages ${startPage} - ${endPage}`);

            const results = [];
            const workerPromises = workerPool.map(
              async (worker, workerIndex) => {
                const workerTag = `\x1b[34m[Worker ${workerIndex + 1}]\x1b[0m`;
                while (!cancelled && pageQueue.length > 0) {
                  const pageNum = pageQueue.shift();
                  this.log(`${workerTag} assigned pg${pageNum}`);

                  const imageBuffer = await getPageAsBuffer(pageNum, 5);
                  if (!imageBuffer) {
                    // Rendering failed (already logged); nothing to recognize.
                    this.log(
                      `${workerTag} skipped pg${pageNum} (could not render)`
                    );
                    continue;
                  }

                  const { data } = await worker.recognize(
                    imageBuffer,
                    {},
                    "text"
                  );
                  this.log(`✅ ${workerTag} completed pg${pageNum}`);
                  results.push({
                    pageContent: data.text,
                    metadata: {
                      ...metadata,
                      loc: { pageNumber: pageNum },
                    },
                  });
                }
              }
            );

            await Promise.all(workerPromises);
            // Workers finish out of order; restore page order within the batch.
            documents.push(
              ...results.sort(
                (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber
              )
            );
          }
          return documents;
        };

        await Promise.race([timeoutPromise, processPages()]);
      } catch (e) {
        this.log(`Error: ${e.message}`);
      }
    } finally {
      cancelled = true;
      // Without this the pending timer keeps the event loop alive for up to
      // maxExecutionTime after the job has already finished.
      clearTimeout(timeoutHandle);
      global.Image = undefined;
      await Promise.all(workerPool.map((worker) => worker.terminate()));
    }

    this.log(`Completed OCR of ${documentTitle}!`, {
      documentsParsed: documents.length,
      totalPages: totalPages,
      executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
    });
    return documents;
  }
}
|
||
|
|
||
|
// Expose the OCRLoader class as this module's CommonJS export.
module.exports = OCRLoader;
|