const fs = require("fs"); const os = require("os"); const path = require("path"); const NodeCanvasFactory = require("./CanvasFactory"); class OCRLoader { constructor() { this.cacheDir = path.resolve( process.env.STORAGE_DIR ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`) : path.resolve(__dirname, `../../../server/storage/models/tesseract`) ); } log(text, ...args) { console.log(`\x1b[36m[OCRLoader]\x1b[0m ${text}`, ...args); } /** * Loads a PDF file and returns an array of documents. * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata. */ async ocrPDF( filePath, { maxExecutionTime = 300_000, batchSize = 10, maxWorkers = null } = {} ) { if ( !filePath || !fs.existsSync(filePath) || !fs.statSync(filePath).isFile() ) { this.log(`File ${filePath} does not exist. Skipping OCR.`); return []; } const documentTitle = path.basename(filePath); this.log(`Starting OCR of ${documentTitle}`); const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js"); let buffer = fs.readFileSync(filePath); const canvasFactory = new NodeCanvasFactory(); await canvasFactory.init(); global.Image = canvasFactory.Image; const pdfDocument = await pdfjs.getDocument({ data: new Uint8Array(buffer), canvasFactory, }).promise; buffer = null; const documents = []; const meta = await pdfDocument.getMetadata().catch(() => null); const metadata = { source: filePath, pdf: { version: "v2.0.550", info: meta?.info, metadata: meta?.metadata, totalPages: pdfDocument.numPages, }, }; async function getPageAsBuffer(pageNumber, scale = 1) { let canvas = null; let context = null; try { const page = await pdfDocument.getPage(pageNumber); const viewport = page.getViewport(scale); ({ canvas, context } = canvasFactory.create( viewport.width, viewport.height )); await page.render({ canvasFactory, canvasContext: context, viewport, }).promise; return canvas.toBuffer(); } catch (e) { this.log(`Error getting page as buffer: ${e.message}`); return null; } finally { canvas = null; context = null; } } const { createWorker, OEM } = require("tesseract.js"); const BATCH_SIZE = batchSize; const MAX_EXECUTION_TIME = maxExecutionTime; const NUM_WORKERS = maxWorkers ?? Math.min(os.cpus().length, 4); const totalPages = pdfDocument.numPages; const workerPool = await Promise.all( Array(NUM_WORKERS) .fill(0) .map(() => createWorker("eng", OEM.LSTM_ONLY, { cachePath: this.cacheDir, }) ) ); const startTime = Date.now(); try { this.log("Bootstrapping OCR completed successfully!", { MAX_EXECUTION_TIME_MS: MAX_EXECUTION_TIME, BATCH_SIZE, MAX_CONCURRENT_WORKERS: NUM_WORKERS, TOTAL_PAGES: totalPages, }); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => { reject( new Error( `OCR job took too long to complete (${ MAX_EXECUTION_TIME / 1000 } seconds)` ) ); }, MAX_EXECUTION_TIME); }); const processPages = async () => { for ( let startPage = 1; startPage <= totalPages; startPage += BATCH_SIZE ) { const endPage = Math.min(startPage + BATCH_SIZE - 1, totalPages); const pageNumbers = Array.from( { length: endPage - startPage + 1 }, (_, i) => startPage + i ); this.log(`Working on pages ${startPage} - ${endPage}`); const pageQueue = [...pageNumbers]; const results = []; const workerPromises = workerPool.map(async (worker, workerIndex) => { while (pageQueue.length > 0) { const pageNum = pageQueue.shift(); this.log( `\x1b[34m[Worker ${ workerIndex + 1 }]\x1b[0m assigned pg${pageNum}` ); const imageBuffer = await getPageAsBuffer(pageNum, 5); const { data } = await worker.recognize(imageBuffer, {}, "text"); this.log( `✅ \x1b[34m[Worker ${ workerIndex + 1 }]\x1b[0m completed pg${pageNum}` ); results.push({ pageContent: data.text, metadata: { ...metadata, loc: { pageNumber: pageNum }, }, }); } }); await Promise.all(workerPromises); documents.push( ...results.sort( (a, b) => a.metadata.loc.pageNumber - b.metadata.loc.pageNumber ) ); } return documents; }; await Promise.race([timeoutPromise, processPages()]); } catch (e) { this.log(`Error: ${e.message}`); } finally { global.Image = undefined; await Promise.all(workerPool.map((worker) => worker.terminate())); } this.log(`Completed OCR of ${documentTitle}!`, { documentsParsed: documents.length, totalPages: totalPages, executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`, }); return documents; } } module.exports = OCRLoader;