From 2f89bbae744c4986d226256cd4e7aaf6e2977d7d Mon Sep 17 00:00:00 2001 From: timothycarambat <rambat1010@gmail.com> Date: Thu, 13 Feb 2025 16:06:47 -0800 Subject: [PATCH 1/2] OCR PDFs as fallback in spawn thread --- collector/package.json | 3 +- .../convert/asPDF/PDFLoader/index.js | 120 ++++++++++++++---- .../convert/asPDF/PDFLoader/ocrWorker.js | 41 ++++++ .../processSingleFile/convert/asPDF/index.js | 13 +- collector/yarn.lock | 83 +++++++++++- server/storage/models/.gitignore | 3 +- 6 files changed, 231 insertions(+), 32 deletions(-) create mode 100644 collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js diff --git a/collector/package.json b/collector/package.json index 1509645a1..61fd686af 100644 --- a/collector/package.json +++ b/collector/package.json @@ -38,6 +38,7 @@ "openai": "4.38.5", "pdf-parse": "^1.1.1", "puppeteer": "~21.5.2", + "scribe.js-ocr": "^0.7.1", "sharp": "^0.33.5", "slugify": "^1.6.6", "url-pattern": "^1.0.3", @@ -50,4 +51,4 @@ "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} +} \ No newline at end of file diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js index 26bcf2b1c..1662022ea 100644 --- a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js +++ b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js @@ -4,9 +4,17 @@ class PDFLoader { constructor(filePath, { splitPages = true } = {}) { this.filePath = filePath; this.splitPages = splitPages; + this.metadata = {}; } + /** + * Loads a PDF file and returns an array of documents. + * This function is reserved to parsing for DIGITAL documents - scanned documents are not supported in this function + * For scanned documents, use the `asOCR` function instead. + * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata. + */ async load() { + const documents = []; const buffer = await fs.readFile(this.filePath); const { getDocument, version } = await this.getPdfJS(); @@ -18,15 +26,21 @@ class PDFLoader { }).promise; const meta = await pdf.getMetadata().catch(() => null); - const documents = []; + this.metadata = { + source: this.filePath, + pdf: { + version, + info: meta?.info, + metadata: meta?.metadata, + totalPages: pdf.numPages, + }, + }; for (let i = 1; i <= pdf.numPages; i += 1) { const page = await pdf.getPage(i); const content = await page.getTextContent(); - if (content.items.length === 0) { - continue; - } + if (content.items.length === 0) continue; let lastY; const textItems = []; @@ -45,42 +59,94 @@ class PDFLoader { documents.push({ pageContent: text.trim(), metadata: { - source: this.filePath, - pdf: { - version, - info: meta?.info, - metadata: meta?.metadata, - totalPages: pdf.numPages, - }, + ...this.metadata, loc: { pageNumber: i }, }, }); } - if (this.splitPages) { - return documents; - } - - if (documents.length === 0) { - return []; - } + if (this.splitPages) return documents; + if (documents.length === 0) return []; return [ { pageContent: documents.map((doc) => doc.pageContent).join("\n\n"), - metadata: { - source: this.filePath, - pdf: { - version, - info: meta?.info, - metadata: meta?.metadata, - totalPages: pdf.numPages, - }, - }, + metadata: this.metadata, }, ]; } + /** + * Loads a PDF file and returns an array of documents. + * This function is reserved to parsing for SCANNED documents - digital documents are not supported in this function + * For digital documents, use the `load` function instead. + * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata. + */ + async asOCR() { + const { fork } = require("child_process"); + const path = require("path"); + const timeout = 300_000; + const ocrDataDirectory = + process.env.NODE_ENV === "development" + ? path.resolve(__dirname, `../../../../../server/storage/models/ocr`) + : path.resolve( + process.env.STORAGE_DIR ?? + path.resolve(__dirname, `../../../../../server/storage`), + `models/ocr` + ); + + return new Promise((resolve, _) => { + const worker = fork(path.join(__dirname, "ocrWorker.js")); + let isResolved = false; + + const cleanupAndResolve = (result = []) => { + if (!isResolved) { + isResolved = true; + worker.kill("SIGTERM"); + resolve(result); + } + }; + + worker.on("message", (result) => { + if (result.error) { + console.log( + `[PDFLoader] Error parsing PDF with OCR engine: ${result.error}` + ); + cleanupAndResolve([]); + } else { + cleanupAndResolve([ + { + pageContent: result.textContent, + metadata: { + ...this.metadata, + source: this.filePath, + }, + }, + ]); + } + }); + + setTimeout(() => { + console.log( + `[PDFLoader] OCR Worker timeout (${timeout / 1000} seconds)` + ); + cleanupAndResolve([]); + }, timeout); + + worker.on("error", (error) => { + console.error(`[PDFLoader] OCR Worker error: ${error}`); + cleanupAndResolve([]); + }); + + worker.send({ + filePath: this.filePath, + mode: "speed", // TODO: Make this configurable + langs: ["eng"], // TODO: Make this configurable + runDir: ocrDataDirectory, + }); + }); + } + async getPdfJS() { try { const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js"); diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js b/collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js new file mode 100644 index 000000000..c0a619550 --- /dev/null +++ b/collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js @@ -0,0 +1,41 @@ +const fs = require("fs"); + +const processFile = async ( + filePath, + mode = "speed", + langs = ["eng"], + runDir = null +) => { + const { default: scribe } = await import("scribe.js-ocr"); + // Change directory to the run directory if it is provided + // This will allow the traineddata files to be persisted in storage + // and not be deleted after the worker is done or the app is killed. + // If not defined this will wind up pulling the traineddata files into the current + // directory of the worker (collector/index.js) and will cause subsequent runs to be slower. + if (runDir) { + if (!fs.existsSync(runDir)) fs.mkdirSync(runDir, { recursive: true }); + process.chdir(runDir); + } + + try { + await scribe.importFiles([filePath]); + const textContent = await scribe + .recognize({ mode, langs }) + .then(() => scribe.exportData("text")); + + return { textContent }; + } catch (e) { + return { error: e.message }; + } finally { + scribe.terminate(); + } +}; + +process.on( + "message", + async ({ filePath, mode = "speed", langs = ["eng"], runDir = null }) => { + const result = await processFile(filePath, mode, langs, runDir); + process.send(result); + process.exit(0); + } +); diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index e3e42d3bd..425bb378f 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -15,7 +15,14 @@ async function asPdf({ fullFilePath = "", filename = "" }) { console.log(`-- Working ${filename} --`); const pageContent = []; - const docs = await pdfLoader.load(); + let docs = await pdfLoader.load(); + + if (docs.length === 0) { + console.log( + `[PDFLoader] No text content found for ${filename}. Attempting OCR parse.` + ); + docs = await pdfLoader.asOCR(); + } for (const doc of docs) { console.log( @@ -28,7 +35,9 @@ async function asPdf({ fullFilePath = "", filename = "" }) { } if (!pageContent.length) { - console.error(`Resulting text content was empty for ${filename}.`); + console.error( + `[PDFLoader] Resulting text content was empty for ${filename}.` + ); trashFile(fullFilePath); return { success: false, diff --git a/collector/yarn.lock b/collector/yarn.lock index 402fee3ea..a1e0daee7 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -361,6 +361,26 @@ unbzip2-stream "1.4.3" yargs "17.7.2" +"@scribe.js/tesseract.js-core@^6.0.3": + version "6.0.3" + resolved "https://registry.yarnpkg.com/@scribe.js/tesseract.js-core/-/tesseract.js-core-6.0.3.tgz#cc48ccead95045e1629e9bb4c04f638c15664f89" + integrity sha512-q2bXN0yQCEs5IA9138vF+xGSfNhOaMVdPKkuZNSiz3A+SfuRZuAU3iKzIXJMdSJnWvHSBOUPLDdqguClCNX9Qw== + +"@scribe.js/tesseract.js@^6.0.2": + version "6.0.2" + resolved "https://registry.yarnpkg.com/@scribe.js/tesseract.js/-/tesseract.js-6.0.2.tgz#24e0af34be38f92ffb68d1462fc203b4c209aaf7" + integrity sha512-FaLrLXtgpnQDI2fZSwg6plAgMm6s24v3rxTiG9zObKI22wPAH98CUVNR7BqgEVu3Yq7jLVWyoq14C4iJbWQcZA== + dependencies: + "@scribe.js/tesseract.js-core" "^6.0.3" + bmp-js "^0.1.0" + idb-keyval "^6.2.0" + is-url "^1.2.4" + node-fetch "^2.6.9" + opencollective-postinstall "^2.0.3" + regenerator-runtime "^0.13.3" + wasm-feature-detect "^1.2.11" + zlibjs "^0.3.1" + "@selderee/plugin-htmlparser2@^0.11.0": version "0.11.0" resolved "https://registry.yarnpkg.com/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz#d5b5e29a7ba6d3958a1972c7be16f4b2c188c517" @@ -436,6 +456,11 @@ dependencies: "@types/node" "*" +"@webgpu/types@0.1.21": + version "0.1.21" + resolved "https://registry.yarnpkg.com/@webgpu/types/-/types-0.1.21.tgz#b181202daec30d66ccd67264de23814cfd176d3a" + integrity sha512-pUrWq3V5PiSGFLeLxoGqReTZmiiXwY3jRkIG5sLLKjyqNxrwm/04b4nw7LSmGWJcKk59XOM/YRTUwOzo4MMlow== + "@xenova/transformers@^2.11.0": version "2.17.1" resolved "https://registry.yarnpkg.com/@xenova/transformers/-/transformers-2.17.1.tgz#712f7a72c76c8aa2075749382f83dc7dd4e5a9a5" @@ -693,6 +718,11 @@ bluebird@~3.4.0: resolved "https://registry.yarnpkg.com/bluebird/-/bluebird-3.4.7.tgz#f72d760be09b7f76d08ed8fae98b289a8d05fab3" integrity sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA== +bmp-js@^0.1.0: + version "0.1.0" + resolved "https://registry.yarnpkg.com/bmp-js/-/bmp-js-0.1.0.tgz#e05a63f796a6c1ff25f4771ec7adadc148c07233" + integrity sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw== + body-parser@1.20.2, body-parser@^1.20.2: version "1.20.2" resolved "https://registry.yarnpkg.com/body-parser/-/body-parser-1.20.2.tgz#6feb0e21c4724d06de7ff38da36dad4f57a747fd" @@ -788,6 +818,13 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== +canvaskit-wasm@^0.39.1: + version "0.39.1" + resolved "https://registry.yarnpkg.com/canvaskit-wasm/-/canvaskit-wasm-0.39.1.tgz#c3c8f3962cbabbedf246f7bcf90e859013c7eae9" + integrity sha512-Gy3lCmhUdKq+8bvDrs9t8+qf7RvcjuQn+we7vTVVyqgOVO1UVfHpsnBxkTZw+R4ApEJ3D5fKySl9TU11hmjl/A== + dependencies: + "@webgpu/types" "0.1.21" + chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -917,6 +954,11 @@ commander@^10.0.1: resolved "https://registry.yarnpkg.com/commander/-/commander-10.0.1.tgz#881ee46b4f77d1c1dccc5823433aa39b022cbe06" integrity sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug== +commander@^11.1.0: + version "11.1.0" + resolved "https://registry.yarnpkg.com/commander/-/commander-11.1.0.tgz#62fdce76006a68e5c1ab3314dc92e800eb83d906" + integrity sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ== + commander@^2.8.1: version "2.20.3" resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33" @@ -1780,6 +1822,11 @@ iconv-lite@0.6.3, iconv-lite@^0.6.3: dependencies: safer-buffer ">= 2.1.2 < 3.0.0" +idb-keyval@^6.2.0: + version "6.2.1" + resolved "https://registry.yarnpkg.com/idb-keyval/-/idb-keyval-6.2.1.tgz#94516d625346d16f56f3b33855da11bfded2db33" + integrity sha512-8Sb3veuYCyrZL+VBt9LJfZjLUPWVvqn8tG28VqYNFCo43KHcKuq+b4EiXGeuaLAQWL2YmyDgMp2aSpH9JHsEQg== + ieee754@^1.1.13, ieee754@^1.2.1: version "1.2.1" resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.2.1.tgz#8eb7a10a63fff25d15a57b001586d177d1b0d352" @@ -1903,6 +1950,11 @@ is-stream@^2.0.0: resolved "https://registry.yarnpkg.com/is-stream/-/is-stream-2.0.1.tgz#fac1e3d53b97ad5a9d0ae9cef2389f5810a5c077" integrity sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg== +is-url@^1.2.4: + version "1.2.4" + resolved "https://registry.yarnpkg.com/is-url/-/is-url-1.2.4.tgz#04a4df46d28c4cff3d73d01ff06abeb318a1aa52" + integrity sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww== + isarray@~1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" @@ -2431,7 +2483,7 @@ node-ensure@^0.0.0: resolved "https://registry.yarnpkg.com/node-ensure/-/node-ensure-0.0.0.tgz#ecae764150de99861ec5c810fd5d096b183932a7" integrity sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw== -node-fetch@^2.6.12, node-fetch@^2.6.7: +node-fetch@^2.6.12, node-fetch@^2.6.7, node-fetch@^2.6.9: version "2.7.0" resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.7.0.tgz#d0f0fa6e3e2dc1d27efcd8ad99d550bda94d187d" integrity sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A== @@ -2634,6 +2686,11 @@ openapi-types@^12.1.3: resolved "https://registry.yarnpkg.com/openapi-types/-/openapi-types-12.1.3.tgz#471995eb26c4b97b7bd356aacf7b91b73e777dd3" integrity sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw== +opencollective-postinstall@^2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz#7a0fff978f6dbfa4d006238fbac98ed4198c3259" + integrity sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q== + option@~0.2.1: version "0.2.4" resolved "https://registry.yarnpkg.com/option/-/option-0.2.4.tgz#fd475cdf98dcabb3cb397a3ba5284feb45edbfe4" @@ -2990,6 +3047,11 @@ readdirp@~3.6.0: dependencies: picomatch "^2.2.1" +regenerator-runtime@^0.13.3: + version "0.13.11" + resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz#f6dca3e7ceec20590d07ada785636a90cdca17f9" + integrity sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg== + require-directory@^2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42" @@ -3044,6 +3106,15 @@ sax@>=0.6.0: resolved "https://registry.yarnpkg.com/sax/-/sax-1.3.0.tgz#a5dbe77db3be05c9d1ee7785dbd3ea9de51593d0" integrity sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA== +scribe.js-ocr@^0.7.1: + version "0.7.1" + resolved "https://registry.yarnpkg.com/scribe.js-ocr/-/scribe.js-ocr-0.7.1.tgz#bb9f3b8acfe7b591457974afe176201e7c0691b6" + integrity sha512-t9xALGmOfnNG91KQHUQf7wmsTAcuOZKcwzzuVmD4Kf+Gi0SyM+s3ve8jxxHCmvnUvFnPNYs1XKZUOmqhhfOyUw== + dependencies: + "@scribe.js/tesseract.js" "^6.0.2" + canvaskit-wasm "^0.39.1" + commander "^11.1.0" + seek-bzip@^1.0.5: version "1.0.6" resolved "https://registry.yarnpkg.com/seek-bzip/-/seek-bzip-1.0.6.tgz#35c4171f55a680916b52a07859ecf3b5857f21c4" @@ -3602,6 +3673,11 @@ vary@^1, vary@~1.1.2: resolved "https://registry.yarnpkg.com/vary/-/vary-1.1.2.tgz#2299f02c6ded30d4a5961b0b9f74524a18f634fc" integrity sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg== +wasm-feature-detect@^1.2.11: + version "1.8.0" + resolved "https://registry.yarnpkg.com/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz#4e9f55b0a64d801f372fbb0324ed11ad3abd0c78" + integrity sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ== + wavefile@^11.0.0: version "11.0.0" resolved "https://registry.yarnpkg.com/wavefile/-/wavefile-11.0.0.tgz#9302165874327ff63a704d00b154c753eaa1b8e7" @@ -3766,6 +3842,11 @@ youtubei.js@^9.1.0: tslib "^2.5.0" undici "^5.19.1" +zlibjs@^0.3.1: + version "0.3.1" + resolved "https://registry.yarnpkg.com/zlibjs/-/zlibjs-0.3.1.tgz#50197edb28a1c42ca659cc8b4e6a9ddd6d444554" + integrity sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w== + zod-to-json-schema@^3.22.3, zod-to-json-schema@^3.22.5: version "3.23.0" resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz#4fc60e88d3c709eedbfaae3f92f8a7bf786469f2" diff --git a/server/storage/models/.gitignore b/server/storage/models/.gitignore index 0481af71a..2c4e3a9ca 100644 --- a/server/storage/models/.gitignore +++ b/server/storage/models/.gitignore @@ -6,4 +6,5 @@ apipie novita mixedbread-ai* gemini -togetherAi \ No newline at end of file +togetherAi +ocr \ No newline at end of file From c463710b0f2f7a55a715d37450662b794e325174 Mon Sep 17 00:00:00 2001 From: timothycarambat <rambat1010@gmail.com> Date: Thu, 13 Feb 2025 21:59:12 -0800 Subject: [PATCH 2/2] wip --- collector/package.json | 3 +- .../convert/asPDF/PDFLoader/CanvasFactory.js | 39 ++++++ .../convert/asPDF/PDFLoader/index.js | 106 ++++++++--------- .../convert/asPDF/PDFLoader/ocrWorker.js | 41 ------- collector/yarn.lock | 112 +++++++++++------- 5 files changed, 157 insertions(+), 144 deletions(-) create mode 100644 collector/processSingleFile/convert/asPDF/PDFLoader/CanvasFactory.js delete mode 100644 collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js diff --git a/collector/package.json b/collector/package.json index 61fd686af..67d04fc8b 100644 --- a/collector/package.json +++ b/collector/package.json @@ -19,6 +19,7 @@ "@xenova/transformers": "^2.11.0", "bcrypt": "^5.1.0", "body-parser": "^1.20.2", + "canvas": "2.11.2", "cors": "^2.8.5", "dotenv": "^16.0.3", "epub2": "^3.0.2", @@ -38,9 +39,9 @@ "openai": "4.38.5", "pdf-parse": "^1.1.1", "puppeteer": "~21.5.2", - "scribe.js-ocr": "^0.7.1", "sharp": "^0.33.5", "slugify": "^1.6.6", + "tesseract.js": "^6.0.0", "url-pattern": "^1.0.3", "uuid": "^9.0.0", "wavefile": "^11.0.0", diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/CanvasFactory.js b/collector/processSingleFile/convert/asPDF/PDFLoader/CanvasFactory.js new file mode 100644 index 000000000..35ee651da --- /dev/null +++ b/collector/processSingleFile/convert/asPDF/PDFLoader/CanvasFactory.js @@ -0,0 +1,39 @@ + +class NodeCanvasFactory { + constructor() { + this.Canvas = null; + } + + async init() { + this.Canvas = await import("canvas"); + this.Image = this.Canvas.Image; + } + + create( + width, + height, + transparent + ) { + const canvas = this.Canvas.createCanvas(width, height); + const context = canvas.getContext("2d", { alpha: transparent }); + if (transparent) context.clearRect(0, 0, width, height); + return { + canvas, + context, + }; + } + + reset(canvasAndContext, width, height) { + canvasAndContext.canvas.width = width; + canvasAndContext.canvas.height = height; + } + + destroy(canvasAndContext) { + canvasAndContext.canvas.width = 0; + canvasAndContext.canvas.height = 0; + canvasAndContext.canvas = null; + canvasAndContext.context = null; + } +} + +module.exports = NodeCanvasFactory; \ No newline at end of file diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js index 1662022ea..53b1f2f02 100644 --- a/collector/processSingleFile/convert/asPDF/PDFLoader/index.js +++ b/collector/processSingleFile/convert/asPDF/PDFLoader/index.js @@ -1,4 +1,6 @@ const fs = require("fs").promises; +const path = require("path"); +const NodeCanvasFactory = require("./CanvasFactory"); class PDFLoader { constructor(filePath, { splitPages = true } = {}) { @@ -83,74 +85,64 @@ class PDFLoader { * @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata. */ async asOCR() { - const { fork } = require("child_process"); - const path = require("path"); - const timeout = 300_000; - const ocrDataDirectory = - process.env.NODE_ENV === "development" - ? path.resolve(__dirname, `../../../../../server/storage/models/ocr`) - : path.resolve( - process.env.STORAGE_DIR ?? - path.resolve(__dirname, `../../../../../server/storage`), - `models/ocr` - ); + const documents = []; + const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js"); + const buffer = await fs.readFile(this.filePath); + const canvasFactory = new NodeCanvasFactory(); + await canvasFactory.init(); + global.Image = canvasFactory.Image; - return new Promise((resolve, _) => { - const worker = fork(path.join(__dirname, "ocrWorker.js")); - let isResolved = false; + const pdfDocument = await pdfjs.getDocument({ + data: new Uint8Array(buffer), + canvasFactory, + }).promise; - const cleanupAndResolve = (result = []) => { - if (!isResolved) { - isResolved = true; - worker.kill("SIGTERM"); - resolve(result); - } - }; + async function getPageAsBuffer(pageNumber, scale = 1) { + const page = await pdfDocument.getPage(pageNumber); + const viewport = page.getViewport(scale); + const { canvas, context } = canvasFactory.create( + viewport.width, + viewport.height, + false + ); - worker.on("message", (result) => { - if (result.error) { - console.log( - `[PDFLoader] Error parsing PDF with OCR engine: ${result.error}` - ); - cleanupAndResolve([]); - } else { - cleanupAndResolve([ - { - pageContent: result.textContent, - metadata: { - ...this.metadata, - source: this.filePath, - }, - }, - ]); - } - }); + await page.render({ + canvasFactory, + canvasContext: context, + viewport, + }).promise; - setTimeout(() => { - console.log( - `[PDFLoader] OCR Worker timeout (${timeout / 1000} seconds)` - ); - cleanupAndResolve([]); - }, timeout); + return canvas.toBuffer(); + } - worker.on("error", (error) => { - console.error(`[PDFLoader] OCR Worker error: ${error}`); - cleanupAndResolve([]); - }); - - worker.send({ - filePath: this.filePath, - mode: "speed", // TODO: Make this configurable - langs: ["eng"], // TODO: Make this configurable - runDir: ocrDataDirectory, - }); + const { createWorker, setLogging, OEM } = require("tesseract.js"); + setLogging(false); + const worker = await createWorker("eng", OEM.LSTM_ONLY, { + cachePath: path.resolve(__dirname, `../../../../storage/tmp`), }); + + for (let i = 1; i <= pdfDocument.numPages; i += 1) { + const image = await getPageAsBuffer(i, 5); + const { data } = await worker.recognize(image, {}, "text"); + documents.push({ + pageContent: data.text, + metadata: { + ...this.metadata, + loc: { pageNumber: i }, + }, + }); + } + + return documents; } async getPdfJS() { try { const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js"); - return { getDocument: pdfjs.getDocument, version: pdfjs.version }; + return { + getDocument: pdfjs.getDocument, + version: pdfjs.version, + }; } catch (e) { console.error(e); throw new Error( diff --git a/collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js b/collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js deleted file mode 100644 index c0a619550..000000000 --- a/collector/processSingleFile/convert/asPDF/PDFLoader/ocrWorker.js +++ /dev/null @@ -1,41 +0,0 @@ -const fs = require("fs"); - -const processFile = async ( - filePath, - mode = "speed", - langs = ["eng"], - runDir = null -) => { - const { default: scribe } = await import("scribe.js-ocr"); - // Change directory to the run directory if it is provided - // This will allow the traineddata files to be persisted in storage - // and not be deleted after the worker is done or the app is killed. - // If not defined this will wind up pulling the traineddata files into the current - // directory of the worker (collector/index.js) and will cause subsequent runs to be slower. - if (runDir) { - if (!fs.existsSync(runDir)) fs.mkdirSync(runDir, { recursive: true }); - process.chdir(runDir); - } - - try { - await scribe.importFiles([filePath]); - const textContent = await scribe - .recognize({ mode, langs }) - .then(() => scribe.exportData("text")); - - return { textContent }; - } catch (e) { - return { error: e.message }; - } finally { - scribe.terminate(); - } -}; - -process.on( - "message", - async ({ filePath, mode = "speed", langs = ["eng"], runDir = null }) => { - const result = await processFile(filePath, mode, langs, runDir); - process.send(result); - process.exit(0); - } -); diff --git a/collector/yarn.lock b/collector/yarn.lock index a1e0daee7..79758de5a 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -280,7 +280,7 @@ "@langchain/core" "~0.1" js-tiktoken "^1.0.11" -"@mapbox/node-pre-gyp@^1.0.11": +"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11": version "1.0.11" resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa" integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ== @@ -361,26 +361,6 @@ unbzip2-stream "1.4.3" yargs "17.7.2" -"@scribe.js/tesseract.js-core@^6.0.3": - version "6.0.3" - resolved "https://registry.yarnpkg.com/@scribe.js/tesseract.js-core/-/tesseract.js-core-6.0.3.tgz#cc48ccead95045e1629e9bb4c04f638c15664f89" - integrity sha512-q2bXN0yQCEs5IA9138vF+xGSfNhOaMVdPKkuZNSiz3A+SfuRZuAU3iKzIXJMdSJnWvHSBOUPLDdqguClCNX9Qw== - -"@scribe.js/tesseract.js@^6.0.2": - version "6.0.2" - resolved "https://registry.yarnpkg.com/@scribe.js/tesseract.js/-/tesseract.js-6.0.2.tgz#24e0af34be38f92ffb68d1462fc203b4c209aaf7" - integrity sha512-FaLrLXtgpnQDI2fZSwg6plAgMm6s24v3rxTiG9zObKI22wPAH98CUVNR7BqgEVu3Yq7jLVWyoq14C4iJbWQcZA== - dependencies: - "@scribe.js/tesseract.js-core" "^6.0.3" - bmp-js "^0.1.0" - idb-keyval "^6.2.0" - is-url "^1.2.4" - node-fetch "^2.6.9" - opencollective-postinstall "^2.0.3" - regenerator-runtime "^0.13.3" - wasm-feature-detect "^1.2.11" - zlibjs "^0.3.1" - "@selderee/plugin-htmlparser2@^0.11.0": version "0.11.0" resolved "https://registry.yarnpkg.com/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz#d5b5e29a7ba6d3958a1972c7be16f4b2c188c517" @@ -456,11 +436,6 @@ dependencies: "@types/node" "*" -"@webgpu/types@0.1.21": - version "0.1.21" - resolved "https://registry.yarnpkg.com/@webgpu/types/-/types-0.1.21.tgz#b181202daec30d66ccd67264de23814cfd176d3a" - integrity sha512-pUrWq3V5PiSGFLeLxoGqReTZmiiXwY3jRkIG5sLLKjyqNxrwm/04b4nw7LSmGWJcKk59XOM/YRTUwOzo4MMlow== - "@xenova/transformers@^2.11.0": version "2.17.1" resolved "https://registry.yarnpkg.com/@xenova/transformers/-/transformers-2.17.1.tgz#712f7a72c76c8aa2075749382f83dc7dd4e5a9a5" @@ -818,12 +793,14 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== -canvaskit-wasm@^0.39.1: - version "0.39.1" - resolved "https://registry.yarnpkg.com/canvaskit-wasm/-/canvaskit-wasm-0.39.1.tgz#c3c8f3962cbabbedf246f7bcf90e859013c7eae9" - integrity sha512-Gy3lCmhUdKq+8bvDrs9t8+qf7RvcjuQn+we7vTVVyqgOVO1UVfHpsnBxkTZw+R4ApEJ3D5fKySl9TU11hmjl/A== +canvas@2.11.2, canvas@^2.11.2: + version "2.11.2" + resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860" + integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw== dependencies: - "@webgpu/types" "0.1.21" + "@mapbox/node-pre-gyp" "^1.0.0" + nan "^2.17.0" + simple-get "^3.0.3" chalk@^2.4.2: version "2.4.2" @@ -954,11 +931,6 @@ commander@^10.0.1: resolved "https://registry.yarnpkg.com/commander/-/commander-10.0.1.tgz#881ee46b4f77d1c1dccc5823433aa39b022cbe06" integrity sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug== -commander@^11.1.0: - version "11.1.0" - resolved "https://registry.yarnpkg.com/commander/-/commander-11.1.0.tgz#62fdce76006a68e5c1ab3314dc92e800eb83d906" - integrity sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ== - commander@^2.8.1: version "2.20.3" resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33" @@ -1085,6 +1057,13 @@ decamelize@1.2.0: resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290" integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA== +decompress-response@^4.2.0: + version "4.2.1" + resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986" + integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw== + dependencies: + mimic-response "^2.0.0" + decompress-response@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc" @@ -2328,6 +2307,11 @@ mime@^3.0.0: resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7" integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A== +mimic-response@^2.0.0: + version "2.1.0" + resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43" + integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA== + mimic-response@^3.1.0: version "3.1.0" resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" @@ -2441,6 +2425,11 @@ mustache@^4.2.0: resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64" integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ== +nan@^2.17.0: + version "2.22.0" + resolved "https://registry.yarnpkg.com/nan/-/nan-2.22.0.tgz#31bc433fc33213c97bad36404bb68063de604de3" + integrity sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw== + napi-build-utils@^1.0.1: version "1.0.2" resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806" @@ -2796,6 +2785,11 @@ path-type@^4.0.0: resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== +path2d@^0.2.0: + version "0.2.2" + resolved "https://registry.yarnpkg.com/path2d/-/path2d-0.2.2.tgz#cc85d61ed7827e7863a2ee36713d4b5315a3d85d" + integrity sha512-+vnG6S4dYcYxZd+CZxzXCNKdELYZSKfohrk98yajCo1PtRoDgCTrrwOvK1GT0UoAdVszagDVllQc0U1vaX4NUQ== + pdf-parse@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7" @@ -2804,6 +2798,14 @@ pdf-parse@^1.1.1: debug "^3.1.0" node-ensure "^0.0.0" +pdfjs-dist@4.2.67: + version "4.2.67" + resolved "https://registry.yarnpkg.com/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz#dd2a65a4b00d95cd4bc2c1f6a27c5e9eb31d512a" + integrity sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA== + optionalDependencies: + canvas "^2.11.2" + path2d "^0.2.0" + peberminta@^0.9.0: version "0.9.0" resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352" @@ -3106,15 +3108,6 @@ sax@>=0.6.0: resolved "https://registry.yarnpkg.com/sax/-/sax-1.3.0.tgz#a5dbe77db3be05c9d1ee7785dbd3ea9de51593d0" integrity sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA== -scribe.js-ocr@^0.7.1: - version "0.7.1" - resolved "https://registry.yarnpkg.com/scribe.js-ocr/-/scribe.js-ocr-0.7.1.tgz#bb9f3b8acfe7b591457974afe176201e7c0691b6" - integrity sha512-t9xALGmOfnNG91KQHUQf7wmsTAcuOZKcwzzuVmD4Kf+Gi0SyM+s3ve8jxxHCmvnUvFnPNYs1XKZUOmqhhfOyUw== - dependencies: - "@scribe.js/tesseract.js" "^6.0.2" - canvaskit-wasm "^0.39.1" - commander "^11.1.0" - seek-bzip@^1.0.5: version "1.0.6" resolved "https://registry.yarnpkg.com/seek-bzip/-/seek-bzip-1.0.6.tgz#35c4171f55a680916b52a07859ecf3b5857f21c4" @@ -3275,6 +3268,15 @@ simple-concat@^1.0.0: resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f" integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q== +simple-get@^3.0.3: + version "3.1.1" + resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55" + integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA== + dependencies: + decompress-response "^4.2.0" + once "^1.3.1" + simple-concat "^1.0.0" + simple-get@^4.0.0, simple-get@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543" @@ -3492,6 +3494,26 @@ tar@^6.1.11: mkdirp "^1.0.3" yallist "^4.0.0" +tesseract.js-core@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/tesseract.js-core/-/tesseract.js-core-6.0.0.tgz#6f25da94f70f8e8f02aff47a43be61d49e6f67c3" + integrity sha512-1Qncm/9oKM7xgrQXZXNB+NRh19qiXGhxlrR8EwFbK5SaUbPZnS5OMtP/ghtqfd23hsr1ZvZbZjeuAGcMxd/ooA== + +tesseract.js@^6.0.0: + version "6.0.0" + resolved "https://registry.yarnpkg.com/tesseract.js/-/tesseract.js-6.0.0.tgz#62ff7fffc8833b5810430a4067785e49d5ca8e7f" + integrity sha512-tqYCod1HwJzkeZw1l6XWx+ly2hhisGcBtak9MArhYwDAxL0NgeVhLJcUjqPxZMQtpgtVUzWcpZPryi+hnaQGVw== + dependencies: + bmp-js "^0.1.0" + idb-keyval "^6.2.0" + is-url "^1.2.4" + node-fetch "^2.6.9" + opencollective-postinstall "^2.0.3" + regenerator-runtime "^0.13.3" + tesseract.js-core "^6.0.0" + wasm-feature-detect "^1.2.11" + zlibjs "^0.3.1" + text-hex@1.0.x: version "1.0.0" resolved "https://registry.yarnpkg.com/text-hex/-/text-hex-1.0.0.tgz#69dc9c1b17446ee79a92bf5b884bb4b9127506f5"