This commit is contained in:
timothycarambat 2025-02-13 21:59:12 -08:00
parent 2f89bbae74
commit c463710b0f
5 changed files with 157 additions and 144 deletions
collector
package.json
processSingleFile/convert/asPDF/PDFLoader
yarn.lock

View file

@ -19,6 +19,7 @@
"@xenova/transformers": "^2.11.0",
"bcrypt": "^5.1.0",
"body-parser": "^1.20.2",
"canvas": "2.11.2",
"cors": "^2.8.5",
"dotenv": "^16.0.3",
"epub2": "^3.0.2",
@ -38,9 +39,9 @@
"openai": "4.38.5",
"pdf-parse": "^1.1.1",
"puppeteer": "~21.5.2",
"scribe.js-ocr": "^0.7.1",
"sharp": "^0.33.5",
"slugify": "^1.6.6",
"tesseract.js": "^6.0.0",
"url-pattern": "^1.0.3",
"uuid": "^9.0.0",
"wavefile": "^11.0.0",

View file

@ -0,0 +1,39 @@
/**
 * Canvas factory backed by the `canvas` (node-canvas) package, shaped like the
 * factory object pdf.js accepts: `create`, `reset` and `destroy`.
 *
 * The `canvas` module is loaded lazily, so `init()` must be awaited before
 * `create()` is used.
 */
class NodeCanvasFactory {
  constructor() {
    // Both are populated by init(); declared here so the instance shape is explicit.
    this.Canvas = null;
    this.Image = null;
  }

  /**
   * Loads the `canvas` module and exposes its `Image` constructor.
   * Calling it again after a successful load is a no-op.
   * @returns {Promise<void>}
   */
  async init() {
    if (this.Canvas) return;
    this.Canvas = await import("canvas");
    this.Image = this.Canvas.Image;
  }

  /**
   * Creates a canvas of the given size together with its 2d context.
   * @param {number} width
   * @param {number} height
   * @param {boolean} transparent - Forwarded as the context `alpha` option; when truthy the canvas is cleared.
   * @returns {{canvas: object, context: object}}
   * @throws {Error} If init() has not completed yet.
   */
  create(width, height, transparent) {
    if (!this.Canvas) {
      throw new Error(
        "NodeCanvasFactory.create() was called before init() completed."
      );
    }
    const canvas = this.Canvas.createCanvas(width, height);
    const context = canvas.getContext("2d", { alpha: transparent });
    if (transparent) context.clearRect(0, 0, width, height);
    return {
      canvas,
      context,
    };
  }

  /**
   * Resizes an existing canvas in place.
   * @param {{canvas: object, context: object}} canvasAndContext
   * @param {number} width
   * @param {number} height
   * @throws {Error} If the pair no longer holds a canvas (e.g. it was destroyed).
   */
  reset(canvasAndContext, width, height) {
    if (!canvasAndContext?.canvas) {
      throw new Error("NodeCanvasFactory.reset(): canvas is not specified.");
    }
    canvasAndContext.canvas.width = width;
    canvasAndContext.canvas.height = height;
  }

  /**
   * Shrinks the canvas to 0x0 and drops both references from the pair.
   * Safe to call more than once on the same pair.
   * @param {{canvas: object, context: object}} canvasAndContext
   */
  destroy(canvasAndContext) {
    if (!canvasAndContext) return;
    const { canvas } = canvasAndContext;
    if (canvas) {
      canvas.width = 0;
      canvas.height = 0;
    }
    canvasAndContext.canvas = null;
    canvasAndContext.context = null;
  }
}
module.exports = NodeCanvasFactory;

View file

@ -1,4 +1,6 @@
const fs = require("fs").promises;
const path = require("path");
const NodeCanvasFactory = require("./CanvasFactory");
class PDFLoader {
constructor(filePath, { splitPages = true } = {}) {
@ -83,74 +85,64 @@ class PDFLoader {
* @returns {Promise<{pageContent: string, metadata: object}[]>} An array of documents with page content and metadata.
*/
async asOCR() {
const { fork } = require("child_process");
const path = require("path");
const timeout = 300_000;
const ocrDataDirectory =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../../../../server/storage/models/ocr`)
: path.resolve(
process.env.STORAGE_DIR ??
path.resolve(__dirname, `../../../../../server/storage`),
`models/ocr`
);
const documents = [];
const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
const buffer = await fs.readFile(this.filePath);
const canvasFactory = new NodeCanvasFactory();
await canvasFactory.init();
global.Image = canvasFactory.Image;
return new Promise((resolve, _) => {
const worker = fork(path.join(__dirname, "ocrWorker.js"));
let isResolved = false;
const pdfDocument = await pdfjs.getDocument({
data: new Uint8Array(buffer),
canvasFactory,
}).promise;
const cleanupAndResolve = (result = []) => {
if (!isResolved) {
isResolved = true;
worker.kill("SIGTERM");
resolve(result);
}
};
async function getPageAsBuffer(pageNumber, scale = 1) {
const page = await pdfDocument.getPage(pageNumber);
const viewport = page.getViewport(scale);
const { canvas, context } = canvasFactory.create(
viewport.width,
viewport.height,
false
);
worker.on("message", (result) => {
if (result.error) {
console.log(
`[PDFLoader] Error parsing PDF with OCR engine: ${result.error}`
);
cleanupAndResolve([]);
} else {
cleanupAndResolve([
{
pageContent: result.textContent,
metadata: {
...this.metadata,
source: this.filePath,
},
},
]);
}
});
await page.render({
canvasFactory,
canvasContext: context,
viewport,
}).promise;
setTimeout(() => {
console.log(
`[PDFLoader] OCR Worker timeout (${timeout / 1000} seconds)`
);
cleanupAndResolve([]);
}, timeout);
return canvas.toBuffer();
}
worker.on("error", (error) => {
console.error(`[PDFLoader] OCR Worker error: ${error}`);
cleanupAndResolve([]);
});
worker.send({
filePath: this.filePath,
mode: "speed", // TODO: Make this configurable
langs: ["eng"], // TODO: Make this configurable
runDir: ocrDataDirectory,
});
const { createWorker, setLogging, OEM } = require("tesseract.js");
setLogging(false);
const worker = await createWorker("eng", OEM.LSTM_ONLY, {
cachePath: path.resolve(__dirname, `../../../../storage/tmp`),
});
for (let i = 1; i <= pdfDocument.numPages; i += 1) {
const image = await getPageAsBuffer(i, 5);
const { data } = await worker.recognize(image, {}, "text");
documents.push({
pageContent: data.text,
metadata: {
...this.metadata,
loc: { pageNumber: i },
},
});
}
return documents;
}
async getPdfJS() {
try {
const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
return { getDocument: pdfjs.getDocument, version: pdfjs.version };
return {
getDocument: pdfjs.getDocument,
version: pdfjs.version,
};
} catch (e) {
console.error(e);
throw new Error(

View file

@ -1,41 +0,0 @@
const fs = require("fs");
/**
 * Runs OCR over a single file with scribe.js-ocr and returns the extracted text.
 *
 * Never rejects: every failure (loading the OCR package, preparing the run
 * directory, recognition) is reported as `{ error }` so the caller can always
 * forward a result.
 *
 * @param {string} filePath - Path of the file to OCR.
 * @param {string} [mode="speed"] - Passed to `scribe.recognize`.
 * @param {string[]} [langs=["eng"]] - Passed to `scribe.recognize`.
 * @param {string|null} [runDir=null] - If set, created when missing and made the process working directory.
 * @returns {Promise<{textContent: string}|{error: string}>}
 */
const processFile = async (
  filePath,
  mode = "speed",
  langs = ["eng"],
  runDir = null
) => {
  let scribe = null;
  try {
    // Imported inside the try so a missing or broken package becomes an
    // `{ error }` result instead of a rejection.
    ({ default: scribe } = await import("scribe.js-ocr"));

    // Change directory to the run directory if it is provided
    // This will allow the traineddata files to be persisted in storage
    // and not be deleted after the worker is done or the app is killed.
    // If not defined this will wind up pulling the traineddata files into the current
    // directory of the worker (collector/index.js) and will cause subsequent runs to be slower.
    if (runDir) {
      if (!fs.existsSync(runDir)) fs.mkdirSync(runDir, { recursive: true });
      process.chdir(runDir);
    }

    await scribe.importFiles([filePath]);
    await scribe.recognize({ mode, langs });
    const textContent = await scribe.exportData("text");
    return { textContent };
  } catch (e) {
    // Always produce a non-empty string so callers testing `result.error`
    // cannot mistake a failure for success.
    return { error: e?.message ?? String(e) };
  } finally {
    if (scribe) {
      try {
        await scribe.terminate();
      } catch (e) {
        // Cleanup failure must not replace the result computed above.
        console.error(`[ocrWorker] Failed to terminate OCR engine: ${e}`);
      }
    }
  }
};
// Worker entry point: runs one OCR job per IPC message, sends the result back
// to the parent process and then exits.
process.on(
  "message",
  async ({ filePath, mode = "speed", langs = ["eng"], runDir = null }) => {
    let result;
    try {
      result = await processFile(filePath, mode, langs, runDir);
    } catch (e) {
      // Report the failure so the parent does not have to wait for its timeout.
      result = { error: e?.message ?? String(e) };
    }
    // Exit from the send callback: exiting right after process.send() can
    // drop a message that has not been flushed to the IPC channel yet.
    process.send(result, () => process.exit(0));
  }
);

View file

@ -280,7 +280,7 @@
"@langchain/core" "~0.1"
js-tiktoken "^1.0.11"
"@mapbox/node-pre-gyp@^1.0.11":
"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
@ -361,26 +361,6 @@
unbzip2-stream "1.4.3"
yargs "17.7.2"
"@scribe.js/tesseract.js-core@^6.0.3":
version "6.0.3"
resolved "https://registry.yarnpkg.com/@scribe.js/tesseract.js-core/-/tesseract.js-core-6.0.3.tgz#cc48ccead95045e1629e9bb4c04f638c15664f89"
integrity sha512-q2bXN0yQCEs5IA9138vF+xGSfNhOaMVdPKkuZNSiz3A+SfuRZuAU3iKzIXJMdSJnWvHSBOUPLDdqguClCNX9Qw==
"@scribe.js/tesseract.js@^6.0.2":
version "6.0.2"
resolved "https://registry.yarnpkg.com/@scribe.js/tesseract.js/-/tesseract.js-6.0.2.tgz#24e0af34be38f92ffb68d1462fc203b4c209aaf7"
integrity sha512-FaLrLXtgpnQDI2fZSwg6plAgMm6s24v3rxTiG9zObKI22wPAH98CUVNR7BqgEVu3Yq7jLVWyoq14C4iJbWQcZA==
dependencies:
"@scribe.js/tesseract.js-core" "^6.0.3"
bmp-js "^0.1.0"
idb-keyval "^6.2.0"
is-url "^1.2.4"
node-fetch "^2.6.9"
opencollective-postinstall "^2.0.3"
regenerator-runtime "^0.13.3"
wasm-feature-detect "^1.2.11"
zlibjs "^0.3.1"
"@selderee/plugin-htmlparser2@^0.11.0":
version "0.11.0"
resolved "https://registry.yarnpkg.com/@selderee/plugin-htmlparser2/-/plugin-htmlparser2-0.11.0.tgz#d5b5e29a7ba6d3958a1972c7be16f4b2c188c517"
@ -456,11 +436,6 @@
dependencies:
"@types/node" "*"
"@webgpu/types@0.1.21":
version "0.1.21"
resolved "https://registry.yarnpkg.com/@webgpu/types/-/types-0.1.21.tgz#b181202daec30d66ccd67264de23814cfd176d3a"
integrity sha512-pUrWq3V5PiSGFLeLxoGqReTZmiiXwY3jRkIG5sLLKjyqNxrwm/04b4nw7LSmGWJcKk59XOM/YRTUwOzo4MMlow==
"@xenova/transformers@^2.11.0":
version "2.17.1"
resolved "https://registry.yarnpkg.com/@xenova/transformers/-/transformers-2.17.1.tgz#712f7a72c76c8aa2075749382f83dc7dd4e5a9a5"
@ -818,12 +793,14 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
canvaskit-wasm@^0.39.1:
version "0.39.1"
resolved "https://registry.yarnpkg.com/canvaskit-wasm/-/canvaskit-wasm-0.39.1.tgz#c3c8f3962cbabbedf246f7bcf90e859013c7eae9"
integrity sha512-Gy3lCmhUdKq+8bvDrs9t8+qf7RvcjuQn+we7vTVVyqgOVO1UVfHpsnBxkTZw+R4ApEJ3D5fKySl9TU11hmjl/A==
canvas@2.11.2, canvas@^2.11.2:
version "2.11.2"
resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
dependencies:
"@webgpu/types" "0.1.21"
"@mapbox/node-pre-gyp" "^1.0.0"
nan "^2.17.0"
simple-get "^3.0.3"
chalk@^2.4.2:
version "2.4.2"
@ -954,11 +931,6 @@ commander@^10.0.1:
resolved "https://registry.yarnpkg.com/commander/-/commander-10.0.1.tgz#881ee46b4f77d1c1dccc5823433aa39b022cbe06"
integrity sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug==
commander@^11.1.0:
version "11.1.0"
resolved "https://registry.yarnpkg.com/commander/-/commander-11.1.0.tgz#62fdce76006a68e5c1ab3314dc92e800eb83d906"
integrity sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ==
commander@^2.8.1:
version "2.20.3"
resolved "https://registry.yarnpkg.com/commander/-/commander-2.20.3.tgz#fd485e84c03eb4881c20722ba48035e8531aeb33"
@ -1085,6 +1057,13 @@ decamelize@1.2.0:
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==
decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"
decompress-response@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
@ -2328,6 +2307,11 @@ mime@^3.0.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==
mimic-response@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==
mimic-response@^3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
@ -2441,6 +2425,11 @@ mustache@^4.2.0:
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==
nan@^2.17.0:
version "2.22.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.22.0.tgz#31bc433fc33213c97bad36404bb68063de604de3"
integrity sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==
napi-build-utils@^1.0.1:
version "1.0.2"
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
@ -2796,6 +2785,11 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==
path2d@^0.2.0:
version "0.2.2"
resolved "https://registry.yarnpkg.com/path2d/-/path2d-0.2.2.tgz#cc85d61ed7827e7863a2ee36713d4b5315a3d85d"
integrity sha512-+vnG6S4dYcYxZd+CZxzXCNKdELYZSKfohrk98yajCo1PtRoDgCTrrwOvK1GT0UoAdVszagDVllQc0U1vaX4NUQ==
pdf-parse@^1.1.1:
version "1.1.1"
resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7"
@ -2804,6 +2798,14 @@ pdf-parse@^1.1.1:
debug "^3.1.0"
node-ensure "^0.0.0"
pdfjs-dist@4.2.67:
version "4.2.67"
resolved "https://registry.yarnpkg.com/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz#dd2a65a4b00d95cd4bc2c1f6a27c5e9eb31d512a"
integrity sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==
optionalDependencies:
canvas "^2.11.2"
path2d "^0.2.0"
peberminta@^0.9.0:
version "0.9.0"
resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352"
@ -3106,15 +3108,6 @@ sax@>=0.6.0:
resolved "https://registry.yarnpkg.com/sax/-/sax-1.3.0.tgz#a5dbe77db3be05c9d1ee7785dbd3ea9de51593d0"
integrity sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==
scribe.js-ocr@^0.7.1:
version "0.7.1"
resolved "https://registry.yarnpkg.com/scribe.js-ocr/-/scribe.js-ocr-0.7.1.tgz#bb9f3b8acfe7b591457974afe176201e7c0691b6"
integrity sha512-t9xALGmOfnNG91KQHUQf7wmsTAcuOZKcwzzuVmD4Kf+Gi0SyM+s3ve8jxxHCmvnUvFnPNYs1XKZUOmqhhfOyUw==
dependencies:
"@scribe.js/tesseract.js" "^6.0.2"
canvaskit-wasm "^0.39.1"
commander "^11.1.0"
seek-bzip@^1.0.5:
version "1.0.6"
resolved "https://registry.yarnpkg.com/seek-bzip/-/seek-bzip-1.0.6.tgz#35c4171f55a680916b52a07859ecf3b5857f21c4"
@ -3275,6 +3268,15 @@ simple-concat@^1.0.0:
resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==
simple-get@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
dependencies:
decompress-response "^4.2.0"
once "^1.3.1"
simple-concat "^1.0.0"
simple-get@^4.0.0, simple-get@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
@ -3492,6 +3494,26 @@ tar@^6.1.11:
mkdirp "^1.0.3"
yallist "^4.0.0"
tesseract.js-core@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/tesseract.js-core/-/tesseract.js-core-6.0.0.tgz#6f25da94f70f8e8f02aff47a43be61d49e6f67c3"
integrity sha512-1Qncm/9oKM7xgrQXZXNB+NRh19qiXGhxlrR8EwFbK5SaUbPZnS5OMtP/ghtqfd23hsr1ZvZbZjeuAGcMxd/ooA==
tesseract.js@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/tesseract.js/-/tesseract.js-6.0.0.tgz#62ff7fffc8833b5810430a4067785e49d5ca8e7f"
integrity sha512-tqYCod1HwJzkeZw1l6XWx+ly2hhisGcBtak9MArhYwDAxL0NgeVhLJcUjqPxZMQtpgtVUzWcpZPryi+hnaQGVw==
dependencies:
bmp-js "^0.1.0"
idb-keyval "^6.2.0"
is-url "^1.2.4"
node-fetch "^2.6.9"
opencollective-postinstall "^2.0.3"
regenerator-runtime "^0.13.3"
tesseract.js-core "^6.0.0"
wasm-feature-detect "^1.2.11"
zlibjs "^0.3.1"
text-hex@1.0.x:
version "1.0.0"
resolved "https://registry.yarnpkg.com/text-hex/-/text-hex-1.0.0.tgz#69dc9c1b17446ee79a92bf5b884bb4b9127506f5"