Add OCR support for images

* OCR PDFs as fallback in spawn thread

* wip

* build our own worker fanout and wrapper

* norm pkgs

* Add image OCR support
This commit is contained in:
Timothy Carambat 2025-02-14 12:07:33 -08:00 committed by GitHub
parent 2a9066e83a
commit 89bba68219
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 118 additions and 2 deletions
.github/workflows
collector
processSingleFile/convert
utils

View file

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View file

@ -0,0 +1,48 @@
const { v4 } = require("uuid");
const { tokenizeString } = require("../../utils/tokenizer");
const {
createdDate,
trashFile,
writeToServerDocuments,
} = require("../../utils/files");
const OCRLoader = require("../../utils/OCRLoader");
const { default: slugify } = require("slugify");
/**
 * Converts an uploaded image into a document by extracting its text with OCR.
 *
 * The uploaded file is always removed via `trashFile` once processing ends,
 * whether or not any text could be extracted.
 *
 * @param {Object} params
 * @param {string} params.fullFilePath - Path of the uploaded image on disk.
 * @param {string} params.filename - Name of the file, used for titles and log messages.
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}>}
 *   `success: false` with an empty `documents` list when OCR produced no text,
 *   otherwise `success: true` with the single written document.
 */
async function asImage({ fullFilePath = "", filename = "" }) {
  const content = await new OCRLoader().ocrImage(fullFilePath);

  // OCR yields null on failure and can yield only whitespace for images
  // that contain no readable text - neither is worth storing.
  if (!content?.trim().length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }

  console.log(`-- Working ${filename} --`);
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
    title: filename,
    docAuthor: "Unknown", // TODO: Find a better author
    description: "Unknown", // TODO: Find a better description
    docSource: "image file uploaded by the user.",
    chunkSource: "",
    published: createdDate(fullFilePath),
    // OCR output is line-oriented, so count on any whitespace rather than
    // only on single spaces.
    wordCount: content.trim().split(/\s+/).length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };

  const document = writeToServerDocuments(
    data,
    `${slugify(filename)}-${data.id}`
  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
module.exports = asImage;

View file

@ -185,6 +185,67 @@ class OCRLoader {
});
return documents;
}
/**
* Loads an image file and returns the OCRed text.
* @param {string} filePath - The path to the image file.
* @param {Object} options - The options for the OCR.
* @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
* @returns {Promise<string>} The OCRed text.
*/
async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
let content = "";
let worker = null;
if (
!filePath ||
!fs.existsSync(filePath) ||
!fs.statSync(filePath).isFile()
) {
this.log(`File ${filePath} does not exist. Skipping OCR.`);
return null;
}
const documentTitle = path.basename(filePath);
try {
this.log(`Starting OCR of ${documentTitle}`);
const startTime = Date.now();
const { createWorker, OEM } = require("tesseract.js");
worker = await createWorker("eng", OEM.LSTM_ONLY, {
cachePath: this.cacheDir,
});
// Race the timeout with the OCR
const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => {
reject(
new Error(
`OCR job took too long to complete (${
maxExecutionTime / 1000
} seconds)`
)
);
}, maxExecutionTime);
});
const processImage = async () => {
const { data } = await worker.recognize(filePath, {}, "text");
content = data.text;
};
await Promise.race([timeoutPromise, processImage()]);
this.log(`Completed OCR of ${documentTitle}!`, {
executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
});
return content;
} catch (e) {
this.log(`Error: ${e.message}`);
return null;
} finally {
if (!worker) return;
await worker.terminate();
}
}
}
module.exports = OCRLoader;

View file

@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
"video/mp4": [".mp4"],
"video/mpeg": [".mpeg"],
"application/epub+zip": [".epub"],
"image/png": [".png"],
"image/jpeg": [".jpg"],
"image/jpg": [".jpg"],
};
const SUPPORTED_FILETYPE_CONVERTERS = {
@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".wav": "./convert/asAudio.js",
".mp4": "./convert/asAudio.js",
".mpeg": "./convert/asAudio.js",
".png": "./convert/asImage.js",
".jpg": "./convert/asImage.js",
".jpeg": "./convert/asImage.js",
};
module.exports = {

View file

@ -1,6 +1,6 @@
const MimeLib = require("mime");
class MimeDetector {
nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
nonTextTypes = ["multipart", "model", "audio", "video", "font"];
badMimes = [
"application/octet-stream",
"application/zip",