mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2025-03-13 05:32:24 +00:00
Add OCR of image support (#3219)
* OCR PDFs as fallback in spawn thread * wip * build our own worker fanout and wrapper * norm pkgs * Add image OCR support
This commit is contained in:
parent
2a9066e83a
commit
89bba68219
5 changed files with 118 additions and 2 deletions
.github/workflows
collector
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
|
@ -6,7 +6,7 @@ concurrency:
|
|||
|
||||
on:
|
||||
push:
|
||||
branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
|
||||
branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
|
||||
paths-ignore:
|
||||
- '**.md'
|
||||
- 'cloud-deployments/*'
|
||||
|
|
48
collector/processSingleFile/convert/asImage.js
Normal file
48
collector/processSingleFile/convert/asImage.js
Normal file
|
@ -0,0 +1,48 @@
|
|||
const { v4 } = require("uuid");
|
||||
const { tokenizeString } = require("../../utils/tokenizer");
|
||||
const {
|
||||
createdDate,
|
||||
trashFile,
|
||||
writeToServerDocuments,
|
||||
} = require("../../utils/files");
|
||||
const OCRLoader = require("../../utils/OCRLoader");
|
||||
const { default: slugify } = require("slugify");
|
||||
|
||||
/**
 * Converts an uploaded image into a server document by running OCR over it.
 * The source file is trashed once processing finishes, whether or not any
 * text was recovered.
 * @param {Object} params
 * @param {string} params.fullFilePath - Path to the image file on disk.
 * @param {string} params.filename - Name of the file; used for logging, the document title, and the output name.
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}>}
 * On success `documents` holds the single value returned by `writeToServerDocuments`.
 */
async function asImage({ fullFilePath = "", filename = "" }) {
  const content = await new OCRLoader().ocrImage(fullFilePath);

  // ocrImage yields null on failure, and OCR of an image without readable text
  // commonly produces only whitespace - neither is worth embedding.
  if (!content?.trim().length) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }

  console.log(`-- Working ${filename} --`);
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
    title: filename,
    docAuthor: "Unknown", // TODO: Find a better author
    description: "Unknown", // TODO: Find a better description
    docSource: "an image file uploaded by the user.",
    chunkSource: "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };

  const document = writeToServerDocuments(
    data,
    `${slugify(filename)}-${data.id}`
  );
  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}
|
||||
|
||||
module.exports = asImage;
|
|
@ -185,6 +185,67 @@ class OCRLoader {
|
|||
});
|
||||
return documents;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads an image file and returns the OCRed text.
|
||||
* @param {string} filePath - The path to the image file.
|
||||
* @param {Object} options - The options for the OCR.
|
||||
* @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
|
||||
* @returns {Promise<string>} The OCRed text.
|
||||
*/
|
||||
async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
|
||||
let content = "";
|
||||
let worker = null;
|
||||
if (
|
||||
!filePath ||
|
||||
!fs.existsSync(filePath) ||
|
||||
!fs.statSync(filePath).isFile()
|
||||
) {
|
||||
this.log(`File ${filePath} does not exist. Skipping OCR.`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const documentTitle = path.basename(filePath);
|
||||
try {
|
||||
this.log(`Starting OCR of ${documentTitle}`);
|
||||
const startTime = Date.now();
|
||||
const { createWorker, OEM } = require("tesseract.js");
|
||||
worker = await createWorker("eng", OEM.LSTM_ONLY, {
|
||||
cachePath: this.cacheDir,
|
||||
});
|
||||
|
||||
// Race the timeout with the OCR
|
||||
const timeoutPromise = new Promise((_, reject) => {
|
||||
setTimeout(() => {
|
||||
reject(
|
||||
new Error(
|
||||
`OCR job took too long to complete (${
|
||||
maxExecutionTime / 1000
|
||||
} seconds)`
|
||||
)
|
||||
);
|
||||
}, maxExecutionTime);
|
||||
});
|
||||
|
||||
const processImage = async () => {
|
||||
const { data } = await worker.recognize(filePath, {}, "text");
|
||||
content = data.text;
|
||||
};
|
||||
|
||||
await Promise.race([timeoutPromise, processImage()]);
|
||||
this.log(`Completed OCR of ${documentTitle}!`, {
|
||||
executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
|
||||
});
|
||||
|
||||
return content;
|
||||
} catch (e) {
|
||||
this.log(`Error: ${e.message}`);
|
||||
return null;
|
||||
} finally {
|
||||
if (!worker) return;
|
||||
await worker.terminate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = OCRLoader;
|
||||
|
|
|
@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
|
|||
"video/mp4": [".mp4"],
|
||||
"video/mpeg": [".mpeg"],
|
||||
"application/epub+zip": [".epub"],
|
||||
"image/png": [".png"],
|
||||
"image/jpeg": [".jpg"],
|
||||
"image/jpg": [".jpg"],
|
||||
};
|
||||
|
||||
const SUPPORTED_FILETYPE_CONVERTERS = {
|
||||
|
@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
|
|||
".wav": "./convert/asAudio.js",
|
||||
".mp4": "./convert/asAudio.js",
|
||||
".mpeg": "./convert/asAudio.js",
|
||||
|
||||
".png": "./convert/asImage.js",
|
||||
".jpg": "./convert/asImage.js",
|
||||
".jpeg": "./convert/asImage.js",
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
const MimeLib = require("mime");
|
||||
class MimeDetector {
|
||||
nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
|
||||
nonTextTypes = ["multipart", "model", "audio", "video", "font"];
|
||||
badMimes = [
|
||||
"application/octet-stream",
|
||||
"application/zip",
|
||||
|
|
Loading…
Add table
Reference in a new issue