Add OCR of image support (#3219)

* OCR PDFs as fallback in spawn thread * wip * build our own worker fanout and wrapper * norm pkgs * Add image OCR support
2025-03-13 05:32:24 +00:00 · 2025-02-14 12:07:33 -08:00 · 2025-02-14 12:07:33 -08:00 · 89bba68219
commit 89bba68219
parent 2a9066e83a
5 changed files with 118 additions and 2 deletions
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@ -6,7 +6,7 @@ concurrency:

 on:
  push:
-    branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
+    branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
    paths-ignore:
      - '**.md'
      - 'cloud-deployments/*'
--- a/collector/processSingleFile/convert/asImage.js
+++ b/collector/processSingleFile/convert/asImage.js
@ -0,0 +1,48 @@
+const { v4 } = require("uuid");
+const { tokenizeString } = require("../../utils/tokenizer");
+const {
+  createdDate,
+  trashFile,
+  writeToServerDocuments,
+} = require("../../utils/files");
+const OCRLoader = require("../../utils/OCRLoader");
+const { default: slugify } = require("slugify");
+
+async function asImage({ fullFilePath = "", filename = "" }) {
+  let content = await new OCRLoader().ocrImage(fullFilePath);
+
+  if (!content?.length) {
+    console.error(`Resulting text content was empty for ${filename}.`);
+    trashFile(fullFilePath);
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
+  }
+
+  console.log(`-- Working ${filename} --`);
+  const data = {
+    id: v4(),
+    url: "file://" + fullFilePath,
+    title: filename,
+    docAuthor: "Unknown", // TODO: Find a better author
+    description: "Unknown", // TODO: Find a better description
+    docSource: "a text file uploaded by the user.",
+    chunkSource: "",
+    published: createdDate(fullFilePath),
+    wordCount: content.split(" ").length,
+    pageContent: content,
+    token_count_estimate: tokenizeString(content),
+  };
+
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
+  trashFile(fullFilePath);
+  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
+  return { success: true, reason: null, documents: [document] };
+}
+
+module.exports = asImage;
--- a/collector/utils/OCRLoader/index.js
+++ b/collector/utils/OCRLoader/index.js
@ -185,6 +185,67 @@ class OCRLoader {
    });
    return documents;
  }
+
+  /**
+   * Loads an image file and returns the OCRed text.
+   * @param {string} filePath - The path to the image file.
+   * @param {Object} options - The options for the OCR.
+   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
+   * @returns {Promise<string>} The OCRed text.
+   */
+  async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
+    let content = "";
+    let worker = null;
+    if (
+      !filePath ||
+      !fs.existsSync(filePath) ||
+      !fs.statSync(filePath).isFile()
+    ) {
+      this.log(`File ${filePath} does not exist. Skipping OCR.`);
+      return null;
+    }
+
+    const documentTitle = path.basename(filePath);
+    try {
+      this.log(`Starting OCR of ${documentTitle}`);
+      const startTime = Date.now();
+      const { createWorker, OEM } = require("tesseract.js");
+      worker = await createWorker("eng", OEM.LSTM_ONLY, {
+        cachePath: this.cacheDir,
+      });
+
+      // Race the timeout with the OCR
+      const timeoutPromise = new Promise((_, reject) => {
+        setTimeout(() => {
+          reject(
+            new Error(
+              `OCR job took too long to complete (${
+                maxExecutionTime / 1000
+              } seconds)`
+            )
+          );
+        }, maxExecutionTime);
+      });
+
+      const processImage = async () => {
+        const { data } = await worker.recognize(filePath, {}, "text");
+        content = data.text;
+      };
+
+      await Promise.race([timeoutPromise, processImage()]);
+      this.log(`Completed OCR of ${documentTitle}!`, {
+        executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
+      });
+
+      return content;
+    } catch (e) {
+      this.log(`Error: ${e.message}`);
+      return null;
+    } finally {
+      if (!worker) return;
+      await worker.terminate();
+    }
+  }
 }

 module.exports = OCRLoader;
--- a/collector/utils/constants.js
+++ b/collector/utils/constants.js
@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
  "video/mp4": [".mp4"],
  "video/mpeg": [".mpeg"],
  "application/epub+zip": [".epub"],
+  "image/png": [".png"],
+  "image/jpeg": [".jpg"],
+  "image/jpg": [".jpg"],
 };

 const SUPPORTED_FILETYPE_CONVERTERS = {
@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
  ".wav": "./convert/asAudio.js",
  ".mp4": "./convert/asAudio.js",
  ".mpeg": "./convert/asAudio.js",
+
+  ".png": "./convert/asImage.js",
+  ".jpg": "./convert/asImage.js",
+  ".jpeg": "./convert/asImage.js",
 };

 module.exports = {
--- a/collector/utils/files/mime.js
+++ b/collector/utils/files/mime.js
@ -1,6 +1,6 @@
 const MimeLib = require("mime");
 class MimeDetector {
-  nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
+  nonTextTypes = ["multipart", "model", "audio", "video", "font"];
  badMimes = [
    "application/octet-stream",
    "application/zip",