Make RapidOCR dependency optional as its requirements are flaky

RapidOCR depends on OpenCV, which by default requires a bunch of GUI
parameters. This system package dependency set (like libgl1) is flaky.

Making the RapidOCR dependency optional should allow khoj to be more
resilient to setup/dependency failures.

The trade-off is that OCR for documents may not always be available, and
finding out when this happens will require looking at the server logs.
This commit is contained in:
Debanjum Singh Solanky 2024-09-18 19:12:16 -07:00
parent 0a568244fd
commit 077b88bafa
2 changed files with 7 additions and 5 deletions

View file

@ -73,7 +73,7 @@ dependencies = [
"psycopg2-binary == 2.9.9", "psycopg2-binary == 2.9.9",
"lxml == 4.9.3", "lxml == 4.9.3",
"tzdata == 2023.3", "tzdata == 2023.3",
"rapidocr-onnxruntime == 1.3.22", "rapidocr-onnxruntime == 1.3.24",
"openai-whisper >= 20231117", "openai-whisper >= 20231117",
"django-phonenumber-field == 7.3.0", "django-phonenumber-field == 7.3.0",
"phonenumbers == 8.13.27", "phonenumbers == 8.13.27",

View file

@ -4,8 +4,6 @@ import os
from datetime import datetime from datetime import datetime
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from rapidocr_onnxruntime import RapidOCR
from khoj.database.models import Entry as DbEntry from khoj.database.models import Entry as DbEntry
from khoj.database.models import KhojUser from khoj.database.models import KhojUser
from khoj.processor.content.text_to_entries import TextToEntries from khoj.processor.content.text_to_entries import TextToEntries
@ -58,7 +56,6 @@ class ImageToEntries(TextToEntries):
entry_to_location_map: List[Tuple[str, str]] = [] entry_to_location_map: List[Tuple[str, str]] = []
for image_file in image_files: for image_file in image_files:
try: try:
loader = RapidOCR()
bytes = image_files[image_file] bytes = image_files[image_file]
# write the image to a temporary file # write the image to a temporary file
timestamp_now = datetime.utcnow().timestamp() timestamp_now = datetime.utcnow().timestamp()
@ -71,13 +68,18 @@ class ImageToEntries(TextToEntries):
bytes = image_files[image_file] bytes = image_files[image_file]
f.write(bytes) f.write(bytes)
try: try:
from rapidocr_onnxruntime import RapidOCR
loader = RapidOCR()
image_entries_per_file = "" image_entries_per_file = ""
result, _ = loader(tmp_file) result, _ = loader(tmp_file)
if result: if result:
expanded_entries = [text[1] for text in result] expanded_entries = [text[1] for text in result]
image_entries_per_file = " ".join(expanded_entries) image_entries_per_file = " ".join(expanded_entries)
except ImportError: except ImportError:
logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.") logger.warning(
f"Unable to process image or scanned file for text: {image_file}. This file will not be indexed."
)
continue continue
entry_to_location_map.append((image_entries_per_file, image_file)) entry_to_location_map.append((image_entries_per_file, image_file))
entries.extend([image_entries_per_file]) entries.extend([image_entries_per_file])