mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Make RapidOCR dependency optional as its requirements are flaky
RapidOCR depends on OpenCV, which by default requires a bunch of GUI parameters. This system package dependency set (like libgl1) is flaky. Making the RapidOCR dependency optional should allow khoj to be more resilient to setup/dependency failures. The trade-off is that OCR for documents may not always be available, and it'll require looking at server logs to find out when this happens.
This commit is contained in:
parent
0a568244fd
commit
077b88bafa
2 changed files with 7 additions and 5 deletions
|
@ -73,7 +73,7 @@ dependencies = [
|
||||||
"psycopg2-binary == 2.9.9",
|
"psycopg2-binary == 2.9.9",
|
||||||
"lxml == 4.9.3",
|
"lxml == 4.9.3",
|
||||||
"tzdata == 2023.3",
|
"tzdata == 2023.3",
|
||||||
"rapidocr-onnxruntime == 1.3.22",
|
"rapidocr-onnxruntime == 1.3.24",
|
||||||
"openai-whisper >= 20231117",
|
"openai-whisper >= 20231117",
|
||||||
"django-phonenumber-field == 7.3.0",
|
"django-phonenumber-field == 7.3.0",
|
||||||
"phonenumbers == 8.13.27",
|
"phonenumbers == 8.13.27",
|
||||||
|
|
|
@ -4,8 +4,6 @@ import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
|
|
||||||
from rapidocr_onnxruntime import RapidOCR
|
|
||||||
|
|
||||||
from khoj.database.models import Entry as DbEntry
|
from khoj.database.models import Entry as DbEntry
|
||||||
from khoj.database.models import KhojUser
|
from khoj.database.models import KhojUser
|
||||||
from khoj.processor.content.text_to_entries import TextToEntries
|
from khoj.processor.content.text_to_entries import TextToEntries
|
||||||
|
@ -58,7 +56,6 @@ class ImageToEntries(TextToEntries):
|
||||||
entry_to_location_map: List[Tuple[str, str]] = []
|
entry_to_location_map: List[Tuple[str, str]] = []
|
||||||
for image_file in image_files:
|
for image_file in image_files:
|
||||||
try:
|
try:
|
||||||
loader = RapidOCR()
|
|
||||||
bytes = image_files[image_file]
|
bytes = image_files[image_file]
|
||||||
# write the image to a temporary file
|
# write the image to a temporary file
|
||||||
timestamp_now = datetime.utcnow().timestamp()
|
timestamp_now = datetime.utcnow().timestamp()
|
||||||
|
@ -71,13 +68,18 @@ class ImageToEntries(TextToEntries):
|
||||||
bytes = image_files[image_file]
|
bytes = image_files[image_file]
|
||||||
f.write(bytes)
|
f.write(bytes)
|
||||||
try:
|
try:
|
||||||
|
from rapidocr_onnxruntime import RapidOCR
|
||||||
|
|
||||||
|
loader = RapidOCR()
|
||||||
image_entries_per_file = ""
|
image_entries_per_file = ""
|
||||||
result, _ = loader(tmp_file)
|
result, _ = loader(tmp_file)
|
||||||
if result:
|
if result:
|
||||||
expanded_entries = [text[1] for text in result]
|
expanded_entries = [text[1] for text in result]
|
||||||
image_entries_per_file = " ".join(expanded_entries)
|
image_entries_per_file = " ".join(expanded_entries)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning(f"Unable to process file: {image_file}. This file will not be indexed.")
|
logger.warning(
|
||||||
|
f"Unable to process image or scanned file for text: {image_file}. This file will not be indexed."
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
entry_to_location_map.append((image_entries_per_file, image_file))
|
entry_to_location_map.append((image_entries_per_file, image_file))
|
||||||
entries.extend([image_entries_per_file])
|
entries.extend([image_entries_per_file])
|
||||||
|
|
Loading…
Reference in a new issue