diff --git a/pyproject.toml b/pyproject.toml index c816f4d2..25a78ab9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ dependencies = [ "gunicorn == 21.2.0", "lxml == 4.9.3", "tzdata == 2023.3", + "rapidocr-onnxruntime == 1.3.8" ] dynamic = ["version"] diff --git a/src/khoj/processor/pdf/pdf_to_entries.py b/src/khoj/processor/pdf/pdf_to_entries.py index 24dcdc5a..19d463eb 100644 --- a/src/khoj/processor/pdf/pdf_to_entries.py +++ b/src/khoj/processor/pdf/pdf_to_entries.py @@ -68,7 +68,7 @@ class PdfToEntries(TextToEntries): with open(f"{tmp_file}", "wb") as f: bytes = pdf_files[pdf_file] f.write(bytes) - loader = PyMuPDFLoader(f"{tmp_file}") + loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True) pdf_entries_per_file = [page.page_content for page in loader.load()] entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) entries.extend(pdf_entries_per_file)