Upgrade RapidOCR and enable for Python 3.12. Fix PDF OCR test

2024-11-23 15:38:55 +01:00 · 2024-06-22 16:00:06 +05:30 · 2024-06-22 16:00:06 +05:30 · 22f6db0a6b
commit 22f6db0a6b
parent 55a23eae25
3 changed files with 12 additions and 6 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -73,7 +73,7 @@ dependencies = [
    "psycopg2-binary == 2.9.9",
    "lxml == 4.9.3",
    "tzdata == 2023.3",
-    "rapidocr-onnxruntime == 1.3.11; python_version<'3.12'",
+    "rapidocr-onnxruntime == 1.3.22",
    "openai-whisper >= 20231117",
    "django-phonenumber-field == 7.3.0",
    "phonenumbers == 8.13.27",
--- a/tests/test_docx_to_entries.py
+++ b/tests/test_docx_to_entries.py
@@ -1,5 +1,3 @@
-import os
-
 from khoj.processor.content.docx.docx_to_entries import DocxToEntries


--- a/tests/test_pdf_to_entries.py
+++ b/tests/test_pdf_to_entries.py
@@ -1,4 +1,5 @@
 import os
+import re

 from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
 from khoj.utils.fs_syncer import get_pdf_files
@@ -38,16 +39,23 @@ def test_multi_page_pdf_to_jsonl():

 def test_ocr_page_pdf_to_jsonl():
    "Convert multiple pages from single PDF file to jsonl."
-    # Act
+    # Arrange
+    expected_str = "playing on a strip of marsh"
+    expected_str_with_variable_spaces = re.compile(expected_str.replace(" ", r"\s*"), re.IGNORECASE)
+
    # Extract Entries from specified Pdf files
    with open("tests/data/pdf/ocr_samples.pdf", "rb") as f:
        pdf_bytes = f.read()
-
    data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}
+
+    # Act
    entries = PdfToEntries.extract_pdf_entries(pdf_files=data)
+    raw_entry = entries[1][0].raw
+
+    # Assert
    assert len(entries) == 2
    assert len(entries[1]) == 1
-    assert "playing on a strip of marsh" in entries[1][0].raw
+    assert re.search(expected_str_with_variable_spaces, raw_entry) is not None


 def test_get_pdf_files(tmp_path):