From 22f6db0a6bf02fa7a8473e87be529b14d86a3ba1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 22 Jun 2024 16:00:06 +0530 Subject: [PATCH] Upgrade RapidOCR and enable for Python 3.12. Fix PDF OCR test --- pyproject.toml | 2 +- tests/test_docx_to_entries.py | 2 -- tests/test_pdf_to_entries.py | 14 +++++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index de4a05f0..19e7a076 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,7 @@ dependencies = [ "psycopg2-binary == 2.9.9", "lxml == 4.9.3", "tzdata == 2023.3", - "rapidocr-onnxruntime == 1.3.11; python_version<'3.12'", + "rapidocr-onnxruntime == 1.3.22", "openai-whisper >= 20231117", "django-phonenumber-field == 7.3.0", "phonenumbers == 8.13.27", diff --git a/tests/test_docx_to_entries.py b/tests/test_docx_to_entries.py index 089c7fec..dce0109b 100644 --- a/tests/test_docx_to_entries.py +++ b/tests/test_docx_to_entries.py @@ -1,5 +1,3 @@ -import os - from khoj.processor.content.docx.docx_to_entries import DocxToEntries diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py index 31ccb387..3a25b05d 100644 --- a/tests/test_pdf_to_entries.py +++ b/tests/test_pdf_to_entries.py @@ -1,4 +1,5 @@ import os +import re from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries from khoj.utils.fs_syncer import get_pdf_files @@ -38,16 +39,23 @@ def test_multi_page_pdf_to_jsonl(): def test_ocr_page_pdf_to_jsonl(): "Convert multiple pages from single PDF file to jsonl." - # Act + # Arrange + expected_str = "playing on a strip of marsh" + expected_str_with_variable_spaces = re.compile(expected_str.replace(" ", r"\s*"), re.IGNORECASE) + # Extract Entries from specified Pdf files with open("tests/data/pdf/ocr_samples.pdf", "rb") as f: pdf_bytes = f.read() - data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes} + + # Act entries = PdfToEntries.extract_pdf_entries(pdf_files=data) + raw_entry = entries[1][0].raw + + # Assert assert len(entries) == 2 assert len(entries[1]) == 1 - assert "playing on a strip of marsh" in entries[1][0].raw + assert re.search(expected_str_with_variable_spaces, raw_entry) is not None def test_get_pdf_files(tmp_path):