diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 84fbb1aa..697579da 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -61,7 +61,7 @@ jobs:
         env:
           DEBIAN_FRONTEND: noninteractive
         run: |
-          apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0
+          apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
 
       - name: ⬇️ Install Postgres
         env:
diff --git a/tests/data/pdf/ocr_samples.pdf b/tests/data/pdf/ocr_samples.pdf
new file mode 100644
index 00000000..100f60e0
Binary files /dev/null and b/tests/data/pdf/ocr_samples.pdf differ
diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py
index 81ea18c8..3ab44639 100644
--- a/tests/test_pdf_to_entries.py
+++ b/tests/test_pdf_to_entries.py
@@ -50,6 +50,23 @@ def test_multi_page_pdf_to_jsonl():
     assert len(jsonl_data) == 6
 
 
+def test_ocr_page_pdf_to_jsonl():
+    "Convert the OCR sample PDF to entries and verify the expected text is extracted."
+    # Act
+    # Read the OCR sample Pdf as bytes and extract Entries from it
+    with open("tests/data/pdf/ocr_samples.pdf", "rb") as f:
+        pdf_bytes = f.read()
+
+    data = {"tests/data/pdf/ocr_samples.pdf": pdf_bytes}
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
+
+    # Process Each Entry from All Pdf Files
+    entries = PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+
+    assert len(entries) == 1
+    assert "playing on a strip of marsh" in entries[0].raw
+
+
 def test_get_pdf_files(tmp_path):
     "Ensure Pdf files specified via input-filter, input-files extracted"
     # Arrange