Try adding libgl dependencies so OCR can run in the GitHub Actions unit tests

2024-11-23 15:38:55 +01:00 · 2023-11-05 15:09:40 -08:00 · 2023-11-05 15:09:40 -08:00 · 3d6e8d53fe
commit 3d6e8d53fe
parent 5f1e37fff0
3 changed files with 18 additions and 1 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -61,7 +61,7 @@ jobs:
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
-          apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0
+          apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
      - name: ⬇️ Install Postgres
        env:
--- a/tests/data/pdf/ocr_samples.pdf
+++ b/tests/data/pdf/ocr_samples.pdf
--- a/tests/test_pdf_to_entries.py
+++ b/tests/test_pdf_to_entries.py
@ -50,6 +50,23 @@ def test_multi_page_pdf_to_jsonl():
    assert len(jsonl_data) == 6
 def test_ocr_page_pdf_to_jsonl():
    "Convert the OCR sample PDF to entries and check the expected text is extracted."
    # Arrange
    # Load the sample PDF as raw bytes, keyed by its path
    ocr_pdf_path = "tests/data/pdf/ocr_samples.pdf"
    with open(ocr_pdf_path, "rb") as pdf_file:
        pdf_files = {ocr_pdf_path: pdf_file.read()}

    # Act
    # Extract entries from the PDF, then convert them to entry maps
    raw_entries, file_map = PdfToEntries.extract_pdf_entries(pdf_files=pdf_files)
    entry_maps = PdfToEntries.convert_pdf_entries_to_maps(raw_entries, file_map)

    # Assert
    # Exactly one entry is produced and it contains the expected phrase
    assert len(entry_maps) == 1
    assert "playing on a strip of marsh" in entry_maps[0].raw
 def test_get_pdf_files(tmp_path):
    "Ensure Pdf files specified via input-filter, input-files extracted"
    # Arrange