Try adding libgl dependencies so that OCR can run in the GitHub Actions unit tests

This commit is contained in:
sabaimran 2023-11-05 15:09:40 -08:00
parent 5f1e37fff0
commit 3d6e8d53fe
3 changed files with 18 additions and 1 deletions

View file

@ -61,7 +61,7 @@ jobs:
env: env:
DEBIAN_FRONTEND: noninteractive DEBIAN_FRONTEND: noninteractive
run: | run: |
apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0 apt update && apt install -y libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
- name: ⬇️ Install Postgres - name: ⬇️ Install Postgres
env: env:

BIN
tests/data/pdf/ocr_samples.pdf vendored Normal file

Binary file not shown.

View file

@ -50,6 +50,23 @@ def test_multi_page_pdf_to_jsonl():
assert len(jsonl_data) == 6 assert len(jsonl_data) == 6
def test_ocr_page_pdf_to_jsonl():
    "Extract entries from the OCR sample PDF and verify the expected text is present."
    # Arrange
    # Load the raw bytes of the sample PDF, keyed by its path
    # NOTE(review): presumably this PDF only yields text via OCR - confirm against the fixture
    sample_pdf_path = "tests/data/pdf/ocr_samples.pdf"
    with open(sample_pdf_path, "rb") as pdf_file:
        pdf_files_by_path = {sample_pdf_path: pdf_file.read()}

    # Act
    # Extract raw entries from the Pdf, then convert them to entry maps
    raw_entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=pdf_files_by_path)
    entry_maps = PdfToEntries.convert_pdf_entries_to_maps(raw_entries, entry_to_file_map)

    # Assert
    # Exactly one entry is produced and it contains a known phrase from the sample
    assert len(entry_maps) == 1
    assert "playing on a strip of marsh" in entry_maps[0].raw
def test_get_pdf_files(tmp_path): def test_get_pdf_files(tmp_path):
"Ensure Pdf files specified via input-filter, input-files extracted" "Ensure Pdf files specified via input-filter, input-files extracted"
# Arrange # Arrange