diff --git a/tests/data/pdf/multipage.pdf b/tests/data/pdf/multipage.pdf new file mode 100644 index 00000000..234f39b7 Binary files /dev/null and b/tests/data/pdf/multipage.pdf differ diff --git a/tests/data/pdf/singlepage.pdf b/tests/data/pdf/singlepage.pdf new file mode 100644 index 00000000..8071613a Binary files /dev/null and b/tests/data/pdf/singlepage.pdf differ diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_jsonl.py new file mode 100644 index 00000000..f5918bd1 --- /dev/null +++ b/tests/test_pdf_to_jsonl.py @@ -0,0 +1,74 @@ +# Standard Packages +import json + +# Internal Packages +from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl + + +def test_single_page_pdf_to_jsonl(): + "Convert single page PDF file to jsonl." + # Act + # Extract Entries from specified Pdf files + entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"]) + + # Process Each Entry from All Pdf Files + jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl( + PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map) + ) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_multi_page_pdf_to_jsonl(): + "Convert multiple pages from single PDF file to jsonl." + # Act + # Extract Entries from specified Pdf files + entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"]) + + # Process Each Entry from All Pdf Files + jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl( + PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map) + ) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 6 + + +def test_get_pdf_files(tmp_path): + "Ensure Pdf files specified via input-filter, input-files extracted" + # Arrange + # Include via input-filter globs + group1_file1 = create_file(tmp_path, filename="group1-file1.pdf") + group1_file2 = create_file(tmp_path, filename="group1-file2.pdf") + group2_file1 = create_file(tmp_path, filename="group2-file1.pdf") + group2_file2 = create_file(tmp_path, filename="group2-file2.pdf") + # Include via input-file field + file1 = create_file(tmp_path, filename="document.pdf") + # Not included by any filter + create_file(tmp_path, filename="not-included-document.pdf") + create_file(tmp_path, filename="not-included-text.txt") + + expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1])) + + # Setup input-files, input-filters + input_files = [tmp_path / "document.pdf"] + input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"] + + # Act + extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter) + + # Assert + assert len(extracted_pdf_files) == 5 + assert extracted_pdf_files == expected_files + + +# Helper Functions +def create_file(tmp_path, entry=None, filename="document.pdf"): + pdf_file = tmp_path / filename + pdf_file.touch() + if entry: + pdf_file.write_text(entry) + return pdf_file