From 541cd59a49ce841b696c5c4900c0fd1e96709007 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 02:41:16 -0700 Subject: [PATCH] Let fs_syncer pass PDF files directly as binary before indexing There is no need for base64 encoding/decoding to pass PDF contents for indexing from fs_syncer to pdf_to_jsonl --- src/khoj/processor/pdf/pdf_to_jsonl.py | 2 +- src/khoj/utils/fs_syncer.py | 2 +- tests/test_pdf_to_jsonl.py | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index 77c34617..c24d9940 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl): # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path tmp_file = f"tmp_pdf_file.pdf" with open(f"{tmp_file}", "wb") as f: - bytes = base64.b64decode(pdf_files[pdf_file]) + bytes = pdf_files[pdf_file] f.write(bytes) loader = PyMuPDFLoader(f"{tmp_file}") pdf_entries_per_file = [page.page_content for page in loader.load()] diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index d303d39b..4fab6d81 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -210,7 +210,7 @@ def get_pdf_files(config: TextContentConfig): for file in all_pdf_files: with open(file, "rb") as f: try: - filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8") + filename_to_content_map[file] = f.read() except Exception as e: logger.warning(f"Unable to read file: {file} as PDF. 
Skipping file.") logger.warning(e, exc_info=True) diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_jsonl.py index bacce37c..b9b26986 100644 --- a/tests/test_pdf_to_jsonl.py +++ b/tests/test_pdf_to_jsonl.py @@ -1,7 +1,6 @@ # Standard Packages import json import os -import base64 # Internal Packages from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl @@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl(): # Extract Entries from specified Pdf files # Read singlepage.pdf into memory as bytes with open("tests/data/pdf/singlepage.pdf", "rb") as f: - pdf_bytes = base64.b64encode(f.read()).decode("utf-8") + pdf_bytes = f.read() data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) @@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl(): # Act # Extract Entries from specified Pdf files with open("tests/data/pdf/multipage.pdf", "rb") as f: - pdf_bytes = base64.b64encode(f.read()).decode("utf-8") + pdf_bytes = f.read() data = {"tests/data/pdf/multipage.pdf": pdf_bytes} entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)