mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Let fs_syncer pass PDF files directly as binary before indexing
Avoid unnecessary base64 encoding/decoding when passing PDF contents for indexing from fs_syncer to pdf_to_jsonl.
This commit is contained in:
parent
d27dc71dfe
commit
541cd59a49
3 changed files with 4 additions and 5 deletions
|
@@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl):
|
||||||
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
|
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
|
||||||
tmp_file = f"tmp_pdf_file.pdf"
|
tmp_file = f"tmp_pdf_file.pdf"
|
||||||
with open(f"{tmp_file}", "wb") as f:
|
with open(f"{tmp_file}", "wb") as f:
|
||||||
bytes = base64.b64decode(pdf_files[pdf_file])
|
bytes = pdf_files[pdf_file]
|
||||||
f.write(bytes)
|
f.write(bytes)
|
||||||
loader = PyMuPDFLoader(f"{tmp_file}")
|
loader = PyMuPDFLoader(f"{tmp_file}")
|
||||||
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||||
|
|
|
@@ -210,7 +210,7 @@ def get_pdf_files(config: TextContentConfig):
|
||||||
for file in all_pdf_files:
|
for file in all_pdf_files:
|
||||||
with open(file, "rb") as f:
|
with open(file, "rb") as f:
|
||||||
try:
|
try:
|
||||||
filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8")
|
filename_to_content_map[file] = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
|
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
|
||||||
logger.warning(e, exc_info=True)
|
logger.warning(e, exc_info=True)
|
||||||
|
|
|
@@ -1,7 +1,6 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import base64
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||||
|
@@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
|
||||||
# Extract Entries from specified Pdf files
|
# Extract Entries from specified Pdf files
|
||||||
# Read singlepage.pdf into memory as bytes
|
# Read singlepage.pdf into memory as bytes
|
||||||
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
||||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
pdf_bytes = f.read()
|
||||||
|
|
||||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||||
|
@@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Pdf files
|
# Extract Entries from specified Pdf files
|
||||||
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
||||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
pdf_bytes = f.read()
|
||||||
|
|
||||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||||
|
|
Loading…
Reference in a new issue