From 017e8c1aef15862bce3fc1c318304c4c947af4c1 Mon Sep 17 00:00:00 2001 From: sabaimran <65192171+sabaimran@users.noreply.github.com> Date: Mon, 3 Jul 2023 15:55:11 -0700 Subject: [PATCH] Skip indexing a PDF that has an indexing error (#274) --- src/khoj/processor/pdf/pdf_to_jsonl.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index 3c90fdc1..b499b697 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -98,10 +98,14 @@ class PdfToJsonl(TextToJsonl): entries = [] entry_to_location_map = [] for pdf_file in pdf_files: - loader = PyPDFLoader(pdf_file) - pdf_entries_per_file = [page.page_content for page in loader.load()] - entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) - entries.extend(pdf_entries_per_file) + try: + loader = PyPDFLoader(pdf_file) + pdf_entries_per_file = [page.page_content for page in loader.load()] + entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) + entries.extend(pdf_entries_per_file) + except Exception as e: + logger.error(f"Error processing file: {pdf_file}. This file will not be indexed.") + logger.error(e) return entries, dict(entry_to_location_map)