mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-24 07:55:07 +01:00
Skip indexing a PDF that has an indexing error (#274)
This commit is contained in:
parent
a6f313589e
commit
017e8c1aef
1 changed file with 8 additions and 4 deletions
|
@@ -98,10 +98,14 @@ class PdfToJsonl(TextToJsonl):
|
||||||
entries = []
|
entries = []
|
||||||
entry_to_location_map = []
|
entry_to_location_map = []
|
||||||
for pdf_file in pdf_files:
|
for pdf_file in pdf_files:
|
||||||
|
try:
|
||||||
loader = PyPDFLoader(pdf_file)
|
loader = PyPDFLoader(pdf_file)
|
||||||
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||||
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||||
entries.extend(pdf_entries_per_file)
|
entries.extend(pdf_entries_per_file)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing file: {pdf_file}. This file will not be indexed.")
|
||||||
|
logger.error(e)
|
||||||
|
|
||||||
return entries, dict(entry_to_location_map)
|
return entries, dict(entry_to_location_map)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue