Add try/except around image extraction step

This commit is contained in:
sabaimran 2023-11-04 19:27:18 -07:00
parent fdfab39942
commit 8c3d5a49da

View file

@@ -68,13 +68,16 @@ class PdfToEntries(TextToEntries):
with open(f"{tmp_file}", "wb") as f:
bytes = pdf_files[pdf_file]
f.write(bytes)
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True)
try:
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True)
except ModuleNotFoundError:
loader = PyMuPDFLoader(f"{tmp_file}")
pdf_entries_per_file = [page.page_content for page in loader.load()]
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
entries.extend(pdf_entries_per_file)
except Exception as e:
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
logger.warning(e)
logger.warning(e, exc_info=True)
finally:
if os.path.exists(f"{tmp_file}"):
os.remove(f"{tmp_file}")