diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 35aa203f..20b72b8c 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -59,9 +59,14 @@ class PdfToEntries(TextToEntries): entries: List[str] = [] entry_to_location_map: List[Tuple[str, str]] = [] for pdf_file in pdf_files: - pdf_entries_per_file = PdfToEntries.extract_text(pdf_file) - entries.extend(pdf_entries_per_file) - file_to_text_map[pdf_file] = pdf_entries_per_file + try: + pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file]) + entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) + entries.extend(pdf_entries_per_file) + file_to_text_map[pdf_file] = pdf_entries_per_file + except Exception as e: + logger.warning(f"Unable to extract entries from file: {pdf_file}") + logger.warning(e, exc_info=True) return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map)) diff --git a/src/khoj/routers/api_content.py b/src/khoj/routers/api_content.py index d5a6f2ad..a83e0538 100644 --- a/src/khoj/routers/api_content.py +++ b/src/khoj/routers/api_content.py @@ -450,11 +450,11 @@ async def indexer( for file in files: file_data = get_file_content(file) if file_data.file_type in index_files: - index_files[file_data.file_type][file_data.filename] = ( + index_files[file_data.file_type][file_data.name] = ( file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content ) else: - logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}") + logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}") indexer_input = IndexerInput( org=index_files["org"],