diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py
index 00ed3ca4..a2948caa 100644
--- a/src/khoj/processor/content/docx/docx_to_entries.py
+++ b/src/khoj/processor/content/docx/docx_to_entries.py
@@ -58,28 +58,13 @@ class DocxToEntries(TextToEntries):
         file_to_text_map = dict()
         for docx_file in docx_files:
             try:
-                timestamp_now = datetime.utcnow().timestamp()
-                tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
-                with open(tmp_file, "wb") as f:
-                    bytes_content = docx_files[docx_file]
-                    f.write(bytes_content)
-
-                # Load the content using Docx2txtLoader
-                loader = Docx2txtLoader(tmp_file)
-                docx_entries_per_file = loader.load()
-
-                # Convert the loaded entries into the desired format
-                docx_texts = [page.page_content for page in docx_entries_per_file]
-
+                docx_texts = DocxToEntries.extract_text(docx_files[docx_file])
                 entry_to_location_map += zip(docx_texts, [docx_file] * len(docx_texts))
                 entries.extend(docx_texts)
                 file_to_text_map[docx_file] = docx_texts
             except Exception as e:
-                logger.warning(f"Unable to process file: {docx_file}. This file will not be indexed.")
+                logger.warning(f"Unable to extract entries from file: {docx_file}")
                 logger.warning(e, exc_info=True)
-            finally:
-                if os.path.exists(f"{tmp_file}"):
-                    os.remove(f"{tmp_file}")
         return file_to_text_map, DocxToEntries.convert_docx_entries_to_maps(entries, dict(entry_to_location_map))
 
     @staticmethod
@@ -103,3 +88,36 @@ class DocxToEntries(TextToEntries):
         logger.debug(f"Converted {len(parsed_entries)} DOCX entries to dictionaries")
 
         return entries
+
+    @staticmethod
+    def extract_text(docx_file):
+        """Extract text from the raw bytes of a DOCX file.
+
+        Writes the bytes to a temporary file, since Docx2txtLoader is given a
+        file path, and returns the page_content of every entry the loader yields.
+        Returns an empty list when extraction fails; the failure is logged.
+        """
+        # Initialize before the try block so the finally clause and the return
+        # statement never reference an unbound name
+        docx_entry_by_pages = []
+        timestamp_now = datetime.utcnow().timestamp()
+        tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
+        try:
+            with open(tmp_file, "wb") as f:
+                f.write(docx_file)
+
+            # Load the content using Docx2txtLoader
+            loader = Docx2txtLoader(tmp_file)
+            docx_entries_per_file = loader.load()
+
+            # Convert the loaded entries into the desired format
+            docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
+        except Exception as e:
+            # Do not interpolate docx_file here: it holds the file's bytes, not its name
+            logger.warning("Unable to extract text from DOCX file")
+            logger.warning(e, exc_info=True)
+        finally:
+            if os.path.exists(tmp_file):
+                os.remove(tmp_file)
+
+        return docx_entry_by_pages
diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py
index 063d1e74..35aa203f 100644
--- a/src/khoj/processor/content/pdf/pdf_to_entries.py
+++ b/src/khoj/processor/content/pdf/pdf_to_entries.py
@@ -59,32 +59,11 @@ class PdfToEntries(TextToEntries):
         entries: List[str] = []
         entry_to_location_map: List[Tuple[str, str]] = []
         for pdf_file in pdf_files:
-            try:
-                # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
-                timestamp_now = datetime.utcnow().timestamp()
-                tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf"
-                with open(f"{tmp_file}", "wb") as f:
-                    bytes = pdf_files[pdf_file]
-                    f.write(bytes)
-                try:
-                    loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
-                    pdf_entries_per_file = [page.page_content for page in loader.load()]
-                except ImportError:
-                    loader = PyMuPDFLoader(f"{tmp_file}")
-                    pdf_entries_per_file = [
-                        page.page_content for page in loader.load()
-                    ]  # page_content items list for a given pdf.
-                entry_to_location_map += zip(
-                    pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)
-                )  # this is an indexed map of pdf_entries for the pdf.
-                entries.extend(pdf_entries_per_file)
-                file_to_text_map[pdf_file] = pdf_entries_per_file
-            except Exception as e:
-                logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
-                logger.warning(e, exc_info=True)
-            finally:
-                if os.path.exists(f"{tmp_file}"):
-                    os.remove(f"{tmp_file}")
+            # pdf_files maps each file name to its raw bytes; extract_text needs the bytes, not the name
+            pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
+            entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
+            entries.extend(pdf_entries_per_file)
+            file_to_text_map[pdf_file] = pdf_entries_per_file
 
         return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))
 
@@ -109,3 +88,36 @@ class PdfToEntries(TextToEntries):
         logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")
 
         return entries
+
+    @staticmethod
+    def extract_text(pdf_file):
+        """Extract text from the raw bytes of a PDF file.
+
+        Writes the bytes to a temporary file, since PyMuPDFLoader is given a
+        file path, and returns the page_content of every entry the loader yields.
+        Returns an empty list when extraction fails; the failure is logged.
+        """
+        # Initialize before the try block so the finally clause and the return
+        # statement never reference an unbound name
+        pdf_entry_by_pages = []
+        timestamp_now = datetime.utcnow().timestamp()
+        tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf"
+        try:
+            with open(tmp_file, "wb") as f:
+                f.write(pdf_file)
+            try:
+                loader = PyMuPDFLoader(tmp_file, extract_images=False)
+                pdf_entry_by_pages = [page.page_content for page in loader.load()]
+            except ImportError:
+                # Retry without the extract_images argument when the first attempt raises ImportError
+                loader = PyMuPDFLoader(tmp_file)
+                pdf_entry_by_pages = [page.page_content for page in loader.load()]
+        except Exception as e:
+            # Do not interpolate pdf_file here: it holds the file's bytes, not its name
+            logger.warning("Unable to extract text from PDF file. This file will not be indexed.")
+            logger.warning(e, exc_info=True)
+        finally:
+            if os.path.exists(tmp_file):
+                os.remove(tmp_file)
+
+        return pdf_entry_by_pages