From 4223b355dcc065de20d5d282edf5e6c91891977e Mon Sep 17 00:00:00 2001 From: Debanjum Date: Mon, 11 Nov 2024 03:20:35 -0800 Subject: [PATCH] Use python stdlib methods to write pdf, docx to temp files for loaders Use python standard method tempfile.NamedTemporaryFile to write, delete temporary files safely. --- .../processor/content/docx/docx_to_entries.py | 28 +++++--------- .../processor/content/pdf/pdf_to_entries.py | 37 +++++++------------ 2 files changed, 23 insertions(+), 42 deletions(-) diff --git a/src/khoj/processor/content/docx/docx_to_entries.py b/src/khoj/processor/content/docx/docx_to_entries.py index 55dd8bac..19d9ba13 100644 --- a/src/khoj/processor/content/docx/docx_to_entries.py +++ b/src/khoj/processor/content/docx/docx_to_entries.py @@ -1,7 +1,5 @@ import logging -import os -from datetime import datetime -from random import randint +import tempfile from typing import Dict, List, Tuple from langchain_community.document_loaders import Docx2txtLoader @@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries): def extract_text(docx_file): """Extract text from specified DOCX file""" try: - timestamp_now = datetime.utcnow().timestamp() - random_suffix = randint(0, 1000) - tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx" docx_entry_by_pages = [] - with open(tmp_file, "wb") as f: - bytes_content = docx_file - f.write(bytes_content) + # Create temp file with .docx extension that gets auto-deleted + with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp: + tmp.write(docx_file) + tmp.flush() # Ensure all data is written - # Load the content using Docx2txtLoader - loader = Docx2txtLoader(tmp_file) - docx_entries_per_file = loader.load() - - # Convert the loaded entries into the desired format - docx_entry_by_pages = [page.page_content for page in docx_entries_per_file] + # Load the content using Docx2txtLoader + loader = Docx2txtLoader(tmp.name) + docx_entries_per_file = loader.load() + # Convert the loaded entries into the desired format + docx_entry_by_pages = [page.page_content for page in docx_entries_per_file] except Exception as e: logger.warning(f"Unable to extract text from file: {docx_file}") logger.warning(e, exc_info=True) - finally: - if os.path.exists(f"{tmp_file}"): - os.remove(f"{tmp_file}") return docx_entry_by_pages diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 311ac807..39685996 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -1,14 +1,10 @@ -import base64 import logging -import os -from datetime import datetime -from random import randint +import tempfile +from io import BytesIO from typing import Dict, List, Tuple from langchain_community.document_loaders import PyMuPDFLoader -# importing FileObjectAdapter so that we can add new files and debug file object db. -# from khoj.database.adapters import FileObjectAdapters from khoj.database.models import Entry as DbEntry from khoj.database.models import KhojUser from khoj.processor.content.text_to_entries import TextToEntries @@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries): def extract_text(pdf_file): """Extract text from specified PDF files""" try: - # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path - timestamp_now = datetime.utcnow().timestamp() - random_suffix = randint(0, 1000) - tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf" - pdf_entry_by_pages = [] - with open(f"{tmp_file}", "wb") as f: - f.write(pdf_file) - try: - loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False) - pdf_entry_by_pages = [page.page_content for page in loader.load()] - except ImportError: - loader = PyMuPDFLoader(f"{tmp_file}") - pdf_entry_by_pages = [ - page.page_content for page in loader.load() - ] # page_content items list for a given pdf. + # Create temp file with .pdf extension that gets auto-deleted + with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf: + tmpf.write(pdf_file) + tmpf.flush() # Ensure all data is written + + # Load the content using PyMuPDFLoader + loader = PyMuPDFLoader(tmpf.name, extract_images=True) + pdf_entries_per_file = loader.load() + + # Convert the loaded entries into the desired format + pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file] except Exception as e: logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.") logger.warning(e, exc_info=True) - finally: - if os.path.exists(f"{tmp_file}"): - os.remove(f"{tmp_file}") return pdf_entry_by_pages