Use Python stdlib methods to write PDF and DOCX content to temp files for loaders

Use Python's standard tempfile.NamedTemporaryFile to write and
delete temporary files safely.
This commit is contained in:
Debanjum 2024-11-11 03:20:35 -08:00
parent fd15fc1e59
commit 4223b355dc
2 changed files with 23 additions and 42 deletions

View file

@ -1,7 +1,5 @@
import logging import logging
import os import tempfile
from datetime import datetime
from random import randint
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from langchain_community.document_loaders import Docx2txtLoader from langchain_community.document_loaders import Docx2txtLoader
@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries):
def extract_text(docx_file): def extract_text(docx_file):
"""Extract text from specified DOCX file""" """Extract text from specified DOCX file"""
try: try:
timestamp_now = datetime.utcnow().timestamp()
random_suffix = randint(0, 1000)
tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
docx_entry_by_pages = [] docx_entry_by_pages = []
with open(tmp_file, "wb") as f: # Create temp file with .docx extension that gets auto-deleted
bytes_content = docx_file with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
f.write(bytes_content) tmp.write(docx_file)
tmp.flush() # Ensure all data is written
# Load the content using Docx2txtLoader # Load the content using Docx2txtLoader
loader = Docx2txtLoader(tmp_file) loader = Docx2txtLoader(tmp.name)
docx_entries_per_file = loader.load() docx_entries_per_file = loader.load()
# Convert the loaded entries into the desired format
docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
# Convert the loaded entries into the desired format
docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
except Exception as e: except Exception as e:
logger.warning(f"Unable to extract text from file: {docx_file}") logger.warning(f"Unable to extract text from file: {docx_file}")
logger.warning(e, exc_info=True) logger.warning(e, exc_info=True)
finally:
if os.path.exists(f"{tmp_file}"):
os.remove(f"{tmp_file}")
return docx_entry_by_pages return docx_entry_by_pages

View file

@ -1,14 +1,10 @@
import base64
import logging import logging
import os import tempfile
from datetime import datetime from io import BytesIO
from random import randint
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from langchain_community.document_loaders import PyMuPDFLoader from langchain_community.document_loaders import PyMuPDFLoader
# importing FileObjectAdapter so that we can add new files and debug file object db.
# from khoj.database.adapters import FileObjectAdapters
from khoj.database.models import Entry as DbEntry from khoj.database.models import Entry as DbEntry
from khoj.database.models import KhojUser from khoj.database.models import KhojUser
from khoj.processor.content.text_to_entries import TextToEntries from khoj.processor.content.text_to_entries import TextToEntries
@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries):
def extract_text(pdf_file): def extract_text(pdf_file):
"""Extract text from specified PDF files""" """Extract text from specified PDF files"""
try: try:
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path # Create temp file with .pdf extension that gets auto-deleted
timestamp_now = datetime.utcnow().timestamp() with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf:
random_suffix = randint(0, 1000) tmpf.write(pdf_file)
tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf" tmpf.flush() # Ensure all data is written
pdf_entry_by_pages = []
with open(f"{tmp_file}", "wb") as f: # Load the content using PyMuPDFLoader
f.write(pdf_file) loader = PyMuPDFLoader(tmpf.name, extract_images=True)
try: pdf_entries_per_file = loader.load()
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
pdf_entry_by_pages = [page.page_content for page in loader.load()] # Convert the loaded entries into the desired format
except ImportError: pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file]
loader = PyMuPDFLoader(f"{tmp_file}")
pdf_entry_by_pages = [
page.page_content for page in loader.load()
] # page_content items list for a given pdf.
except Exception as e: except Exception as e:
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.") logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
logger.warning(e, exc_info=True) logger.warning(e, exc_info=True)
finally:
if os.path.exists(f"{tmp_file}"):
os.remove(f"{tmp_file}")
return pdf_entry_by_pages return pdf_entry_by_pages