mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Use Python stdlib methods to write PDF, DOCX to temp files for loaders
Use the Python standard library's tempfile.NamedTemporaryFile to write and delete temporary files safely.
This commit is contained in:
parent
fd15fc1e59
commit
4223b355dc
2 changed files with 23 additions and 42 deletions
|
@ -1,7 +1,5 @@
|
|||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from random import randint
|
||||
import tempfile
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from langchain_community.document_loaders import Docx2txtLoader
|
||||
|
@ -94,26 +92,20 @@ class DocxToEntries(TextToEntries):
|
|||
def extract_text(docx_file):
|
||||
"""Extract text from specified DOCX file"""
|
||||
try:
|
||||
timestamp_now = datetime.utcnow().timestamp()
|
||||
random_suffix = randint(0, 1000)
|
||||
tmp_file = f"tmp_docx_file_{timestamp_now}_{random_suffix}.docx"
|
||||
docx_entry_by_pages = []
|
||||
with open(tmp_file, "wb") as f:
|
||||
bytes_content = docx_file
|
||||
f.write(bytes_content)
|
||||
# Create temp file with .docx extension that gets auto-deleted
|
||||
with tempfile.NamedTemporaryFile(suffix=".docx", delete=True) as tmp:
|
||||
tmp.write(docx_file)
|
||||
tmp.flush() # Ensure all data is written
|
||||
|
||||
# Load the content using Docx2txtLoader
|
||||
loader = Docx2txtLoader(tmp_file)
|
||||
docx_entries_per_file = loader.load()
|
||||
|
||||
# Convert the loaded entries into the desired format
|
||||
docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
|
||||
# Load the content using Docx2txtLoader
|
||||
loader = Docx2txtLoader(tmp.name)
|
||||
docx_entries_per_file = loader.load()
|
||||
|
||||
# Convert the loaded entries into the desired format
|
||||
docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to extract text from file: {docx_file}")
|
||||
logger.warning(e, exc_info=True)
|
||||
finally:
|
||||
if os.path.exists(f"{tmp_file}"):
|
||||
os.remove(f"{tmp_file}")
|
||||
|
||||
return docx_entry_by_pages
|
||||
|
|
|
@ -1,14 +1,10 @@
|
|||
import base64
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from random import randint
|
||||
import tempfile
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from langchain_community.document_loaders import PyMuPDFLoader
|
||||
|
||||
# importing FileObjectAdapter so that we can add new files and debug file object db.
|
||||
# from khoj.database.adapters import FileObjectAdapters
|
||||
from khoj.database.models import Entry as DbEntry
|
||||
from khoj.database.models import KhojUser
|
||||
from khoj.processor.content.text_to_entries import TextToEntries
|
||||
|
@ -97,26 +93,19 @@ class PdfToEntries(TextToEntries):
|
|||
def extract_text(pdf_file):
|
||||
"""Extract text from specified PDF files"""
|
||||
try:
|
||||
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
|
||||
timestamp_now = datetime.utcnow().timestamp()
|
||||
random_suffix = randint(0, 1000)
|
||||
tmp_file = f"tmp_pdf_file_{timestamp_now}_{random_suffix}.pdf"
|
||||
pdf_entry_by_pages = []
|
||||
with open(f"{tmp_file}", "wb") as f:
|
||||
f.write(pdf_file)
|
||||
try:
|
||||
loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
|
||||
pdf_entry_by_pages = [page.page_content for page in loader.load()]
|
||||
except ImportError:
|
||||
loader = PyMuPDFLoader(f"{tmp_file}")
|
||||
pdf_entry_by_pages = [
|
||||
page.page_content for page in loader.load()
|
||||
] # page_content items list for a given pdf.
|
||||
# Create temp file with .pdf extension that gets auto-deleted
|
||||
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmpf:
|
||||
tmpf.write(pdf_file)
|
||||
tmpf.flush() # Ensure all data is written
|
||||
|
||||
# Load the content using PyMuPDFLoader
|
||||
loader = PyMuPDFLoader(tmpf.name, extract_images=True)
|
||||
pdf_entries_per_file = loader.load()
|
||||
|
||||
# Convert the loaded entries into the desired format
|
||||
pdf_entry_by_pages = [page.page_content for page in pdf_entries_per_file]
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
|
||||
logger.warning(e, exc_info=True)
|
||||
finally:
|
||||
if os.path.exists(f"{tmp_file}"):
|
||||
os.remove(f"{tmp_file}")
|
||||
|
||||
return pdf_entry_by_pages
|
||||
|
|
Loading…
Reference in a new issue