khoj-ai/khoj (https://github.com/khoj-ai/khoj.git)
Commit ecc81e06a7 (parent 394035136d)

Add separate methods for docx and pdf files to just convert files to raw text, before further processing

2 changed files with 59 additions and 43 deletions
Changed file 1 of 2: DocxToEntries processor

@@ -58,28 +58,13 @@ class DocxToEntries(TextToEntries):
         file_to_text_map = dict()
         for docx_file in docx_files:
             try:
-                timestamp_now = datetime.utcnow().timestamp()
-                tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
-                with open(tmp_file, "wb") as f:
-                    bytes_content = docx_files[docx_file]
-                    f.write(bytes_content)
-
-                # Load the content using Docx2txtLoader
-                loader = Docx2txtLoader(tmp_file)
-                docx_entries_per_file = loader.load()
-
-                # Convert the loaded entries into the desired format
-                docx_texts = [page.page_content for page in docx_entries_per_file]
-
+                docx_texts = DocxToEntries.extract_text(docx_files[docx_file])
                 entry_to_location_map += zip(docx_texts, [docx_file] * len(docx_texts))
                 entries.extend(docx_texts)
                 file_to_text_map[docx_file] = docx_texts
             except Exception as e:
-                logger.warning(f"Unable to process file: {docx_file}. This file will not be indexed.")
+                logger.warning(f"Unable to extract entries from file: {docx_file}")
                 logger.warning(e, exc_info=True)
-            finally:
-                if os.path.exists(f"{tmp_file}"):
-                    os.remove(f"{tmp_file}")

         return file_to_text_map, DocxToEntries.convert_docx_entries_to_maps(entries, dict(entry_to_location_map))

     @staticmethod
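The caller above indexes docx_files[docx_file], which implies docx_files maps each file name to that file's raw bytes. A minimal sketch of that calling shape, assuming the mapping is built from files on disk (the file name here is illustrative, not from the commit):

    # Sketch: docx_files maps file name -> raw bytes, per the diff's indexing.
    docx_files = {}
    with open("report.docx", "rb") as f:  # illustrative input file
        docx_files["report.docx"] = f.read()

    # The refactored loop hands only the bytes to the new helper:
    docx_texts = DocxToEntries.extract_text(docx_files["report.docx"])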
@@ -103,3 +88,30 @@ class DocxToEntries(TextToEntries):
         logger.debug(f"Converted {len(parsed_entries)} DOCX entries to dictionaries")

         return entries
+
+    @staticmethod
+    def extract_text(docx_file):
+        """Extract text from specified DOCX file"""
+        try:
+            timestamp_now = datetime.utcnow().timestamp()
+            tmp_file = f"tmp_docx_file_{timestamp_now}.docx"
+            docx_entry_by_pages = []
+            with open(tmp_file, "wb") as f:
+                bytes_content = docx_file
+                f.write(bytes_content)
+
+            # Load the content using Docx2txtLoader
+            loader = Docx2txtLoader(tmp_file)
+            docx_entries_per_file = loader.load()
+
+            # Convert the loaded entries into the desired format
+            docx_entry_by_pages = [page.page_content for page in docx_entries_per_file]
+
+        except Exception as e:
+            logger.warning(f"Unable to extract text from file: {docx_file}")
+            logger.warning(e, exc_info=True)
+        finally:
+            if os.path.exists(f"{tmp_file}"):
+                os.remove(f"{tmp_file}")
+
+        return docx_entry_by_pages
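The new helper round-trips the bytes through a timestamp-named temporary file because Docx2txtLoader loads from a file path rather than a byte stream. For comparison only, here is a sketch of the same pattern built on Python's standard tempfile module, which sidesteps name collisions if two extractions start within the same timestamp tick (the function name and import path are assumptions, not part of the commit):

    import os
    import tempfile

    from langchain_community.document_loaders import Docx2txtLoader  # import path assumed

    def extract_docx_text_tempfile(docx_bytes: bytes) -> list[str]:
        # NamedTemporaryFile chooses a unique name, so two concurrent extractions
        # cannot collide the way two identical utcnow() timestamps could.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            f.write(docx_bytes)
            tmp_path = f.name
        try:
            return [page.page_content for page in Docx2txtLoader(tmp_path).load()]
        finally:
            os.remove(tmp_path)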
Changed file 2 of 2: PdfToEntries processor

@@ -59,32 +59,9 @@ class PdfToEntries(TextToEntries):
         entries: List[str] = []
         entry_to_location_map: List[Tuple[str, str]] = []
         for pdf_file in pdf_files:
-            try:
-                # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
-                timestamp_now = datetime.utcnow().timestamp()
-                tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf"
-                with open(f"{tmp_file}", "wb") as f:
-                    bytes = pdf_files[pdf_file]
-                    f.write(bytes)
-                try:
-                    loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
-                    pdf_entries_per_file = [page.page_content for page in loader.load()]
-                except ImportError:
-                    loader = PyMuPDFLoader(f"{tmp_file}")
-                    pdf_entries_per_file = [
-                        page.page_content for page in loader.load()
-                    ]  # page_content items list for a given pdf.
-                entry_to_location_map += zip(
-                    pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)
-                )  # this is an indexed map of pdf_entries for the pdf.
-                entries.extend(pdf_entries_per_file)
-                file_to_text_map[pdf_file] = pdf_entries_per_file
-            except Exception as e:
-                logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
-                logger.warning(e, exc_info=True)
-            finally:
-                if os.path.exists(f"{tmp_file}"):
-                    os.remove(f"{tmp_file}")
+            pdf_entries_per_file = PdfToEntries.extract_text(pdf_file)
+            entries.extend(pdf_entries_per_file)
+            file_to_text_map[pdf_file] = pdf_entries_per_file

         return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))
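Two details of this hunk differ from the DOCX side and are easy to miss: the new loop passes the loop key pdf_file to extract_text, while the helper's body below writes its argument to disk with f.write(...), which expects the raw bytes (the DOCX caller passes docx_files[docx_file]); and the removed entry_to_location_map += zip(...) bookkeeping has no replacement, so dict(entry_to_location_map) at the return is built from an empty list. A loop consistent with the helper's body would read as in this sketch, which is an inference from the diff rather than a line in the commit:

    for pdf_file in pdf_files:
        # Index the mapping to get the raw bytes, mirroring the DOCX caller,
        # and keep the entry -> file bookkeeping the removed block maintained.
        pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
        entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
        entries.extend(pdf_entries_per_file)
        file_to_text_map[pdf_file] = pdf_entries_per_file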
@@ -109,3 +86,30 @@ class PdfToEntries(TextToEntries):
         logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")

         return entries
+
+    @staticmethod
+    def extract_text(pdf_file):
+        """Extract text from specified PDF files"""
+        try:
+            # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PDF Loader expects a file path
+            timestamp_now = datetime.utcnow().timestamp()
+            tmp_file = f"tmp_pdf_file_{timestamp_now}.pdf"
+            pdf_entry_by_pages = []
+            with open(f"{tmp_file}", "wb") as f:
+                f.write(pdf_file)
+            try:
+                loader = PyMuPDFLoader(f"{tmp_file}", extract_images=False)
+                pdf_entry_by_pages = [page.page_content for page in loader.load()]
+            except ImportError:
+                loader = PyMuPDFLoader(f"{tmp_file}")
+                pdf_entry_by_pages = [
+                    page.page_content for page in loader.load()
+                ]  # page_content items list for a given pdf.
+        except Exception as e:
+            logger.warning(f"Unable to process file: {pdf_file}. This file will not be indexed.")
+            logger.warning(e, exc_info=True)
+        finally:
+            if os.path.exists(f"{tmp_file}"):
+                os.remove(f"{tmp_file}")
+
+        return pdf_entry_by_pages
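The inner try/except ImportError retries PyMuPDFLoader without extract_images, so extraction still works when image-extraction support is unavailable. A short usage sketch for the new helper, with an illustrative file path that is not from the commit:

    # Sketch: feed raw PDF bytes to the helper, get back per-page text.
    with open("paper.pdf", "rb") as f:  # illustrative input file
        pdf_bytes = f.read()

    pages = PdfToEntries.extract_text(pdf_bytes)
    for number, text in enumerate(pages, start=1):
        print(f"--- page {number}: {len(text)} chars ---")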