Mirror of https://github.com/khoj-ai/khoj.git (synced 2024-11-23 15:38:55 +01:00)
Fix parsing for PDFs via content indexing API
This commit is contained in:
Parent: 623a97a9ee
Commit: ee062d1c48
2 changed files with 10 additions and 5 deletions
|
@ -59,9 +59,14 @@ class PdfToEntries(TextToEntries):
|
|||
entries: List[str] = []
|
||||
entry_to_location_map: List[Tuple[str, str]] = []
|
||||
for pdf_file in pdf_files:
|
||||
pdf_entries_per_file = PdfToEntries.extract_text(pdf_file)
|
||||
entries.extend(pdf_entries_per_file)
|
||||
file_to_text_map[pdf_file] = pdf_entries_per_file
|
||||
try:
|
||||
pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
|
||||
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||
entries.extend(pdf_entries_per_file)
|
||||
file_to_text_map[pdf_file] = pdf_entries_per_file
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to extract entries from file: {pdf_file}")
|
||||
logger.warning(e, exc_info=True)
|
||||
|
||||
return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))
|
||||
|
||||
|
|
|
@ -450,11 +450,11 @@ async def indexer(
|
|||
for file in files:
|
||||
file_data = get_file_content(file)
|
||||
if file_data.file_type in index_files:
|
||||
index_files[file_data.file_type][file_data.filename] = (
|
||||
index_files[file_data.file_type][file_data.name] = (
|
||||
file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}")
|
||||
|
||||
indexer_input = IndexerInput(
|
||||
org=index_files["org"],
|
||||
|
|
Loading…
Reference in a new issue