mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 09:25:06 +01:00
Fix parsing for PDFs via content indexing API
This commit is contained in:
parent
623a97a9ee
commit
ee062d1c48
2 changed files with 10 additions and 5 deletions
|
@@ -59,9 +59,14 @@ class PdfToEntries(TextToEntries):
|
||||||
entries: List[str] = []
|
entries: List[str] = []
|
||||||
entry_to_location_map: List[Tuple[str, str]] = []
|
entry_to_location_map: List[Tuple[str, str]] = []
|
||||||
for pdf_file in pdf_files:
|
for pdf_file in pdf_files:
|
||||||
pdf_entries_per_file = PdfToEntries.extract_text(pdf_file)
|
try:
|
||||||
entries.extend(pdf_entries_per_file)
|
pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
|
||||||
file_to_text_map[pdf_file] = pdf_entries_per_file
|
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||||
|
entries.extend(pdf_entries_per_file)
|
||||||
|
file_to_text_map[pdf_file] = pdf_entries_per_file
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Unable to extract entries from file: {pdf_file}")
|
||||||
|
logger.warning(e, exc_info=True)
|
||||||
|
|
||||||
return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))
|
return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))
|
||||||
|
|
||||||
|
|
|
@@ -450,11 +450,11 @@ async def indexer(
|
||||||
for file in files:
|
for file in files:
|
||||||
file_data = get_file_content(file)
|
file_data = get_file_content(file)
|
||||||
if file_data.file_type in index_files:
|
if file_data.file_type in index_files:
|
||||||
index_files[file_data.file_type][file_data.filename] = (
|
index_files[file_data.file_type][file_data.name] = (
|
||||||
file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
|
file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}")
|
||||||
|
|
||||||
indexer_input = IndexerInput(
|
indexer_input = IndexerInput(
|
||||||
org=index_files["org"],
|
org=index_files["org"],
|
||||||
|
|
Loading…
Reference in a new issue