Fix parsing for PDFs via content indexing API

This commit is contained in:
sabaimran 2024-11-07 18:17:29 -08:00
parent 623a97a9ee
commit ee062d1c48
2 changed files with 10 additions and 5 deletions

View file

@@ -59,9 +59,14 @@ class PdfToEntries(TextToEntries):
entries: List[str] = [] entries: List[str] = []
entry_to_location_map: List[Tuple[str, str]] = [] entry_to_location_map: List[Tuple[str, str]] = []
for pdf_file in pdf_files: for pdf_file in pdf_files:
pdf_entries_per_file = PdfToEntries.extract_text(pdf_file) try:
entries.extend(pdf_entries_per_file) pdf_entries_per_file = PdfToEntries.extract_text(pdf_files[pdf_file])
file_to_text_map[pdf_file] = pdf_entries_per_file entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
entries.extend(pdf_entries_per_file)
file_to_text_map[pdf_file] = pdf_entries_per_file
except Exception as e:
logger.warning(f"Unable to extract entries from file: {pdf_file}")
logger.warning(e, exc_info=True)
return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map)) return file_to_text_map, PdfToEntries.convert_pdf_entries_to_maps(entries, dict(entry_to_location_map))

View file

@@ -450,11 +450,11 @@ async def indexer(
for file in files: for file in files:
file_data = get_file_content(file) file_data = get_file_content(file)
if file_data.file_type in index_files: if file_data.file_type in index_files:
index_files[file_data.file_type][file_data.filename] = ( index_files[file_data.file_type][file_data.name] = (
file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content file_data.content.decode(file_data.encoding) if file_data.encoding else file_data.content
) )
else: else:
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}") logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file_data.name}")
indexer_input = IndexerInput( indexer_input = IndexerInput(
org=index_files["org"], org=index_files["org"],