mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 09:25:06 +01:00
Use OCR to extract image text in PDFs
This commit is contained in:
parent
d1d210605e
commit
b5972e9311
2 changed files with 2 additions and 1 deletions
|
@@ -73,6 +73,7 @@ dependencies = [
|
|||
"gunicorn == 21.2.0",
|
||||
"lxml == 4.9.3",
|
||||
"tzdata == 2023.3",
|
||||
+ "rapidocr-onnxruntime == 1.3.8"
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
|
|
|
@@ -68,7 +68,7 @@ class PdfToEntries(TextToEntries):
|
|||
with open(f"{tmp_file}", "wb") as f:
|
||||
bytes = pdf_files[pdf_file]
|
||||
f.write(bytes)
|
||||
- loader = PyMuPDFLoader(f"{tmp_file}")
|
||||
+ loader = PyMuPDFLoader(f"{tmp_file}", extract_images=True)
|
||||
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||
entries.extend(pdf_entries_per_file)
|
||||
|
|
Loading…
Reference in a new issue