mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Create PDF to JSONL processor using PyPDF and LangChain
Switch `pydantic` to >= 1.9.1; otherwise `langchain.document_loaders` starts throwing typing errors on Python 3.8 and 3.9
This commit is contained in:
parent
1b3effd8e6
commit
286b500f66
2 changed files with 16 additions and 28 deletions
|
@ -44,7 +44,7 @@ dependencies = [
|
||||||
"tiktoken >= 0.3.0",
|
"tiktoken >= 0.3.0",
|
||||||
"tenacity >= 8.2.2",
|
"tenacity >= 8.2.2",
|
||||||
"pillow == 9.3.0",
|
"pillow == 9.3.0",
|
||||||
"pydantic == 1.9.1",
|
"pydantic >= 1.9.1",
|
||||||
"pyqt6 == 6.3.1",
|
"pyqt6 == 6.3.1",
|
||||||
"pyyaml == 6.0",
|
"pyyaml == 6.0",
|
||||||
"rich >= 13.3.1",
|
"rich >= 13.3.1",
|
||||||
|
@ -54,6 +54,7 @@ dependencies = [
|
||||||
"uvicorn == 0.17.6",
|
"uvicorn == 0.17.6",
|
||||||
"aiohttp == 3.8.4",
|
"aiohttp == 3.8.4",
|
||||||
"langchain >= 0.0.187",
|
"langchain >= 0.0.187",
|
||||||
|
"pypdf >= 3.9.0",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,15 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import glob
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import re
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
# External Packages
|
||||||
|
from langchain.document_loaders import PyPDFLoader
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
|
||||||
from khoj.utils.constants import empty_escape_sequences
|
|
||||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
@ -90,29 +91,17 @@ class PdfToJsonl(TextToJsonl):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_pdf_entries(pdf_files):
|
def extract_pdf_entries(pdf_files):
|
||||||
"""Extract entries by heading from specified PDF files"""
|
"""Extract entries by page from specified PDF files"""
|
||||||
|
|
||||||
# Regex to extract PDF Entries by Heading
|
|
||||||
pdf_heading_regex = r"^#"
|
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
entry_to_file_map = []
|
entry_to_location_map = []
|
||||||
for pdf_file in pdf_files:
|
for pdf_file in pdf_files:
|
||||||
with open(pdf_file, "r", encoding="utf8") as f:
|
loader = PyPDFLoader(pdf_file)
|
||||||
pdf_content = f.read()
|
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||||
pdf_entries_per_file = []
|
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
||||||
any_headings = re.search(pdf_heading_regex, pdf_content, flags=re.MULTILINE)
|
entries.extend(pdf_entries_per_file)
|
||||||
for entry in re.split(pdf_heading_regex, pdf_content, flags=re.MULTILINE):
|
|
||||||
# Add heading level as the regex split removed it from entries with headings
|
|
||||||
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
|
|
||||||
stripped_entry = entry.strip(empty_escape_sequences)
|
|
||||||
if stripped_entry != "":
|
|
||||||
pdf_entries_per_file.append(f"{prefix}{stripped_entry}")
|
|
||||||
|
|
||||||
entry_to_file_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
|
return entries, dict(entry_to_location_map)
|
||||||
entries.extend(pdf_entries_per_file)
|
|
||||||
|
|
||||||
return entries, dict(entry_to_file_map)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
|
def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
|
||||||
|
@ -120,21 +109,19 @@ class PdfToJsonl(TextToJsonl):
|
||||||
entries = []
|
entries = []
|
||||||
for parsed_entry in parsed_entries:
|
for parsed_entry in parsed_entries:
|
||||||
entry_filename = Path(entry_to_file_map[parsed_entry])
|
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||||
heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
|
|
||||||
# Append base filename to compiled entry for context to model
|
# Append base filename to compiled entry for context to model
|
||||||
# Increment heading level for heading entries and make filename as its top level heading
|
heading = f"{entry_filename.stem}\n"
|
||||||
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
|
compiled_entry = f"{heading}{parsed_entry}"
|
||||||
compiled_entry = f"{prefix}{parsed_entry}"
|
|
||||||
entries.append(
|
entries.append(
|
||||||
Entry(
|
Entry(
|
||||||
compiled=compiled_entry,
|
compiled=compiled_entry,
|
||||||
raw=parsed_entry,
|
raw=parsed_entry,
|
||||||
heading=f"{prefix}{heading}",
|
heading=heading,
|
||||||
file=f"{entry_filename}",
|
file=f"{entry_filename}",
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.debug(f"Converted {len(parsed_entries)} pdf entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue