Wire up the PDF-to-JSONL processor to the Khoj server layer (API, config)

- Specify PDF content to index via khoj.yml
- Index PDF content on app start and on reconfigure
- Expose PDF as a search type via API
2025-02-18 22:54:20 +00:00 · 2023-06-01 09:55:48 +05:30 · 2023-06-01 09:55:48 +05:30 · acd14a5e41
commit acd14a5e41
parent d63194c3a9
9 changed files with 40 additions and 3 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -21,6 +21,7 @@ services:
      - ./tests/data/ledger/:/data/ledger/
      - ./tests/data/music/:/data/music/
      - ./tests/data/markdown/:/data/markdown/
+      - ./tests/data/pdf/:/data/pdf/
      # Embeddings and models are populated after the first run
      # You can set these volumes to point to empty directories on host
      - ./tests/data/embeddings/:/data/embeddings/
--- a/pyproject.toml
+++ b/pyproject.toml
@ -21,6 +21,7 @@ keywords = [
    "markdown",
    "beancount",
    "images",
+    "pdf",
 ]
 classifiers = [
    "Development Status :: 4 - Beta",
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
            filters=[DateFilter(), WordFilter(), FileFilter()],
        )

+    # Initialize PDF Search
+    if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
+        logger.info("💸 Setting up search for pdf")
+        # Extract Entries, Generate PDF Embeddings
+        model.pdf_search = text_search.setup(
+            PdfToJsonl,
+            config.content_type.pdf,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[DateFilter(), WordFilter(), FileFilter()],
+        )
+
    # Initialize Image Search
    if (t == state.SearchType.Image or t == None) and config.content_type.image:
        logger.info("🌄 Setting up search for images")
--- a/src/khoj/interface/desktop/file_browser.py
+++ b/src/khoj/interface/desktop/file_browser.py
@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
            return "Beancount Files (*.bean *.beancount)"
        elif search_type == SearchType.Markdown:
            return "Markdown Files (*.md *.markdown)"
+        elif search_type == SearchType.Pdf:
+            return "Pdf Files (*.pdf)"
        elif search_type == SearchType.Music:
            return "Org-Music Files (*.org)"
        elif search_type == SearchType.Image:
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@ -109,6 +109,17 @@ def search(
        with timer("Collating results took", logger):
            results = text_search.collate_results(hits, entries, results_count)

+    elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
+        # query pdf files
+        with timer("Query took", logger):
+            hits, entries = text_search.query(
+                user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+            )
+
+        # collate and return results
+        with timer("Collating results took", logger):
+            results = text_search.collate_results(hits, entries, results_count)
+
    elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
        # query transactions
        with timer("Query took", logger):
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@ -22,6 +22,7 @@ class SearchType(str, Enum):
    Music = "music"
    Markdown = "markdown"
    Image = "image"
+    Pdf = "pdf"


 class ProcessorType(str, Enum):
@ -61,6 +62,7 @@ class SearchModels:
    ledger_search: TextSearchModel = None
    music_search: TextSearchModel = None
    markdown_search: TextSearchModel = None
+    pdf_search: TextSearchModel = None
    image_search: ImageSearchModel = None
    plugin_search: Dict[str, TextSearchModel] = None

--- a/src/khoj/utils/constants.py
+++ b/src/khoj/utils/constants.py
@ -28,6 +28,12 @@ default_config = {
            "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
            "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
        },
+        "pdf": {
+            "input-files": None,
+            "input-filter": None,
+            "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
+            "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
+        },
        "image": {
            "input-directories": None,
            "input-filter": None,
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
    image: Optional[ImageContentConfig]
    music: Optional[TextContentConfig]
    markdown: Optional[TextContentConfig]
+    pdf: Optional[TextContentConfig]
    plugins: Optional[Dict[str, TextContentConfig]]


--- a/tests/test_client.py
+++ b/tests/test_client.py
@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_search_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
        # Act
        response = client.get(f"/api/search?q=random&t={content_type}")
        # Assert
@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_update_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
        # Act
        response = client.get(f"/api/update?t={content_type}")
        # Assert
@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_regenerate_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
        # Act
        response = client.get(f"/api/update?force=true&t={content_type}")
        # Assert