diff --git a/docker-compose.yml b/docker-compose.yml index 5cd1763c..0529b150 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,6 +21,7 @@ services: - ./tests/data/ledger/:/data/ledger/ - ./tests/data/music/:/data/music/ - ./tests/data/markdown/:/data/markdown/ + - ./tests/data/pdf/:/data/pdf/ # Embeddings and models are populated after the first run # You can set these volumes to point to empty directories on host - ./tests/data/embeddings/:/data/embeddings/ diff --git a/pyproject.toml b/pyproject.toml index 4aae6104..1750268d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ keywords = [ "markdown", "beancount", "images", + "pdf", ] classifiers = [ "Development Status :: 4 - Beta", diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 448c8bde..ae49678b 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl +from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl from khoj.search_type import image_search, text_search from khoj.utils import constants, state from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel @@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, filters=[DateFilter(), WordFilter(), FileFilter()], ) + # Initialize PDF Search + if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf: + logger.info("💸 Setting up search for pdf") + # Extract Entries, Generate PDF Embeddings + model.pdf_search = text_search.setup( + PdfToJsonl, + config.content_type.pdf, + search_config=config.search_type.asymmetric, + regenerate=regenerate, + filters=[DateFilter(), WordFilter(), FileFilter()], + ) + # Initialize Image Search if (t == state.SearchType.Image or t == None) and config.content_type.image: logger.info("🌄 Setting up search for images") diff --git a/src/khoj/interface/desktop/file_browser.py b/src/khoj/interface/desktop/file_browser.py index 4ce9725e..d7071664 100644 --- a/src/khoj/interface/desktop/file_browser.py +++ b/src/khoj/interface/desktop/file_browser.py @@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget): return "Beancount Files (*.bean *.beancount)" elif search_type == SearchType.Markdown: return "Markdown Files (*.md *.markdown)" + elif search_type == SearchType.Pdf: + return "Pdf Files (*.pdf)" elif search_type == SearchType.Music: return "Org-Music Files (*.org)" elif search_type == SearchType.Image: diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 0c7f278f..7a79b95a 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -109,6 +109,17 @@ def search( with timer("Collating results took", logger): results = text_search.collate_results(hits, entries, results_count) + elif (t == SearchType.Pdf or t == None) and state.model.pdf_search: + # query pdf files + with timer("Query took", logger): + hits, entries = text_search.query( + user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe + ) + + # collate and return results + with timer("Collating results took", logger): + results = text_search.collate_results(hits, entries, results_count) + elif (t == SearchType.Ledger or t == None) and state.model.ledger_search: # query transactions with timer("Query took", logger): diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 76baa14d..7b590d13 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -22,6 +22,7 @@ class SearchType(str, Enum): Music = "music" Markdown = "markdown" Image = "image" + Pdf = "pdf" class ProcessorType(str, Enum): @@ -61,6 +62,7 @@ class SearchModels: ledger_search: TextSearchModel = None music_search: TextSearchModel = None markdown_search: TextSearchModel = None + pdf_search: TextSearchModel = None image_search: ImageSearchModel = None plugin_search: Dict[str, TextSearchModel] = None diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index aa10a4d3..87eb07ac 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -28,6 +28,12 @@ default_config = { "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz", "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt", }, + "pdf": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz", + "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt", + }, "image": { "input-directories": None, "input-filter": None, diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index bc8ef78a..72d82ce9 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -56,6 +56,7 @@ class ContentConfig(ConfigBase): image: Optional[ImageContentConfig] music: Optional[TextContentConfig] markdown: Optional[TextContentConfig] + pdf: Optional[TextContentConfig] plugins: Optional[Dict[str, TextContentConfig]] diff --git a/tests/test_client.py b/tests/test_client.py index e7087e2c..cee0ee67 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def test_search_with_valid_content_type(client): - for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: + for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]: # Act response = client.get(f"/api/search?q=random&t={content_type}") # Assert @@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def test_update_with_valid_content_type(client): - for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: + for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]: # Act response = client.get(f"/api/update?t={content_type}") # Assert @@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def test_regenerate_with_valid_content_type(client): - for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: + for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]: # Act response = client.get(f"/api/update?force=true&t={content_type}") # Assert