Mirror of https://github.com/khoj-ai/khoj.git
Search PDF files with Khoj. Integrate with LangChain
- **Introduce Khoj to LangChain**: Call GPT with LangChain for Khoj Chat
- **Search (and Chat about) PDF files with Khoj**
  - Create PDF to JSONL Processor: Convert PDF content into standardized JSONL format
  - Expose PDF search type via Khoj server API
  - Enable querying PDF files via Obsidian, Emacs and Web interfaces
Commit: e022910f31
24 changed files with 608 additions and 200 deletions
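The new `PdfToJsonl` processor itself is not shown in the hunks below; only its imports and call sites are. As a rough, hypothetical sketch of the idea it implements, turning PDF content into standardized JSONL entries with the `pypdf` dependency this commit adds, it might look like the following. The function name and entry fields are illustrative (chosen to match the `compiled`/`file` keys the interfaces read), not the processor's actual internals:

```python
# Hypothetical sketch of PDF -> JSONL conversion, in the spirit of the
# PdfToJsonl processor this commit introduces. Not the actual processor code.
import json
from pypdf import PdfReader  # pypdf >= 3.9.0 is added to dependencies below

def pdf_to_jsonl(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    entries = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if not text.strip():
            continue  # skip pages without extractable text
        entries.append({
            # Filename on the first line, matching what render_pdf() splits off
            "compiled": f"{pdf_path}\n{text}",
            "raw": text,
            "file": pdf_path,
            "page": page_number,  # illustrative field, not in the real schema
        })
    # JSONL: one JSON object per line
    return "\n".join(json.dumps(entry) for entry in entries)
```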
@@ -63,7 +63,7 @@
 - **General**
 - **Natural**: Advanced natural language understanding using Transformer based ML Models
 - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
-- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions and Photos
+- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos
 - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)

 ## Demos
@@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 - Install Khoj via `pip` and start Khoj backend in non-gui mode
 - Install Khoj plugin via Community Plugins settings pane on Obsidian app
 - Check the new Khoj plugin settings
-- Let Khoj backend index the markdown files in the current Vault
+- Let Khoj backend index the markdown, pdf files in the current Vault
 - Open Khoj plugin on Obsidian via Search button on Left Pane
 - Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj

 ##### 2. Configure

-- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes and beancount directories
+- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories
 - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml)

 ##### 3. Run
@@ -21,6 +21,7 @@ services:
       - ./tests/data/ledger/:/data/ledger/
       - ./tests/data/music/:/data/music/
       - ./tests/data/markdown/:/data/markdown/
+      - ./tests/data/pdf/:/data/pdf/
       # Embeddings and models are populated after the first run
       # You can set these volumes to point to empty directories on host
       - ./tests/data/embeddings/:/data/embeddings/
@@ -21,6 +21,7 @@ keywords = [
     "markdown",
     "beancount",
     "images",
+    "pdf",
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -44,7 +45,7 @@ dependencies = [
     "tiktoken >= 0.3.0",
     "tenacity >= 8.2.2",
     "pillow == 9.3.0",
-    "pydantic == 1.9.1",
+    "pydantic >= 1.9.1",
     "pyqt6 == 6.3.1",
     "pyyaml == 6.0",
     "rich >= 13.3.1",
@@ -53,6 +54,8 @@ dependencies = [
     "torch == 1.13.1",
     "uvicorn == 0.17.6",
     "aiohttp == 3.8.4",
+    "langchain >= 0.0.187",
+    "pypdf >= 3.9.0",
 ]
 dynamic = ["version"]
@@ -4,7 +4,7 @@

 ;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
 ;; Description: A search assistant for your second brain
-;; Keywords: search, chat, org-mode, outlines, markdown, beancount, image
+;; Keywords: search, chat, org-mode, outlines, markdown, pdf, beancount, image
 ;; Version: 0.6.2
 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
 ;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs
@@ -29,8 +29,8 @@
 ;;; Commentary:

 ;; Create a search assistant for your `org-mode', `markdown' notes,
-;; `beancount' transactions and images. This package exposes two
-;; assistance modes, search and chat:
+;; `beancount' transactions, PDFs and images. This package exposes
+;; two assistance modes, search and chat:
 ;;
 ;; Chat provides faster answers, iterative discovery and assisted
 ;; creativity. It requires your OpenAI API key to access GPT models
@@ -95,6 +95,7 @@
 (const "markdown")
 (const "ledger")
 (const "image")
+(const "pdf")
 (const "music")))
@@ -140,6 +141,8 @@ NO-PAGING FILTER))
 "C-x l | ledger\n")
 (when (member 'image enabled-content-types)
 "C-x i | image\n")
+(when (member 'pdf enabled-content-types)
+"C-x p | pdf\n")
 (when (member 'music enabled-content-types)
 "C-x M | music\n"))))
@@ -150,6 +153,7 @@ NO-PAGING FILTER))
 (defun khoj--search-ledger () "Set content-type to `ledger'." (interactive) (setq khoj--content-type "ledger"))
 (defun khoj--search-images () "Set content-type to image." (interactive) (setq khoj--content-type "image"))
 (defun khoj--search-music () "Set content-type to music." (interactive) (setq khoj--content-type "music"))
+(defun khoj--search-pdf () "Set content-type to pdf." (interactive) (setq khoj--content-type "pdf"))
 (defun khoj--improve-rank () "Use cross-encoder to rerank search results." (interactive) (khoj--incremental-search t))
 (defun khoj--make-search-keymap (&optional existing-keymap)
 "Setup keymap to configure Khoj search. Build of EXISTING-KEYMAP when passed."
@@ -164,6 +168,8 @@ NO-PAGING FILTER))
 (define-key kmap (kbd "C-x l") #'khoj--search-ledger))
 (when (member 'image enabled-content-types)
 (define-key kmap (kbd "C-x i") #'khoj--search-images))
+(when (member 'pdf enabled-content-types)
+(define-key kmap (kbd "C-x p") #'khoj--search-pdf))
 (when (member 'music enabled-content-types)
 (define-key kmap (kbd "C-x M") #'khoj--search-music))
 kmap))
@@ -544,6 +550,22 @@ CONFIG is json obtained from Khoj config API."
 ;; remove trailing (, ) or SPC from extracted entries string
 (replace-regexp-in-string "[\(\) ]$" "")))

+(defun khoj--extract-entries-as-pdf (json-response query)
+  "Convert QUERY, JSON-RESPONSE from API with PDF results to `org-mode' entries."
+  (thread-last
+   json-response
+   ;; Extract and render each pdf entry from response
+   (mapcar (lambda (json-response-item)
+             (thread-last
+              ;; Extract pdf entry from each item in json response
+              (cdr (assoc 'compiled (assoc 'additional json-response-item)))
+              ;; Format pdf entry as a org entry string
+              (format "** %s\n\n"))))
+   ;; Render entries into org formatted string with query set as as top level heading
+   (format "* %s\n%s\n" query)
+   ;; remove leading (, ) or SPC from extracted entries string
+   (replace-regexp-in-string "^[\(\) ]" "")))
+
 (defun khoj--extract-entries-as-images (json-response query)
 "Convert JSON-RESPONSE, QUERY from API to html with images."
 (let ((image-results-buffer-html-format-str "<html>\n<body>\n<h1>%s</h1>%s\n\n</body>\n</html>")
@@ -592,6 +614,7 @@ CONFIG is json obtained from Khoj config API."
 ((and (member 'music enabled-content-types) (equal buffer-name "Music.org")) "music")
 ((and (member 'ledger enabled-content-types) (or (equal file-extension "bean") (equal file-extension "beancount"))) "ledger")
 ((and (member 'org enabled-content-types) (equal file-extension "org")) "org")
+((and (member 'org enabled-content-types) (equal file-extension "pdf")) "pdf")
 ((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown")
 (t khoj-default-content-type))))
@@ -647,16 +670,19 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
     (insert
      (cond ((or (equal content-type "org") (equal content-type "music")) (khoj--extract-entries-as-org json-response query))
            ((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query))
+           ((equal content-type "pdf") (khoj--extract-entries-as-pdf json-response query))
            ((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query))
            ((equal content-type "image") (khoj--extract-entries-as-images json-response query))
            (t (khoj--extract-entries json-response query))))
-    (cond ((equal content-type "org") (progn (visual-line-mode)
-                                             (org-mode)
-                                             (setq-local
-                                              org-startup-folded "showall"
-                                              org-hide-leading-stars t
-                                              org-startup-with-inline-images t)
-                                             (org-set-startup-visibility)))
+    (cond ((or (equal content-type "pdf")
+               (equal content-type "org"))
+           (progn (visual-line-mode)
+                  (org-mode)
+                  (setq-local
+                   org-startup-folded "showall"
+                   org-hide-leading-stars t
+                   org-startup-with-inline-images t)
+                  (org-set-startup-visibility)))
           ((equal content-type "markdown") (progn (markdown-mode)
                                                   (visual-line-mode)))
           ((equal content-type "ledger") (beancount-mode))
@@ -973,7 +999,7 @@ Paragraph only starts at first text after blank line."
 ;; set content type to: last used > based on current buffer > default type
 :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
 ;; dynamically set choices to content types enabled on khoj backend
-:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "ledger" "music" "image")))
+:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "pdf" "ledger" "music" "image")))

 (transient-define-suffix khoj--search-command (&optional args)
 (interactive (list (transient-args transient-current-command)))
@@ -42,7 +42,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 1. Install Khoj via `pip` and start Khoj backend in non-gui mode
 2. Install Khoj plugin via Community Plugins settings pane on Obsidian app
 3. Check the new Khoj plugin settings
-4. Wait for Khoj backend to index markdown files in the current Vault
+4. Wait for Khoj backend to index markdown, PDF files in the current Vault
 5. Open Khoj plugin on Obsidian via Search button on Left Pane
 6. Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 7. Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -151,7 +151,7 @@ The plugin implements the following functionality to search your notes with Khoj
 - [X] Open the Khoj search modal via left ribbon icon or the *Khoj: Search* command
 - [X] Render results as Markdown preview to improve readability
 - [X] Configure Khoj via the plugin setting tab on the settings page
-  - Set Obsidian Vault to Index with Khoj. Defaults to all markdown files in current Vault
+  - Set Obsidian Vault to Index with Khoj. Defaults to all markdown, PDF files in current Vault
   - Set URL of Khoj backend
   - Set Number of Search Results to show in Search Modal
 - [X] Allow reranking of result to improve search quality
@@ -89,12 +89,24 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
     async getSuggestions(query: string): Promise<SearchResult[]> {
         // Query Khoj backend for search results
         let encodedQuery = encodeURIComponent(query);
-        let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&t=markdown`;
-        let response = await request(searchUrl);
-        let data = JSON.parse(response);
-        let results = data
+        let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}`;
+
+        // Get search results for markdown and pdf files
+        let mdResponse = await request(`${searchUrl}&t=markdown`);
+        let pdfResponse = await request(`${searchUrl}&t=pdf`);
+
+        // Parse search results
+        let mdData = JSON.parse(mdResponse)
             .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
-            .map((result: any) => { return { entry: result.entry, file: result.additional.file } as SearchResult; });
+            .map((result: any) => { return { entry: result.entry, score: result.score, file: result.additional.file }; });
+        let pdfData = JSON.parse(pdfResponse)
+            .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
+            .map((result: any) => { return { entry: `## ${result.additional.compiled}`, score: result.score, file: result.additional.file } as SearchResult; })
+
+        // Combine markdown and PDF results and sort them by score
+        let results = mdData.concat(pdfData)
+            .sort((a: any, b: any) => b.score - a.score)
+            .map((result: any) => { return { entry: result.entry, file: result.file } as SearchResult; })

         this.query = query;
         return results;
@@ -124,11 +136,12 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
     }

     async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) {
-        // Get all markdown files in vault
+        // Get all markdown and PDF files in vault
         const mdFiles = this.app.vault.getMarkdownFiles();
+        const pdfFiles = this.app.vault.getFiles().filter(file => file.extension === 'pdf');

         // Find the vault file matching file of chosen search result
-        let file_match = mdFiles
+        let file_match = mdFiles.concat(pdfFiles)
             // Sort by descending length of path
             // This finds longest path match when multiple files have same name
             .sort((a, b) => b.path.length - a.path.length)
@@ -138,7 +151,7 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {

         // Open vault file at heading of chosen search result
         if (file_match) {
-            let resultHeading = result.entry.split('\n', 1)[0];
+            let resultHeading = file_match.extension !== 'pdf' ? result.entry.split('\n', 1)[0] : '';
             let linkToEntry = `${file_match.path}${resultHeading}`
             this.app.workspace.openLinkText(linkToEntry, '');
             console.log(`Link: ${linkToEntry}, File: ${file_match.path}, Heading: ${resultHeading}`);
@@ -108,6 +108,7 @@ export class KhojSettingTab extends PluginSettingTab {
         this.plugin.registerInterval(progress_indicator);

         await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true`);
+        await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true`);
         new Notice('✅ Updated Khoj index.');

         // Reset button once index is updated
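Outside Obsidian, the same reindex can be triggered directly against the server. A small sketch, assuming a Khoj backend at `http://localhost:8000` (substitute your configured `khojUrl`):

```python
# Sketch: force a reindex of markdown and pdf content via the Khoj server API,
# mirroring the two request() calls the settings tab now makes.
import requests

khoj_url = "http://localhost:8000"  # assumption; use your configured Khoj URL
for content_type in ("markdown", "pdf"):
    response = requests.get(f"{khoj_url}/api/update", params={"t": content_type, "force": "true"})
    response.raise_for_status()
```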
@@ -12,6 +12,7 @@ export function getVaultAbsolutePath(vault: Vault): string {
 export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
     let vaultPath = getVaultAbsolutePath(vault);
     let mdInVault = `${vaultPath}/**/*.md`;
+    let pdfInVault = `${vaultPath}/**/*.pdf`;
     let khojConfigUrl = `${setting.khojUrl}/api/config/data`;

     // Check if khoj backend is configured, note if cannot connect to backend
@@ -32,7 +33,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
     let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
     // Get default config fields from khoj backend
     let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
-    let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
+    let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
+    let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
     let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
     let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"];
@@ -47,8 +49,14 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
             "markdown": {
                 "input-filter": [mdInVault],
                 "input-files": null,
-                "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
-                "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
+                "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
+                "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
+            },
+            "pdf": {
+                "input-filter": [pdfInVault],
+                "input-files": null,
+                "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
+                "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
             }
         }
     }
@@ -59,8 +67,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
         data["content-type"]["markdown"] = {
             "input-filter": [mdInVault],
             "input-files": null,
-            "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
-            "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
+            "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
         }
     }
     // Else if khoj is not configured to index markdown files in configured obsidian vault
@@ -68,12 +76,37 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
         data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
         // Update markdown config in khoj content-type config
         // Set markdown config to only index markdown files in configured obsidian vault
-        let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
+        let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
         data["content-type"]["markdown"] = {
             "input-filter": [mdInVault],
             "input-files": null,
-            "embeddings-file": `${khojIndexDirectory}/${indexName}.pt`,
-            "compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`,
+            "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
+        }
+    }
+
+    if (khoj_already_configured && !data["content-type"]["pdf"]) {
+        // Add pdf config to khoj content-type config
+        // Set pdf config to index pdf files in configured obsidian vault
+        data["content-type"]["pdf"] = {
+            "input-filter": [pdfInVault],
+            "input-files": null,
+            "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
+        }
+    }
+    // Else if khoj is not configured to index pdf files in configured obsidian vault
+    else if (khoj_already_configured &&
+        (data["content-type"]["pdf"]["input-filter"].length != 1 ||
+            data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
+        // Update pdf config in khoj content-type config
+        // Set pdf config to only index pdf files in configured obsidian vault
+        let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
+        data["content-type"]["pdf"] = {
+            "input-filter": [pdfInVault],
+            "input-files": null,
+            "embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
         }
     }
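For reference, the `pdf` section the plugin grafts into the backend config mirrors the `markdown` one. A hedged Python rendering of that merge logic follows; the server address, vault path, and index name are illustrative, and saving the patched config back is outside this hunk:

```python
# Sketch of the config merge configureKhojBackend() performs for pdf files.
import requests

khoj_url = "http://localhost:8000"   # assumption; use your configured Khoj URL
vault_path = "/home/user/vault"      # illustrative vault location
index_name = "home_user_vault"       # vault path with separators replaced by '_'

data = requests.get(f"{khoj_url}/api/config/data").json()
default = requests.get(f"{khoj_url}/api/config/data/default").json()
# Assumes the index directory is the parent of the default embeddings file
pdf_index_dir = default["content-type"]["pdf"]["embeddings-file"].rsplit("/", 1)[0]

if not data["content-type"].get("pdf"):
    data["content-type"]["pdf"] = {
        "input-filter": [f"{vault_path}/**/*.pdf"],
        "input-files": None,
        "embeddings-file": f"{pdf_index_dir}/{index_name}.pt",
        "compressed-jsonl": f"{pdf_index_dir}/{index_name}.jsonl.gz",
    }
```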
@@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             filters=[DateFilter(), WordFilter(), FileFilter()],
         )

+    # Initialize PDF Search
+    if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
+        logger.info("💸 Setting up search for pdf")
+        # Extract Entries, Generate PDF Embeddings
+        model.pdf_search = text_search.setup(
+            PdfToJsonl,
+            config.content_type.pdf,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[DateFilter(), WordFilter(), FileFilter()],
+        )
+
     # Initialize Image Search
     if (t == state.SearchType.Image or t == None) and config.content_type.image:
         logger.info("🌄 Setting up search for images")
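With PDF search wired into `configure_search`, the type is queryable through the same `/api/search` endpoint the other content types use (the Obsidian changes above hit it with `t=pdf`). A hedged sketch of such a query, with the server address assumed:

```python
# Sketch: query the new pdf search type over the Khoj server API, the same way
# the Obsidian plugin's `${searchUrl}&t=pdf` request does.
import requests

khoj_url = "http://localhost:8000"  # assumption; use your running server's URL
params = {"q": "what did the lease say about early termination", "n": 6, "t": "pdf"}
results = requests.get(f"{khoj_url}/api/search", params=params).json()
for result in results:
    # entry and additional.file are the fields the web and Obsidian UIs read
    print(result["additional"]["file"], "->", result["entry"][:80])
```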
@@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
             return "Beancount Files (*.bean *.beancount)"
         elif search_type == SearchType.Markdown:
             return "Markdown Files (*.md *.markdown)"
+        elif search_type == SearchType.Pdf:
+            return "Pdf Files (*.pdf)"
         elif search_type == SearchType.Music:
             return "Org-Music Files (*.org)"
         elif search_type == SearchType.Image:
@@ -44,6 +44,15 @@
         }).join("\n") + `</div>`;
     }

+    function render_pdf(query, data) {
+        return `<div id="results-pdf">` + data.map(function (item) {
+            let compiled_lines = item.additional.compiled.split("\n");
+            let filename = compiled_lines.shift();
+            let text_match = compiled_lines.join("\n")
+            return `<h2>${filename}</h2>\n<p>${text_match}</p>`
+        }).join("\n") + `</div>`;
+    }
+
     function render_json(data, query, type) {
         if (type === "markdown") {
             return render_markdown(query, data);
@@ -55,6 +64,8 @@
             return data.map(render_image).join('');
         } else if (type === "ledger") {
             return render_ledger(query, data);
+        } else if (type === "pdf") {
+            return render_pdf(query, data);
         } else {
             return `<div id="results-plugin">`
                 + data.map((item) => `<p>${item.entry}</p>`).join("\n")
@@ -279,6 +290,7 @@
         #json {
             white-space: pre-wrap;
         }
+        #results-pdf,
         #results-plugin,
         #results-ledger {
             text-align: left;
@@ -5,10 +5,10 @@ from datetime import datetime

 # Internal Packages
 from khoj.utils.constants import empty_escape_sequences
+from khoj.processor.conversation import prompts
 from khoj.processor.conversation.utils import (
     chat_completion_with_backoff,
     completion_with_backoff,
-    message_to_prompt,
     generate_chatml_messages_with_context,
 )
@@ -20,22 +20,14 @@ def answer(text, user_query, model, api_key=None, temperature=0.5, max_tokens=50
     """
     Answer user query using provided text as reference with OpenAI's GPT
     """
-    # Setup Prompt based on Summary Type
-    prompt = f"""
-You are a friendly, helpful personal assistant.
-Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
-
-Notes:
-{text}
-
-Question: {user_query}
-
-Answer (in second person):"""
+    # Setup Prompt from arguments
+    prompt = prompts.answer.format(text=text, user_query=user_query)
+
     # Get Response from GPT
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         stop='"""',
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract, Clean Message from GPT's Response
|
# Extract, Clean Message from GPT's Response
|
||||||
story = response["choices"][0]["text"]
|
return str(response).replace("\n\n", "")
|
||||||
return str(story).replace("\n\n", "")
|
|
||||||
|
|
||||||
|
|
||||||
def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
|
def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
|
||||||
|
@ -53,25 +44,15 @@ def summarize(text, summary_type, model, user_query=None, api_key=None, temperat
|
||||||
"""
|
"""
|
||||||
# Setup Prompt based on Summary Type
|
# Setup Prompt based on Summary Type
|
||||||
if summary_type == "chat":
|
if summary_type == "chat":
|
||||||
prompt = f"""
|
prompt = prompts.summarize_chat.format(text=text)
|
||||||
You are an AI. Summarize the conversation below from your perspective:
|
|
||||||
|
|
||||||
{text}
|
|
||||||
|
|
||||||
Summarize the conversation from the AI's first-person perspective:"""
|
|
||||||
elif summary_type == "notes":
|
elif summary_type == "notes":
|
||||||
prompt = f"""
|
prompt = prompts.summarize_notes.format(text=text, user_query=user_query)
|
||||||
Summarize the below notes about {user_query}:
|
|
||||||
|
|
||||||
{text}
|
|
||||||
|
|
||||||
Summarize the notes in second person perspective:"""
|
|
||||||
|
|
||||||
# Get Response from GPT
|
# Get Response from GPT
|
||||||
logger.debug(f"Prompt for GPT: {prompt}")
|
logger.debug(f"Prompt for GPT: {prompt}")
|
||||||
response = completion_with_backoff(
|
response = completion_with_backoff(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
model=model,
|
model_name=model,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
frequency_penalty=0.2,
|
frequency_penalty=0.2,
|
||||||
|
@ -80,8 +61,7 @@ Summarize the notes in second person perspective:"""
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract, Clean Message from GPT's Response
|
# Extract, Clean Message from GPT's Response
|
||||||
story = response["choices"][0]["text"]
|
return str(response).replace("\n\n", "")
|
||||||
return str(story).replace("\n\n", "")
|
|
||||||
|
|
||||||
|
|
||||||
def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
|
def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
|
||||||
|
@ -102,68 +82,21 @@ def extract_questions(text, model="text-davinci-003", conversation_log={}, api_k
|
||||||
current_new_year = today.replace(month=1, day=1)
|
current_new_year = today.replace(month=1, day=1)
|
||||||
last_new_year = current_new_year.replace(year=today.year - 1)
|
last_new_year = current_new_year.replace(year=today.year - 1)
|
||||||
|
|
||||||
prompt = f"""
|
prompt = prompts.extract_questions.format(
|
||||||
You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the users notes.
|
current_date=today.strftime("%A, %Y-%m-%d"),
|
||||||
- The user will provide their questions and answers to you for context.
|
last_new_year=last_new_year.strftime("%Y"),
|
||||||
- Add as much context from the previous questions and answers as required into your search queries.
|
last_new_year_date=last_new_year.strftime("%Y-%m-%d"),
|
||||||
- Break messages into multiple search queries when required to retrieve the relevant information.
|
current_new_year_date=current_new_year.strftime("%Y-%m-%d"),
|
||||||
- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
|
bob_tom_age_difference={current_new_year.year - 1984 - 30},
|
||||||
|
bob_age={current_new_year.year - 1984},
|
||||||
What searches, if any, will you need to perform to answer the users question?
|
chat_history=chat_history,
|
||||||
Provide search queries as a JSON list of strings
|
text=text,
|
||||||
Current Date: {today.strftime("%A, %Y-%m-%d")}
|
)
|
||||||
|
|
||||||
Q: How was my trip to Cambodia?
|
|
||||||
|
|
||||||
["How was my trip to Cambodia?"]
|
|
||||||
|
|
||||||
A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
|
|
||||||
|
|
||||||
Q: Who did i visit that temple with?
|
|
||||||
|
|
||||||
["Who did I visit the Angkor Wat Temple in Cambodia with?"]
|
|
||||||
|
|
||||||
A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
|
|
||||||
|
|
||||||
Q: What national parks did I go to last year?
|
|
||||||
|
|
||||||
["National park I visited in {last_new_year.strftime("%Y")} dt>=\\"{last_new_year.strftime("%Y-%m-%d")}\\" dt<\\"{current_new_year.strftime("%Y-%m-%d")}\\""]
|
|
||||||
|
|
||||||
A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year.strftime("%Y")}.
|
|
||||||
|
|
||||||
Q: How are you feeling today?
|
|
||||||
|
|
||||||
[]
|
|
||||||
|
|
||||||
A: I'm feeling a little bored. Helping you will hopefully make me feel better!
|
|
||||||
|
|
||||||
Q: How many tennis balls fit in the back of a 2002 Honda Civic?
|
|
||||||
|
|
||||||
["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
|
|
||||||
|
|
||||||
A: 1085 tennis balls will fit in the trunk of a Honda Civic
|
|
||||||
|
|
||||||
Q: Is Bob older than Tom?
|
|
||||||
|
|
||||||
["When was Bob born?", "What is Tom's age?"]
|
|
||||||
|
|
||||||
A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
|
|
||||||
|
|
||||||
Q: What is their age difference?
|
|
||||||
|
|
||||||
["What is Bob's age?", "What is Tom's age?"]
|
|
||||||
|
|
||||||
A: Bob is {current_new_year.year - 1984 - 30} years older than Tom. As Bob is {current_new_year.year - 1984} years old and Tom is 30 years old.
|
|
||||||
|
|
||||||
{chat_history}
|
|
||||||
Q: {text}
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Get Response from GPT
|
# Get Response from GPT
|
||||||
response = completion_with_backoff(
|
response = completion_with_backoff(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
model=model,
|
model_name=model,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
stop=["A: ", "\n"],
|
stop=["A: ", "\n"],
|
||||||
|
@@ -171,17 +104,16 @@ Q: {text}
     )

     # Extract, Clean Message from GPT's Response
-    response_text = response["choices"][0]["text"]
     try:
         questions = json.loads(
             # Clean response to increase likelihood of valid JSON. E.g replace ' with " to enclose strings
-            response_text.strip(empty_escape_sequences)
+            response.strip(empty_escape_sequences)
             .replace("['", '["')
             .replace("']", '"]')
             .replace("', '", '", "')
         )
     except json.decoder.JSONDecodeError:
-        logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response_text}")
+        logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response}")
         questions = [text]
     logger.debug(f"Extracted Questions by GPT: {questions}")
     return questions
@@ -191,31 +123,8 @@ def extract_search_type(text, model, api_key=None, temperature=0.5, max_tokens=1
     """
     Extract search type from user query using OpenAI's GPT
     """
-    # Initialize Variables
-    understand_primer = """
-Objective: Extract search type from user query and return information as JSON
-
-Allowed search types are listed below:
-- search-type=["notes","ledger","image","music"]
-
-Some examples are given below for reference:
-Q:What fiction book was I reading last week about AI starship?
-A:{ "search-type": "notes" }
-Q:Play some calm classical music?
-A:{ "search-type": "music" }
-Q:How much did I spend at Subway for dinner last time?
-A:{ "search-type": "ledger" }
-Q:What was that popular Sri lankan song that Alex had mentioned?
-A:{ "search-type": "music" }
-Q:Can you recommend a movie to watch from my notes?
-A:{ "search-type": "notes" }
-Q: When did I buy Groceries last?
-A:{ "search-type": "ledger" }
-Q:When did I go surfing last?
-A:{ "search-type": "notes" }"""
-
-    # Setup Prompt with Understand Primer
-    prompt = message_to_prompt(text, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:")
+    # Setup Prompt to extract search type
+    prompt = prompts.search_type + f"{text}\nA:"
     if verbose > 1:
         print(f"Message -> Prompt: {text} -> {prompt}")
@@ -223,7 +132,7 @@ A:{ "search-type": "notes" }"""
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         frequency_penalty=0.2,
@@ -232,8 +141,7 @@ A:{ "search-type": "notes" }"""
     )

     # Extract, Clean Message from GPT's Response
-    story = str(response["choices"][0]["text"])
-    return json.loads(story.strip(empty_escape_sequences))
+    return json.loads(response.strip(empty_escape_sequences))


 def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo", api_key=None, temperature=0.2):
@@ -241,36 +149,23 @@ def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo",
     Converse with user using OpenAI's ChatGPT
     """
     # Initialize Variables
+    current_date = datetime.now().strftime("%Y-%m-%d")
     compiled_references = "\n\n".join({f"# {item}" for item in references})

-    personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
-    conversation_primers = {
-        "general": f"""
-Using your general knowledge and our past conversations as context, answer the following question.
-Current Date: {datetime.now().strftime("%Y-%m-%d")}
-
-Question: {user_query}
-""".strip(),
-        "notes": f"""
-Using the notes and our past conversations as context, answer the following question.
-Current Date: {datetime.now().strftime("%Y-%m-%d")}
-
-Notes:
-{compiled_references}
-
-Question: {user_query}
-""".strip(),
-    }
-
     # Get Conversation Primer appropriate to Conversation Type
     conversation_type = "general" if user_query.startswith("@general") or compiled_references.strip() == "" else "notes"
     logger.debug(f"Conversation Type: {conversation_type}")
-    conversation_primer = conversation_primers.get(conversation_type)
+    if conversation_type == "general":
+        conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
+    else:
+        conversation_primer = prompts.notes_conversation.format(
+            current_date=current_date, query=user_query, references=compiled_references
+        )

     # Setup Prompt with Primer or Conversation History
     messages = generate_chatml_messages_with_context(
         conversation_primer,
-        personality_primer,
+        prompts.personality.format(),
         conversation_log,
         model,
     )
@@ -279,11 +174,10 @@ Question: {user_query}
     logger.debug(f"Conversation Context for GPT: {messages}")
     response = chat_completion_with_backoff(
         messages=messages,
-        model=model,
+        model_name=model,
         temperature=temperature,
-        api_key=api_key,
+        openai_api_key=api_key,
     )

     # Extract, Clean Message from GPT's Response
-    story = str(response["choices"][0]["message"]["content"])
-    return story.strip(empty_escape_sequences)
+    return response.strip(empty_escape_sequences)
src/khoj/processor/conversation/prompts.py (new file, 165 lines)
@@ -0,0 +1,165 @@
+# External Packages
+from langchain.prompts import PromptTemplate
+
+
+## Personality
+## --
+personality = PromptTemplate.from_template("You are Khoj, a friendly, smart and helpful personal assistant.")
+
+
+## General Conversation
+## --
+general_conversation = PromptTemplate.from_template(
+    """
+Using your general knowledge and our past conversations as context, answer the following question.
+Current Date: {current_date}
+
+Question: {query}
+""".strip()
+)
+
+
+## Notes Conversation
+## --
+notes_conversation = PromptTemplate.from_template(
+    """
+Using the notes and our past conversations as context, answer the following question.
+Current Date: {current_date}
+
+Notes:
+{references}
+
+Question: {query}
+""".strip()
+)
+
+
+## Summarize Chat
+## --
+summarize_chat = PromptTemplate.from_template(
+    """
+You are an AI. Summarize the conversation below from your perspective:
+
+{text}
+
+Summarize the conversation from the AI's first-person perspective:"""
+)
+
+
+## Summarize Notes
+## --
+summarize_notes = PromptTemplate.from_template(
+    """
+Summarize the below notes about {user_query}:
+
+{text}
+
+Summarize the notes in second person perspective:"""
+)
+
+
+## Answer
+## --
+answer = PromptTemplate.from_template(
+    """
+You are a friendly, helpful personal assistant.
+Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
+
+Notes:
+{text}
+
+Question: {user_query}
+
+Answer (in second person):"""
+)
+
+
+## Extract Questions
+## --
+extract_questions = PromptTemplate.from_template(
+    """
+You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the user's notes.
+- The user will provide their questions and answers to you for context.
+- Add as much context from the previous questions and answers as required into your search queries.
+- Break messages into multiple search queries when required to retrieve the relevant information.
+- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
+
+What searches, if any, will you need to perform to answer the users question?
+Provide search queries as a JSON list of strings
+Current Date: {current_date}
+
+Q: How was my trip to Cambodia?
+
+["How was my trip to Cambodia?"]
+
+A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
+
+Q: Who did i visit that temple with?
+
+["Who did I visit the Angkor Wat Temple in Cambodia with?"]
+
+A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
+
+Q: What national parks did I go to last year?
+
+["National park I visited in {last_new_year} dt>=\\"{last_new_year_date}\\" dt<\\"{current_new_year_date}\\""]
+
+A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year}.
+
+Q: How are you feeling today?
+
+[]
+
+A: I'm feeling a little bored. Helping you will hopefully make me feel better!
+
+Q: How many tennis balls fit in the back of a 2002 Honda Civic?
+
+["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
+
+A: 1085 tennis balls will fit in the trunk of a Honda Civic
+
+Q: Is Bob older than Tom?
+
+["When was Bob born?", "What is Tom's age?"]
+
+A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
+
+Q: What is their age difference?
+
+["What is Bob's age?", "What is Tom's age?"]
+
+A: Bob is {bob_tom_age_difference} years older than Tom. As Bob is {bob_age} years old and Tom is 30 years old.
+
+{chat_history}
+Q: {text}
+
+"""
+)
+
+
+## Extract Search Type
+## --
+search_type = """
+Objective: Extract search type from user query and return information as JSON
+
+Allowed search types are listed below:
+- search-type=["notes","ledger","image","music", "pdf"]
+
+Some examples are given below for reference:
+Q:What fiction book was I reading last week about AI starship?
+A:{ "search-type": "notes" }
+Q: What did the lease say about early termination
+A: { "search-type": "pdf" }
+Q:Play some calm classical music?
+A:{ "search-type": "music" }
+Q:How much did I spend at Subway for dinner last time?
+A:{ "search-type": "ledger" }
+Q:What was that popular Sri lankan song that Alex had mentioned?
+A:{ "search-type": "music" }
+Q:Can you recommend a movie to watch from my notes?
+A:{ "search-type": "notes" }
+Q:When did I buy Groceries last?
+A:{ "search-type": "ledger" }
+Q:When did I go surfing last?
+A:{ "search-type": "notes" }
+Q:"""
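Since the prompts are now LangChain `PromptTemplate` objects, call sites fill them with `.format()`, as the updated `gpt.py` above does. A small usage sketch (the note text and question are made up):

```python
# Sketch: fill a prompt template the way the refactored gpt.py call sites do.
from khoj.processor.conversation import prompts

prompt = prompts.answer.format(
    text="Lease ends 2023-12-31. Early termination requires 60 days notice.",
    user_query="What did the lease say about early termination?",
)
print(prompt)  # PromptTemplate.format() returns the filled template as a string
```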
@@ -4,6 +4,9 @@ import logging
 from datetime import datetime

 # External Packages
+from langchain.chat_models import ChatOpenAI
+from langchain.llms import OpenAI
+from langchain.schema import ChatMessage
 import openai
 import tiktoken
 from tenacity import (
@ -31,14 +34,17 @@ max_prompt_size = {"gpt-3.5-turbo": 4096, "gpt-4": 8192}
|
||||||
| retry_if_exception_type(openai.error.RateLimitError)
|
| retry_if_exception_type(openai.error.RateLimitError)
|
||||||
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
||||||
),
|
),
|
||||||
wait=wait_random_exponential(min=1, max=30),
|
wait=wait_random_exponential(min=1, max=10),
|
||||||
stop=stop_after_attempt(6),
|
stop=stop_after_attempt(3),
|
||||||
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
||||||
reraise=True,
|
reraise=True,
|
||||||
)
|
)
|
||||||
def completion_with_backoff(**kwargs):
|
def completion_with_backoff(**kwargs):
|
||||||
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
|
prompt = kwargs.pop("prompt")
|
||||||
return openai.Completion.create(**kwargs, request_timeout=60)
|
if "openai_api_key" not in kwargs:
|
||||||
|
kwargs["openai_api_key"] = os.getenv("OPENAI_API_KEY")
|
||||||
|
llm = OpenAI(**kwargs, request_timeout=10, max_retries=1)
|
||||||
|
return llm(prompt)
|
||||||
|
|
||||||
|
|
||||||
@retry(
|
@retry(
|
||||||
|
@ -50,13 +56,19 @@ def completion_with_backoff(**kwargs):
|
||||||
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
||||||
),
|
),
|
||||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||||
stop=stop_after_attempt(6),
|
stop=stop_after_attempt(3),
|
||||||
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
||||||
reraise=True,
|
reraise=True,
|
||||||
)
|
)
|
||||||
def chat_completion_with_backoff(**kwargs):
|
def chat_completion_with_backoff(messages, model_name, temperature, openai_api_key=None):
|
||||||
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
|
chat = ChatOpenAI(
|
||||||
return openai.ChatCompletion.create(**kwargs, request_timeout=60)
|
model_name=model_name,
|
||||||
|
temperature=temperature,
|
||||||
|
openai_api_key=openai_api_key or os.getenv("OPENAI_API_KEY"),
|
||||||
|
request_timeout=10,
|
||||||
|
max_retries=1,
|
||||||
|
)
|
||||||
|
return chat(messages).content
|
||||||
|
|
||||||
|
|
||||||
def generate_chatml_messages_with_context(
|
def generate_chatml_messages_with_context(
|
||||||
|
@ -64,7 +76,11 @@ def generate_chatml_messages_with_context(
|
||||||
):
|
):
|
||||||
"""Generate messages for ChatGPT with context from previous conversation"""
|
"""Generate messages for ChatGPT with context from previous conversation"""
|
||||||
# Extract Chat History for Context
|
# Extract Chat History for Context
|
||||||
chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
|
chat_logs = []
|
||||||
|
for chat in conversation_log.get("chat", []):
|
||||||
|
chat_notes = f'\n\n Notes:\n{chat.get("context")}' if chat.get("context") else "\n"
|
||||||
|
chat_logs += [chat["message"] + chat_notes]
|
||||||
|
|
||||||
rest_backnforths = []
|
rest_backnforths = []
|
||||||
# Extract in reverse chronological order
|
# Extract in reverse chronological order
|
||||||
for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
|
for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
|
||||||
|
@ -73,17 +89,26 @@ def generate_chatml_messages_with_context(
|
||||||
rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]
|
rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]
|
||||||
|
|
||||||
# Format user and system messages to chatml format
|
# Format user and system messages to chatml format
|
||||||
system_chatml_message = [message_to_chatml(system_message, "system")]
|
system_chatml_message = [ChatMessage(content=system_message, role="system")]
|
||||||
user_chatml_message = [message_to_chatml(user_message, "user")]
|
user_chatml_message = [ChatMessage(content=user_message, role="user")]
|
||||||
|
|
||||||
messages = user_chatml_message + rest_backnforths[:2] + system_chatml_message + rest_backnforths[2:]
|
messages = user_chatml_message + rest_backnforths + system_chatml_message
|
||||||
|
|
||||||
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
||||||
encoder = tiktoken.encoding_for_model(model_name)
|
encoder = tiktoken.encoding_for_model(model_name)
|
||||||
tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
|
tokens = sum([len(encoder.encode(content)) for message in messages for content in message.content])
|
||||||
while tokens > max_prompt_size[model_name]:
|
while tokens > max_prompt_size[model_name] and len(messages) > 1:
|
||||||
messages.pop()
|
messages.pop()
|
||||||
tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
|
tokens = sum([len(encoder.encode(content)) for message in messages for content in message.content])
|
||||||
|
|
||||||
|
# Truncate last message if still over max supported prompt size by model
|
||||||
|
if tokens > max_prompt_size[model_name]:
|
||||||
|
last_message = messages[-1]
|
||||||
|
truncated_message = encoder.decode(encoder.encode(last_message.content))
|
||||||
|
logger.debug(
|
||||||
|
f"Truncate last message to fit within max prompt size of {max_prompt_size[model_name]} supported by {model_name} model:\n {truncated_message}"
|
||||||
|
)
|
||||||
|
messages = [ChatMessage(content=[truncated_message], role=last_message.role)]
|
||||||
|
|
||||||
# Return message in chronological order
|
# Return message in chronological order
|
||||||
return messages[::-1]
|
return messages[::-1]
|
||||||
|
@ -91,12 +116,7 @@ def generate_chatml_messages_with_context(
|
||||||
|
|
||||||
def reciprocal_conversation_to_chatml(message_pair):
|
def reciprocal_conversation_to_chatml(message_pair):
|
||||||
"""Convert a single back and forth between user and assistant to chatml format"""
|
"""Convert a single back and forth between user and assistant to chatml format"""
|
||||||
return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]
|
return [ChatMessage(content=message, role=role) for message, role in zip(message_pair, ["user", "assistant"])]
|
||||||
|
|
||||||
|
|
||||||
def message_to_chatml(message, role="assistant"):
|
|
||||||
"""Create chatml message from message and role"""
|
|
||||||
return {"role": role, "content": message}
|
|
||||||
|
|
||||||
|
|
||||||
def message_to_prompt(
|
def message_to_prompt(
|
||||||
|
|
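Editor's note: a small sketch of calling the refactored helper with LangChain `ChatMessage` objects. The import path assumes the helper lives in `khoj.processor.conversation.utils` as in the Khoj source tree, and the model settings are illustrative.

# External Packages
from langchain.schema import ChatMessage

# Internal Packages (assumed module path for the helper shown above)
from khoj.processor.conversation.utils import chat_completion_with_backoff

messages = [
    ChatMessage(content="You are Khoj, a friendly personal assistant.", role="system"),
    ChatMessage(content="How was my trip to Cambodia?", role="user"),
]

# Returns the assistant's reply as a plain string via ChatOpenAI
reply = chat_completion_with_backoff(messages=messages, model_name="gpt-3.5-turbo", temperature=0)
print(reply)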
0  src/khoj/processor/pdf/__init__.py  Normal file

131  src/khoj/processor/pdf/pdf_to_jsonl.py  Normal file
@@ -0,0 +1,131 @@
# Standard Packages
import glob
import logging
from pathlib import Path
from typing import List

# External Packages
from langchain.document_loaders import PyPDFLoader

# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry


logger = logging.getLogger(__name__)


class PdfToJsonl(TextToJsonl):
    # Define Functions
    def process(self, previous_entries=None):
        # Extract required fields from config
        pdf_files, pdf_file_filter, output_file = (
            self.config.input_files,
            self.config.input_filter,
            self.config.compressed_jsonl,
        )

        # Input Validation
        if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
            print("At least one of pdf-files or pdf-file-filter is required to be specified")
            exit(1)

        # Get Pdf Files to Process
        pdf_files = PdfToJsonl.get_pdf_files(pdf_files, pdf_file_filter)

        # Extract Entries from specified Pdf files
        with timer("Parse entries from PDF files into dictionaries", logger):
            current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(pdf_files))

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = self.mark_entries_for_update(
                    current_entries, previous_entries, key="compiled", logger=logger
                )

        with timer("Write PDF entries to JSONL file", logger):
            # Process Each Entry from All Notes Files
            entries = list(map(lambda entry: entry[1], entries_with_ids))
            jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)

            # Compress JSONL formatted Data
            if output_file.suffix == ".gz":
                compress_jsonl_data(jsonl_data, output_file)
            elif output_file.suffix == ".jsonl":
                dump_jsonl(jsonl_data, output_file)

        return entries_with_ids

    @staticmethod
    def get_pdf_files(pdf_files=None, pdf_file_filters=None):
        "Get PDF files to process"
        absolute_pdf_files, filtered_pdf_files = set(), set()
        if pdf_files:
            absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
        if pdf_file_filters:
            filtered_pdf_files = {
                filtered_file
                for pdf_file_filter in pdf_file_filters
                for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            }

        all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

        files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}

        if any(files_with_non_pdf_extensions):
            logger.warn(f"[Warning] There may be non-pdf files in the input set: {files_with_non_pdf_extensions}")

        logger.debug(f"Processing files: {all_pdf_files}")

        return all_pdf_files

    @staticmethod
    def extract_pdf_entries(pdf_files):
        """Extract entries by page from specified PDF files"""

        entries = []
        entry_to_location_map = []
        for pdf_file in pdf_files:
            loader = PyPDFLoader(pdf_file)
            pdf_entries_per_file = [page.page_content for page in loader.load()]
            entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
            entries.extend(pdf_entries_per_file)

        return entries, dict(entry_to_location_map)

    @staticmethod
    def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
        "Convert each PDF entry into a dictionary"
        entries = []
        for parsed_entry in parsed_entries:
            entry_filename = Path(entry_to_file_map[parsed_entry])
            # Append base filename to compiled entry for context to model
            heading = f"{entry_filename.stem}\n"
            compiled_entry = f"{heading}{parsed_entry}"
            entries.append(
                Entry(
                    compiled=compiled_entry,
                    raw=parsed_entry,
                    heading=heading,
                    file=f"{entry_filename}",
                )
            )

        logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")

        return entries

    @staticmethod
    def convert_pdf_maps_to_jsonl(entries: List[Entry]):
        "Convert each PDF entry to JSON and collate as JSONL"
        return "".join([f"{entry.to_json()}\n" for entry in entries])
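Editor's note: a rough sketch of driving this processor directly. The `TextContentConfig` field names mirror the new `pdf` defaults added to the config further below; the import path for `TextContentConfig` and the concrete paths are assumptions.

# Standard Packages
from pathlib import Path

# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.utils.rawconfig import TextContentConfig

# Placeholder config: index all PDFs under ~/Documents into compressed JSONL
pdf_config = TextContentConfig(
    input_files=None,
    input_filter=["~/Documents/**/*.pdf"],
    compressed_jsonl=Path("~/.khoj/content/pdf/pdf.jsonl.gz").expanduser(),
    embeddings_file=Path("~/.khoj/content/pdf/pdf_embeddings.pt").expanduser(),
)

# Parse, split and serialize the PDF entries; returns entries tagged with ids
entries_with_ids = PdfToJsonl(pdf_config).process()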
@@ -109,6 +109,17 @@ def search(
         with timer("Collating results took", logger):
             results = text_search.collate_results(hits, entries, results_count)

+    elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
+        # query pdf files
+        with timer("Query took", logger):
+            hits, entries = text_search.query(
+                user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+            )
+
+        # collate and return results
+        with timer("Collating results took", logger):
+            results = text_search.collate_results(hits, entries, results_count)
+
     elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
         # query transactions
         with timer("Query took", logger):
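Editor's note: once the server has indexed some PDFs, the new search type can be exercised over the HTTP API. A hedged sketch using `requests`; the host and port are assumptions for a default local setup.

# External Packages
import requests

# Query the Khoj server's search API for the new pdf content type
response = requests.get(
    "http://localhost:8000/api/search",
    params={"q": "what did the lease say about early termination", "t": "pdf"},
)
print(response.json())  # list of collated search results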
@@ -22,6 +22,7 @@ class SearchType(str, Enum):
     Music = "music"
     Markdown = "markdown"
     Image = "image"
+    Pdf = "pdf"


 class ProcessorType(str, Enum):
@@ -61,6 +62,7 @@ class SearchModels:
     ledger_search: TextSearchModel = None
     music_search: TextSearchModel = None
     markdown_search: TextSearchModel = None
+    pdf_search: TextSearchModel = None
     image_search: ImageSearchModel = None
     plugin_search: Dict[str, TextSearchModel] = None
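Editor's note: because `SearchType` derives from `str`, the `t` query parameter value parses directly into the enum. A small self-contained sketch (a trimmed copy of the enum, not its full definition):

# Standard Packages
from enum import Enum


class SearchType(str, Enum):
    Music = "music"
    Markdown = "markdown"
    Image = "image"
    Pdf = "pdf"


# The "t" query parameter parses straight into the enum...
assert SearchType("pdf") is SearchType.Pdf
# ...and compares equal to its plain string value, since SearchType subclasses str
assert SearchType.Pdf == "pdf"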
@@ -28,6 +28,12 @@ default_config = {
         "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
         "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
     },
+    "pdf": {
+        "input-files": None,
+        "input-filter": None,
+        "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
+        "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
+    },
     "image": {
         "input-directories": None,
         "input-filter": None,
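Editor's note: as a sketch, enabling the new source in a user config amounts to overriding these keys. The dict below mirrors the `default_config` shape above; the `~/Documents` glob is a placeholder assumption.

# Placeholder override of the new pdf content defaults (same keys as default_config)
pdf_content_config = {
    "input-files": None,
    "input-filter": ["~/Documents/**/*.pdf"],
    "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
    "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
}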
@@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
     image: Optional[ImageContentConfig]
     music: Optional[TextContentConfig]
     markdown: Optional[TextContentConfig]
+    pdf: Optional[TextContentConfig]
     plugins: Optional[Dict[str, TextContentConfig]]

BIN  tests/data/pdf/multipage.pdf  Normal file  (Binary file not shown.)
BIN  tests/data/pdf/singlepage.pdf  Normal file  (Binary file not shown.)
@@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_search_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/search?q=random&t={content_type}")
         # Assert
@@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_update_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/update?t={content_type}")
         # Assert
@@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_regenerate_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/update?force=true&t={content_type}")
         # Assert
74  tests/test_pdf_to_jsonl.py  Normal file
@@ -0,0 +1,74 @@
# Standard Packages
import json

# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl


def test_single_page_pdf_to_jsonl():
    "Convert single page PDF file to jsonl."
    # Act
    # Extract Entries from specified Pdf files
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"])

    # Process Each Entry from All Pdf Files
    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1


def test_multi_page_pdf_to_jsonl():
    "Convert multiple pages from single PDF file to jsonl."
    # Act
    # Extract Entries from specified Pdf files
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])

    # Process Each Entry from All Pdf Files
    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 6


def test_get_pdf_files(tmp_path):
    "Ensure Pdf files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
    group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
    group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
    group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
    # Include via input-file field
    file1 = create_file(tmp_path, filename="document.pdf")
    # Not included by any filter
    create_file(tmp_path, filename="not-included-document.pdf")
    create_file(tmp_path, filename="not-included-text.txt")

    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / "document.pdf"]
    input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]

    # Act
    extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter)

    # Assert
    assert len(extracted_pdf_files) == 5
    assert extracted_pdf_files == expected_files


# Helper Functions
def create_file(tmp_path, entry=None, filename="document.pdf"):
    pdf_file = tmp_path / filename
    pdf_file.touch()
    if entry:
        pdf_file.write_text(entry)
    return pdf_file