Search PDF files with Khoj. Integrate with LangChain

- **Integrate LangChain into Khoj**: Call GPT via LangChain for Khoj Chat
- **Search (and Chat about) PDF files with Khoj**
  - Create PDF to JSONL Processor: Convert PDF content into standardized JSONL format
  - Expose PDF search type via Khoj server API
  - Enable querying PDF files via Obsidian, Emacs and Web interfaces
Commit e022910f31 by Debanjum, committed via GitHub on 2023-06-02 10:20:26 +05:30
24 changed files with 608 additions and 200 deletions
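Before the per-file diffs, a minimal sketch of driving the new PDF indexing over the server API — the same call the Obsidian plugin's settings pane makes after this commit (see the settings.ts diff below). This is an illustration only, assuming a Khoj server on the default http://localhost:8000:

```python
# Sketch: force a rebuild of the new PDF index over HTTP.
import requests

khoj_url = "http://localhost:8000"  # assumed default Khoj server address
response = requests.get(f"{khoj_url}/api/update", params={"t": "pdf", "force": "true"})
response.raise_for_status()
```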

View file

@ -63,7 +63,7 @@
- **General**
- **Natural**: Advanced natural language understanding using Transformer based ML Models
- **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions and Photos
- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos
- **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)
## Demos
@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
- Install Khoj via `pip` and start Khoj backend in non-gui mode
- Install Khoj plugin via Community Plugins settings pane on Obsidian app
- Check the new Khoj plugin settings
- Let Khoj backend index the markdown files in the current Vault
- Let Khoj backend index the markdown and PDF files in the current Vault
- Open Khoj plugin on Obsidian via Search button on Left Pane
- Search \"*Announce plugin to folks*\" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
- Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj
##### 2. Configure
- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes and beancount directories
- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories
- **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml)
##### 3. Run

View file

@ -21,6 +21,7 @@ services:
- ./tests/data/ledger/:/data/ledger/
- ./tests/data/music/:/data/music/
- ./tests/data/markdown/:/data/markdown/
- ./tests/data/pdf/:/data/pdf/
# Embeddings and models are populated after the first run
# You can set these volumes to point to empty directories on host
- ./tests/data/embeddings/:/data/embeddings/

View file

@ -21,6 +21,7 @@ keywords = [
"markdown",
"beancount",
"images",
"pdf",
]
classifiers = [
"Development Status :: 4 - Beta",
@ -44,7 +45,7 @@ dependencies = [
"tiktoken >= 0.3.0",
"tenacity >= 8.2.2",
"pillow == 9.3.0",
"pydantic == 1.9.1",
"pydantic >= 1.9.1",
"pyqt6 == 6.3.1",
"pyyaml == 6.0",
"rich >= 13.3.1",
@ -53,6 +54,8 @@ dependencies = [
"torch == 1.13.1",
"uvicorn == 0.17.6",
"aiohttp == 3.8.4",
"langchain >= 0.0.187",
"pypdf >= 3.9.0",
]
dynamic = ["version"]

View file

@ -4,7 +4,7 @@
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Description: A search assistant for your second brain
;; Keywords: search, chat, org-mode, outlines, markdown, beancount, image
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, beancount, image
;; Version: 0.6.2
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs
@ -29,8 +29,8 @@
;;; Commentary:
;; Create a search assistant for your `org-mode', `markdown' notes,
;; `beancount' transactions and images. This package exposes two
;; assistance modes, search and chat:
;; `beancount' transactions, PDFs and images. This package exposes
;; two assistance modes, search and chat:
;;
;; Chat provides faster answers, iterative discovery and assisted
;; creativity. It requires your OpenAI API key to access GPT models
@ -95,6 +95,7 @@
(const "markdown")
(const "ledger")
(const "image")
(const "pdf")
(const "music")))
@ -140,6 +141,8 @@ NO-PAGING FILTER))
"C-x l | ledger\n")
(when (member 'image enabled-content-types)
"C-x i | image\n")
(when (member 'pdf enabled-content-types)
"C-x p | pdf\n")
(when (member 'music enabled-content-types)
"C-x M | music\n"))))
@ -150,6 +153,7 @@ NO-PAGING FILTER))
(defun khoj--search-ledger () "Set content-type to `ledger'." (interactive) (setq khoj--content-type "ledger"))
(defun khoj--search-images () "Set content-type to image." (interactive) (setq khoj--content-type "image"))
(defun khoj--search-music () "Set content-type to music." (interactive) (setq khoj--content-type "music"))
(defun khoj--search-pdf () "Set content-type to pdf." (interactive) (setq khoj--content-type "pdf"))
(defun khoj--improve-rank () "Use cross-encoder to rerank search results." (interactive) (khoj--incremental-search t))
(defun khoj--make-search-keymap (&optional existing-keymap)
"Setup keymap to configure Khoj search. Build of EXISTING-KEYMAP when passed."
@ -164,6 +168,8 @@ NO-PAGING FILTER))
(define-key kmap (kbd "C-x l") #'khoj--search-ledger))
(when (member 'image enabled-content-types)
(define-key kmap (kbd "C-x i") #'khoj--search-images))
(when (member 'pdf enabled-content-types)
(define-key kmap (kbd "C-x p") #'khoj--search-pdf))
(when (member 'music enabled-content-types)
(define-key kmap (kbd "C-x M") #'khoj--search-music))
kmap))
@ -544,6 +550,22 @@ CONFIG is json obtained from Khoj config API."
;; remove trailing (, ) or SPC from extracted entries string
(replace-regexp-in-string "[\(\) ]$" "")))
(defun khoj--extract-entries-as-pdf (json-response query)
"Convert QUERY, JSON-RESPONSE from API with PDF results to `org-mode' entries."
(thread-last
json-response
;; Extract and render each pdf entry from response
(mapcar (lambda (json-response-item)
(thread-last
;; Extract pdf entry from each item in json response
(cdr (assoc 'compiled (assoc 'additional json-response-item)))
;; Format pdf entry as an org entry string
(format "** %s\n\n"))))
;; Render entries into org formatted string with query set as top level heading
(format "* %s\n%s\n" query)
;; remove leading (, ) or SPC from extracted entries string
(replace-regexp-in-string "^[\(\) ]" "")))
(defun khoj--extract-entries-as-images (json-response query)
"Convert JSON-RESPONSE, QUERY from API to html with images."
(let ((image-results-buffer-html-format-str "<html>\n<body>\n<h1>%s</h1>%s\n\n</body>\n</html>")
@ -592,6 +614,7 @@ CONFIG is json obtained from Khoj config API."
((and (member 'music enabled-content-types) (equal buffer-name "Music.org")) "music")
((and (member 'ledger enabled-content-types) (or (equal file-extension "bean") (equal file-extension "beancount"))) "ledger")
((and (member 'org enabled-content-types) (equal file-extension "org")) "org")
((and (member 'org enabled-content-types) (equal file-extension "pdf")) "pdf")
((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown")
(t khoj-default-content-type))))
@ -647,10 +670,13 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
(insert
(cond ((or (equal content-type "org") (equal content-type "music")) (khoj--extract-entries-as-org json-response query))
((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query))
((equal content-type "pdf") (khoj--extract-entries-as-pdf json-response query))
((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query))
((equal content-type "image") (khoj--extract-entries-as-images json-response query))
(t (khoj--extract-entries json-response query))))
(cond ((equal content-type "org") (progn (visual-line-mode)
(cond ((or (equal content-type "pdf")
(equal content-type "org"))
(progn (visual-line-mode)
(org-mode)
(setq-local
org-startup-folded "showall"
@ -973,7 +999,7 @@ Paragraph only starts at first text after blank line."
;; set content type to: last used > based on current buffer > default type
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
;; dynamically set choices to content types enabled on khoj backend
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "ledger" "music" "image")))
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "pdf" "ledger" "music" "image")))
(transient-define-suffix khoj--search-command (&optional args)
(interactive (list (transient-args transient-current-command)))

View file

@ -42,7 +42,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
1. Install Khoj via `pip` and start Khoj backend in non-gui mode
2. Install Khoj plugin via Community Plugins settings pane on Obsidian app
3. Check the new Khoj plugin settings
4. Wait for Khoj backend to index markdown files in the current Vault
4. Wait for Khoj backend to index markdown and PDF files in the current Vault
5. Open Khoj plugin on Obsidian via Search button on Left Pane
6. Search \"*Announce plugin to folks*\" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
7. Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@ -151,7 +151,7 @@ The plugin implements the following functionality to search your notes with Khoj
- [X] Open the Khoj search modal via left ribbon icon or the *Khoj: Search* command
- [X] Render results as Markdown preview to improve readability
- [X] Configure Khoj via the plugin setting tab on the settings page
- Set Obsidian Vault to Index with Khoj. Defaults to all markdown files in current Vault
- Set Obsidian Vault to Index with Khoj. Defaults to all markdown and PDF files in current Vault
- Set URL of Khoj backend
- Set Number of Search Results to show in Search Modal
- [X] Allow reranking of result to improve search quality

View file

@ -89,12 +89,24 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
async getSuggestions(query: string): Promise<SearchResult[]> {
// Query Khoj backend for search results
let encodedQuery = encodeURIComponent(query);
let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&t=markdown`;
let response = await request(searchUrl);
let data = JSON.parse(response);
let results = data
let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}`;
// Get search results for markdown and pdf files
let mdResponse = await request(`${searchUrl}&t=markdown`);
let pdfResponse = await request(`${searchUrl}&t=pdf`);
// Parse search results
let mdData = JSON.parse(mdResponse)
.filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
.map((result: any) => { return { entry: result.entry, file: result.additional.file } as SearchResult; });
.map((result: any) => { return { entry: result.entry, score: result.score, file: result.additional.file }; });
let pdfData = JSON.parse(pdfResponse)
.filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
.map((result: any) => { return { entry: `## ${result.additional.compiled}`, score: result.score, file: result.additional.file } as SearchResult; })
// Combine markdown and PDF results and sort them by score
let results = mdData.concat(pdfData)
.sort((a: any, b: any) => b.score - a.score)
.map((result: any) => { return { entry: result.entry, file: result.file } as SearchResult; })
this.query = query;
return results;
@ -124,11 +136,12 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
}
async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) {
// Get all markdown files in vault
// Get all markdown and PDF files in vault
const mdFiles = this.app.vault.getMarkdownFiles();
const pdfFiles = this.app.vault.getFiles().filter(file => file.extension === 'pdf');
// Find the vault file matching file of chosen search result
let file_match = mdFiles
let file_match = mdFiles.concat(pdfFiles)
// Sort by descending length of path
// This finds longest path match when multiple files have same name
.sort((a, b) => b.path.length - a.path.length)
@ -138,7 +151,7 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
// Open vault file at heading of chosen search result
if (file_match) {
let resultHeading = result.entry.split('\n', 1)[0];
let resultHeading = file_match.extension !== 'pdf' ? result.entry.split('\n', 1)[0] : '';
let linkToEntry = `${file_match.path}${resultHeading}`
this.app.workspace.openLinkText(linkToEntry, '');
console.log(`Link: ${linkToEntry}, File: ${file_match.path}, Heading: ${resultHeading}`);

View file

@ -108,6 +108,7 @@ export class KhojSettingTab extends PluginSettingTab {
this.plugin.registerInterval(progress_indicator);
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true`);
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true`);
new Notice('✅ Updated Khoj index.');
// Reset button once index is updated

View file

@ -12,6 +12,7 @@ export function getVaultAbsolutePath(vault: Vault): string {
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
let vaultPath = getVaultAbsolutePath(vault);
let mdInVault = `${vaultPath}/**/*.md`;
let pdfInVault = `${vaultPath}/**/*.pdf`;
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
// Check if khoj backend is configured, note if cannot connect to backend
@ -32,7 +33,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
// Get default config fields from khoj backend
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"];
@ -47,8 +49,14 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
"markdown": {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
},
"pdf": {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
}
}
@ -59,8 +67,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
// Else if khoj is not configured to index markdown files in configured obsidian vault
@ -68,12 +76,37 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
// Update markdown config in khoj content-type config
// Set markdown config to only index markdown files in configured obsidian vault
let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`,
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
if (khoj_already_configured && !data["content-type"]["pdf"]) {
// Add pdf config to khoj content-type config
// Set pdf config to index pdf files in configured obsidian vault
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
}
// Else if khoj is not configured to index pdf files in configured obsidian vault
else if (khoj_already_configured &&
(data["content-type"]["pdf"]["input-filter"].length != 1 ||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
// Update pdf config in khoj content-type config
// Set pdf config to only index pdf files in configured obsidian vault
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
}
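For readers following along outside Obsidian, a minimal sketch of the same config handshake from Python. The route and key path are taken from the plugin code above; the server address is assumed to be the default http://localhost:8000:

```python
# Sketch: read the backend's default config to find where the pdf index
# will live, mirroring the plugin's request to `${khojConfigUrl}/default`.
import requests

default_config = requests.get("http://localhost:8000/api/config/data/default").json()
pdf_index_target = default_config["content-type"]["pdf"]["embeddings-file"]
print(pdf_index_target)  # e.g. ~/.khoj/content/pdf/pdf_embeddings.pt
```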

View file

@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils import constants, state
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize PDF Search
if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
logger.info("💸 Setting up search for pdf")
# Extract Entries, Generate PDF Embeddings
model.pdf_search = text_search.setup(
PdfToJsonl,
config.content_type.pdf,
search_config=config.search_type.asymmetric,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize Image Search
if (t == state.SearchType.Image or t == None) and config.content_type.image:
logger.info("🌄 Setting up search for images")

View file

@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
return "Beancount Files (*.bean *.beancount)"
elif search_type == SearchType.Markdown:
return "Markdown Files (*.md *.markdown)"
elif search_type == SearchType.Pdf:
return "Pdf Files (*.pdf)"
elif search_type == SearchType.Music:
return "Org-Music Files (*.org)"
elif search_type == SearchType.Image:

View file

@ -44,6 +44,15 @@
}).join("\n") + `</div>`;
}
function render_pdf(query, data) {
return `<div id="results-pdf">` + data.map(function (item) {
let compiled_lines = item.additional.compiled.split("\n");
let filename = compiled_lines.shift();
let text_match = compiled_lines.join("\n")
return `<h2>${filename}</h2>\n<p>${text_match}</p>`
}).join("\n") + `</div>`;
}
function render_json(data, query, type) {
if (type === "markdown") {
return render_markdown(query, data);
@ -55,6 +64,8 @@
return data.map(render_image).join('');
} else if (type === "ledger") {
return render_ledger(query, data);
} else if (type === "pdf") {
return render_pdf(query, data);
} else {
return `<div id="results-plugin">`
+ data.map((item) => `<p>${item.entry}</p>`).join("\n")
@ -279,6 +290,7 @@
#json {
white-space: pre-wrap;
}
#results-pdf,
#results-plugin,
#results-ledger {
text-align: left;

View file

@ -5,10 +5,10 @@ from datetime import datetime
# Internal Packages
from khoj.utils.constants import empty_escape_sequences
from khoj.processor.conversation import prompts
from khoj.processor.conversation.utils import (
chat_completion_with_backoff,
completion_with_backoff,
message_to_prompt,
generate_chatml_messages_with_context,
)
@ -20,22 +20,14 @@ def answer(text, user_query, model, api_key=None, temperature=0.5, max_tokens=50
"""
Answer user query using provided text as reference with OpenAI's GPT
"""
# Setup Prompt based on Summary Type
prompt = f"""
You are a friendly, helpful personal assistant.
Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
# Setup Prompt from arguments
prompt = prompts.answer.format(text=text, user_query=user_query)
Notes:
{text}
Question: {user_query}
Answer (in second person):"""
# Get Response from GPT
logger.debug(f"Prompt for GPT: {prompt}")
response = completion_with_backoff(
prompt=prompt,
model=model,
model_name=model,
temperature=temperature,
max_tokens=max_tokens,
stop='"""',
@ -43,8 +35,7 @@ Answer (in second person):"""
)
# Extract, Clean Message from GPT's Response
story = response["choices"][0]["text"]
return str(story).replace("\n\n", "")
return str(response).replace("\n\n", "")
def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
@ -53,25 +44,15 @@ def summarize(text, summary_type, model, user_query=None, api_key=None, temperat
"""
# Setup Prompt based on Summary Type
if summary_type == "chat":
prompt = f"""
You are an AI. Summarize the conversation below from your perspective:
{text}
Summarize the conversation from the AI's first-person perspective:"""
prompt = prompts.summarize_chat.format(text=text)
elif summary_type == "notes":
prompt = f"""
Summarize the below notes about {user_query}:
{text}
Summarize the notes in second person perspective:"""
prompt = prompts.summarize_notes.format(text=text, user_query=user_query)
# Get Response from GPT
logger.debug(f"Prompt for GPT: {prompt}")
response = completion_with_backoff(
prompt=prompt,
model=model,
model_name=model,
temperature=temperature,
max_tokens=max_tokens,
frequency_penalty=0.2,
@ -80,8 +61,7 @@ Summarize the notes in second person perspective:"""
)
# Extract, Clean Message from GPT's Response
story = response["choices"][0]["text"]
return str(story).replace("\n\n", "")
return str(response).replace("\n\n", "")
def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
@ -102,68 +82,21 @@ def extract_questions(text, model="text-davinci-003", conversation_log={}, api_k
current_new_year = today.replace(month=1, day=1)
last_new_year = current_new_year.replace(year=today.year - 1)
prompt = f"""
You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the users notes.
- The user will provide their questions and answers to you for context.
- Add as much context from the previous questions and answers as required into your search queries.
- Break messages into multiple search queries when required to retrieve the relevant information.
- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
What searches, if any, will you need to perform to answer the users question?
Provide search queries as a JSON list of strings
Current Date: {today.strftime("%A, %Y-%m-%d")}
Q: How was my trip to Cambodia?
["How was my trip to Cambodia?"]
A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
Q: Who did i visit that temple with?
["Who did I visit the Angkor Wat Temple in Cambodia with?"]
A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
Q: What national parks did I go to last year?
["National park I visited in {last_new_year.strftime("%Y")} dt>=\\"{last_new_year.strftime("%Y-%m-%d")}\\" dt<\\"{current_new_year.strftime("%Y-%m-%d")}\\""]
A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year.strftime("%Y")}.
Q: How are you feeling today?
[]
A: I'm feeling a little bored. Helping you will hopefully make me feel better!
Q: How many tennis balls fit in the back of a 2002 Honda Civic?
["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
A: 1085 tennis balls will fit in the trunk of a Honda Civic
Q: Is Bob older than Tom?
["When was Bob born?", "What is Tom's age?"]
A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
Q: What is their age difference?
["What is Bob's age?", "What is Tom's age?"]
A: Bob is {current_new_year.year - 1984 - 30} years older than Tom. As Bob is {current_new_year.year - 1984} years old and Tom is 30 years old.
{chat_history}
Q: {text}
"""
prompt = prompts.extract_questions.format(
current_date=today.strftime("%A, %Y-%m-%d"),
last_new_year=last_new_year.strftime("%Y"),
last_new_year_date=last_new_year.strftime("%Y-%m-%d"),
current_new_year_date=current_new_year.strftime("%Y-%m-%d"),
bob_tom_age_difference=current_new_year.year - 1984 - 30,
bob_age={current_new_year.year - 1984},
chat_history=chat_history,
text=text,
)
# Get Response from GPT
response = completion_with_backoff(
prompt=prompt,
model=model,
model_name=model,
temperature=temperature,
max_tokens=max_tokens,
stop=["A: ", "\n"],
@ -171,17 +104,16 @@ Q: {text}
)
# Extract, Clean Message from GPT's Response
response_text = response["choices"][0]["text"]
try:
questions = json.loads(
# Clean response to increase likelihood of valid JSON. E.g replace ' with " to enclose strings
response_text.strip(empty_escape_sequences)
response.strip(empty_escape_sequences)
.replace("['", '["')
.replace("']", '"]')
.replace("', '", '", "')
)
except json.decoder.JSONDecodeError:
logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response_text}")
logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response}")
questions = [text]
logger.debug(f"Extracted Questions by GPT: {questions}")
return questions
@ -191,31 +123,8 @@ def extract_search_type(text, model, api_key=None, temperature=0.5, max_tokens=1
"""
Extract search type from user query using OpenAI's GPT
"""
# Initialize Variables
understand_primer = """
Objective: Extract search type from user query and return information as JSON
Allowed search types are listed below:
- search-type=["notes","ledger","image","music"]
Some examples are given below for reference:
Q:What fiction book was I reading last week about AI starship?
A:{ "search-type": "notes" }
Q:Play some calm classical music?
A:{ "search-type": "music" }
Q:How much did I spend at Subway for dinner last time?
A:{ "search-type": "ledger" }
Q:What was that popular Sri lankan song that Alex had mentioned?
A:{ "search-type": "music" }
Q:Can you recommend a movie to watch from my notes?
A:{ "search-type": "notes" }
Q: When did I buy Groceries last?
A:{ "search-type": "ledger" }
Q:When did I go surfing last?
A:{ "search-type": "notes" }"""
# Setup Prompt with Understand Primer
prompt = message_to_prompt(text, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:")
# Setup Prompt to extract search type
prompt = prompts.search_type + f"{text}\nA:"
if verbose > 1:
print(f"Message -> Prompt: {text} -> {prompt}")
@ -223,7 +132,7 @@ A:{ "search-type": "notes" }"""
logger.debug(f"Prompt for GPT: {prompt}")
response = completion_with_backoff(
prompt=prompt,
model=model,
model_name=model,
temperature=temperature,
max_tokens=max_tokens,
frequency_penalty=0.2,
@ -232,8 +141,7 @@ A:{ "search-type": "notes" }"""
)
# Extract, Clean Message from GPT's Response
story = str(response["choices"][0]["text"])
return json.loads(story.strip(empty_escape_sequences))
return json.loads(response.strip(empty_escape_sequences))
def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo", api_key=None, temperature=0.2):
@ -241,36 +149,23 @@ def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo",
Converse with user using OpenAI's ChatGPT
"""
# Initialize Variables
current_date = datetime.now().strftime("%Y-%m-%d")
compiled_references = "\n\n".join({f"# {item}" for item in references})
personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
conversation_primers = {
"general": f"""
Using your general knowledge and our past conversations as context, answer the following question.
Current Date: {datetime.now().strftime("%Y-%m-%d")}
Question: {user_query}
""".strip(),
"notes": f"""
Using the notes and our past conversations as context, answer the following question.
Current Date: {datetime.now().strftime("%Y-%m-%d")}
Notes:
{compiled_references}
Question: {user_query}
""".strip(),
}
# Get Conversation Primer appropriate to Conversation Type
conversation_type = "general" if user_query.startswith("@general") or compiled_references.strip() == "" else "notes"
logger.debug(f"Conversation Type: {conversation_type}")
conversation_primer = conversation_primers.get(conversation_type)
if conversation_type == "general":
conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
else:
conversation_primer = prompts.notes_conversation.format(
current_date=current_date, query=user_query, references=compiled_references
)
# Setup Prompt with Primer or Conversation History
messages = generate_chatml_messages_with_context(
conversation_primer,
personality_primer,
prompts.personality.format(),
conversation_log,
model,
)
@ -279,11 +174,10 @@ Question: {user_query}
logger.debug(f"Conversation Context for GPT: {messages}")
response = chat_completion_with_backoff(
messages=messages,
model=model,
model_name=model,
temperature=temperature,
api_key=api_key,
openai_api_key=api_key,
)
# Extract, Clean Message from GPT's Response
story = str(response["choices"][0]["message"]["content"])
return story.strip(empty_escape_sequences)
return response.strip(empty_escape_sequences)
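The refactored `converse` keeps its original signature, so a minimal usage sketch looks like this. The note text and API key are placeholders:

```python
# Sketch: answer a question over retrieved note references with converse().
from khoj.processor.conversation.gpt import converse

references = ["Lease ends on 2023-12-31. Early termination needs 60 days notice."]
reply = converse(
    references,
    "When does my lease end?",
    model="gpt-3.5-turbo",
    api_key="sk-...",  # placeholder; set your OpenAI API key
)
print(reply)
```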

View file

@ -0,0 +1,165 @@
# External Packages
from langchain.prompts import PromptTemplate
## Personality
## --
personality = PromptTemplate.from_template("You are Khoj, a friendly, smart and helpful personal assistant.")
## General Conversation
## --
general_conversation = PromptTemplate.from_template(
"""
Using your general knowledge and our past conversations as context, answer the following question.
Current Date: {current_date}
Question: {query}
""".strip()
)
## Notes Conversation
## --
notes_conversation = PromptTemplate.from_template(
"""
Using the notes and our past conversations as context, answer the following question.
Current Date: {current_date}
Notes:
{references}
Question: {query}
""".strip()
)
## Summarize Chat
## --
summarize_chat = PromptTemplate.from_template(
"""
You are an AI. Summarize the conversation below from your perspective:
{text}
Summarize the conversation from the AI's first-person perspective:"""
)
## Summarize Notes
## --
summarize_notes = PromptTemplate.from_template(
"""
Summarize the below notes about {user_query}:
{text}
Summarize the notes in second person perspective:"""
)
## Answer
## --
answer = PromptTemplate.from_template(
"""
You are a friendly, helpful personal assistant.
Using the user's notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
Notes:
{text}
Question: {user_query}
Answer (in second person):"""
)
## Extract Questions
## --
extract_questions = PromptTemplate.from_template(
"""
You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the user's notes.
- The user will provide their questions and answers to you for context.
- Add as much context from the previous questions and answers as required into your search queries.
- Break messages into multiple search queries when required to retrieve the relevant information.
- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
What searches, if any, will you need to perform to answer the user's question?
Provide search queries as a JSON list of strings
Current Date: {current_date}
Q: How was my trip to Cambodia?
["How was my trip to Cambodia?"]
A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
Q: Who did i visit that temple with?
["Who did I visit the Angkor Wat Temple in Cambodia with?"]
A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
Q: What national parks did I go to last year?
["National park I visited in {last_new_year} dt>=\\"{last_new_year_date}\\" dt<\\"{current_new_year_date}\\""]
A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year}.
Q: How are you feeling today?
[]
A: I'm feeling a little bored. Helping you will hopefully make me feel better!
Q: How many tennis balls fit in the back of a 2002 Honda Civic?
["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
A: 1085 tennis balls will fit in the trunk of a Honda Civic
Q: Is Bob older than Tom?
["When was Bob born?", "What is Tom's age?"]
A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
Q: What is their age difference?
["What is Bob's age?", "What is Tom's age?"]
A: Bob is {bob_tom_age_difference} years older than Tom. As Bob is {bob_age} years old and Tom is 30 years old.
{chat_history}
Q: {text}
"""
)
## Extract Search Type
## --
search_type = """
Objective: Extract search type from user query and return information as JSON
Allowed search types are listed below:
- search-type=["notes","ledger","image","music", "pdf"]
Some examples are given below for reference:
Q:What fiction book was I reading last week about AI starship?
A:{ "search-type": "notes" }
Q: What did the lease say about early termination
A: { "search-type": "pdf" }
Q:Play some calm classical music?
A:{ "search-type": "music" }
Q:How much did I spend at Subway for dinner last time?
A:{ "search-type": "ledger" }
Q:What was that popular Sri lankan song that Alex had mentioned?
A:{ "search-type": "music" }
Q:Can you recommend a movie to watch from my notes?
A:{ "search-type": "notes" }
Q:When did I buy Groceries last?
A:{ "search-type": "ledger" }
Q:When did I go surfing last?
A:{ "search-type": "notes" }
Q:"""

View file

@ -4,6 +4,9 @@ import logging
from datetime import datetime
# External Packages
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.schema import ChatMessage
import openai
import tiktoken
from tenacity import (
@ -31,14 +34,17 @@ max_prompt_size = {"gpt-3.5-turbo": 4096, "gpt-4": 8192}
| retry_if_exception_type(openai.error.RateLimitError)
| retry_if_exception_type(openai.error.ServiceUnavailableError)
),
wait=wait_random_exponential(min=1, max=30),
stop=stop_after_attempt(6),
wait=wait_random_exponential(min=1, max=10),
stop=stop_after_attempt(3),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
def completion_with_backoff(**kwargs):
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
return openai.Completion.create(**kwargs, request_timeout=60)
prompt = kwargs.pop("prompt")
if "openai_api_key" not in kwargs:
kwargs["openai_api_key"] = os.getenv("OPENAI_API_KEY")
llm = OpenAI(**kwargs, request_timeout=10, max_retries=1)
return llm(prompt)
@retry(
@ -50,13 +56,19 @@ def completion_with_backoff(**kwargs):
| retry_if_exception_type(openai.error.ServiceUnavailableError)
),
wait=wait_exponential(multiplier=1, min=4, max=10),
stop=stop_after_attempt(6),
stop=stop_after_attempt(3),
before_sleep=before_sleep_log(logger, logging.DEBUG),
reraise=True,
)
def chat_completion_with_backoff(**kwargs):
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
return openai.ChatCompletion.create(**kwargs, request_timeout=60)
def chat_completion_with_backoff(messages, model_name, temperature, openai_api_key=None):
chat = ChatOpenAI(
model_name=model_name,
temperature=temperature,
openai_api_key=openai_api_key or os.getenv("OPENAI_API_KEY"),
request_timeout=10,
max_retries=1,
)
return chat(messages).content
def generate_chatml_messages_with_context(
@ -64,7 +76,11 @@ def generate_chatml_messages_with_context(
):
"""Generate messages for ChatGPT with context from previous conversation"""
# Extract Chat History for Context
chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
chat_logs = []
for chat in conversation_log.get("chat", []):
chat_notes = f'\n\n Notes:\n{chat.get("context")}' if chat.get("context") else "\n"
chat_logs += [chat["message"] + chat_notes]
rest_backnforths = []
# Extract in reverse chronological order
for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
@ -73,17 +89,26 @@ def generate_chatml_messages_with_context(
rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]
# Format user and system messages to chatml format
system_chatml_message = [message_to_chatml(system_message, "system")]
user_chatml_message = [message_to_chatml(user_message, "user")]
system_chatml_message = [ChatMessage(content=system_message, role="system")]
user_chatml_message = [ChatMessage(content=user_message, role="user")]
messages = user_chatml_message + rest_backnforths[:2] + system_chatml_message + rest_backnforths[2:]
messages = user_chatml_message + rest_backnforths + system_chatml_message
# Truncate oldest messages from conversation history until under max supported prompt size by model
encoder = tiktoken.encoding_for_model(model_name)
tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
while tokens > max_prompt_size[model_name]:
tokens = sum([len(encoder.encode(message.content)) for message in messages])
while tokens > max_prompt_size[model_name] and len(messages) > 1:
messages.pop()
tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
tokens = sum([len(encoder.encode(message.content)) for message in messages])
# Truncate last message if still over max supported prompt size by model
if tokens > max_prompt_size[model_name]:
last_message = messages[-1]
truncated_message = encoder.decode(encoder.encode(last_message.content)[: max_prompt_size[model_name]])
logger.debug(
f"Truncate last message to fit within max prompt size of {max_prompt_size[model_name]} supported by {model_name} model:\n {truncated_message}"
)
messages = [ChatMessage(content=truncated_message, role=last_message.role)]
# Return message in chronological order
return messages[::-1]
@ -91,12 +116,7 @@ def generate_chatml_messages_with_context(
def reciprocal_conversation_to_chatml(message_pair):
"""Convert a single back and forth between user and assistant to chatml format"""
return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]
def message_to_chatml(message, role="assistant"):
"""Create chatml message from message and role"""
return {"role": role, "content": message}
return [ChatMessage(content=message, role=role) for message, role in zip(message_pair, ["user", "assistant"])]
def message_to_prompt(
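The switch from raw `openai` calls to LangChain shows up most clearly in `chat_completion_with_backoff`, which now takes LangChain `ChatMessage` objects. A minimal sketch; the API key is a placeholder:

```python
# Sketch: call the LangChain-backed chat helper directly.
from langchain.schema import ChatMessage
from khoj.processor.conversation.utils import chat_completion_with_backoff

messages = [
    ChatMessage(role="system", content="You are Khoj, a friendly, smart and helpful personal assistant."),
    ChatMessage(role="user", content="When does my lease end?"),
]
reply = chat_completion_with_backoff(
    messages=messages,
    model_name="gpt-3.5-turbo",
    temperature=0.2,
    openai_api_key="sk-...",  # placeholder; set your OpenAI API key
)
print(reply)
```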

View file

@ -0,0 +1,131 @@
# Standard Packages
import glob
import logging
from pathlib import Path
from typing import List
# External Packages
from langchain.document_loaders import PyPDFLoader
# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry
logger = logging.getLogger(__name__)
class PdfToJsonl(TextToJsonl):
# Define Functions
def process(self, previous_entries=None):
# Extract required fields from config
pdf_files, pdf_file_filter, output_file = (
self.config.input_files,
self.config.input_filter,
self.config.compressed_jsonl,
)
# Input Validation
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
print("At least one of pdf-files or pdf-file-filter is required to be specified")
exit(1)
# Get Pdf Files to Process
pdf_files = PdfToJsonl.get_pdf_files(pdf_files, pdf_file_filter)
# Extract Entries from specified Pdf files
with timer("Parse entries from PDF files into dictionaries", logger):
current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(pdf_files))
# Split entries by max tokens supported by model
with timer("Split entries by max token size supported by model", logger):
current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
# Identify, mark and merge any new entries with previous entries
with timer("Identify new or updated entries", logger):
if not previous_entries:
entries_with_ids = list(enumerate(current_entries))
else:
entries_with_ids = self.mark_entries_for_update(
current_entries, previous_entries, key="compiled", logger=logger
)
with timer("Write PDF entries to JSONL file", logger):
# Process Each Entry from All Notes Files
entries = list(map(lambda entry: entry[1], entries_with_ids))
jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)
# Compress JSONL formatted Data
if output_file.suffix == ".gz":
compress_jsonl_data(jsonl_data, output_file)
elif output_file.suffix == ".jsonl":
dump_jsonl(jsonl_data, output_file)
return entries_with_ids
@staticmethod
def get_pdf_files(pdf_files=None, pdf_file_filters=None):
"Get PDF files to process"
absolute_pdf_files, filtered_pdf_files = set(), set()
if pdf_files:
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
if pdf_file_filters:
filtered_pdf_files = {
filtered_file
for pdf_file_filter in pdf_file_filters
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
}
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}
if any(files_with_non_pdf_extensions):
logger.warn(f"[Warning] There maybe non pdf-mode files in the input set: {files_with_non_pdf_extensions}")
logger.debug(f"Processing files: {all_pdf_files}")
return all_pdf_files
@staticmethod
def extract_pdf_entries(pdf_files):
"""Extract entries by page from specified PDF files"""
entries = []
entry_to_location_map = []
for pdf_file in pdf_files:
loader = PyPDFLoader(pdf_file)
pdf_entries_per_file = [page.page_content for page in loader.load()]
entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
entries.extend(pdf_entries_per_file)
return entries, dict(entry_to_location_map)
@staticmethod
def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
"Convert each PDF entries into a dictionary"
entries = []
for parsed_entry in parsed_entries:
entry_filename = Path(entry_to_file_map[parsed_entry])
# Append base filename to compiled entry for context to model
heading = f"{entry_filename.stem}\n"
compiled_entry = f"{heading}{parsed_entry}"
entries.append(
Entry(
compiled=compiled_entry,
raw=parsed_entry,
heading=heading,
file=f"{entry_filename}",
)
)
logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")
return entries
@staticmethod
def convert_pdf_maps_to_jsonl(entries: List[Entry]):
"Convert each PDF entry to JSON and collate as JSONL"
return "".join([f"{entry.to_json()}\n" for entry in entries])

View file

@ -109,6 +109,17 @@ def search(
with timer("Collating results took", logger):
results = text_search.collate_results(hits, entries, results_count)
elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
# query pdf files
with timer("Query took", logger):
hits, entries = text_search.query(
user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
)
# collate and return results
with timer("Collating results took", logger):
results = text_search.collate_results(hits, entries, results_count)
elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
# query transactions
with timer("Query took", logger):

View file

@ -22,6 +22,7 @@ class SearchType(str, Enum):
Music = "music"
Markdown = "markdown"
Image = "image"
Pdf = "pdf"
class ProcessorType(str, Enum):
@ -61,6 +62,7 @@ class SearchModels:
ledger_search: TextSearchModel = None
music_search: TextSearchModel = None
markdown_search: TextSearchModel = None
pdf_search: TextSearchModel = None
image_search: ImageSearchModel = None
plugin_search: Dict[str, TextSearchModel] = None

View file

@ -28,6 +28,12 @@ default_config = {
"compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
"embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
},
"pdf": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
},
"image": {
"input-directories": None,
"input-filter": None,

View file

@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
image: Optional[ImageContentConfig]
music: Optional[TextContentConfig]
markdown: Optional[TextContentConfig]
pdf: Optional[TextContentConfig]
plugins: Optional[Dict[str, TextContentConfig]]

Binary file not shown.

Binary file not shown.

View file

@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):
# ----------------------------------------------------------------------------------------------------
def test_search_with_valid_content_type(client):
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
# Act
response = client.get(f"/api/search?q=random&t={content_type}")
# Assert
@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):
# ----------------------------------------------------------------------------------------------------
def test_update_with_valid_content_type(client):
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
# Act
response = client.get(f"/api/update?t={content_type}")
# Assert
@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):
# ----------------------------------------------------------------------------------------------------
def test_regenerate_with_valid_content_type(client):
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
# Act
response = client.get(f"/api/update?force=true&t={content_type}")
# Assert

View file

@ -0,0 +1,74 @@
# Standard Packages
import json
# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
def test_single_page_pdf_to_jsonl():
"Convert single page PDF file to jsonl."
# Act
# Extract Entries from specified Pdf files
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"])
# Process Each Entry from All Pdf Files
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
assert len(jsonl_data) == 1
def test_multi_page_pdf_to_jsonl():
"Convert multiple pages from single PDF file to jsonl."
# Act
# Extract Entries from specified Pdf files
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])
# Process Each Entry from All Pdf Files
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert
assert len(jsonl_data) == 6
def test_get_pdf_files(tmp_path):
"Ensure Pdf files specified via input-filter, input-files extracted"
# Arrange
# Include via input-filter globs
group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
# Include via input-file field
file1 = create_file(tmp_path, filename="document.pdf")
# Not included by any filter
create_file(tmp_path, filename="not-included-document.pdf")
create_file(tmp_path, filename="not-included-text.txt")
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
# Setup input-files, input-filters
input_files = [tmp_path / "document.pdf"]
input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]
# Act
extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter)
# Assert
assert len(extracted_pdf_files) == 5
assert extracted_pdf_files == expected_files
# Helper Functions
def create_file(tmp_path, entry=None, filename="document.pdf"):
pdf_file = tmp_path / filename
pdf_file.touch()
if entry:
pdf_file.write_text(entry)
return pdf_file