diff --git a/README.md b/README.md index 3d2df029..a21ff201 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ - **General** - **Natural**: Advanced natural language understanding using Transformer based ML Models - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models - - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions and Photos + - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/) ## Demos @@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0 - Install Khoj via `pip` and start Khoj backend in non-gui mode - Install Khoj plugin via Community Plugins settings pane on Obsidian app - Check the new Khoj plugin settings -- Let Khoj backend index the markdown files in the current Vault +- Let Khoj backend index the markdown and PDF files in the current Vault - Open Khoj plugin on Obsidian via Search button on Left Pane - Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/) - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin) @@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj ##### 2. Configure -- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes and beancount directories +- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, PDF and beancount directories - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml) ##### 3.
Run diff --git a/docker-compose.yml b/docker-compose.yml index 5cd1763c..0529b150 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -21,6 +21,7 @@ services: - ./tests/data/ledger/:/data/ledger/ - ./tests/data/music/:/data/music/ - ./tests/data/markdown/:/data/markdown/ + - ./tests/data/pdf/:/data/pdf/ # Embeddings and models are populated after the first run # You can set these volumes to point to empty directories on host - ./tests/data/embeddings/:/data/embeddings/ diff --git a/pyproject.toml b/pyproject.toml index 0406467e..1750268d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ keywords = [ "markdown", "beancount", "images", + "pdf", ] classifiers = [ "Development Status :: 4 - Beta", @@ -44,7 +45,7 @@ dependencies = [ "tiktoken >= 0.3.0", "tenacity >= 8.2.2", "pillow == 9.3.0", - "pydantic == 1.9.1", + "pydantic >= 1.9.1", "pyqt6 == 6.3.1", "pyyaml == 6.0", "rich >= 13.3.1", @@ -53,6 +54,8 @@ dependencies = [ "torch == 1.13.1", "uvicorn == 0.17.6", "aiohttp == 3.8.4", + "langchain >= 0.0.187", + "pypdf >= 3.9.0", ] dynamic = ["version"] diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index e553cd47..b83433d4 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -4,7 +4,7 @@ ;; Author: Debanjum Singh Solanky ;; Description: A search assistant for your second brain -;; Keywords: search, chat, org-mode, outlines, markdown, beancount, image +;; Keywords: search, chat, org-mode, outlines, markdown, pdf, beancount, image ;; Version: 0.6.2 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1")) ;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs @@ -29,8 +29,8 @@ ;;; Commentary: ;; Create a search assistant for your `org-mode', `markdown' notes, -;; `beancount' transactions and images. This package exposes two -;; assistance modes, search and chat: +;; `beancount' transactions, PDFs and images. This package exposes +;; two assistance modes, search and chat: ;; ;; Chat provides faster answers, iterative discovery and assisted ;; creativity. It requires your OpenAI API key to access GPT models @@ -95,6 +95,7 @@ (const "markdown") (const "ledger") (const "image") + (const "pdf") (const "music"))) @@ -140,6 +141,8 @@ NO-PAGING FILTER)) "C-x l | ledger\n") (when (member 'image enabled-content-types) "C-x i | image\n") + (when (member 'pdf enabled-content-types) + "C-x p | pdf\n") (when (member 'music enabled-content-types) "C-x M | music\n")))) @@ -150,6 +153,7 @@ NO-PAGING FILTER)) (defun khoj--search-ledger () "Set content-type to `ledger'." (interactive) (setq khoj--content-type "ledger")) (defun khoj--search-images () "Set content-type to image." (interactive) (setq khoj--content-type "image")) (defun khoj--search-music () "Set content-type to music." (interactive) (setq khoj--content-type "music")) +(defun khoj--search-pdf () "Set content-type to pdf." (interactive) (setq khoj--content-type "pdf")) (defun khoj--improve-rank () "Use cross-encoder to rerank search results." (interactive) (khoj--incremental-search t)) (defun khoj--make-search-keymap (&optional existing-keymap) "Setup keymap to configure Khoj search. Build of EXISTING-KEYMAP when passed." 
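For reference, the new `pdf` content type wired into the Emacs client here (and the Obsidian client further below) is served by the same backend `/api/search` endpoint. A minimal sketch of that request from plain Python follows — not part of this patch; the localhost URL assumes a default local Khoj server and the query text is illustrative:

```python
# Minimal sketch (not part of this patch): fetch PDF search results from a
# locally running Khoj backend. Host, port and query text are illustrative.
import json
import urllib.parse
import urllib.request

khoj_url = "http://localhost:8000"  # assumption: default local Khoj server
params = urllib.parse.urlencode({"q": "lease early termination", "n": 5, "t": "pdf"})
with urllib.request.urlopen(f"{khoj_url}/api/search?{params}") as response:
    results = json.loads(response.read())

for result in results:
    # Each result carries `entry`, `score` and `additional.file` / `additional.compiled`,
    # the same fields the Obsidian search modal consumes in this patch.
    print(result["score"], result["additional"]["file"])
```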
@@ -164,6 +168,8 @@ NO-PAGING FILTER)) (define-key kmap (kbd "C-x l") #'khoj--search-ledger)) (when (member 'image enabled-content-types) (define-key kmap (kbd "C-x i") #'khoj--search-images)) + (when (member 'pdf enabled-content-types) + (define-key kmap (kbd "C-x p") #'khoj--search-pdf)) (when (member 'music enabled-content-types) (define-key kmap (kbd "C-x M") #'khoj--search-music)) kmap)) @@ -544,6 +550,22 @@ CONFIG is json obtained from Khoj config API." ;; remove trailing (, ) or SPC from extracted entries string (replace-regexp-in-string "[\(\) ]$" ""))) +(defun khoj--extract-entries-as-pdf (json-response query) + "Convert JSON-RESPONSE, QUERY from API with PDF results to `org-mode' entries." + (thread-last + json-response + ;; Extract and render each pdf entry from response + (mapcar (lambda (json-response-item) + (thread-last + ;; Extract pdf entry from each item in json response + (cdr (assoc 'compiled (assoc 'additional json-response-item))) + ;; Format pdf entry as an org entry string + (format "** %s\n\n")))) + ;; Render entries into org formatted string with query set as top level heading + (format "* %s\n%s\n" query) + ;; remove leading (, ) or SPC from extracted entries string + (replace-regexp-in-string "^[\(\) ]" ""))) + (defun khoj--extract-entries-as-images (json-response query) "Convert JSON-RESPONSE, QUERY from API to html with images." (let ((image-results-buffer-html-format-str "<html>\n<body>\n<h1>%s</h1>\n%s\n</body>\n</html>") @@ -592,6 +614,7 @@ CONFIG is json obtained from Khoj config API." ((and (member 'music enabled-content-types) (equal buffer-name "Music.org")) "music") ((and (member 'ledger enabled-content-types) (or (equal file-extension "bean") (equal file-extension "beancount"))) "ledger") ((and (member 'org enabled-content-types) (equal file-extension "org")) "org") + ((and (member 'pdf enabled-content-types) (equal file-extension "pdf")) "pdf") ((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown") (t khoj-default-content-type)))) @@ -647,16 +670,19 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE." (insert (cond ((or (equal content-type "org") (equal content-type "music")) (khoj--extract-entries-as-org json-response query)) ((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query)) + ((equal content-type "pdf") (khoj--extract-entries-as-pdf json-response query)) ((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query)) ((equal content-type "image") (khoj--extract-entries-as-images json-response query)) (t (khoj--extract-entries json-response query)))) - (cond ((equal content-type "org") (progn (visual-line-mode) - (org-mode) - (setq-local - org-startup-folded "showall" - org-hide-leading-stars t - org-startup-with-inline-images t) - (org-set-startup-visibility))) + (cond ((or (equal content-type "pdf") + (equal content-type "org")) + (progn (visual-line-mode) + (org-mode) + (setq-local + org-startup-folded "showall" + org-hide-leading-stars t + org-startup-with-inline-images t) + (org-set-startup-visibility))) ((equal content-type "markdown") (progn (markdown-mode) (visual-line-mode))) ((equal content-type "ledger") (beancount-mode)) @@ -973,7 +999,7 @@ Paragraph only starts at first text after blank line." ;; set content type to: last used > based on current buffer > default type :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name)))))) ;; dynamically set choices to content types enabled on khoj backend - :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "ledger" "music" "image"))) + :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "pdf" "ledger" "music" "image"))) (transient-define-suffix khoj--search-command (&optional args) (interactive (list (transient-args transient-current-command))) diff --git a/src/interface/obsidian/README.md b/src/interface/obsidian/README.md index 2bf24228..142b5041 100644 --- a/src/interface/obsidian/README.md +++ b/src/interface/obsidian/README.md @@ -42,7 +42,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0 1. Install Khoj via `pip` and start Khoj backend in non-gui mode 2. Install Khoj plugin via Community Plugins settings pane on Obsidian app 3. Check the new Khoj plugin settings -4. Wait for Khoj backend to index markdown files in the current Vault +4. Wait for Khoj backend to index markdown and PDF files in the current Vault 5. Open Khoj plugin on Obsidian via Search button on Left Pane 6. Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/) 7.
Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin) @@ -151,7 +151,7 @@ The plugin implements the following functionality to search your notes with Khoj - [X] Open the Khoj search modal via left ribbon icon or the *Khoj: Search* command - [X] Render results as Markdown preview to improve readability - [X] Configure Khoj via the plugin setting tab on the settings page - - Set Obsidian Vault to Index with Khoj. Defaults to all markdown files in current Vault + - Set Obsidian Vault to Index with Khoj. Defaults to all markdown and PDF files in the current Vault - Set URL of Khoj backend - Set Number of Search Results to show in Search Modal - [X] Allow reranking of results to improve search quality diff --git a/src/interface/obsidian/src/search_modal.ts b/src/interface/obsidian/src/search_modal.ts index 5f88ff9a..9848334d 100644 --- a/src/interface/obsidian/src/search_modal.ts +++ b/src/interface/obsidian/src/search_modal.ts @@ -89,12 +89,24 @@ export class KhojSearchModal extends SuggestModal<SearchResult> { async getSuggestions(query: string): Promise<SearchResult[]> { // Query Khoj backend for search results let encodedQuery = encodeURIComponent(query); - let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&t=markdown`; - let response = await request(searchUrl); - let data = JSON.parse(response); - let results = data + let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}`; + + // Get search results for markdown and pdf files + let mdResponse = await request(`${searchUrl}&t=markdown`); + let pdfResponse = await request(`${searchUrl}&t=pdf`); + + // Parse search results + let mdData = JSON.parse(mdResponse) .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path)) - .map((result: any) => { return { entry: result.entry, file: result.additional.file } as SearchResult; }); + .map((result: any) => { return { entry: result.entry, score: result.score, file: result.additional.file }; }); + let pdfData = JSON.parse(pdfResponse) + .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path)) + .map((result: any) => { return { entry: `## ${result.additional.compiled}`, score: result.score, file: result.additional.file } as SearchResult; }) + + // Combine markdown and PDF results and sort them by score + let results = mdData.concat(pdfData) + .sort((a: any, b: any) => b.score - a.score) + .map((result: any) => { return { entry: result.entry, file: result.file } as SearchResult; }) this.query = query; return results; @@ -124,11 +136,12 @@ } async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) { - // Get all markdown files in vault + // Get all markdown and PDF files in vault const mdFiles = this.app.vault.getMarkdownFiles(); + const pdfFiles = this.app.vault.getFiles().filter(file => file.extension === 'pdf'); // Find the vault file matching file of chosen search result - let file_match = mdFiles + let file_match = mdFiles.concat(pdfFiles) // Sort by descending length of path // This finds longest path match when multiple files have same name .sort((a, b) => b.path.length - a.path.length) @@ -138,7 +151,7 @@ // Open vault file at heading of chosen search result if (file_match) { - let resultHeading =
result.entry.split('\n', 1)[0]; + let resultHeading = file_match.extension !== 'pdf' ? result.entry.split('\n', 1)[0] : ''; let linkToEntry = `${file_match.path}${resultHeading}` this.app.workspace.openLinkText(linkToEntry, ''); console.log(`Link: ${linkToEntry}, File: ${file_match.path}, Heading: ${resultHeading}`); diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index 2cdc79a5..b2809cb0 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -108,6 +108,7 @@ export class KhojSettingTab extends PluginSettingTab { this.plugin.registerInterval(progress_indicator); await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true`); + await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true`); new Notice('✅ Updated Khoj index.'); // Reset button once index is updated diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 05fd1139..5e176883 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -12,6 +12,7 @@ export function getVaultAbsolutePath(vault: Vault): string { export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { let vaultPath = getVaultAbsolutePath(vault); let mdInVault = `${vaultPath}/**/*.md`; + let pdfInVault = `${vaultPath}/**/*.pdf`; let khojConfigUrl = `${setting.khojUrl}/api/config/data`; // Check if khoj backend is configured, note if cannot connect to backend @@ -32,7 +33,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_'); // Get default config fields from khoj backend let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); - let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]); + let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]); + let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]); let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"]; @@ -47,8 +49,14 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n "markdown": { "input-filter": [mdInVault], "input-files": null, - "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`, + "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, + }, + "pdf": { + "input-filter": [pdfInVault], + "input-files": null, + "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, } } } @@ -59,8 +67,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n data["content-type"]["markdown"] = { "input-filter": [mdInVault], "input-files": null, - "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`, + "embeddings-file": 
`${khojDefaultMdIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, } } // Else if khoj is not configured to index markdown files in configured obsidian vault @@ -68,12 +76,37 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) { // Update markdown config in khoj content-type config // Set markdown config to only index markdown files in configured obsidian vault - let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]); + let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]); data["content-type"]["markdown"] = { "input-filter": [mdInVault], "input-files": null, - "embeddings-file": `${khojIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`, + "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`, + } + } + + if (khoj_already_configured && !data["content-type"]["pdf"]) { + // Add pdf config to khoj content-type config + // Set pdf config to index pdf files in configured obsidian vault + data["content-type"]["pdf"] = { + "input-filter": [pdfInVault], + "input-files": null, + "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, + } + } + // Else if khoj is not configured to index pdf files in configured obsidian vault + else if (khoj_already_configured && + (data["content-type"]["pdf"]["input-filter"].length != 1 || + data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) { + // Update pdf config in khoj content-type config + // Set pdf config to only index pdf files in configured obsidian vault + let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]); + data["content-type"]["pdf"] = { + "input-filter": [pdfInVault], + "input-files": null, + "embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`, } } diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 448c8bde..ae49678b 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl +from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl from khoj.search_type import image_search, text_search from khoj.utils import constants, state from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel @@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, filters=[DateFilter(), WordFilter(), FileFilter()], ) + # Initialize PDF Search + if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf: + logger.info("💸 Setting up search for pdf") + # Extract Entries, Generate PDF Embeddings + model.pdf_search = text_search.setup( + PdfToJsonl, + config.content_type.pdf, + search_config=config.search_type.asymmetric, + regenerate=regenerate, + filters=[DateFilter(), WordFilter(), FileFilter()], + ) + # Initialize Image Search if (t == 
state.SearchType.Image or t == None) and config.content_type.image: logger.info("🌄 Setting up search for images") diff --git a/src/khoj/interface/desktop/file_browser.py b/src/khoj/interface/desktop/file_browser.py index 4ce9725e..d7071664 100644 --- a/src/khoj/interface/desktop/file_browser.py +++ b/src/khoj/interface/desktop/file_browser.py @@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget): return "Beancount Files (*.bean *.beancount)" elif search_type == SearchType.Markdown: return "Markdown Files (*.md *.markdown)" + elif search_type == SearchType.Pdf: + return "Pdf Files (*.pdf)" elif search_type == SearchType.Music: return "Org-Music Files (*.org)" elif search_type == SearchType.Image: diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 42831f5d..bb3725e9 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -44,6 +44,15 @@ }).join("\n") + `</div>`; } + function render_pdf(query, data) { + return `<div id="results-pdf">` + data.map(function (item) { + let compiled_lines = item.additional.compiled.split("\n"); + let filename = compiled_lines.shift(); + let text_match = compiled_lines.join("\n") + return `<h2>${filename}</h2>\n<p>${text_match}</p>` + }).join("\n") + `</div>`; + } + function render_json(data, query, type) { if (type === "markdown") { return render_markdown(query, data); @@ -55,6 +64,8 @@ return data.map(render_image).join(''); } else if (type === "ledger") { return render_ledger(query, data); + } else if (type === "pdf") { + return render_pdf(query, data); } else { return `<div id="results-plugin">`
+ data.map((item) => `<p>${item.entry}</p>
`).join("\n") @@ -279,6 +290,7 @@ #json { white-space: pre-wrap; } + #results-pdf, #results-plugin, #results-ledger { text-align: left; diff --git a/src/khoj/processor/conversation/gpt.py b/src/khoj/processor/conversation/gpt.py index c0ba1517..40cccd85 100644 --- a/src/khoj/processor/conversation/gpt.py +++ b/src/khoj/processor/conversation/gpt.py @@ -5,10 +5,10 @@ from datetime import datetime # Internal Packages from khoj.utils.constants import empty_escape_sequences +from khoj.processor.conversation import prompts from khoj.processor.conversation.utils import ( chat_completion_with_backoff, completion_with_backoff, - message_to_prompt, generate_chatml_messages_with_context, ) @@ -20,22 +20,14 @@ def answer(text, user_query, model, api_key=None, temperature=0.5, max_tokens=50 """ Answer user query using provided text as reference with OpenAI's GPT """ - # Setup Prompt based on Summary Type - prompt = f""" -You are a friendly, helpful personal assistant. -Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know." + # Setup Prompt from arguments + prompt = prompts.answer.format(text=text, user_query=user_query) -Notes: -{text} - -Question: {user_query} - -Answer (in second person):""" # Get Response from GPT logger.debug(f"Prompt for GPT: {prompt}") response = completion_with_backoff( prompt=prompt, - model=model, + model_name=model, temperature=temperature, max_tokens=max_tokens, stop='"""', @@ -43,8 +35,7 @@ Answer (in second person):""" ) # Extract, Clean Message from GPT's Response - story = response["choices"][0]["text"] - return str(story).replace("\n\n", "") + return str(response).replace("\n\n", "") def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200): @@ -53,25 +44,15 @@ def summarize(text, summary_type, model, user_query=None, api_key=None, temperat """ # Setup Prompt based on Summary Type if summary_type == "chat": - prompt = f""" -You are an AI. Summarize the conversation below from your perspective: - -{text} - -Summarize the conversation from the AI's first-person perspective:""" + prompt = prompts.summarize_chat.format(text=text) elif summary_type == "notes": - prompt = f""" -Summarize the below notes about {user_query}: - -{text} - -Summarize the notes in second person perspective:""" + prompt = prompts.summarize_notes.format(text=text, user_query=user_query) # Get Response from GPT logger.debug(f"Prompt for GPT: {prompt}") response = completion_with_backoff( prompt=prompt, - model=model, + model_name=model, temperature=temperature, max_tokens=max_tokens, frequency_penalty=0.2, @@ -80,8 +61,7 @@ Summarize the notes in second person perspective:""" ) # Extract, Clean Message from GPT's Response - story = response["choices"][0]["text"] - return str(story).replace("\n\n", "") + return str(response).replace("\n\n", "") def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100): @@ -102,68 +82,21 @@ def extract_questions(text, model="text-davinci-003", conversation_log={}, api_k current_new_year = today.replace(month=1, day=1) last_new_year = current_new_year.replace(year=today.year - 1) - prompt = f""" -You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the users notes. -- The user will provide their questions and answers to you for context. 
-- Add as much context from the previous questions and answers as required into your search queries. -- Break messages into multiple search queries when required to retrieve the relevant information. -- Add date filters to your search queries from questions and answers when required to retrieve the relevant information. - -What searches, if any, will you need to perform to answer the users question? -Provide search queries as a JSON list of strings -Current Date: {today.strftime("%A, %Y-%m-%d")} - -Q: How was my trip to Cambodia? - -["How was my trip to Cambodia?"] - -A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful. - -Q: Who did i visit that temple with? - -["Who did I visit the Angkor Wat Temple in Cambodia with?"] - -A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi. - -Q: What national parks did I go to last year? - -["National park I visited in {last_new_year.strftime("%Y")} dt>=\\"{last_new_year.strftime("%Y-%m-%d")}\\" dt<\\"{current_new_year.strftime("%Y-%m-%d")}\\""] - -A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year.strftime("%Y")}. - -Q: How are you feeling today? - -[] - -A: I'm feeling a little bored. Helping you will hopefully make me feel better! - -Q: How many tennis balls fit in the back of a 2002 Honda Civic? - -["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"] - -A: 1085 tennis balls will fit in the trunk of a Honda Civic - -Q: Is Bob older than Tom? - -["When was Bob born?", "What is Tom's age?"] - -A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old. - -Q: What is their age difference? - -["What is Bob's age?", "What is Tom's age?"] - -A: Bob is {current_new_year.year - 1984 - 30} years older than Tom. As Bob is {current_new_year.year - 1984} years old and Tom is 30 years old. - -{chat_history} -Q: {text} - -""" + prompt = prompts.extract_questions.format( + current_date=today.strftime("%A, %Y-%m-%d"), + last_new_year=last_new_year.strftime("%Y"), + last_new_year_date=last_new_year.strftime("%Y-%m-%d"), + current_new_year_date=current_new_year.strftime("%Y-%m-%d"), + bob_tom_age_difference=current_new_year.year - 1984 - 30, + bob_age=current_new_year.year - 1984, + chat_history=chat_history, + text=text, + ) # Get Response from GPT response = completion_with_backoff( prompt=prompt, - model=model, + model_name=model, temperature=temperature, max_tokens=max_tokens, stop=["A: ", "\n"], ) # Extract, Clean Message from GPT's Response - response_text = response["choices"][0]["text"] try: questions = json.loads( # Clean response to increase likelihood of valid JSON. E.g replace ' with " to enclose strings - response_text.strip(empty_escape_sequences) + response.strip(empty_escape_sequences) .replace("['", '["') .replace("']", '"]') .replace("', '", '", "') ) except json.decoder.JSONDecodeError: - logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response_text}") + logger.warn(f"GPT returned invalid JSON. 
Falling back to using user message as search query.\n{response}") questions = [text] logger.debug(f"Extracted Questions by GPT: {questions}") return questions @@ -191,31 +123,8 @@ def extract_search_type(text, model, api_key=None, temperature=0.5, max_tokens=1 """ Extract search type from user query using OpenAI's GPT """ - # Initialize Variables - understand_primer = """ -Objective: Extract search type from user query and return information as JSON - -Allowed search types are listed below: - - search-type=["notes","ledger","image","music"] - -Some examples are given below for reference: -Q:What fiction book was I reading last week about AI starship? -A:{ "search-type": "notes" } -Q:Play some calm classical music? -A:{ "search-type": "music" } -Q:How much did I spend at Subway for dinner last time? -A:{ "search-type": "ledger" } -Q:What was that popular Sri lankan song that Alex had mentioned? -A:{ "search-type": "music" } -Q:Can you recommend a movie to watch from my notes? -A:{ "search-type": "notes" } -Q: When did I buy Groceries last? -A:{ "search-type": "ledger" } -Q:When did I go surfing last? -A:{ "search-type": "notes" }""" - - # Setup Prompt with Understand Primer - prompt = message_to_prompt(text, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:") + # Setup Prompt to extract search type + prompt = prompts.search_type + f"{text}\nA:" if verbose > 1: print(f"Message -> Prompt: {text} -> {prompt}") @@ -223,7 +132,7 @@ A:{ "search-type": "notes" }""" logger.debug(f"Prompt for GPT: {prompt}") response = completion_with_backoff( prompt=prompt, - model=model, + model_name=model, temperature=temperature, max_tokens=max_tokens, frequency_penalty=0.2, @@ -232,8 +141,7 @@ A:{ "search-type": "notes" }""" ) # Extract, Clean Message from GPT's Response - story = str(response["choices"][0]["text"]) - return json.loads(story.strip(empty_escape_sequences)) + return json.loads(response.strip(empty_escape_sequences)) def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo", api_key=None, temperature=0.2): @@ -241,36 +149,23 @@ def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo", Converse with user using OpenAI's ChatGPT """ # Initialize Variables + current_date = datetime.now().strftime("%Y-%m-%d") compiled_references = "\n\n".join({f"# {item}" for item in references}) - personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant." - conversation_primers = { - "general": f""" -Using your general knowledge and our past conversations as context, answer the following question. -Current Date: {datetime.now().strftime("%Y-%m-%d")} - -Question: {user_query} -""".strip(), - "notes": f""" -Using the notes and our past conversations as context, answer the following question. 
-Current Date: {datetime.now().strftime("%Y-%m-%d")} - -Notes: -{compiled_references} - -Question: {user_query} -""".strip(), - } - # Get Conversation Primer appropriate to Conversation Type conversation_type = "general" if user_query.startswith("@general") or compiled_references.strip() == "" else "notes" logger.debug(f"Conversation Type: {conversation_type}") - conversation_primer = conversation_primers.get(conversation_type) + if conversation_type == "general": + conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query) + else: + conversation_primer = prompts.notes_conversation.format( + current_date=current_date, query=user_query, references=compiled_references + ) # Setup Prompt with Primer or Conversation History messages = generate_chatml_messages_with_context( conversation_primer, - personality_primer, + prompts.personality.format(), conversation_log, model, ) @@ -279,11 +174,10 @@ Question: {user_query} logger.debug(f"Conversation Context for GPT: {messages}") response = chat_completion_with_backoff( messages=messages, - model=model, + model_name=model, temperature=temperature, - api_key=api_key, + openai_api_key=api_key, ) # Extract, Clean Message from GPT's Response - story = str(response["choices"][0]["message"]["content"]) - return story.strip(empty_escape_sequences) + return response.strip(empty_escape_sequences) diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py new file mode 100644 index 00000000..f40ed171 --- /dev/null +++ b/src/khoj/processor/conversation/prompts.py @@ -0,0 +1,165 @@ +# External Packages +from langchain.prompts import PromptTemplate + + +## Personality +## -- +personality = PromptTemplate.from_template("You are Khoj, a friendly, smart and helpful personal assistant.") + + +## General Conversation +## -- +general_conversation = PromptTemplate.from_template( + """ +Using your general knowledge and our past conversations as context, answer the following question. +Current Date: {current_date} + +Question: {query} +""".strip() +) + + +## Notes Conversation +## -- +notes_conversation = PromptTemplate.from_template( + """ +Using the notes and our past conversations as context, answer the following question. +Current Date: {current_date} + +Notes: +{references} + +Question: {query} +""".strip() +) + + +## Summarize Chat +## -- +summarize_chat = PromptTemplate.from_template( + """ +You are an AI. Summarize the conversation below from your perspective: + +{text} + +Summarize the conversation from the AI's first-person perspective:""" +) + + +## Summarize Notes +## -- +summarize_notes = PromptTemplate.from_template( + """ +Summarize the below notes about {user_query}: + +{text} + +Summarize the notes in second person perspective:""" +) + + +## Answer +## -- +answer = PromptTemplate.from_template( + """ +You are a friendly, helpful personal assistant. +Using the user's notes below, answer their following question. If the answer is not contained within the notes, say "I don't know." + +Notes: +{text} + +Question: {user_query} + +Answer (in second person):""" +) + + +## Extract Questions +## -- +extract_questions = PromptTemplate.from_template( + """ +You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the user's notes. +- The user will provide their questions and answers to you for context. +- Add as much context from the previous questions and answers as required into your search queries. 
+- Break messages into multiple search queries when required to retrieve the relevant information. +- Add date filters to your search queries from questions and answers when required to retrieve the relevant information. + +What searches, if any, will you need to perform to answer the user's question? +Provide search queries as a JSON list of strings +Current Date: {current_date} + +Q: How was my trip to Cambodia? + +["How was my trip to Cambodia?"] + +A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful. + +Q: Who did i visit that temple with? + +["Who did I visit the Angkor Wat Temple in Cambodia with?"] + +A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi. + +Q: What national parks did I go to last year? + +["National park I visited in {last_new_year} dt>=\\"{last_new_year_date}\\" dt<\\"{current_new_year_date}\\""] + +A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year}. + +Q: How are you feeling today? + +[] + +A: I'm feeling a little bored. Helping you will hopefully make me feel better! + +Q: How many tennis balls fit in the back of a 2002 Honda Civic? + +["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"] + +A: 1085 tennis balls will fit in the trunk of a Honda Civic + +Q: Is Bob older than Tom? + +["When was Bob born?", "What is Tom's age?"] + +A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old. + +Q: What is their age difference? + +["What is Bob's age?", "What is Tom's age?"] + +A: Bob is {bob_tom_age_difference} years older than Tom. As Bob is {bob_age} years old and Tom is 30 years old. + +{chat_history} +Q: {text} + +""" +) + + +## Extract Search Type +## -- +search_type = """ +Objective: Extract search type from user query and return information as JSON + +Allowed search types are listed below: + - search-type=["notes","ledger","image","music","pdf"] + +Some examples are given below for reference: +Q:What fiction book was I reading last week about AI starship? +A:{ "search-type": "notes" } +Q:What did the lease say about early termination? +A:{ "search-type": "pdf" } +Q:Play some calm classical music? +A:{ "search-type": "music" } +Q:How much did I spend at Subway for dinner last time? +A:{ "search-type": "ledger" } +Q:What was that popular Sri Lankan song that Alex had mentioned? +A:{ "search-type": "music" } +Q:Can you recommend a movie to watch from my notes? +A:{ "search-type": "notes" } +Q:When did I buy Groceries last? +A:{ "search-type": "ledger" } +Q:When did I go surfing last? 
+A:{ "search-type": "notes" } +Q:""" diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index faf8e1ea..5e3d5ef4 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -4,6 +4,9 @@ import logging from datetime import datetime # External Packages +from langchain.chat_models import ChatOpenAI +from langchain.llms import OpenAI +from langchain.schema import ChatMessage import openai import tiktoken from tenacity import ( @@ -31,14 +34,17 @@ max_prompt_size = {"gpt-3.5-turbo": 4096, "gpt-4": 8192} | retry_if_exception_type(openai.error.RateLimitError) | retry_if_exception_type(openai.error.ServiceUnavailableError) ), - wait=wait_random_exponential(min=1, max=30), - stop=stop_after_attempt(6), + wait=wait_random_exponential(min=1, max=10), + stop=stop_after_attempt(3), before_sleep=before_sleep_log(logger, logging.DEBUG), reraise=True, ) def completion_with_backoff(**kwargs): - openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY") - return openai.Completion.create(**kwargs, request_timeout=60) + prompt = kwargs.pop("prompt") + if "openai_api_key" not in kwargs: + kwargs["openai_api_key"] = os.getenv("OPENAI_API_KEY") + llm = OpenAI(**kwargs, request_timeout=10, max_retries=1) + return llm(prompt) @retry( @@ -50,13 +56,19 @@ def completion_with_backoff(**kwargs): | retry_if_exception_type(openai.error.ServiceUnavailableError) ), wait=wait_exponential(multiplier=1, min=4, max=10), - stop=stop_after_attempt(6), + stop=stop_after_attempt(3), before_sleep=before_sleep_log(logger, logging.DEBUG), reraise=True, ) -def chat_completion_with_backoff(**kwargs): - openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY") - return openai.ChatCompletion.create(**kwargs, request_timeout=60) +def chat_completion_with_backoff(messages, model_name, temperature, openai_api_key=None): + chat = ChatOpenAI( + model_name=model_name, + temperature=temperature, + openai_api_key=openai_api_key or os.getenv("OPENAI_API_KEY"), + request_timeout=10, + max_retries=1, + ) + return chat(messages).content def generate_chatml_messages_with_context( @@ -64,7 +76,11 @@ def generate_chatml_messages_with_context( ): """Generate messages for ChatGPT with context from previous conversation""" # Extract Chat History for Context - chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])] + chat_logs = [] + for chat in conversation_log.get("chat", []): + chat_notes = f'\n\n Notes:\n{chat.get("context")}' if chat.get("context") else "\n" + chat_logs += [chat["message"] + chat_notes] + rest_backnforths = [] # Extract in reverse chronological order for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]): @@ -73,17 +89,26 @@ def generate_chatml_messages_with_context( rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1] # Format user and system messages to chatml format - system_chatml_message = [message_to_chatml(system_message, "system")] - user_chatml_message = [message_to_chatml(user_message, "user")] + system_chatml_message = [ChatMessage(content=system_message, role="system")] + user_chatml_message = [ChatMessage(content=user_message, role="user")] - messages = user_chatml_message + rest_backnforths[:2] + system_chatml_message + rest_backnforths[2:] + messages = user_chatml_message + rest_backnforths + system_chatml_message # Truncate oldest messages from 
conversation history until under max supported prompt size by model encoder = tiktoken.encoding_for_model(model_name) - tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()]) - while tokens > max_prompt_size[model_name]: + tokens = sum([len(encoder.encode(message.content)) for message in messages]) + while tokens > max_prompt_size[model_name] and len(messages) > 1: messages.pop() - tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()]) + tokens = sum([len(encoder.encode(message.content)) for message in messages]) + + # Truncate last message if still over max supported prompt size by model + if tokens > max_prompt_size[model_name]: + last_message = messages[-1] + truncated_message = encoder.decode(encoder.encode(last_message.content)[:max_prompt_size[model_name]]) + logger.debug( + f"Truncate last message to fit within max prompt size of {max_prompt_size[model_name]} supported by {model_name} model:\n {truncated_message}" + ) + messages = [ChatMessage(content=truncated_message, role=last_message.role)] # Return message in chronological order return messages[::-1] @@ -91,12 +116,7 @@ def reciprocal_conversation_to_chatml(message_pair): """Convert a single back and forth between user and assistant to chatml format""" - return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])] - - -def message_to_chatml(message, role="assistant"): - """Create chatml message from message and role""" - return {"role": role, "content": message} + return [ChatMessage(content=message, role=role) for message, role in zip(message_pair, ["user", "assistant"])] def message_to_prompt( diff --git a/src/khoj/processor/pdf/__init__.py b/src/khoj/processor/pdf/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py new file mode 100644 index 00000000..27c03d55 --- /dev/null +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -0,0 +1,131 @@ +# Standard Packages +import glob +import logging +from pathlib import Path +from typing import List + +# External Packages +from langchain.document_loaders import PyPDFLoader + +# Internal Packages +from khoj.processor.text_to_jsonl import TextToJsonl +from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer +from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.rawconfig import Entry + + +logger = logging.getLogger(__name__) + + +class PdfToJsonl(TextToJsonl): + # Define Functions + def process(self, previous_entries=None): + # Extract required fields from config + pdf_files, pdf_file_filter, output_file = ( + self.config.input_files, + self.config.input_filter, + self.config.compressed_jsonl, + ) + + # Input Validation + if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter): + print("At least one of pdf-files or pdf-file-filter is required to be specified") + exit(1) + + # Get Pdf Files to Process + pdf_files = PdfToJsonl.get_pdf_files(pdf_files, pdf_file_filter) + + # Extract Entries from specified Pdf files + with timer("Parse entries from PDF files into dictionaries", logger): + current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(pdf_files)) + + # Split entries by max tokens supported by model + with timer("Split entries by max token size supported by model", logger): + current_entries = 
self.split_entries_by_max_tokens(current_entries, max_tokens=256) + # Identify, mark and merge any new entries with previous entries + with timer("Identify new or updated entries", logger): + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = self.mark_entries_for_update( + current_entries, previous_entries, key="compiled", logger=logger + ) + + with timer("Write PDF entries to JSONL file", logger): + # Process Each Entry from All Notes Files + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries) + + # Compress JSONL formatted Data + if output_file.suffix == ".gz": + compress_jsonl_data(jsonl_data, output_file) + elif output_file.suffix == ".jsonl": + dump_jsonl(jsonl_data, output_file) + + return entries_with_ids + + @staticmethod + def get_pdf_files(pdf_files=None, pdf_file_filters=None): + "Get PDF files to process" + absolute_pdf_files, filtered_pdf_files = set(), set() + if pdf_files: + absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files} + if pdf_file_filters: + filtered_pdf_files = { + filtered_file + for pdf_file_filter in pdf_file_filters + for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) + } + + all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) + + files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")} + + if any(files_with_non_pdf_extensions): + logger.warn(f"[Warning] There may be non-PDF files in the input set: {files_with_non_pdf_extensions}") + + logger.debug(f"Processing files: {all_pdf_files}") + + return all_pdf_files + + @staticmethod + def extract_pdf_entries(pdf_files): + """Extract entries by page from specified PDF files""" + + entries = [] + entry_to_location_map = [] + for pdf_file in pdf_files: + loader = PyPDFLoader(pdf_file) + pdf_entries_per_file = [page.page_content for page in loader.load()] + entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file)) + entries.extend(pdf_entries_per_file) + + return entries, dict(entry_to_location_map) + + @staticmethod + def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]: + "Convert each PDF entry into a dictionary" + entries = [] + for parsed_entry in parsed_entries: + entry_filename = Path(entry_to_file_map[parsed_entry]) + # Append base filename to compiled entry for context to model + heading = f"{entry_filename.stem}\n" + compiled_entry = f"{heading}{parsed_entry}" + entries.append( + Entry( + compiled=compiled_entry, + raw=parsed_entry, + heading=heading, + file=f"{entry_filename}", + ) + ) + + logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries") + + return entries + + @staticmethod + def convert_pdf_maps_to_jsonl(entries: List[Entry]): + "Convert each PDF entry to JSON and collate as JSONL" + return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 0c7f278f..7a79b95a 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -109,6 +109,17 @@ def search( with timer("Collating results took", logger): results = text_search.collate_results(hits, entries, results_count) + elif (t == SearchType.Pdf or t == None) and state.model.pdf_search: + # query pdf files + with timer("Query took", logger): + hits, entries = text_search.query( + user_query, state.model.pdf_search, rank_results=r, 
score_threshold=score_threshold, dedupe=dedupe + ) + + # collate and return results + with timer("Collating results took", logger): + results = text_search.collate_results(hits, entries, results_count) + elif (t == SearchType.Ledger or t == None) and state.model.ledger_search: # query transactions with timer("Query took", logger): diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 76baa14d..7b590d13 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -22,6 +22,7 @@ class SearchType(str, Enum): Music = "music" Markdown = "markdown" Image = "image" + Pdf = "pdf" class ProcessorType(str, Enum): @@ -61,6 +62,7 @@ class SearchModels: ledger_search: TextSearchModel = None music_search: TextSearchModel = None markdown_search: TextSearchModel = None + pdf_search: TextSearchModel = None image_search: ImageSearchModel = None plugin_search: Dict[str, TextSearchModel] = None diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index aa10a4d3..87eb07ac 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -28,6 +28,12 @@ default_config = { "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz", "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt", }, + "pdf": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz", + "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt", + }, "image": { "input-directories": None, "input-filter": None, diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index bc8ef78a..72d82ce9 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -56,6 +56,7 @@ class ContentConfig(ConfigBase): image: Optional[ImageContentConfig] music: Optional[TextContentConfig] markdown: Optional[TextContentConfig] + pdf: Optional[TextContentConfig] plugins: Optional[Dict[str, TextContentConfig]] diff --git a/tests/data/pdf/multipage.pdf b/tests/data/pdf/multipage.pdf new file mode 100644 index 00000000..234f39b7 Binary files /dev/null and b/tests/data/pdf/multipage.pdf differ diff --git a/tests/data/pdf/singlepage.pdf b/tests/data/pdf/singlepage.pdf new file mode 100644 index 00000000..8071613a Binary files /dev/null and b/tests/data/pdf/singlepage.pdf differ diff --git a/tests/test_client.py b/tests/test_client.py index e7087e2c..cee0ee67 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def test_search_with_valid_content_type(client): - for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: + for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]: # Act response = client.get(f"/api/search?q=random&t={content_type}") # Assert @@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def test_update_with_valid_content_type(client): - for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: + for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]: # Act response = client.get(f"/api/update?t={content_type}") # Assert @@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client): # ---------------------------------------------------------------------------------------------------- def 
test_regenerate_with_valid_content_type(client): - for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]: + for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]: # Act response = client.get(f"/api/update?force=true&t={content_type}") # Assert diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_jsonl.py new file mode 100644 index 00000000..f5918bd1 --- /dev/null +++ b/tests/test_pdf_to_jsonl.py @@ -0,0 +1,74 @@ +# Standard Packages +import json + +# Internal Packages +from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl + + +def test_single_page_pdf_to_jsonl(): + "Convert single page PDF file to jsonl." + # Act + # Extract Entries from specified Pdf files + entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"]) + + # Process Each Entry from All Pdf Files + jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl( + PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map) + ) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_multi_page_pdf_to_jsonl(): + "Convert multiple pages from single PDF file to jsonl." + # Act + # Extract Entries from specified Pdf files + entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"]) + + # Process Each Entry from All Pdf Files + jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl( + PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map) + ) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 6 + + +def test_get_pdf_files(tmp_path): + "Ensure PDF files specified via input-files and input-filter are extracted" + # Arrange + # Include via input-filter globs + group1_file1 = create_file(tmp_path, filename="group1-file1.pdf") + group1_file2 = create_file(tmp_path, filename="group1-file2.pdf") + group2_file1 = create_file(tmp_path, filename="group2-file1.pdf") + group2_file2 = create_file(tmp_path, filename="group2-file2.pdf") + # Include via input-file field + file1 = create_file(tmp_path, filename="document.pdf") + # Not included by any filter + create_file(tmp_path, filename="not-included-document.pdf") + create_file(tmp_path, filename="not-included-text.txt") + + expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1])) + + # Setup input-files, input-filters + input_files = [tmp_path / "document.pdf"] + input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"] + + # Act + extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter) + + # Assert + assert len(extracted_pdf_files) == 5 + assert extracted_pdf_files == expected_files + + +# Helper Functions +def create_file(tmp_path, entry=None, filename="document.pdf"): + pdf_file = tmp_path / filename + pdf_file.touch() + if entry: + pdf_file.write_text(entry) + return pdf_file
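For reference, the new processor can be exercised directly, along the same lines as the tests above. A minimal sketch — not part of this patch; it reuses the multipage fixture added here and assumes `Entry.to_json()` serializes the fields set in `convert_pdf_entries_to_maps`:

```python
# Minimal usage sketch: extract per-page entries from a PDF and serialize to JSONL.
import json

from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl

# Reuse the sample fixture added in this change
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
    PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)

# One JSON object per PDF page, each carrying `compiled`, `raw`, `heading` and `file`
for line in jsonl_string.splitlines():
    entry = json.loads(line)
    print(entry["file"], entry["heading"].strip())
```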