From bbe3bf97333696efcefb2750808ba262405f3d30 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 1 Jun 2023 20:31:28 +0530 Subject: [PATCH] Render PDF search results in Khoj Obsidian interface - Make plugin update khoj server config to index PDF files in vault too - Make Obsidian plugin update index for PDF files in vault too - Show PDF results in Khoj Search modal as well - Ensure combined results are sorted by score across both types - Jump to PDF file when selecting a PDF search result from the modal --- src/interface/obsidian/README.md | 4 +- src/interface/obsidian/src/search_modal.ts | 29 +++++++++---- src/interface/obsidian/src/settings.ts | 1 + src/interface/obsidian/src/utils.ts | 49 ++++++++++++++++++---- 4 files changed, 65 insertions(+), 18 deletions(-) diff --git a/src/interface/obsidian/README.md b/src/interface/obsidian/README.md index 2bf24228..142b5041 100644 --- a/src/interface/obsidian/README.md +++ b/src/interface/obsidian/README.md @@ -42,7 +42,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0 1. Install Khoj via `pip` and start Khoj backend in non-gui mode 2. Install Khoj plugin via Community Plugins settings pane on Obsidian app 3. Check the new Khoj plugin settings -4. Wait for Khoj backend to index markdown files in the current Vault +4. Wait for Khoj backend to index markdown, PDF files in the current Vault 5. Open Khoj plugin on Obsidian via Search button on Left Pane 6. Search \"*Announce plugin to folks*\" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/) 7. 
Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin) @@ -151,7 +151,7 @@ The plugin implements the following functionality to search your notes with Khoj - [X] Open the Khoj search modal via left ribbon icon or the *Khoj: Search* command - [X] Render results as Markdown preview to improve readability - [X] Configure Khoj via the plugin setting tab on the settings page - - Set Obsidian Vault to Index with Khoj. Defaults to all markdown files in current Vault + - Set Obsidian Vault to Index with Khoj. Defaults to all markdown, PDF files in current Vault - Set URL of Khoj backend - Set Number of Search Results to show in Search Modal - [X] Allow reranking of result to improve search quality diff --git a/src/interface/obsidian/src/search_modal.ts b/src/interface/obsidian/src/search_modal.ts index 5f88ff9a..9848334d 100644 --- a/src/interface/obsidian/src/search_modal.ts +++ b/src/interface/obsidian/src/search_modal.ts @@ -89,12 +89,24 @@ export class KhojSearchModal extends SuggestModal { async getSuggestions(query: string): Promise { // Query Khoj backend for search results let encodedQuery = encodeURIComponent(query); - let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&t=markdown`; - let response = await request(searchUrl); - let data = JSON.parse(response); - let results = data + let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}`; + + // Get search results for markdown and pdf files + let mdResponse = await request(`${searchUrl}&t=markdown`); + let pdfResponse = await request(`${searchUrl}&t=pdf`); + + // Parse search results + let mdData = JSON.parse(mdResponse) .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path)) - .map((result: any) => { return { entry: result.entry, file: result.additional.file } as 
SearchResult; }); + .map((result: any) => { return { entry: result.entry, score: result.score, file: result.additional.file }; }); + let pdfData = JSON.parse(pdfResponse) + .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path)) + .map((result: any) => { return { entry: `## ${result.additional.compiled}`, score: result.score, file: result.additional.file } as SearchResult; }) + + // Combine markdown and PDF results and sort them by score + let results = mdData.concat(pdfData) + .sort((a: any, b: any) => b.score - a.score) + .map((result: any) => { return { entry: result.entry, file: result.file } as SearchResult; }) this.query = query; return results; @@ -124,11 +136,12 @@ export class KhojSearchModal extends SuggestModal { } async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) { - // Get all markdown files in vault + // Get all markdown and PDF files in vault const mdFiles = this.app.vault.getMarkdownFiles(); + const pdfFiles = this.app.vault.getFiles().filter(file => file.extension === 'pdf'); // Find the vault file matching file of chosen search result - let file_match = mdFiles + let file_match = mdFiles.concat(pdfFiles) // Sort by descending length of path // This finds longest path match when multiple files have same name .sort((a, b) => b.path.length - a.path.length) @@ -138,7 +151,7 @@ export class KhojSearchModal extends SuggestModal { // Open vault file at heading of chosen search result if (file_match) { - let resultHeading = result.entry.split('\n', 1)[0]; + let resultHeading = file_match.extension !== 'pdf' ? 
result.entry.split('\n', 1)[0] : ''; let linkToEntry = `${file_match.path}${resultHeading}` this.app.workspace.openLinkText(linkToEntry, ''); console.log(`Link: ${linkToEntry}, File: ${file_match.path}, Heading: ${resultHeading}`); diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index 2cdc79a5..b2809cb0 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -108,6 +108,7 @@ export class KhojSettingTab extends PluginSettingTab { this.plugin.registerInterval(progress_indicator); await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true`); + await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true`); new Notice('✅ Updated Khoj index.'); // Reset button once index is updated diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 05fd1139..5e176883 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -12,6 +12,7 @@ export function getVaultAbsolutePath(vault: Vault): string { export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { let vaultPath = getVaultAbsolutePath(vault); let mdInVault = `${vaultPath}/**/*.md`; + let pdfInVault = `${vaultPath}/**/*.pdf`; let khojConfigUrl = `${setting.khojUrl}/api/config/data`; // Check if khoj backend is configured, note if cannot connect to backend @@ -32,7 +33,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_'); // Get default config fields from khoj backend let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); - let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]); + let khojDefaultMdIndexDirectory = 
getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]); + let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]); let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"]; @@ -47,8 +49,14 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n "markdown": { "input-filter": [mdInVault], "input-files": null, - "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`, + "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, + }, + "pdf": { + "input-filter": [pdfInVault], + "input-files": null, + "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, } } } @@ -59,8 +67,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n data["content-type"]["markdown"] = { "input-filter": [mdInVault], "input-files": null, - "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`, + "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, } } // Else if khoj is not configured to index markdown files in configured obsidian vault @@ -68,12 +76,37 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) { // Update markdown config in khoj content-type config // Set markdown config to only index markdown files in configured 
obsidian vault - let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]); + let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]); data["content-type"]["markdown"] = { "input-filter": [mdInVault], "input-files": null, - "embeddings-file": `${khojIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`, + "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`, + } + } + + if (khoj_already_configured && !data["content-type"]["pdf"]) { + // Add pdf config to khoj content-type config + // Set pdf config to index pdf files in configured obsidian vault + data["content-type"]["pdf"] = { + "input-filter": [pdfInVault], + "input-files": null, + "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, + } + } + // Else if khoj is not configured to index pdf files in configured obsidian vault + else if (khoj_already_configured && + (data["content-type"]["pdf"]["input-filter"].length != 1 || + data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) { + // Update pdf config in khoj content-type config + // Set pdf config to only index pdf files in configured obsidian vault + let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]); + data["content-type"]["pdf"] = { + "input-filter": [pdfInVault], + "input-files": null, + "embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`, + "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`, } }