Mirror of https://github.com/khoj-ai/khoj.git
Search PDF files with Khoj. Integrate with LangChain
- **Introduce Khoj to LangChain**: Call GPT with LangChain for Khoj Chat
- **Search (and Chat about) PDF files with Khoj**
  - Create PDF to JSONL Processor: Convert PDF content into standardized JSONL format
  - Expose PDF search type via Khoj server API
  - Enable querying PDF files via Obsidian, Emacs and Web interfaces
Commit e022910f31: 24 changed files with 608 additions and 200 deletions
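With these changes in place, the new pdf search type is reachable over the same server API as the other content types. A minimal smoke-test sketch in Python; the server URL and the sample query are assumptions, while the endpoints and result fields mirror the interface changes in the diffs below:

```python
# Quick smoke test of the new pdf search type over the Khoj server API.
# Assumptions: a running Khoj backend at KHOJ_URL with a configured pdf content type.
import requests

KHOJ_URL = "http://localhost:8000"  # assumption: wherever your Khoj backend serves

# Force a fresh index of the configured PDF files, as the Obsidian plugin now does
requests.get(f"{KHOJ_URL}/api/update", params={"t": "pdf", "force": "true"})

# Query the PDF index; q, n, r, t match the parameters used by the interfaces below
response = requests.get(
    f"{KHOJ_URL}/api/search",
    params={"q": "what did the lease say about early termination", "n": 5, "r": "true", "t": "pdf"},
)
for result in response.json():
    # Each result carries the matched entry text plus its source file under `additional`
    print(result["additional"]["file"], result["entry"][:80])
```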
@@ -63,7 +63,7 @@
 - **General**
   - **Natural**: Advanced natural language understanding using Transformer based ML Models
   - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
-  - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions and Photos
+  - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos
   - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)

 ## Demos
@@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 - Install Khoj via `pip` and start Khoj backend in non-gui mode
 - Install Khoj plugin via Community Plugins settings pane on Obsidian app
 - Check the new Khoj plugin settings
-- Let Khoj backend index the markdown files in the current Vault
+- Let Khoj backend index the markdown, pdf files in the current Vault
 - Open Khoj plugin on Obsidian via Search button on Left Pane
 - Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj
 ##### 2. Configure

-- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes and beancount directories
+- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories
 - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml)

 ##### 3. Run
@@ -21,6 +21,7 @@ services:
       - ./tests/data/ledger/:/data/ledger/
       - ./tests/data/music/:/data/music/
       - ./tests/data/markdown/:/data/markdown/
+      - ./tests/data/pdf/:/data/pdf/
       # Embeddings and models are populated after the first run
       # You can set these volumes to point to empty directories on host
       - ./tests/data/embeddings/:/data/embeddings/
@@ -21,6 +21,7 @@ keywords = [
     "markdown",
     "beancount",
     "images",
+    "pdf",
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -44,7 +45,7 @@ dependencies = [
     "tiktoken >= 0.3.0",
     "tenacity >= 8.2.2",
     "pillow == 9.3.0",
-    "pydantic == 1.9.1",
+    "pydantic >= 1.9.1",
     "pyqt6 == 6.3.1",
     "pyyaml == 6.0",
     "rich >= 13.3.1",
@@ -53,6 +54,8 @@ dependencies = [
     "torch == 1.13.1",
     "uvicorn == 0.17.6",
     "aiohttp == 3.8.4",
+    "langchain >= 0.0.187",
+    "pypdf >= 3.9.0",
 ]
 dynamic = ["version"]
@@ -4,7 +4,7 @@
 ;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
 ;; Description: A search assistant for your second brain
-;; Keywords: search, chat, org-mode, outlines, markdown, beancount, image
+;; Keywords: search, chat, org-mode, outlines, markdown, pdf, beancount, image
 ;; Version: 0.6.2
 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
 ;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs
@@ -29,8 +29,8 @@
 ;;; Commentary:

 ;; Create a search assistant for your `org-mode', `markdown' notes,
-;; `beancount' transactions and images. This package exposes two
-;; assistance modes, search and chat:
+;; `beancount' transactions, PDFs and images. This package exposes
+;; two assistance modes, search and chat:
 ;;
 ;; Chat provides faster answers, iterative discovery and assisted
 ;; creativity. It requires your OpenAI API key to access GPT models
@@ -95,6 +95,7 @@
                  (const "markdown")
                  (const "ledger")
                  (const "image")
+                 (const "pdf")
                  (const "music")))
@@ -140,6 +141,8 @@ NO-PAGING FILTER))
                    "C-x l | ledger\n")
                  (when (member 'image enabled-content-types)
                    "C-x i | image\n")
+                 (when (member 'pdf enabled-content-types)
+                   "C-x p | pdf\n")
                  (when (member 'music enabled-content-types)
                    "C-x M | music\n"))))
@@ -150,6 +153,7 @@ NO-PAGING FILTER))
 (defun khoj--search-ledger () "Set content-type to `ledger'." (interactive) (setq khoj--content-type "ledger"))
 (defun khoj--search-images () "Set content-type to image." (interactive) (setq khoj--content-type "image"))
 (defun khoj--search-music () "Set content-type to music." (interactive) (setq khoj--content-type "music"))
+(defun khoj--search-pdf () "Set content-type to pdf." (interactive) (setq khoj--content-type "pdf"))
 (defun khoj--improve-rank () "Use cross-encoder to rerank search results." (interactive) (khoj--incremental-search t))
 (defun khoj--make-search-keymap (&optional existing-keymap)
   "Setup keymap to configure Khoj search. Build of EXISTING-KEYMAP when passed."
@@ -164,6 +168,8 @@ NO-PAGING FILTER))
     (define-key kmap (kbd "C-x l") #'khoj--search-ledger))
   (when (member 'image enabled-content-types)
     (define-key kmap (kbd "C-x i") #'khoj--search-images))
+  (when (member 'pdf enabled-content-types)
+    (define-key kmap (kbd "C-x p") #'khoj--search-pdf))
   (when (member 'music enabled-content-types)
     (define-key kmap (kbd "C-x M") #'khoj--search-music))
   kmap))
@@ -544,6 +550,22 @@ CONFIG is json obtained from Khoj config API.
     ;; remove trailing (, ) or SPC from extracted entries string
     (replace-regexp-in-string "[\(\) ]$" "")))

+(defun khoj--extract-entries-as-pdf (json-response query)
+  "Convert QUERY, JSON-RESPONSE from API with PDF results to `org-mode' entries."
+  (thread-last
+    json-response
+    ;; Extract and render each pdf entry from response
+    (mapcar (lambda (json-response-item)
+              (thread-last
+                ;; Extract pdf entry from each item in json response
+                (cdr (assoc 'compiled (assoc 'additional json-response-item)))
+                ;; Format pdf entry as a org entry string
+                (format "** %s\n\n"))))
+    ;; Render entries into org formatted string with query set as the top level heading
+    (format "* %s\n%s\n" query)
+    ;; remove leading (, ) or SPC from extracted entries string
+    (replace-regexp-in-string "^[\(\) ]" "")))
+
 (defun khoj--extract-entries-as-images (json-response query)
   "Convert JSON-RESPONSE, QUERY from API to html with images."
   (let ((image-results-buffer-html-format-str "<html>\n<body>\n<h1>%s</h1>%s\n\n</body>\n</html>")
@@ -592,6 +614,7 @@ CONFIG is json obtained from Khoj config API.
         ((and (member 'music enabled-content-types) (equal buffer-name "Music.org")) "music")
         ((and (member 'ledger enabled-content-types) (or (equal file-extension "bean") (equal file-extension "beancount"))) "ledger")
         ((and (member 'org enabled-content-types) (equal file-extension "org")) "org")
+        ((and (member 'org enabled-content-types) (equal file-extension "pdf")) "pdf")
         ((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown")
         (t khoj-default-content-type))))
@@ -647,16 +670,19 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE.
     (insert
      (cond ((or (equal content-type "org") (equal content-type "music")) (khoj--extract-entries-as-org json-response query))
            ((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query))
+           ((equal content-type "pdf") (khoj--extract-entries-as-pdf json-response query))
            ((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query))
            ((equal content-type "image") (khoj--extract-entries-as-images json-response query))
            (t (khoj--extract-entries json-response query))))
-    (cond ((equal content-type "org") (progn (visual-line-mode)
-                                             (org-mode)
-                                             (setq-local
-                                              org-startup-folded "showall"
-                                              org-hide-leading-stars t
-                                              org-startup-with-inline-images t)
-                                             (org-set-startup-visibility)))
+    (cond ((or (equal content-type "pdf")
+               (equal content-type "org"))
+           (progn (visual-line-mode)
+                  (org-mode)
+                  (setq-local
+                   org-startup-folded "showall"
+                   org-hide-leading-stars t
+                   org-startup-with-inline-images t)
+                  (org-set-startup-visibility)))
           ((equal content-type "markdown") (progn (markdown-mode)
                                                   (visual-line-mode)))
           ((equal content-type "ledger") (beancount-mode))
@@ -973,7 +999,7 @@ Paragraph only starts at first text after blank line.
   ;; set content type to: last used > based on current buffer > default type
   :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
   ;; dynamically set choices to content types enabled on khoj backend
-  :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "ledger" "music" "image")))
+  :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "pdf" "ledger" "music" "image")))

 (transient-define-suffix khoj--search-command (&optional args)
   (interactive (list (transient-args transient-current-command)))
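The new keybinding, search function and transient choice above only activate when the backend reports pdf as an enabled content type. A sketch of checking that from Python; the /api/config/data endpoint is the same one the Obsidian plugin reads further below, and the server URL is an assumption:

```python
# List the content types the Khoj backend has configured, which is what
# drives the new "C-x p | pdf" binding and the transient choices above.
import requests

KHOJ_URL = "http://localhost:8000"  # assumption: your Khoj backend address

config = requests.get(f"{KHOJ_URL}/api/config/data").json()
enabled_content_types = [t for t, cfg in config.get("content-type", {}).items() if cfg]
print(enabled_content_types)  # e.g. ['markdown', 'org', 'pdf']
```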
@@ -42,7 +42,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 1. Install Khoj via `pip` and start Khoj backend in non-gui mode
 2. Install Khoj plugin via Community Plugins settings pane on Obsidian app
 3. Check the new Khoj plugin settings
-4. Wait for Khoj backend to index markdown files in the current Vault
+4. Wait for Khoj backend to index markdown, PDF files in the current Vault
 5. Open Khoj plugin on Obsidian via Search button on Left Pane
 6. Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 7. Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -151,7 +151,7 @@ The plugin implements the following functionality to search your notes with Khoj
 - [X] Open the Khoj search modal via left ribbon icon or the *Khoj: Search* command
 - [X] Render results as Markdown preview to improve readability
 - [X] Configure Khoj via the plugin setting tab on the settings page
-  - Set Obsidian Vault to Index with Khoj. Defaults to all markdown files in current Vault
+  - Set Obsidian Vault to Index with Khoj. Defaults to all markdown, PDF files in current Vault
   - Set URL of Khoj backend
   - Set Number of Search Results to show in Search Modal
 - [X] Allow reranking of result to improve search quality
@@ -89,12 +89,24 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
     async getSuggestions(query: string): Promise<SearchResult[]> {
         // Query Khoj backend for search results
         let encodedQuery = encodeURIComponent(query);
-        let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&t=markdown`;
-        let response = await request(searchUrl);
-        let data = JSON.parse(response);
-        let results = data
-            .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
-            .map((result: any) => { return { entry: result.entry, file: result.additional.file } as SearchResult; });
+        let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}`;
+
+        // Get search results for markdown and pdf files
+        let mdResponse = await request(`${searchUrl}&t=markdown`);
+        let pdfResponse = await request(`${searchUrl}&t=pdf`);
+
+        // Parse search results
+        let mdData = JSON.parse(mdResponse)
+            .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
+            .map((result: any) => { return { entry: result.entry, score: result.score, file: result.additional.file }; });
+        let pdfData = JSON.parse(pdfResponse)
+            .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
+            .map((result: any) => { return { entry: `## ${result.additional.compiled}`, score: result.score, file: result.additional.file } as SearchResult; })
+
+        // Combine markdown and PDF results and sort them by score
+        let results = mdData.concat(pdfData)
+            .sort((a: any, b: any) => b.score - a.score)
+            .map((result: any) => { return { entry: result.entry, file: result.file } as SearchResult; })

         this.query = query;
         return results;
@@ -124,11 +136,12 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
     }

     async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) {
-        // Get all markdown files in vault
+        // Get all markdown and PDF files in vault
         const mdFiles = this.app.vault.getMarkdownFiles();
+        const pdfFiles = this.app.vault.getFiles().filter(file => file.extension === 'pdf');

         // Find the vault file matching file of chosen search result
-        let file_match = mdFiles
+        let file_match = mdFiles.concat(pdfFiles)
             // Sort by descending length of path
             // This finds longest path match when multiple files have same name
             .sort((a, b) => b.path.length - a.path.length)
|
@ -138,7 +151,7 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
|
|||
|
||||
// Open vault file at heading of chosen search result
|
||||
if (file_match) {
|
||||
let resultHeading = result.entry.split('\n', 1)[0];
|
||||
let resultHeading = file_match.extension !== 'pdf' ? result.entry.split('\n', 1)[0] : '';
|
||||
let linkToEntry = `${file_match.path}${resultHeading}`
|
||||
this.app.workspace.openLinkText(linkToEntry, '');
|
||||
console.log(`Link: ${linkToEntry}, File: ${file_match.path}, Heading: ${resultHeading}`);
|
||||
|
|
|
@@ -108,6 +108,7 @@ export class KhojSettingTab extends PluginSettingTab {
         this.plugin.registerInterval(progress_indicator);

         await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true`);
+        await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true`);
         new Notice('✅ Updated Khoj index.');

         // Reset button once index is updated
@@ -12,6 +12,7 @@ export function getVaultAbsolutePath(vault: Vault): string {
 export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
     let vaultPath = getVaultAbsolutePath(vault);
     let mdInVault = `${vaultPath}/**/*.md`;
+    let pdfInVault = `${vaultPath}/**/*.pdf`;
     let khojConfigUrl = `${setting.khojUrl}/api/config/data`;

     // Check if khoj backend is configured, note if cannot connect to backend
@@ -32,7 +33,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
     let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
     // Get default config fields from khoj backend
     let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
-    let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
+    let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
+    let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
     let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
     let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"];
@@ -47,8 +49,14 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
             "markdown": {
                 "input-filter": [mdInVault],
                 "input-files": null,
-                "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
-                "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
+                "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
+                "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
             },
+            "pdf": {
+                "input-filter": [pdfInVault],
+                "input-files": null,
+                "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
+                "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
+            }
         }
     }
@@ -59,8 +67,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
         data["content-type"]["markdown"] = {
             "input-filter": [mdInVault],
             "input-files": null,
-            "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
-            "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
+            "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
         }
     }
     // Else if khoj is not configured to index markdown files in configured obsidian vault
@@ -68,12 +76,37 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
         data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
         // Update markdown config in khoj content-type config
         // Set markdown config to only index markdown files in configured obsidian vault
-        let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
+        let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
         data["content-type"]["markdown"] = {
             "input-filter": [mdInVault],
             "input-files": null,
-            "embeddings-file": `${khojIndexDirectory}/${indexName}.pt`,
-            "compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`,
+            "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
         }
     }
+
+    if (khoj_already_configured && !data["content-type"]["pdf"]) {
+        // Add pdf config to khoj content-type config
+        // Set pdf config to index pdf files in configured obsidian vault
+        data["content-type"]["pdf"] = {
+            "input-filter": [pdfInVault],
+            "input-files": null,
+            "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
+        }
+    }
+    // Else if khoj is not configured to index pdf files in configured obsidian vault
+    else if (khoj_already_configured &&
+        (data["content-type"]["pdf"]["input-filter"].length != 1 ||
+            data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
+        // Update pdf config in khoj content-type config
+        // Set pdf config to only index pdf files in configured obsidian vault
+        let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
+        data["content-type"]["pdf"] = {
+            "input-filter": [pdfInVault],
+            "input-files": null,
+            "embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
+        }
+    }
@@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             filters=[DateFilter(), WordFilter(), FileFilter()],
         )

+    # Initialize PDF Search
+    if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
+        logger.info("💸 Setting up search for pdf")
+        # Extract Entries, Generate PDF Embeddings
+        model.pdf_search = text_search.setup(
+            PdfToJsonl,
+            config.content_type.pdf,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[DateFilter(), WordFilter(), FileFilter()],
+        )
+
     # Initialize Image Search
     if (t == state.SearchType.Image or t == None) and config.content_type.image:
         logger.info("🌄 Setting up search for images")
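This follows the same registration pattern as the other text content types: a processor class plus its content-type config, handed to text_search.setup. A sketch of the config object that gates the new PDF block; it assumes the TextContentConfig field names follow the khoj.utils.rawconfig model shown further below, and the glob is a placeholder:

```python
# Sketch of a pdf content-type config, matching the gate `config.content_type.pdf`
# in the diff above. Assumption: pydantic field names mirror the YAML keys
# (input-files, input-filter, compressed-jsonl, embeddings-file) with underscores.
from khoj.utils.rawconfig import TextContentConfig

pdf_config = TextContentConfig(
    input_files=None,
    input_filter=["~/Documents/**/*.pdf"],  # placeholder glob, not a real default
    compressed_jsonl="~/.khoj/content/pdf/pdf.jsonl.gz",       # default path from constants below
    embeddings_file="~/.khoj/content/pdf/pdf_embeddings.pt",   # default path from constants below
)
print(pdf_config)
```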
@@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
             return "Beancount Files (*.bean *.beancount)"
         elif search_type == SearchType.Markdown:
             return "Markdown Files (*.md *.markdown)"
+        elif search_type == SearchType.Pdf:
+            return "Pdf Files (*.pdf)"
         elif search_type == SearchType.Music:
             return "Org-Music Files (*.org)"
         elif search_type == SearchType.Image:
@@ -44,6 +44,15 @@
         }).join("\n") + `</div>`;
     }

+    function render_pdf(query, data) {
+        return `<div id="results-pdf">` + data.map(function (item) {
+            let compiled_lines = item.additional.compiled.split("\n");
+            let filename = compiled_lines.shift();
+            let text_match = compiled_lines.join("\n")
+            return `<h2>${filename}</h2>\n<p>${text_match}</p>`
+        }).join("\n") + `</div>`;
+    }
+
     function render_json(data, query, type) {
         if (type === "markdown") {
             return render_markdown(query, data);
@@ -55,6 +64,8 @@
             return data.map(render_image).join('');
         } else if (type === "ledger") {
             return render_ledger(query, data);
+        } else if (type === "pdf") {
+            return render_pdf(query, data);
         } else {
             return `<div id="results-plugin">`
                 + data.map((item) => `<p>${item.entry}</p>`).join("\n")
@@ -279,6 +290,7 @@
     #json {
         white-space: pre-wrap;
     }
+    #results-pdf,
     #results-plugin,
     #results-ledger {
         text-align: left;
@@ -5,10 +5,10 @@ from datetime import datetime

 # Internal Packages
 from khoj.utils.constants import empty_escape_sequences
+from khoj.processor.conversation import prompts
 from khoj.processor.conversation.utils import (
     chat_completion_with_backoff,
     completion_with_backoff,
     message_to_prompt,
     generate_chatml_messages_with_context,
 )
@@ -20,22 +20,14 @@ def answer(text, user_query, model, api_key=None, temperature=0.5, max_tokens=50
     """
     Answer user query using provided text as reference with OpenAI's GPT
     """
-    # Setup Prompt based on Summary Type
-    prompt = f"""
-You are a friendly, helpful personal assistant.
-Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
-
-Notes:
-{text}
-
-Question: {user_query}
-
-Answer (in second person):"""
+    # Setup Prompt from arguments
+    prompt = prompts.answer.format(text=text, user_query=user_query)

     # Get Response from GPT
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         stop='"""',
@@ -43,8 +35,7 @@ Answer (in second person):"""
     )

     # Extract, Clean Message from GPT's Response
-    story = response["choices"][0]["text"]
-    return str(story).replace("\n\n", "")
+    return str(response).replace("\n\n", "")


 def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
@@ -53,25 +44,15 @@
     """
     # Setup Prompt based on Summary Type
     if summary_type == "chat":
-        prompt = f"""
-You are an AI. Summarize the conversation below from your perspective:
-
-{text}
-
-Summarize the conversation from the AI's first-person perspective:"""
+        prompt = prompts.summarize_chat.format(text=text)
     elif summary_type == "notes":
-        prompt = f"""
-Summarize the below notes about {user_query}:
-
-{text}
-
-Summarize the notes in second person perspective:"""
+        prompt = prompts.summarize_notes.format(text=text, user_query=user_query)

     # Get Response from GPT
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         frequency_penalty=0.2,
@@ -80,8 +61,7 @@ Summarize the notes in second person perspective:"""
     )

     # Extract, Clean Message from GPT's Response
-    story = response["choices"][0]["text"]
-    return str(story).replace("\n\n", "")
+    return str(response).replace("\n\n", "")


 def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
@@ -102,68 +82,21 @@ def extract_questions(text, model="text-davinci-003", conversation_log={}, api_k
     current_new_year = today.replace(month=1, day=1)
     last_new_year = current_new_year.replace(year=today.year - 1)

-    prompt = f"""
-You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the users notes.
-- The user will provide their questions and answers to you for context.
-- Add as much context from the previous questions and answers as required into your search queries.
-- Break messages into multiple search queries when required to retrieve the relevant information.
-- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
-
-What searches, if any, will you need to perform to answer the users question?
-Provide search queries as a JSON list of strings
-Current Date: {today.strftime("%A, %Y-%m-%d")}
-
-Q: How was my trip to Cambodia?
-
-["How was my trip to Cambodia?"]
-
-A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
-
-Q: Who did i visit that temple with?
-
-["Who did I visit the Angkor Wat Temple in Cambodia with?"]
-
-A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
-
-Q: What national parks did I go to last year?
-
-["National park I visited in {last_new_year.strftime("%Y")} dt>=\\"{last_new_year.strftime("%Y-%m-%d")}\\" dt<\\"{current_new_year.strftime("%Y-%m-%d")}\\""]
-
-A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year.strftime("%Y")}.
-
-Q: How are you feeling today?
-
-[]
-
-A: I'm feeling a little bored. Helping you will hopefully make me feel better!
-
-Q: How many tennis balls fit in the back of a 2002 Honda Civic?
-
-["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
-
-A: 1085 tennis balls will fit in the trunk of a Honda Civic
-
-Q: Is Bob older than Tom?
-
-["When was Bob born?", "What is Tom's age?"]
-
-A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
-
-Q: What is their age difference?
-
-["What is Bob's age?", "What is Tom's age?"]
-
-A: Bob is {current_new_year.year - 1984 - 30} years older than Tom. As Bob is {current_new_year.year - 1984} years old and Tom is 30 years old.
-
-{chat_history}
-Q: {text}
-
-"""
+    prompt = prompts.extract_questions.format(
+        current_date=today.strftime("%A, %Y-%m-%d"),
+        last_new_year=last_new_year.strftime("%Y"),
+        last_new_year_date=last_new_year.strftime("%Y-%m-%d"),
+        current_new_year_date=current_new_year.strftime("%Y-%m-%d"),
+        bob_tom_age_difference={current_new_year.year - 1984 - 30},
+        bob_age={current_new_year.year - 1984},
+        chat_history=chat_history,
+        text=text,
+    )

     # Get Response from GPT
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         stop=["A: ", "\n"],
@@ -171,17 +104,16 @@ Q: {text}
     )

     # Extract, Clean Message from GPT's Response
-    response_text = response["choices"][0]["text"]
     try:
         questions = json.loads(
             # Clean response to increase likelihood of valid JSON. E.g replace ' with " to enclose strings
-            response_text.strip(empty_escape_sequences)
+            response.strip(empty_escape_sequences)
             .replace("['", '["')
             .replace("']", '"]')
             .replace("', '", '", "')
         )
     except json.decoder.JSONDecodeError:
-        logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response_text}")
+        logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response}")
         questions = [text]
     logger.debug(f"Extracted Questions by GPT: {questions}")
     return questions
@@ -191,31 +123,8 @@ def extract_search_type(text, model, api_key=None, temperature=0.5, max_tokens=1
     """
     Extract search type from user query using OpenAI's GPT
     """
-    # Initialize Variables
-    understand_primer = """
-Objective: Extract search type from user query and return information as JSON
-
-Allowed search types are listed below:
-  - search-type=["notes","ledger","image","music"]
-
-Some examples are given below for reference:
-Q:What fiction book was I reading last week about AI starship?
-A:{ "search-type": "notes" }
-Q:Play some calm classical music?
-A:{ "search-type": "music" }
-Q:How much did I spend at Subway for dinner last time?
-A:{ "search-type": "ledger" }
-Q:What was that popular Sri lankan song that Alex had mentioned?
-A:{ "search-type": "music" }
-Q:Can you recommend a movie to watch from my notes?
-A:{ "search-type": "notes" }
-Q: When did I buy Groceries last?
-A:{ "search-type": "ledger" }
-Q:When did I go surfing last?
-A:{ "search-type": "notes" }"""
-
-    # Setup Prompt with Understand Primer
-    prompt = message_to_prompt(text, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:")
+    # Setup Prompt to extract search type
+    prompt = prompts.search_type + f"{text}\nA:"
     if verbose > 1:
         print(f"Message -> Prompt: {text} -> {prompt}")
@@ -223,7 +132,7 @@ A:{ "search-type": "notes" }"""
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         frequency_penalty=0.2,
@@ -232,8 +141,7 @@ A:{ "search-type": "notes" }"""
     )

     # Extract, Clean Message from GPT's Response
-    story = str(response["choices"][0]["text"])
-    return json.loads(story.strip(empty_escape_sequences))
+    return json.loads(response.strip(empty_escape_sequences))


 def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo", api_key=None, temperature=0.2):
@@ -241,36 +149,23 @@ def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo",
     Converse with user using OpenAI's ChatGPT
     """
     # Initialize Variables
+    current_date = datetime.now().strftime("%Y-%m-%d")
     compiled_references = "\n\n".join({f"# {item}" for item in references})

-    personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
-    conversation_primers = {
-        "general": f"""
-Using your general knowledge and our past conversations as context, answer the following question.
-Current Date: {datetime.now().strftime("%Y-%m-%d")}
-
-Question: {user_query}
-""".strip(),
-        "notes": f"""
-Using the notes and our past conversations as context, answer the following question.
-Current Date: {datetime.now().strftime("%Y-%m-%d")}
-
-Notes:
-{compiled_references}
-
-Question: {user_query}
-""".strip(),
-    }
-
     # Get Conversation Primer appropriate to Conversation Type
     conversation_type = "general" if user_query.startswith("@general") or compiled_references.strip() == "" else "notes"
     logger.debug(f"Conversation Type: {conversation_type}")
-    conversation_primer = conversation_primers.get(conversation_type)
+    if conversation_type == "general":
+        conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
+    else:
+        conversation_primer = prompts.notes_conversation.format(
+            current_date=current_date, query=user_query, references=compiled_references
+        )

     # Setup Prompt with Primer or Conversation History
     messages = generate_chatml_messages_with_context(
         conversation_primer,
-        personality_primer,
+        prompts.personality.format(),
         conversation_log,
         model,
     )
@@ -279,11 +174,10 @@ Question: {user_query}
     logger.debug(f"Conversation Context for GPT: {messages}")
     response = chat_completion_with_backoff(
         messages=messages,
-        model=model,
+        model_name=model,
         temperature=temperature,
-        api_key=api_key,
+        openai_api_key=api_key,
     )

     # Extract, Clean Message from GPT's Response
-    story = str(response["choices"][0]["message"]["content"])
-    return story.strip(empty_escape_sequences)
+    return response.strip(empty_escape_sequences)
src/khoj/processor/conversation/prompts.py (new file, +165 lines)
@@ -0,0 +1,165 @@
# External Packages
from langchain.prompts import PromptTemplate


## Personality
## --
personality = PromptTemplate.from_template("You are Khoj, a friendly, smart and helpful personal assistant.")


## General Conversation
## --
general_conversation = PromptTemplate.from_template(
    """
Using your general knowledge and our past conversations as context, answer the following question.
Current Date: {current_date}

Question: {query}
""".strip()
)


## Notes Conversation
## --
notes_conversation = PromptTemplate.from_template(
    """
Using the notes and our past conversations as context, answer the following question.
Current Date: {current_date}

Notes:
{references}

Question: {query}
""".strip()
)


## Summarize Chat
## --
summarize_chat = PromptTemplate.from_template(
    """
You are an AI. Summarize the conversation below from your perspective:

{text}

Summarize the conversation from the AI's first-person perspective:"""
)


## Summarize Notes
## --
summarize_notes = PromptTemplate.from_template(
    """
Summarize the below notes about {user_query}:

{text}

Summarize the notes in second person perspective:"""
)


## Answer
## --
answer = PromptTemplate.from_template(
    """
You are a friendly, helpful personal assistant.
Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."

Notes:
{text}

Question: {user_query}

Answer (in second person):"""
)


## Extract Questions
## --
extract_questions = PromptTemplate.from_template(
    """
You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the user's notes.
- The user will provide their questions and answers to you for context.
- Add as much context from the previous questions and answers as required into your search queries.
- Break messages into multiple search queries when required to retrieve the relevant information.
- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.

What searches, if any, will you need to perform to answer the users question?
Provide search queries as a JSON list of strings
Current Date: {current_date}

Q: How was my trip to Cambodia?

["How was my trip to Cambodia?"]

A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.

Q: Who did i visit that temple with?

["Who did I visit the Angkor Wat Temple in Cambodia with?"]

A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.

Q: What national parks did I go to last year?

["National park I visited in {last_new_year} dt>=\\"{last_new_year_date}\\" dt<\\"{current_new_year_date}\\""]

A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year}.

Q: How are you feeling today?

[]

A: I'm feeling a little bored. Helping you will hopefully make me feel better!

Q: How many tennis balls fit in the back of a 2002 Honda Civic?

["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]

A: 1085 tennis balls will fit in the trunk of a Honda Civic

Q: Is Bob older than Tom?

["When was Bob born?", "What is Tom's age?"]

A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.

Q: What is their age difference?

["What is Bob's age?", "What is Tom's age?"]

A: Bob is {bob_tom_age_difference} years older than Tom. As Bob is {bob_age} years old and Tom is 30 years old.

{chat_history}
Q: {text}

"""
)


## Extract Search Type
## --
search_type = """
Objective: Extract search type from user query and return information as JSON

Allowed search types are listed below:
  - search-type=["notes","ledger","image","music", "pdf"]

Some examples are given below for reference:
Q:What fiction book was I reading last week about AI starship?
A:{ "search-type": "notes" }
Q: What did the lease say about early termination
A: { "search-type": "pdf" }
Q:Play some calm classical music?
A:{ "search-type": "music" }
Q:How much did I spend at Subway for dinner last time?
A:{ "search-type": "ledger" }
Q:What was that popular Sri lankan song that Alex had mentioned?
A:{ "search-type": "music" }
Q:Can you recommend a movie to watch from my notes?
A:{ "search-type": "notes" }
Q:When did I buy Groceries last?
A:{ "search-type": "ledger" }
Q:When did I go surfing last?
A:{ "search-type": "notes" }
Q:"""
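These templates are consumed in gpt.py via .format(), replacing the inline f-strings removed above. A small sketch of rendering the answer template; the note text and question are made up:

```python
from khoj.processor.conversation import prompts

# Render the `answer` template the same way gpt.py's answer() now does
prompt = prompts.answer.format(
    text="Lease ends 2024-03-31. 60 day notice required for early termination.",  # made-up note
    user_query="When does my lease end?",
)
print(prompt)  # fully formatted prompt string, ready for completion_with_backoff
```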
@@ -4,6 +4,9 @@ import logging
 from datetime import datetime

 # External Packages
+from langchain.chat_models import ChatOpenAI
+from langchain.llms import OpenAI
+from langchain.schema import ChatMessage
 import openai
 import tiktoken
 from tenacity import (
|
|||
| retry_if_exception_type(openai.error.RateLimitError)
|
||||
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
||||
),
|
||||
wait=wait_random_exponential(min=1, max=30),
|
||||
stop=stop_after_attempt(6),
|
||||
wait=wait_random_exponential(min=1, max=10),
|
||||
stop=stop_after_attempt(3),
|
||||
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
||||
reraise=True,
|
||||
)
|
||||
def completion_with_backoff(**kwargs):
|
||||
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
|
||||
return openai.Completion.create(**kwargs, request_timeout=60)
|
||||
prompt = kwargs.pop("prompt")
|
||||
if "openai_api_key" not in kwargs:
|
||||
kwargs["openai_api_key"] = os.getenv("OPENAI_API_KEY")
|
||||
llm = OpenAI(**kwargs, request_timeout=10, max_retries=1)
|
||||
return llm(prompt)
|
||||
|
||||
|
||||
@retry(
|
||||
|
@@ -50,13 +56,19 @@ def completion_with_backoff(**kwargs):
         | retry_if_exception_type(openai.error.ServiceUnavailableError)
     ),
     wait=wait_exponential(multiplier=1, min=4, max=10),
-    stop=stop_after_attempt(6),
+    stop=stop_after_attempt(3),
     before_sleep=before_sleep_log(logger, logging.DEBUG),
     reraise=True,
 )
-def chat_completion_with_backoff(**kwargs):
-    openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
-    return openai.ChatCompletion.create(**kwargs, request_timeout=60)
+def chat_completion_with_backoff(messages, model_name, temperature, openai_api_key=None):
+    chat = ChatOpenAI(
+        model_name=model_name,
+        temperature=temperature,
+        openai_api_key=openai_api_key or os.getenv("OPENAI_API_KEY"),
+        request_timeout=10,
+        max_retries=1,
+    )
+    return chat(messages).content


 def generate_chatml_messages_with_context(
@@ -64,7 +76,11 @@ def generate_chatml_messages_with_context(
 ):
     """Generate messages for ChatGPT with context from previous conversation"""
     # Extract Chat History for Context
-    chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
+    chat_logs = []
+    for chat in conversation_log.get("chat", []):
+        chat_notes = f'\n\n Notes:\n{chat.get("context")}' if chat.get("context") else "\n"
+        chat_logs += [chat["message"] + chat_notes]

     rest_backnforths = []
     # Extract in reverse chronological order
     for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
@@ -73,17 +89,26 @@ def generate_chatml_messages_with_context(
         rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]

     # Format user and system messages to chatml format
-    system_chatml_message = [message_to_chatml(system_message, "system")]
-    user_chatml_message = [message_to_chatml(user_message, "user")]
+    system_chatml_message = [ChatMessage(content=system_message, role="system")]
+    user_chatml_message = [ChatMessage(content=user_message, role="user")]

-    messages = user_chatml_message + rest_backnforths[:2] + system_chatml_message + rest_backnforths[2:]
+    messages = user_chatml_message + rest_backnforths + system_chatml_message

     # Truncate oldest messages from conversation history until under max supported prompt size by model
     encoder = tiktoken.encoding_for_model(model_name)
-    tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
-    while tokens > max_prompt_size[model_name]:
+    tokens = sum([len(encoder.encode(content)) for message in messages for content in message.content])
+    while tokens > max_prompt_size[model_name] and len(messages) > 1:
         messages.pop()
-        tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
+        tokens = sum([len(encoder.encode(content)) for message in messages for content in message.content])
+
+    # Truncate last message if still over max supported prompt size by model
+    if tokens > max_prompt_size[model_name]:
+        last_message = messages[-1]
+        truncated_message = encoder.decode(encoder.encode(last_message.content))
+        logger.debug(
+            f"Truncate last message to fit within max prompt size of {max_prompt_size[model_name]} supported by {model_name} model:\n {truncated_message}"
+        )
+        messages = [ChatMessage(content=[truncated_message], role=last_message.role)]

     # Return message in chronological order
     return messages[::-1]
@@ -91,12 +116,7 @@ def generate_chatml_messages_with_context(

 def reciprocal_conversation_to_chatml(message_pair):
     """Convert a single back and forth between user and assistant to chatml format"""
-    return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]
-
-
-def message_to_chatml(message, role="assistant"):
-    """Create chatml message from message and role"""
-    return {"role": role, "content": message}
+    return [ChatMessage(content=message, role=role) for message, role in zip(message_pair, ["user", "assistant"])]


 def message_to_prompt(
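After this rewrite both helpers return plain strings instead of raw OpenAI response objects, which is what the gpt.py changes above rely on. A usage sketch of both call shapes, assuming a valid OPENAI_API_KEY in the environment:

```python
from langchain.schema import ChatMessage
from khoj.processor.conversation.utils import completion_with_backoff, chat_completion_with_backoff

# Completion-style call: keyword arguments are forwarded to langchain.llms.OpenAI
reply = completion_with_backoff(
    prompt="Q: When did I go surfing last?\nA:",
    model_name="text-davinci-003",
    temperature=0,
    max_tokens=100,
)

# Chat-style call: messages are langchain ChatMessage objects, result is the reply text
answer = chat_completion_with_backoff(
    messages=[
        ChatMessage(content="You are Khoj, a friendly, smart and helpful personal assistant.", role="system"),
        ChatMessage(content="What is my next rent due date?", role="user"),
    ],
    model_name="gpt-3.5-turbo",
    temperature=0.2,
)
print(reply, answer)
```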
src/khoj/processor/pdf/__init__.py (new, empty file)

src/khoj/processor/pdf/pdf_to_jsonl.py (new file, +131 lines)
@@ -0,0 +1,131 @@
# Standard Packages
import glob
import logging
from pathlib import Path
from typing import List

# External Packages
from langchain.document_loaders import PyPDFLoader

# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry


logger = logging.getLogger(__name__)


class PdfToJsonl(TextToJsonl):
    # Define Functions
    def process(self, previous_entries=None):
        # Extract required fields from config
        pdf_files, pdf_file_filter, output_file = (
            self.config.input_files,
            self.config.input_filter,
            self.config.compressed_jsonl,
        )

        # Input Validation
        if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
            print("At least one of pdf-files or pdf-file-filter is required to be specified")
            exit(1)

        # Get Pdf Files to Process
        pdf_files = PdfToJsonl.get_pdf_files(pdf_files, pdf_file_filter)

        # Extract Entries from specified Pdf files
        with timer("Parse entries from PDF files into dictionaries", logger):
            current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(pdf_files))

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = self.mark_entries_for_update(
                    current_entries, previous_entries, key="compiled", logger=logger
                )

        with timer("Write PDF entries to JSONL file", logger):
            # Process Each Entry from All Notes Files
            entries = list(map(lambda entry: entry[1], entries_with_ids))
            jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)

            # Compress JSONL formatted Data
            if output_file.suffix == ".gz":
                compress_jsonl_data(jsonl_data, output_file)
            elif output_file.suffix == ".jsonl":
                dump_jsonl(jsonl_data, output_file)

        return entries_with_ids

    @staticmethod
    def get_pdf_files(pdf_files=None, pdf_file_filters=None):
        "Get PDF files to process"
        absolute_pdf_files, filtered_pdf_files = set(), set()
        if pdf_files:
            absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
        if pdf_file_filters:
            filtered_pdf_files = {
                filtered_file
                for pdf_file_filter in pdf_file_filters
                for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            }

        all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

        files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}

        if any(files_with_non_pdf_extensions):
            logger.warn(f"[Warning] There may be non-pdf files in the input set: {files_with_non_pdf_extensions}")

        logger.debug(f"Processing files: {all_pdf_files}")

        return all_pdf_files

    @staticmethod
    def extract_pdf_entries(pdf_files):
        """Extract entries by page from specified PDF files"""

        entries = []
        entry_to_location_map = []
        for pdf_file in pdf_files:
            loader = PyPDFLoader(pdf_file)
            pdf_entries_per_file = [page.page_content for page in loader.load()]
            entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
            entries.extend(pdf_entries_per_file)

        return entries, dict(entry_to_location_map)

    @staticmethod
    def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
        "Convert each PDF entry into a dictionary"
        entries = []
        for parsed_entry in parsed_entries:
            entry_filename = Path(entry_to_file_map[parsed_entry])
            # Append base filename to compiled entry for context to model
            heading = f"{entry_filename.stem}\n"
            compiled_entry = f"{heading}{parsed_entry}"
            entries.append(
                Entry(
                    compiled=compiled_entry,
                    raw=parsed_entry,
                    heading=heading,
                    file=f"{entry_filename}",
                )
            )

        logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")

        return entries

    @staticmethod
    def convert_pdf_maps_to_jsonl(entries: List[Entry]):
        "Convert each PDF entry to JSON and collate as JSONL"
        return "".join([f"{entry.to_json()}\n" for entry in entries])
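The processor can also be driven standalone, which is how the new tests further below exercise it. A minimal sketch; the PDF path is a placeholder:

```python
import json
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl

# Extract one entry per PDF page, then compile entries to JSONL,
# mirroring the flow of the new tests in tests/test_pdf_to_jsonl.py
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["/path/to/document.pdf"])  # placeholder path
jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
    PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
)
for line in jsonl_string.splitlines():
    entry = json.loads(line)  # one JSON object per page entry
    print(line[:100])
```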
@@ -109,6 +109,17 @@ def search(
         with timer("Collating results took", logger):
             results = text_search.collate_results(hits, entries, results_count)

+    elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
+        # query pdf files
+        with timer("Query took", logger):
+            hits, entries = text_search.query(
+                user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+            )
+
+        # collate and return results
+        with timer("Collating results took", logger):
+            results = text_search.collate_results(hits, entries, results_count)
+
     elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
         # query transactions
         with timer("Query took", logger):
@@ -22,6 +22,7 @@ class SearchType(str, Enum):
     Music = "music"
     Markdown = "markdown"
     Image = "image"
+    Pdf = "pdf"


 class ProcessorType(str, Enum):
@@ -61,6 +62,7 @@ class SearchModels:
     ledger_search: TextSearchModel = None
     music_search: TextSearchModel = None
     markdown_search: TextSearchModel = None
+    pdf_search: TextSearchModel = None
     image_search: ImageSearchModel = None
     plugin_search: Dict[str, TextSearchModel] = None
@@ -28,6 +28,12 @@ default_config = {
         "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
         "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
     },
+    "pdf": {
+        "input-files": None,
+        "input-filter": None,
+        "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
+        "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
+    },
     "image": {
         "input-directories": None,
         "input-filter": None,
@@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
     image: Optional[ImageContentConfig]
     music: Optional[TextContentConfig]
     markdown: Optional[TextContentConfig]
+    pdf: Optional[TextContentConfig]
     plugins: Optional[Dict[str, TextContentConfig]]
tests/data/pdf/multipage.pdf (new binary file, not shown)
tests/data/pdf/singlepage.pdf (new binary file, not shown)
@@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_search_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/search?q=random&t={content_type}")
         # Assert
|
|||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_update_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?t={content_type}")
|
||||
# Assert
|
||||
|
@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):
|
|||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_regenerate_with_valid_content_type(client):
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
|
||||
for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
|
||||
# Act
|
||||
response = client.get(f"/api/update?force=true&t={content_type}")
|
||||
# Assert
|
||||
|
|
tests/test_pdf_to_jsonl.py (new file, +74 lines)
@@ -0,0 +1,74 @@
# Standard Packages
import json

# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl


def test_single_page_pdf_to_jsonl():
    "Convert single page PDF file to jsonl."
    # Act
    # Extract Entries from specified Pdf files
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"])

    # Process Each Entry from All Pdf Files
    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1


def test_multi_page_pdf_to_jsonl():
    "Convert multiple pages from single PDF file to jsonl."
    # Act
    # Extract Entries from specified Pdf files
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])

    # Process Each Entry from All Pdf Files
    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 6


def test_get_pdf_files(tmp_path):
    "Ensure Pdf files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
    group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
    group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
    group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
    # Include via input-file field
    file1 = create_file(tmp_path, filename="document.pdf")
    # Not included by any filter
    create_file(tmp_path, filename="not-included-document.pdf")
    create_file(tmp_path, filename="not-included-text.txt")

    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / "document.pdf"]
    input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]

    # Act
    extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter)

    # Assert
    assert len(extracted_pdf_files) == 5
    assert extracted_pdf_files == expected_files


# Helper Functions
def create_file(tmp_path, entry=None, filename="document.pdf"):
    pdf_file = tmp_path / filename
    pdf_file.touch()
    if entry:
        pdf_file.write_text(entry)
    return pdf_file