Mirror of https://github.com/khoj-ai/khoj.git
Search PDF files with Khoj. Integrate with LangChain
- **Introduce Khoj to LangChain**: Call GPT with LangChain for Khoj Chat
- **Search (and Chat about) PDF files with Khoj**
  - Create PDF to JSONL Processor: Convert PDF content into standardized JSONL format
  - Expose PDF search type via Khoj server API
  - Enable querying PDF files via Obsidian, Emacs and Web interfaces
Commit: e022910f31
24 changed files with 608 additions and 200 deletions
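The new `PdfToJsonl` processor itself is not shown in the hunks below; only its imports and call sites are. As a rough, hypothetical sketch of the idea it implements, turning PDF content into standardized JSONL entries with the `pypdf` dependency this commit adds, it might look like the following. The function name and entry fields are illustrative (chosen to match the `compiled`/`file` keys the interfaces read), not the processor's actual internals:

```python
# Hypothetical sketch of PDF -> JSONL conversion, in the spirit of the
# PdfToJsonl processor this commit introduces. Not the actual processor code.
import json
from pypdf import PdfReader  # pypdf >= 3.9.0 is added to dependencies below

def pdf_to_jsonl(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    entries = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text() or ""
        if not text.strip():
            continue  # skip pages without extractable text
        entries.append({
            # Filename on the first line, matching what render_pdf() splits off
            "compiled": f"{pdf_path}\n{text}",
            "raw": text,
            "file": pdf_path,
            "page": page_number,  # illustrative field, not in the real schema
        })
    # JSONL: one JSON object per line
    return "\n".join(json.dumps(entry) for entry in entries)
```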
@@ -63,7 +63,7 @@
 - **General**
 - **Natural**: Advanced natural language understanding using Transformer based ML Models
 - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
-- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions and Photos
+- **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos
 - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)

 ## Demos
@@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 - Install Khoj via `pip` and start Khoj backend in non-gui mode
 - Install Khoj plugin via Community Plugins settings pane on Obsidian app
 - Check the new Khoj plugin settings
-- Let Khoj backend index the markdown files in the current Vault
+- Let Khoj backend index the markdown, pdf files in the current Vault
 - Open Khoj plugin on Obsidian via Search button on Left Pane
 - Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj

 ##### 2. Configure

-- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes and beancount directories
+- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories
 - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml)

 ##### 3. Run
@@ -21,6 +21,7 @@ services:
       - ./tests/data/ledger/:/data/ledger/
       - ./tests/data/music/:/data/music/
       - ./tests/data/markdown/:/data/markdown/
+      - ./tests/data/pdf/:/data/pdf/
       # Embeddings and models are populated after the first run
       # You can set these volumes to point to empty directories on host
       - ./tests/data/embeddings/:/data/embeddings/
@@ -21,6 +21,7 @@ keywords = [
     "markdown",
     "beancount",
     "images",
+    "pdf",
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -44,7 +45,7 @@ dependencies = [
     "tiktoken >= 0.3.0",
     "tenacity >= 8.2.2",
     "pillow == 9.3.0",
-    "pydantic == 1.9.1",
+    "pydantic >= 1.9.1",
     "pyqt6 == 6.3.1",
     "pyyaml == 6.0",
     "rich >= 13.3.1",
@@ -53,6 +54,8 @@ dependencies = [
     "torch == 1.13.1",
     "uvicorn == 0.17.6",
     "aiohttp == 3.8.4",
+    "langchain >= 0.0.187",
+    "pypdf >= 3.9.0",
 ]
 dynamic = ["version"]
@@ -4,7 +4,7 @@

 ;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
 ;; Description: A search assistant for your second brain
-;; Keywords: search, chat, org-mode, outlines, markdown, beancount, image
+;; Keywords: search, chat, org-mode, outlines, markdown, pdf, beancount, image
 ;; Version: 0.6.2
 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
 ;; URL: https://github.com/debanjum/khoj/tree/master/src/interface/emacs
@@ -29,8 +29,8 @@
 ;;; Commentary:

 ;; Create a search assistant for your `org-mode', `markdown' notes,
-;; `beancount' transactions and images. This package exposes two
-;; assistance modes, search and chat:
+;; `beancount' transactions, PDFs and images. This package exposes
+;; two assistance modes, search and chat:
 ;;
 ;; Chat provides faster answers, iterative discovery and assisted
 ;; creativity. It requires your OpenAI API key to access GPT models
@@ -95,6 +95,7 @@
 (const "markdown")
 (const "ledger")
 (const "image")
+(const "pdf")
 (const "music")))
@@ -140,6 +141,8 @@ NO-PAGING FILTER))
 "C-x l | ledger\n")
 (when (member 'image enabled-content-types)
 "C-x i | image\n")
+(when (member 'pdf enabled-content-types)
+"C-x p | pdf\n")
 (when (member 'music enabled-content-types)
 "C-x M | music\n"))))
@@ -150,6 +153,7 @@ NO-PAGING FILTER))
 (defun khoj--search-ledger () "Set content-type to `ledger'." (interactive) (setq khoj--content-type "ledger"))
 (defun khoj--search-images () "Set content-type to image." (interactive) (setq khoj--content-type "image"))
 (defun khoj--search-music () "Set content-type to music." (interactive) (setq khoj--content-type "music"))
+(defun khoj--search-pdf () "Set content-type to pdf." (interactive) (setq khoj--content-type "pdf"))
 (defun khoj--improve-rank () "Use cross-encoder to rerank search results." (interactive) (khoj--incremental-search t))
 (defun khoj--make-search-keymap (&optional existing-keymap)
 "Setup keymap to configure Khoj search. Build of EXISTING-KEYMAP when passed."
@@ -164,6 +168,8 @@ NO-PAGING FILTER))
 (define-key kmap (kbd "C-x l") #'khoj--search-ledger))
 (when (member 'image enabled-content-types)
 (define-key kmap (kbd "C-x i") #'khoj--search-images))
+(when (member 'pdf enabled-content-types)
+(define-key kmap (kbd "C-x p") #'khoj--search-pdf))
 (when (member 'music enabled-content-types)
 (define-key kmap (kbd "C-x M") #'khoj--search-music))
 kmap))
@@ -544,6 +550,22 @@ CONFIG is json obtained from Khoj config API."
 ;; remove trailing (, ) or SPC from extracted entries string
 (replace-regexp-in-string "[\(\) ]$" "")))

+(defun khoj--extract-entries-as-pdf (json-response query)
+  "Convert QUERY, JSON-RESPONSE from API with PDF results to `org-mode' entries."
+  (thread-last
+   json-response
+   ;; Extract and render each pdf entry from response
+   (mapcar (lambda (json-response-item)
+             (thread-last
+              ;; Extract pdf entry from each item in json response
+              (cdr (assoc 'compiled (assoc 'additional json-response-item)))
+              ;; Format pdf entry as a org entry string
+              (format "** %s\n\n"))))
+   ;; Render entries into org formatted string with query set as as top level heading
+   (format "* %s\n%s\n" query)
+   ;; remove leading (, ) or SPC from extracted entries string
+   (replace-regexp-in-string "^[\(\) ]" "")))
+
 (defun khoj--extract-entries-as-images (json-response query)
 "Convert JSON-RESPONSE, QUERY from API to html with images."
 (let ((image-results-buffer-html-format-str "<html>\n<body>\n<h1>%s</h1>%s\n\n</body>\n</html>")
@@ -592,6 +614,7 @@ CONFIG is json obtained from Khoj config API."
 ((and (member 'music enabled-content-types) (equal buffer-name "Music.org")) "music")
 ((and (member 'ledger enabled-content-types) (or (equal file-extension "bean") (equal file-extension "beancount"))) "ledger")
 ((and (member 'org enabled-content-types) (equal file-extension "org")) "org")
+((and (member 'org enabled-content-types) (equal file-extension "pdf")) "pdf")
 ((and (member 'markdown enabled-content-types) (or (equal file-extension "markdown") (equal file-extension "md"))) "markdown")
 (t khoj-default-content-type))))
@@ -647,16 +670,19 @@ Render results in BUFFER-NAME using QUERY, CONTENT-TYPE."
     (insert
      (cond ((or (equal content-type "org") (equal content-type "music")) (khoj--extract-entries-as-org json-response query))
            ((equal content-type "markdown") (khoj--extract-entries-as-markdown json-response query))
+           ((equal content-type "pdf") (khoj--extract-entries-as-pdf json-response query))
            ((equal content-type "ledger") (khoj--extract-entries-as-ledger json-response query))
            ((equal content-type "image") (khoj--extract-entries-as-images json-response query))
            (t (khoj--extract-entries json-response query))))
-    (cond ((equal content-type "org") (progn (visual-line-mode)
-                                             (org-mode)
-                                             (setq-local
-                                              org-startup-folded "showall"
-                                              org-hide-leading-stars t
-                                              org-startup-with-inline-images t)
-                                             (org-set-startup-visibility)))
+    (cond ((or (equal content-type "pdf")
+               (equal content-type "org"))
+           (progn (visual-line-mode)
+                  (org-mode)
+                  (setq-local
+                   org-startup-folded "showall"
+                   org-hide-leading-stars t
+                   org-startup-with-inline-images t)
+                  (org-set-startup-visibility)))
           ((equal content-type "markdown") (progn (markdown-mode)
                                                   (visual-line-mode)))
           ((equal content-type "ledger") (beancount-mode))
@@ -973,7 +999,7 @@ Paragraph only starts at first text after blank line."
 ;; set content type to: last used > based on current buffer > default type
 :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
 ;; dynamically set choices to content types enabled on khoj backend
-:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "ledger" "music" "image")))
+:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("org" "markdown" "pdf" "ledger" "music" "image")))

 (transient-define-suffix khoj--search-command (&optional args)
 (interactive (list (transient-args transient-current-command)))
@@ -42,7 +42,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 1. Install Khoj via `pip` and start Khoj backend in non-gui mode
 2. Install Khoj plugin via Community Plugins settings pane on Obsidian app
 3. Check the new Khoj plugin settings
-4. Wait for Khoj backend to index markdown files in the current Vault
+4. Wait for Khoj backend to index markdown, PDF files in the current Vault
 5. Open Khoj plugin on Obsidian via Search button on Left Pane
 6. Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 7. Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)
@@ -151,7 +151,7 @@ The plugin implements the following functionality to search your notes with Khoj
 - [X] Open the Khoj search modal via left ribbon icon or the *Khoj: Search* command
 - [X] Render results as Markdown preview to improve readability
 - [X] Configure Khoj via the plugin setting tab on the settings page
-  - Set Obsidian Vault to Index with Khoj. Defaults to all markdown files in current Vault
+  - Set Obsidian Vault to Index with Khoj. Defaults to all markdown, PDF files in current Vault
   - Set URL of Khoj backend
   - Set Number of Search Results to show in Search Modal
 - [X] Allow reranking of result to improve search quality
@@ -89,12 +89,24 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
     async getSuggestions(query: string): Promise<SearchResult[]> {
         // Query Khoj backend for search results
         let encodedQuery = encodeURIComponent(query);
-        let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}&t=markdown`;
-        let response = await request(searchUrl);
-        let data = JSON.parse(response);
-        let results = data
+        let searchUrl = `${this.setting.khojUrl}/api/search?q=${encodedQuery}&n=${this.setting.resultsCount}&r=${this.rerank}`;
+
+        // Get search results for markdown and pdf files
+        let mdResponse = await request(`${searchUrl}&t=markdown`);
+        let pdfResponse = await request(`${searchUrl}&t=pdf`);
+
+        // Parse search results
+        let mdData = JSON.parse(mdResponse)
             .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
-            .map((result: any) => { return { entry: result.entry, file: result.additional.file } as SearchResult; });
+            .map((result: any) => { return { entry: result.entry, score: result.score, file: result.additional.file }; });
+        let pdfData = JSON.parse(pdfResponse)
+            .filter((result: any) => !this.find_similar_notes || !result.additional.file.endsWith(this.app.workspace.getActiveFile()?.path))
+            .map((result: any) => { return { entry: `## ${result.additional.compiled}`, score: result.score, file: result.additional.file } as SearchResult; })
+
+        // Combine markdown and PDF results and sort them by score
+        let results = mdData.concat(pdfData)
+            .sort((a: any, b: any) => b.score - a.score)
+            .map((result: any) => { return { entry: result.entry, file: result.file } as SearchResult; })

         this.query = query;
         return results;
@@ -124,11 +136,12 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {
     }

     async onChooseSuggestion(result: SearchResult, _: MouseEvent | KeyboardEvent) {
-        // Get all markdown files in vault
+        // Get all markdown and PDF files in vault
         const mdFiles = this.app.vault.getMarkdownFiles();
+        const pdfFiles = this.app.vault.getFiles().filter(file => file.extension === 'pdf');

         // Find the vault file matching file of chosen search result
-        let file_match = mdFiles
+        let file_match = mdFiles.concat(pdfFiles)
             // Sort by descending length of path
             // This finds longest path match when multiple files have same name
             .sort((a, b) => b.path.length - a.path.length)
@@ -138,7 +151,7 @@ export class KhojSearchModal extends SuggestModal<SearchResult> {

         // Open vault file at heading of chosen search result
         if (file_match) {
-            let resultHeading = result.entry.split('\n', 1)[0];
+            let resultHeading = file_match.extension !== 'pdf' ? result.entry.split('\n', 1)[0] : '';
             let linkToEntry = `${file_match.path}${resultHeading}`
             this.app.workspace.openLinkText(linkToEntry, '');
             console.log(`Link: ${linkToEntry}, File: ${file_match.path}, Heading: ${resultHeading}`);
@@ -108,6 +108,7 @@ export class KhojSettingTab extends PluginSettingTab {
         this.plugin.registerInterval(progress_indicator);

         await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true`);
+        await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true`);
         new Notice('✅ Updated Khoj index.');

         // Reset button once index is updated
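Outside Obsidian, the same reindex can be triggered directly against the server. A small sketch, assuming a Khoj backend at `http://localhost:8000` (substitute your configured `khojUrl`):

```python
# Sketch: force a reindex of markdown and pdf content via the Khoj server API,
# mirroring the two request() calls the settings tab now makes.
import requests

khoj_url = "http://localhost:8000"  # assumption; use your configured Khoj URL
for content_type in ("markdown", "pdf"):
    response = requests.get(f"{khoj_url}/api/update", params={"t": content_type, "force": "true"})
    response.raise_for_status()
```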
@@ -12,6 +12,7 @@ export function getVaultAbsolutePath(vault: Vault): string {
 export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
     let vaultPath = getVaultAbsolutePath(vault);
     let mdInVault = `${vaultPath}/**/*.md`;
+    let pdfInVault = `${vaultPath}/**/*.pdf`;
     let khojConfigUrl = `${setting.khojUrl}/api/config/data`;

     // Check if khoj backend is configured, note if cannot connect to backend
@@ -32,7 +33,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
     let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
     // Get default config fields from khoj backend
     let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
-    let khojDefaultIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
+    let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
+    let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
     let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
     let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["model"];
@@ -47,8 +49,14 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
             "markdown": {
                 "input-filter": [mdInVault],
                 "input-files": null,
-                "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
-                "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
+                "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
+                "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
+            },
+            "pdf": {
+                "input-filter": [pdfInVault],
+                "input-files": null,
+                "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
+                "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
             }
         }
     }
@@ -59,8 +67,8 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
         data["content-type"]["markdown"] = {
             "input-filter": [mdInVault],
             "input-files": null,
-            "embeddings-file": `${khojDefaultIndexDirectory}/${indexName}.pt`,
-            "compressed-jsonl": `${khojDefaultIndexDirectory}/${indexName}.jsonl.gz`,
+            "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
         }
     }
     // Else if khoj is not configured to index markdown files in configured obsidian vault
@@ -68,12 +76,37 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
         data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
         // Update markdown config in khoj content-type config
         // Set markdown config to only index markdown files in configured obsidian vault
-        let khojIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
+        let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
         data["content-type"]["markdown"] = {
             "input-filter": [mdInVault],
             "input-files": null,
-            "embeddings-file": `${khojIndexDirectory}/${indexName}.pt`,
-            "compressed-jsonl": `${khojIndexDirectory}/${indexName}.jsonl.gz`,
+            "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
+        }
+    }
+
+    if (khoj_already_configured && !data["content-type"]["pdf"]) {
+        // Add pdf config to khoj content-type config
+        // Set pdf config to index pdf files in configured obsidian vault
+        data["content-type"]["pdf"] = {
+            "input-filter": [pdfInVault],
+            "input-files": null,
+            "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
+        }
+    }
+    // Else if khoj is not configured to index pdf files in configured obsidian vault
+    else if (khoj_already_configured &&
+        (data["content-type"]["pdf"]["input-filter"].length != 1 ||
+            data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
+        // Update pdf config in khoj content-type config
+        // Set pdf config to only index pdf files in configured obsidian vault
+        let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
+        data["content-type"]["pdf"] = {
+            "input-filter": [pdfInVault],
+            "input-files": null,
+            "embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
+            "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
         }
     }
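For reference, the `pdf` section the plugin grafts into the backend config mirrors the `markdown` one. A hedged Python rendering of that merge logic follows; the server address, vault path, and index name are illustrative, and saving the patched config back is outside this hunk:

```python
# Sketch of the config merge configureKhojBackend() performs for pdf files.
import requests

khoj_url = "http://localhost:8000"   # assumption; use your configured Khoj URL
vault_path = "/home/user/vault"      # illustrative vault location
index_name = "home_user_vault"       # vault path with separators replaced by '_'

data = requests.get(f"{khoj_url}/api/config/data").json()
default = requests.get(f"{khoj_url}/api/config/data/default").json()
# Assumes the index directory is the parent of the default embeddings file
pdf_index_dir = default["content-type"]["pdf"]["embeddings-file"].rsplit("/", 1)[0]

if not data["content-type"].get("pdf"):
    data["content-type"]["pdf"] = {
        "input-filter": [f"{vault_path}/**/*.pdf"],
        "input-files": None,
        "embeddings-file": f"{pdf_index_dir}/{index_name}.pt",
        "compressed-jsonl": f"{pdf_index_dir}/{index_name}.jsonl.gz",
    }
```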
@@ -15,6 +15,7 @@ from khoj.processor.ledger.beancount_to_jsonl import BeancountToJsonl
 from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@@ -132,6 +133,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             filters=[DateFilter(), WordFilter(), FileFilter()],
         )

+    # Initialize PDF Search
+    if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf:
+        logger.info("💸 Setting up search for pdf")
+        # Extract Entries, Generate PDF Embeddings
+        model.pdf_search = text_search.setup(
+            PdfToJsonl,
+            config.content_type.pdf,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[DateFilter(), WordFilter(), FileFilter()],
+        )
+
     # Initialize Image Search
     if (t == state.SearchType.Image or t == None) and config.content_type.image:
         logger.info("🌄 Setting up search for images")
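With PDF search wired into `configure_search`, the type is queryable through the same `/api/search` endpoint the other content types use (the Obsidian changes above hit it with `t=pdf`). A hedged sketch of such a query, with the server address assumed:

```python
# Sketch: query the new pdf search type over the Khoj server API, the same way
# the Obsidian plugin's `${searchUrl}&t=pdf` request does.
import requests

khoj_url = "http://localhost:8000"  # assumption; use your running server's URL
params = {"q": "what did the lease say about early termination", "n": 6, "t": "pdf"}
results = requests.get(f"{khoj_url}/api/search", params=params).json()
for result in results:
    # entry and additional.file are the fields the web and Obsidian UIs read
    print(result["additional"]["file"], "->", result["entry"][:80])
```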
@@ -42,6 +42,8 @@ class FileBrowser(QtWidgets.QWidget):
             return "Beancount Files (*.bean *.beancount)"
         elif search_type == SearchType.Markdown:
             return "Markdown Files (*.md *.markdown)"
+        elif search_type == SearchType.Pdf:
+            return "Pdf Files (*.pdf)"
         elif search_type == SearchType.Music:
             return "Org-Music Files (*.org)"
         elif search_type == SearchType.Image:
@@ -44,6 +44,15 @@
         }).join("\n") + `</div>`;
     }

+    function render_pdf(query, data) {
+        return `<div id="results-pdf">` + data.map(function (item) {
+            let compiled_lines = item.additional.compiled.split("\n");
+            let filename = compiled_lines.shift();
+            let text_match = compiled_lines.join("\n")
+            return `<h2>${filename}</h2>\n<p>${text_match}</p>`
+        }).join("\n") + `</div>`;
+    }
+
     function render_json(data, query, type) {
         if (type === "markdown") {
             return render_markdown(query, data);
@@ -55,6 +64,8 @@
             return data.map(render_image).join('');
         } else if (type === "ledger") {
             return render_ledger(query, data);
+        } else if (type === "pdf") {
+            return render_pdf(query, data);
         } else {
             return `<div id="results-plugin">`
                 + data.map((item) => `<p>${item.entry}</p>`).join("\n")
@@ -279,6 +290,7 @@
         #json {
             white-space: pre-wrap;
         }
+        #results-pdf,
         #results-plugin,
         #results-ledger {
             text-align: left;
@@ -5,10 +5,10 @@ from datetime import datetime

 # Internal Packages
 from khoj.utils.constants import empty_escape_sequences
+from khoj.processor.conversation import prompts
 from khoj.processor.conversation.utils import (
     chat_completion_with_backoff,
     completion_with_backoff,
-    message_to_prompt,
     generate_chatml_messages_with_context,
 )
@@ -20,22 +20,14 @@ def answer(text, user_query, model, api_key=None, temperature=0.5, max_tokens=50
     """
     Answer user query using provided text as reference with OpenAI's GPT
     """
-    # Setup Prompt based on Summary Type
-    prompt = f"""
-You are a friendly, helpful personal assistant.
-Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
-
-Notes:
-{text}
-
-Question: {user_query}
-
-Answer (in second person):"""
+    # Setup Prompt from arguments
+    prompt = prompts.answer.format(text=text, user_query=user_query)
+
     # Get Response from GPT
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         stop='"""',
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract, Clean Message from GPT's Response
|
# Extract, Clean Message from GPT's Response
|
||||||
story = response["choices"][0]["text"]
|
return str(response).replace("\n\n", "")
|
||||||
return str(story).replace("\n\n", "")
|
|
||||||
|
|
||||||
|
|
||||||
def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
|
def summarize(text, summary_type, model, user_query=None, api_key=None, temperature=0.5, max_tokens=200):
|
||||||
|
@ -53,25 +44,15 @@ def summarize(text, summary_type, model, user_query=None, api_key=None, temperat
|
||||||
"""
|
"""
|
||||||
# Setup Prompt based on Summary Type
|
# Setup Prompt based on Summary Type
|
||||||
if summary_type == "chat":
|
if summary_type == "chat":
|
||||||
prompt = f"""
|
prompt = prompts.summarize_chat.format(text=text)
|
||||||
You are an AI. Summarize the conversation below from your perspective:
|
|
||||||
|
|
||||||
{text}
|
|
||||||
|
|
||||||
Summarize the conversation from the AI's first-person perspective:"""
|
|
||||||
elif summary_type == "notes":
|
elif summary_type == "notes":
|
||||||
prompt = f"""
|
prompt = prompts.summarize_notes.format(text=text, user_query=user_query)
|
||||||
Summarize the below notes about {user_query}:
|
|
||||||
|
|
||||||
{text}
|
|
||||||
|
|
||||||
Summarize the notes in second person perspective:"""
|
|
||||||
|
|
||||||
# Get Response from GPT
|
# Get Response from GPT
|
||||||
logger.debug(f"Prompt for GPT: {prompt}")
|
logger.debug(f"Prompt for GPT: {prompt}")
|
||||||
response = completion_with_backoff(
|
response = completion_with_backoff(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
model=model,
|
model_name=model,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
frequency_penalty=0.2,
|
frequency_penalty=0.2,
|
||||||
|
@ -80,8 +61,7 @@ Summarize the notes in second person perspective:"""
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract, Clean Message from GPT's Response
|
# Extract, Clean Message from GPT's Response
|
||||||
story = response["choices"][0]["text"]
|
return str(response).replace("\n\n", "")
|
||||||
return str(story).replace("\n\n", "")
|
|
||||||
|
|
||||||
|
|
||||||
def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
|
def extract_questions(text, model="text-davinci-003", conversation_log={}, api_key=None, temperature=0, max_tokens=100):
|
||||||
|
@ -102,68 +82,21 @@ def extract_questions(text, model="text-davinci-003", conversation_log={}, api_k
|
||||||
current_new_year = today.replace(month=1, day=1)
|
current_new_year = today.replace(month=1, day=1)
|
||||||
last_new_year = current_new_year.replace(year=today.year - 1)
|
last_new_year = current_new_year.replace(year=today.year - 1)
|
||||||
|
|
||||||
prompt = f"""
|
prompt = prompts.extract_questions.format(
|
||||||
You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the users notes.
|
current_date=today.strftime("%A, %Y-%m-%d"),
|
||||||
- The user will provide their questions and answers to you for context.
|
last_new_year=last_new_year.strftime("%Y"),
|
||||||
- Add as much context from the previous questions and answers as required into your search queries.
|
last_new_year_date=last_new_year.strftime("%Y-%m-%d"),
|
||||||
- Break messages into multiple search queries when required to retrieve the relevant information.
|
current_new_year_date=current_new_year.strftime("%Y-%m-%d"),
|
||||||
- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
|
bob_tom_age_difference={current_new_year.year - 1984 - 30},
|
||||||
|
bob_age={current_new_year.year - 1984},
|
||||||
What searches, if any, will you need to perform to answer the users question?
|
chat_history=chat_history,
|
||||||
Provide search queries as a JSON list of strings
|
text=text,
|
||||||
Current Date: {today.strftime("%A, %Y-%m-%d")}
|
)
|
||||||
|
|
||||||
Q: How was my trip to Cambodia?
|
|
||||||
|
|
||||||
["How was my trip to Cambodia?"]
|
|
||||||
|
|
||||||
A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
|
|
||||||
|
|
||||||
Q: Who did i visit that temple with?
|
|
||||||
|
|
||||||
["Who did I visit the Angkor Wat Temple in Cambodia with?"]
|
|
||||||
|
|
||||||
A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
|
|
||||||
|
|
||||||
Q: What national parks did I go to last year?
|
|
||||||
|
|
||||||
["National park I visited in {last_new_year.strftime("%Y")} dt>=\\"{last_new_year.strftime("%Y-%m-%d")}\\" dt<\\"{current_new_year.strftime("%Y-%m-%d")}\\""]
|
|
||||||
|
|
||||||
A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year.strftime("%Y")}.
|
|
||||||
|
|
||||||
Q: How are you feeling today?
|
|
||||||
|
|
||||||
[]
|
|
||||||
|
|
||||||
A: I'm feeling a little bored. Helping you will hopefully make me feel better!
|
|
||||||
|
|
||||||
Q: How many tennis balls fit in the back of a 2002 Honda Civic?
|
|
||||||
|
|
||||||
["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
|
|
||||||
|
|
||||||
A: 1085 tennis balls will fit in the trunk of a Honda Civic
|
|
||||||
|
|
||||||
Q: Is Bob older than Tom?
|
|
||||||
|
|
||||||
["When was Bob born?", "What is Tom's age?"]
|
|
||||||
|
|
||||||
A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
|
|
||||||
|
|
||||||
Q: What is their age difference?
|
|
||||||
|
|
||||||
["What is Bob's age?", "What is Tom's age?"]
|
|
||||||
|
|
||||||
A: Bob is {current_new_year.year - 1984 - 30} years older than Tom. As Bob is {current_new_year.year - 1984} years old and Tom is 30 years old.
|
|
||||||
|
|
||||||
{chat_history}
|
|
||||||
Q: {text}
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Get Response from GPT
|
# Get Response from GPT
|
||||||
response = completion_with_backoff(
|
response = completion_with_backoff(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
model=model,
|
model_name=model,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
stop=["A: ", "\n"],
|
stop=["A: ", "\n"],
|
||||||
|
@@ -171,17 +104,16 @@ Q: {text}
     )

     # Extract, Clean Message from GPT's Response
-    response_text = response["choices"][0]["text"]
     try:
         questions = json.loads(
             # Clean response to increase likelihood of valid JSON. E.g replace ' with " to enclose strings
-            response_text.strip(empty_escape_sequences)
+            response.strip(empty_escape_sequences)
             .replace("['", '["')
             .replace("']", '"]')
             .replace("', '", '", "')
         )
     except json.decoder.JSONDecodeError:
-        logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response_text}")
+        logger.warn(f"GPT returned invalid JSON. Falling back to using user message as search query.\n{response}")
         questions = [text]
     logger.debug(f"Extracted Questions by GPT: {questions}")
     return questions
@@ -191,31 +123,8 @@ def extract_search_type(text, model, api_key=None, temperature=0.5, max_tokens=1
     """
     Extract search type from user query using OpenAI's GPT
     """
-    # Initialize Variables
-    understand_primer = """
-Objective: Extract search type from user query and return information as JSON
-
-Allowed search types are listed below:
-- search-type=["notes","ledger","image","music"]
-
-Some examples are given below for reference:
-Q:What fiction book was I reading last week about AI starship?
-A:{ "search-type": "notes" }
-Q:Play some calm classical music?
-A:{ "search-type": "music" }
-Q:How much did I spend at Subway for dinner last time?
-A:{ "search-type": "ledger" }
-Q:What was that popular Sri lankan song that Alex had mentioned?
-A:{ "search-type": "music" }
-Q:Can you recommend a movie to watch from my notes?
-A:{ "search-type": "notes" }
-Q: When did I buy Groceries last?
-A:{ "search-type": "ledger" }
-Q:When did I go surfing last?
-A:{ "search-type": "notes" }"""
-
-    # Setup Prompt with Understand Primer
-    prompt = message_to_prompt(text, understand_primer, start_sequence="\nA:", restart_sequence="\nQ:")
+    # Setup Prompt to extract search type
+    prompt = prompts.search_type + f"{text}\nA:"
     if verbose > 1:
         print(f"Message -> Prompt: {text} -> {prompt}")
@@ -223,7 +132,7 @@ A:{ "search-type": "notes" }"""
     logger.debug(f"Prompt for GPT: {prompt}")
     response = completion_with_backoff(
         prompt=prompt,
-        model=model,
+        model_name=model,
         temperature=temperature,
         max_tokens=max_tokens,
         frequency_penalty=0.2,
@@ -232,8 +141,7 @@ A:{ "search-type": "notes" }"""
     )

     # Extract, Clean Message from GPT's Response
-    story = str(response["choices"][0]["text"])
-    return json.loads(story.strip(empty_escape_sequences))
+    return json.loads(response.strip(empty_escape_sequences))


 def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo", api_key=None, temperature=0.2):
@@ -241,36 +149,23 @@ def converse(references, user_query, conversation_log={}, model="gpt-3.5-turbo",
     Converse with user using OpenAI's ChatGPT
     """
     # Initialize Variables
+    current_date = datetime.now().strftime("%Y-%m-%d")
     compiled_references = "\n\n".join({f"# {item}" for item in references})

-    personality_primer = "You are Khoj, a friendly, smart and helpful personal assistant."
-    conversation_primers = {
-        "general": f"""
-Using your general knowledge and our past conversations as context, answer the following question.
-Current Date: {datetime.now().strftime("%Y-%m-%d")}
-
-Question: {user_query}
-""".strip(),
-        "notes": f"""
-Using the notes and our past conversations as context, answer the following question.
-Current Date: {datetime.now().strftime("%Y-%m-%d")}
-
-Notes:
-{compiled_references}
-
-Question: {user_query}
-""".strip(),
-    }
-
     # Get Conversation Primer appropriate to Conversation Type
     conversation_type = "general" if user_query.startswith("@general") or compiled_references.strip() == "" else "notes"
     logger.debug(f"Conversation Type: {conversation_type}")
-    conversation_primer = conversation_primers.get(conversation_type)
+    if conversation_type == "general":
+        conversation_primer = prompts.general_conversation.format(current_date=current_date, query=user_query)
+    else:
+        conversation_primer = prompts.notes_conversation.format(
+            current_date=current_date, query=user_query, references=compiled_references
+        )

     # Setup Prompt with Primer or Conversation History
     messages = generate_chatml_messages_with_context(
         conversation_primer,
-        personality_primer,
+        prompts.personality.format(),
         conversation_log,
         model,
     )
@@ -279,11 +174,10 @@ Question: {user_query}
     logger.debug(f"Conversation Context for GPT: {messages}")
     response = chat_completion_with_backoff(
         messages=messages,
-        model=model,
+        model_name=model,
         temperature=temperature,
-        api_key=api_key,
+        openai_api_key=api_key,
     )

     # Extract, Clean Message from GPT's Response
-    story = str(response["choices"][0]["message"]["content"])
-    return story.strip(empty_escape_sequences)
+    return response.strip(empty_escape_sequences)
src/khoj/processor/conversation/prompts.py (new file, 165 lines)
@@ -0,0 +1,165 @@
+# External Packages
+from langchain.prompts import PromptTemplate
+
+
+## Personality
+## --
+personality = PromptTemplate.from_template("You are Khoj, a friendly, smart and helpful personal assistant.")
+
+
+## General Conversation
+## --
+general_conversation = PromptTemplate.from_template(
+    """
+Using your general knowledge and our past conversations as context, answer the following question.
+Current Date: {current_date}
+
+Question: {query}
+""".strip()
+)
+
+
+## Notes Conversation
+## --
+notes_conversation = PromptTemplate.from_template(
+    """
+Using the notes and our past conversations as context, answer the following question.
+Current Date: {current_date}
+
+Notes:
+{references}
+
+Question: {query}
+""".strip()
+)
+
+
+## Summarize Chat
+## --
+summarize_chat = PromptTemplate.from_template(
+    """
+You are an AI. Summarize the conversation below from your perspective:
+
+{text}
+
+Summarize the conversation from the AI's first-person perspective:"""
+)
+
+
+## Summarize Notes
+## --
+summarize_notes = PromptTemplate.from_template(
+    """
+Summarize the below notes about {user_query}:
+
+{text}
+
+Summarize the notes in second person perspective:"""
+)
+
+
+## Answer
+## --
+answer = PromptTemplate.from_template(
+    """
+You are a friendly, helpful personal assistant.
+Using the users notes below, answer their following question. If the answer is not contained within the notes, say "I don't know."
+
+Notes:
+{text}
+
+Question: {user_query}
+
+Answer (in second person):"""
+)
+
+
+## Extract Questions
+## --
+extract_questions = PromptTemplate.from_template(
+    """
+You are Khoj, an extremely smart and helpful search assistant with the ability to retrieve information from the user's notes.
+- The user will provide their questions and answers to you for context.
+- Add as much context from the previous questions and answers as required into your search queries.
+- Break messages into multiple search queries when required to retrieve the relevant information.
+- Add date filters to your search queries from questions and answers when required to retrieve the relevant information.
+
+What searches, if any, will you need to perform to answer the users question?
+Provide search queries as a JSON list of strings
+Current Date: {current_date}
+
+Q: How was my trip to Cambodia?
+
+["How was my trip to Cambodia?"]
+
+A: The trip was amazing. I went to the Angkor Wat temple and it was beautiful.
+
+Q: Who did i visit that temple with?
+
+["Who did I visit the Angkor Wat Temple in Cambodia with?"]
+
+A: You visited the Angkor Wat Temple in Cambodia with Pablo, Namita and Xi.
+
+Q: What national parks did I go to last year?
+
+["National park I visited in {last_new_year} dt>=\\"{last_new_year_date}\\" dt<\\"{current_new_year_date}\\""]
+
+A: You visited the Grand Canyon and Yellowstone National Park in {last_new_year}.
+
+Q: How are you feeling today?
+
+[]
+
+A: I'm feeling a little bored. Helping you will hopefully make me feel better!
+
+Q: How many tennis balls fit in the back of a 2002 Honda Civic?
+
+["What is the size of a tennis ball?", "What is the trunk size of a 2002 Honda Civic?"]
+
+A: 1085 tennis balls will fit in the trunk of a Honda Civic
+
+Q: Is Bob older than Tom?
+
+["When was Bob born?", "What is Tom's age?"]
+
+A: Yes, Bob is older than Tom. As Bob was born on 1984-01-01 and Tom is 30 years old.
+
+Q: What is their age difference?
+
+["What is Bob's age?", "What is Tom's age?"]
+
+A: Bob is {bob_tom_age_difference} years older than Tom. As Bob is {bob_age} years old and Tom is 30 years old.
+
+{chat_history}
+Q: {text}
+
+"""
+)
+
+
+## Extract Search Type
+## --
+search_type = """
+Objective: Extract search type from user query and return information as JSON
+
+Allowed search types are listed below:
+- search-type=["notes","ledger","image","music", "pdf"]
+
+Some examples are given below for reference:
+Q:What fiction book was I reading last week about AI starship?
+A:{ "search-type": "notes" }
+Q: What did the lease say about early termination
+A: { "search-type": "pdf" }
+Q:Play some calm classical music?
+A:{ "search-type": "music" }
+Q:How much did I spend at Subway for dinner last time?
+A:{ "search-type": "ledger" }
+Q:What was that popular Sri lankan song that Alex had mentioned?
+A:{ "search-type": "music" }
+Q:Can you recommend a movie to watch from my notes?
+A:{ "search-type": "notes" }
+Q:When did I buy Groceries last?
+A:{ "search-type": "ledger" }
+Q:When did I go surfing last?
+A:{ "search-type": "notes" }
+Q:"""
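Since the prompts are now LangChain `PromptTemplate` objects, call sites fill them with `.format()`, as the updated `gpt.py` above does. A small usage sketch (the note text and question are made up):

```python
# Sketch: fill a prompt template the way the refactored gpt.py call sites do.
from khoj.processor.conversation import prompts

prompt = prompts.answer.format(
    text="Lease ends 2023-12-31. Early termination requires 60 days notice.",
    user_query="What did the lease say about early termination?",
)
print(prompt)  # PromptTemplate.format() returns the filled template as a string
```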
@@ -4,6 +4,9 @@ import logging
 from datetime import datetime

 # External Packages
+from langchain.chat_models import ChatOpenAI
+from langchain.llms import OpenAI
+from langchain.schema import ChatMessage
 import openai
 import tiktoken
 from tenacity import (
@ -31,14 +34,17 @@ max_prompt_size = {"gpt-3.5-turbo": 4096, "gpt-4": 8192}
|
||||||
| retry_if_exception_type(openai.error.RateLimitError)
|
| retry_if_exception_type(openai.error.RateLimitError)
|
||||||
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
||||||
),
|
),
|
||||||
wait=wait_random_exponential(min=1, max=30),
|
wait=wait_random_exponential(min=1, max=10),
|
||||||
stop=stop_after_attempt(6),
|
stop=stop_after_attempt(3),
|
||||||
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
||||||
reraise=True,
|
reraise=True,
|
||||||
)
|
)
|
||||||
def completion_with_backoff(**kwargs):
|
def completion_with_backoff(**kwargs):
|
||||||
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
|
prompt = kwargs.pop("prompt")
|
||||||
return openai.Completion.create(**kwargs, request_timeout=60)
|
if "openai_api_key" not in kwargs:
|
||||||
|
kwargs["openai_api_key"] = os.getenv("OPENAI_API_KEY")
|
||||||
|
llm = OpenAI(**kwargs, request_timeout=10, max_retries=1)
|
||||||
|
return llm(prompt)
|
||||||
|
|
||||||
|
|
||||||
@retry(
|
@retry(
|
||||||
|
@ -50,13 +56,19 @@ def completion_with_backoff(**kwargs):
|
||||||
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
| retry_if_exception_type(openai.error.ServiceUnavailableError)
|
||||||
),
|
),
|
||||||
wait=wait_exponential(multiplier=1, min=4, max=10),
|
wait=wait_exponential(multiplier=1, min=4, max=10),
|
||||||
stop=stop_after_attempt(6),
|
stop=stop_after_attempt(3),
|
||||||
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
before_sleep=before_sleep_log(logger, logging.DEBUG),
|
||||||
reraise=True,
|
reraise=True,
|
||||||
)
|
)
|
||||||
def chat_completion_with_backoff(**kwargs):
|
def chat_completion_with_backoff(messages, model_name, temperature, openai_api_key=None):
|
||||||
openai.api_key = kwargs["api_key"] if kwargs.get("api_key") else os.getenv("OPENAI_API_KEY")
|
chat = ChatOpenAI(
|
||||||
return openai.ChatCompletion.create(**kwargs, request_timeout=60)
|
model_name=model_name,
|
||||||
|
temperature=temperature,
|
||||||
|
openai_api_key=openai_api_key or os.getenv("OPENAI_API_KEY"),
|
||||||
|
request_timeout=10,
|
||||||
|
max_retries=1,
|
||||||
|
)
|
||||||
|
return chat(messages).content
|
||||||
|
|
||||||
|
|
||||||
def generate_chatml_messages_with_context(
|
def generate_chatml_messages_with_context(
|
||||||
|
@ -64,7 +76,11 @@ def generate_chatml_messages_with_context(
|
||||||
):
|
):
|
||||||
"""Generate messages for ChatGPT with context from previous conversation"""
|
"""Generate messages for ChatGPT with context from previous conversation"""
|
||||||
# Extract Chat History for Context
|
# Extract Chat History for Context
|
||||||
chat_logs = [f'{chat["message"]}\n\nNotes:\n{chat.get("context","")}' for chat in conversation_log.get("chat", [])]
|
chat_logs = []
|
||||||
|
for chat in conversation_log.get("chat", []):
|
||||||
|
chat_notes = f'\n\n Notes:\n{chat.get("context")}' if chat.get("context") else "\n"
|
||||||
|
chat_logs += [chat["message"] + chat_notes]
|
||||||
|
|
||||||
rest_backnforths = []
|
rest_backnforths = []
|
||||||
# Extract in reverse chronological order
|
# Extract in reverse chronological order
|
||||||
for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
|
for user_msg, assistant_msg in zip(chat_logs[-2::-2], chat_logs[::-2]):
|
||||||
|
@ -73,17 +89,26 @@ def generate_chatml_messages_with_context(
|
||||||
rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]
|
rest_backnforths += reciprocal_conversation_to_chatml([user_msg, assistant_msg])[::-1]
|
||||||
|
|
||||||
# Format user and system messages to chatml format
|
# Format user and system messages to chatml format
|
||||||
system_chatml_message = [message_to_chatml(system_message, "system")]
|
system_chatml_message = [ChatMessage(content=system_message, role="system")]
|
||||||
user_chatml_message = [message_to_chatml(user_message, "user")]
|
user_chatml_message = [ChatMessage(content=user_message, role="user")]
|
||||||
|
|
||||||
messages = user_chatml_message + rest_backnforths[:2] + system_chatml_message + rest_backnforths[2:]
|
messages = user_chatml_message + rest_backnforths + system_chatml_message
|
||||||
|
|
||||||
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
||||||
encoder = tiktoken.encoding_for_model(model_name)
|
encoder = tiktoken.encoding_for_model(model_name)
|
||||||
tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
|
tokens = sum([len(encoder.encode(content)) for message in messages for content in message.content])
|
||||||
while tokens > max_prompt_size[model_name]:
|
while tokens > max_prompt_size[model_name] and len(messages) > 1:
|
||||||
messages.pop()
|
messages.pop()
|
||||||
tokens = sum([len(encoder.encode(value)) for message in messages for value in message.values()])
|
tokens = sum([len(encoder.encode(content)) for message in messages for content in message.content])
|
||||||
|
|
||||||
|
# Truncate last message if still over max supported prompt size by model
|
||||||
|
if tokens > max_prompt_size[model_name]:
|
||||||
|
last_message = messages[-1]
|
||||||
|
truncated_message = encoder.decode(encoder.encode(last_message.content))
|
||||||
|
logger.debug(
|
||||||
|
f"Truncate last message to fit within max prompt size of {max_prompt_size[model_name]} supported by {model_name} model:\n {truncated_message}"
|
||||||
|
)
|
||||||
|
messages = [ChatMessage(content=[truncated_message], role=last_message.role)]
|
||||||
|
|
||||||
# Return message in chronological order
|
# Return message in chronological order
|
||||||
return messages[::-1]
|
return messages[::-1]
|
||||||
|
@ -91,12 +116,7 @@ def generate_chatml_messages_with_context(
|
||||||
|
|
||||||
def reciprocal_conversation_to_chatml(message_pair):
|
def reciprocal_conversation_to_chatml(message_pair):
|
||||||
"""Convert a single back and forth between user and assistant to chatml format"""
|
"""Convert a single back and forth between user and assistant to chatml format"""
|
||||||
return [message_to_chatml(message, role) for message, role in zip(message_pair, ["user", "assistant"])]
|
return [ChatMessage(content=message, role=role) for message, role in zip(message_pair, ["user", "assistant"])]
|
||||||
|
|
||||||
|
|
||||||
def message_to_chatml(message, role="assistant"):
|
|
||||||
"""Create chatml message from message and role"""
|
|
||||||
return {"role": role, "content": message}
|
|
||||||
|
|
||||||
|
|
||||||
def message_to_prompt(
|
def message_to_prompt(
|
||||||
|
|
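Editor's note: a small sketch of calling the refactored helper with LangChain `ChatMessage` objects. The import path assumes the helper lives in `khoj.processor.conversation.utils` as in the Khoj source tree, and the model settings are illustrative.

# External Packages
from langchain.schema import ChatMessage

# Internal Packages (assumed module path for the helper shown above)
from khoj.processor.conversation.utils import chat_completion_with_backoff

messages = [
    ChatMessage(content="You are Khoj, a friendly personal assistant.", role="system"),
    ChatMessage(content="How was my trip to Cambodia?", role="user"),
]

# Returns the assistant's reply as a plain string via ChatOpenAI
reply = chat_completion_with_backoff(messages=messages, model_name="gpt-3.5-turbo", temperature=0)
print(reply)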
0  src/khoj/processor/pdf/__init__.py  Normal file

131  src/khoj/processor/pdf/pdf_to_jsonl.py  Normal file
@@ -0,0 +1,131 @@
# Standard Packages
import glob
import logging
from pathlib import Path
from typing import List

# External Packages
from langchain.document_loaders import PyPDFLoader

# Internal Packages
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry


logger = logging.getLogger(__name__)


class PdfToJsonl(TextToJsonl):
    # Define Functions
    def process(self, previous_entries=None):
        # Extract required fields from config
        pdf_files, pdf_file_filter, output_file = (
            self.config.input_files,
            self.config.input_filter,
            self.config.compressed_jsonl,
        )

        # Input Validation
        if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
            print("At least one of pdf-files or pdf-file-filter is required to be specified")
            exit(1)

        # Get Pdf Files to Process
        pdf_files = PdfToJsonl.get_pdf_files(pdf_files, pdf_file_filter)

        # Extract Entries from specified Pdf files
        with timer("Parse entries from PDF files into dictionaries", logger):
            current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(pdf_files))

        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):
            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)

        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = self.mark_entries_for_update(
                    current_entries, previous_entries, key="compiled", logger=logger
                )

        with timer("Write PDF entries to JSONL file", logger):
            # Process Each Entry from All Notes Files
            entries = list(map(lambda entry: entry[1], entries_with_ids))
            jsonl_data = PdfToJsonl.convert_pdf_maps_to_jsonl(entries)

            # Compress JSONL formatted Data
            if output_file.suffix == ".gz":
                compress_jsonl_data(jsonl_data, output_file)
            elif output_file.suffix == ".jsonl":
                dump_jsonl(jsonl_data, output_file)

        return entries_with_ids

    @staticmethod
    def get_pdf_files(pdf_files=None, pdf_file_filters=None):
        "Get PDF files to process"
        absolute_pdf_files, filtered_pdf_files = set(), set()
        if pdf_files:
            absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
        if pdf_file_filters:
            filtered_pdf_files = {
                filtered_file
                for pdf_file_filter in pdf_file_filters
                for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
            }

        all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)

        files_with_non_pdf_extensions = {pdf_file for pdf_file in all_pdf_files if not pdf_file.endswith(".pdf")}

        if any(files_with_non_pdf_extensions):
            logger.warn(f"[Warning] There may be non-pdf files in the input set: {files_with_non_pdf_extensions}")

        logger.debug(f"Processing files: {all_pdf_files}")

        return all_pdf_files

    @staticmethod
    def extract_pdf_entries(pdf_files):
        """Extract entries by page from specified PDF files"""

        entries = []
        entry_to_location_map = []
        for pdf_file in pdf_files:
            loader = PyPDFLoader(pdf_file)
            pdf_entries_per_file = [page.page_content for page in loader.load()]
            entry_to_location_map += zip(pdf_entries_per_file, [pdf_file] * len(pdf_entries_per_file))
            entries.extend(pdf_entries_per_file)

        return entries, dict(entry_to_location_map)

    @staticmethod
    def convert_pdf_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
        "Convert each PDF entry into a dictionary"
        entries = []
        for parsed_entry in parsed_entries:
            entry_filename = Path(entry_to_file_map[parsed_entry])
            # Append base filename to compiled entry for context to model
            heading = f"{entry_filename.stem}\n"
            compiled_entry = f"{heading}{parsed_entry}"
            entries.append(
                Entry(
                    compiled=compiled_entry,
                    raw=parsed_entry,
                    heading=heading,
                    file=f"{entry_filename}",
                )
            )

        logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries")

        return entries

    @staticmethod
    def convert_pdf_maps_to_jsonl(entries: List[Entry]):
        "Convert each PDF entry to JSON and collate as JSONL"
        return "".join([f"{entry.to_json()}\n" for entry in entries])
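Editor's note: a rough sketch of driving this processor directly. The `TextContentConfig` field names mirror the new `pdf` defaults added to the config further below; the import path for `TextContentConfig` and the concrete paths are assumptions.

# Standard Packages
from pathlib import Path

# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.utils.rawconfig import TextContentConfig

# Placeholder config: index all PDFs under ~/Documents into compressed JSONL
pdf_config = TextContentConfig(
    input_files=None,
    input_filter=["~/Documents/**/*.pdf"],
    compressed_jsonl=Path("~/.khoj/content/pdf/pdf.jsonl.gz").expanduser(),
    embeddings_file=Path("~/.khoj/content/pdf/pdf_embeddings.pt").expanduser(),
)

# Parse, split and serialize the PDF entries; returns entries tagged with ids
entries_with_ids = PdfToJsonl(pdf_config).process()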
@@ -109,6 +109,17 @@ def search(
         with timer("Collating results took", logger):
             results = text_search.collate_results(hits, entries, results_count)

+    elif (t == SearchType.Pdf or t == None) and state.model.pdf_search:
+        # query pdf files
+        with timer("Query took", logger):
+            hits, entries = text_search.query(
+                user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+            )
+
+        # collate and return results
+        with timer("Collating results took", logger):
+            results = text_search.collate_results(hits, entries, results_count)
+
     elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
         # query transactions
         with timer("Query took", logger):
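Editor's note: once the server has indexed some PDFs, the new search type can be exercised over the HTTP API. A hedged sketch using `requests`; the host and port are assumptions for a default local setup.

# External Packages
import requests

# Query the Khoj server's search API for the new pdf content type
response = requests.get(
    "http://localhost:8000/api/search",
    params={"q": "what did the lease say about early termination", "t": "pdf"},
)
print(response.json())  # list of collated search results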
@@ -22,6 +22,7 @@ class SearchType(str, Enum):
     Music = "music"
     Markdown = "markdown"
     Image = "image"
+    Pdf = "pdf"


 class ProcessorType(str, Enum):
@@ -61,6 +62,7 @@ class SearchModels:
     ledger_search: TextSearchModel = None
     music_search: TextSearchModel = None
     markdown_search: TextSearchModel = None
+    pdf_search: TextSearchModel = None
     image_search: ImageSearchModel = None
     plugin_search: Dict[str, TextSearchModel] = None
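Editor's note: because `SearchType` derives from `str`, the `t` query parameter value parses directly into the enum. A small self-contained sketch (a trimmed copy of the enum, not its full definition):

# Standard Packages
from enum import Enum


class SearchType(str, Enum):
    Music = "music"
    Markdown = "markdown"
    Image = "image"
    Pdf = "pdf"


# The "t" query parameter parses straight into the enum...
assert SearchType("pdf") is SearchType.Pdf
# ...and compares equal to its plain string value, since SearchType subclasses str
assert SearchType.Pdf == "pdf"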
@@ -28,6 +28,12 @@ default_config = {
         "compressed-jsonl": "~/.khoj/content/ledger/ledger.jsonl.gz",
         "embeddings-file": "~/.khoj/content/ledger/ledger_embeddings.pt",
     },
+    "pdf": {
+        "input-files": None,
+        "input-filter": None,
+        "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
+        "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
+    },
     "image": {
         "input-directories": None,
         "input-filter": None,
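Editor's note: as a sketch, enabling the new source in a user config amounts to overriding these keys. The dict below mirrors the `default_config` shape above; the `~/Documents` glob is a placeholder assumption.

# Placeholder override of the new pdf content defaults (same keys as default_config)
pdf_content_config = {
    "input-files": None,
    "input-filter": ["~/Documents/**/*.pdf"],
    "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
    "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
}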
@@ -56,6 +56,7 @@ class ContentConfig(ConfigBase):
     image: Optional[ImageContentConfig]
     music: Optional[TextContentConfig]
     markdown: Optional[TextContentConfig]
+    pdf: Optional[TextContentConfig]
     plugins: Optional[Dict[str, TextContentConfig]]

BIN  tests/data/pdf/multipage.pdf  Normal file  (Binary file not shown.)
BIN  tests/data/pdf/singlepage.pdf  Normal file  (Binary file not shown.)
@@ -34,7 +34,7 @@ def test_search_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_search_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/search?q=random&t={content_type}")
         # Assert
@@ -52,7 +52,7 @@ def test_update_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_update_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/update?t={content_type}")
         # Assert
@@ -70,7 +70,7 @@ def test_regenerate_with_invalid_content_type(client):

 # ----------------------------------------------------------------------------------------------------
 def test_regenerate_with_valid_content_type(client):
-    for content_type in ["org", "markdown", "ledger", "image", "music", "plugin1"]:
+    for content_type in ["org", "markdown", "ledger", "image", "music", "pdf", "plugin1"]:
         # Act
         response = client.get(f"/api/update?force=true&t={content_type}")
         # Assert
74  tests/test_pdf_to_jsonl.py  Normal file
@@ -0,0 +1,74 @@
# Standard Packages
import json

# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl


def test_single_page_pdf_to_jsonl():
    "Convert single page PDF file to jsonl."
    # Act
    # Extract Entries from specified Pdf files
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/singlepage.pdf"])

    # Process Each Entry from All Pdf Files
    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1


def test_multi_page_pdf_to_jsonl():
    "Convert multiple pages from single PDF file to jsonl."
    # Act
    # Extract Entries from specified Pdf files
    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=["tests/data/pdf/multipage.pdf"])

    # Process Each Entry from All Pdf Files
    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 6


def test_get_pdf_files(tmp_path):
    "Ensure Pdf files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.pdf")
    group1_file2 = create_file(tmp_path, filename="group1-file2.pdf")
    group2_file1 = create_file(tmp_path, filename="group2-file1.pdf")
    group2_file2 = create_file(tmp_path, filename="group2-file2.pdf")
    # Include via input-file field
    file1 = create_file(tmp_path, filename="document.pdf")
    # Not included by any filter
    create_file(tmp_path, filename="not-included-document.pdf")
    create_file(tmp_path, filename="not-included-text.txt")

    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / "document.pdf"]
    input_filter = [tmp_path / "group1*.pdf", tmp_path / "group2*.pdf"]

    # Act
    extracted_pdf_files = PdfToJsonl.get_pdf_files(input_files, input_filter)

    # Assert
    assert len(extracted_pdf_files) == 5
    assert extracted_pdf_files == expected_files


# Helper Functions
def create_file(tmp_path, entry=None, filename="document.pdf"):
    pdf_file = tmp_path / filename
    pdf_file.touch()
    if entry:
        pdf_file.write_text(entry)
    return pdf_file