diff --git a/docs/chat.md b/docs/chat.md
index eb3a2f0f..b900d052 100644
--- a/docs/chat.md
+++ b/docs/chat.md
@@ -7,18 +7,21 @@
 ### Setup
 #### Offline Chat
-Offline chat works without internet but it is slower, lower quality and more compute intensive.
+Offline chat stays completely private and works without internet, but it is slower, lower quality, and more compute intensive.

-!> **Warning**: This will download a 3Gb+ Llama v2 chat model which can take some time
+> **System Requirements**:
+> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
+> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
+> - A Mac M1+ or [Vulkan-supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times

-- Open your [Khoj settings](http://localhost:42110/config/), click *Enable* on the Offline Chat card
+- Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card

 ![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4')

 #### Online Chat
 Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive.

-!> **Warning**: This will enable Khoj to send your chat queries and notes to OpenAI for processing
+!> **Warning**: This will enable Khoj to send your chat queries and query-relevant notes to OpenAI for processing

 1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)
 2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
diff --git a/docs/emacs.md b/docs/emacs.md
index 36b9f9db..6492ecc4 100644
--- a/docs/emacs.md
+++ b/docs/emacs.md
@@ -46,7 +46,7 @@ Indexes your org-agenda files, by default.
   (use-package khoj
     :ensure t
     :pin melpa-stable
-    :bind ("C-c s" . 'khoj)
+    :bind ("C-c s" . 
'khoj)) ``` - Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj diff --git a/manifest.json b/manifest.json index 0ecc4fbb..0d5c71b8 100644 --- a/manifest.json +++ b/manifest.json @@ -1,7 +1,7 @@ { "id": "khoj", "name": "Khoj", - "version": "0.12.3", + "version": "0.13.0", "minAppVersion": "0.15.0", "description": "An Open-Source AI Personal Assistant for your Digital Brain", "author": "Khoj Inc.", diff --git a/pyproject.toml b/pyproject.toml index 12be01cc..34f15d4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "khoj-assistant" -description = "An AI personal assistant for your Digital Brain" +description = "An AI copilot for your Second Brain" readme = "README.md" license = "GPL-3.0-or-later" requires-python = ">=3.8" @@ -40,8 +40,9 @@ dependencies = [ "dateparser >= 1.1.1", "defusedxml == 0.7.1", "fastapi == 0.77.1", + "python-multipart >= 0.0.5", "jinja2 == 3.1.2", - "openai >= 0.27.0", + "openai >= 0.27.0, < 1.0.0", "tiktoken >= 0.3.2", "tenacity >= 8.2.2", "pillow == 9.3.0", @@ -83,6 +84,7 @@ test = [ "freezegun >= 1.2.0", "factory-boy >= 3.2.1", "trio >= 0.22.0", + "pytest-xdist", ] dev = [ "khoj-assistant[test]", diff --git a/scripts/bump_version.sh b/scripts/bump_version.sh index 07d2117f..561953dd 100755 --- a/scripts/bump_version.sh +++ b/scripts/bump_version.sh @@ -9,6 +9,10 @@ do # Get current project version current_version=$OPTARG + # Bump Desktop app to current version + cd $project_root/src/interface/desktop + sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json + # Bump Obsidian plugin to current version cd $project_root/src/interface/obsidian sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json diff --git a/src/app/main.py b/src/app/main.py index 0049f157..16f7cced 100644 --- a/src/app/main.py +++ b/src/app/main.py @@ -14,10 +14,11 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th # External Packages import uvicorn -import django -import schedule - from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +import schedule +import django + from fastapi.staticfiles import StaticFiles from rich.logging import RichHandler from django.core.asgi import get_asgi_application @@ -41,6 +42,15 @@ app = FastAPI() # Get Django Application django_app = get_asgi_application() +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["app://obsidian.md", "http://localhost:*", "https://app.khoj.dev/*", "app://khoj.dev"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + # Set Locale locale.setlocale(locale.LC_ALL, "") diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 4f8891cf..fd75e3a7 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -8,7 +8,6 @@ const {dialog} = require('electron'); const cron = require('cron').CronJob; const axios = require('axios'); -const { Readable } = require('stream'); const KHOJ_URL = 'http://127.0.0.1:42110' @@ -65,7 +64,7 @@ const schema = { var state = {} -const store = new Store({schema}); +const store = new Store({ schema }); console.log(store); @@ -86,57 +85,65 @@ function handleSetTitle (event, title) { }); } +function filenameToMimeType (filename) { + const extension = filename.split('.').pop(); + switch (extension) { + case 'pdf': + return 'application/pdf'; + case 'png': + return 'image/png'; + 
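+        // Both .jpg and .jpeg extensions map to the same JPEG MIME type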
case 'jpg': + case 'jpeg': + return 'image/jpeg'; + case 'md': + case 'markdown': + return 'text/markdown'; + case 'org': + return 'text/org'; + default: + return 'text/plain'; + } +} + function pushDataToKhoj (regenerate = false) { let filesToPush = []; - const files = store.get('files'); - const folders = store.get('folders'); - state = { - completed: true + const files = store.get('files') || []; + const folders = store.get('folders') || []; + state = { completed: true } + + // Collect paths of all configured files to index + for (const file of files) { + filesToPush.push(file.path); } - if (files) { - for (file of files) { - filesToPush.push(file.path); - } - } - if (folders) { - for (folder of folders) { - const files = fs.readdirSync(folder.path, { withFileTypes: true }); - for (file of files) { - if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { - filesToPush.push(path.join(folder.path, file.name)); - } + // Collect paths of all indexable files in configured folders + for (const folder of folders) { + const files = fs.readdirSync(folder.path, { withFileTypes: true }); + for (const file of files) { + if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { + filesToPush.push(path.join(folder.path, file.name)); } } } - let data = { - files: [] - } - const lastSync = store.get('lastSync') || []; - - for (file of filesToPush) { + const formData = new FormData(); + for (const file of filesToPush) { const stats = fs.statSync(file); if (!regenerate) { + // Only push files that have been modified since last sync if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) { continue; } } + // Collect all updated or newly created files since last sync to index on Khoj server try { - let rawData; - // If the file is a PDF or IMG file, read it as a binary file - if (binaryFileTypes.includes(file.split('.').pop())) { - rawData = fs.readFileSync(file).toString('base64'); - } else { - rawData = fs.readFileSync(file, 'utf8'); - } - - data.files.push({ - path: file, - content: rawData - }); + let encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8"; + let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? 
"; charset=UTF-8" : ""); + let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding); + let fileObj = new Blob([fileContent], { type: mimeType }); + formData.append('files', fileObj, file); state[file] = { success: true, } @@ -149,46 +156,46 @@ function pushDataToKhoj (regenerate = false) { } } + // Mark deleted files for removal from index on Khoj server for (const syncedFile of lastSync) { if (!filesToPush.includes(syncedFile.path)) { - data.files.push({ - path: syncedFile.path, - content: "" - }); + fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) }); + formData.append('files', fileObj, syncedFile.path); } } - const headers = { 'x-api-key': 'secret', 'Content-Type': 'application/json' }; - - const stream = new Readable({ - read() { - this.push(JSON.stringify(data)); - this.push(null); - } - }); - - const hostURL = store.get('hostURL') || KHOJ_URL; - - axios.post(`${hostURL}/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers }) - .then(response => { - console.log(response.data); - const win = BrowserWindow.getAllWindows()[0]; - win.webContents.send('update-state', state); - let lastSync = []; - for (const file of filesToPush) { - lastSync.push({ - path: file, - datetime: new Date().toISOString() - }); - } - store.set('lastSync', lastSync); - }) - .catch(error => { - console.error(error); - state['completed'] = false - const win = BrowserWindow.getAllWindows()[0]; - win.webContents.send('update-state', state); - }); + // Send collected files to Khoj server for indexing + if (!!formData?.entries()?.next().value) { + const hostURL = store.get('hostURL') || KHOJ_URL; + const headers = { + 'x-api-key': 'secret' + }; + axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers }) + .then(response => { + console.log(response.data); + let lastSync = []; + for (const file of filesToPush) { + lastSync.push({ + path: file, + datetime: new Date().toISOString() + }); + } + store.set('lastSync', lastSync); + }) + .catch(error => { + console.error(error); + state['completed'] = false + }) + .finally(() => { + // Syncing complete + const win = BrowserWindow.getAllWindows()[0]; + if (win) win.webContents.send('update-state', state); + }); + } else { + // Syncing complete + const win = BrowserWindow.getAllWindows()[0]; + if (win) win.webContents.send('update-state', state); + } } pushDataToKhoj(); diff --git a/src/interface/desktop/package.json b/src/interface/desktop/package.json index fb2d9983..d74e831a 100644 --- a/src/interface/desktop/package.json +++ b/src/interface/desktop/package.json @@ -1,13 +1,13 @@ { "name": "Khoj", - "homepage": ".", - "productName": "Khoj", - "version": "1.0.2", - "description": "Scaffolding for the desktop entrypoint to Khoj", - "main": "main.js", + "version": "0.13.0", + "description": "An AI copilot for your Second Brain", + "author": "Saba Imran, Debanjum Singh Solanky ", + "license": "GPL-3.0-or-later", + "homepage": "https://khoj.dev", "repository": "\"https://github.com/khoj-ai/khoj\"", - "author": "Khoj ", - "license": "MIT", + "productName": "Khoj", + "main": "main.js", "private": false, "devDependencies": { "electron": "25.8.1" diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 2f8360f2..b9343c41 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -1,11 +1,12 @@ -;;; khoj.el --- AI personal assistant for your digital brain -*- lexical-binding: t -*- +;;; khoj.el --- AI copilot for your Second Brain -*- 
lexical-binding: t -*-

-;; Copyright (C) 2021-2022 Debanjum Singh Solanky
+;; Copyright (C) 2021-2023 Khoj Inc.

-;; Author: Debanjum Singh Solanky
-;; Description: An AI personal assistant for your digital brain
+;; Author: Debanjum Singh Solanky
+;;         Saba Imran
+;; Description: An AI copilot for your Second Brain
 ;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
-;; Version: 0.12.3
+;; Version: 0.13.0
 ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
 ;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs

@@ -28,8 +29,8 @@

 ;;; Commentary:

-;; Create an AI personal assistant for your `org-mode', `markdown' notes,
-;; PDFs and images. The assistant exposes 2 modes, search and chat:
+;; Create an AI copilot for your `org-mode', `markdown' notes,
+;; PDFs and images. The copilot exposes two modes, search and chat:
 ;;
 ;; Chat provides faster answers, iterative discovery and assisted
 ;; creativity. It requires your OpenAI API key to access GPT models
@@ -87,6 +88,21 @@
   :group 'khoj
   :type 'integer)

+(defcustom khoj-search-on-idle-time 0.3
+  "Idle time (in seconds) to wait before triggering search."
+  :group 'khoj
+  :type 'number)
+
+(defcustom khoj-server-api-key "secret"
+  "API key used to authenticate requests to the Khoj server."
+  :group 'khoj
+  :type 'string)
+
+(defcustom khoj-index-interval 3600
+  "Interval (in seconds) to wait before updating content index."
+  :group 'khoj
+  :type 'number)
+
 (defcustom khoj-default-content-type "org"
   "The default content type to perform search on."
   :group 'khoj
@@ -115,6 +131,15 @@
 (defvar khoj--content-type "org"
   "The type of content to perform search on.")

+(defvar khoj--search-on-idle-timer nil
+  "Idle timer to trigger incremental search.")
+
+(defvar khoj--index-timer nil
+  "Timer to trigger content indexing.")
+
+(defvar khoj--indexed-files '()
+  "Files that were indexed in previous content indexing run.")
+
 (declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
 (declare-function org-element-type "org-mode" (ELEMENT))
 (declare-function markdown-mode "markdown-mode" ())
@@ -236,6 +261,11 @@ for example), set this to the full interpreter path."
   :type 'boolean
   :group 'khoj)

+(defcustom khoj-offline-chat-model nil
+  "Specify chat model to use for offline chat with Khoj."
+  :type 'string
+  :group 'khoj)
+
 (defcustom khoj-auto-setup t
   "Automate install, configure and start of khoj server.
Auto invokes setup steps on calling main entrypoint."
@@ -365,9 +395,9 @@ CONFIG is json obtained from Khoj config API."
      (string-join "/"))))

 (defun khoj--server-configure ()
-  "Configure the the Khoj server for search and chat."
+  "Configure the Khoj server for search and chat."
   (interactive)
-  (let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null))
+  (let* ((url-request-method "GET")
         (current-config
          (with-temp-buffer
            (url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
            (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
         (default-config
(with-temp-buffer (url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url)) (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false)))) - (default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file))) (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile))) (chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config)))))) - (default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config)))) - (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config))))) + (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) + (offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))) (config (or current-config default-config))) - ;; Configure content types - (cond - ;; If khoj backend is not configured yet - ((not current-config) - (message "khoj.el: Server not configured yet.") - (setq config (delq (assoc 'content-type config) config)) - (cl-pushnew `(content-type . ((org . ((input-files . ,khoj-org-files) - (input-filter . ,org-directory-regexes) - (compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir)) - (embeddings-file . ,(format "%s/org.pt" default-index-dir)) - (index-heading-entries . ,json-false))))) - config)) - - ;; Else if khoj config has no org content config - ((not (alist-get 'org (alist-get 'content-type config))) - (message "khoj.el: Org-mode content on server not configured yet.") - (let ((new-content-type (alist-get 'content-type config))) - (setq new-content-type (delq (assoc 'org new-content-type) new-content-type)) - (cl-pushnew `(org . ((input-files . ,khoj-org-files) - (input-filter . ,org-directory-regexes) - (compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir)) - (embeddings-file . ,(format "%s/org.pt" default-index-dir)) - (index-heading-entries . ,json-false))) - new-content-type) - (setq config (delq (assoc 'content-type config) config)) - (cl-pushnew `(content-type . ,new-content-type) config))) - - ;; Else if khoj is not configured to index specified org files - ((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files) - (equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes))) - (message "khoj.el: Org-mode content on server is stale.") - (let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file))) - (new-content-type (alist-get 'content-type config))) - (setq new-content-type (delq (assoc 'org new-content-type) new-content-type)) - (cl-pushnew `(org . ((input-files . ,khoj-org-files) - (input-filter . ,org-directory-regexes) - (compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory)) - (embeddings-file . ,(format "%s/org.pt" index-directory)) - (index-heading-entries . ,json-false))) - new-content-type) - (setq config (delq (assoc 'content-type config) config)) - (cl-pushnew `(content-type . 
,new-content-type) config)))) - ;; Configure processors (cond ((not khoj-openai-api-key) @@ -441,10 +427,11 @@ CONFIG is json obtained from Khoj config API." ;; If khoj backend isn't configured yet ((not current-config) - (message "khoj.el: Chat not configured yet.") + (message "khoj.el: Khoj not configured yet.") (setq config (delq (assoc 'processor config) config)) (cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) - (enable-offline-chat . ,enable-offline-chat) + (offline-chat . ((enable-offline-chat . ,enable-offline-chat) + (chat-model . ,offline-chat-model))) (openai . ((chat-model . ,chat-model) (api-key . ,khoj-openai-api-key))))))) config)) @@ -455,7 +442,8 @@ CONFIG is json obtained from Khoj config API." (let ((new-processor-type (alist-get 'processor config))) (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) - (enable-offline-chat . ,enable-offline-chat) + (offline-chat . ((enable-offline-chat . ,enable-offline-chat) + (chat-model . ,offline-chat-model))) (openai . ((chat-model . ,chat-model) (api-key . ,khoj-openai-api-key))))) new-processor-type) @@ -465,13 +453,15 @@ CONFIG is json obtained from Khoj config API." ;; Else if chat configuration in khoj backend has gone stale ((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key) (equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model) - (equal (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor config))) enable-offline-chat))) + (equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat) + (equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model))) (message "khoj.el: Chat configuration has gone stale.") (let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile))) (new-processor-type (alist-get 'processor config))) (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory)) - (enable-offline-chat . ,enable-offline-chat) + (offline-chat . ((enable-offline-chat . ,enable-offline-chat) + (chat-model . ,offline-chat-model))) (openai . ((chat-model . ,khoj-chat-model) (api-key . ,khoj-openai-api-key))))) new-processor-type) @@ -509,9 +499,75 @@ CONFIG is json obtained from Khoj config API." (khoj--server-configure)))) -;; ----------------------------------------------- -;; Extract and Render Entries of each Content Type -;; ----------------------------------------------- +;; ------------------- +;; Khoj Index Content +;; ------------------- + +(defun khoj--server-index-files (&optional force content-type file-paths) + "Send files at `FILE-PATHS' to the Khoj server to index for search and chat. +`FORCE' re-indexes all files of `CONTENT-TYPE' even if they are already indexed." 
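+  ;; Collect files to index, render them as a multipart/form-data body and POST it to the Khoj server's indexing API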
+  (interactive)
+  (let ((boundary (format "-------------------------%d" (random (expt 10 10))))
+        (files-to-index (or file-paths
+                            (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))
+        (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type)))
+        (inhibit-message t)
+        (message-log-max nil))
+    (let ((url-request-method "POST")
+          (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary))
+          (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary))
+                                       ("x-api-key" . ,khoj-server-api-key))))
+      (with-current-buffer
+          (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (or force "false"))
+                        ;; render response from indexing API endpoint on server
+                        (lambda (status)
+                          (if (not status)
+                              (message "khoj.el: %scontent index %supdated" (if content-type (format "%s " content-type) "") (if force "force " ""))
+                            (with-current-buffer (current-buffer)
+                              (goto-char (point-min))
+                              (search-forward "\n\n" nil t)
+                              (message "khoj.el: Failed to %supdate %s content index. Status: %s. Response: %s"
+                                       (if force "force " "")
+                                       content-type
+                                       status
+                                       (string-trim (buffer-substring-no-properties (point) (point-max)))))))
+                        nil t t)))
+    (setq khoj--indexed-files files-to-index)))
+
+(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary)
+  "Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body.
+Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request."
+  (with-temp-buffer
+    (set-buffer-multibyte nil)
+    (insert "\n")
+    (dolist (file-to-index files-to-index)
+      (insert (format "--%s\r\n" boundary))
+      (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index))
+      (insert "Content-Type: text/org\r\n\r\n")
+      (insert (with-temp-buffer
+                (insert-file-contents-literally file-to-index)
+                (buffer-string)))
+      (insert "\r\n"))
+    (dolist (file-to-index previously-indexed-files)
+      (when (not (member file-to-index files-to-index))
+        (insert (format "--%s\r\n" boundary))
+        (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index))
+        (insert "Content-Type: text/org\r\n\r\n")
+        (insert "")
+        (insert "\r\n")))
+    (insert (format "--%s--\r\n" boundary))
+    (buffer-string)))
+
+;; Cancel any running indexing timer first
+(when khoj--index-timer
+  (cancel-timer khoj--index-timer))
+;; Send files to index on server every `khoj-index-interval' seconds
+(setq khoj--index-timer
+      (run-with-timer 60 khoj-index-interval 'khoj--server-index-files))
+
+
+;; -------------------------------------------
+;; Render Response from Khoj server for Emacs
+;; -------------------------------------------

 (defun khoj--extract-entries-as-markdown (json-response query)
   "Convert JSON-RESPONSE, QUERY from API to markdown entries."
@@ -920,6 +976,9 @@ RECEIVE-DATE is the message receive date."
   (message "khoj.el: Teardown Incremental Search")
   ;; unset khoj minibuffer window
   (setq khoj--minibuffer-window nil)
+  (when (and khoj--search-on-idle-timer
+             (timerp khoj--search-on-idle-timer))
+    (cancel-timer khoj--search-on-idle-timer))
   ;; delete open connections to khoj server
   (khoj--delete-open-network-connections-to-server)
   ;; remove hooks for khoj incremental query and self
@@ -942,8 +1001,10 @@
;; set current (mini-)buffer entered as khoj minibuffer ;; used to query khoj API only when user in khoj minibuffer (setq khoj--minibuffer-window (current-buffer)) - (add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action - (add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit + ; do khoj incremental search after idle time + (setq khoj--search-on-idle-timer (run-with-idle-timer khoj-search-on-idle-time t #'khoj--incremental-search)) + ; teardown khoj incremental search on minibuffer exit + (add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) (read-string khoj--query-prompt)))) @@ -1014,17 +1075,20 @@ Paragraph only starts at first text after blank line." ;; Khoj Menu ;; --------- -(transient-define-argument khoj--content-type-switch () - :class 'transient-switches - :argument-format "--content-type=%s" - :argument-regexp ".+" - ;; set content type to: last used > based on current buffer > default type - :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name)))))) - ;; dynamically set choices to content types enabled on khoj backend - :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image"))) +(defun khoj--setup-and-show-menu () + "Create Transient menu for khoj and show it." + ;; Create the Khoj Transient menu + (transient-define-argument khoj--content-type-switch () + :class 'transient-switches + :argument-format "--content-type=%s" + :argument-regexp ".+" + ;; set content type to: last used > based on current buffer > default type + :init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name)))))) + ;; dynamically set choices to content types enabled on khoj backend + :choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image"))) -(transient-define-suffix khoj--search-command (&optional args) - (interactive (list (transient-args transient-current-command))) + (transient-define-suffix khoj--search-command (&optional args) + (interactive (list (transient-args transient-current-command))) (progn ;; set content type to: specified > last used > based on current buffer > default type (setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) @@ -1033,9 +1097,9 @@ Paragraph only starts at first text after blank line." ;; trigger incremental search (call-interactively #'khoj-incremental))) -(transient-define-suffix khoj--find-similar-command (&optional args) - "Find items similar to current item at point." - (interactive (list (transient-args transient-current-command))) + (transient-define-suffix khoj--find-similar-command (&optional args) + "Find items similar to current item at point." + (interactive (list (transient-args transient-current-command))) (progn ;; set content type to: specified > last used > based on current buffer > default type (setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) @@ -1043,37 +1107,38 @@ Paragraph only starts at first text after blank line." 
(setq khoj-results-count (or (transient-arg-value "--results-count=" args) khoj-results-count)) (khoj--find-similar khoj--content-type))) -(transient-define-suffix khoj--update-command (&optional args) - "Call khoj API to update index of specified content type." - (interactive (list (transient-args transient-current-command))) - (let* ((force-update (if (member "--force-update" args) "true" "false")) - ;; set content type to: specified > last used > based on current buffer > default type - (content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) - (type-query (if (equal content-type "all") "" (format "t=%s" content-type))) - (update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update)) - (url-request-method "GET")) - (progn - (setq khoj--content-type content-type) - (url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " ""))))))) + (transient-define-suffix khoj--update-command (&optional args) + "Call khoj API to update index of specified content type." + (interactive (list (transient-args transient-current-command))) + (let* ((force-update (if (member "--force-update" args) "true" "false")) + ;; set content type to: specified > last used > based on current buffer > default type + (content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) + (url-request-method "GET")) + (progn + (setq khoj--content-type content-type) + (khoj--server-index-files force-update content-type)))) -(transient-define-suffix khoj--chat-command (&optional _) - "Command to Chat with Khoj." - (interactive (list (transient-args transient-current-command))) - (khoj--chat)) + (transient-define-suffix khoj--chat-command (&optional _) + "Command to Chat with Khoj." + (interactive (list (transient-args transient-current-command))) + (khoj--chat)) -(transient-define-prefix khoj--menu () - "Create Khoj Menu to Configure and Execute Commands." - [["Configure Search" - ("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count)))) - ("t" "Content Type" khoj--content-type-switch)] - ["Configure Update" - ("-f" "Force Update" "--force-update")]] - [["Act" - ("c" "Chat" khoj--chat-command) - ("s" "Search" khoj--search-command) - ("f" "Find Similar" khoj--find-similar-command) - ("u" "Update" khoj--update-command) - ("q" "Quit" transient-quit-one)]]) + (transient-define-prefix khoj--menu () + "Create Khoj Menu to Configure and Execute Commands." + [["Configure Search" + ("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count)))) + ("t" "Content Type" khoj--content-type-switch)] + ["Configure Update" + ("-f" "Force Update" "--force-update")]] + [["Act" + ("c" "Chat" khoj--chat-command) + ("s" "Search" khoj--search-command) + ("f" "Find Similar" khoj--find-similar-command) + ("u" "Update" khoj--update-command) + ("q" "Quit" transient-quit-one)]]) + + ;; Show the Khoj Transient menu + (khoj--menu)) ;; ---------- @@ -1086,7 +1151,7 @@ Paragraph only starts at first text after blank line." 
(interactive) (when khoj-auto-setup (khoj-setup t)) - (khoj--menu)) + (khoj--setup-and-show-menu)) (provide 'khoj) diff --git a/src/interface/emacs/tests/khoj-tests.el b/src/interface/emacs/tests/khoj-tests.el index 8242d30b..c0d9f4a6 100644 --- a/src/interface/emacs/tests/khoj-tests.el +++ b/src/interface/emacs/tests/khoj-tests.el @@ -206,6 +206,64 @@ Rule everything\n") "Rule everything")) )) + +;; ------------------------------------- +;; Test Helpers to Index Content +;; ------------------------------------- + +(ert-deftest khoj-tests--render-files-to-add-request-body () + "Test files are formatted into a multi-part http request body" + (let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n")) + (act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n"))) + (unwind-protect + (progn + (should + (equal + (khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj") + (format + "\n--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +# Become God\n\ +## Upgrade\n\n\ +Penance to Immortality\n\n\r +--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +## Act\n\n\ +Rule everything\n\n\r\n\ +--khoj--\r\n" upgrade-file act-file)))) + (delete-file upgrade-file) + (delete-file act-file)))) + +(ert-deftest khoj-tests--render-files-to-add-delete-in-request-body () + "Test files are formatted into a multi-part http request body" + (let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n")) + (act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n"))) + (unwind-protect + (progn + (should + (equal + (khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj") + (format + "\n--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +# Become God\n\ +## Upgrade\n\n\ +Penance to Immortality\n\n\r +--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +## Act\n\n\ +Rule everything\n\n\r +--khoj\r\n\ +Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\ +Content-Type: text/org\r\n\r\n\ +\r +--khoj--\r\n" upgrade-file act-file "/tmp/deleted-file.org")))) + (delete-file upgrade-file) + (delete-file act-file)))) (provide 'khoj-tests) diff --git a/src/interface/obsidian/manifest.json b/src/interface/obsidian/manifest.json index 0ecc4fbb..0d5c71b8 100644 --- a/src/interface/obsidian/manifest.json +++ b/src/interface/obsidian/manifest.json @@ -1,7 +1,7 @@ { "id": "khoj", "name": "Khoj", - "version": "0.12.3", + "version": "0.13.0", "minAppVersion": "0.15.0", "description": "An Open-Source AI Personal Assistant for your Digital Brain", "author": "Khoj Inc.", diff --git a/src/interface/obsidian/package.json b/src/interface/obsidian/package.json index eb18132f..beb049fa 100644 --- a/src/interface/obsidian/package.json +++ b/src/interface/obsidian/package.json @@ -1,7 +1,9 @@ { "name": "Khoj", - "version": "0.12.3", - "description": "An AI Personal Assistant for your Digital Brain", + "version": "0.13.0", + "description": "An AI copilot for your Second Brain", + "author": "Debanjum Singh Solanky, Saba Imran ", + "license": "GPL-3.0-or-later", "main": "src/main.js", "scripts": { "dev": "node esbuild.config.mjs", @@ -14,8 +16,6 @@ "AI", 
"assistant" ], - "author": "Debanjum Singh Solanky", - "license": "GPL-3.0-or-later", "devDependencies": { "@types/node": "^16.11.6", "@typescript-eslint/eslint-plugin": "5.29.0", diff --git a/src/interface/obsidian/src/main.ts b/src/interface/obsidian/src/main.ts index 935945dd..1fbed55f 100644 --- a/src/interface/obsidian/src/main.ts +++ b/src/interface/obsidian/src/main.ts @@ -1,12 +1,13 @@ -import { Notice, Plugin } from 'obsidian'; +import { Notice, Plugin, TFile } from 'obsidian'; import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings' import { KhojSearchModal } from 'src/search_modal' import { KhojChatModal } from 'src/chat_modal' -import { configureKhojBackend } from './utils'; +import { configureKhojBackend, updateContentIndex } from './utils'; export default class Khoj extends Plugin { settings: KhojSetting; + indexingTimer: NodeJS.Timeout; async onload() { await this.loadSettings(); @@ -54,6 +55,15 @@ export default class Khoj extends Plugin { // Add a settings tab so the user can configure khoj this.addSettingTab(new KhojSettingTab(this.app, this)); + + // Add scheduled job to update index every 60 minutes + this.indexingTimer = setInterval(async () => { + if (this.settings.autoConfigure) { + this.settings.lastSyncedFiles = await updateContentIndex( + this.app.vault, this.settings, this.settings.lastSyncedFiles + ); + } + }, 60 * 60 * 1000); } async loadSettings() { @@ -72,4 +82,12 @@ export default class Khoj extends Plugin { } this.saveData(this.settings); } + + async onunload() { + // Remove scheduled job to update index at regular cadence + if (this.indexingTimer) + clearInterval(this.indexingTimer); + + this.unload(); + } } diff --git a/src/interface/obsidian/src/settings.ts b/src/interface/obsidian/src/settings.ts index c013f10c..c3f40905 100644 --- a/src/interface/obsidian/src/settings.ts +++ b/src/interface/obsidian/src/settings.ts @@ -1,5 +1,6 @@ -import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian'; +import { App, Notice, PluginSettingTab, Setting, TFile } from 'obsidian'; import Khoj from 'src/main'; +import { updateContentIndex } from './utils'; export interface KhojSetting { enableOfflineChat: boolean; @@ -8,6 +9,7 @@ export interface KhojSetting { khojUrl: string; connectedToBackend: boolean; autoConfigure: boolean; + lastSyncedFiles: TFile[]; } export const DEFAULT_SETTINGS: KhojSetting = { @@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = { connectedToBackend: false, autoConfigure: true, openaiApiKey: '', + lastSyncedFiles: [] } export class KhojSettingTab extends PluginSettingTab { @@ -118,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab { }, 300); this.plugin.registerInterval(progress_indicator); - await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`); - await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`); + this.plugin.settings.lastSyncedFiles = await updateContentIndex( + this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true + ); new Notice('βœ… Updated Khoj index.'); // Reset button once index is updated diff --git a/src/interface/obsidian/src/utils.ts b/src/interface/obsidian/src/utils.ts index 920da583..eb3d4d12 100644 --- a/src/interface/obsidian/src/utils.ts +++ b/src/interface/obsidian/src/utils.ts @@ -1,4 +1,4 @@ -import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian'; +import { FileSystemAdapter, Notice, RequestUrlParam, request, 
Vault, Modal, TFile } from 'obsidian';
import { KhojSetting } from 'src/settings'

export function getVaultAbsolutePath(vault: Vault): string {
@@ -14,18 +14,85 @@ type OpenAIType = null | {
     "api-key": string;
 };

+type OfflineChatType = null | {
+    "chat-model": string;
+    "enable-offline-chat": boolean;
+};
+
 interface ProcessorData {
     conversation: {
         "conversation-logfile": string;
         openai: OpenAIType;
-        "enable-offline-chat": boolean;
+        "offline-chat": OfflineChatType;
+        "tokenizer": null | string;
+        "max-prompt-size": null | number;
     };
 }

+function fileExtensionToMimeType (extension: string): string {
+    switch (extension) {
+        case 'pdf':
+            return 'application/pdf';
+        case 'png':
+            return 'image/png';
+        case 'jpg':
+        case 'jpeg':
+            return 'image/jpeg';
+        case 'md':
+        case 'markdown':
+            return 'text/markdown';
+        case 'org':
+            return 'text/org';
+        default:
+            return 'text/plain';
+    }
+}
+
+export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
+    // Get all markdown, pdf files in the vault
+    console.log(`Khoj: Updating Khoj content index...`)
+    const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
+    const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
+    let countOfFilesToIndex = 0;
+    let countOfFilesToDelete = 0;
+
+    // Add all files to index as multipart form data
+    const formData = new FormData();
+    for (const file of files) {
+        countOfFilesToIndex++;
+        const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
+        const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
+        const fileContent = encoding === 'binary' ? await vault.readBinary(file) : await vault.read(file);
+        formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
+    }
+
+    // Add any previously synced files to be deleted to multipart form data
+    for (const lastSyncedFile of lastSyncedFiles) {
+        if (!files.includes(lastSyncedFile)) {
+            countOfFilesToDelete++;
+            formData.append('files', new Blob([]), lastSyncedFile.path);
+        }
+    }
+
+    // Call Khoj backend to update index with all markdown, pdf files
+    const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
+        method: 'POST',
+        headers: {
+            'x-api-key': 'secret',
+        },
+        body: formData,
+    });
+
+    if (!response.ok) {
+        new Notice(`❗️Failed to update Khoj content index. Ensure the Khoj server is connected, or raise an issue on the Khoj Discord/GitHub.\nError: ${response.statusText}`);
+    } else {
+        console.log(`βœ… Refreshed Khoj content index. 
Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`); + } + + return files; +} + export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { - let vaultPath = getVaultAbsolutePath(vault); - let mdInVault = `${vaultPath}/**/*.md`; - let pdfInVault = `${vaultPath}/**/*.pdf`; let khojConfigUrl = `${setting.khojUrl}/api/config/data`; // Check if khoj backend is configured, note if cannot connect to backend @@ -43,124 +110,33 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n if (!setting.connectedToBackend) return; // Set index name from the path of the current vault - let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_'); // Get default config fields from khoj backend let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); - let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]); - let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]); let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); - let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; + let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; + let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"]; // Get current config if khoj backend configured, else get default config from khoj backend await request(khoj_already_configured ? 
khojConfigUrl : `${khojConfigUrl}/default`) .then(response => JSON.parse(response)) .then(data => { - khoj_already_configured = data["content-type"] != null; - // If khoj backend not configured yet - if (!khoj_already_configured) { - // Create khoj content-type config with only markdown configured - data["content-type"] = { - "markdown": { - "input-filter": [mdInVault], - "input-files": null, - "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, - } - } - - const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf'); - - if (hasPdfFiles) { - data["content-type"]["pdf"] = { - "input-filter": [pdfInVault], - "input-files": null, - "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, - } - } - } - // Else if khoj config has no markdown content config - else if (!data["content-type"]["markdown"]) { - // Add markdown config to khoj content-type config - // Set markdown config to index markdown files in configured obsidian vault - data["content-type"]["markdown"] = { - "input-filter": [mdInVault], - "input-files": null, - "embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`, - } - } - // Else if khoj is not configured to index markdown files in configured obsidian vault - else if ( - data["content-type"]["markdown"]["input-files"] != null || - data["content-type"]["markdown"]["input-filter"] == null || - data["content-type"]["markdown"]["input-filter"].length != 1 || - data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) { - // Update markdown config in khoj content-type config - // Set markdown config to only index markdown files in configured obsidian vault - let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]); - data["content-type"]["markdown"] = { - "input-filter": [mdInVault], - "input-files": null, - "embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`, - } - } - - if (khoj_already_configured && !data["content-type"]["pdf"]) { - const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf'); - - if (hasPdfFiles) { - data["content-type"]["pdf"] = { - "input-filter": [pdfInVault], - "input-files": null, - "embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`, - } - } else { - data["content-type"]["pdf"] = null; - } - } - // Else if khoj is not configured to index pdf files in configured obsidian vault - else if (khoj_already_configured && - ( - data["content-type"]["pdf"]["input-files"] != null || - data["content-type"]["pdf"]["input-filter"] == null || - data["content-type"]["pdf"]["input-filter"].length != 1 || - data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) { - - let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf'); - - if (hasPdfFiles) { - // Update pdf config in khoj content-type config - // Set pdf config to only index pdf files in configured obsidian vault - let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]); - data["content-type"]["pdf"] = { - "input-filter": [pdfInVault], - "input-files": null, - "embeddings-file": 
`${khojPdfIndexDirectory}/${indexName}.pt`, - "compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`, - } - } else { - data["content-type"]["pdf"] = null; - } - } - let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`; - let processorData: ProcessorData = { "conversation": { "conversation-logfile": conversationLogFile, "openai": null, - "enable-offline-chat": setting.enableOfflineChat, + "offline-chat": { + "chat-model": khojDefaultOfflineChatModelName, + "enable-offline-chat": setting.enableOfflineChat, + }, + "tokenizer": null, + "max-prompt-size": null, } } // If the Open AI API Key was configured in the plugin settings if (!!setting.openaiApiKey) { - - let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName; - + let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultOpenAIChatModelName; processorData = { "conversation": { "conversation-logfile": conversationLogFile, @@ -168,7 +144,12 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n "chat-model": openAIChatModel, "api-key": setting.openaiApiKey, }, - "enable-offline-chat": setting.enableOfflineChat, + "offline-chat": { + "chat-model": khojDefaultOfflineChatModelName, + "enable-offline-chat": setting.enableOfflineChat, + }, + "tokenizer": null, + "max-prompt-size": null, }, } } @@ -197,12 +178,8 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) { method: 'POST', contentType: 'application/json', }; - // Save khojConfig on khoj backend at khojConfigUrl - await request(requestContent) - // Refresh khoj search index after updating config - .then(_ => request(`${khojUrl}/api/update?t=markdown`)) - .then(_ => request(`${khojUrl}/api/update?t=pdf`)); + request(requestContent); } function getIndexDirectoryFromBackendConfig(filepath: string) { diff --git a/src/interface/obsidian/versions.json b/src/interface/obsidian/versions.json index cf60cf10..9cc1eb5c 100644 --- a/src/interface/obsidian/versions.json +++ b/src/interface/obsidian/versions.json @@ -24,5 +24,6 @@ "0.12.0": "0.15.0", "0.12.1": "0.15.0", "0.12.2": "0.15.0", - "0.12.3": "0.15.0" + "0.12.3": "0.15.0", + "0.13.0": "0.15.0" } diff --git a/src/khoj/configure.py b/src/khoj/configure.py index e0a06601..5f60c663 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -28,7 +28,7 @@ from khoj.utils.config import ( ) from khoj.utils.helpers import resolve_absolute_path, merge_dicts from khoj.utils.fs_syncer import collect_files -from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig +from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig from khoj.routers.indexer import configure_content, load_content, configure_search @@ -136,7 +136,7 @@ def configure_routes(app): app.include_router(api, prefix="/api") app.include_router(api_beta, prefix="/api/beta") - app.include_router(indexer, prefix="/v1/indexer") + app.include_router(indexer, prefix="/api/v1/index") app.include_router(web_client) app.include_router(auth_router, prefix="/auth") @@ -156,7 +156,7 @@ if not state.demo: state.content_index = configure_content( state.content_index, state.config.content_type, all_files, state.search_models ) - logger.info("πŸ“¬ Content index updated via Scheduler") + logger.info("πŸ“ͺ Content index updated via Scheduler") except 
Exception as e: logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True) @@ -207,9 +207,7 @@ def configure_conversation_processor( conversation_config=ConversationProcessorConfig( conversation_logfile=conversation_logfile, openai=(conversation_config.openai if (conversation_config is not None) else None), - enable_offline_chat=( - conversation_config.enable_offline_chat if (conversation_config is not None) else False - ), + offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(), ) ) else: diff --git a/src/khoj/interface/web/config.html b/src/khoj/interface/web/config.html index 3b295a88..d41ca26b 100644 --- a/src/khoj/interface/web/config.html +++ b/src/khoj/interface/web/config.html @@ -236,7 +236,7 @@
-

-                    Setup chat using OpenAI
+                    Setup online chat using OpenAI
-                    Setup offline chat (Llama V2)
+                    Setup offline chat
@@ -346,7 +346,7 @@
             featuresHintText.classList.add("show");
         }

-        fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
+        fetch('/api/config/data/processor/conversation/offline_chat' + '?enable_offline_chat=' + enable, {
             method: 'POST',
             headers: {
                 'Content-Type': 'application/json',
diff --git a/src/khoj/interface/web/content_type_input.html b/src/khoj/interface/web/content_type_input.html
index 3ef512f8..1f0dfa76 100644
--- a/src/khoj/interface/web/content_type_input.html
+++ b/src/khoj/interface/web/content_type_input.html
@@ -34,7 +34,7 @@
     {% else %}
     {% for input_filter in current_config['input_filter'] %}
-
+
     {% endfor %}
     {% endif %}
@@ -106,17 +106,18 @@
         submit.addEventListener("click", function(event) {
             event.preventDefault();

-            let globFormat = "**/*."
+            let globFormat = "**/*"
             let suffixes = [];
             if ('{{content_type}}' == "markdown")
-                suffixes = ["md", "markdown"]
+                suffixes = [".md", ".markdown"]
             else if ('{{content_type}}' == "org")
-                suffixes = ["org"]
+                suffixes = [".org"]
             else if ('{{content_type}}' === "pdf")
-                suffixes = ["pdf"]
+                suffixes = [".pdf"]
             else if ('{{content_type}}' === "plaintext")
-                suffixes = ['*']
+                suffixes = ['.*']

+            let globs = suffixes.map(x => `${globFormat}${x}`)
             var inputFileNodes = document.getElementsByName("input-files");
             var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
@@ -124,10 +125,19 @@
             var inputFilter = [];
             var nodes = getValidInputNodes(inputFilterNodes);
+
+            // Check for glob characters in the path. If present, use the path
+            // as-is; otherwise append our default glob patterns to it.
+            const glob_regex = /([*?\[\]])/;
             if (nodes.length > 0) {
                 for (var i = 0; i < nodes.length; i++) {
-                    for (var j = 0; j < suffixes.length; j++) {
-                        inputFilter.push(nodes[i].value + globFormat + suffixes[j]);
+                    if (glob_regex.test(nodes[i].value)) {
+                        inputFilter.push(nodes[i].value);
+                    } else {
+                        for (var j = 0; j < globs.length; j++) {
+                            inputFilter.push(nodes[i].value + globs[j]);
+                        }
                     }
                 }
             }
diff --git a/src/khoj/migrations/migrate_offline_chat_schema.py b/src/khoj/migrations/migrate_offline_chat_schema.py
new file mode 100644
index 00000000..873783a3
--- /dev/null
+++ b/src/khoj/migrations/migrate_offline_chat_schema.py
@@ -0,0 +1,83 @@
+"""
+Current format of khoj.yml
+---
+app:
+  ...
+content-type:
+  ...
+processor:
+  conversation:
+    enable-offline-chat: false
+    conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
+    openai:
+      ...
+search-type:
+  ...
+
+New format of khoj.yml
+---
+app:
+  ...
+content-type:
+  ...
+processor:
+  conversation:
+    offline-chat:
+      enable-offline-chat: false
+      chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
+    tokenizer: null
+    max-prompt-size: null
+    conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
+    openai:
+      ...
+search-type:
+  ...
+"""
+import logging
+from packaging import version
+
+from khoj.utils.yaml import load_config_from_file, save_config_to_file
+
+
+logger = logging.getLogger(__name__)
+
+
+def migrate_offline_chat_schema(args):
+    schema_version = "0.12.3"
+    raw_config = load_config_from_file(args.config_file)
+    previous_version = raw_config.get("version")
+
+    if "processor" not in raw_config:
+        return args
+    if raw_config["processor"] is None:
+        return args
+    if "conversation" not in raw_config["processor"]:
+        return args
+
+    if previous_version is None or version.parse(previous_version) < version.parse("0.12.3"):
+        logger.info(
+            f"Upgrading config schema to {schema_version} from {previous_version} to make (offline) chat more configurable"
+        )
+        raw_config["version"] = schema_version
+
+        # Create max-prompt-size field in conversation processor schema
+        raw_config["processor"]["conversation"]["max-prompt-size"] = None
+        raw_config["processor"]["conversation"]["tokenizer"] = None
+
+        # Create offline chat schema based on existing enable_offline_chat field in khoj config schema
+        offline_chat_model = (
+            raw_config["processor"]["conversation"]
+            .get("offline-chat", {})
+            .get("chat-model", "llama-2-7b-chat.ggmlv3.q4_0.bin")
+        )
+        raw_config["processor"]["conversation"]["offline-chat"] = {
+            "enable-offline-chat": raw_config["processor"]["conversation"].get("enable-offline-chat", False),
+            "chat-model": offline_chat_model,
+        }
+
+        # Delete old enable-offline-chat field from conversation processor schema
+        if "enable-offline-chat" in raw_config["processor"]["conversation"]:
+            del raw_config["processor"]["conversation"]["enable-offline-chat"]
+
+    save_config_to_file(raw_config, args.config_file)
+    return args
diff --git a/src/khoj/processor/conversation/gpt4all/chat_model.py b/src/khoj/processor/conversation/gpt4all/chat_model.py
index 9bc9ea52..7e92d002 100644
--- a/src/khoj/processor/conversation/gpt4all/chat_model.py
+++ b/src/khoj/processor/conversation/gpt4all/chat_model.py
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)

 def extract_questions_offline(
     text: str,
-    model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
+    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
     loaded_model: Union[Any, None] = None,
     conversation_log={},
     use_history: bool = True,
@@ -113,7 +113,7 @@ def filter_questions(questions: List[str]):
     ]
     filtered_questions = []
     for q in questions:
-        if not any([word in q.lower() for word in hint_words]):
+        if not any([word in q.lower() for word in hint_words]) and not is_none_or_empty(q):
             filtered_questions.append(q)
     return filtered_questions
@@ -123,10 +123,12 @@ def converse_offline(
     references,
     user_query,
     conversation_log={},
-    model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
+    model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
     loaded_model: Union[Any, None] = None,
     completion_func=None,
     conversation_command=ConversationCommand.Default,
+    max_prompt_size=None,
+    tokenizer_name=None,
 ) -> Union[ThreadedGenerator, Iterator[str]]:
     """
     Converse with user using Llama
@@ -158,6 +160,8 @@
         prompts.system_prompt_message_llamav2,
         conversation_log,
         model_name=model,
+        max_prompt_size=max_prompt_size,
+        tokenizer_name=tokenizer_name,
     )

     g = ThreadedGenerator(references, completion_func=completion_func)
diff --git a/src/khoj/processor/conversation/gpt4all/model_metadata.py b/src/khoj/processor/conversation/gpt4all/model_metadata.py
deleted file mode 100644
index 065e3720..00000000
--- a/src/khoj/processor/conversation/gpt4all/model_metadata.py
+++ /dev/null
@@ -1,3 +0,0 @@
-model_name_to_url = { - "llama-2-7b-chat.ggmlv3.q4_K_S.bin": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin" -} diff --git a/src/khoj/processor/conversation/gpt4all/utils.py b/src/khoj/processor/conversation/gpt4all/utils.py index 4042fbe2..d5201780 100644 --- a/src/khoj/processor/conversation/gpt4all/utils.py +++ b/src/khoj/processor/conversation/gpt4all/utils.py @@ -1,24 +1,8 @@ -import os import logging -import requests -import hashlib -from tqdm import tqdm - -from khoj.processor.conversation.gpt4all import model_metadata logger = logging.getLogger(__name__) -expected_checksum = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "cfa87b15d92fb15a2d7c354b0098578b"} - - -def get_md5_checksum(filename: str): - hash_md5 = hashlib.md5() - with open(filename, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - def download_model(model_name: str): try: @@ -27,57 +11,12 @@ def download_model(model_name: str): logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.") raise e - url = model_metadata.model_name_to_url.get(model_name) - model_path = os.path.expanduser(f"~/.cache/gpt4all/") - if not url: - logger.debug(f"Model {model_name} not found in model metadata. Skipping download.") - return GPT4All(model_name=model_name, model_path=model_path) - - filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}") - if os.path.exists(filename): - # Check if the user is connected to the internet - try: - requests.get("https://www.google.com/", timeout=5) - except: - logger.debug("User is offline. Disabling allowed download flag") - return GPT4All(model_name=model_name, model_path=model_path, allow_download=False) - return GPT4All(model_name=model_name, model_path=model_path) - - # Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file - tmp_filename = filename + ".tmp" - + # Use GPU for Chat Model, if available try: - os.makedirs(os.path.dirname(tmp_filename), exist_ok=True) - logger.debug(f"Downloading model {model_name} from {url} to {filename}...") - with requests.get(url, stream=True) as r: - r.raise_for_status() - total_size = int(r.headers.get("content-length", 0)) - with open(tmp_filename, "wb") as f, tqdm( - unit="B", # unit string to be displayed. - unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc. - unit_divisor=1024, # is used when unit_scale is true - total=total_size, # the total iteration. - desc=model_name, # prefix to be displayed on progress bar. - ) as progress_bar: - for chunk in r.iter_content(chunk_size=8192): - f.write(chunk) - progress_bar.update(len(chunk)) + model = GPT4All(model_name=model_name, device="gpu") + logger.debug("Loaded chat model to GPU.") + except ValueError: + model = GPT4All(model_name=model_name) + logger.debug("Loaded chat model to CPU.") - # Verify the checksum - if expected_checksum.get(model_name) != get_md5_checksum(tmp_filename): - logger.error( - f"Checksum verification failed for {filename}. Removing the tmp file. Offline model will not be available." 
-            )
-            os.remove(tmp_filename)
-            raise ValueError(f"Checksum verification failed for downloading {model_name} from {url}.")
-
-        # Move the tmp file to the actual file
-        os.rename(tmp_filename, filename)
-        logger.debug(f"Successfully downloaded model {model_name} from {url} to {filename}")
-        return GPT4All(model_name)
-    except Exception as e:
-        logger.error(f"Failed to download model {model_name} from {url} to {filename}. Error: {e}", exc_info=True)
-        # Remove the tmp file if it exists
-        if os.path.exists(tmp_filename):
-            os.remove(tmp_filename)
-        return None
+    return model
diff --git a/src/khoj/processor/conversation/openai/gpt.py b/src/khoj/processor/conversation/openai/gpt.py
index 96510586..73b4f176 100644
--- a/src/khoj/processor/conversation/openai/gpt.py
+++ b/src/khoj/processor/conversation/openai/gpt.py
@@ -116,6 +116,8 @@ def converse(
     temperature: float = 0.2,
     completion_func=None,
     conversation_command=ConversationCommand.Default,
+    max_prompt_size=None,
+    tokenizer_name=None,
 ):
     """
     Converse with user using OpenAI's ChatGPT
@@ -141,6 +143,8 @@
         prompts.personality.format(),
         conversation_log,
         model,
+        max_prompt_size,
+        tokenizer_name,
     )
     truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
     logger.debug(f"Conversation Context for GPT: {truncated_messages}")
diff --git a/src/khoj/processor/conversation/prompts.py b/src/khoj/processor/conversation/prompts.py
index 4de3c623..d487609d 100644
--- a/src/khoj/processor/conversation/prompts.py
+++ b/src/khoj/processor/conversation/prompts.py
@@ -23,7 +23,7 @@ no_notes_found = PromptTemplate.from_template(
     """.strip()
 )

-system_prompt_message_llamav2 = f"""You are Khoj, a friendly, smart and helpful personal assistant.
+system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant.
 Using your general knowledge and our past conversations as context, answer the following question.
 If you do not know the answer, say 'I don't know.'"""

@@ -51,13 +51,13 @@ extract_questions_system_prompt_llamav2 = PromptTemplate.from_template(

 general_conversation_llamav2 = PromptTemplate.from_template(
     """
-[INST]{query}[/INST]
+[INST] {query} [/INST]
 """.strip()
 )

 chat_history_llamav2_from_user = PromptTemplate.from_template(
     """
-[INST]{message}[/INST]
+[INST] {message} [/INST]
 """.strip()
 )

@@ -69,7 +69,7 @@ chat_history_llamav2_from_assistant = PromptTemplate.from_template(

 conversation_llamav2 = PromptTemplate.from_template(
     """
-[INST]{query}[/INST]
+[INST] {query} [/INST]
 """.strip()
 )

@@ -91,7 +91,7 @@ Question: {query}

 notes_conversation_llamav2 = PromptTemplate.from_template(
     """
-Notes:
+User's Notes:
 {references}
 Question: {query}
 """.strip()
 )
@@ -134,19 +134,25 @@ Answer (in second person):"""

 extract_questions_llamav2_sample = PromptTemplate.from_template(
     """
-[INST]<<SYS>>Current Date: {current_date}<</SYS>>[/INST]
-[INST]How was my trip to Cambodia?[/INST][]
-[INST]Who did I visit the temple with on that trip?[/INST]Who did I visit the temple with in Cambodia?
-[INST]How should I take care of my plants?[/INST]What kind of plants do I have? What issues do my plants have?
-[INST]How many tennis balls fit in the back of a 2002 Honda Civic?[/INST]What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?
-[INST]What did I do for Christmas last year?[/INST]What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'
-[INST]How are you feeling today?[/INST]
-[INST]Is Alice older than Bob?[/INST]When was Alice born? What is Bob's age?
-[INST]<<SYS>>
+[INST] <<SYS>>Current Date: {current_date}<</SYS>> [/INST]
+[INST] How was my trip to Cambodia? [/INST]
+How was my trip to Cambodia?
+[INST] Who did I visit the temple with on that trip? [/INST]
+Who did I visit the temple with in Cambodia?
+[INST] How should I take care of my plants? [/INST]
+What kind of plants do I have? What issues do my plants have?
+[INST] How many tennis balls fit in the back of a 2002 Honda Civic? [/INST]
+What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?
+[INST] What did I do for Christmas last year? [/INST]
+What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'
+[INST] How are you feeling today? [/INST]
+[INST] Is Alice older than Bob? [/INST]
+When was Alice born? What is Bob's age?
+[INST] <<SYS>>
 Use these notes from the user's previous conversations to provide a response:
 {chat_history}
-<</SYS>>[/INST]
-[INST]{query}[/INST]
+<</SYS>> [/INST]
+[INST] {query} [/INST]
 """
 )
diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
index 4a92c367..83d51f2d 100644
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -3,24 +3,27 @@ import logging
 from time import perf_counter
 import json
 from datetime import datetime
+import queue

 import tiktoken

 # External packages
 from langchain.schema import ChatMessage
-from transformers import LlamaTokenizerFast
+from transformers import AutoTokenizer

 # Internal Packages
-import queue
 from khoj.utils.helpers import merge_dicts

+
 logger = logging.getLogger(__name__)

-max_prompt_size = {
+model_to_prompt_size = {
     "gpt-3.5-turbo": 4096,
     "gpt-4": 8192,
-    "llama-2-7b-chat.ggmlv3.q4_K_S.bin": 1548,
+    "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
     "gpt-3.5-turbo-16k": 15000,
 }
-tokenizer = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "hf-internal-testing/llama-tokenizer"}
+model_to_tokenizer = {
+    "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
+}


 class ThreadedGenerator:
@@ -82,9 +85,26 @@ def message_to_log(


 def generate_chatml_messages_with_context(
-    user_message, system_message, conversation_log={}, model_name="gpt-3.5-turbo", lookback_turns=2
+    user_message,
+    system_message,
+    conversation_log={},
+    model_name="gpt-3.5-turbo",
+    max_prompt_size=None,
+    tokenizer_name=None,
 ):
     """Generate messages for ChatGPT with context from previous conversation"""
+    # Set max prompt size from user config, pre-configured for model or to default prompt size
+    try:
+        max_prompt_size = max_prompt_size or model_to_prompt_size[model_name]
+    except:
+        max_prompt_size = 2000
+        logger.warning(
+            f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to use a longer context window."
+ ) + + # Scale lookback turns proportional to max prompt size supported by model + lookback_turns = max_prompt_size // 750 + # Extract Chat History for Context chat_logs = [] for chat in conversation_log.get("chat", []): @@ -105,19 +125,28 @@ def generate_chatml_messages_with_context( messages = user_chatml_message + rest_backnforths + system_chatml_message # Truncate oldest messages from conversation history until under max supported prompt size by model - messages = truncate_messages(messages, max_prompt_size[model_name], model_name) + messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name) # Return message in chronological order return messages[::-1] -def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) -> list[ChatMessage]: +def truncate_messages( + messages: list[ChatMessage], max_prompt_size, model_name: str, tokenizer_name=None +) -> list[ChatMessage]: """Truncate messages to fit within max prompt size supported by model""" - if "llama" in model_name: - encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name]) - else: - encoder = tiktoken.encoding_for_model(model_name) + try: + if model_name.startswith("gpt-"): + encoder = tiktoken.encoding_for_model(model_name) + else: + encoder = AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name]) + except: + default_tokenizer = "hf-internal-testing/llama-tokenizer" + encoder = AutoTokenizer.from_pretrained(default_tokenizer) + logger.warning( + f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing." + ) system_message = messages.pop() system_message_tokens = len(encoder.encode(system_message.content)) diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_jsonl.py index 77c34617..c24d9940 100644 --- a/src/khoj/processor/pdf/pdf_to_jsonl.py +++ b/src/khoj/processor/pdf/pdf_to_jsonl.py @@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl): # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path tmp_file = f"tmp_pdf_file.pdf" with open(f"{tmp_file}", "wb") as f: - bytes = base64.b64decode(pdf_files[pdf_file]) + bytes = pdf_files[pdf_file] f.write(bytes) loader = PyMuPDFLoader(f"{tmp_file}") pdf_entries_per_file = [page.page_content for page in loader.load()] diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 2ff6bab0..345429e8 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -30,6 +30,7 @@ from khoj.utils.rawconfig import ( GithubContentConfig, NotionContentConfig, ConversationProcessorConfig, + OfflineChatProcessorConfig, ) from khoj.utils.helpers import resolve_absolute_path from khoj.utils.state import SearchType @@ -185,6 +186,10 @@ if not state.demo: state.content_index.markdown = None elif content_type == "org": state.content_index.org = None + elif content_type == "plaintext": + state.content_index.plaintext = None + else: + logger.warning(f"Request to delete unknown content type: {content_type} via API") try: save_config_to_file_updated_state() @@ -284,10 +289,11 @@ if not state.demo: except Exception as e: return {"status": "error", "message": str(e)} - @api.post("/config/data/processor/conversation/enable_offline_chat", status_code=200) + @api.post("/config/data/processor/conversation/offline_chat", status_code=200) async def set_processor_enable_offline_chat_config_data( request: Request, 
        enable_offline_chat: bool,
+        offline_chat_model: Optional[str] = None,
         client: Optional[str] = None,
     ):
         _initialize_config()
@@ -301,7 +307,12 @@
             state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile))  # type: ignore

         assert state.config.processor.conversation is not None
-        state.config.processor.conversation.enable_offline_chat = enable_offline_chat
+        if state.config.processor.conversation.offline_chat is None:
+            state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig()
+
+        state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat
+        if offline_chat_model is not None:
+            state.config.processor.conversation.offline_chat.chat_model = offline_chat_model

         state.processor_config = configure_processor(state.config.processor, state.processor_config)

         update_telemetry_state(
@@ -322,7 +333,7 @@
 # Create Routes
 @api.get("/config/data/default")
 def get_default_config_data():
-    return constants.default_config
+    return constants.empty_config


 @api.get("/config/types", response_model=List[str])
@@ -387,7 +398,7 @@ async def search(
     # Encode query with filter terms removed
     defiltered_query = user_query
     for filter in [DateFilter(), WordFilter(), FileFilter()]:
-        defiltered_query = filter.defilter(user_query)
+        defiltered_query = filter.defilter(defiltered_query)

     encoded_asymmetric_query = None
     if t == SearchType.All or t != SearchType.Image:
@@ -622,7 +633,7 @@ def update(
     if state.processor_config:
         components.append("Conversation processor")
     components_msg = ", ".join(components)
-    logger.info(f"📬 {components_msg} updated via API")
+    logger.info(f"📪 {components_msg} updated via API")

     update_telemetry_state(
         request=request,
@@ -702,12 +713,18 @@ async def chat(
 ) -> Response:
     perform_chat_checks()
     conversation_command = get_conversation_command(query=q, any_references=True)
+
+    q = q.replace(f"/{conversation_command.value}", "").strip()
+
     compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
         request, q, (n or 5), conversation_command
     )
-    conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references))
+
+    if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references):
+        conversation_command = ConversationCommand.General
+
     if conversation_command == ConversationCommand.Help:
-        model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai"
+        model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai"
         formatted_help = help_message.format(model=model_type, version=state.khoj_version)
         return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200)
@@ -768,23 +785,21 @@ async def extract_references_and_questions(
         logger.warning(
             "No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
) - return compiled_references, inferred_queries + return compiled_references, inferred_queries, q if conversation_type == ConversationCommand.General: return compiled_references, inferred_queries, q # Extract filter terms from user message defiltered_query = q - filter_terms = [] for filter in [DateFilter(), WordFilter(), FileFilter()]: - filter_terms += filter.get_filter_terms(q) - defiltered_query = filter.defilter(q) - filters_in_query = " ".join(filter_terms) + defiltered_query = filter.defilter(defiltered_query) + filters_in_query = q.replace(defiltered_query, "").strip() # Infer search queries from user message with timer("Extracting search queries took", logger): # If we've reached here, either the user has enabled offline chat or the openai model is enabled. - if state.processor_config.conversation.enable_offline_chat: + if state.processor_config.conversation.offline_chat.enable_offline_chat: loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model inferred_queries = extract_questions_offline( defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False @@ -800,7 +815,7 @@ async def extract_references_and_questions( with timer("Searching knowledge base took", logger): result_list = [] for query in inferred_queries: - n_items = min(n, 3) if state.processor_config.conversation.enable_offline_chat else n + n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n result_list.extend( await search( f"{query} {filters_in_query}", diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 267af330..6b42f29c 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -113,7 +113,7 @@ def generate_chat_response( meta_log=meta_log, ) - if state.processor_config.conversation.enable_offline_chat: + if state.processor_config.conversation.offline_chat.enable_offline_chat: loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model chat_response = converse_offline( references=compiled_references, @@ -122,6 +122,9 @@ def generate_chat_response( conversation_log=meta_log, completion_func=partial_completion, conversation_command=conversation_command, + model=state.processor_config.conversation.offline_chat.chat_model, + max_prompt_size=state.processor_config.conversation.max_prompt_size, + tokenizer_name=state.processor_config.conversation.tokenizer, ) elif state.processor_config.conversation.openai_model: @@ -135,6 +138,8 @@ def generate_chat_response( api_key=api_key, completion_func=partial_completion, conversation_command=conversation_command, + max_prompt_size=state.processor_config.conversation.max_prompt_size, + tokenizer_name=state.processor_config.conversation.tokenizer, ) except Exception as e: diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index f5b2b418..a9656050 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -1,11 +1,11 @@ # Standard Packages import logging -import sys from typing import Optional, Union, Dict # External Packages -from fastapi import APIRouter, HTTPException, Header, Request, Body, Response +from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile from pydantic import BaseModel +from khoj.routers.helpers import update_telemetry_state # Internal Packages from khoj.utils import state, constants @@ -56,42 +56,30 @@ class IndexerInput(BaseModel): plaintext: Optional[dict[str, str]] = None -@indexer.post("/batch") -async def index_batch( 
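+# Accept files as multipart/form-data uploads and update the content index, replacing the old JSON batch endpoint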
+@indexer.post("/update")
+async def update(
     request: Request,
+    files: list[UploadFile],
     x_api_key: str = Header(None),
-    regenerate: bool = False,
-    search_type: Optional[Union[state.SearchType, str]] = None,
+    force: bool = False,
+    t: Optional[Union[state.SearchType, str]] = None,
+    client: Optional[str] = None,
+    user_agent: Optional[str] = Header(None),
+    referer: Optional[str] = Header(None),
+    host: Optional[str] = Header(None),
 ):
     if x_api_key != "secret":
         raise HTTPException(status_code=401, detail="Invalid API Key")
     state.config_lock.acquire()
     try:
-        logger.info(f"Received batch indexing request")
-        index_batch_request_acc = b""
-        async for chunk in request.stream():
-            index_batch_request_acc += chunk
-        data_bytes = sys.getsizeof(index_batch_request_acc)
-        unit = "KB"
-        data_size = data_bytes / 1024
-        if data_size > 1000:
-            unit = "MB"
-            data_size = data_size / 1024
-        if data_size > 1000:
-            unit = "GB"
-            data_size = data_size / 1024
-        data_size_metric = f"{data_size:.2f} {unit}"
-        logger.info(f"Received {data_size_metric} of data")
-        index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc)
-        logger.info(f"Received {len(index_batch_request.files)} files")
-
+        logger.info(f"📬 Updating content index via API call by {client} client")
         org_files: Dict[str, str] = {}
         markdown_files: Dict[str, str] = {}
         pdf_files: Dict[str, str] = {}
         plaintext_files: Dict[str, str] = {}

-        for file in index_batch_request.files:
-            file_type = get_file_type(file.path)
+        for file in files:
+            file_type, encoding = get_file_type(file.content_type)
             dict_to_update = None
             if file_type == "org":
                 dict_to_update = org_files
@@ -103,9 +91,11 @@ async def index_batch(
                 dict_to_update = plaintext_files

             if dict_to_update is not None:
-                dict_to_update[file.path] = file.content
+                dict_to_update[file.filename] = (
+                    file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
+                )
             else:
-                logger.info(f"Skipping unsupported streamed file: {file.path}")
+                logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")

         indexer_input = IndexerInput(
             org=org_files,
@@ -115,7 +105,7 @@ async def index_batch(
         )

         if state.config == None:
-            logger.info("First run, initializing state.")
+            logger.info("📬 Initializing content index on first run.")
             default_full_config = FullConfig(
                 content_type=None,
                 search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
@@ -142,15 +132,30 @@ async def index_batch(
             state.config.content_type,
             indexer_input.dict(),
             state.search_models,
-            regenerate=regenerate,
-            t=search_type,
+            regenerate=force,
+            t=t,
             full_corpus=False,
         )
     except Exception as e:
-        logger.error(f"Failed to process batch indexing request: {e}", exc_info=True)
+        logger.error(
+            f"🚨 Failed to update {t} content index (force={force}) triggered via API call by {client} client: {e}",
+            exc_info=True,
+        )
     finally:
         state.config_lock.release()
+
+    update_telemetry_state(
+        request=request,
+        telemetry_type="api",
+        api="index/update",
+        client=client,
+        user_agent=user_agent,
+        referer=referer,
+        host=host,
+    )
+
+    logger.info(f"📪 Content index updated via API call by {client} client")
     return Response(content="OK", status_code=200)
diff --git a/src/khoj/utils/cli.py b/src/khoj/utils/cli.py
index 78a9ccf9..1d6106cb 100644
--- a/src/khoj/utils/cli.py
+++ b/src/khoj/utils/cli.py
@@ -9,6 +9,7 @@ from khoj.utils.yaml import parse_config_from_file
 from khoj.migrations.migrate_version import migrate_config_to_version
 from
khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema from khoj.migrations.migrate_offline_model import migrate_offline_model +from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema def cli(args=None): @@ -55,7 +56,12 @@ def cli(args=None): def run_migrations(args): - migrations = [migrate_config_to_version, migrate_processor_conversation_schema, migrate_offline_model] + migrations = [ + migrate_config_to_version, + migrate_processor_conversation_schema, + migrate_offline_model, + migrate_offline_chat_schema, + ] for migration in migrations: args = migration(args) return args diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index a6532346..5b3b9f6e 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -12,6 +12,8 @@ from khoj.processor.conversation.gpt4all.utils import download_model # External Packages import torch +from khoj.utils.rawconfig import OfflineChatProcessorConfig + logger = logging.getLogger(__name__) # Internal Packages @@ -84,7 +86,6 @@ class SearchModels: @dataclass class GPT4AllProcessorConfig: - chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_K_S.bin" loaded_model: Union[Any, None] = None @@ -95,18 +96,20 @@ class ConversationProcessorConfigModel: ): self.openai_model = conversation_config.openai self.gpt4all_model = GPT4AllProcessorConfig() - self.enable_offline_chat = conversation_config.enable_offline_chat + self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig() + self.max_prompt_size = conversation_config.max_prompt_size + self.tokenizer = conversation_config.tokenizer self.conversation_logfile = Path(conversation_config.conversation_logfile) self.chat_session: List[str] = [] self.meta_log: dict = {} - if self.enable_offline_chat: + if self.offline_chat.enable_offline_chat: try: - self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model) - except ValueError as e: + self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model) + except Exception as e: + self.offline_chat.enable_offline_chat = False self.gpt4all_model.loaded_model = None logger.error(f"Error while loading offline chat model: {e}", exc_info=True) - self.enable_offline_chat = False else: self.gpt4all_model.loaded_model = None diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index c5a67714..9ed97798 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -6,6 +6,64 @@ empty_escape_sequences = "\n|\r|\t| " app_env_filepath = "~/.khoj/env" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" +empty_config = { + "content-type": { + "org": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz", + "embeddings-file": "~/.khoj/content/org/org_embeddings.pt", + "index-heading-entries": False, + }, + "markdown": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz", + "embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt", + }, + "pdf": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz", + "embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt", + }, + "plaintext": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz", + "embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt", + }, + }, + "search-type": { + "symmetric": { + 
"encoder": "sentence-transformers/all-MiniLM-L6-v2", + "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "model_directory": "~/.khoj/search/symmetric/", + }, + "asymmetric": { + "encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", + "cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "model_directory": "~/.khoj/search/asymmetric/", + }, + "image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"}, + }, + "processor": { + "conversation": { + "openai": { + "api-key": None, + "chat-model": "gpt-3.5-turbo", + }, + "offline-chat": { + "enable-offline-chat": False, + "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin", + }, + "tokenizer": None, + "max-prompt-size": None, + "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", + } + }, +} + # default app config to use default_config = { "content-type": { @@ -72,7 +130,12 @@ default_config = { "api-key": None, "chat-model": "gpt-3.5-turbo", }, - "enable-offline-chat": False, + "offline-chat": { + "enable-offline-chat": False, + "chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin", + }, + "tokenizer": None, + "max-prompt-size": None, "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", } }, diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py index 6a777bd7..44fc70ad 100644 --- a/src/khoj/utils/fs_syncer.py +++ b/src/khoj/utils/fs_syncer.py @@ -1,6 +1,6 @@ import logging import glob -import base64 +import os from typing import Optional from bs4 import BeautifulSoup @@ -39,13 +39,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: return soup.get_text(strip=True, separator="\n") # Extract required fields from config - input_files, input_filter = ( + input_files, input_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(input_files) and is_none_or_empty(input_filter): + if is_none_or_empty(input_files) and is_none_or_empty(input_filters): logger.debug("At least one of input-files or input-file-filter is required to be specified") return {} @@ -53,11 +53,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: absolute_plaintext_files, filtered_plaintext_files = set(), set() if input_files: absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files} - if input_filter: + if input_filters: filtered_plaintext_files = { filtered_file - for jsonl_file_filter in input_filter - for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True) + for plaintext_file_filter in input_filters + for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files) @@ -73,12 +74,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: filename_to_content_map = {} for file in all_target_files: - with open(file, "r") as f: + with open(file, "r", encoding="utf8") as f: try: plaintext_content = f.read() if file.endswith(("html", "htm", "xml")): plaintext_content = extract_html_content(plaintext_content) - filename_to_content_map[file] = f.read() + filename_to_content_map[file] = plaintext_content except Exception as e: logger.warning(f"Unable to read file: {file} as plaintext. 
Skipping file.") logger.warning(e, exc_info=True) @@ -88,13 +89,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]: def get_org_files(config: TextContentConfig): # Extract required fields from config - org_files, org_file_filter = ( + org_files, org_file_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): + if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters): logger.debug("At least one of org-files or org-file-filter is required to be specified") return {} @@ -102,11 +103,12 @@ def get_org_files(config: TextContentConfig): absolute_org_files, filtered_org_files = set(), set() if org_files: absolute_org_files = {get_absolute_path(org_file) for org_file in org_files} - if org_file_filter: + if org_file_filters: filtered_org_files = { filtered_file - for org_file_filter in org_file_filter + for org_file_filter in org_file_filters for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_org_files = sorted(absolute_org_files | filtered_org_files) @@ -119,7 +121,7 @@ def get_org_files(config: TextContentConfig): filename_to_content_map = {} for file in all_org_files: - with open(file, "r") as f: + with open(file, "r", encoding="utf8") as f: try: filename_to_content_map[file] = f.read() except Exception as e: @@ -131,26 +133,27 @@ def get_org_files(config: TextContentConfig): def get_markdown_files(config: TextContentConfig): # Extract required fields from config - markdown_files, markdown_file_filter = ( + markdown_files, markdown_file_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): + if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters): logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified") return {} - "Get Markdown files to process" + # Get markdown files to process absolute_markdown_files, filtered_markdown_files = set(), set() if markdown_files: absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} - if markdown_file_filter: + if markdown_file_filters: filtered_markdown_files = { filtered_file - for markdown_file_filter in markdown_file_filter + for markdown_file_filter in markdown_file_filters for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) @@ -168,7 +171,7 @@ def get_markdown_files(config: TextContentConfig): filename_to_content_map = {} for file in all_markdown_files: - with open(file, "r") as f: + with open(file, "r", encoding="utf8") as f: try: filename_to_content_map[file] = f.read() except Exception as e: @@ -180,13 +183,13 @@ def get_markdown_files(config: TextContentConfig): def get_pdf_files(config: TextContentConfig): # Extract required fields from config - pdf_files, pdf_file_filter = ( + pdf_files, pdf_file_filters = ( config.input_files, config.input_filter, ) # Input Validation - if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter): + if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters): logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified") return {} @@ -194,11 +197,12 @@ def get_pdf_files(config: TextContentConfig): absolute_pdf_files, 
filtered_pdf_files = set(), set() if pdf_files: absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files} - if pdf_file_filter: + if pdf_file_filters: filtered_pdf_files = { filtered_file - for pdf_file_filter in pdf_file_filter + for pdf_file_filter in pdf_file_filters for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) + if os.path.isfile(filtered_file) } all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) @@ -214,7 +218,7 @@ def get_pdf_files(config: TextContentConfig): for file in all_pdf_files: with open(file, "rb") as f: try: - filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8") + filename_to_content_map[file] = f.read() except Exception as e: logger.warning(f"Unable to read file: {file} as PDF. Skipping file.") logger.warning(e, exc_info=True) diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index f8977043..9209ff67 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -66,20 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict): return merged_dict -def get_file_type(filepath: str) -> str: - "Get file type from file path" - file_type = Path(filepath).suffix[1:] +def get_file_type(file_type: str) -> tuple[str, str]: + "Get file type from file mime type" - if file_type in ["md", "markdown"]: - return "markdown" - elif file_type in ["org", "orgmode"]: - return "org" - elif file_type in ["txt", "text", "html", "xml", "htm", "rst"]: - return "plaintext" - elif file_type in ["pdf"]: - return "pdf" - - return file_type + encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None + file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type + if file_type in ["text/markdown"]: + return "markdown", encoding + elif file_type in ["text/org"]: + return "org", encoding + elif file_type in ["application/pdf"]: + return "pdf", encoding + elif file_type in ["image/jpeg"]: + return "jpeg", encoding + elif file_type in ["image/png"]: + return "png", encoding + elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]: + return "plaintext", encoding + else: + return "other", encoding def load_model( diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index 0a916db4..f7c42266 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -91,10 +91,17 @@ class OpenAIProcessorConfig(ConfigBase): chat_model: Optional[str] = "gpt-3.5-turbo" +class OfflineChatProcessorConfig(ConfigBase): + enable_offline_chat: Optional[bool] = False + chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin" + + class ConversationProcessorConfig(ConfigBase): conversation_logfile: Path openai: Optional[OpenAIProcessorConfig] - enable_offline_chat: Optional[bool] = False + offline_chat: Optional[OfflineChatProcessorConfig] + max_prompt_size: Optional[int] + tokenizer: Optional[str] class ProcessorConfig(ConfigBase): diff --git a/tests/conftest.py b/tests/conftest.py index 7c1878a2..4f7dfb10 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path from khoj.utils.rawconfig import ( ContentConfig, ConversationProcessorConfig, + OfflineChatProcessorConfig, OpenAIProcessorConfig, ProcessorConfig, TextContentConfig, @@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory): # Setup conversation processor processor_config = ProcessorConfig() + offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True) 
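+    # Wrap the enable flag in the new OfflineChatProcessorConfig; it is passed to the conversation config below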
     processor_config.conversation = ConversationProcessorConfig(
-        enable_offline_chat=True,
+        offline_chat=offline_chat,
         conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
     )
diff --git a/tests/test_client.py b/tests/test_client.py
index 784c765c..a5f14882 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -6,6 +6,7 @@ from urllib.parse import quote

 # External Packages
 from fastapi.testclient import TestClient
+import pytest

 # Internal Packages
 from app.main import app
@@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):


 # ----------------------------------------------------------------------------------------------------
-def test_index_batch(client):
+def test_index_update(client):
     # Arrange
-    request_body = get_sample_files_data()
+    files = get_sample_files_data()
     headers = {"x-api-key": "secret"}

     # Act
-    response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
+    response = client.post("/api/v1/index/update", files=files, headers=headers)

     # Assert
     assert response.status_code == 200
@@ -76,12 +77,11 @@ def test_regenerate_with_valid_content_type(client):
     for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
         # Arrange
-        request_body = get_sample_files_data()
-
+        files = get_sample_files_data()
         headers = {"x-api-key": "secret"}

         # Act
-        response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
+        response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)

         # Assert
         assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
@@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
     response = client.get(f"/api/update?force=true&t=github")

     # Arrange
-    request_body = get_sample_files_data()
-
+    files = get_sample_files_data()
     headers = {"x-api-key": "secret"}

     # Act
-    response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
+    response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)

     # Assert
     assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"


 # ----------------------------------------------------------------------------------------------------
+@pytest.mark.skip(reason="Flaky test on parallel test runs")
 def test_get_configured_types_via_api(client):
     # Act
     response = client.get(f"/api/config/types")
@@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(

 def get_sample_files_data():
-    return {
-        "org": {
-            "path/to/filename.org": "* practicing piano",
-            "path/to/filename1.org": "** top 3 reasons why I moved to SF",
-            "path/to/filename2.org": "* how to build a search engine",
-        },
-        "pdf": {
-            "path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
-            "path/to/filename1.pdf": "The sun is a ball of helium",
-            "path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
-        },
-        "plaintext": {
-            "path/to/filename.txt": "data,column,value",
-            "path/to/filename1.txt": "my first web page",
-            "path/to/filename2.txt": "2021-02-02 Journal Entry",
-        },
-        "markdown": {
-            "path/to/filename.md": "# Notes from client call",
-            "path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
-            "path/to/filename2.md": "**Understanding science through the lens of art**",
-        },
-    }
+    # Use a list of (field, file) tuples, not a dict, so the repeated "files" fields all survive in the multipart request
+    return [
+        ("files", ("path/to/filename.org", "* practicing piano", "text/org")),
+        ("files", ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org")),
+        ("files", ("path/to/filename2.org", "* how to build a search engine", "text/org")),
+        ("files", ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf")),
+        ("files", ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf")),
+        ("files", ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf")),
+        ("files", ("path/to/filename.txt", "data,column,value", "text/plain")),
+        ("files", ("path/to/filename1.txt", "my first web page", "text/plain")),
+        ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
+        ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
+        ("files", ("path/to/filename1.md", "## Studying anthropological records from the Fatimid caliphate", "text/markdown")),
+        ("files", ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown")),
+    ]
diff --git a/tests/test_gpt4all_chat_actors.py b/tests/test_gpt4all_chat_actors.py
index d7904ff8..76ed26e7 100644
--- a/tests/test_gpt4all_chat_actors.py
+++ b/tests/test_gpt4all_chat_actors.py
@@ -24,7 +24,7 @@
 from khoj.processor.conversation.gpt4all.utils import download_model
 from khoj.processor.conversation.utils import message_to_log

-MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
+MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"


 @pytest.fixture(scope="session")
@@ -128,15 +128,15 @@
 @pytest.mark.chatquality
 def test_extract_multiple_implicit_questions_from_message(loaded_model):
     # Act
-    response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model)
+    response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)

     # Assert
-    expected_responses = ["height", "taller", "shorter", "heights"]
+    expected_responses = ["height", "taller", "shorter", "heights", "who"]
     assert len(response) <= 3
     for question in response:
         assert any([expected_response in question.lower() for expected_response in expected_responses]), (
-            "Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question
+            "Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
         )


@@ -145,7 +145,7 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
     # Arrange
     message_list = [
-        ("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
+        ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
     ]

     # Act
@@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
         use_history=True,
     )

-    expected_responses = [
-        "Vader",
-        "sons",
+    all_expected_in_response = [
+        "Anderson",
+    ]
+
+    any_expected_in_response = [
         "son",
-        "Darth",
+        "sons",
         "children",
     ]

     # Assert
     assert len(response) >= 1
-    assert any([expected_response in response[0] for expected_response in expected_responses]), (
+    assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
+        "Expected chat actor to generate a search query mentioning Anderson, but got: " + response[0]
+    )
+    assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
         "Expected chat actor to ask for clarification in response, but got: " + response[0]
     )

@@ -176,20 +181,20 @@
 def test_generate_search_query_using_answer_from_chat_history(loaded_model):
     # Arrange
     message_list = [
-        ("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
+        ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
     ]

     # Act
     response = extract_questions_offline(
-        "Is she a Jedi?",
+        "Is she a Doctor?",
         conversation_log=populate_chat_history(message_list),
         loaded_model=loaded_model,
         use_history=True,
     )

     expected_responses = [
-        "Leia",
-        "Vader",
+        "Barbara",
+        "Robert",
         "daughter",
     ]
diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_jsonl.py
index bacce37c..b9b26986 100644
--- a/tests/test_pdf_to_jsonl.py
+++ b/tests/test_pdf_to_jsonl.py
@@ -1,7 +1,6 @@
 # Standard Packages
 import json
 import os
-import base64

 # Internal Packages
 from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
@@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
     # Extract Entries from specified Pdf files
     # Read singlepage.pdf into memory as bytes
     with open("tests/data/pdf/singlepage.pdf", "rb") as f:
-        pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
+        pdf_bytes = f.read()

     data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
     entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
@@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
     # Act
     # Extract Entries from specified Pdf files
     with open("tests/data/pdf/multipage.pdf", "rb") as f:
-        pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
+        pdf_bytes = f.read()

     data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
     entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index b1a9aa4d..179718fa 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -1,26 +1,25 @@
 # System Packages
 import logging
+import locale
 from pathlib import Path
 import os

 # External Packages
 import pytest
-from khoj.utils.config import SearchModels

 # Internal Packages
 from khoj.utils.state import content_index, search_models
 from khoj.search_type import text_search
-from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.github.github_to_jsonl import GithubToJsonl
+from khoj.utils.config import SearchModels
 from khoj.utils.fs_syncer import get_org_files
+from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig


 # Test
 # ----------------------------------------------------------------------------------------------------
-def
test_text_search_setup_with_missing_file_raises_error( - org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig -): +def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig): # Arrange # Ensure file mentioned in org.input-files is missing single_new_file = Path(org_config_with_only_new_file.input_files[0]) @@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error( # Act # Generate notes embeddings during asymmetric setup with pytest.raises(FileNotFoundError): - data = get_org_files(org_config_with_only_new_file) + get_org_files(org_config_with_only_new_file) + + +# ---------------------------------------------------------------------------------------------------- +def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path): + # Arrange + orgfile = tmp_path / "directory.org" / "file.org" + orgfile.parent.mkdir() + with open(orgfile, "w") as f: + f.write("* Heading\n- List item\n") + org_content_config = TextContentConfig( + input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt" + ) + + # Act + # should not raise IsADirectoryError and return orgfile + assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"} # ---------------------------------------------------------------------------------------------------- @@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error( def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels): # Arrange data = get_org_files(content_config.org) + # Act # Regenerate notes embeddings during asymmetric setup notes_model = text_search.setup( diff --git a/versions.json b/versions.json index cf60cf10..9cc1eb5c 100644 --- a/versions.json +++ b/versions.json @@ -24,5 +24,6 @@ "0.12.0": "0.15.0", "0.12.1": "0.15.0", "0.12.2": "0.15.0", - "0.12.3": "0.15.0" + "0.12.3": "0.15.0", + "0.13.0": "0.15.0" }