Resolve merge conflicts

sabaimran 2023-10-19 14:39:05 -07:00
commit 963cd165eb
42 changed files with 941 additions and 590 deletions

View file

@@ -7,18 +7,21 @@
### Setup
#### Offline Chat
Offline chat works without internet but it is slower, lower quality and more compute intensive.
Offline chat stays completely private and works without internet, but it is slower, lower quality, and more compute intensive.
!> **Warning**: This will download a 3GB+ Llama v2 chat model, which can take some time
> **System Requirements**:
> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
> - A Mac M1+ or [Vulkan-supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times
- Open your [Khoj settings](http://localhost:42110/config/), click *Enable* on the Offline Chat card
- Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card
![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4')
#### Online Chat
Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive.
!> **Warning**: This will enable Khoj to send your chat queries and notes to OpenAI for processing
!> **Warning**: This will enable Khoj to send your chat queries and query-relevant notes to OpenAI for processing
1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)
2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
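For readers scripting this setup, both chat settings pages drive plain HTTP endpoints on the local Khoj server. Below is a minimal sketch, assuming a server at http://localhost:42110 and the `/api/config/data` and `offline_chat` endpoints that appear later in this diff; payload shapes may vary by version:

```python
import requests

KHOJ_URL = "http://localhost:42110"

# Inspect the server's current and default configuration
current_config = requests.get(f"{KHOJ_URL}/api/config/data").json()
default_config = requests.get(f"{KHOJ_URL}/api/config/data/default").json()

# Toggle offline chat, mirroring what the Enable button on the settings page does
requests.post(
    f"{KHOJ_URL}/api/config/data/processor/conversation/offline_chat",
    params={"enable_offline_chat": "true"},
    headers={"Content-Type": "application/json"},
)
```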

View file

@@ -46,7 +46,7 @@ Indexes your org-agenda files, by default.
(use-package khoj
:ensure t
:pin melpa-stable
:bind ("C-c s" . 'khoj)
:bind ("C-c s" . 'khoj))
```
- Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj

View file

@@ -1,7 +1,7 @@
{
"id": "khoj",
"name": "Khoj",
"version": "0.12.3",
"version": "0.13.0",
"minAppVersion": "0.15.0",
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
"author": "Khoj Inc.",

View file

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "khoj-assistant"
description = "An AI personal assistant for your Digital Brain"
description = "An AI copilot for your Second Brain"
readme = "README.md"
license = "GPL-3.0-or-later"
requires-python = ">=3.8"
@@ -40,8 +40,9 @@ dependencies = [
"dateparser >= 1.1.1",
"defusedxml == 0.7.1",
"fastapi == 0.77.1",
"python-multipart >= 0.0.5",
"jinja2 == 3.1.2",
"openai >= 0.27.0",
"openai >= 0.27.0, < 1.0.0",
"tiktoken >= 0.3.2",
"tenacity >= 8.2.2",
"pillow == 9.3.0",
@@ -83,6 +84,7 @@ test = [
"freezegun >= 1.2.0",
"factory-boy >= 3.2.1",
"trio >= 0.22.0",
"pytest-xdist",
]
dev = [
"khoj-assistant[test]",

View file

@@ -9,6 +9,10 @@ do
# Get current project version
current_version=$OPTARG
# Bump Desktop app to current version
cd $project_root/src/interface/desktop
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
# Bump Obsidian plugin to current version
cd $project_root/src/interface/obsidian
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json

View file

@@ -14,10 +14,11 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th
# External Packages
import uvicorn
import django
import schedule
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import schedule
import django
from fastapi.staticfiles import StaticFiles
from rich.logging import RichHandler
from django.core.asgi import get_asgi_application
@@ -41,6 +42,15 @@ app = FastAPI()
# Get Django Application
django_app = get_asgi_application()
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["app://obsidian.md", "http://localhost:*", "https://app.khoj.dev/*", "app://khoj.dev"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Set Locale
locale.setlocale(locale.LC_ALL, "")

View file

@@ -8,7 +8,6 @@ const {dialog} = require('electron');
const cron = require('cron').CronJob;
const axios = require('axios');
const { Readable } = require('stream');
const KHOJ_URL = 'http://127.0.0.1:42110'
@@ -65,7 +64,7 @@ const schema = {
var state = {}
const store = new Store({schema});
const store = new Store({ schema });
console.log(store);
@@ -86,57 +85,65 @@ function handleSetTitle (event, title) {
});
}
function filenameToMimeType (filename) {
const extension = filename.split('.').pop();
switch (extension) {
case 'pdf':
return 'application/pdf';
case 'png':
return 'image/png';
case 'jpg':
case 'jpeg':
return 'image/jpeg';
case 'md':
case 'markdown':
return 'text/markdown';
case 'org':
return 'text/org';
default:
return 'text/plain';
}
}
function pushDataToKhoj (regenerate = false) {
let filesToPush = [];
const files = store.get('files');
const folders = store.get('folders');
state = {
completed: true
const files = store.get('files') || [];
const folders = store.get('folders') || [];
state = { completed: true }
// Collect paths of all configured files to index
for (const file of files) {
filesToPush.push(file.path);
}
if (files) {
for (file of files) {
filesToPush.push(file.path);
}
}
if (folders) {
for (folder of folders) {
const files = fs.readdirSync(folder.path, { withFileTypes: true });
for (file of files) {
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
filesToPush.push(path.join(folder.path, file.name));
}
// Collect paths of all indexable files in configured folders
for (const folder of folders) {
const files = fs.readdirSync(folder.path, { withFileTypes: true });
for (const file of files) {
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
filesToPush.push(path.join(folder.path, file.name));
}
}
}
let data = {
files: []
}
const lastSync = store.get('lastSync') || [];
for (file of filesToPush) {
const formData = new FormData();
for (const file of filesToPush) {
const stats = fs.statSync(file);
if (!regenerate) {
// Only push files that have been modified since last sync
if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) {
continue;
}
}
// Collect all updated or newly created files since last sync to index on Khoj server
try {
let rawData;
// If the file is a PDF or IMG file, read it as a binary file
if (binaryFileTypes.includes(file.split('.').pop())) {
rawData = fs.readFileSync(file).toString('base64');
} else {
rawData = fs.readFileSync(file, 'utf8');
}
data.files.push({
path: file,
content: rawData
});
let encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8";
let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : "");
let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding);
let fileObj = new Blob([fileContent], { type: mimeType });
formData.append('files', fileObj, file);
state[file] = {
success: true,
}
@@ -149,46 +156,46 @@ function pushDataToKhoj (regenerate = false) {
}
}
// Mark deleted files for removal from index on Khoj server
for (const syncedFile of lastSync) {
if (!filesToPush.includes(syncedFile.path)) {
data.files.push({
path: syncedFile.path,
content: ""
});
fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) });
formData.append('files', fileObj, syncedFile.path);
}
}
const headers = { 'x-api-key': 'secret', 'Content-Type': 'application/json' };
const stream = new Readable({
read() {
this.push(JSON.stringify(data));
this.push(null);
}
});
const hostURL = store.get('hostURL') || KHOJ_URL;
axios.post(`${hostURL}/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers })
.then(response => {
console.log(response.data);
const win = BrowserWindow.getAllWindows()[0];
win.webContents.send('update-state', state);
let lastSync = [];
for (const file of filesToPush) {
lastSync.push({
path: file,
datetime: new Date().toISOString()
});
}
store.set('lastSync', lastSync);
})
.catch(error => {
console.error(error);
state['completed'] = false
const win = BrowserWindow.getAllWindows()[0];
win.webContents.send('update-state', state);
});
// Send collected files to Khoj server for indexing
if (!!formData?.entries()?.next().value) {
const hostURL = store.get('hostURL') || KHOJ_URL;
const headers = {
'x-api-key': 'secret'
};
axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers })
.then(response => {
console.log(response.data);
let lastSync = [];
for (const file of filesToPush) {
lastSync.push({
path: file,
datetime: new Date().toISOString()
});
}
store.set('lastSync', lastSync);
})
.catch(error => {
console.error(error);
state['completed'] = false
})
.finally(() => {
// Syncing complete
const win = BrowserWindow.getAllWindows()[0];
if (win) win.webContents.send('update-state', state);
});
} else {
// Syncing complete
const win = BrowserWindow.getAllWindows()[0];
if (win) win.webContents.send('update-state', state);
}
}
pushDataToKhoj();
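The desktop client above now syncs by POSTing a multipart form to the server's new indexer API instead of streaming a JSON body. A minimal sketch of the same request in Python, assuming a local server and the development API key `secret` used throughout this diff; the file paths and contents are illustrative:

```python
import requests

KHOJ_URL = "http://127.0.0.1:42110"

# Each multipart entry pairs a file path (sent as the filename) with its
# content and mime type; an empty body marks that file for deletion from the index.
files = [
    ("files", ("/home/user/notes/todo.org", b"* TODO Ship release", "text/org")),
    ("files", ("/home/user/notes/stale.org", b"", "text/org")),  # deleted since last sync
]

response = requests.post(
    f"{KHOJ_URL}/api/v1/index/update",
    params={"force": "false", "client": "desktop"},
    headers={"x-api-key": "secret"},
    files=files,
)
response.raise_for_status()
```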

View file

@@ -1,13 +1,13 @@
{
"name": "Khoj",
"homepage": ".",
"productName": "Khoj",
"version": "1.0.2",
"description": "Scaffolding for the desktop entrypoint to Khoj",
"main": "main.js",
"version": "0.13.0",
"description": "An AI copilot for your Second Brain",
"author": "Saba Imran, Debanjum Singh Solanky <team@khoj.dev>",
"license": "GPL-3.0-or-later",
"homepage": "https://khoj.dev",
"repository": "\"https://github.com/khoj-ai/khoj\"",
"author": "Khoj <team@khoj.dev>",
"license": "MIT",
"productName": "Khoj",
"main": "main.js",
"private": false,
"devDependencies": {
"electron": "25.8.1"

View file

@@ -1,11 +1,12 @@
;;; khoj.el --- AI personal assistant for your digital brain -*- lexical-binding: t -*-
;;; khoj.el --- AI copilot for your Second Brain -*- lexical-binding: t -*-
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
;; Copyright (C) 2021-2023 Khoj Inc.
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Description: An AI personal assistant for your digital brain
;; Author: Debanjum Singh Solanky <debanjum@khoj.dev>
;; Saba Imran <saba@khoj.dev>
;; Description: An AI copilot for your Second Brain
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
;; Version: 0.12.3
;; Version: 0.13.0
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
@@ -28,8 +29,8 @@
;;; Commentary:
;; Create an AI personal assistant for your `org-mode', `markdown' notes,
;; PDFs and images. The assistant exposes 2 modes, search and chat:
;; Create an AI copilot for your `org-mode', `markdown' notes,
;; PDFs and images. The copilot exposes 2 modes, search and chat:
;;
;; Chat provides faster answers, iterative discovery and assisted
;; creativity. It requires your OpenAI API key to access GPT models
@@ -87,6 +88,21 @@
:group 'khoj
:type 'integer)
(defcustom khoj-search-on-idle-time 0.3
"Idle time (in seconds) to wait before triggering search."
:group 'khoj
:type 'number)
(defcustom khoj-server-api-key "secret"
"API Key to Khoj server."
:group 'khoj
:type 'string)
(defcustom khoj-index-interval 3600
"Interval (in seconds) to wait before updating content index."
:group 'khoj
:type 'number)
(defcustom khoj-default-content-type "org"
"The default content type to perform search on."
:group 'khoj
@@ -115,6 +131,15 @@
(defvar khoj--content-type "org"
"The type of content to perform search on.")
(defvar khoj--search-on-idle-timer nil
"Idle timer to trigger incremental search.")
(defvar khoj--index-timer nil
"Timer to trigger content indexing.")
(defvar khoj--indexed-files '()
"Files that were indexed in previous content indexing run.")
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
(declare-function org-element-type "org-mode" (ELEMENT))
(declare-function markdown-mode "markdown-mode" ())
@@ -236,6 +261,11 @@ for example), set this to the full interpreter path."
:type 'boolean
:group 'khoj)
(defcustom khoj-offline-chat-model nil
"Specify chat model to use for offline chat with khoj."
:type 'string
:group 'khoj)
(defcustom khoj-auto-setup t
"Automate install, configure and start of khoj server.
Auto invokes setup steps on calling main entrypoint."
@@ -365,9 +395,9 @@ CONFIG is json obtained from Khoj config API."
(string-join "/"))))
(defun khoj--server-configure ()
"Configure the the Khoj server for search and chat."
"Configure the Khoj server for search and chat."
(interactive)
(let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null))
(let* ((url-request-method "GET")
(current-config
(with-temp-buffer
(url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
@@ -376,56 +406,12 @@ CONFIG is json obtained from Khoj config API."
(with-temp-buffer
(url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url))
(ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
(default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file)))
(default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
(chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config))))))
(default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config))))
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
(offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
(config (or current-config default-config)))
;; Configure content types
(cond
;; If khoj backend is not configured yet
((not current-config)
(message "khoj.el: Server not configured yet.")
(setq config (delq (assoc 'content-type config) config))
(cl-pushnew `(content-type . ((org . ((input-files . ,khoj-org-files)
(input-filter . ,org-directory-regexes)
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
(index-heading-entries . ,json-false)))))
config))
;; Else if khoj config has no org content config
((not (alist-get 'org (alist-get 'content-type config)))
(message "khoj.el: Org-mode content on server not configured yet.")
(let ((new-content-type (alist-get 'content-type config)))
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
(input-filter . ,org-directory-regexes)
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
(index-heading-entries . ,json-false)))
new-content-type)
(setq config (delq (assoc 'content-type config) config))
(cl-pushnew `(content-type . ,new-content-type) config)))
;; Else if khoj is not configured to index specified org files
((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files)
(equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes)))
(message "khoj.el: Org-mode content on server is stale.")
(let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file)))
(new-content-type (alist-get 'content-type config)))
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
(input-filter . ,org-directory-regexes)
(compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory))
(embeddings-file . ,(format "%s/org.pt" index-directory))
(index-heading-entries . ,json-false)))
new-content-type)
(setq config (delq (assoc 'content-type config) config))
(cl-pushnew `(content-type . ,new-content-type) config))))
;; Configure processors
(cond
((not khoj-openai-api-key)
@@ -441,10 +427,11 @@ CONFIG is json obtained from Khoj config API."
;; If khoj backend isn't configured yet
((not current-config)
(message "khoj.el: Chat not configured yet.")
(message "khoj.el: Khoj not configured yet.")
(setq config (delq (assoc 'processor config) config))
(cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
(enable-offline-chat . ,enable-offline-chat)
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
(chat-model . ,offline-chat-model)))
(openai . ((chat-model . ,chat-model)
(api-key . ,khoj-openai-api-key)))))))
config))
@@ -455,7 +442,8 @@ CONFIG is json obtained from Khoj config API."
(let ((new-processor-type (alist-get 'processor config)))
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
(enable-offline-chat . ,enable-offline-chat)
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
(chat-model . ,offline-chat-model)))
(openai . ((chat-model . ,chat-model)
(api-key . ,khoj-openai-api-key)))))
new-processor-type)
@@ -465,13 +453,15 @@ CONFIG is json obtained from Khoj config API."
;; Else if chat configuration in khoj backend has gone stale
((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key)
(equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model)
(equal (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor config))) enable-offline-chat)))
(equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat)
(equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model)))
(message "khoj.el: Chat configuration has gone stale.")
(let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
(new-processor-type (alist-get 'processor config)))
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
(enable-offline-chat . ,enable-offline-chat)
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
(chat-model . ,offline-chat-model)))
(openai . ((chat-model . ,khoj-chat-model)
(api-key . ,khoj-openai-api-key)))))
new-processor-type)
@@ -509,9 +499,75 @@ CONFIG is json obtained from Khoj config API."
(khoj--server-configure))))
;; -----------------------------------------------
;; Extract and Render Entries of each Content Type
;; -----------------------------------------------
;; -------------------
;; Khoj Index Content
;; -------------------
(defun khoj--server-index-files (&optional force content-type file-paths)
"Send files at `FILE-PATHS' to the Khoj server to index for search and chat.
`FORCE' re-indexes all files of `CONTENT-TYPE' even if they are already indexed."
(interactive)
(let ((boundary (format "-------------------------%d" (random (expt 10 10))))
(files-to-index (or file-paths
(append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))
(type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type)))
(inhibit-message t)
(message-log-max nil))
(let ((url-request-method "POST")
(url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary))
(url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary))
("x-api-key" . ,khoj-server-api-key))))
(with-current-buffer
(url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (or force "false"))
;; render response from indexing API endpoint on server
(lambda (status)
(if (not status)
(message "khoj.el: %scontent index %supdated" (if content-type (format "%s " content-type) "") (if force "force " ""))
(with-current-buffer (current-buffer)
(goto-char "\n\n")
(message "khoj.el: Failed to %supdate %s content index. Status: %s. Response: %s"
(if force "force " "")
content-type
status
(string-trim (buffer-substring-no-properties (point) (point-max)))))))
nil t t)))
(setq khoj--indexed-files files-to-index)))
(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary)
"Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body.
Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request."
(with-temp-buffer
(set-buffer-multibyte nil)
(insert "\n")
(dolist (file-to-index files-to-index)
(insert (format "--%s\r\n" boundary))
(insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index))
(insert "Content-Type: text/org\r\n\r\n")
(insert (with-temp-buffer
(insert-file-contents-literally file-to-index)
(buffer-string)))
(insert "\r\n"))
(dolist (file-to-index previously-indexed-files)
(when (not (member file-to-index files-to-index))
(insert (format "--%s\r\n" boundary))
(insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" file-to-index))
(insert "Content-Type: text/org\r\n\r\n")
(insert "")
(insert "\r\n")))
(insert (format "--%s--\r\n" boundary))
(buffer-string)))
;; Cancel any running indexing timer, first
(when khoj--index-timer
(cancel-timer khoj--index-timer))
;; Send files to index on server every `khoj-index-interval' seconds
(setq khoj--index-timer
(run-with-timer 60 khoj-index-interval 'khoj--server-index-files))
;; -------------------------------------------
;; Render Response from Khoj server for Emacs
;; -------------------------------------------
(defun khoj--extract-entries-as-markdown (json-response query)
"Convert JSON-RESPONSE, QUERY from API to markdown entries."
@@ -920,6 +976,9 @@ RECEIVE-DATE is the message receive date."
(message "khoj.el: Teardown Incremental Search")
;; unset khoj minibuffer window
(setq khoj--minibuffer-window nil)
(when (and khoj--search-on-idle-timer
(timerp khoj--search-on-idle-timer))
(cancel-timer khoj--search-on-idle-timer))
;; delete open connections to khoj server
(khoj--delete-open-network-connections-to-server)
;; remove hooks for khoj incremental query and self
@@ -942,8 +1001,10 @@ RECEIVE-DATE is the message receive date."
;; set current (mini-)buffer entered as khoj minibuffer
;; used to query khoj API only when user in khoj minibuffer
(setq khoj--minibuffer-window (current-buffer))
(add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit
; do khoj incremental search after idle time
(setq khoj--search-on-idle-timer (run-with-idle-timer khoj-search-on-idle-time t #'khoj--incremental-search))
; teardown khoj incremental search on minibuffer exit
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search))
(read-string khoj--query-prompt))))
@@ -1014,17 +1075,20 @@ Paragraph only starts at first text after blank line."
;; Khoj Menu
;; ---------
(transient-define-argument khoj--content-type-switch ()
:class 'transient-switches
:argument-format "--content-type=%s"
:argument-regexp ".+"
;; set content type to: last used > based on current buffer > default type
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
;; dynamically set choices to content types enabled on khoj backend
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
(defun khoj--setup-and-show-menu ()
"Create Transient menu for khoj and show it."
;; Create the Khoj Transient menu
(transient-define-argument khoj--content-type-switch ()
:class 'transient-switches
:argument-format "--content-type=%s"
:argument-regexp ".+"
;; set content type to: last used > based on current buffer > default type
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
;; dynamically set choices to content types enabled on khoj backend
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
(transient-define-suffix khoj--search-command (&optional args)
(interactive (list (transient-args transient-current-command)))
(transient-define-suffix khoj--search-command (&optional args)
(interactive (list (transient-args transient-current-command)))
(progn
;; set content type to: specified > last used > based on current buffer > default type
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
@@ -1033,9 +1097,9 @@ Paragraph only starts at first text after blank line."
;; trigger incremental search
(call-interactively #'khoj-incremental)))
(transient-define-suffix khoj--find-similar-command (&optional args)
"Find items similar to current item at point."
(interactive (list (transient-args transient-current-command)))
(transient-define-suffix khoj--find-similar-command (&optional args)
"Find items similar to current item at point."
(interactive (list (transient-args transient-current-command)))
(progn
;; set content type to: specified > last used > based on current buffer > default type
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
@@ -1043,37 +1107,38 @@ Paragraph only starts at first text after blank line."
(setq khoj-results-count (or (transient-arg-value "--results-count=" args) khoj-results-count))
(khoj--find-similar khoj--content-type)))
(transient-define-suffix khoj--update-command (&optional args)
"Call khoj API to update index of specified content type."
(interactive (list (transient-args transient-current-command)))
(let* ((force-update (if (member "--force-update" args) "true" "false"))
;; set content type to: specified > last used > based on current buffer > default type
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
(type-query (if (equal content-type "all") "" (format "t=%s" content-type)))
(update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update))
(url-request-method "GET"))
(progn
(setq khoj--content-type content-type)
(url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " "")))))))
(transient-define-suffix khoj--update-command (&optional args)
"Call khoj API to update index of specified content type."
(interactive (list (transient-args transient-current-command)))
(let* ((force-update (if (member "--force-update" args) "true" "false"))
;; set content type to: specified > last used > based on current buffer > default type
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
(url-request-method "GET"))
(progn
(setq khoj--content-type content-type)
(khoj--server-index-files force-update content-type))))
(transient-define-suffix khoj--chat-command (&optional _)
"Command to Chat with Khoj."
(interactive (list (transient-args transient-current-command)))
(khoj--chat))
(transient-define-suffix khoj--chat-command (&optional _)
"Command to Chat with Khoj."
(interactive (list (transient-args transient-current-command)))
(khoj--chat))
(transient-define-prefix khoj--menu ()
"Create Khoj Menu to Configure and Execute Commands."
[["Configure Search"
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
("t" "Content Type" khoj--content-type-switch)]
["Configure Update"
("-f" "Force Update" "--force-update")]]
[["Act"
("c" "Chat" khoj--chat-command)
("s" "Search" khoj--search-command)
("f" "Find Similar" khoj--find-similar-command)
("u" "Update" khoj--update-command)
("q" "Quit" transient-quit-one)]])
(transient-define-prefix khoj--menu ()
"Create Khoj Menu to Configure and Execute Commands."
[["Configure Search"
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
("t" "Content Type" khoj--content-type-switch)]
["Configure Update"
("-f" "Force Update" "--force-update")]]
[["Act"
("c" "Chat" khoj--chat-command)
("s" "Search" khoj--search-command)
("f" "Find Similar" khoj--find-similar-command)
("u" "Update" khoj--update-command)
("q" "Quit" transient-quit-one)]])
;; Show the Khoj Transient menu
(khoj--menu))
;; ----------
@@ -1086,7 +1151,7 @@ Paragraph only starts at first text after blank line."
(interactive)
(when khoj-auto-setup
(khoj-setup t))
(khoj--menu))
(khoj--setup-and-show-menu))
(provide 'khoj)

View file

@@ -206,6 +206,64 @@ Rule everything\n")
"Rule everything"))
))
;; -------------------------------------
;; Test Helpers to Index Content
;; -------------------------------------
(ert-deftest khoj-tests--render-files-to-add-request-body ()
"Test files are formatted into a multi-part http request body"
(let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
(act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
(unwind-protect
(progn
(should
(equal
(khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj")
(format
"\n--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
# Become God\n\
## Upgrade\n\n\
Penance to Immortality\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
## Act\n\n\
Rule everything\n\n\r\n\
--khoj--\r\n" upgrade-file act-file))))
(delete-file upgrade-file)
(delete-file act-file))))
(ert-deftest khoj-tests--render-files-to-add-delete-in-request-body ()
"Test files are formatted into a multi-part http request body"
(let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
(act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
(unwind-protect
(progn
(should
(equal
(khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj")
(format
"\n--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
# Become God\n\
## Upgrade\n\n\
Penance to Immortality\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
## Act\n\n\
Rule everything\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
\r
--khoj--\r\n" upgrade-file act-file "/tmp/deleted-file.org"))))
(delete-file upgrade-file)
(delete-file act-file))))
(provide 'khoj-tests)

View file

@@ -1,7 +1,7 @@
{
"id": "khoj",
"name": "Khoj",
"version": "0.12.3",
"version": "0.13.0",
"minAppVersion": "0.15.0",
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
"author": "Khoj Inc.",

View file

@@ -1,7 +1,9 @@
{
"name": "Khoj",
"version": "0.12.3",
"description": "An AI Personal Assistant for your Digital Brain",
"version": "0.13.0",
"description": "An AI copilot for your Second Brain",
"author": "Debanjum Singh Solanky, Saba Imran <team@khoj.dev>",
"license": "GPL-3.0-or-later",
"main": "src/main.js",
"scripts": {
"dev": "node esbuild.config.mjs",
@@ -14,8 +16,6 @@
"AI",
"assistant"
],
"author": "Debanjum Singh Solanky",
"license": "GPL-3.0-or-later",
"devDependencies": {
"@types/node": "^16.11.6",
"@typescript-eslint/eslint-plugin": "5.29.0",

View file

@@ -1,12 +1,13 @@
import { Notice, Plugin } from 'obsidian';
import { Notice, Plugin, TFile } from 'obsidian';
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
import { KhojSearchModal } from 'src/search_modal'
import { KhojChatModal } from 'src/chat_modal'
import { configureKhojBackend } from './utils';
import { configureKhojBackend, updateContentIndex } from './utils';
export default class Khoj extends Plugin {
settings: KhojSetting;
indexingTimer: NodeJS.Timeout;
async onload() {
await this.loadSettings();
@@ -54,6 +55,15 @@ export default class Khoj extends Plugin {
// Add a settings tab so the user can configure khoj
this.addSettingTab(new KhojSettingTab(this.app, this));
// Add scheduled job to update index every 60 minutes
this.indexingTimer = setInterval(async () => {
if (this.settings.autoConfigure) {
this.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.settings, this.settings.lastSyncedFiles
);
}
}, 60 * 60 * 1000);
}
async loadSettings() {
@@ -72,4 +82,12 @@
}
this.saveData(this.settings);
}
async onunload() {
// Remove scheduled job to update index at regular cadence
if (this.indexingTimer)
clearInterval(this.indexingTimer);
this.unload();
}
}

View file

@@ -1,5 +1,6 @@
import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian';
import { App, Notice, PluginSettingTab, Setting, TFile } from 'obsidian';
import Khoj from 'src/main';
import { updateContentIndex } from './utils';
export interface KhojSetting {
enableOfflineChat: boolean;
@@ -8,6 +9,7 @@ export interface KhojSetting {
khojUrl: string;
connectedToBackend: boolean;
autoConfigure: boolean;
lastSyncedFiles: TFile[];
}
export const DEFAULT_SETTINGS: KhojSetting = {
@@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
connectedToBackend: false,
autoConfigure: true,
openaiApiKey: '',
lastSyncedFiles: []
}
export class KhojSettingTab extends PluginSettingTab {
@@ -118,8 +121,9 @@
}, 300);
this.plugin.registerInterval(progress_indicator);
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`);
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`);
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
);
new Notice('✅ Updated Khoj index.');
// Reset button once index is updated

View file

@@ -1,4 +1,4 @@
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian';
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian';
import { KhojSetting } from 'src/settings'
export function getVaultAbsolutePath(vault: Vault): string {
@@ -14,18 +14,85 @@ type OpenAIType = null | {
"api-key": string;
};
type OfflineChatType = null | {
"chat-model": string;
"enable-offline-chat": boolean;
};
interface ProcessorData {
conversation: {
"conversation-logfile": string;
openai: OpenAIType;
"enable-offline-chat": boolean;
"offline-chat": OfflineChatType;
"tokenizer": null | string;
"max-prompt-size": null | number;
};
}
function fileExtensionToMimeType (extension: string): string {
switch (extension) {
case 'pdf':
return 'application/pdf';
case 'png':
return 'image/png';
case 'jpg':
case 'jpeg':
return 'image/jpeg';
case 'md':
case 'markdown':
return 'text/markdown';
case 'org':
return 'text/org';
default:
return 'text/plain';
}
}
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
// Get all markdown, pdf files in the vault
console.log(`Khoj: Updating Khoj content index...`)
const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
let countOfFilesToIndex = 0;
let countOfFilesToDelete = 0;
// Add all files to index as multipart form data
const formData = new FormData();
for (const file of files) {
countOfFilesToIndex++;
const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
const fileContent = encoding == 'binary' ? await vault.readBinary(file) : await vault.read(file);
formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
}
// Add any previously synced files to be deleted to multipart form data
for (const lastSyncedFile of lastSyncedFiles) {
if (!files.includes(lastSyncedFile)) {
countOfFilesToDelete++;
formData.append('files', new Blob([]), lastSyncedFile.path);
}
}
// Call Khoj backend to update index with all markdown, pdf files
const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
method: 'POST',
headers: {
'x-api-key': 'secret',
},
body: formData,
});
if (!response.ok) {
new Notice(`Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`);
} else {
console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
}
return files;
}
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
let vaultPath = getVaultAbsolutePath(vault);
let mdInVault = `${vaultPath}/**/*.md`;
let pdfInVault = `${vaultPath}/**/*.pdf`;
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
// Check if khoj backend is configured, note if cannot connect to backend
@@ -43,124 +110,33 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
if (!setting.connectedToBackend) return;
// Set index name from the path of the current vault
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
// Get default config fields from khoj backend
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"];
// Get current config if khoj backend configured, else get default config from khoj backend
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
.then(response => JSON.parse(response))
.then(data => {
khoj_already_configured = data["content-type"] != null;
// If khoj backend not configured yet
if (!khoj_already_configured) {
// Create khoj content-type config with only markdown configured
data["content-type"] = {
"markdown": {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
}
}
// Else if khoj config has no markdown content config
else if (!data["content-type"]["markdown"]) {
// Add markdown config to khoj content-type config
// Set markdown config to index markdown files in configured obsidian vault
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
// Else if khoj is not configured to index markdown files in configured obsidian vault
else if (
data["content-type"]["markdown"]["input-files"] != null ||
data["content-type"]["markdown"]["input-filter"] == null ||
data["content-type"]["markdown"]["input-filter"].length != 1 ||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
// Update markdown config in khoj content-type config
// Set markdown config to only index markdown files in configured obsidian vault
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
if (khoj_already_configured && !data["content-type"]["pdf"]) {
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
} else {
data["content-type"]["pdf"] = null;
}
}
// Else if khoj is not configured to index pdf files in configured obsidian vault
else if (khoj_already_configured &&
(
data["content-type"]["pdf"]["input-files"] != null ||
data["content-type"]["pdf"]["input-filter"] == null ||
data["content-type"]["pdf"]["input-filter"].length != 1 ||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
// Update pdf config in khoj content-type config
// Set pdf config to only index pdf files in configured obsidian vault
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
} else {
data["content-type"]["pdf"] = null;
}
}
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
let processorData: ProcessorData = {
"conversation": {
"conversation-logfile": conversationLogFile,
"openai": null,
"enable-offline-chat": setting.enableOfflineChat,
"offline-chat": {
"chat-model": khojDefaultOfflineChatModelName,
"enable-offline-chat": setting.enableOfflineChat,
},
"tokenizer": null,
"max-prompt-size": null,
}
}
// If the Open AI API Key was configured in the plugin settings
if (!!setting.openaiApiKey) {
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName;
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultOpenAIChatModelName;
processorData = {
"conversation": {
"conversation-logfile": conversationLogFile,
@@ -168,7 +144,12 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
"chat-model": openAIChatModel,
"api-key": setting.openaiApiKey,
},
"enable-offline-chat": setting.enableOfflineChat,
"offline-chat": {
"chat-model": khojDefaultOfflineChatModelName,
"enable-offline-chat": setting.enableOfflineChat,
},
"tokenizer": null,
"max-prompt-size": null,
},
}
}
@@ -197,12 +178,8 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) {
method: 'POST',
contentType: 'application/json',
};
// Save khojConfig on khoj backend at khojConfigUrl
await request(requestContent)
// Refresh khoj search index after updating config
.then(_ => request(`${khojUrl}/api/update?t=markdown`))
.then(_ => request(`${khojUrl}/api/update?t=pdf`));
request(requestContent);
}
function getIndexDirectoryFromBackendConfig(filepath: string) {

View file

@@ -24,5 +24,6 @@
"0.12.0": "0.15.0",
"0.12.1": "0.15.0",
"0.12.2": "0.15.0",
"0.12.3": "0.15.0"
"0.12.3": "0.15.0",
"0.13.0": "0.15.0"
}

View file

@@ -28,7 +28,7 @@ from khoj.utils.config import (
)
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
from khoj.utils.fs_syncer import collect_files
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig
from khoj.routers.indexer import configure_content, load_content, configure_search
@@ -136,7 +136,7 @@ def configure_routes(app):
app.include_router(api, prefix="/api")
app.include_router(api_beta, prefix="/api/beta")
app.include_router(indexer, prefix="/v1/indexer")
app.include_router(indexer, prefix="/api/v1/index")
app.include_router(web_client)
app.include_router(auth_router, prefix="/auth")
@@ -156,7 +156,7 @@ if not state.demo:
state.content_index = configure_content(
state.content_index, state.config.content_type, all_files, state.search_models
)
logger.info("📬 Content index updated via Scheduler")
logger.info("📪 Content index updated via Scheduler")
except Exception as e:
logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True)
@@ -207,9 +207,7 @@ def configure_conversation_processor(
conversation_config=ConversationProcessorConfig(
conversation_logfile=conversation_logfile,
openai=(conversation_config.openai if (conversation_config is not None) else None),
enable_offline_chat=(
conversation_config.enable_offline_chat if (conversation_config is not None) else False
),
offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(),
)
)
else:

View file

@@ -236,7 +236,7 @@
</h3>
</div>
<div class="card-description-row">
<p class="card-description">Setup chat using OpenAI</p>
<p class="card-description">Setup online chat using OpenAI</p>
</div>
<div class="card-action-row">
<a class="card-button" href="/config/processor/conversation/openai">
@@ -261,21 +261,21 @@
<img class="card-icon" src="/static/assets/icons/chat.svg" alt="Chat">
<h3 class="card-title">
Offline Chat
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and not current_model_state.conversation_gpt4all %}
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and not current_model_state.conversation_gpt4all %}
<img id="misconfigured-icon-conversation-enable-offline-chat" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="The model was not downloaded as expected.">
{% endif %}
</h3>
</div>
<div class="card-description-row">
<p class="card-description">Setup offline chat (Llama V2)</p>
<p class="card-description">Setup offline chat</p>
</div>
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
<button class="card-button" onclick="toggleEnableLocalLLLM(false)">
Disable
</button>
</div>
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
<button class="card-button happy" onclick="toggleEnableLocalLLLM(true)">
Enable
</button>
@@ -346,7 +346,7 @@
featuresHintText.classList.add("show");
}
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
fetch('/api/config/data/processor/conversation/offline_chat' + '?enable_offline_chat=' + enable, {
method: 'POST',
headers: {
'Content-Type': 'application/json',

View file

@@ -34,7 +34,7 @@
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}">
{% else %}
{% for input_filter in current_config['input_filter'] %}
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter.split('/*')[0] }}">
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter }}">
{% endfor %}
{% endif %}
</td>
@@ -106,17 +106,18 @@
submit.addEventListener("click", function(event) {
event.preventDefault();
let globFormat = "**/*."
let globFormat = "**/*"
let suffixes = [];
if ('{{content_type}}' == "markdown")
suffixes = ["md", "markdown"]
suffixes = [".md", ".markdown"]
else if ('{{content_type}}' == "org")
suffixes = ["org"]
suffixes = [".org"]
else if ('{{content_type}}' === "pdf")
suffixes = ["pdf"]
suffixes = [".pdf"]
else if ('{{content_type}}' === "plaintext")
suffixes = ['*']
suffixes = ['.*']
let globs = suffixes.map(x => `${globFormat}${x}`)
var inputFileNodes = document.getElementsByName("input-files");
var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
@@ -124,10 +125,19 @@
var inputFilter = [];
var nodes = getValidInputNodes(inputFilterNodes);
// A regex that checks for globs in the path. If globs already exist,
// we just don't add our own globbing. If they don't,
// then we assume globbing should be done.
const glob_regex = /([*?\[\]])/;
if (nodes.length > 0) {
for (var i = 0; i < nodes.length; i++) {
for (var j = 0; j < suffixes.length; j++) {
inputFilter.push(nodes[i].value + globFormat + suffixes[j]);
for (var j = 0; j < globs.length; j++) {
if (glob_regex.test(nodes[i].value)) {
inputFilter.push(nodes[i].value);
} else {
inputFilter.push(nodes[i].value + globs[j]);
}
}
}
}

View file

@@ -0,0 +1,83 @@
"""
Current format of khoj.yml
---
app:
...
content-type:
...
processor:
conversation:
enable-offline-chat: false
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
openai:
...
search-type:
...
New format of khoj.yml
---
app:
...
content-type:
...
processor:
conversation:
offline-chat:
enable-offline-chat: false
chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
tokenizer: null
max_prompt_size: null
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
openai:
...
search-type:
...
"""
import logging
from packaging import version
from khoj.utils.yaml import load_config_from_file, save_config_to_file
logger = logging.getLogger(__name__)
def migrate_offline_chat_schema(args):
schema_version = "0.12.3"
raw_config = load_config_from_file(args.config_file)
previous_version = raw_config.get("version")
if "processor" not in raw_config:
return args
if raw_config["processor"] is None:
return args
if "conversation" not in raw_config["processor"]:
return args
if previous_version is None or version.parse(previous_version) < version.parse("0.12.3"):
logger.info(
f"Upgrading config schema to {schema_version} from {previous_version} to make (offline) chat more configuration"
)
raw_config["version"] = schema_version
# Create max-prompt-size field in conversation processor schema
raw_config["processor"]["conversation"]["max-prompt-size"] = None
raw_config["processor"]["conversation"]["tokenizer"] = None
# Create offline chat schema based on existing enable_offline_chat field in khoj config schema
offline_chat_model = (
raw_config["processor"]["conversation"]
.get("offline-chat", {})
.get("chat-model", "llama-2-7b-chat.ggmlv3.q4_0.bin")
)
raw_config["processor"]["conversation"]["offline-chat"] = {
"enable-offline-chat": raw_config["processor"]["conversation"].get("enable-offline-chat", False),
"chat-model": offline_chat_model,
}
# Delete old enable-offline-chat field from conversation processor schema
if "enable-offline-chat" in raw_config["processor"]["conversation"]:
del raw_config["processor"]["conversation"]["enable-offline-chat"]
save_config_to_file(raw_config, args.config_file)
return args
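The migration only needs an object exposing a `config_file` attribute, so it can be exercised directly. A minimal sketch, assuming the module lives at `khoj.migrations.migrate_offline_model` (the path may differ) and an illustrative config location; the khoj CLI normally supplies its parsed arguments instead:

```python
from argparse import Namespace

# Assumed module path; adjust to wherever this migration file lives in the source tree.
from khoj.migrations.migrate_offline_model import migrate_offline_chat_schema

args = Namespace(config_file="/home/user/.khoj/khoj.yml")  # illustrative path
migrate_offline_chat_schema(args)
```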

View file

@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
def extract_questions_offline(
text: str,
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
loaded_model: Union[Any, None] = None,
conversation_log={},
use_history: bool = True,
@@ -113,7 +113,7 @@ def filter_questions(questions: List[str]):
]
filtered_questions = []
for q in questions:
if not any([word in q.lower() for word in hint_words]):
if not any([word in q.lower() for word in hint_words]) and not is_none_or_empty(q):
filtered_questions.append(q)
return filtered_questions
@@ -123,10 +123,12 @@ def converse_offline(
references,
user_query,
conversation_log={},
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
loaded_model: Union[Any, None] = None,
completion_func=None,
conversation_command=ConversationCommand.Default,
max_prompt_size=None,
tokenizer_name=None,
) -> Union[ThreadedGenerator, Iterator[str]]:
"""
Converse with user using Llama
@@ -158,6 +160,8 @@
prompts.system_prompt_message_llamav2,
conversation_log,
model_name=model,
max_prompt_size=max_prompt_size,
tokenizer_name=tokenizer_name,
)
g = ThreadedGenerator(references, completion_func=completion_func)

View file

@@ -1,3 +0,0 @@
model_name_to_url = {
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin"
}

View file

@@ -1,24 +1,8 @@
import os
import logging
import requests
import hashlib
from tqdm import tqdm
from khoj.processor.conversation.gpt4all import model_metadata
logger = logging.getLogger(__name__)
expected_checksum = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "cfa87b15d92fb15a2d7c354b0098578b"}
def get_md5_checksum(filename: str):
hash_md5 = hashlib.md5()
with open(filename, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def download_model(model_name: str):
try:
@@ -27,57 +11,12 @@ def download_model(model_name: str):
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
raise e
url = model_metadata.model_name_to_url.get(model_name)
model_path = os.path.expanduser(f"~/.cache/gpt4all/")
if not url:
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
return GPT4All(model_name=model_name, model_path=model_path)
filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}")
if os.path.exists(filename):
# Check if the user is connected to the internet
try:
requests.get("https://www.google.com/", timeout=5)
except:
logger.debug("User is offline. Disabling allowed download flag")
return GPT4All(model_name=model_name, model_path=model_path, allow_download=False)
return GPT4All(model_name=model_name, model_path=model_path)
# Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file
tmp_filename = filename + ".tmp"
# Use GPU for Chat Model, if available
try:
os.makedirs(os.path.dirname(tmp_filename), exist_ok=True)
logger.debug(f"Downloading model {model_name} from {url} to {filename}...")
with requests.get(url, stream=True) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0))
with open(tmp_filename, "wb") as f, tqdm(
unit="B", # unit string to be displayed.
unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
unit_divisor=1024, # is used when unit_scale is true
total=total_size, # the total iteration.
desc=model_name, # prefix to be displayed on progress bar.
) as progress_bar:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
progress_bar.update(len(chunk))
model = GPT4All(model_name=model_name, device="gpu")
logger.debug("Loaded chat model to GPU.")
except ValueError:
model = GPT4All(model_name=model_name)
logger.debug("Loaded chat model to CPU.")
# Verify the checksum
if expected_checksum.get(model_name) != get_md5_checksum(tmp_filename):
logger.error(
f"Checksum verification failed for {filename}. Removing the tmp file. Offline model will not be available."
)
os.remove(tmp_filename)
raise ValueError(f"Checksum verification failed for downloading {model_name} from {url}.")
# Move the tmp file to the actual file
os.rename(tmp_filename, filename)
logger.debug(f"Successfully downloaded model {model_name} from {url} to {filename}")
return GPT4All(model_name)
except Exception as e:
logger.error(f"Failed to download model {model_name} from {url} to {filename}. Error: {e}", exc_info=True)
# Remove the tmp file if it exists
if os.path.exists(tmp_filename):
os.remove(tmp_filename)
return None
return model
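The rewritten `download_model` hands downloading and checksum verification off to the gpt4all library and keeps only the GPU-first, CPU-fallback loading logic. A minimal sketch of that pattern, assuming a gpt4all version whose constructor accepts the `device` keyword:

```python
from gpt4all import GPT4All

def load_chat_model(model_name: str) -> GPT4All:
    try:
        # GPT4All raises ValueError when no usable GPU backend is available
        return GPT4All(model_name=model_name, device="gpu")
    except ValueError:
        # Fall back to CPU inference
        return GPT4All(model_name=model_name)
```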

View file

@ -116,6 +116,8 @@ def converse(
temperature: float = 0.2,
completion_func=None,
conversation_command=ConversationCommand.Default,
max_prompt_size=None,
tokenizer_name=None,
):
"""
Converse with user using OpenAI's ChatGPT
@ -141,6 +143,8 @@ def converse(
prompts.personality.format(),
conversation_log,
model,
max_prompt_size,
tokenizer_name,
)
truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
logger.debug(f"Conversation Context for GPT: {truncated_messages}")

View file

@ -23,7 +23,7 @@ no_notes_found = PromptTemplate.from_template(
""".strip()
)
system_prompt_message_llamav2 = f"""You are Khoj, a friendly, smart and helpful personal assistant.
system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant.
Using your general knowledge and our past conversations as context, answer the following question.
If you do not know the answer, say 'I don't know.'"""
@ -51,13 +51,13 @@ extract_questions_system_prompt_llamav2 = PromptTemplate.from_template(
general_conversation_llamav2 = PromptTemplate.from_template(
"""
<s>[INST]{query}[/INST]
<s>[INST] {query} [/INST]
""".strip()
)
chat_history_llamav2_from_user = PromptTemplate.from_template(
"""
<s>[INST]{message}[/INST]
<s>[INST] {message} [/INST]
""".strip()
)
@ -69,7 +69,7 @@ chat_history_llamav2_from_assistant = PromptTemplate.from_template(
conversation_llamav2 = PromptTemplate.from_template(
"""
<s>[INST]{query}[/INST]
<s>[INST] {query} [/INST]
""".strip()
)
@ -91,7 +91,7 @@ Question: {query}
notes_conversation_llamav2 = PromptTemplate.from_template(
"""
Notes:
User's Notes:
{references}
Question: {query}
""".strip()
@ -134,19 +134,25 @@ Answer (in second person):"""
extract_questions_llamav2_sample = PromptTemplate.from_template(
"""
<s>[INST]<<SYS>>Current Date: {current_date}<</SYS>>[/INST]</s>
<s>[INST]How was my trip to Cambodia?[/INST][]</s>
<s>[INST]Who did I visit the temple with on that trip?[/INST]Who did I visit the temple with in Cambodia?</s>
<s>[INST]How should I take care of my plants?[/INST]What kind of plants do I have? What issues do my plants have?</s>
<s>[INST]How many tennis balls fit in the back of a 2002 Honda Civic?[/INST]What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
<s>[INST]What did I do for Christmas last year?[/INST]What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
<s>[INST]How are you feeling today?[/INST]</s>
<s>[INST]Is Alice older than Bob?[/INST]When was Alice born? What is Bob's age?</s>
<s>[INST]<<SYS>>
<s>[INST] <<SYS>>Current Date: {current_date}<</SYS>> [/INST]</s>
<s>[INST] How was my trip to Cambodia? [/INST]
How was my trip to Cambodia?</s>
<s>[INST] Who did I visit the temple with on that trip? [/INST]
Who did I visit the temple with in Cambodia?</s>
<s>[INST] How should I take care of my plants? [/INST]
What kind of plants do I have? What issues do my plants have?</s>
<s>[INST] How many tennis balls fit in the back of a 2002 Honda Civic? [/INST]
What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
<s>[INST] What did I do for Christmas last year? [/INST]
What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
<s>[INST] How are you feeling today? [/INST]</s>
<s>[INST] Is Alice older than Bob? [/INST]
When was Alice born? What is Bob's age?</s>
<s>[INST] <<SYS>>
Use these notes from the user's previous conversations to provide a response:
{chat_history}
<</SYS>>[/INST]</s>
<s>[INST]{query}[/INST]
<</SYS>> [/INST]</s>
<s>[INST] {query} [/INST]
"""
)
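The whitespace edits align these templates with the Llama-2 chat format, which wraps each user turn as `<s>[INST] ... [/INST]` with spaces around the instruction tags. A quick render sketch, assuming langchain's `PromptTemplate`:

```python
from langchain.prompts import PromptTemplate

general_conversation_llamav2 = PromptTemplate.from_template(
    "<s>[INST] {query} [/INST]"
)
print(general_conversation_llamav2.format(query="How are you feeling today?"))
# <s>[INST] How are you feeling today? [/INST]
```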

View file

@ -3,24 +3,27 @@ import logging
from time import perf_counter
import json
from datetime import datetime
import queue
import tiktoken
# External packages
from langchain.schema import ChatMessage
from transformers import LlamaTokenizerFast
from transformers import AutoTokenizer
# Internal Packages
import queue
from khoj.utils.helpers import merge_dicts
logger = logging.getLogger(__name__)
max_prompt_size = {
model_to_prompt_size = {
"gpt-3.5-turbo": 4096,
"gpt-4": 8192,
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": 1548,
"llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
"gpt-3.5-turbo-16k": 15000,
}
tokenizer = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "hf-internal-testing/llama-tokenizer"}
model_to_tokenizer = {
"llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
}
class ThreadedGenerator:
@ -82,9 +85,26 @@ def message_to_log(
def generate_chatml_messages_with_context(
user_message, system_message, conversation_log={}, model_name="gpt-3.5-turbo", lookback_turns=2
user_message,
system_message,
conversation_log={},
model_name="gpt-3.5-turbo",
max_prompt_size=None,
tokenizer_name=None,
):
"""Generate messages for ChatGPT with context from previous conversation"""
# Set max prompt size from user config, pre-configured for model or to default prompt size
try:
max_prompt_size = max_prompt_size or model_to_prompt_size[model_name]
except KeyError:
max_prompt_size = 2000
logger.warning(
f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to use a longer context window."
)
# Scale lookback turns proportional to max prompt size supported by model
lookback_turns = max_prompt_size // 750
# Extract Chat History for Context
chat_logs = []
for chat in conversation_log.get("chat", []):
@ -105,19 +125,28 @@ def generate_chatml_messages_with_context(
messages = user_chatml_message + rest_backnforths + system_chatml_message
# Truncate oldest messages from conversation history until under max supported prompt size by model
messages = truncate_messages(messages, max_prompt_size[model_name], model_name)
messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name)
# Return message in chronological order
return messages[::-1]
def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) -> list[ChatMessage]:
def truncate_messages(
messages: list[ChatMessage], max_prompt_size, model_name: str, tokenizer_name=None
) -> list[ChatMessage]:
"""Truncate messages to fit within max prompt size supported by model"""
if "llama" in model_name:
encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name])
else:
encoder = tiktoken.encoding_for_model(model_name)
try:
if model_name.startswith("gpt-"):
encoder = tiktoken.encoding_for_model(model_name)
else:
encoder = AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name])
except Exception:
default_tokenizer = "hf-internal-testing/llama-tokenizer"
encoder = AutoTokenizer.from_pretrained(default_tokenizer)
logger.warning(
f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
)
system_message = messages.pop()
system_message_tokens = len(encoder.encode(system_message.content))
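The tokenizer selection above uses tiktoken for OpenAI models and a Hugging Face tokenizer for everything else, with a hard-coded llama tokenizer as the last resort. A self-contained sketch of that fallback chain (`model_to_tokenizer` is copied from the hunk above):

```python
from typing import Optional

import tiktoken
from transformers import AutoTokenizer

model_to_tokenizer = {
    "llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
}

def get_encoder(model_name: str, tokenizer_name: Optional[str] = None):
    try:
        if model_name.startswith("gpt-"):
            return tiktoken.encoding_for_model(model_name)  # OpenAI models
        # Known or user-configured Hugging Face tokenizer for other chat models
        return AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name])
    except Exception:
        # Unknown model: count tokens with the llama tokenizer as a rough default
        return AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
```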

View file

@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl):
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
tmp_file = f"tmp_pdf_file.pdf"
with open(f"{tmp_file}", "wb") as f:
bytes = base64.b64decode(pdf_files[pdf_file])
bytes = pdf_files[pdf_file]
f.write(bytes)
loader = PyMuPDFLoader(f"{tmp_file}")
pdf_entries_per_file = [page.page_content for page in loader.load()]
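PDF content now reaches this code as raw bytes rather than a base64 string, so it can be written straight to the temp file that PyMuPDFLoader reads. A sketch under that assumption (file names are hypothetical, and pymupdf must be installed for the loader):

```python
from langchain.document_loaders import PyMuPDFLoader

with open("singlepage.pdf", "rb") as f:
    pdf_files = {"singlepage.pdf": f.read()}  # raw bytes, no base64 round trip

tmp_file = "tmp_pdf_file.pdf"
with open(tmp_file, "wb") as f:
    f.write(pdf_files["singlepage.pdf"])
pdf_entries_per_file = [page.page_content for page in PyMuPDFLoader(tmp_file).load()]
```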

View file

@ -30,6 +30,7 @@ from khoj.utils.rawconfig import (
GithubContentConfig,
NotionContentConfig,
ConversationProcessorConfig,
OfflineChatProcessorConfig,
)
from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.state import SearchType
@ -185,6 +186,10 @@ if not state.demo:
state.content_index.markdown = None
elif content_type == "org":
state.content_index.org = None
elif content_type == "plaintext":
state.content_index.plaintext = None
else:
logger.warning(f"Request to delete unknown content type: {content_type} via API")
try:
save_config_to_file_updated_state()
@ -284,10 +289,11 @@ if not state.demo:
except Exception as e:
return {"status": "error", "message": str(e)}
@api.post("/config/data/processor/conversation/enable_offline_chat", status_code=200)
@api.post("/config/data/processor/conversation/offline_chat", status_code=200)
async def set_processor_enable_offline_chat_config_data(
request: Request,
enable_offline_chat: bool,
offline_chat_model: Optional[str] = None,
client: Optional[str] = None,
):
_initialize_config()
@ -301,7 +307,12 @@ if not state.demo:
state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore
assert state.config.processor.conversation is not None
state.config.processor.conversation.enable_offline_chat = enable_offline_chat
if state.config.processor.conversation.offline_chat is None:
state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig()
state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat
if offline_chat_model is not None:
state.config.processor.conversation.offline_chat.chat_model = offline_chat_model
state.processor_config = configure_processor(state.config.processor, state.processor_config)
update_telemetry_state(
@ -322,7 +333,7 @@ if not state.demo:
# Create Routes
@api.get("/config/data/default")
def get_default_config_data():
return constants.default_config
return constants.empty_config
@api.get("/config/types", response_model=List[str])
@ -387,7 +398,7 @@ async def search(
# Encode query with filter terms removed
defiltered_query = user_query
for filter in [DateFilter(), WordFilter(), FileFilter()]:
defiltered_query = filter.defilter(user_query)
defiltered_query = filter.defilter(defiltered_query)
encoded_asymmetric_query = None
if t == SearchType.All or t != SearchType.Image:
@ -622,7 +633,7 @@ def update(
if state.processor_config:
components.append("Conversation processor")
components_msg = ", ".join(components)
logger.info(f"📬 {components_msg} updated via API")
logger.info(f"📪 {components_msg} updated via API")
update_telemetry_state(
request=request,
@ -702,12 +713,18 @@ async def chat(
) -> Response:
perform_chat_checks()
conversation_command = get_conversation_command(query=q, any_references=True)
q = q.replace(f"/{conversation_command.value}", "").strip()
compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
request, q, (n or 5), conversation_command
)
conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references))
if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references):
conversation_command = ConversationCommand.General
if conversation_command == ConversationCommand.Help:
model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai"
model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai"
formatted_help = help_message.format(model=model_type, version=state.khoj_version)
return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200)
@ -768,23 +785,21 @@ async def extract_references_and_questions(
logger.warning(
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
)
return compiled_references, inferred_queries
return compiled_references, inferred_queries, q
if conversation_type == ConversationCommand.General:
return compiled_references, inferred_queries, q
# Extract filter terms from user message
defiltered_query = q
filter_terms = []
for filter in [DateFilter(), WordFilter(), FileFilter()]:
filter_terms += filter.get_filter_terms(q)
defiltered_query = filter.defilter(q)
filters_in_query = " ".join(filter_terms)
defiltered_query = filter.defilter(defiltered_query)
filters_in_query = q.replace(defiltered_query, "").strip()
# Infer search queries from user message
with timer("Extracting search queries took", logger):
# If we've reached here, either the user has enabled offline chat or the openai model is enabled.
if state.processor_config.conversation.enable_offline_chat:
if state.processor_config.conversation.offline_chat.enable_offline_chat:
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
inferred_queries = extract_questions_offline(
defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False
@ -800,7 +815,7 @@ async def extract_references_and_questions(
with timer("Searching knowledge base took", logger):
result_list = []
for query in inferred_queries:
n_items = min(n, 3) if state.processor_config.conversation.enable_offline_chat else n
n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n
result_list.extend(
await search(
f"{query} {filters_in_query}",

View file

@ -113,7 +113,7 @@ def generate_chat_response(
meta_log=meta_log,
)
if state.processor_config.conversation.enable_offline_chat:
if state.processor_config.conversation.offline_chat.enable_offline_chat:
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
chat_response = converse_offline(
references=compiled_references,
@ -122,6 +122,9 @@ def generate_chat_response(
conversation_log=meta_log,
completion_func=partial_completion,
conversation_command=conversation_command,
model=state.processor_config.conversation.offline_chat.chat_model,
max_prompt_size=state.processor_config.conversation.max_prompt_size,
tokenizer_name=state.processor_config.conversation.tokenizer,
)
elif state.processor_config.conversation.openai_model:
@ -135,6 +138,8 @@ def generate_chat_response(
api_key=api_key,
completion_func=partial_completion,
conversation_command=conversation_command,
max_prompt_size=state.processor_config.conversation.max_prompt_size,
tokenizer_name=state.processor_config.conversation.tokenizer,
)
except Exception as e:

View file

@ -1,11 +1,11 @@
# Standard Packages
import logging
import sys
from typing import Optional, Union, Dict
# External Packages
from fastapi import APIRouter, HTTPException, Header, Request, Body, Response
from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile
from pydantic import BaseModel
from khoj.routers.helpers import update_telemetry_state
# Internal Packages
from khoj.utils import state, constants
@ -56,42 +56,30 @@ class IndexerInput(BaseModel):
plaintext: Optional[dict[str, str]] = None
@indexer.post("/batch")
async def index_batch(
@indexer.post("/update")
async def update(
request: Request,
files: list[UploadFile],
x_api_key: str = Header(None),
regenerate: bool = False,
search_type: Optional[Union[state.SearchType, str]] = None,
force: bool = False,
t: Optional[Union[state.SearchType, str]] = None,
client: Optional[str] = None,
user_agent: Optional[str] = Header(None),
referer: Optional[str] = Header(None),
host: Optional[str] = Header(None),
):
if x_api_key != "secret":
raise HTTPException(status_code=401, detail="Invalid API Key")
state.config_lock.acquire()
try:
logger.info(f"Received batch indexing request")
index_batch_request_acc = b""
async for chunk in request.stream():
index_batch_request_acc += chunk
data_bytes = sys.getsizeof(index_batch_request_acc)
unit = "KB"
data_size = data_bytes / 1024
if data_size > 1000:
unit = "MB"
data_size = data_size / 1024
if data_size > 1000:
unit = "GB"
data_size = data_size / 1024
data_size_metric = f"{data_size:.2f} {unit}"
logger.info(f"Received {data_size_metric} of data")
index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc)
logger.info(f"Received {len(index_batch_request.files)} files")
logger.info(f"📬 Updating content index via API call by {client} client")
org_files: Dict[str, str] = {}
markdown_files: Dict[str, str] = {}
pdf_files: Dict[str, str] = {}
plaintext_files: Dict[str, str] = {}
for file in index_batch_request.files:
file_type = get_file_type(file.path)
for file in files:
file_type, encoding = get_file_type(file.content_type)
dict_to_update = None
if file_type == "org":
dict_to_update = org_files
@ -103,9 +91,11 @@ async def index_batch(
dict_to_update = plaintext_files
if dict_to_update is not None:
dict_to_update[file.path] = file.content
dict_to_update[file.filename] = (
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
)
else:
logger.info(f"Skipping unsupported streamed file: {file.path}")
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
indexer_input = IndexerInput(
org=org_files,
@ -115,7 +105,7 @@ async def index_batch(
)
if state.config == None:
logger.info("First run, initializing state.")
logger.info("📬 Initializing content index on first run.")
default_full_config = FullConfig(
content_type=None,
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
@ -142,15 +132,30 @@ async def index_batch(
state.config.content_type,
indexer_input.dict(),
state.search_models,
regenerate=regenerate,
t=search_type,
regenerate=force,
t=t,
full_corpus=False,
)
except Exception as e:
logger.error(f"Failed to process batch indexing request: {e}", exc_info=True)
logger.error(
f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}",
exc_info=True,
)
finally:
state.config_lock.release()
update_telemetry_state(
request=request,
telemetry_type="api",
api="index/update",
client=client,
user_agent=user_agent,
referer=referer,
host=host,
)
logger.info(f"📪 Content index updated via API call by {client} client")
return Response(content="OK", status_code=200)
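The endpoint now accepts standard multipart uploads via FastAPI's `UploadFile`, so clients repeat the `files` field once per file. A sketch of a client call under that scheme, using the dev API key and local server address from this codebase:

```python
import requests

files = [
    ("files", ("notes.org", "* practicing piano", "text/org")),
    ("files", ("meeting.md", "# Notes from client call", "text/markdown")),
]
response = requests.post(
    "http://localhost:42110/api/v1/index/update",
    files=files,
    headers={"x-api-key": "secret"},
)
response.raise_for_status()
```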

View file

@ -9,6 +9,7 @@ from khoj.utils.yaml import parse_config_from_file
from khoj.migrations.migrate_version import migrate_config_to_version
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
from khoj.migrations.migrate_offline_model import migrate_offline_model
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
def cli(args=None):
@ -55,7 +56,12 @@ def cli(args=None):
def run_migrations(args):
migrations = [migrate_config_to_version, migrate_processor_conversation_schema, migrate_offline_model]
migrations = [
migrate_config_to_version,
migrate_processor_conversation_schema,
migrate_offline_model,
migrate_offline_chat_schema,
]
for migration in migrations:
args = migration(args)
return args

View file

@ -12,6 +12,8 @@ from khoj.processor.conversation.gpt4all.utils import download_model
# External Packages
import torch
from khoj.utils.rawconfig import OfflineChatProcessorConfig
logger = logging.getLogger(__name__)
# Internal Packages
@ -84,7 +86,6 @@ class SearchModels:
@dataclass
class GPT4AllProcessorConfig:
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
loaded_model: Union[Any, None] = None
@ -95,18 +96,20 @@ class ConversationProcessorConfigModel:
):
self.openai_model = conversation_config.openai
self.gpt4all_model = GPT4AllProcessorConfig()
self.enable_offline_chat = conversation_config.enable_offline_chat
self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig()
self.max_prompt_size = conversation_config.max_prompt_size
self.tokenizer = conversation_config.tokenizer
self.conversation_logfile = Path(conversation_config.conversation_logfile)
self.chat_session: List[str] = []
self.meta_log: dict = {}
if self.enable_offline_chat:
if self.offline_chat.enable_offline_chat:
try:
self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model)
except ValueError as e:
self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model)
except Exception as e:
self.offline_chat.enable_offline_chat = False
self.gpt4all_model.loaded_model = None
logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
self.enable_offline_chat = False
else:
self.gpt4all_model.loaded_model = None

View file

@ -6,6 +6,64 @@ empty_escape_sequences = "\n|\r|\t| "
app_env_filepath = "~/.khoj/env"
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
empty_config = {
"content-type": {
"org": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz",
"embeddings-file": "~/.khoj/content/org/org_embeddings.pt",
"index-heading-entries": False,
},
"markdown": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz",
"embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt",
},
"pdf": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
},
"plaintext": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz",
"embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt",
},
},
"search-type": {
"symmetric": {
"encoder": "sentence-transformers/all-MiniLM-L6-v2",
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"model_directory": "~/.khoj/search/symmetric/",
},
"asymmetric": {
"encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"model_directory": "~/.khoj/search/asymmetric/",
},
"image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"},
},
"processor": {
"conversation": {
"openai": {
"api-key": None,
"chat-model": "gpt-3.5-turbo",
},
"offline-chat": {
"enable-offline-chat": False,
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
},
"tokenizer": None,
"max-prompt-size": None,
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
}
},
}
# default app config to use
default_config = {
"content-type": {
@ -72,7 +130,12 @@ default_config = {
"api-key": None,
"chat-model": "gpt-3.5-turbo",
},
"enable-offline-chat": False,
"offline-chat": {
"enable-offline-chat": False,
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
},
"tokenizer": None,
"max-prompt-size": None,
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
}
},

View file

@ -1,6 +1,6 @@
import logging
import glob
import base64
import os
from typing import Optional
from bs4 import BeautifulSoup
@ -39,13 +39,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
return soup.get_text(strip=True, separator="\n")
# Extract required fields from config
input_files, input_filter = (
input_files, input_filters = (
config.input_files,
config.input_filter,
)
# Input Validation
if is_none_or_empty(input_files) and is_none_or_empty(input_filter):
if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
logger.debug("At least one of input-files or input-file-filter is required to be specified")
return {}
@ -53,11 +53,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
absolute_plaintext_files, filtered_plaintext_files = set(), set()
if input_files:
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
if input_filter:
if input_filters:
filtered_plaintext_files = {
filtered_file
for jsonl_file_filter in input_filter
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
for plaintext_file_filter in input_filters
for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
@ -73,12 +74,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
filename_to_content_map = {}
for file in all_target_files:
with open(file, "r") as f:
with open(file, "r", encoding="utf8") as f:
try:
plaintext_content = f.read()
if file.endswith(("html", "htm", "xml")):
plaintext_content = extract_html_content(plaintext_content)
filename_to_content_map[file] = f.read()
filename_to_content_map[file] = plaintext_content
except Exception as e:
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
logger.warning(e, exc_info=True)
@ -88,13 +89,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
def get_org_files(config: TextContentConfig):
# Extract required fields from config
org_files, org_file_filter = (
org_files, org_file_filters = (
config.input_files,
config.input_filter,
)
# Input Validation
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
logger.debug("At least one of org-files or org-file-filter is required to be specified")
return {}
@ -102,11 +103,12 @@ def get_org_files(config: TextContentConfig):
absolute_org_files, filtered_org_files = set(), set()
if org_files:
absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
if org_file_filter:
if org_file_filters:
filtered_org_files = {
filtered_file
for org_file_filter in org_file_filter
for org_file_filter in org_file_filters
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_org_files = sorted(absolute_org_files | filtered_org_files)
@ -119,7 +121,7 @@ def get_org_files(config: TextContentConfig):
filename_to_content_map = {}
for file in all_org_files:
with open(file, "r") as f:
with open(file, "r", encoding="utf8") as f:
try:
filename_to_content_map[file] = f.read()
except Exception as e:
@ -131,26 +133,27 @@ def get_org_files(config: TextContentConfig):
def get_markdown_files(config: TextContentConfig):
# Extract required fields from config
markdown_files, markdown_file_filter = (
markdown_files, markdown_file_filters = (
config.input_files,
config.input_filter,
)
# Input Validation
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
return {}
"Get Markdown files to process"
# Get markdown files to process
absolute_markdown_files, filtered_markdown_files = set(), set()
if markdown_files:
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
if markdown_file_filter:
if markdown_file_filters:
filtered_markdown_files = {
filtered_file
for markdown_file_filter in markdown_file_filter
for markdown_file_filter in markdown_file_filters
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
@ -168,7 +171,7 @@ def get_markdown_files(config: TextContentConfig):
filename_to_content_map = {}
for file in all_markdown_files:
with open(file, "r") as f:
with open(file, "r", encoding="utf8") as f:
try:
filename_to_content_map[file] = f.read()
except Exception as e:
@ -180,13 +183,13 @@ def get_markdown_files(config: TextContentConfig):
def get_pdf_files(config: TextContentConfig):
# Extract required fields from config
pdf_files, pdf_file_filter = (
pdf_files, pdf_file_filters = (
config.input_files,
config.input_filter,
)
# Input Validation
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
return {}
@ -194,11 +197,12 @@ def get_pdf_files(config: TextContentConfig):
absolute_pdf_files, filtered_pdf_files = set(), set()
if pdf_files:
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
if pdf_file_filter:
if pdf_file_filters:
filtered_pdf_files = {
filtered_file
for pdf_file_filter in pdf_file_filter
for pdf_file_filter in pdf_file_filters
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
if os.path.isfile(filtered_file)
}
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
@ -214,7 +218,7 @@ def get_pdf_files(config: TextContentConfig):
for file in all_pdf_files:
with open(file, "rb") as f:
try:
filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8")
filename_to_content_map[file] = f.read()
except Exception as e:
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
logger.warning(e, exc_info=True)
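Among the fixes above, each file is now read once into `plaintext_content` and HTML-ish files are reduced to their visible text before indexing; the old code called `f.read()` a second time at end-of-file and stored an empty string. A sketch of the extraction helper, assuming beautifulsoup4 with Python's built-in parser:

```python
from bs4 import BeautifulSoup

def extract_html_content(markup_content: str) -> str:
    soup = BeautifulSoup(markup_content, "html.parser")  # parser choice is an assumption
    return soup.get_text(strip=True, separator="\n")

assert extract_html_content("<html><body>my first web page</body></html>") == "my first web page"
```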

View file

@ -66,20 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
return merged_dict
def get_file_type(filepath: str) -> str:
"Get file type from file path"
file_type = Path(filepath).suffix[1:]
def get_file_type(file_type: str) -> tuple[str, str]:
"Get file type from file mime type"
if file_type in ["md", "markdown"]:
return "markdown"
elif file_type in ["org", "orgmode"]:
return "org"
elif file_type in ["txt", "text", "html", "xml", "htm", "rst"]:
return "plaintext"
elif file_type in ["pdf"]:
return "pdf"
return file_type
encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None
file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
if file_type in ["text/markdown"]:
return "markdown", encoding
elif file_type in ["text/org"]:
return "org", encoding
elif file_type in ["application/pdf"]:
return "pdf", encoding
elif file_type in ["image/jpeg"]:
return "jpeg", encoding
elif file_type in ["image/png"]:
return "png", encoding
elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
return "plaintext", encoding
else:
return "other", encoding
def load_model(
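`get_file_type` now receives a mime type, optionally carrying a charset parameter, instead of a file path. A usage sketch, assuming the reworked helper above is in scope:

```python
assert get_file_type("text/markdown") == ("markdown", None)
assert get_file_type("text/plain; charset=utf-8") == ("plaintext", "utf-8")
assert get_file_type("application/pdf") == ("pdf", None)
assert get_file_type("video/mp4") == ("other", None)
```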

View file

@ -91,10 +91,17 @@ class OpenAIProcessorConfig(ConfigBase):
chat_model: Optional[str] = "gpt-3.5-turbo"
class OfflineChatProcessorConfig(ConfigBase):
enable_offline_chat: Optional[bool] = False
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
class ConversationProcessorConfig(ConfigBase):
conversation_logfile: Path
openai: Optional[OpenAIProcessorConfig]
enable_offline_chat: Optional[bool] = False
offline_chat: Optional[OfflineChatProcessorConfig]
max_prompt_size: Optional[int]
tokenizer: Optional[str]
class ProcessorConfig(ConfigBase):

View file

@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import (
ContentConfig,
ConversationProcessorConfig,
OfflineChatProcessorConfig,
OpenAIProcessorConfig,
ProcessorConfig,
TextContentConfig,
@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory):
# Setup conversation processor
processor_config = ProcessorConfig()
offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
processor_config.conversation = ConversationProcessorConfig(
enable_offline_chat=True,
offline_chat=offline_chat,
conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
)

View file

@ -6,6 +6,7 @@ from urllib.parse import quote
# External Packages
from fastapi.testclient import TestClient
import pytest
# Internal Packages
from app.main import app
@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):
# ----------------------------------------------------------------------------------------------------
def test_index_batch(client):
def test_index_update(client):
# Arrange
request_body = get_sample_files_data()
files = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
response = client.post("/api/v1/index/update", files=files, headers=headers)
# Assert
assert response.status_code == 200
@ -76,12 +77,11 @@ def test_index_batch(client):
def test_regenerate_with_valid_content_type(client):
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
# Arrange
request_body = get_sample_files_data()
files = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
# Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
response = client.get(f"/api/update?force=true&t=github")
# Arrange
request_body = get_sample_files_data()
files = get_sample_files_data()
headers = {"x-api-key": "secret"}
# Act
response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
# Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="Flaky test on parallel test runs")
def test_get_configured_types_via_api(client):
# Act
response = client.get(f"/api/config/types")
@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(
def get_sample_files_data():
return {
"org": {
"path/to/filename.org": "* practicing piano",
"path/to/filename1.org": "** top 3 reasons why I moved to SF",
"path/to/filename2.org": "* how to build a search engine",
},
"pdf": {
"path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
"path/to/filename1.pdf": "The sun is a ball of helium",
"path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
},
"plaintext": {
"path/to/filename.txt": "data,column,value",
"path/to/filename1.txt": "<html>my first web page</html>",
"path/to/filename2.txt": "2021-02-02 Journal Entry",
},
"markdown": {
"path/to/filename.md": "# Notes from client call",
"path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
"path/to/filename2.md": "**Understanding science through the lens of art**",
},
"files": ("path/to/filename.org", "* practicing piano", "text/org"),
"files": ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"),
"files": ("path/to/filename2.org", "* how to build a search engine", "text/org"),
"files": ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"),
"files": ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"),
"files": ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"),
"files": ("path/to/filename.txt", "data,column,value", "text/plain"),
"files": ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain"),
"files": ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"),
"files": ("path/to/filename.md", "# Notes from client call", "text/markdown"),
"files": (
"path/to/filename1.md",
"## Studying anthropological records from the Fatimid caliphate",
"text/markdown",
),
"files": ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"),
}

View file

@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
from khoj.processor.conversation.utils import message_to_log
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
@pytest.fixture(scope="session")
@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model):
@pytest.mark.chatquality
def test_extract_multiple_implicit_questions_from_message(loaded_model):
# Act
response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model)
response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)
# Assert
expected_responses = ["height", "taller", "shorter", "heights"]
expected_responses = ["height", "taller", "shorter", "heights", "who"]
assert len(response) <= 3
for question in response:
assert any([expected_response in question.lower() for expected_response in expected_responses]), (
"Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question
"Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
)
@ -145,7 +145,7 @@ def test_extract_multiple_implicit_questions_from_message(loaded_model):
def test_generate_search_query_using_question_from_chat_history(loaded_model):
# Arrange
message_list = [
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
]
# Act
@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
use_history=True,
)
expected_responses = [
"Vader",
"sons",
all_expected_in_response = [
"Anderson",
]
any_expected_in_response = [
"son",
"Darth",
"sons",
"children",
]
# Assert
assert len(response) >= 1
assert any([expected_response in response[0] for expected_response in expected_responses]), (
assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
"Expected chat actor to ask for clarification in response, but got: " + response[0]
)
assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
"Expected chat actor to ask for clarification in response, but got: " + response[0]
)
@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
def test_generate_search_query_using_answer_from_chat_history(loaded_model):
# Arrange
message_list = [
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
]
# Act
response = extract_questions_offline(
"Is she a Jedi?",
"Is she a Doctor?",
conversation_log=populate_chat_history(message_list),
loaded_model=loaded_model,
use_history=True,
)
expected_responses = [
"Leia",
"Vader",
"Barbara",
"Robert",
"daughter",
]

View file

@ -1,7 +1,6 @@
# Standard Packages
import json
import os
import base64
# Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
# Extract Entries from specified Pdf files
# Read singlepage.pdf into memory as bytes
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
pdf_bytes = f.read()
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
# Act
# Extract Entries from specified Pdf files
with open("tests/data/pdf/multipage.pdf", "rb") as f:
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
pdf_bytes = f.read()
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)

View file

@ -1,26 +1,25 @@
# System Packages
import logging
import locale
from pathlib import Path
import os
# External Packages
import pytest
from khoj.utils.config import SearchModels
# Internal Packages
from khoj.utils.state import content_index, search_models
from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.utils.config import SearchModels
from khoj.utils.fs_syncer import get_org_files
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
# Test
# ----------------------------------------------------------------------------------------------------
def test_text_search_setup_with_missing_file_raises_error(
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
# Arrange
# Ensure file mentioned in org.input-files is missing
single_new_file = Path(org_config_with_only_new_file.input_files[0])
@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error(
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(FileNotFoundError):
data = get_org_files(org_config_with_only_new_file)
get_org_files(org_config_with_only_new_file)
# ----------------------------------------------------------------------------------------------------
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
# Arrange
orgfile = tmp_path / "directory.org" / "file.org"
orgfile.parent.mkdir()
with open(orgfile, "w") as f:
f.write("* Heading\n- List item\n")
org_content_config = TextContentConfig(
input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt"
)
# Act
# should not raise IsADirectoryError and return orgfile
assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"}
# ----------------------------------------------------------------------------------------------------
@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error(
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
# Arrange
data = get_org_files(content_config.org)
# Act
# Regenerate notes embeddings during asymmetric setup
notes_model = text_search.setup(

View file

@ -24,5 +24,6 @@
"0.12.0": "0.15.0",
"0.12.1": "0.15.0",
"0.12.2": "0.15.0",
"0.12.3": "0.15.0"
"0.12.3": "0.15.0",
"0.13.0": "0.15.0"
}