Resolve merge conflicts

This commit is contained in:
sabaimran 2023-10-19 14:39:05 -07:00
commit 963cd165eb
42 changed files with 941 additions and 590 deletions

View file

@ -7,18 +7,21 @@
### Setup ### Setup
#### Offline Chat #### Offline Chat
Offline chat works without internet but it is slower, lower quality and more compute intensive. Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive.
!> **Warning**: This will download a 3Gb+ Llama v2 chat model which can take some time > **System Requirements**:
> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
> - A Mac M1+ or [Vulkan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times
- Open your [Khoj settings](http://localhost:42110/config/), click *Enable* on the Offline Chat card - Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card
![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4') ![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4')
#### Online Chat #### Online Chat
Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive. Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive.
!> **Warning**: This will enable Khoj to send your chat queries and notes to OpenAI for processing !> **Warning**: This will enable Khoj to send your chat queries and query relevant notes to OpenAI for processing
1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys) 1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)
2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key. 2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.

View file

@ -46,7 +46,7 @@ Indexes your org-agenda files, by default.
(use-package khoj (use-package khoj
:ensure t :ensure t
:pin melpa-stable :pin melpa-stable
:bind ("C-c s" . 'khoj) :bind ("C-c s" . 'khoj))
``` ```
- Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj - Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj

View file

@ -1,7 +1,7 @@
{ {
"id": "khoj", "id": "khoj",
"name": "Khoj", "name": "Khoj",
"version": "0.12.3", "version": "0.13.0",
"minAppVersion": "0.15.0", "minAppVersion": "0.15.0",
"description": "An Open-Source AI Personal Assistant for your Digital Brain", "description": "An Open-Source AI Personal Assistant for your Digital Brain",
"author": "Khoj Inc.", "author": "Khoj Inc.",

View file

@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project] [project]
name = "khoj-assistant" name = "khoj-assistant"
description = "An AI personal assistant for your Digital Brain" description = "An AI copilot for your Second Brain"
readme = "README.md" readme = "README.md"
license = "GPL-3.0-or-later" license = "GPL-3.0-or-later"
requires-python = ">=3.8" requires-python = ">=3.8"
@ -40,8 +40,9 @@ dependencies = [
"dateparser >= 1.1.1", "dateparser >= 1.1.1",
"defusedxml == 0.7.1", "defusedxml == 0.7.1",
"fastapi == 0.77.1", "fastapi == 0.77.1",
"python-multipart >= 0.0.5",
"jinja2 == 3.1.2", "jinja2 == 3.1.2",
"openai >= 0.27.0", "openai >= 0.27.0, < 1.0.0",
"tiktoken >= 0.3.2", "tiktoken >= 0.3.2",
"tenacity >= 8.2.2", "tenacity >= 8.2.2",
"pillow == 9.3.0", "pillow == 9.3.0",
@ -83,6 +84,7 @@ test = [
"freezegun >= 1.2.0", "freezegun >= 1.2.0",
"factory-boy >= 3.2.1", "factory-boy >= 3.2.1",
"trio >= 0.22.0", "trio >= 0.22.0",
"pytest-xdist",
] ]
dev = [ dev = [
"khoj-assistant[test]", "khoj-assistant[test]",

View file

@ -9,6 +9,10 @@ do
# Get current project version # Get current project version
current_version=$OPTARG current_version=$OPTARG
# Bump Desktop app to current version
cd $project_root/src/interface/desktop
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
# Bump Obsidian plugin to current version # Bump Obsidian plugin to current version
cd $project_root/src/interface/obsidian cd $project_root/src/interface/obsidian
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json

View file

@ -14,10 +14,11 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th
# External Packages # External Packages
import uvicorn import uvicorn
import django
import schedule
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import schedule
import django
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from rich.logging import RichHandler from rich.logging import RichHandler
from django.core.asgi import get_asgi_application from django.core.asgi import get_asgi_application
@ -41,6 +42,15 @@ app = FastAPI()
# Get Django Application # Get Django Application
django_app = get_asgi_application() django_app = get_asgi_application()
# Add CORS middleware
# Starlette compares each `allow_origins` entry literally against the request's
# Origin header (only a lone "*" acts as a wildcard), and an Origin never carries
# a path. So the deployed web app is listed as a bare origin and the any-port
# localhost case is expressed with `allow_origin_regex` instead of "host:*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["app://obsidian.md", "https://app.khoj.dev", "app://khoj.dev"],
    allow_origin_regex=r"http://localhost(:\d+)?",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Set Locale # Set Locale
locale.setlocale(locale.LC_ALL, "") locale.setlocale(locale.LC_ALL, "")

View file

@ -8,7 +8,6 @@ const {dialog} = require('electron');
const cron = require('cron').CronJob; const cron = require('cron').CronJob;
const axios = require('axios'); const axios = require('axios');
const { Readable } = require('stream');
const KHOJ_URL = 'http://127.0.0.1:42110' const KHOJ_URL = 'http://127.0.0.1:42110'
@ -86,57 +85,65 @@ function handleSetTitle (event, title) {
}); });
} }
function pushDataToKhoj (regenerate = false) { function filenameToMimeType (filename) {
let filesToPush = []; const extension = filename.split('.').pop();
const files = store.get('files'); switch (extension) {
const folders = store.get('folders'); case 'pdf':
state = { return 'application/pdf';
completed: true case 'png':
return 'image/png';
case 'jpg':
case 'jpeg':
return 'image/jpeg';
case 'md':
case 'markdown':
return 'text/markdown';
case 'org':
return 'text/org';
default:
return 'text/plain';
}
} }
if (files) { function pushDataToKhoj (regenerate = false) {
for (file of files) { let filesToPush = [];
const files = store.get('files') || [];
const folders = store.get('folders') || [];
state = { completed: true }
// Collect paths of all configured files to index
for (const file of files) {
filesToPush.push(file.path); filesToPush.push(file.path);
} }
}
if (folders) { // Collect paths of all indexable files in configured folders
for (folder of folders) { for (const folder of folders) {
const files = fs.readdirSync(folder.path, { withFileTypes: true }); const files = fs.readdirSync(folder.path, { withFileTypes: true });
for (file of files) { for (const file of files) {
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) { if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
filesToPush.push(path.join(folder.path, file.name)); filesToPush.push(path.join(folder.path, file.name));
} }
} }
} }
}
let data = {
files: []
}
const lastSync = store.get('lastSync') || []; const lastSync = store.get('lastSync') || [];
const formData = new FormData();
for (file of filesToPush) { for (const file of filesToPush) {
const stats = fs.statSync(file); const stats = fs.statSync(file);
if (!regenerate) { if (!regenerate) {
// Only push files that have been modified since last sync
if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) { if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) {
continue; continue;
} }
} }
// Collect all updated or newly created files since last sync to index on Khoj server
try { try {
let rawData; let encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8";
// If the file is a PDF or IMG file, read it as a binary file let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : "");
if (binaryFileTypes.includes(file.split('.').pop())) { let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding);
rawData = fs.readFileSync(file).toString('base64'); let fileObj = new Blob([fileContent], { type: mimeType });
} else { formData.append('files', fileObj, file);
rawData = fs.readFileSync(file, 'utf8');
}
data.files.push({
path: file,
content: rawData
});
state[file] = { state[file] = {
success: true, success: true,
} }
@ -149,31 +156,23 @@ function pushDataToKhoj (regenerate = false) {
} }
} }
// Mark deleted files for removal from index on Khoj server
for (const syncedFile of lastSync) { for (const syncedFile of lastSync) {
if (!filesToPush.includes(syncedFile.path)) { if (!filesToPush.includes(syncedFile.path)) {
data.files.push({ fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) });
path: syncedFile.path, formData.append('files', fileObj, syncedFile.path);
content: ""
});
} }
} }
const headers = { 'x-api-key': 'secret', 'Content-Type': 'application/json' }; // Send collected files to Khoj server for indexing
if (!!formData?.entries()?.next().value) {
const stream = new Readable({
read() {
this.push(JSON.stringify(data));
this.push(null);
}
});
const hostURL = store.get('hostURL') || KHOJ_URL; const hostURL = store.get('hostURL') || KHOJ_URL;
const headers = {
axios.post(`${hostURL}/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers }) 'x-api-key': 'secret'
};
axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers })
.then(response => { .then(response => {
console.log(response.data); console.log(response.data);
const win = BrowserWindow.getAllWindows()[0];
win.webContents.send('update-state', state);
let lastSync = []; let lastSync = [];
for (const file of filesToPush) { for (const file of filesToPush) {
lastSync.push({ lastSync.push({
@ -186,9 +185,17 @@ function pushDataToKhoj (regenerate = false) {
.catch(error => { .catch(error => {
console.error(error); console.error(error);
state['completed'] = false state['completed'] = false
})
.finally(() => {
// Syncing complete
const win = BrowserWindow.getAllWindows()[0]; const win = BrowserWindow.getAllWindows()[0];
win.webContents.send('update-state', state); if (win) win.webContents.send('update-state', state);
}); });
} else {
// Syncing complete
const win = BrowserWindow.getAllWindows()[0];
if (win) win.webContents.send('update-state', state);
}
} }
pushDataToKhoj(); pushDataToKhoj();

View file

@ -1,13 +1,13 @@
{ {
"name": "Khoj", "name": "Khoj",
"homepage": ".", "version": "0.13.0",
"productName": "Khoj", "description": "An AI copilot for your Second Brain",
"version": "1.0.2", "author": "Saba Imran, Debanjum Singh Solanky <team@khoj.dev>",
"description": "Scaffolding for the desktop entrypoint to Khoj", "license": "GPL-3.0-or-later",
"main": "main.js", "homepage": "https://khoj.dev",
"repository": "\"https://github.com/khoj-ai/khoj\"", "repository": "\"https://github.com/khoj-ai/khoj\"",
"author": "Khoj <team@khoj.dev>", "productName": "Khoj",
"license": "MIT", "main": "main.js",
"private": false, "private": false,
"devDependencies": { "devDependencies": {
"electron": "25.8.1" "electron": "25.8.1"

View file

@ -1,11 +1,12 @@
;;; khoj.el --- AI personal assistant for your digital brain -*- lexical-binding: t -*- ;;; khoj.el --- AI copilot for your Second Brain -*- lexical-binding: t -*-
;; Copyright (C) 2021-2022 Debanjum Singh Solanky ;; Copyright (C) 2021-2023 Khoj Inc.
;; Author: Debanjum Singh Solanky <debanjum@gmail.com> ;; Author: Debanjum Singh Solanky <debanjum@khoj.dev>
;; Description: An AI personal assistant for your digital brain ;; Saba Imran <saba@khoj.dev>
;; Description: An AI copilot for your Second Brain
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image ;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
;; Version: 0.12.3 ;; Version: 0.13.0
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1")) ;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs ;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
@ -28,8 +29,8 @@
;;; Commentary: ;;; Commentary:
;; Create an AI personal assistant for your `org-mode', `markdown' notes, ;; Create an AI copilot to your `org-mode', `markdown' notes,
;; PDFs and images. The assistant exposes 2 modes, search and chat: ;; PDFs and images. The copilot exposes 2 modes, search and chat:
;; ;;
;; Chat provides faster answers, iterative discovery and assisted ;; Chat provides faster answers, iterative discovery and assisted
;; creativity. It requires your OpenAI API key to access GPT models ;; creativity. It requires your OpenAI API key to access GPT models
@ -87,6 +88,21 @@
:group 'khoj :group 'khoj
:type 'integer) :type 'integer)
;; Delay handed to `run-with-idle-timer' when incremental search is set up.
(defcustom khoj-search-on-idle-time 0.3
"Idle time (in seconds) to wait before triggering search."
:group 'khoj
:type 'number)
;; Sent as the "x-api-key" request header when pushing files to index.
(defcustom khoj-server-api-key "secret"
"API Key to Khoj server."
:group 'khoj
:type 'string)
;; Repeat interval handed to `run-with-timer' for the content indexing timer.
(defcustom khoj-index-interval 3600
"Interval (in seconds) to wait before updating content index."
:group 'khoj
:type 'number)
(defcustom khoj-default-content-type "org" (defcustom khoj-default-content-type "org"
"The default content type to perform search on." "The default content type to perform search on."
:group 'khoj :group 'khoj
@ -115,6 +131,15 @@
(defvar khoj--content-type "org" (defvar khoj--content-type "org"
"The type of content to perform search on.") "The type of content to perform search on.")
;; Set when incremental search starts; cancelled on incremental search teardown.
(defvar khoj--search-on-idle-timer nil
"Idle timer to trigger incremental search.")
;; Holds the repeating timer that calls `khoj--server-index-files'.
(defvar khoj--index-timer nil
"Timer to trigger content indexing.")
;; Compared against the next run's file list so files that disappeared
;; can be sent to the server as empty parts.
(defvar khoj--indexed-files '()
"Files that were indexed in previous content indexing run.")
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT)) (declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
(declare-function org-element-type "org-mode" (ELEMENT)) (declare-function org-element-type "org-mode" (ELEMENT))
(declare-function markdown-mode "markdown-mode" ()) (declare-function markdown-mode "markdown-mode" ())
@ -236,6 +261,11 @@ for example), set this to the full interpreter path."
:type 'boolean :type 'boolean
:group 'khoj) :group 'khoj)
(defcustom khoj-offline-chat-model nil
  "Specify chat model to use for offline chat with khoj.
When nil, the offline chat model from the server's default config is used."
  ;; The default value is nil, so the type must admit nil as well as a string;
  ;; a bare `string' type makes Customize report a type mismatch.
  :type '(choice (const :tag "Use server default" nil)
                 (string :tag "Model name"))
  :group 'khoj)
(defcustom khoj-auto-setup t (defcustom khoj-auto-setup t
"Automate install, configure and start of khoj server. "Automate install, configure and start of khoj server.
Auto invokes setup steps on calling main entrypoint." Auto invokes setup steps on calling main entrypoint."
@ -365,9 +395,9 @@ CONFIG is json obtained from Khoj config API."
(string-join "/")))) (string-join "/"))))
(defun khoj--server-configure () (defun khoj--server-configure ()
"Configure the the Khoj server for search and chat." "Configure the Khoj server for search and chat."
(interactive) (interactive)
(let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null)) (let* ((url-request-method "GET")
(current-config (current-config
(with-temp-buffer (with-temp-buffer
(url-insert-file-contents (format "%s/api/config/data" khoj-server-url)) (url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
@ -376,56 +406,12 @@ CONFIG is json obtained from Khoj config API."
(with-temp-buffer (with-temp-buffer
(url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url)) (url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url))
(ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false)))) (ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
(default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file)))
(default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile))) (default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
(chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config)))))) (chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config))))))
(default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config)))) (enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config))))) (offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
(config (or current-config default-config))) (config (or current-config default-config)))
;; Configure content types
(cond
;; If khoj backend is not configured yet
((not current-config)
(message "khoj.el: Server not configured yet.")
(setq config (delq (assoc 'content-type config) config))
(cl-pushnew `(content-type . ((org . ((input-files . ,khoj-org-files)
(input-filter . ,org-directory-regexes)
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
(index-heading-entries . ,json-false)))))
config))
;; Else if khoj config has no org content config
((not (alist-get 'org (alist-get 'content-type config)))
(message "khoj.el: Org-mode content on server not configured yet.")
(let ((new-content-type (alist-get 'content-type config)))
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
(input-filter . ,org-directory-regexes)
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
(index-heading-entries . ,json-false)))
new-content-type)
(setq config (delq (assoc 'content-type config) config))
(cl-pushnew `(content-type . ,new-content-type) config)))
;; Else if khoj is not configured to index specified org files
((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files)
(equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes)))
(message "khoj.el: Org-mode content on server is stale.")
(let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file)))
(new-content-type (alist-get 'content-type config)))
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
(input-filter . ,org-directory-regexes)
(compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory))
(embeddings-file . ,(format "%s/org.pt" index-directory))
(index-heading-entries . ,json-false)))
new-content-type)
(setq config (delq (assoc 'content-type config) config))
(cl-pushnew `(content-type . ,new-content-type) config))))
;; Configure processors ;; Configure processors
(cond (cond
((not khoj-openai-api-key) ((not khoj-openai-api-key)
@ -441,10 +427,11 @@ CONFIG is json obtained from Khoj config API."
;; If khoj backend isn't configured yet ;; If khoj backend isn't configured yet
((not current-config) ((not current-config)
(message "khoj.el: Chat not configured yet.") (message "khoj.el: Khoj not configured yet.")
(setq config (delq (assoc 'processor config) config)) (setq config (delq (assoc 'processor config) config))
(cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) (cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
(enable-offline-chat . ,enable-offline-chat) (offline-chat . ((enable-offline-chat . ,enable-offline-chat)
(chat-model . ,offline-chat-model)))
(openai . ((chat-model . ,chat-model) (openai . ((chat-model . ,chat-model)
(api-key . ,khoj-openai-api-key))))))) (api-key . ,khoj-openai-api-key)))))))
config)) config))
@ -455,7 +442,8 @@ CONFIG is json obtained from Khoj config API."
(let ((new-processor-type (alist-get 'processor config))) (let ((new-processor-type (alist-get 'processor config)))
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir)) (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
(enable-offline-chat . ,enable-offline-chat) (offline-chat . ((enable-offline-chat . ,enable-offline-chat)
(chat-model . ,offline-chat-model)))
(openai . ((chat-model . ,chat-model) (openai . ((chat-model . ,chat-model)
(api-key . ,khoj-openai-api-key))))) (api-key . ,khoj-openai-api-key)))))
new-processor-type) new-processor-type)
@ -465,13 +453,15 @@ CONFIG is json obtained from Khoj config API."
;; Else if chat configuration in khoj backend has gone stale ;; Else if chat configuration in khoj backend has gone stale
((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key) ((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key)
(equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model) (equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model)
(equal (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor config))) enable-offline-chat))) (equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat)
(equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model)))
(message "khoj.el: Chat configuration has gone stale.") (message "khoj.el: Chat configuration has gone stale.")
(let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile))) (let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
(new-processor-type (alist-get 'processor config))) (new-processor-type (alist-get 'processor config)))
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type)) (setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory)) (cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
(enable-offline-chat . ,enable-offline-chat) (offline-chat . ((enable-offline-chat . ,enable-offline-chat)
(chat-model . ,offline-chat-model)))
(openai . ((chat-model . ,khoj-chat-model) (openai . ((chat-model . ,khoj-chat-model)
(api-key . ,khoj-openai-api-key))))) (api-key . ,khoj-openai-api-key)))))
new-processor-type) new-processor-type)
@ -509,9 +499,75 @@ CONFIG is json obtained from Khoj config API."
(khoj--server-configure)))) (khoj--server-configure))))
;; ----------------------------------------------- ;; -------------------
;; Extract and Render Entries of each Content Type ;; Khoj Index Content
;; ----------------------------------------------- ;; -------------------
(defun khoj--server-index-files (&optional force content-type file-paths)
  "Send files at `FILE-PATHS' to the Khoj server to index for search and chat.
`FORCE' re-indexes all files of `CONTENT-TYPE' even if they are already indexed.
FORCE may be a boolean or one of the strings \"true\" or \"false\".
When FILE-PATHS is nil, the org files found under `khoj-org-directories'
plus the files in `khoj-org-files' are sent instead.
Files listed in `khoj--indexed-files' but absent from this run are sent as
empty parts. `khoj--indexed-files' is then set to the files sent in this run."
  (interactive)
  (let* (;; Callers pass FORCE as the string "false", which is non-nil in Lisp.
         ;; Normalize it once so the URL and the messages agree.
         (force-p (and force (not (equal force "false"))))
         (boundary (format "-------------------------%d" (random (expt 10 10))))
         (files-to-index (or file-paths
                             (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))
         (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type)))
         (inhibit-message t)
         (message-log-max nil)
         (url-request-method "POST")
         (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary))
         (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary))
                                      ("x-api-key" . ,khoj-server-api-key))))
    ;; `url-retrieve' may return nil, so its result is not used as a buffer here.
    (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (if force-p "true" "false"))
                  ;; render response from indexing API endpoint on server
                  (lambda (status)
                    (if (not status)
                        (message "khoj.el: %scontent index %supdated" (if content-type (format "%s " content-type) "") (if force-p "force " ""))
                      ;; The callback runs with the response buffer current.
                      ;; Skip past the blank line that ends the HTTP headers
                      ;; so only the response body is reported.
                      (goto-char (point-min))
                      (search-forward "\n\n" nil t)
                      (message "khoj.el: Failed to %supdate %s content index. Status: %s. Response: %s"
                               (if force-p "force " "")
                               (or content-type "all")
                               status
                               (string-trim (buffer-substring-no-properties (point) (point-max))))))
                  nil t t)
    (setq khoj--indexed-files files-to-index)))
(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary)
  "Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body.
Use `BOUNDARY' to separate files. This is sent to Khoj server as a POST request.
Each file in FILES-TO-INDEX becomes a part holding its raw contents. Each file
only in PREVIOUSLY-INDEXED-FILES becomes a part with an empty body."
  (let ((insert-part-header
         ;; Writes the boundary line and headers of one form part at point.
         (lambda (path)
           (insert (format "--%s\r\n" boundary))
           (insert (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" path))
           (insert "Content-Type: text/org\r\n\r\n"))))
    (with-temp-buffer
      (set-buffer-multibyte nil)
      (insert "\n")
      ;; Parts carrying the contents of files to add or update
      (dolist (path files-to-index)
        (funcall insert-part-header path)
        (insert (with-temp-buffer
                  (insert-file-contents-literally path)
                  (buffer-string)))
        (insert "\r\n"))
      ;; Empty parts for files that were indexed before but are gone now
      (dolist (path previously-indexed-files)
        (unless (member path files-to-index)
          (funcall insert-part-header path)
          (insert "\r\n")))
      ;; Closing boundary
      (insert (format "--%s--\r\n" boundary))
      (buffer-string))))
;; These are top-level forms, so they run whenever this file is loaded.
;; Cancel any running indexing timer, first
;; (otherwise re-loading the file would leave the old timer running too)
(when khoj--index-timer
(cancel-timer khoj--index-timer))
;; Send files to index on server every `khoj-index-interval' seconds
;; The first run fires 60 seconds after load.
(setq khoj--index-timer
(run-with-timer 60 khoj-index-interval 'khoj--server-index-files))
;; -------------------------------------------
;; Render Response from Khoj server for Emacs
;; -------------------------------------------
(defun khoj--extract-entries-as-markdown (json-response query) (defun khoj--extract-entries-as-markdown (json-response query)
"Convert JSON-RESPONSE, QUERY from API to markdown entries." "Convert JSON-RESPONSE, QUERY from API to markdown entries."
@ -920,6 +976,9 @@ RECEIVE-DATE is the message receive date."
(message "khoj.el: Teardown Incremental Search") (message "khoj.el: Teardown Incremental Search")
;; unset khoj minibuffer window ;; unset khoj minibuffer window
(setq khoj--minibuffer-window nil) (setq khoj--minibuffer-window nil)
(when (and khoj--search-on-idle-timer
(timerp khoj--search-on-idle-timer))
(cancel-timer khoj--search-on-idle-timer))
;; delete open connections to khoj server ;; delete open connections to khoj server
(khoj--delete-open-network-connections-to-server) (khoj--delete-open-network-connections-to-server)
;; remove hooks for khoj incremental query and self ;; remove hooks for khoj incremental query and self
@ -942,8 +1001,10 @@ RECEIVE-DATE is the message receive date."
;; set current (mini-)buffer entered as khoj minibuffer ;; set current (mini-)buffer entered as khoj minibuffer
;; used to query khoj API only when user in khoj minibuffer ;; used to query khoj API only when user in khoj minibuffer
(setq khoj--minibuffer-window (current-buffer)) (setq khoj--minibuffer-window (current-buffer))
(add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action ; do khoj incremental search after idle time
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit (setq khoj--search-on-idle-timer (run-with-idle-timer khoj-search-on-idle-time t #'khoj--incremental-search))
; teardown khoj incremental search on minibuffer exit
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search))
(read-string khoj--query-prompt)))) (read-string khoj--query-prompt))))
@ -1014,6 +1075,9 @@ Paragraph only starts at first text after blank line."
;; Khoj Menu ;; Khoj Menu
;; --------- ;; ---------
(defun khoj--setup-and-show-menu ()
"Create Transient menu for khoj and show it."
;; Create the Khoj Transient menu
(transient-define-argument khoj--content-type-switch () (transient-define-argument khoj--content-type-switch ()
:class 'transient-switches :class 'transient-switches
:argument-format "--content-type=%s" :argument-format "--content-type=%s"
@ -1049,12 +1113,10 @@ Paragraph only starts at first text after blank line."
(let* ((force-update (if (member "--force-update" args) "true" "false")) (let* ((force-update (if (member "--force-update" args) "true" "false"))
;; set content type to: specified > last used > based on current buffer > default type ;; set content type to: specified > last used > based on current buffer > default type
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name)))) (content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
(type-query (if (equal content-type "all") "" (format "t=%s" content-type)))
(update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update))
(url-request-method "GET")) (url-request-method "GET"))
(progn (progn
(setq khoj--content-type content-type) (setq khoj--content-type content-type)
(url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " ""))))))) (khoj--server-index-files force-update content-type))))
(transient-define-suffix khoj--chat-command (&optional _) (transient-define-suffix khoj--chat-command (&optional _)
"Command to Chat with Khoj." "Command to Chat with Khoj."
@ -1075,6 +1137,9 @@ Paragraph only starts at first text after blank line."
("u" "Update" khoj--update-command) ("u" "Update" khoj--update-command)
("q" "Quit" transient-quit-one)]]) ("q" "Quit" transient-quit-one)]])
;; Show the Khoj Transient menu
(khoj--menu))
;; ---------- ;; ----------
;; Entrypoint ;; Entrypoint
@ -1086,7 +1151,7 @@ Paragraph only starts at first text after blank line."
(interactive) (interactive)
(when khoj-auto-setup (when khoj-auto-setup
(khoj-setup t)) (khoj-setup t))
(khoj--menu)) (khoj--setup-and-show-menu))
(provide 'khoj) (provide 'khoj)

View file

@ -206,6 +206,64 @@ Rule everything\n")
"Rule everything")) "Rule everything"))
)) ))
;; -------------------------------------
;; Test Helpers to Index Content
;; -------------------------------------
(ert-deftest khoj-tests--render-files-to-add-request-body ()
"Test files are formatted into a multi-part http request body"
;; Create two temporary org files with known content to render into the request body
(let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
(act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
;; unwind-protect runs the delete-file cleanup forms even if the assertion fails
(unwind-protect
(progn
(should
(equal
;; Second argument is an empty list here (presumably the files to delete; compare the add-delete test).
;; Third argument "khoj" shows up as the part delimiter in the expected body below.
(khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj")
(format
"\n--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
# Become God\n\
## Upgrade\n\n\
Penance to Immortality\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
## Act\n\n\
Rule everything\n\n\r\n\
--khoj--\r\n" upgrade-file act-file))))
(delete-file upgrade-file)
(delete-file act-file))))
(ert-deftest khoj-tests--render-files-to-add-delete-in-request-body ()
"Test files to add and delete are formatted into a multi-part http request body.
Files passed only in the second list are expected as parts with empty content."
;; Create two temporary org files with known content to render into the request body
(let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
(act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
;; unwind-protect runs the delete-file cleanup forms even if the assertion fails
(unwind-protect
(progn
(should
(equal
;; The second list repeats both temp files and adds "/tmp/deleted-file.org".
;; Only the extra path is expected as an additional, empty-bodied part;
;; the two temp files are expected once each, with their content.
(khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj")
(format
"\n--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
# Become God\n\
## Upgrade\n\n\
Penance to Immortality\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
## Act\n\n\
Rule everything\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
\r
--khoj--\r\n" upgrade-file act-file "/tmp/deleted-file.org"))))
(delete-file upgrade-file)
(delete-file act-file))))
(provide 'khoj-tests) (provide 'khoj-tests)

View file

@ -1,7 +1,7 @@
{ {
"id": "khoj", "id": "khoj",
"name": "Khoj", "name": "Khoj",
"version": "0.12.3", "version": "0.13.0",
"minAppVersion": "0.15.0", "minAppVersion": "0.15.0",
"description": "An Open-Source AI Personal Assistant for your Digital Brain", "description": "An Open-Source AI Personal Assistant for your Digital Brain",
"author": "Khoj Inc.", "author": "Khoj Inc.",

View file

@ -1,7 +1,9 @@
{ {
"name": "Khoj", "name": "Khoj",
"version": "0.12.3", "version": "0.13.0",
"description": "An AI Personal Assistant for your Digital Brain", "description": "An AI copilot for your Second Brain",
"author": "Debanjum Singh Solanky, Saba Imran <team@khoj.dev>",
"license": "GPL-3.0-or-later",
"main": "src/main.js", "main": "src/main.js",
"scripts": { "scripts": {
"dev": "node esbuild.config.mjs", "dev": "node esbuild.config.mjs",
@ -14,8 +16,6 @@
"AI", "AI",
"assistant" "assistant"
], ],
"author": "Debanjum Singh Solanky",
"license": "GPL-3.0-or-later",
"devDependencies": { "devDependencies": {
"@types/node": "^16.11.6", "@types/node": "^16.11.6",
"@typescript-eslint/eslint-plugin": "5.29.0", "@typescript-eslint/eslint-plugin": "5.29.0",

View file

@ -1,12 +1,13 @@
import { Notice, Plugin } from 'obsidian'; import { Notice, Plugin, TFile } from 'obsidian';
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings' import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
import { KhojSearchModal } from 'src/search_modal' import { KhojSearchModal } from 'src/search_modal'
import { KhojChatModal } from 'src/chat_modal' import { KhojChatModal } from 'src/chat_modal'
import { configureKhojBackend } from './utils'; import { configureKhojBackend, updateContentIndex } from './utils';
export default class Khoj extends Plugin { export default class Khoj extends Plugin {
settings: KhojSetting; settings: KhojSetting;
indexingTimer: NodeJS.Timeout;
async onload() { async onload() {
await this.loadSettings(); await this.loadSettings();
@ -54,6 +55,15 @@ export default class Khoj extends Plugin {
// Add a settings tab so the user can configure khoj // Add a settings tab so the user can configure khoj
this.addSettingTab(new KhojSettingTab(this.app, this)); this.addSettingTab(new KhojSettingTab(this.app, this));
// Add scheduled job to update index every 60 minutes
this.indexingTimer = setInterval(async () => {
if (this.settings.autoConfigure) {
this.settings.lastSyncedFiles = await updateContentIndex(
this.app.vault, this.settings, this.settings.lastSyncedFiles
);
}
}, 60 * 60 * 1000);
} }
async loadSettings() { async loadSettings() {
@ -72,4 +82,12 @@ export default class Khoj extends Plugin {
} }
this.saveData(this.settings); this.saveData(this.settings);
} }
/**
 * Plugin teardown hook: stops the periodic content index refresh job
 * (if one was scheduled) and then calls `this.unload()`.
 */
async onunload() {
// Remove scheduled job to update index at regular cadence
// Guarded because indexingTimer is only assigned once the interval has been scheduled
if (this.indexingTimer)
clearInterval(this.indexingTimer);
// NOTE(review): unload() is inherited from the plugin base class; its effect when
// invoked from inside onunload() is not visible in this file - confirm it is needed here
this.unload();
}
} }

View file

@ -1,5 +1,6 @@
import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian'; import { App, Notice, PluginSettingTab, Setting, TFile } from 'obsidian';
import Khoj from 'src/main'; import Khoj from 'src/main';
import { updateContentIndex } from './utils';
export interface KhojSetting { export interface KhojSetting {
enableOfflineChat: boolean; enableOfflineChat: boolean;
@ -8,6 +9,7 @@ export interface KhojSetting {
khojUrl: string; khojUrl: string;
connectedToBackend: boolean; connectedToBackend: boolean;
autoConfigure: boolean; autoConfigure: boolean;
lastSyncedFiles: TFile[];
} }
export const DEFAULT_SETTINGS: KhojSetting = { export const DEFAULT_SETTINGS: KhojSetting = {
@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
connectedToBackend: false, connectedToBackend: false,
autoConfigure: true, autoConfigure: true,
openaiApiKey: '', openaiApiKey: '',
lastSyncedFiles: []
} }
export class KhojSettingTab extends PluginSettingTab { export class KhojSettingTab extends PluginSettingTab {
@ -118,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab {
}, 300); }, 300);
this.plugin.registerInterval(progress_indicator); this.plugin.registerInterval(progress_indicator);
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`); this.plugin.settings.lastSyncedFiles = await updateContentIndex(
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`); this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
);
new Notice('✅ Updated Khoj index.'); new Notice('✅ Updated Khoj index.');
// Reset button once index is updated // Reset button once index is updated

View file

@ -1,4 +1,4 @@
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian'; import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian';
import { KhojSetting } from 'src/settings' import { KhojSetting } from 'src/settings'
export function getVaultAbsolutePath(vault: Vault): string { export function getVaultAbsolutePath(vault: Vault): string {
@ -14,18 +14,85 @@ type OpenAIType = null | {
"api-key": string; "api-key": string;
}; };
type OfflineChatType = null | {
"chat-model": string;
"enable-offline-chat": boolean;
};
interface ProcessorData { interface ProcessorData {
conversation: { conversation: {
"conversation-logfile": string; "conversation-logfile": string;
openai: OpenAIType; openai: OpenAIType;
"enable-offline-chat": boolean; "offline-chat": OfflineChatType;
"tokenizer": null | string;
"max-prompt-size": null | number;
}; };
} }
// File extensions with a known MIME type. Any other extension is treated as plain text.
const MIME_TYPE_BY_EXTENSION = new Map<string, string>([
    ['pdf', 'application/pdf'],
    ['png', 'image/png'],
    ['jpg', 'image/jpeg'],
    ['jpeg', 'image/jpeg'],
    ['md', 'text/markdown'],
    ['markdown', 'text/markdown'],
    ['org', 'text/org'],
]);

/**
 * Map a file extension (without the leading dot) to its MIME type.
 *
 * The lookup is exact and case-sensitive.
 *
 * @param extension - file extension, e.g. 'md' or 'pdf'
 * @returns the matching MIME type, or 'text/plain' when the extension is not recognised
 */
function fileExtensionToMimeType (extension: string): string {
    // A Map (rather than a plain object) keeps inherited keys like 'constructor' from matching
    return MIME_TYPE_BY_EXTENSION.get(extension) ?? 'text/plain';
}
/**
 * Send the vault's markdown and pdf files to the Khoj server to (re)build its content index.
 *
 * Every current markdown/pdf file is uploaded as a part of one multipart form request.
 * Files that were synced previously but are no longer in the vault are sent as parts
 * with empty content, under their old path, so the server can drop them.
 *
 * @param vault - vault to read files from
 * @param setting - plugin settings; `khojUrl` is used to reach the server
 * @param lastSyncedFiles - files returned by the previous successful sync
 * @param regenerate - passed to the server as the `force` query parameter
 * @returns the files sent for indexing when the server accepted the request;
 *          otherwise `lastSyncedFiles` unchanged, so pending deletions are retried next time
 */
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
    // Get all markdown, pdf files in the vault
    console.log(`Khoj: Updating Khoj content index...`)
    const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
    const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
    let countOfFilesToIndex = 0;
    let countOfFilesToDelete = 0;

    // Add all files to index as multipart form data
    const formData = new FormData();
    for (const file of files) {
        countOfFilesToIndex++;
        const isBinary = binaryFileTypes.includes(file.extension);
        // Only text content is tagged with a charset
        const mimeType = fileExtensionToMimeType(file.extension) + (isBinary ? "" : "; charset=UTF-8");
        const fileContent = isBinary ? await vault.readBinary(file) : await vault.read(file);
        formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
    }

    // Add any previously synced files to be deleted to multipart form data.
    // Compare by path rather than object identity: entries in lastSyncedFiles may not be
    // the same object instances as the ones the vault returns now (e.g. after the plugin
    // settings were saved and loaded again), which would mark every file as deleted.
    const currentFilePaths = new Set(files.map(file => file.path));
    for (const lastSyncedFile of lastSyncedFiles) {
        if (!currentFilePaths.has(lastSyncedFile.path)) {
            countOfFilesToDelete++;
            formData.append('files', new Blob([]), lastSyncedFile.path);
        }
    }

    // Call Khoj backend to update index with all markdown, pdf files
    const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
        method: 'POST',
        headers: {
            'x-api-key': 'secret',
        },
        body: formData,
    });

    if (!response.ok) {
        new Notice(`Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`);
        // The server did not apply this sync. Keep the previous record so files removed
        // from the vault are still reported as deleted on the next attempt.
        return lastSyncedFiles;
    }

    console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
    return files;
}
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) { export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
let vaultPath = getVaultAbsolutePath(vault);
let mdInVault = `${vaultPath}/**/*.md`;
let pdfInVault = `${vaultPath}/**/*.pdf`;
let khojConfigUrl = `${setting.khojUrl}/api/config/data`; let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
// Check if khoj backend is configured, note if cannot connect to backend // Check if khoj backend is configured, note if cannot connect to backend
@ -43,124 +110,33 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
if (!setting.connectedToBackend) return; if (!setting.connectedToBackend) return;
// Set index name from the path of the current vault // Set index name from the path of the current vault
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
// Get default config fields from khoj backend // Get default config fields from khoj backend
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response)); let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]); let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"]; let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"];
// Get current config if khoj backend configured, else get default config from khoj backend // Get current config if khoj backend configured, else get default config from khoj backend
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`) await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
.then(response => JSON.parse(response)) .then(response => JSON.parse(response))
.then(data => { .then(data => {
khoj_already_configured = data["content-type"] != null;
// If khoj backend not configured yet
if (!khoj_already_configured) {
// Create khoj content-type config with only markdown configured
data["content-type"] = {
"markdown": {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
}
}
// Else if khoj config has no markdown content config
else if (!data["content-type"]["markdown"]) {
// Add markdown config to khoj content-type config
// Set markdown config to index markdown files in configured obsidian vault
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
// Else if khoj is not configured to index markdown files in configured obsidian vault
else if (
data["content-type"]["markdown"]["input-files"] != null ||
data["content-type"]["markdown"]["input-filter"] == null ||
data["content-type"]["markdown"]["input-filter"].length != 1 ||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
// Update markdown config in khoj content-type config
// Set markdown config to only index markdown files in configured obsidian vault
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
data["content-type"]["markdown"] = {
"input-filter": [mdInVault],
"input-files": null,
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
}
}
if (khoj_already_configured && !data["content-type"]["pdf"]) {
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
} else {
data["content-type"]["pdf"] = null;
}
}
// Else if khoj is not configured to index pdf files in configured obsidian vault
else if (khoj_already_configured &&
(
data["content-type"]["pdf"]["input-files"] != null ||
data["content-type"]["pdf"]["input-filter"] == null ||
data["content-type"]["pdf"]["input-filter"].length != 1 ||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
if (hasPdfFiles) {
// Update pdf config in khoj content-type config
// Set pdf config to only index pdf files in configured obsidian vault
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
data["content-type"]["pdf"] = {
"input-filter": [pdfInVault],
"input-files": null,
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
}
} else {
data["content-type"]["pdf"] = null;
}
}
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`; let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
let processorData: ProcessorData = { let processorData: ProcessorData = {
"conversation": { "conversation": {
"conversation-logfile": conversationLogFile, "conversation-logfile": conversationLogFile,
"openai": null, "openai": null,
"offline-chat": {
"chat-model": khojDefaultOfflineChatModelName,
"enable-offline-chat": setting.enableOfflineChat, "enable-offline-chat": setting.enableOfflineChat,
},
"tokenizer": null,
"max-prompt-size": null,
} }
} }
// If the Open AI API Key was configured in the plugin settings // If the Open AI API Key was configured in the plugin settings
if (!!setting.openaiApiKey) { if (!!setting.openaiApiKey) {
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultOpenAIChatModelName;
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName;
processorData = { processorData = {
"conversation": { "conversation": {
"conversation-logfile": conversationLogFile, "conversation-logfile": conversationLogFile,
@ -168,8 +144,13 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
"chat-model": openAIChatModel, "chat-model": openAIChatModel,
"api-key": setting.openaiApiKey, "api-key": setting.openaiApiKey,
}, },
"offline-chat": {
"chat-model": khojDefaultOfflineChatModelName,
"enable-offline-chat": setting.enableOfflineChat, "enable-offline-chat": setting.enableOfflineChat,
}, },
"tokenizer": null,
"max-prompt-size": null,
},
} }
} }
@ -197,12 +178,8 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) {
method: 'POST', method: 'POST',
contentType: 'application/json', contentType: 'application/json',
}; };
// Save khojConfig on khoj backend at khojConfigUrl // Save khojConfig on khoj backend at khojConfigUrl
await request(requestContent) request(requestContent);
// Refresh khoj search index after updating config
.then(_ => request(`${khojUrl}/api/update?t=markdown`))
.then(_ => request(`${khojUrl}/api/update?t=pdf`));
} }
function getIndexDirectoryFromBackendConfig(filepath: string) { function getIndexDirectoryFromBackendConfig(filepath: string) {

View file

@ -24,5 +24,6 @@
"0.12.0": "0.15.0", "0.12.0": "0.15.0",
"0.12.1": "0.15.0", "0.12.1": "0.15.0",
"0.12.2": "0.15.0", "0.12.2": "0.15.0",
"0.12.3": "0.15.0" "0.12.3": "0.15.0",
"0.13.0": "0.15.0"
} }

View file

@ -28,7 +28,7 @@ from khoj.utils.config import (
) )
from khoj.utils.helpers import resolve_absolute_path, merge_dicts from khoj.utils.helpers import resolve_absolute_path, merge_dicts
from khoj.utils.fs_syncer import collect_files from khoj.utils.fs_syncer import collect_files
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig
from khoj.routers.indexer import configure_content, load_content, configure_search from khoj.routers.indexer import configure_content, load_content, configure_search
@ -136,7 +136,7 @@ def configure_routes(app):
app.include_router(api, prefix="/api") app.include_router(api, prefix="/api")
app.include_router(api_beta, prefix="/api/beta") app.include_router(api_beta, prefix="/api/beta")
app.include_router(indexer, prefix="/v1/indexer") app.include_router(indexer, prefix="/api/v1/index")
app.include_router(web_client) app.include_router(web_client)
app.include_router(auth_router, prefix="/auth") app.include_router(auth_router, prefix="/auth")
@ -156,7 +156,7 @@ if not state.demo:
state.content_index = configure_content( state.content_index = configure_content(
state.content_index, state.config.content_type, all_files, state.search_models state.content_index, state.config.content_type, all_files, state.search_models
) )
logger.info("📬 Content index updated via Scheduler") logger.info("📪 Content index updated via Scheduler")
except Exception as e: except Exception as e:
logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True) logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True)
@ -207,9 +207,7 @@ def configure_conversation_processor(
conversation_config=ConversationProcessorConfig( conversation_config=ConversationProcessorConfig(
conversation_logfile=conversation_logfile, conversation_logfile=conversation_logfile,
openai=(conversation_config.openai if (conversation_config is not None) else None), openai=(conversation_config.openai if (conversation_config is not None) else None),
enable_offline_chat=( offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(),
conversation_config.enable_offline_chat if (conversation_config is not None) else False
),
) )
) )
else: else:

View file

@ -236,7 +236,7 @@
</h3> </h3>
</div> </div>
<div class="card-description-row"> <div class="card-description-row">
<p class="card-description">Setup chat using OpenAI</p> <p class="card-description">Setup online chat using OpenAI</p>
</div> </div>
<div class="card-action-row"> <div class="card-action-row">
<a class="card-button" href="/config/processor/conversation/openai"> <a class="card-button" href="/config/processor/conversation/openai">
@ -261,21 +261,21 @@
<img class="card-icon" src="/static/assets/icons/chat.svg" alt="Chat"> <img class="card-icon" src="/static/assets/icons/chat.svg" alt="Chat">
<h3 class="card-title"> <h3 class="card-title">
Offline Chat Offline Chat
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured"> <img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and not current_model_state.conversation_gpt4all %} {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and not current_model_state.conversation_gpt4all %}
<img id="misconfigured-icon-conversation-enable-offline-chat" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="The model was not downloaded as expected."> <img id="misconfigured-icon-conversation-enable-offline-chat" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="The model was not downloaded as expected.">
{% endif %} {% endif %}
</h3> </h3>
</div> </div>
<div class="card-description-row"> <div class="card-description-row">
<p class="card-description">Setup offline chat (Llama V2)</p> <p class="card-description">Setup offline chat</p>
</div> </div>
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}enabled{% else %}disabled{% endif %}"> <div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
<button class="card-button" onclick="toggleEnableLocalLLLM(false)"> <button class="card-button" onclick="toggleEnableLocalLLLM(false)">
Disable Disable
</button> </button>
</div> </div>
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}disabled{% else %}enabled{% endif %}"> <div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
<button class="card-button happy" onclick="toggleEnableLocalLLLM(true)"> <button class="card-button happy" onclick="toggleEnableLocalLLLM(true)">
Enable Enable
</button> </button>
@ -346,7 +346,7 @@
featuresHintText.classList.add("show"); featuresHintText.classList.add("show");
} }
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, { fetch('/api/config/data/processor/conversation/offline_chat' + '?enable_offline_chat=' + enable, {
method: 'POST', method: 'POST',
headers: { headers: {
'Content-Type': 'application/json', 'Content-Type': 'application/json',

View file

@ -34,7 +34,7 @@
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}"> <input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}">
{% else %} {% else %}
{% for input_filter in current_config['input_filter'] %} {% for input_filter in current_config['input_filter'] %}
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter.split('/*')[0] }}"> <input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter }}">
{% endfor %} {% endfor %}
{% endif %} {% endif %}
</td> </td>
@ -106,17 +106,18 @@
submit.addEventListener("click", function(event) { submit.addEventListener("click", function(event) {
event.preventDefault(); event.preventDefault();
let globFormat = "**/*." let globFormat = "**/*"
let suffixes = []; let suffixes = [];
if ('{{content_type}}' == "markdown") if ('{{content_type}}' == "markdown")
suffixes = ["md", "markdown"] suffixes = [".md", ".markdown"]
else if ('{{content_type}}' == "org") else if ('{{content_type}}' == "org")
suffixes = ["org"] suffixes = [".org"]
else if ('{{content_type}}' === "pdf") else if ('{{content_type}}' === "pdf")
suffixes = ["pdf"] suffixes = [".pdf"]
else if ('{{content_type}}' === "plaintext") else if ('{{content_type}}' === "plaintext")
suffixes = ['*'] suffixes = ['.*']
let globs = suffixes.map(x => `${globFormat}${x}`)
var inputFileNodes = document.getElementsByName("input-files"); var inputFileNodes = document.getElementsByName("input-files");
var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value); var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
@ -124,10 +125,19 @@
var inputFilter = []; var inputFilter = [];
var nodes = getValidInputNodes(inputFilterNodes); var nodes = getValidInputNodes(inputFilterNodes);
// A regex that checks for globs in the path. If they exist,
// we are going to just not add our own globbing. If they don't,
// then we will assume globbing should be done.
const glob_regex = /([*?\[\]])/;
if (nodes.length > 0) { if (nodes.length > 0) {
for (var i = 0; i < nodes.length; i++) { for (var i = 0; i < nodes.length; i++) {
for (var j = 0; j < suffixes.length; j++) { for (var j = 0; j < globs.length; j++) {
inputFilter.push(nodes[i].value + globFormat + suffixes[j]); if (glob_regex.test(nodes[i].value)) {
inputFilter.push(nodes[i].value);
} else {
inputFilter.push(nodes[i].value + globs[j]);
}
} }
} }
} }

View file

@ -0,0 +1,83 @@
"""
Current format of khoj.yml
---
app:
...
content-type:
...
processor:
conversation:
enable-offline-chat: false
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
openai:
...
search-type:
...
New format of khoj.yml
---
app:
...
content-type:
...
processor:
conversation:
offline-chat:
enable-offline-chat: false
chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
tokenizer: null
max-prompt-size: null
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
openai:
...
search-type:
...
"""
import logging
from packaging import version
from khoj.utils.yaml import load_config_from_file, save_config_to_file
logger = logging.getLogger(__name__)
def migrate_offline_chat_schema(args):
    """Migrate khoj.yml to the nested offline-chat schema of version 0.12.3.

    Moves the flat ``enable-offline-chat`` flag of the conversation processor
    into a new ``offline-chat`` section (together with a ``chat-model`` entry)
    and adds the ``tokenizer`` and ``max-prompt-size`` fields, both set to null.
    The config file is rewritten only when its recorded version is missing or
    older than 0.12.3 and a conversation processor is configured.

    Args:
        args: Parsed arguments; ``args.config_file`` is the config file to migrate.

    Returns:
        The ``args`` object it was given, unchanged.
    """
    schema_version = "0.12.3"
    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    # Nothing to migrate unless a conversation processor is configured.
    # A key that is present but null (e.g. `conversation:` with no value) is treated
    # like a missing key, instead of failing on item assignment further down.
    processor_config = raw_config.get("processor")
    if processor_config is None:
        return args
    conversation_config = processor_config.get("conversation")
    if conversation_config is None:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse("0.12.3"):
        logger.info(
            f"Upgrading config schema to {schema_version} from {previous_version} to make (offline) chat more configurable"
        )
        raw_config["version"] = schema_version

        # Create max-prompt-size field in conversation processor schema
        conversation_config["max-prompt-size"] = None
        conversation_config["tokenizer"] = None

        # Create offline chat schema based on existing enable_offline_chat field in khoj config schema
        # Keep any chat model already set under offline-chat; tolerate a null offline-chat section
        existing_offline_chat = conversation_config.get("offline-chat") or {}
        offline_chat_model = existing_offline_chat.get("chat-model", "llama-2-7b-chat.ggmlv3.q4_0.bin")
        conversation_config["offline-chat"] = {
            "enable-offline-chat": conversation_config.get("enable-offline-chat", False),
            "chat-model": offline_chat_model,
        }

        # Delete old enable-offline-chat field from conversation processor schema
        conversation_config.pop("enable-offline-chat", None)

        # conversation_config aliases the section inside raw_config, so the edits above are saved here
        save_config_to_file(raw_config, args.config_file)
    return args

View file

@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
def extract_questions_offline( def extract_questions_offline(
text: str, text: str,
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin", model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
loaded_model: Union[Any, None] = None, loaded_model: Union[Any, None] = None,
conversation_log={}, conversation_log={},
use_history: bool = True, use_history: bool = True,
@ -113,7 +113,7 @@ def filter_questions(questions: List[str]):
] ]
filtered_questions = [] filtered_questions = []
for q in questions: for q in questions:
if not any([word in q.lower() for word in hint_words]): if not any([word in q.lower() for word in hint_words]) and not is_none_or_empty(q):
filtered_questions.append(q) filtered_questions.append(q)
return filtered_questions return filtered_questions
@ -123,10 +123,12 @@ def converse_offline(
references, references,
user_query, user_query,
conversation_log={}, conversation_log={},
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin", model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
loaded_model: Union[Any, None] = None, loaded_model: Union[Any, None] = None,
completion_func=None, completion_func=None,
conversation_command=ConversationCommand.Default, conversation_command=ConversationCommand.Default,
max_prompt_size=None,
tokenizer_name=None,
) -> Union[ThreadedGenerator, Iterator[str]]: ) -> Union[ThreadedGenerator, Iterator[str]]:
""" """
Converse with user using Llama Converse with user using Llama
@ -158,6 +160,8 @@ def converse_offline(
prompts.system_prompt_message_llamav2, prompts.system_prompt_message_llamav2,
conversation_log, conversation_log,
model_name=model, model_name=model,
max_prompt_size=max_prompt_size,
tokenizer_name=tokenizer_name,
) )
g = ThreadedGenerator(references, completion_func=completion_func) g = ThreadedGenerator(references, completion_func=completion_func)

View file

@ -1,3 +0,0 @@
# Maps an offline chat model file name to the URL that model file is downloaded from.
# Model names without an entry here have no known download URL.
model_name_to_url = {
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin"
}

View file

@ -1,24 +1,8 @@
import os
import logging import logging
import requests
import hashlib
from tqdm import tqdm
from khoj.processor.conversation.gpt4all import model_metadata
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Expected MD5 hex digest of each downloadable model file, keyed by model file name.
# A freshly downloaded file is compared against this digest before it is accepted.
expected_checksum = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "cfa87b15d92fb15a2d7c354b0098578b"}
def get_md5_checksum(filename: str) -> str:
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    The file is opened in binary mode and hashed in 8 KiB chunks, so the
    whole file never has to be held in memory at once.

    Args:
        filename: Path of the file to hash.

    Returns:
        The 32 character lowercase hex digest of the file contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The digest is only an integrity check for a downloaded file, not a
    # security primitive. Declaring that keeps md5 usable on FIPS-restricted
    # Python builds, where the plain constructor raises ValueError.
    hash_md5 = hashlib.md5(usedforsecurity=False)
    with open(filename, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. end of file
        for chunk in iter(lambda: f.read(8192), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def download_model(model_name: str): def download_model(model_name: str):
try: try:
@ -27,57 +11,12 @@ def download_model(model_name: str):
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.") logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
raise e raise e
url = model_metadata.model_name_to_url.get(model_name) # Use GPU for Chat Model, if available
model_path = os.path.expanduser(f"~/.cache/gpt4all/")
if not url:
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
return GPT4All(model_name=model_name, model_path=model_path)
filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}")
if os.path.exists(filename):
# Check if the user is connected to the internet
try: try:
requests.get("https://www.google.com/", timeout=5) model = GPT4All(model_name=model_name, device="gpu")
except: logger.debug("Loaded chat model to GPU.")
logger.debug("User is offline. Disabling allowed download flag") except ValueError:
return GPT4All(model_name=model_name, model_path=model_path, allow_download=False) model = GPT4All(model_name=model_name)
return GPT4All(model_name=model_name, model_path=model_path) logger.debug("Loaded chat model to CPU.")
# Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file return model
tmp_filename = filename + ".tmp"
try:
os.makedirs(os.path.dirname(tmp_filename), exist_ok=True)
logger.debug(f"Downloading model {model_name} from {url} to {filename}...")
with requests.get(url, stream=True) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0))
with open(tmp_filename, "wb") as f, tqdm(
unit="B", # unit string to be displayed.
unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
unit_divisor=1024, # is used when unit_scale is true
total=total_size, # the total iteration.
desc=model_name, # prefix to be displayed on progress bar.
) as progress_bar:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
progress_bar.update(len(chunk))
# Verify the checksum
if expected_checksum.get(model_name) != get_md5_checksum(tmp_filename):
logger.error(
f"Checksum verification failed for {filename}. Removing the tmp file. Offline model will not be available."
)
os.remove(tmp_filename)
raise ValueError(f"Checksum verification failed for downloading {model_name} from {url}.")
# Move the tmp file to the actual file
os.rename(tmp_filename, filename)
logger.debug(f"Successfully downloaded model {model_name} from {url} to {filename}")
return GPT4All(model_name)
except Exception as e:
logger.error(f"Failed to download model {model_name} from {url} to {filename}. Error: {e}", exc_info=True)
# Remove the tmp file if it exists
if os.path.exists(tmp_filename):
os.remove(tmp_filename)
return None

View file

@ -116,6 +116,8 @@ def converse(
temperature: float = 0.2, temperature: float = 0.2,
completion_func=None, completion_func=None,
conversation_command=ConversationCommand.Default, conversation_command=ConversationCommand.Default,
max_prompt_size=None,
tokenizer_name=None,
): ):
""" """
Converse with user using OpenAI's ChatGPT Converse with user using OpenAI's ChatGPT
@ -141,6 +143,8 @@ def converse(
prompts.personality.format(), prompts.personality.format(),
conversation_log, conversation_log,
model, model,
max_prompt_size,
tokenizer_name,
) )
truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages}) truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
logger.debug(f"Conversation Context for GPT: {truncated_messages}") logger.debug(f"Conversation Context for GPT: {truncated_messages}")

View file

@ -23,7 +23,7 @@ no_notes_found = PromptTemplate.from_template(
""".strip() """.strip()
) )
system_prompt_message_llamav2 = f"""You are Khoj, a friendly, smart and helpful personal assistant. system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant.
Using your general knowledge and our past conversations as context, answer the following question. Using your general knowledge and our past conversations as context, answer the following question.
If you do not know the answer, say 'I don't know.'""" If you do not know the answer, say 'I don't know.'"""
@ -91,7 +91,7 @@ Question: {query}
notes_conversation_llamav2 = PromptTemplate.from_template( notes_conversation_llamav2 = PromptTemplate.from_template(
""" """
Notes: User's Notes:
{references} {references}
Question: {query} Question: {query}
""".strip() """.strip()
@ -135,13 +135,19 @@ Answer (in second person):"""
extract_questions_llamav2_sample = PromptTemplate.from_template( extract_questions_llamav2_sample = PromptTemplate.from_template(
""" """
<s>[INST] <<SYS>>Current Date: {current_date}<</SYS>> [/INST]</s> <s>[INST] <<SYS>>Current Date: {current_date}<</SYS>> [/INST]</s>
<s>[INST]How was my trip to Cambodia?[/INST][]</s> <s>[INST] How was my trip to Cambodia? [/INST]
<s>[INST]Who did I visit the temple with on that trip?[/INST]Who did I visit the temple with in Cambodia?</s> How was my trip to Cambodia?</s>
<s>[INST]How should I take care of my plants?[/INST]What kind of plants do I have? What issues do my plants have?</s> <s>[INST] Who did I visit the temple with on that trip? [/INST]
<s>[INST]How many tennis balls fit in the back of a 2002 Honda Civic?[/INST]What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s> Who did I visit the temple with in Cambodia?</s>
<s>[INST]What did I do for Christmas last year?[/INST]What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s> <s>[INST] How should I take care of my plants? [/INST]
What kind of plants do I have? What issues do my plants have?</s>
<s>[INST] How many tennis balls fit in the back of a 2002 Honda Civic? [/INST]
What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
<s>[INST] What did I do for Christmas last year? [/INST]
What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
<s>[INST] How are you feeling today? [/INST]</s> <s>[INST] How are you feeling today? [/INST]</s>
<s>[INST]Is Alice older than Bob?[/INST]When was Alice born? What is Bob's age?</s> <s>[INST] Is Alice older than Bob? [/INST]
When was Alice born? What is Bob's age?</s>
<s>[INST] <<SYS>> <s>[INST] <<SYS>>
Use these notes from the user's previous conversations to provide a response: Use these notes from the user's previous conversations to provide a response:
{chat_history} {chat_history}

View file

@ -3,24 +3,27 @@ import logging
from time import perf_counter from time import perf_counter
import json import json
from datetime import datetime from datetime import datetime
import queue
import tiktoken import tiktoken
# External packages # External packages
from langchain.schema import ChatMessage from langchain.schema import ChatMessage
from transformers import LlamaTokenizerFast from transformers import AutoTokenizer
# Internal Packages # Internal Packages
import queue
from khoj.utils.helpers import merge_dicts from khoj.utils.helpers import merge_dicts
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
max_prompt_size = { model_to_prompt_size = {
"gpt-3.5-turbo": 4096, "gpt-3.5-turbo": 4096,
"gpt-4": 8192, "gpt-4": 8192,
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": 1548, "llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
"gpt-3.5-turbo-16k": 15000, "gpt-3.5-turbo-16k": 15000,
} }
tokenizer = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "hf-internal-testing/llama-tokenizer"} model_to_tokenizer = {
"llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
}
class ThreadedGenerator: class ThreadedGenerator:
@ -82,9 +85,26 @@ def message_to_log(
def generate_chatml_messages_with_context( def generate_chatml_messages_with_context(
user_message, system_message, conversation_log={}, model_name="gpt-3.5-turbo", lookback_turns=2 user_message,
system_message,
conversation_log={},
model_name="gpt-3.5-turbo",
max_prompt_size=None,
tokenizer_name=None,
): ):
"""Generate messages for ChatGPT with context from previous conversation""" """Generate messages for ChatGPT with context from previous conversation"""
# Set max prompt size from user config, pre-configured for model or to default prompt size
try:
max_prompt_size = max_prompt_size or model_to_prompt_size[model_name]
except:
max_prompt_size = 2000
logger.warning(
f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to longer context window."
)
# Scale lookback turns proportional to max prompt size supported by model
lookback_turns = max_prompt_size // 750
# Extract Chat History for Context # Extract Chat History for Context
chat_logs = [] chat_logs = []
for chat in conversation_log.get("chat", []): for chat in conversation_log.get("chat", []):
@ -105,19 +125,28 @@ def generate_chatml_messages_with_context(
messages = user_chatml_message + rest_backnforths + system_chatml_message messages = user_chatml_message + rest_backnforths + system_chatml_message
# Truncate oldest messages from conversation history until under max supported prompt size by model # Truncate oldest messages from conversation history until under max supported prompt size by model
messages = truncate_messages(messages, max_prompt_size[model_name], model_name) messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name)
# Return message in chronological order # Return message in chronological order
return messages[::-1] return messages[::-1]
def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) -> list[ChatMessage]: def truncate_messages(
messages: list[ChatMessage], max_prompt_size, model_name: str, tokenizer_name=None
) -> list[ChatMessage]:
"""Truncate messages to fit within max prompt size supported by model""" """Truncate messages to fit within max prompt size supported by model"""
if "llama" in model_name: try:
encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name]) if model_name.startswith("gpt-"):
else:
encoder = tiktoken.encoding_for_model(model_name) encoder = tiktoken.encoding_for_model(model_name)
else:
encoder = AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name])
except:
default_tokenizer = "hf-internal-testing/llama-tokenizer"
encoder = AutoTokenizer.from_pretrained(default_tokenizer)
logger.warning(
f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
)
system_message = messages.pop() system_message = messages.pop()
system_message_tokens = len(encoder.encode(system_message.content)) system_message_tokens = len(encoder.encode(system_message.content))

View file

@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl):
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path # Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
tmp_file = f"tmp_pdf_file.pdf" tmp_file = f"tmp_pdf_file.pdf"
with open(f"{tmp_file}", "wb") as f: with open(f"{tmp_file}", "wb") as f:
bytes = base64.b64decode(pdf_files[pdf_file]) bytes = pdf_files[pdf_file]
f.write(bytes) f.write(bytes)
loader = PyMuPDFLoader(f"{tmp_file}") loader = PyMuPDFLoader(f"{tmp_file}")
pdf_entries_per_file = [page.page_content for page in loader.load()] pdf_entries_per_file = [page.page_content for page in loader.load()]

View file

@ -30,6 +30,7 @@ from khoj.utils.rawconfig import (
GithubContentConfig, GithubContentConfig,
NotionContentConfig, NotionContentConfig,
ConversationProcessorConfig, ConversationProcessorConfig,
OfflineChatProcessorConfig,
) )
from khoj.utils.helpers import resolve_absolute_path from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.state import SearchType from khoj.utils.state import SearchType
@ -185,6 +186,10 @@ if not state.demo:
state.content_index.markdown = None state.content_index.markdown = None
elif content_type == "org": elif content_type == "org":
state.content_index.org = None state.content_index.org = None
elif content_type == "plaintext":
state.content_index.plaintext = None
else:
logger.warning(f"Request to delete unknown content type: {content_type} via API")
try: try:
save_config_to_file_updated_state() save_config_to_file_updated_state()
@ -284,10 +289,11 @@ if not state.demo:
except Exception as e: except Exception as e:
return {"status": "error", "message": str(e)} return {"status": "error", "message": str(e)}
@api.post("/config/data/processor/conversation/enable_offline_chat", status_code=200) @api.post("/config/data/processor/conversation/offline_chat", status_code=200)
async def set_processor_enable_offline_chat_config_data( async def set_processor_enable_offline_chat_config_data(
request: Request, request: Request,
enable_offline_chat: bool, enable_offline_chat: bool,
offline_chat_model: Optional[str] = None,
client: Optional[str] = None, client: Optional[str] = None,
): ):
_initialize_config() _initialize_config()
@ -301,7 +307,12 @@ if not state.demo:
state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore
assert state.config.processor.conversation is not None assert state.config.processor.conversation is not None
state.config.processor.conversation.enable_offline_chat = enable_offline_chat if state.config.processor.conversation.offline_chat is None:
state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig()
state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat
if offline_chat_model is not None:
state.config.processor.conversation.offline_chat.chat_model = offline_chat_model
state.processor_config = configure_processor(state.config.processor, state.processor_config) state.processor_config = configure_processor(state.config.processor, state.processor_config)
update_telemetry_state( update_telemetry_state(
@ -322,7 +333,7 @@ if not state.demo:
# Create Routes # Create Routes
@api.get("/config/data/default") @api.get("/config/data/default")
def get_default_config_data(): def get_default_config_data():
return constants.default_config return constants.empty_config
@api.get("/config/types", response_model=List[str]) @api.get("/config/types", response_model=List[str])
@ -387,7 +398,7 @@ async def search(
# Encode query with filter terms removed # Encode query with filter terms removed
defiltered_query = user_query defiltered_query = user_query
for filter in [DateFilter(), WordFilter(), FileFilter()]: for filter in [DateFilter(), WordFilter(), FileFilter()]:
defiltered_query = filter.defilter(user_query) defiltered_query = filter.defilter(defiltered_query)
encoded_asymmetric_query = None encoded_asymmetric_query = None
if t == SearchType.All or t != SearchType.Image: if t == SearchType.All or t != SearchType.Image:
@ -622,7 +633,7 @@ def update(
if state.processor_config: if state.processor_config:
components.append("Conversation processor") components.append("Conversation processor")
components_msg = ", ".join(components) components_msg = ", ".join(components)
logger.info(f"📬 {components_msg} updated via API") logger.info(f"📪 {components_msg} updated via API")
update_telemetry_state( update_telemetry_state(
request=request, request=request,
@ -702,12 +713,18 @@ async def chat(
) -> Response: ) -> Response:
perform_chat_checks() perform_chat_checks()
conversation_command = get_conversation_command(query=q, any_references=True) conversation_command = get_conversation_command(query=q, any_references=True)
q = q.replace(f"/{conversation_command.value}", "").strip()
compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions( compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
request, q, (n or 5), conversation_command request, q, (n or 5), conversation_command
) )
conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references))
if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references):
conversation_command = ConversationCommand.General
if conversation_command == ConversationCommand.Help: if conversation_command == ConversationCommand.Help:
model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai" model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai"
formatted_help = help_message.format(model=model_type, version=state.khoj_version) formatted_help = help_message.format(model=model_type, version=state.khoj_version)
return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200) return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200)
@ -768,23 +785,21 @@ async def extract_references_and_questions(
logger.warning( logger.warning(
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes." "No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
) )
return compiled_references, inferred_queries return compiled_references, inferred_queries, q
if conversation_type == ConversationCommand.General: if conversation_type == ConversationCommand.General:
return compiled_references, inferred_queries, q return compiled_references, inferred_queries, q
# Extract filter terms from user message # Extract filter terms from user message
defiltered_query = q defiltered_query = q
filter_terms = []
for filter in [DateFilter(), WordFilter(), FileFilter()]: for filter in [DateFilter(), WordFilter(), FileFilter()]:
filter_terms += filter.get_filter_terms(q) defiltered_query = filter.defilter(defiltered_query)
defiltered_query = filter.defilter(q) filters_in_query = q.replace(defiltered_query, "").strip()
filters_in_query = " ".join(filter_terms)
# Infer search queries from user message # Infer search queries from user message
with timer("Extracting search queries took", logger): with timer("Extracting search queries took", logger):
# If we've reached here, either the user has enabled offline chat or the openai model is enabled. # If we've reached here, either the user has enabled offline chat or the openai model is enabled.
if state.processor_config.conversation.enable_offline_chat: if state.processor_config.conversation.offline_chat.enable_offline_chat:
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
inferred_queries = extract_questions_offline( inferred_queries = extract_questions_offline(
defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False
@ -800,7 +815,7 @@ async def extract_references_and_questions(
with timer("Searching knowledge base took", logger): with timer("Searching knowledge base took", logger):
result_list = [] result_list = []
for query in inferred_queries: for query in inferred_queries:
n_items = min(n, 3) if state.processor_config.conversation.enable_offline_chat else n n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n
result_list.extend( result_list.extend(
await search( await search(
f"{query} {filters_in_query}", f"{query} {filters_in_query}",

View file

@ -113,7 +113,7 @@ def generate_chat_response(
meta_log=meta_log, meta_log=meta_log,
) )
if state.processor_config.conversation.enable_offline_chat: if state.processor_config.conversation.offline_chat.enable_offline_chat:
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
chat_response = converse_offline( chat_response = converse_offline(
references=compiled_references, references=compiled_references,
@ -122,6 +122,9 @@ def generate_chat_response(
conversation_log=meta_log, conversation_log=meta_log,
completion_func=partial_completion, completion_func=partial_completion,
conversation_command=conversation_command, conversation_command=conversation_command,
model=state.processor_config.conversation.offline_chat.chat_model,
max_prompt_size=state.processor_config.conversation.max_prompt_size,
tokenizer_name=state.processor_config.conversation.tokenizer,
) )
elif state.processor_config.conversation.openai_model: elif state.processor_config.conversation.openai_model:
@ -135,6 +138,8 @@ def generate_chat_response(
api_key=api_key, api_key=api_key,
completion_func=partial_completion, completion_func=partial_completion,
conversation_command=conversation_command, conversation_command=conversation_command,
max_prompt_size=state.processor_config.conversation.max_prompt_size,
tokenizer_name=state.processor_config.conversation.tokenizer,
) )
except Exception as e: except Exception as e:

View file

@ -1,11 +1,11 @@
# Standard Packages # Standard Packages
import logging import logging
import sys
from typing import Optional, Union, Dict from typing import Optional, Union, Dict
# External Packages # External Packages
from fastapi import APIRouter, HTTPException, Header, Request, Body, Response from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile
from pydantic import BaseModel from pydantic import BaseModel
from khoj.routers.helpers import update_telemetry_state
# Internal Packages # Internal Packages
from khoj.utils import state, constants from khoj.utils import state, constants
@ -56,42 +56,30 @@ class IndexerInput(BaseModel):
plaintext: Optional[dict[str, str]] = None plaintext: Optional[dict[str, str]] = None
@indexer.post("/batch") @indexer.post("/update")
async def index_batch( async def update(
request: Request, request: Request,
files: list[UploadFile],
x_api_key: str = Header(None), x_api_key: str = Header(None),
regenerate: bool = False, force: bool = False,
search_type: Optional[Union[state.SearchType, str]] = None, t: Optional[Union[state.SearchType, str]] = None,
client: Optional[str] = None,
user_agent: Optional[str] = Header(None),
referer: Optional[str] = Header(None),
host: Optional[str] = Header(None),
): ):
if x_api_key != "secret": if x_api_key != "secret":
raise HTTPException(status_code=401, detail="Invalid API Key") raise HTTPException(status_code=401, detail="Invalid API Key")
state.config_lock.acquire() state.config_lock.acquire()
try: try:
logger.info(f"Received batch indexing request") logger.info(f"📬 Updating content index via API call by {client} client")
index_batch_request_acc = b""
async for chunk in request.stream():
index_batch_request_acc += chunk
data_bytes = sys.getsizeof(index_batch_request_acc)
unit = "KB"
data_size = data_bytes / 1024
if data_size > 1000:
unit = "MB"
data_size = data_size / 1024
if data_size > 1000:
unit = "GB"
data_size = data_size / 1024
data_size_metric = f"{data_size:.2f} {unit}"
logger.info(f"Received {data_size_metric} of data")
index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc)
logger.info(f"Received {len(index_batch_request.files)} files")
org_files: Dict[str, str] = {} org_files: Dict[str, str] = {}
markdown_files: Dict[str, str] = {} markdown_files: Dict[str, str] = {}
pdf_files: Dict[str, str] = {} pdf_files: Dict[str, str] = {}
plaintext_files: Dict[str, str] = {} plaintext_files: Dict[str, str] = {}
for file in index_batch_request.files: for file in files:
file_type = get_file_type(file.path) file_type, encoding = get_file_type(file.content_type)
dict_to_update = None dict_to_update = None
if file_type == "org": if file_type == "org":
dict_to_update = org_files dict_to_update = org_files
@ -103,9 +91,11 @@ async def index_batch(
dict_to_update = plaintext_files dict_to_update = plaintext_files
if dict_to_update is not None: if dict_to_update is not None:
dict_to_update[file.path] = file.content dict_to_update[file.filename] = (
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
)
else: else:
logger.info(f"Skipping unsupported streamed file: {file.path}") logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
indexer_input = IndexerInput( indexer_input = IndexerInput(
org=org_files, org=org_files,
@ -115,7 +105,7 @@ async def index_batch(
) )
if state.config == None: if state.config == None:
logger.info("First run, initializing state.") logger.info("📬 Initializing content index on first run.")
default_full_config = FullConfig( default_full_config = FullConfig(
content_type=None, content_type=None,
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]), search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
@ -142,15 +132,30 @@ async def index_batch(
state.config.content_type, state.config.content_type,
indexer_input.dict(), indexer_input.dict(),
state.search_models, state.search_models,
regenerate=regenerate, regenerate=force,
t=search_type, t=t,
full_corpus=False, full_corpus=False,
) )
except Exception as e: except Exception as e:
logger.error(f"Failed to process batch indexing request: {e}", exc_info=True) logger.error(
f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}",
exc_info=True,
)
finally: finally:
state.config_lock.release() state.config_lock.release()
update_telemetry_state(
request=request,
telemetry_type="api",
api="index/update",
client=client,
user_agent=user_agent,
referer=referer,
host=host,
)
logger.info(f"📪 Content index updated via API call by {client} client")
return Response(content="OK", status_code=200) return Response(content="OK", status_code=200)

View file

@ -9,6 +9,7 @@ from khoj.utils.yaml import parse_config_from_file
from khoj.migrations.migrate_version import migrate_config_to_version from khoj.migrations.migrate_version import migrate_config_to_version
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
from khoj.migrations.migrate_offline_model import migrate_offline_model from khoj.migrations.migrate_offline_model import migrate_offline_model
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
def cli(args=None): def cli(args=None):
@ -55,7 +56,12 @@ def cli(args=None):
def run_migrations(args): def run_migrations(args):
migrations = [migrate_config_to_version, migrate_processor_conversation_schema, migrate_offline_model] migrations = [
migrate_config_to_version,
migrate_processor_conversation_schema,
migrate_offline_model,
migrate_offline_chat_schema,
]
for migration in migrations: for migration in migrations:
args = migration(args) args = migration(args)
return args return args

View file

@ -12,6 +12,8 @@ from khoj.processor.conversation.gpt4all.utils import download_model
# External Packages # External Packages
import torch import torch
from khoj.utils.rawconfig import OfflineChatProcessorConfig
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Internal Packages # Internal Packages
@ -84,7 +86,6 @@ class SearchModels:
@dataclass @dataclass
class GPT4AllProcessorConfig: class GPT4AllProcessorConfig:
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
loaded_model: Union[Any, None] = None loaded_model: Union[Any, None] = None
@ -95,18 +96,20 @@ class ConversationProcessorConfigModel:
): ):
self.openai_model = conversation_config.openai self.openai_model = conversation_config.openai
self.gpt4all_model = GPT4AllProcessorConfig() self.gpt4all_model = GPT4AllProcessorConfig()
self.enable_offline_chat = conversation_config.enable_offline_chat self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig()
self.max_prompt_size = conversation_config.max_prompt_size
self.tokenizer = conversation_config.tokenizer
self.conversation_logfile = Path(conversation_config.conversation_logfile) self.conversation_logfile = Path(conversation_config.conversation_logfile)
self.chat_session: List[str] = [] self.chat_session: List[str] = []
self.meta_log: dict = {} self.meta_log: dict = {}
if self.enable_offline_chat: if self.offline_chat.enable_offline_chat:
try: try:
self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model) self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model)
except ValueError as e: except Exception as e:
self.offline_chat.enable_offline_chat = False
self.gpt4all_model.loaded_model = None self.gpt4all_model.loaded_model = None
logger.error(f"Error while loading offline chat model: {e}", exc_info=True) logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
self.enable_offline_chat = False
else: else:
self.gpt4all_model.loaded_model = None self.gpt4all_model.loaded_model = None

View file

@ -6,6 +6,64 @@ empty_escape_sequences = "\n|\r|\t| "
app_env_filepath = "~/.khoj/env" app_env_filepath = "~/.khoj/env"
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry" telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
# Baseline app config with nothing user-specific filled in: every content type
# has its storage paths set but no input files or filters, no OpenAI API key is
# set and offline chat is disabled.
empty_config = {
# Per content type: where to read input from and where to store the index artifacts
"content-type": {
"org": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz",
"embeddings-file": "~/.khoj/content/org/org_embeddings.pt",
"index-heading-entries": False,
},
"markdown": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz",
"embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt",
},
"pdf": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
},
"plaintext": {
"input-files": None,
"input-filter": None,
"compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz",
"embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt",
},
},
# Per search type: encoder / cross-encoder model identifiers and the local model directory
"search-type": {
"symmetric": {
"encoder": "sentence-transformers/all-MiniLM-L6-v2",
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"model_directory": "~/.khoj/search/symmetric/",
},
"asymmetric": {
"encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"model_directory": "~/.khoj/search/asymmetric/",
},
"image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"},
},
# Chat settings: OpenAI and offline chat models, plus where the conversation log is kept
"processor": {
"conversation": {
"openai": {
"api-key": None,
"chat-model": "gpt-3.5-turbo",
},
"offline-chat": {
"enable-offline-chat": False,
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
},
# NOTE(review): None presumably means "use the built-in per-model default" for both
# of the next two settings -- confirm against the code that consumes them
"tokenizer": None,
"max-prompt-size": None,
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
}
},
}
# default app config to use # default app config to use
default_config = { default_config = {
"content-type": { "content-type": {
@ -72,7 +130,12 @@ default_config = {
"api-key": None, "api-key": None,
"chat-model": "gpt-3.5-turbo", "chat-model": "gpt-3.5-turbo",
}, },
"offline-chat": {
"enable-offline-chat": False, "enable-offline-chat": False,
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
},
"tokenizer": None,
"max-prompt-size": None,
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json", "conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
} }
}, },

View file

@ -1,6 +1,6 @@
import logging import logging
import glob import glob
import base64 import os
from typing import Optional from typing import Optional
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -39,13 +39,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
return soup.get_text(strip=True, separator="\n") return soup.get_text(strip=True, separator="\n")
# Extract required fields from config # Extract required fields from config
input_files, input_filter = ( input_files, input_filters = (
config.input_files, config.input_files,
config.input_filter, config.input_filter,
) )
# Input Validation # Input Validation
if is_none_or_empty(input_files) and is_none_or_empty(input_filter): if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
logger.debug("At least one of input-files or input-file-filter is required to be specified") logger.debug("At least one of input-files or input-file-filter is required to be specified")
return {} return {}
@ -53,11 +53,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
absolute_plaintext_files, filtered_plaintext_files = set(), set() absolute_plaintext_files, filtered_plaintext_files = set(), set()
if input_files: if input_files:
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files} absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
if input_filter: if input_filters:
filtered_plaintext_files = { filtered_plaintext_files = {
filtered_file filtered_file
for jsonl_file_filter in input_filter for plaintext_file_filter in input_filters
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True) for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
if os.path.isfile(filtered_file)
} }
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files) all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
@ -73,12 +74,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
filename_to_content_map = {} filename_to_content_map = {}
for file in all_target_files: for file in all_target_files:
with open(file, "r") as f: with open(file, "r", encoding="utf8") as f:
try: try:
plaintext_content = f.read() plaintext_content = f.read()
if file.endswith(("html", "htm", "xml")): if file.endswith(("html", "htm", "xml")):
plaintext_content = extract_html_content(plaintext_content) plaintext_content = extract_html_content(plaintext_content)
filename_to_content_map[file] = f.read() filename_to_content_map[file] = plaintext_content
except Exception as e: except Exception as e:
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.") logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
logger.warning(e, exc_info=True) logger.warning(e, exc_info=True)
@ -88,13 +89,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
def get_org_files(config: TextContentConfig): def get_org_files(config: TextContentConfig):
# Extract required fields from config # Extract required fields from config
org_files, org_file_filter = ( org_files, org_file_filters = (
config.input_files, config.input_files,
config.input_filter, config.input_filter,
) )
# Input Validation # Input Validation
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
logger.debug("At least one of org-files or org-file-filter is required to be specified") logger.debug("At least one of org-files or org-file-filter is required to be specified")
return {} return {}
@ -102,11 +103,12 @@ def get_org_files(config: TextContentConfig):
absolute_org_files, filtered_org_files = set(), set() absolute_org_files, filtered_org_files = set(), set()
if org_files: if org_files:
absolute_org_files = {get_absolute_path(org_file) for org_file in org_files} absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
if org_file_filter: if org_file_filters:
filtered_org_files = { filtered_org_files = {
filtered_file filtered_file
for org_file_filter in org_file_filter for org_file_filter in org_file_filters
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True) for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
if os.path.isfile(filtered_file)
} }
all_org_files = sorted(absolute_org_files | filtered_org_files) all_org_files = sorted(absolute_org_files | filtered_org_files)
@ -119,7 +121,7 @@ def get_org_files(config: TextContentConfig):
filename_to_content_map = {} filename_to_content_map = {}
for file in all_org_files: for file in all_org_files:
with open(file, "r") as f: with open(file, "r", encoding="utf8") as f:
try: try:
filename_to_content_map[file] = f.read() filename_to_content_map[file] = f.read()
except Exception as e: except Exception as e:
@ -131,26 +133,27 @@ def get_org_files(config: TextContentConfig):
def get_markdown_files(config: TextContentConfig): def get_markdown_files(config: TextContentConfig):
# Extract required fields from config # Extract required fields from config
markdown_files, markdown_file_filter = ( markdown_files, markdown_file_filters = (
config.input_files, config.input_files,
config.input_filter, config.input_filter,
) )
# Input Validation # Input Validation
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified") logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
return {} return {}
"Get Markdown files to process" # Get markdown files to process
absolute_markdown_files, filtered_markdown_files = set(), set() absolute_markdown_files, filtered_markdown_files = set(), set()
if markdown_files: if markdown_files:
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
if markdown_file_filter: if markdown_file_filters:
filtered_markdown_files = { filtered_markdown_files = {
filtered_file filtered_file
for markdown_file_filter in markdown_file_filter for markdown_file_filter in markdown_file_filters
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True) for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
if os.path.isfile(filtered_file)
} }
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
@ -168,7 +171,7 @@ def get_markdown_files(config: TextContentConfig):
filename_to_content_map = {} filename_to_content_map = {}
for file in all_markdown_files: for file in all_markdown_files:
with open(file, "r") as f: with open(file, "r", encoding="utf8") as f:
try: try:
filename_to_content_map[file] = f.read() filename_to_content_map[file] = f.read()
except Exception as e: except Exception as e:
@ -180,13 +183,13 @@ def get_markdown_files(config: TextContentConfig):
def get_pdf_files(config: TextContentConfig): def get_pdf_files(config: TextContentConfig):
# Extract required fields from config # Extract required fields from config
pdf_files, pdf_file_filter = ( pdf_files, pdf_file_filters = (
config.input_files, config.input_files,
config.input_filter, config.input_filter,
) )
# Input Validation # Input Validation
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter): if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified") logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
return {} return {}
@ -194,11 +197,12 @@ def get_pdf_files(config: TextContentConfig):
absolute_pdf_files, filtered_pdf_files = set(), set() absolute_pdf_files, filtered_pdf_files = set(), set()
if pdf_files: if pdf_files:
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files} absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
if pdf_file_filter: if pdf_file_filters:
filtered_pdf_files = { filtered_pdf_files = {
filtered_file filtered_file
for pdf_file_filter in pdf_file_filter for pdf_file_filter in pdf_file_filters
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True) for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
if os.path.isfile(filtered_file)
} }
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files) all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
@ -214,7 +218,7 @@ def get_pdf_files(config: TextContentConfig):
for file in all_pdf_files: for file in all_pdf_files:
with open(file, "rb") as f: with open(file, "rb") as f:
try: try:
filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8") filename_to_content_map[file] = f.read()
except Exception as e: except Exception as e:
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.") logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
logger.warning(e, exc_info=True) logger.warning(e, exc_info=True)

View file

@ -66,20 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
return merged_dict return merged_dict
def get_file_type(file_type: str) -> "tuple[str, str | None]":
    """Get the content type label and declared charset from a MIME content type.

    Args:
        file_type: MIME content type, optionally followed by ``;``-separated
            parameters, e.g. ``text/plain; charset=utf-8``.

    Returns:
        ``(label, encoding)``. ``label`` is one of ``markdown``, ``org``, ``pdf``,
        ``jpeg``, ``png``, ``plaintext``, or ``other`` for unrecognized types.
        ``encoding`` is the lower-cased ``charset`` parameter, or ``None`` when
        no charset is declared.
    """
    mime_type, *parameters = file_type.split(";")
    # MIME type names are case-insensitive, so normalize before the lookup
    mime_type = mime_type.strip().lower()

    # Only the charset parameter describes the encoding. Parse parameters one by one
    # so a bare ";" or an unrelated parameter (e.g. "format=flowed") can neither
    # raise nor leak into the returned encoding.
    encoding = None
    for parameter in parameters:
        name, separator, value = parameter.partition("=")
        if separator and name.strip().lower() == "charset":
            encoding = value.strip().strip("\"'").lower() or None
            break

    mime_type_to_label = {
        "text/markdown": "markdown",
        "text/org": "org",
        "application/pdf": "pdf",
        "image/jpeg": "jpeg",
        "image/png": "png",
        "text/plain": "plaintext",
        "text/html": "plaintext",
        "application/xml": "plaintext",
        "text/x-rst": "plaintext",
    }
    return mime_type_to_label.get(mime_type, "other"), encoding
def load_model( def load_model(

View file

@ -91,10 +91,17 @@ class OpenAIProcessorConfig(ConfigBase):
chat_model: Optional[str] = "gpt-3.5-turbo" chat_model: Optional[str] = "gpt-3.5-turbo"
class OfflineChatProcessorConfig(ConfigBase):
    """Settings for the offline chat processor."""

    # Offline chat stays off unless explicitly enabled
    enable_offline_chat: Optional[bool] = False
    # File name of the chat model to use when none is configured
    chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
class ConversationProcessorConfig(ConfigBase): class ConversationProcessorConfig(ConfigBase):
conversation_logfile: Path conversation_logfile: Path
openai: Optional[OpenAIProcessorConfig] openai: Optional[OpenAIProcessorConfig]
enable_offline_chat: Optional[bool] = False offline_chat: Optional[OfflineChatProcessorConfig]
max_prompt_size: Optional[int]
tokenizer: Optional[str]
class ProcessorConfig(ConfigBase): class ProcessorConfig(ConfigBase):

View file

@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path
from khoj.utils.rawconfig import ( from khoj.utils.rawconfig import (
ContentConfig, ContentConfig,
ConversationProcessorConfig, ConversationProcessorConfig,
OfflineChatProcessorConfig,
OpenAIProcessorConfig, OpenAIProcessorConfig,
ProcessorConfig, ProcessorConfig,
TextContentConfig, TextContentConfig,
@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory):
# Setup conversation processor # Setup conversation processor
processor_config = ProcessorConfig() processor_config = ProcessorConfig()
offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
processor_config.conversation = ConversationProcessorConfig( processor_config.conversation = ConversationProcessorConfig(
enable_offline_chat=True, offline_chat=offline_chat,
conversation_logfile=processor_dir.joinpath("conversation_logs.json"), conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
) )

View file

@ -6,6 +6,7 @@ from urllib.parse import quote
# External Packages # External Packages
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
import pytest
# Internal Packages # Internal Packages
from app.main import app from app.main import app
@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_index_batch(client): def test_index_update(client):
# Arrange # Arrange
request_body = get_sample_files_data() files = get_sample_files_data()
headers = {"x-api-key": "secret"} headers = {"x-api-key": "secret"}
# Act # Act
response = client.post("/v1/indexer/batch", json=request_body, headers=headers) response = client.post("/api/v1/index/update", files=files, headers=headers)
# Assert # Assert
assert response.status_code == 200 assert response.status_code == 200
@ -76,12 +77,11 @@ def test_index_batch(client):
def test_regenerate_with_valid_content_type(client): def test_regenerate_with_valid_content_type(client):
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]: for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
# Arrange # Arrange
request_body = get_sample_files_data() files = get_sample_files_data()
headers = {"x-api-key": "secret"} headers = {"x-api-key": "secret"}
# Act # Act
response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers) response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
# Assert # Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}" assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
response = client.get(f"/api/update?force=true&t=github") response = client.get(f"/api/update?force=true&t=github")
# Arrange # Arrange
request_body = get_sample_files_data() files = get_sample_files_data()
headers = {"x-api-key": "secret"} headers = {"x-api-key": "secret"}
# Act # Act
response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers) response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
# Assert # Assert
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github" assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="Flaky test on parallel test runs")
def test_get_configured_types_via_api(client): def test_get_configured_types_via_api(client):
# Act # Act
response = client.get(f"/api/config/types") response = client.get(f"/api/config/types")
@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(
def get_sample_files_data():
    """Build sample multipart upload data for the index update API tests.

    Returns:
        A list of ``("files", (filename, content, mime_type))`` tuples, suitable
        for the test client's ``files=`` argument. A list is required because every
        upload shares the ``files`` form field: a dict literal that repeats the
        same key keeps only its last entry, which would silently drop all but one
        sample file.
    """
    sample_files = [
        ("path/to/filename.org", "* practicing piano", "text/org"),
        ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"),
        ("path/to/filename2.org", "* how to build a search engine", "text/org"),
        ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"),
        ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"),
        ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"),
        ("path/to/filename.txt", "data,column,value", "text/plain"),
        ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain"),
        ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"),
        ("path/to/filename.md", "# Notes from client call", "text/markdown"),
        (
            "path/to/filename1.md",
            "## Studying anthropological records from the Fatimid caliphate",
            "text/markdown",
        ),
        ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"),
    ]
    return [("files", sample_file) for sample_file in sample_files]

View file

@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
from khoj.processor.conversation.utils import message_to_log from khoj.processor.conversation.utils import message_to_log
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin" MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model):
@pytest.mark.chatquality @pytest.mark.chatquality
def test_extract_multiple_implicit_questions_from_message(loaded_model): def test_extract_multiple_implicit_questions_from_message(loaded_model):
# Act # Act
response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model) response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)
# Assert # Assert
expected_responses = ["height", "taller", "shorter", "heights"] expected_responses = ["height", "taller", "shorter", "heights", "who"]
assert len(response) <= 3 assert len(response) <= 3
for question in response: for question in response:
assert any([expected_response in question.lower() for expected_response in expected_responses]), ( assert any([expected_response in question.lower() for expected_response in expected_responses]), (
"Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question "Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
) )
@ -145,7 +145,7 @@ def test_extract_multiple_implicit_questions_from_message(loaded_model):
def test_generate_search_query_using_question_from_chat_history(loaded_model): def test_generate_search_query_using_question_from_chat_history(loaded_model):
# Arrange # Arrange
message_list = [ message_list = [
("What is the name of Mr. Vader's daughter?", "Princess Leia", []), ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
] ]
# Act # Act
@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
use_history=True, use_history=True,
) )
expected_responses = [ all_expected_in_response = [
"Vader", "Anderson",
"sons", ]
any_expected_in_response = [
"son", "son",
"Darth", "sons",
"children", "children",
] ]
# Assert # Assert
assert len(response) >= 1 assert len(response) >= 1
assert any([expected_response in response[0] for expected_response in expected_responses]), ( assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
"Expected chat actor to ask for clarification in response, but got: " + response[0]
)
assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
"Expected chat actor to ask for clarification in response, but got: " + response[0] "Expected chat actor to ask for clarification in response, but got: " + response[0]
) )
@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
def test_generate_search_query_using_answer_from_chat_history(loaded_model): def test_generate_search_query_using_answer_from_chat_history(loaded_model):
# Arrange # Arrange
message_list = [ message_list = [
("What is the name of Mr. Vader's daughter?", "Princess Leia", []), ("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
] ]
# Act # Act
response = extract_questions_offline( response = extract_questions_offline(
"Is she a Jedi?", "Is she a Doctor?",
conversation_log=populate_chat_history(message_list), conversation_log=populate_chat_history(message_list),
loaded_model=loaded_model, loaded_model=loaded_model,
use_history=True, use_history=True,
) )
expected_responses = [ expected_responses = [
"Leia", "Barbara",
"Vader", "Robert",
"daughter", "daughter",
] ]

View file

@ -1,7 +1,6 @@
# Standard Packages # Standard Packages
import json import json
import os import os
import base64
# Internal Packages # Internal Packages
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
# Extract Entries from specified Pdf files # Extract Entries from specified Pdf files
# Read singlepage.pdf into memory as bytes # Read singlepage.pdf into memory as bytes
with open("tests/data/pdf/singlepage.pdf", "rb") as f: with open("tests/data/pdf/singlepage.pdf", "rb") as f:
pdf_bytes = base64.b64encode(f.read()).decode("utf-8") pdf_bytes = f.read()
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
# Act # Act
# Extract Entries from specified Pdf files # Extract Entries from specified Pdf files
with open("tests/data/pdf/multipage.pdf", "rb") as f: with open("tests/data/pdf/multipage.pdf", "rb") as f:
pdf_bytes = base64.b64encode(f.read()).decode("utf-8") pdf_bytes = f.read()
data = {"tests/data/pdf/multipage.pdf": pdf_bytes} data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data) entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)

View file

@ -1,26 +1,25 @@
# System Packages # System Packages
import logging import logging
import locale
from pathlib import Path from pathlib import Path
import os import os
# External Packages # External Packages
import pytest import pytest
from khoj.utils.config import SearchModels
# Internal Packages # Internal Packages
from khoj.utils.state import content_index, search_models from khoj.utils.state import content_index, search_models
from khoj.search_type import text_search from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.utils.config import SearchModels
from khoj.utils.fs_syncer import get_org_files from khoj.utils.fs_syncer import get_org_files
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
# Test # Test
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_text_search_setup_with_missing_file_raises_error( def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
):
# Arrange # Arrange
# Ensure file mentioned in org.input-files is missing # Ensure file mentioned in org.input-files is missing
single_new_file = Path(org_config_with_only_new_file.input_files[0]) single_new_file = Path(org_config_with_only_new_file.input_files[0])
@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error(
# Act # Act
# Generate notes embeddings during asymmetric setup # Generate notes embeddings during asymmetric setup
with pytest.raises(FileNotFoundError): with pytest.raises(FileNotFoundError):
data = get_org_files(org_config_with_only_new_file) get_org_files(org_config_with_only_new_file)
# ----------------------------------------------------------------------------------------------------
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
    """A directory whose name ends in .org, matched by the input filter, is not read as a file."""
    # Arrange
    # Place an org file inside a directory that itself carries the .org suffix
    org_suffixed_dir = tmp_path / "directory.org"
    org_suffixed_dir.mkdir()
    orgfile = org_suffixed_dir / "file.org"
    with open(orgfile, "w") as org_fp:
        org_fp.write("* Heading\n- List item\n")

    # The recursive filter matches both the directory and the file within it
    org_content_config = TextContentConfig(
        input_filter=[f"{tmp_path}/**/*"],
        compressed_jsonl="test.jsonl",
        embeddings_file="test.pt",
    )

    # Act
    org_files = get_org_files(org_content_config)

    # Assert
    # should not raise IsADirectoryError and return orgfile
    assert org_files == {f"{orgfile}": "* Heading\n- List item\n"}
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error(
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels): def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
# Arrange # Arrange
data = get_org_files(content_config.org) data = get_org_files(content_config.org)
# Act # Act
# Regenerate notes embeddings during asymmetric setup # Regenerate notes embeddings during asymmetric setup
notes_model = text_search.setup( notes_model = text_search.setup(

View file

@ -24,5 +24,6 @@
"0.12.0": "0.15.0", "0.12.0": "0.15.0",
"0.12.1": "0.15.0", "0.12.1": "0.15.0",
"0.12.2": "0.15.0", "0.12.2": "0.15.0",
"0.12.3": "0.15.0" "0.12.3": "0.15.0",
"0.13.0": "0.15.0"
} }