mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Resolve merge conflicts
This commit is contained in:
commit
963cd165eb
42 changed files with 941 additions and 590 deletions
11
docs/chat.md
11
docs/chat.md
|
@ -7,18 +7,21 @@
|
|||
|
||||
### Setup
|
||||
#### Offline Chat
|
||||
Offline chat works without internet but it is slower, lower quality and more compute intensive.
|
||||
Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive.
|
||||
|
||||
!> **Warning**: This will download a 3GB+ Llama v2 chat model, which can take some time
|
||||
> **System Requirements**:
|
||||
> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
|
||||
> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
|
||||
> - A Mac M1+ or [Vulkan supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times
|
||||
|
||||
- Open your [Khoj settings](http://localhost:42110/config/), click *Enable* on the Offline Chat card
|
||||
- Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card
|
||||
|
||||
![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4')
|
||||
|
||||
#### Online Chat
|
||||
Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive.
|
||||
|
||||
!> **Warning**: This will enable Khoj to send your chat queries and notes to OpenAI for processing
|
||||
!> **Warning**: This will enable Khoj to send your chat queries and query relevant notes to OpenAI for processing
|
||||
|
||||
1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)
|
||||
2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
|
||||
|
|
|
@ -46,7 +46,7 @@ Indexes your org-agenda files, by default.
|
|||
(use-package khoj
|
||||
:ensure t
|
||||
:pin melpa-stable
|
||||
:bind ("C-c s" . 'khoj)
|
||||
:bind ("C-c s" . 'khoj))
|
||||
```
|
||||
|
||||
- Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "khoj",
|
||||
"name": "Khoj",
|
||||
"version": "0.12.3",
|
||||
"version": "0.13.0",
|
||||
"minAppVersion": "0.15.0",
|
||||
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
|
||||
"author": "Khoj Inc.",
|
||||
|
|
|
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|||
|
||||
[project]
|
||||
name = "khoj-assistant"
|
||||
description = "An AI personal assistant for your Digital Brain"
|
||||
description = "An AI copilot for your Second Brain"
|
||||
readme = "README.md"
|
||||
license = "GPL-3.0-or-later"
|
||||
requires-python = ">=3.8"
|
||||
|
@ -40,8 +40,9 @@ dependencies = [
|
|||
"dateparser >= 1.1.1",
|
||||
"defusedxml == 0.7.1",
|
||||
"fastapi == 0.77.1",
|
||||
"python-multipart >= 0.0.5",
|
||||
"jinja2 == 3.1.2",
|
||||
"openai >= 0.27.0",
|
||||
"openai >= 0.27.0, < 1.0.0",
|
||||
"tiktoken >= 0.3.2",
|
||||
"tenacity >= 8.2.2",
|
||||
"pillow == 9.3.0",
|
||||
|
@ -83,6 +84,7 @@ test = [
|
|||
"freezegun >= 1.2.0",
|
||||
"factory-boy >= 3.2.1",
|
||||
"trio >= 0.22.0",
|
||||
"pytest-xdist",
|
||||
]
|
||||
dev = [
|
||||
"khoj-assistant[test]",
|
||||
|
|
|
@ -9,6 +9,10 @@ do
|
|||
# Get current project version
|
||||
current_version=$OPTARG
|
||||
|
||||
# Bump Desktop app to current version
|
||||
cd $project_root/src/interface/desktop
|
||||
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
|
||||
|
||||
# Bump Obsidian plugin to current version
|
||||
cd $project_root/src/interface/obsidian
|
||||
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
|
||||
|
|
|
@ -14,10 +14,11 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th
|
|||
|
||||
# External Packages
|
||||
import uvicorn
|
||||
import django
|
||||
import schedule
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import schedule
|
||||
import django
|
||||
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from rich.logging import RichHandler
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
@ -41,6 +42,15 @@ app = FastAPI()
|
|||
# Get Django Application
|
||||
django_app = get_asgi_application()
|
||||
|
||||
# Add CORS middleware
# Starlette compares each `allow_origins` entry to the request's Origin header
# by exact string equality, so wildcard entries such as "http://localhost:*"
# or "https://app.khoj.dev/*" never match. An Origin header is
# scheme://host[:port] with no path, so app.khoj.dev is listed without the
# trailing "/*" and localhost on any port is matched via `allow_origin_regex`.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["app://obsidian.md", "https://app.khoj.dev", "app://khoj.dev"],
    allow_origin_regex=r"^http://localhost(:\d+)?$",
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Set Locale
|
||||
locale.setlocale(locale.LC_ALL, "")
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ const {dialog} = require('electron');
|
|||
|
||||
const cron = require('cron').CronJob;
|
||||
const axios = require('axios');
|
||||
const { Readable } = require('stream');
|
||||
|
||||
const KHOJ_URL = 'http://127.0.0.1:42110'
|
||||
|
||||
|
@ -65,7 +64,7 @@ const schema = {
|
|||
|
||||
var state = {}
|
||||
|
||||
const store = new Store({schema});
|
||||
const store = new Store({ schema });
|
||||
|
||||
console.log(store);
|
||||
|
||||
|
@ -86,57 +85,65 @@ function handleSetTitle (event, title) {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Map a file name to the MIME type used when uploading it.
 *
 * The text after the last '.' in `filename` is compared, case-sensitively,
 * against the known extensions. Anything else (including names without
 * a '.') is reported as 'text/plain'.
 *
 * @param {string} filename - File name or path to classify.
 * @returns {string} MIME type for the file.
 */
function filenameToMimeType (filename) {
    // A Map (rather than a plain object) so only the keys listed here can match
    const mimeTypeByExtension = new Map([
        ['pdf', 'application/pdf'],
        ['png', 'image/png'],
        ['jpg', 'image/jpeg'],
        ['jpeg', 'image/jpeg'],
        ['md', 'text/markdown'],
        ['markdown', 'text/markdown'],
        ['org', 'text/org'],
    ]);

    const segments = filename.split('.');
    const suffix = segments[segments.length - 1];

    return mimeTypeByExtension.has(suffix)
        ? mimeTypeByExtension.get(suffix)
        : 'text/plain';
}
|
||||
|
||||
function pushDataToKhoj (regenerate = false) {
|
||||
let filesToPush = [];
|
||||
const files = store.get('files');
|
||||
const folders = store.get('folders');
|
||||
state = {
|
||||
completed: true
|
||||
const files = store.get('files') || [];
|
||||
const folders = store.get('folders') || [];
|
||||
state = { completed: true }
|
||||
|
||||
// Collect paths of all configured files to index
|
||||
for (const file of files) {
|
||||
filesToPush.push(file.path);
|
||||
}
|
||||
|
||||
if (files) {
|
||||
for (file of files) {
|
||||
filesToPush.push(file.path);
|
||||
}
|
||||
}
|
||||
if (folders) {
|
||||
for (folder of folders) {
|
||||
const files = fs.readdirSync(folder.path, { withFileTypes: true });
|
||||
for (file of files) {
|
||||
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
|
||||
filesToPush.push(path.join(folder.path, file.name));
|
||||
}
|
||||
// Collect paths of all indexable files in configured folders
|
||||
for (const folder of folders) {
|
||||
const files = fs.readdirSync(folder.path, { withFileTypes: true });
|
||||
for (const file of files) {
|
||||
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
|
||||
filesToPush.push(path.join(folder.path, file.name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let data = {
|
||||
files: []
|
||||
}
|
||||
|
||||
const lastSync = store.get('lastSync') || [];
|
||||
|
||||
for (file of filesToPush) {
|
||||
const formData = new FormData();
|
||||
for (const file of filesToPush) {
|
||||
const stats = fs.statSync(file);
|
||||
if (!regenerate) {
|
||||
// Only push files that have been modified since last sync
|
||||
if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Collect all updated or newly created files since last sync to index on Khoj server
|
||||
try {
|
||||
let rawData;
|
||||
// If the file is a PDF or IMG file, read it as a binary file
|
||||
if (binaryFileTypes.includes(file.split('.').pop())) {
|
||||
rawData = fs.readFileSync(file).toString('base64');
|
||||
} else {
|
||||
rawData = fs.readFileSync(file, 'utf8');
|
||||
}
|
||||
|
||||
data.files.push({
|
||||
path: file,
|
||||
content: rawData
|
||||
});
|
||||
let encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8";
|
||||
let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
||||
let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding);
|
||||
let fileObj = new Blob([fileContent], { type: mimeType });
|
||||
formData.append('files', fileObj, file);
|
||||
state[file] = {
|
||||
success: true,
|
||||
}
|
||||
|
@ -149,46 +156,46 @@ function pushDataToKhoj (regenerate = false) {
|
|||
}
|
||||
}
|
||||
|
||||
// Mark deleted files for removal from index on Khoj server
|
||||
for (const syncedFile of lastSync) {
|
||||
if (!filesToPush.includes(syncedFile.path)) {
|
||||
data.files.push({
|
||||
path: syncedFile.path,
|
||||
content: ""
|
||||
});
|
||||
fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) });
|
||||
formData.append('files', fileObj, syncedFile.path);
|
||||
}
|
||||
}
|
||||
|
||||
const headers = { 'x-api-key': 'secret', 'Content-Type': 'application/json' };
|
||||
|
||||
const stream = new Readable({
|
||||
read() {
|
||||
this.push(JSON.stringify(data));
|
||||
this.push(null);
|
||||
}
|
||||
});
|
||||
|
||||
const hostURL = store.get('hostURL') || KHOJ_URL;
|
||||
|
||||
axios.post(`${hostURL}/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers })
|
||||
.then(response => {
|
||||
console.log(response.data);
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win.webContents.send('update-state', state);
|
||||
let lastSync = [];
|
||||
for (const file of filesToPush) {
|
||||
lastSync.push({
|
||||
path: file,
|
||||
datetime: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
store.set('lastSync', lastSync);
|
||||
})
|
||||
.catch(error => {
|
||||
console.error(error);
|
||||
state['completed'] = false
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win.webContents.send('update-state', state);
|
||||
});
|
||||
// Send collected files to Khoj server for indexing
|
||||
if (!!formData?.entries()?.next().value) {
|
||||
const hostURL = store.get('hostURL') || KHOJ_URL;
|
||||
const headers = {
|
||||
'x-api-key': 'secret'
|
||||
};
|
||||
axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers })
|
||||
.then(response => {
|
||||
console.log(response.data);
|
||||
let lastSync = [];
|
||||
for (const file of filesToPush) {
|
||||
lastSync.push({
|
||||
path: file,
|
||||
datetime: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
store.set('lastSync', lastSync);
|
||||
})
|
||||
.catch(error => {
|
||||
console.error(error);
|
||||
state['completed'] = false
|
||||
})
|
||||
.finally(() => {
|
||||
// Syncing complete
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
if (win) win.webContents.send('update-state', state);
|
||||
});
|
||||
} else {
|
||||
// Syncing complete
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
if (win) win.webContents.send('update-state', state);
|
||||
}
|
||||
}
|
||||
|
||||
pushDataToKhoj();
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
{
|
||||
"name": "Khoj",
|
||||
"homepage": ".",
|
||||
"productName": "Khoj",
|
||||
"version": "1.0.2",
|
||||
"description": "Scaffolding for the desktop entrypoint to Khoj",
|
||||
"main": "main.js",
|
||||
"version": "0.13.0",
|
||||
"description": "An AI copilot for your Second Brain",
|
||||
"author": "Saba Imran, Debanjum Singh Solanky <team@khoj.dev>",
|
||||
"license": "GPL-3.0-or-later",
|
||||
"homepage": "https://khoj.dev",
|
||||
"repository": "\"https://github.com/khoj-ai/khoj\"",
|
||||
"author": "Khoj <team@khoj.dev>",
|
||||
"license": "MIT",
|
||||
"productName": "Khoj",
|
||||
"main": "main.js",
|
||||
"private": false,
|
||||
"devDependencies": {
|
||||
"electron": "25.8.1"
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
;;; khoj.el --- AI personal assistant for your digital brain -*- lexical-binding: t -*-
|
||||
;;; khoj.el --- AI copilot for your Second Brain -*- lexical-binding: t -*-
|
||||
|
||||
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
|
||||
;; Copyright (C) 2021-2023 Khoj Inc.
|
||||
|
||||
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
|
||||
;; Description: An AI personal assistant for your digital brain
|
||||
;; Author: Debanjum Singh Solanky <debanjum@khoj.dev>
|
||||
;; Saba Imran <saba@khoj.dev>
|
||||
;; Description: An AI copilot for your Second Brain
|
||||
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
|
||||
;; Version: 0.12.3
|
||||
;; Version: 0.13.0
|
||||
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
|
||||
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
|
||||
|
||||
|
@ -28,8 +29,8 @@
|
|||
|
||||
;;; Commentary:
|
||||
|
||||
;; Create an AI personal assistant for your `org-mode', `markdown' notes,
|
||||
;; PDFs and images. The assistant exposes 2 modes, search and chat:
|
||||
;; Create an AI copilot for your `org-mode', `markdown' notes,
|
||||
;; PDFs and images. The copilot exposes 2 modes, search and chat:
|
||||
;;
|
||||
;; Chat provides faster answers, iterative discovery and assisted
|
||||
;; creativity. It requires your OpenAI API key to access GPT models
|
||||
|
@ -87,6 +88,21 @@
|
|||
:group 'khoj
|
||||
:type 'integer)
|
||||
|
||||
(defcustom khoj-search-on-idle-time 0.3
|
||||
"Idle time (in seconds) to wait before triggering search."
|
||||
:group 'khoj
|
||||
:type 'number)
|
||||
|
||||
(defcustom khoj-server-api-key "secret"
|
||||
"API Key to Khoj server."
|
||||
:group 'khoj
|
||||
:type 'string)
|
||||
|
||||
(defcustom khoj-index-interval 3600
|
||||
"Interval (in seconds) to wait before updating content index."
|
||||
:group 'khoj
|
||||
:type 'number)
|
||||
|
||||
(defcustom khoj-default-content-type "org"
|
||||
"The default content type to perform search on."
|
||||
:group 'khoj
|
||||
|
@ -115,6 +131,15 @@
|
|||
(defvar khoj--content-type "org"
|
||||
"The type of content to perform search on.")
|
||||
|
||||
(defvar khoj--search-on-idle-timer nil
|
||||
"Idle timer to trigger incremental search.")
|
||||
|
||||
(defvar khoj--index-timer nil
|
||||
"Timer to trigger content indexing.")
|
||||
|
||||
(defvar khoj--indexed-files '()
|
||||
"Files that were indexed in previous content indexing run.")
|
||||
|
||||
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
|
||||
(declare-function org-element-type "org-mode" (ELEMENT))
|
||||
(declare-function markdown-mode "markdown-mode" ())
|
||||
|
@ -236,6 +261,11 @@ for example), set this to the full interpreter path."
|
|||
:type 'boolean
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-offline-chat-model nil
|
||||
"Specify chat model to use for offline chat with khoj."
|
||||
:type 'string
|
||||
:group 'khoj)
|
||||
|
||||
(defcustom khoj-auto-setup t
|
||||
"Automate install, configure and start of khoj server.
|
||||
Auto invokes setup steps on calling main entrypoint."
|
||||
|
@ -365,9 +395,9 @@ CONFIG is json obtained from Khoj config API."
|
|||
(string-join "/"))))
|
||||
|
||||
(defun khoj--server-configure ()
|
||||
"Configure the the Khoj server for search and chat."
|
||||
"Configure the Khoj server for search and chat."
|
||||
(interactive)
|
||||
(let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null))
|
||||
(let* ((url-request-method "GET")
|
||||
(current-config
|
||||
(with-temp-buffer
|
||||
(url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
|
||||
|
@ -376,56 +406,12 @@ CONFIG is json obtained from Khoj config API."
|
|||
(with-temp-buffer
|
||||
(url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url))
|
||||
(ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
|
||||
(default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file)))
|
||||
(default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
|
||||
(chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config))))))
|
||||
(default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config))))
|
||||
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))
|
||||
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
|
||||
(offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
|
||||
(config (or current-config default-config)))
|
||||
|
||||
;; Configure content types
|
||||
(cond
|
||||
;; If khoj backend is not configured yet
|
||||
((not current-config)
|
||||
(message "khoj.el: Server not configured yet.")
|
||||
(setq config (delq (assoc 'content-type config) config))
|
||||
(cl-pushnew `(content-type . ((org . ((input-files . ,khoj-org-files)
|
||||
(input-filter . ,org-directory-regexes)
|
||||
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
|
||||
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
|
||||
(index-heading-entries . ,json-false)))))
|
||||
config))
|
||||
|
||||
;; Else if khoj config has no org content config
|
||||
((not (alist-get 'org (alist-get 'content-type config)))
|
||||
(message "khoj.el: Org-mode content on server not configured yet.")
|
||||
(let ((new-content-type (alist-get 'content-type config)))
|
||||
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
|
||||
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
|
||||
(input-filter . ,org-directory-regexes)
|
||||
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
|
||||
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
|
||||
(index-heading-entries . ,json-false)))
|
||||
new-content-type)
|
||||
(setq config (delq (assoc 'content-type config) config))
|
||||
(cl-pushnew `(content-type . ,new-content-type) config)))
|
||||
|
||||
;; Else if khoj is not configured to index specified org files
|
||||
((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files)
|
||||
(equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes)))
|
||||
(message "khoj.el: Org-mode content on server is stale.")
|
||||
(let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file)))
|
||||
(new-content-type (alist-get 'content-type config)))
|
||||
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
|
||||
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
|
||||
(input-filter . ,org-directory-regexes)
|
||||
(compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory))
|
||||
(embeddings-file . ,(format "%s/org.pt" index-directory))
|
||||
(index-heading-entries . ,json-false)))
|
||||
new-content-type)
|
||||
(setq config (delq (assoc 'content-type config) config))
|
||||
(cl-pushnew `(content-type . ,new-content-type) config))))
|
||||
|
||||
;; Configure processors
|
||||
(cond
|
||||
((not khoj-openai-api-key)
|
||||
|
@ -441,10 +427,11 @@ CONFIG is json obtained from Khoj config API."
|
|||
|
||||
;; If khoj backend isn't configured yet
|
||||
((not current-config)
|
||||
(message "khoj.el: Chat not configured yet.")
|
||||
(message "khoj.el: Khoj not configured yet.")
|
||||
(setq config (delq (assoc 'processor config) config))
|
||||
(cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
|
||||
(enable-offline-chat . ,enable-offline-chat)
|
||||
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
|
||||
(chat-model . ,offline-chat-model)))
|
||||
(openai . ((chat-model . ,chat-model)
|
||||
(api-key . ,khoj-openai-api-key)))))))
|
||||
config))
|
||||
|
@ -455,7 +442,8 @@ CONFIG is json obtained from Khoj config API."
|
|||
(let ((new-processor-type (alist-get 'processor config)))
|
||||
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
|
||||
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
|
||||
(enable-offline-chat . ,enable-offline-chat)
|
||||
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
|
||||
(chat-model . ,offline-chat-model)))
|
||||
(openai . ((chat-model . ,chat-model)
|
||||
(api-key . ,khoj-openai-api-key)))))
|
||||
new-processor-type)
|
||||
|
@ -465,13 +453,15 @@ CONFIG is json obtained from Khoj config API."
|
|||
;; Else if chat configuration in khoj backend has gone stale
|
||||
((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key)
|
||||
(equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model)
|
||||
(equal (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor config))) enable-offline-chat)))
|
||||
(equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat)
|
||||
(equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model)))
|
||||
(message "khoj.el: Chat configuration has gone stale.")
|
||||
(let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
|
||||
(new-processor-type (alist-get 'processor config)))
|
||||
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
|
||||
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
|
||||
(enable-offline-chat . ,enable-offline-chat)
|
||||
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
|
||||
(chat-model . ,offline-chat-model)))
|
||||
(openai . ((chat-model . ,khoj-chat-model)
|
||||
(api-key . ,khoj-openai-api-key)))))
|
||||
new-processor-type)
|
||||
|
@ -509,9 +499,75 @@ CONFIG is json obtained from Khoj config API."
|
|||
(khoj--server-configure))))
|
||||
|
||||
|
||||
;; -----------------------------------------------
|
||||
;; Extract and Render Entries of each Content Type
|
||||
;; -----------------------------------------------
|
||||
;; -------------------
|
||||
;; Khoj Index Content
|
||||
;; -------------------
|
||||
|
||||
(defun khoj--server-index-files (&optional force content-type file-paths)
  "Send files at `FILE-PATHS' to the Khoj server to index for search and chat.
`FORCE' re-indexes all files of `CONTENT-TYPE' even if they are already indexed.

FORCE may be nil, t or one of the strings \"true\" and \"false\"; nil and
\"false\" both request a regular update.  When FILE-PATHS is nil, every org
file under `khoj-org-directories' plus `khoj-org-files' is sent.  Files sent
in the previous run but missing from this one are included with an empty
body by `khoj--render-files-as-request-body'."
  (interactive)
  (let* (;; Callers pass FORCE as a boolean or as the string "true"/"false".
         ;; The string "false" is non-nil, so normalize before testing it.
         (force-update (and force (not (equal force "false"))))
         (boundary (format "-------------------------%d" (random (expt 10 10))))
         (files-to-index (or file-paths
                             (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))
         (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type)))
         (inhibit-message t)
         (message-log-max nil))
    (let ((url-request-method "POST")
          (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary))
          (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary))
                                       ("x-api-key" . ,khoj-server-api-key))))
      (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (if force-update "true" "false"))
                    ;; render response from indexing API endpoint on server
                    (lambda (status)
                      ;; STATUS also records redirects, so only an :error entry means failure
                      (if (not (plist-get status :error))
                          (message "khoj.el: %scontent index %supdated" (if content-type (format "%s " content-type) "") (if force-update "force " ""))
                        ;; the callback runs in the response buffer; move past the
                        ;; HTTP headers so only the response body is reported
                        (goto-char (point-min))
                        (search-forward "\n\n" nil t)
                        (message "khoj.el: Failed to %supdate %s content index. Status: %s. Response: %s"
                                 (if force-update "force " "")
                                 content-type
                                 status
                                 (string-trim (buffer-substring-no-properties (point) (point-max))))))
                    nil t t))
    ;; remember what was sent so files dropped before the next run can be detected
    (setq khoj--indexed-files files-to-index)))
|
||||
|
||||
(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary)
  "Render `FILES-TO-INDEX', `PREVIOUSLY-INDEXED-FILES' as multi-part form body.
Use `BOUNDARY' to separate files.  This is sent to Khoj server as a POST request.

Every file in FILES-TO-INDEX becomes a \"files\" form part holding its raw
contents.  Every file in PREVIOUSLY-INDEXED-FILES that is absent from
FILES-TO-INDEX becomes a \"files\" form part with an empty body."
  (let ((part-header-format
         (concat "--%s\r\n"
                 "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n"
                 "Content-Type: text/org\r\n\r\n")))
    (with-temp-buffer
      ;; assemble the body in a unibyte buffer
      (set-buffer-multibyte nil)
      (insert "\n")
      ;; one part per current file, carrying its contents
      (dolist (current-file files-to-index)
        (insert (format part-header-format boundary current-file)
                (with-temp-buffer
                  (insert-file-contents-literally current-file)
                  (buffer-string))
                "\r\n"))
      ;; one empty part per file that dropped out since the previous run
      (dolist (stale-file previously-indexed-files)
        (unless (member stale-file files-to-index)
          (insert (format part-header-format boundary stale-file) "\r\n")))
      ;; closing delimiter
      (insert (format "--%s--\r\n" boundary))
      (buffer-string))))
|
||||
|
||||
;; Cancel any running indexing timer, first
|
||||
(when khoj--index-timer
|
||||
(cancel-timer khoj--index-timer))
|
||||
;; Send files to index on server every `khoj-index-interval' seconds
|
||||
(setq khoj--index-timer
|
||||
(run-with-timer 60 khoj-index-interval 'khoj--server-index-files))
|
||||
|
||||
|
||||
;; -------------------------------------------
|
||||
;; Render Response from Khoj server for Emacs
|
||||
;; -------------------------------------------
|
||||
|
||||
(defun khoj--extract-entries-as-markdown (json-response query)
|
||||
"Convert JSON-RESPONSE, QUERY from API to markdown entries."
|
||||
|
@ -920,6 +976,9 @@ RECEIVE-DATE is the message receive date."
|
|||
(message "khoj.el: Teardown Incremental Search")
|
||||
;; unset khoj minibuffer window
|
||||
(setq khoj--minibuffer-window nil)
|
||||
(when (and khoj--search-on-idle-timer
|
||||
(timerp khoj--search-on-idle-timer))
|
||||
(cancel-timer khoj--search-on-idle-timer))
|
||||
;; delete open connections to khoj server
|
||||
(khoj--delete-open-network-connections-to-server)
|
||||
;; remove hooks for khoj incremental query and self
|
||||
|
@ -942,8 +1001,10 @@ RECEIVE-DATE is the message receive date."
|
|||
;; set current (mini-)buffer entered as khoj minibuffer
|
||||
;; used to query khoj API only when user in khoj minibuffer
|
||||
(setq khoj--minibuffer-window (current-buffer))
|
||||
(add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action
|
||||
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit
|
||||
; do khoj incremental search after idle time
|
||||
(setq khoj--search-on-idle-timer (run-with-idle-timer khoj-search-on-idle-time t #'khoj--incremental-search))
|
||||
; teardown khoj incremental search on minibuffer exit
|
||||
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search))
|
||||
(read-string khoj--query-prompt))))
|
||||
|
||||
|
||||
|
@ -1014,17 +1075,20 @@ Paragraph only starts at first text after blank line."
|
|||
;; Khoj Menu
|
||||
;; ---------
|
||||
|
||||
(transient-define-argument khoj--content-type-switch ()
|
||||
:class 'transient-switches
|
||||
:argument-format "--content-type=%s"
|
||||
:argument-regexp ".+"
|
||||
;; set content type to: last used > based on current buffer > default type
|
||||
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
|
||||
;; dynamically set choices to content types enabled on khoj backend
|
||||
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
|
||||
(defun khoj--setup-and-show-menu ()
|
||||
"Create Transient menu for khoj and show it."
|
||||
;; Create the Khoj Transient menu
|
||||
(transient-define-argument khoj--content-type-switch ()
|
||||
:class 'transient-switches
|
||||
:argument-format "--content-type=%s"
|
||||
:argument-regexp ".+"
|
||||
;; set content type to: last used > based on current buffer > default type
|
||||
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
|
||||
;; dynamically set choices to content types enabled on khoj backend
|
||||
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
|
||||
|
||||
(transient-define-suffix khoj--search-command (&optional args)
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(transient-define-suffix khoj--search-command (&optional args)
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(progn
|
||||
;; set content type to: specified > last used > based on current buffer > default type
|
||||
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||
|
@ -1033,9 +1097,9 @@ Paragraph only starts at first text after blank line."
|
|||
;; trigger incremental search
|
||||
(call-interactively #'khoj-incremental)))
|
||||
|
||||
(transient-define-suffix khoj--find-similar-command (&optional args)
|
||||
"Find items similar to current item at point."
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(transient-define-suffix khoj--find-similar-command (&optional args)
|
||||
"Find items similar to current item at point."
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(progn
|
||||
;; set content type to: specified > last used > based on current buffer > default type
|
||||
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||
|
@ -1043,37 +1107,38 @@ Paragraph only starts at first text after blank line."
|
|||
(setq khoj-results-count (or (transient-arg-value "--results-count=" args) khoj-results-count))
|
||||
(khoj--find-similar khoj--content-type)))
|
||||
|
||||
(transient-define-suffix khoj--update-command (&optional args)
|
||||
"Call khoj API to update index of specified content type."
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(let* ((force-update (if (member "--force-update" args) "true" "false"))
|
||||
;; set content type to: specified > last used > based on current buffer > default type
|
||||
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||
(type-query (if (equal content-type "all") "" (format "t=%s" content-type)))
|
||||
(update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update))
|
||||
(url-request-method "GET"))
|
||||
(progn
|
||||
(setq khoj--content-type content-type)
|
||||
(url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " "")))))))
|
||||
(transient-define-suffix khoj--update-command (&optional args)
|
||||
"Call khoj API to update index of specified content type."
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(let* ((force-update (if (member "--force-update" args) "true" "false"))
|
||||
;; set content type to: specified > last used > based on current buffer > default type
|
||||
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||
(url-request-method "GET"))
|
||||
(progn
|
||||
(setq khoj--content-type content-type)
|
||||
(khoj--server-index-files force-update content-type))))
|
||||
|
||||
(transient-define-suffix khoj--chat-command (&optional _)
|
||||
"Command to Chat with Khoj."
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(khoj--chat))
|
||||
(transient-define-suffix khoj--chat-command (&optional _)
|
||||
"Command to Chat with Khoj."
|
||||
(interactive (list (transient-args transient-current-command)))
|
||||
(khoj--chat))
|
||||
|
||||
(transient-define-prefix khoj--menu ()
|
||||
"Create Khoj Menu to Configure and Execute Commands."
|
||||
[["Configure Search"
|
||||
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
|
||||
("t" "Content Type" khoj--content-type-switch)]
|
||||
["Configure Update"
|
||||
("-f" "Force Update" "--force-update")]]
|
||||
[["Act"
|
||||
("c" "Chat" khoj--chat-command)
|
||||
("s" "Search" khoj--search-command)
|
||||
("f" "Find Similar" khoj--find-similar-command)
|
||||
("u" "Update" khoj--update-command)
|
||||
("q" "Quit" transient-quit-one)]])
|
||||
(transient-define-prefix khoj--menu ()
|
||||
"Create Khoj Menu to Configure and Execute Commands."
|
||||
[["Configure Search"
|
||||
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
|
||||
("t" "Content Type" khoj--content-type-switch)]
|
||||
["Configure Update"
|
||||
("-f" "Force Update" "--force-update")]]
|
||||
[["Act"
|
||||
("c" "Chat" khoj--chat-command)
|
||||
("s" "Search" khoj--search-command)
|
||||
("f" "Find Similar" khoj--find-similar-command)
|
||||
("u" "Update" khoj--update-command)
|
||||
("q" "Quit" transient-quit-one)]])
|
||||
|
||||
;; Show the Khoj Transient menu
|
||||
(khoj--menu))
|
||||
|
||||
|
||||
;; ----------
|
||||
|
@ -1086,7 +1151,7 @@ Paragraph only starts at first text after blank line."
|
|||
(interactive)
|
||||
(when khoj-auto-setup
|
||||
(khoj-setup t))
|
||||
(khoj--menu))
|
||||
(khoj--setup-and-show-menu))
|
||||
|
||||
(provide 'khoj)
|
||||
|
||||
|
|
|
@ -206,6 +206,64 @@ Rule everything\n")
|
|||
"Rule everything"))
|
||||
))
|
||||
|
||||
|
||||
;; -------------------------------------
|
||||
;; Test Helpers to Index Content
|
||||
;; -------------------------------------
|
||||
|
||||
(ert-deftest khoj-tests--render-files-to-add-request-body ()
|
||||
"Test files are formatted into a multi-part http request body"
|
||||
(let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
|
||||
(act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
|
||||
(unwind-protect
|
||||
(progn
|
||||
(should
|
||||
(equal
|
||||
(khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj")
|
||||
(format
|
||||
"\n--khoj\r\n\
|
||||
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
|
||||
Content-Type: text/org\r\n\r\n\
|
||||
# Become God\n\
|
||||
## Upgrade\n\n\
|
||||
Penance to Immortality\n\n\r
|
||||
--khoj\r\n\
|
||||
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
|
||||
Content-Type: text/org\r\n\r\n\
|
||||
## Act\n\n\
|
||||
Rule everything\n\n\r\n\
|
||||
--khoj--\r\n" upgrade-file act-file))))
|
||||
(delete-file upgrade-file)
|
||||
(delete-file act-file))))
|
||||
|
||||
(ert-deftest khoj-tests--render-files-to-add-delete-in-request-body ()
|
||||
"Test files to add and delete are formatted into a multi-part http request body"
|
||||
(let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
|
||||
(act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
|
||||
(unwind-protect
|
||||
(progn
|
||||
(should
|
||||
(equal
|
||||
(khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj")
|
||||
(format
|
||||
"\n--khoj\r\n\
|
||||
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
|
||||
Content-Type: text/org\r\n\r\n\
|
||||
# Become God\n\
|
||||
## Upgrade\n\n\
|
||||
Penance to Immortality\n\n\r
|
||||
--khoj\r\n\
|
||||
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
|
||||
Content-Type: text/org\r\n\r\n\
|
||||
## Act\n\n\
|
||||
Rule everything\n\n\r
|
||||
--khoj\r\n\
|
||||
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
|
||||
Content-Type: text/org\r\n\r\n\
|
||||
\r
|
||||
--khoj--\r\n" upgrade-file act-file "/tmp/deleted-file.org"))))
|
||||
(delete-file upgrade-file)
|
||||
(delete-file act-file))))
|
||||
|
||||
(provide 'khoj-tests)
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"id": "khoj",
|
||||
"name": "Khoj",
|
||||
"version": "0.12.3",
|
||||
"version": "0.13.0",
|
||||
"minAppVersion": "0.15.0",
|
||||
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
|
||||
"author": "Khoj Inc.",
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
{
|
||||
"name": "Khoj",
|
||||
"version": "0.12.3",
|
||||
"description": "An AI Personal Assistant for your Digital Brain",
|
||||
"version": "0.13.0",
|
||||
"description": "An AI copilot for your Second Brain",
|
||||
"author": "Debanjum Singh Solanky, Saba Imran <team@khoj.dev>",
|
||||
"license": "GPL-3.0-or-later",
|
||||
"main": "src/main.js",
|
||||
"scripts": {
|
||||
"dev": "node esbuild.config.mjs",
|
||||
|
@ -14,8 +16,6 @@
|
|||
"AI",
|
||||
"assistant"
|
||||
],
|
||||
"author": "Debanjum Singh Solanky",
|
||||
"license": "GPL-3.0-or-later",
|
||||
"devDependencies": {
|
||||
"@types/node": "^16.11.6",
|
||||
"@typescript-eslint/eslint-plugin": "5.29.0",
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
import { Notice, Plugin } from 'obsidian';
|
||||
import { Notice, Plugin, TFile } from 'obsidian';
|
||||
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
|
||||
import { KhojSearchModal } from 'src/search_modal'
|
||||
import { KhojChatModal } from 'src/chat_modal'
|
||||
import { configureKhojBackend } from './utils';
|
||||
import { configureKhojBackend, updateContentIndex } from './utils';
|
||||
|
||||
|
||||
export default class Khoj extends Plugin {
|
||||
settings: KhojSetting;
|
||||
indexingTimer: NodeJS.Timeout;
|
||||
|
||||
async onload() {
|
||||
await this.loadSettings();
|
||||
|
@ -54,6 +55,15 @@ export default class Khoj extends Plugin {
|
|||
|
||||
// Add a settings tab so the user can configure khoj
|
||||
this.addSettingTab(new KhojSettingTab(this.app, this));
|
||||
|
||||
// Add scheduled job to update index every 60 minutes
|
||||
this.indexingTimer = setInterval(async () => {
|
||||
if (this.settings.autoConfigure) {
|
||||
this.settings.lastSyncedFiles = await updateContentIndex(
|
||||
this.app.vault, this.settings, this.settings.lastSyncedFiles
|
||||
);
|
||||
}
|
||||
}, 60 * 60 * 1000);
|
||||
}
|
||||
|
||||
async loadSettings() {
|
||||
|
@ -72,4 +82,12 @@ export default class Khoj extends Plugin {
|
|||
}
|
||||
this.saveData(this.settings);
|
||||
}
|
||||
|
||||
async onunload() {
|
||||
// Remove scheduled job to update index at regular cadence
|
||||
if (this.indexingTimer)
|
||||
clearInterval(this.indexingTimer);
|
||||
|
||||
this.unload();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian';
|
||||
import { App, Notice, PluginSettingTab, Setting, TFile } from 'obsidian';
|
||||
import Khoj from 'src/main';
|
||||
import { updateContentIndex } from './utils';
|
||||
|
||||
export interface KhojSetting {
|
||||
enableOfflineChat: boolean;
|
||||
|
@ -8,6 +9,7 @@ export interface KhojSetting {
|
|||
khojUrl: string;
|
||||
connectedToBackend: boolean;
|
||||
autoConfigure: boolean;
|
||||
lastSyncedFiles: TFile[];
|
||||
}
|
||||
|
||||
export const DEFAULT_SETTINGS: KhojSetting = {
|
||||
|
@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
|
|||
connectedToBackend: false,
|
||||
autoConfigure: true,
|
||||
openaiApiKey: '',
|
||||
lastSyncedFiles: []
|
||||
}
|
||||
|
||||
export class KhojSettingTab extends PluginSettingTab {
|
||||
|
@ -118,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab {
|
|||
}, 300);
|
||||
this.plugin.registerInterval(progress_indicator);
|
||||
|
||||
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`);
|
||||
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`);
|
||||
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
|
||||
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
|
||||
);
|
||||
new Notice('✅ Updated Khoj index.');
|
||||
|
||||
// Reset button once index is updated
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian';
|
||||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian';
|
||||
import { KhojSetting } from 'src/settings'
|
||||
|
||||
export function getVaultAbsolutePath(vault: Vault): string {
|
||||
|
@ -14,18 +14,85 @@ type OpenAIType = null | {
|
|||
"api-key": string;
|
||||
};
|
||||
|
||||
type OfflineChatType = null | {
|
||||
"chat-model": string;
|
||||
"enable-offline-chat": boolean;
|
||||
};
|
||||
|
||||
interface ProcessorData {
|
||||
conversation: {
|
||||
"conversation-logfile": string;
|
||||
openai: OpenAIType;
|
||||
"enable-offline-chat": boolean;
|
||||
"offline-chat": OfflineChatType;
|
||||
"tokenizer": null | string;
|
||||
"max-prompt-size": null | number;
|
||||
};
|
||||
}
|
||||
|
||||
function fileExtensionToMimeType (extension: string): string {
    // Lookup table from file extension (without the leading dot) to the
    // MIME type attached to that file's part of the upload.
    const mimeTypeByExtension = new Map<string, string>([
        ['pdf', 'application/pdf'],
        ['png', 'image/png'],
        ['jpg', 'image/jpeg'],
        ['jpeg', 'image/jpeg'],
        ['md', 'text/markdown'],
        ['markdown', 'text/markdown'],
        ['org', 'text/org'],
    ]);

    // Any extension not listed above is treated as plain text.
    const knownMimeType = mimeTypeByExtension.get(extension);
    if (knownMimeType === undefined) {
        return 'text/plain';
    }
    return knownMimeType;
}
|
||||
|
||||
/**
 * Push the vault's markdown and pdf files to the Khoj server for indexing.
 *
 * Every current file is uploaded as one part of a multipart form. Files that
 * were part of the previous sync but are no longer in the vault are sent as
 * empty parts under their old path so the server receives them too.
 *
 * @param vault            Vault to read files from.
 * @param setting          Plugin settings; `khojUrl` is the server base URL.
 * @param lastSyncedFiles  Files returned by the previous successful sync.
 * @param regenerate       Passed to the server as the `force` query parameter.
 * @returns The files to remember as synced: the current file list on success,
 *          or the unchanged `lastSyncedFiles` when the update failed, so that
 *          pending deletions are retried on the next run.
 */
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
    console.log(`Khoj: Updating Khoj content index...`)

    // Get all markdown, pdf files in the vault
    const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
    const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
    const previouslySynced = lastSyncedFiles ?? [];
    let countOfFilesToDelete = 0;

    // Add all files to index as multipart form data
    const formData = new FormData();
    for (const file of files) {
        const isBinary = binaryFileTypes.includes(file.extension);
        // Only text parts carry an explicit charset
        const mimeType = fileExtensionToMimeType(file.extension) + (isBinary ? "" : "; charset=UTF-8");
        const fileContent = isBinary ? await vault.readBinary(file) : await vault.read(file);
        formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
    }

    // Add any previously synced files to be deleted to multipart form data.
    // Compare by path rather than object identity, so the check does not
    // depend on both lists holding the very same TFile instances.
    const currentPaths = new Set(files.map(file => file.path));
    for (const lastSyncedFile of previouslySynced) {
        if (!currentPaths.has(lastSyncedFile.path)) {
            countOfFilesToDelete++;
            formData.append('files', new Blob([]), lastSyncedFile.path);
        }
    }

    // Call Khoj backend to update index with all markdown, pdf files.
    // fetch rejects when the server is unreachable and resolves with
    // ok === false on HTTP errors; both are reported the same way.
    let failureReason: string | null = null;
    try {
        const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
            method: 'POST',
            headers: {
                // NOTE(review): API key is hard-coded here; confirm how the server expects it to be supplied
                'x-api-key': 'secret',
            },
            body: formData,
        });
        if (!response.ok) {
            failureReason = response.statusText;
        }
    } catch (error) {
        failureReason = error instanceof Error ? error.message : String(error);
    }

    if (failureReason !== null) {
        new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${failureReason}`);
        // Nothing was synced; keep the old state so deletions are not lost
        return previouslySynced;
    }

    console.log(`✅ Refreshed Khoj content index. Updated: ${files.length} files, Deleted: ${countOfFilesToDelete} files.`);
    return files;
}
|
||||
|
||||
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
|
||||
let vaultPath = getVaultAbsolutePath(vault);
|
||||
let mdInVault = `${vaultPath}/**/*.md`;
|
||||
let pdfInVault = `${vaultPath}/**/*.pdf`;
|
||||
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
|
||||
|
||||
// Check if khoj backend is configured, note if cannot connect to backend
|
||||
|
@ -43,124 +110,33 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
|||
if (!setting.connectedToBackend) return;
|
||||
|
||||
// Set index name from the path of the current vault
|
||||
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
|
||||
// Get default config fields from khoj backend
|
||||
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
|
||||
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
|
||||
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
|
||||
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
|
||||
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
|
||||
let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
|
||||
let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"];
|
||||
|
||||
// Get current config if khoj backend configured, else get default config from khoj backend
|
||||
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
|
||||
.then(response => JSON.parse(response))
|
||||
.then(data => {
|
||||
khoj_already_configured = data["content-type"] != null;
|
||||
// If khoj backend not configured yet
|
||||
if (!khoj_already_configured) {
|
||||
// Create khoj content-type config with only markdown configured
|
||||
data["content-type"] = {
|
||||
"markdown": {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
|
||||
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
||||
|
||||
if (hasPdfFiles) {
|
||||
data["content-type"]["pdf"] = {
|
||||
"input-filter": [pdfInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
}
|
||||
// Else if khoj config has no markdown content config
|
||||
else if (!data["content-type"]["markdown"]) {
|
||||
// Add markdown config to khoj content-type config
|
||||
// Set markdown config to index markdown files in configured obsidian vault
|
||||
data["content-type"]["markdown"] = {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
// Else if khoj is not configured to index markdown files in configured obsidian vault
|
||||
else if (
|
||||
data["content-type"]["markdown"]["input-files"] != null ||
|
||||
data["content-type"]["markdown"]["input-filter"] == null ||
|
||||
data["content-type"]["markdown"]["input-filter"].length != 1 ||
|
||||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
|
||||
// Update markdown config in khoj content-type config
|
||||
// Set markdown config to only index markdown files in configured obsidian vault
|
||||
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
|
||||
data["content-type"]["markdown"] = {
|
||||
"input-filter": [mdInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
}
|
||||
|
||||
if (khoj_already_configured && !data["content-type"]["pdf"]) {
|
||||
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
||||
|
||||
if (hasPdfFiles) {
|
||||
data["content-type"]["pdf"] = {
|
||||
"input-filter": [pdfInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
} else {
|
||||
data["content-type"]["pdf"] = null;
|
||||
}
|
||||
}
|
||||
// Else if khoj is not configured to index pdf files in configured obsidian vault
|
||||
else if (khoj_already_configured &&
|
||||
(
|
||||
data["content-type"]["pdf"]["input-files"] != null ||
|
||||
data["content-type"]["pdf"]["input-filter"] == null ||
|
||||
data["content-type"]["pdf"]["input-filter"].length != 1 ||
|
||||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
|
||||
|
||||
let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
||||
|
||||
if (hasPdfFiles) {
|
||||
// Update pdf config in khoj content-type config
|
||||
// Set pdf config to only index pdf files in configured obsidian vault
|
||||
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
|
||||
data["content-type"]["pdf"] = {
|
||||
"input-filter": [pdfInVault],
|
||||
"input-files": null,
|
||||
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
|
||||
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
||||
}
|
||||
} else {
|
||||
data["content-type"]["pdf"] = null;
|
||||
}
|
||||
}
|
||||
|
||||
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
|
||||
|
||||
let processorData: ProcessorData = {
|
||||
"conversation": {
|
||||
"conversation-logfile": conversationLogFile,
|
||||
"openai": null,
|
||||
"enable-offline-chat": setting.enableOfflineChat,
|
||||
"offline-chat": {
|
||||
"chat-model": khojDefaultOfflineChatModelName,
|
||||
"enable-offline-chat": setting.enableOfflineChat,
|
||||
},
|
||||
"tokenizer": null,
|
||||
"max-prompt-size": null,
|
||||
}
|
||||
}
|
||||
|
||||
// If the Open AI API Key was configured in the plugin settings
|
||||
if (!!setting.openaiApiKey) {
|
||||
|
||||
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName;
|
||||
|
||||
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultOpenAIChatModelName;
|
||||
processorData = {
|
||||
"conversation": {
|
||||
"conversation-logfile": conversationLogFile,
|
||||
|
@ -168,7 +144,12 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
|||
"chat-model": openAIChatModel,
|
||||
"api-key": setting.openaiApiKey,
|
||||
},
|
||||
"enable-offline-chat": setting.enableOfflineChat,
|
||||
"offline-chat": {
|
||||
"chat-model": khojDefaultOfflineChatModelName,
|
||||
"enable-offline-chat": setting.enableOfflineChat,
|
||||
},
|
||||
"tokenizer": null,
|
||||
"max-prompt-size": null,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
@ -197,12 +178,8 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) {
|
|||
method: 'POST',
|
||||
contentType: 'application/json',
|
||||
};
|
||||
|
||||
// Save khojConfig on khoj backend at khojConfigUrl
|
||||
await request(requestContent)
|
||||
// Refresh khoj search index after updating config
|
||||
.then(_ => request(`${khojUrl}/api/update?t=markdown`))
|
||||
.then(_ => request(`${khojUrl}/api/update?t=pdf`));
|
||||
request(requestContent);
|
||||
}
|
||||
|
||||
function getIndexDirectoryFromBackendConfig(filepath: string) {
|
||||
|
|
|
@ -24,5 +24,6 @@
|
|||
"0.12.0": "0.15.0",
|
||||
"0.12.1": "0.15.0",
|
||||
"0.12.2": "0.15.0",
|
||||
"0.12.3": "0.15.0"
|
||||
"0.12.3": "0.15.0",
|
||||
"0.13.0": "0.15.0"
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ from khoj.utils.config import (
|
|||
)
|
||||
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
|
||||
from khoj.utils.fs_syncer import collect_files
|
||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
|
||||
from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig
|
||||
from khoj.routers.indexer import configure_content, load_content, configure_search
|
||||
|
||||
|
||||
|
@ -136,7 +136,7 @@ def configure_routes(app):
|
|||
|
||||
app.include_router(api, prefix="/api")
|
||||
app.include_router(api_beta, prefix="/api/beta")
|
||||
app.include_router(indexer, prefix="/v1/indexer")
|
||||
app.include_router(indexer, prefix="/api/v1/index")
|
||||
app.include_router(web_client)
|
||||
app.include_router(auth_router, prefix="/auth")
|
||||
|
||||
|
@ -156,7 +156,7 @@ if not state.demo:
|
|||
state.content_index = configure_content(
|
||||
state.content_index, state.config.content_type, all_files, state.search_models
|
||||
)
|
||||
logger.info("📬 Content index updated via Scheduler")
|
||||
logger.info("📪 Content index updated via Scheduler")
|
||||
except Exception as e:
|
||||
logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True)
|
||||
|
||||
|
@ -207,9 +207,7 @@ def configure_conversation_processor(
|
|||
conversation_config=ConversationProcessorConfig(
|
||||
conversation_logfile=conversation_logfile,
|
||||
openai=(conversation_config.openai if (conversation_config is not None) else None),
|
||||
enable_offline_chat=(
|
||||
conversation_config.enable_offline_chat if (conversation_config is not None) else False
|
||||
),
|
||||
offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(),
|
||||
)
|
||||
)
|
||||
else:
|
||||
|
|
|
@ -236,7 +236,7 @@
|
|||
</h3>
|
||||
</div>
|
||||
<div class="card-description-row">
|
||||
<p class="card-description">Setup chat using OpenAI</p>
|
||||
<p class="card-description">Setup online chat using OpenAI</p>
|
||||
</div>
|
||||
<div class="card-action-row">
|
||||
<a class="card-button" href="/config/processor/conversation/openai">
|
||||
|
@ -261,21 +261,21 @@
|
|||
<img class="card-icon" src="/static/assets/icons/chat.svg" alt="Chat">
|
||||
<h3 class="card-title">
|
||||
Offline Chat
|
||||
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
|
||||
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and not current_model_state.conversation_gpt4all %}
|
||||
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
|
||||
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and not current_model_state.conversation_gpt4all %}
|
||||
<img id="misconfigured-icon-conversation-enable-offline-chat" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="The model was not downloaded as expected.">
|
||||
{% endif %}
|
||||
</h3>
|
||||
</div>
|
||||
<div class="card-description-row">
|
||||
<p class="card-description">Setup offline chat (Llama V2)</p>
|
||||
<p class="card-description">Setup offline chat</p>
|
||||
</div>
|
||||
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
|
||||
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
|
||||
<button class="card-button" onclick="toggleEnableLocalLLLM(false)">
|
||||
Disable
|
||||
</button>
|
||||
</div>
|
||||
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
|
||||
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
|
||||
<button class="card-button happy" onclick="toggleEnableLocalLLLM(true)">
|
||||
Enable
|
||||
</button>
|
||||
|
@ -346,7 +346,7 @@
|
|||
featuresHintText.classList.add("show");
|
||||
}
|
||||
|
||||
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
|
||||
fetch('/api/config/data/processor/conversation/offline_chat' + '?enable_offline_chat=' + enable, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}">
|
||||
{% else %}
|
||||
{% for input_filter in current_config['input_filter'] %}
|
||||
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter.split('/*')[0] }}">
|
||||
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter }}">
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</td>
|
||||
|
@ -106,17 +106,18 @@
|
|||
|
||||
submit.addEventListener("click", function(event) {
|
||||
event.preventDefault();
|
||||
let globFormat = "**/*."
|
||||
let globFormat = "**/*"
|
||||
let suffixes = [];
|
||||
if ('{{content_type}}' == "markdown")
|
||||
suffixes = ["md", "markdown"]
|
||||
suffixes = [".md", ".markdown"]
|
||||
else if ('{{content_type}}' == "org")
|
||||
suffixes = ["org"]
|
||||
suffixes = [".org"]
|
||||
else if ('{{content_type}}' === "pdf")
|
||||
suffixes = ["pdf"]
|
||||
suffixes = [".pdf"]
|
||||
else if ('{{content_type}}' === "plaintext")
|
||||
suffixes = ['*']
|
||||
suffixes = ['.*']
|
||||
|
||||
let globs = suffixes.map(x => `${globFormat}${x}`)
|
||||
var inputFileNodes = document.getElementsByName("input-files");
|
||||
var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
|
||||
|
||||
|
@ -124,10 +125,19 @@
|
|||
|
||||
var inputFilter = [];
|
||||
var nodes = getValidInputNodes(inputFilterNodes);
|
||||
|
||||
// A regex that checks for globs in the path. If they exist,
|
||||
// we are going to just not add our own globbing. If they don't,
|
||||
// then we will assume globbing should be done.
|
||||
const glob_regex = /([*?\[\]])/;
|
||||
if (nodes.length > 0) {
|
||||
for (var i = 0; i < nodes.length; i++) {
|
||||
for (var j = 0; j < suffixes.length; j++) {
|
||||
inputFilter.push(nodes[i].value + globFormat + suffixes[j]);
|
||||
for (var j = 0; j < globs.length; j++) {
|
||||
if (glob_regex.test(nodes[i].value)) {
|
||||
inputFilter.push(nodes[i].value);
|
||||
} else {
|
||||
inputFilter.push(nodes[i].value + globs[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
83
src/khoj/migrations/migrate_offline_chat_schema.py
Normal file
83
src/khoj/migrations/migrate_offline_chat_schema.py
Normal file
|
@ -0,0 +1,83 @@
|
|||
"""
|
||||
Current format of khoj.yml
|
||||
---
|
||||
app:
|
||||
...
|
||||
content-type:
|
||||
...
|
||||
processor:
|
||||
conversation:
|
||||
enable-offline-chat: false
|
||||
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
|
||||
openai:
|
||||
...
|
||||
search-type:
|
||||
...
|
||||
|
||||
New format of khoj.yml
|
||||
---
|
||||
app:
|
||||
...
|
||||
content-type:
|
||||
...
|
||||
processor:
|
||||
conversation:
|
||||
offline-chat:
|
||||
enable-offline-chat: false
|
||||
chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
|
||||
tokenizer: null
|
||||
max-prompt-size: null
|
||||
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
|
||||
openai:
|
||||
...
|
||||
search-type:
|
||||
...
|
||||
"""
|
||||
import logging
|
||||
from packaging import version
|
||||
|
||||
from khoj.utils.yaml import load_config_from_file, save_config_to_file
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def migrate_offline_chat_schema(args):
    """Migrate khoj.yml to the 0.12.3 schema with a nested offline-chat section.

    Moves the flat ``processor.conversation.enable-offline-chat`` flag into a new
    ``processor.conversation.offline-chat`` mapping (next to ``chat-model``) and
    adds ``max-prompt-size`` and ``tokenizer`` fields, set to None, on the
    conversation processor. The config is then written back to ``args.config_file``.

    Args:
        args: Parsed CLI arguments; only ``args.config_file`` is read here.

    Returns:
        The same ``args`` object, so migrations can be chained.
    """
    schema_version = "0.12.3"
    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    # Nothing to migrate unless a conversation processor is configured.
    # Using .get() also covers keys that are present in the YAML but set to null,
    # which would otherwise fail on the item assignments below.
    processor_config = raw_config.get("processor")
    if processor_config is None:
        return args
    conversation_config = processor_config.get("conversation")
    if conversation_config is None:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse(schema_version):
        logger.info(
            f"Upgrading config schema to {schema_version} from {previous_version} to make (offline) chat more configurable"
        )
        raw_config["version"] = schema_version

        # Create max-prompt-size and tokenizer fields in conversation processor schema
        conversation_config["max-prompt-size"] = None
        conversation_config["tokenizer"] = None

        # Create offline chat schema based on existing enable-offline-chat field in khoj config schema.
        # `or {}` guards against an `offline-chat:` key that exists but is null.
        existing_offline_chat = conversation_config.get("offline-chat") or {}
        conversation_config["offline-chat"] = {
            "enable-offline-chat": conversation_config.get("enable-offline-chat", False),
            "chat-model": existing_offline_chat.get("chat-model", "llama-2-7b-chat.ggmlv3.q4_0.bin"),
        }

        # Delete old enable-offline-chat field from conversation processor schema
        conversation_config.pop("enable-offline-chat", None)

    save_config_to_file(raw_config, args.config_file)
    return args
|
|
@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
def extract_questions_offline(
|
||||
text: str,
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||
loaded_model: Union[Any, None] = None,
|
||||
conversation_log={},
|
||||
use_history: bool = True,
|
||||
|
@ -113,7 +113,7 @@ def filter_questions(questions: List[str]):
|
|||
]
|
||||
filtered_questions = []
|
||||
for q in questions:
|
||||
if not any([word in q.lower() for word in hint_words]):
|
||||
if not any([word in q.lower() for word in hint_words]) and not is_none_or_empty(q):
|
||||
filtered_questions.append(q)
|
||||
|
||||
return filtered_questions
|
||||
|
@ -123,10 +123,12 @@ def converse_offline(
|
|||
references,
|
||||
user_query,
|
||||
conversation_log={},
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
||||
model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||
loaded_model: Union[Any, None] = None,
|
||||
completion_func=None,
|
||||
conversation_command=ConversationCommand.Default,
|
||||
max_prompt_size=None,
|
||||
tokenizer_name=None,
|
||||
) -> Union[ThreadedGenerator, Iterator[str]]:
|
||||
"""
|
||||
Converse with user using Llama
|
||||
|
@ -158,6 +160,8 @@ def converse_offline(
|
|||
prompts.system_prompt_message_llamav2,
|
||||
conversation_log,
|
||||
model_name=model,
|
||||
max_prompt_size=max_prompt_size,
|
||||
tokenizer_name=tokenizer_name,
|
||||
)
|
||||
|
||||
g = ThreadedGenerator(references, completion_func=completion_func)
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
model_name_to_url = {
|
||||
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin"
|
||||
}
|
|
@ -1,24 +1,8 @@
|
|||
import os
|
||||
import logging
|
||||
import requests
|
||||
import hashlib
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from khoj.processor.conversation.gpt4all import model_metadata
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
expected_checksum = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "cfa87b15d92fb15a2d7c354b0098578b"}
|
||||
|
||||
|
||||
def get_md5_checksum(filename: str):
|
||||
hash_md5 = hashlib.md5()
|
||||
with open(filename, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(8192), b""):
|
||||
hash_md5.update(chunk)
|
||||
return hash_md5.hexdigest()
|
||||
|
||||
|
||||
def download_model(model_name: str):
|
||||
try:
|
||||
|
@ -27,57 +11,12 @@ def download_model(model_name: str):
|
|||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||
raise e
|
||||
|
||||
url = model_metadata.model_name_to_url.get(model_name)
|
||||
model_path = os.path.expanduser(f"~/.cache/gpt4all/")
|
||||
if not url:
|
||||
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
|
||||
return GPT4All(model_name=model_name, model_path=model_path)
|
||||
|
||||
filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}")
|
||||
if os.path.exists(filename):
|
||||
# Check if the user is connected to the internet
|
||||
try:
|
||||
requests.get("https://www.google.com/", timeout=5)
|
||||
except:
|
||||
logger.debug("User is offline. Disabling allowed download flag")
|
||||
return GPT4All(model_name=model_name, model_path=model_path, allow_download=False)
|
||||
return GPT4All(model_name=model_name, model_path=model_path)
|
||||
|
||||
# Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file
|
||||
tmp_filename = filename + ".tmp"
|
||||
|
||||
# Use GPU for Chat Model, if available
|
||||
try:
|
||||
os.makedirs(os.path.dirname(tmp_filename), exist_ok=True)
|
||||
logger.debug(f"Downloading model {model_name} from {url} to {filename}...")
|
||||
with requests.get(url, stream=True) as r:
|
||||
r.raise_for_status()
|
||||
total_size = int(r.headers.get("content-length", 0))
|
||||
with open(tmp_filename, "wb") as f, tqdm(
|
||||
unit="B", # unit string to be displayed.
|
||||
unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
|
||||
unit_divisor=1024, # is used when unit_scale is true
|
||||
total=total_size, # the total iteration.
|
||||
desc=model_name, # prefix to be displayed on progress bar.
|
||||
) as progress_bar:
|
||||
for chunk in r.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
progress_bar.update(len(chunk))
|
||||
model = GPT4All(model_name=model_name, device="gpu")
|
||||
logger.debug("Loaded chat model to GPU.")
|
||||
except ValueError:
|
||||
model = GPT4All(model_name=model_name)
|
||||
logger.debug("Loaded chat model to CPU.")
|
||||
|
||||
# Verify the checksum
|
||||
if expected_checksum.get(model_name) != get_md5_checksum(tmp_filename):
|
||||
logger.error(
|
||||
f"Checksum verification failed for {filename}. Removing the tmp file. Offline model will not be available."
|
||||
)
|
||||
os.remove(tmp_filename)
|
||||
raise ValueError(f"Checksum verification failed for downloading {model_name} from {url}.")
|
||||
|
||||
# Move the tmp file to the actual file
|
||||
os.rename(tmp_filename, filename)
|
||||
logger.debug(f"Successfully downloaded model {model_name} from {url} to {filename}")
|
||||
return GPT4All(model_name)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model_name} from {url} to {filename}. Error: {e}", exc_info=True)
|
||||
# Remove the tmp file if it exists
|
||||
if os.path.exists(tmp_filename):
|
||||
os.remove(tmp_filename)
|
||||
return None
|
||||
return model
|
||||
|
|
|
@ -116,6 +116,8 @@ def converse(
|
|||
temperature: float = 0.2,
|
||||
completion_func=None,
|
||||
conversation_command=ConversationCommand.Default,
|
||||
max_prompt_size=None,
|
||||
tokenizer_name=None,
|
||||
):
|
||||
"""
|
||||
Converse with user using OpenAI's ChatGPT
|
||||
|
@ -141,6 +143,8 @@ def converse(
|
|||
prompts.personality.format(),
|
||||
conversation_log,
|
||||
model,
|
||||
max_prompt_size,
|
||||
tokenizer_name,
|
||||
)
|
||||
truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
|
||||
logger.debug(f"Conversation Context for GPT: {truncated_messages}")
|
||||
|
|
|
@ -23,7 +23,7 @@ no_notes_found = PromptTemplate.from_template(
|
|||
""".strip()
|
||||
)
|
||||
|
||||
system_prompt_message_llamav2 = f"""You are Khoj, a friendly, smart and helpful personal assistant.
|
||||
system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant.
|
||||
Using your general knowledge and our past conversations as context, answer the following question.
|
||||
If you do not know the answer, say 'I don't know.'"""
|
||||
|
||||
|
@ -51,13 +51,13 @@ extract_questions_system_prompt_llamav2 = PromptTemplate.from_template(
|
|||
|
||||
general_conversation_llamav2 = PromptTemplate.from_template(
|
||||
"""
|
||||
<s>[INST]{query}[/INST]
|
||||
<s>[INST] {query} [/INST]
|
||||
""".strip()
|
||||
)
|
||||
|
||||
chat_history_llamav2_from_user = PromptTemplate.from_template(
|
||||
"""
|
||||
<s>[INST]{message}[/INST]
|
||||
<s>[INST] {message} [/INST]
|
||||
""".strip()
|
||||
)
|
||||
|
||||
|
@ -69,7 +69,7 @@ chat_history_llamav2_from_assistant = PromptTemplate.from_template(
|
|||
|
||||
conversation_llamav2 = PromptTemplate.from_template(
|
||||
"""
|
||||
<s>[INST]{query}[/INST]
|
||||
<s>[INST] {query} [/INST]
|
||||
""".strip()
|
||||
)
|
||||
|
||||
|
@ -91,7 +91,7 @@ Question: {query}
|
|||
|
||||
notes_conversation_llamav2 = PromptTemplate.from_template(
|
||||
"""
|
||||
Notes:
|
||||
User's Notes:
|
||||
{references}
|
||||
Question: {query}
|
||||
""".strip()
|
||||
|
@ -134,19 +134,25 @@ Answer (in second person):"""
|
|||
|
||||
extract_questions_llamav2_sample = PromptTemplate.from_template(
|
||||
"""
|
||||
<s>[INST]<<SYS>>Current Date: {current_date}<</SYS>>[/INST]</s>
|
||||
<s>[INST]How was my trip to Cambodia?[/INST][]</s>
|
||||
<s>[INST]Who did I visit the temple with on that trip?[/INST]Who did I visit the temple with in Cambodia?</s>
|
||||
<s>[INST]How should I take care of my plants?[/INST]What kind of plants do I have? What issues do my plants have?</s>
|
||||
<s>[INST]How many tennis balls fit in the back of a 2002 Honda Civic?[/INST]What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
|
||||
<s>[INST]What did I do for Christmas last year?[/INST]What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
|
||||
<s>[INST]How are you feeling today?[/INST]</s>
|
||||
<s>[INST]Is Alice older than Bob?[/INST]When was Alice born? What is Bob's age?</s>
|
||||
<s>[INST]<<SYS>>
|
||||
<s>[INST] <<SYS>>Current Date: {current_date}<</SYS>> [/INST]</s>
|
||||
<s>[INST] How was my trip to Cambodia? [/INST]
|
||||
How was my trip to Cambodia?</s>
|
||||
<s>[INST] Who did I visit the temple with on that trip? [/INST]
|
||||
Who did I visit the temple with in Cambodia?</s>
|
||||
<s>[INST] How should I take care of my plants? [/INST]
|
||||
What kind of plants do I have? What issues do my plants have?</s>
|
||||
<s>[INST] How many tennis balls fit in the back of a 2002 Honda Civic? [/INST]
|
||||
What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
|
||||
<s>[INST] What did I do for Christmas last year? [/INST]
|
||||
What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
|
||||
<s>[INST] How are you feeling today? [/INST]</s>
|
||||
<s>[INST] Is Alice older than Bob? [/INST]
|
||||
When was Alice born? What is Bob's age?</s>
|
||||
<s>[INST] <<SYS>>
|
||||
Use these notes from the user's previous conversations to provide a response:
|
||||
{chat_history}
|
||||
<</SYS>>[/INST]</s>
|
||||
<s>[INST]{query}[/INST]
|
||||
<</SYS>> [/INST]</s>
|
||||
<s>[INST] {query} [/INST]
|
||||
"""
|
||||
)
|
||||
|
||||
|
|
|
@ -3,24 +3,27 @@ import logging
|
|||
from time import perf_counter
|
||||
import json
|
||||
from datetime import datetime
|
||||
import queue
|
||||
import tiktoken
|
||||
|
||||
# External packages
|
||||
from langchain.schema import ChatMessage
|
||||
from transformers import LlamaTokenizerFast
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# Internal Packages
|
||||
import queue
|
||||
from khoj.utils.helpers import merge_dicts
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
max_prompt_size = {
|
||||
model_to_prompt_size = {
|
||||
"gpt-3.5-turbo": 4096,
|
||||
"gpt-4": 8192,
|
||||
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": 1548,
|
||||
"llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
|
||||
"gpt-3.5-turbo-16k": 15000,
|
||||
}
|
||||
tokenizer = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "hf-internal-testing/llama-tokenizer"}
|
||||
model_to_tokenizer = {
|
||||
"llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
|
||||
}
|
||||
|
||||
|
||||
class ThreadedGenerator:
|
||||
|
@ -82,9 +85,26 @@ def message_to_log(
|
|||
|
||||
|
||||
def generate_chatml_messages_with_context(
|
||||
user_message, system_message, conversation_log={}, model_name="gpt-3.5-turbo", lookback_turns=2
|
||||
user_message,
|
||||
system_message,
|
||||
conversation_log={},
|
||||
model_name="gpt-3.5-turbo",
|
||||
max_prompt_size=None,
|
||||
tokenizer_name=None,
|
||||
):
|
||||
"""Generate messages for ChatGPT with context from previous conversation"""
|
||||
# Set max prompt size from user config, pre-configured for model or to default prompt size
|
||||
try:
|
||||
max_prompt_size = max_prompt_size or model_to_prompt_size[model_name]
|
||||
except:
|
||||
max_prompt_size = 2000
|
||||
logger.warning(
|
||||
f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to longer context window."
|
||||
)
|
||||
|
||||
# Scale lookback turns proportional to max prompt size supported by model
|
||||
lookback_turns = max_prompt_size // 750
|
||||
|
||||
# Extract Chat History for Context
|
||||
chat_logs = []
|
||||
for chat in conversation_log.get("chat", []):
|
||||
|
@ -105,19 +125,28 @@ def generate_chatml_messages_with_context(
|
|||
messages = user_chatml_message + rest_backnforths + system_chatml_message
|
||||
|
||||
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
||||
messages = truncate_messages(messages, max_prompt_size[model_name], model_name)
|
||||
messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name)
|
||||
|
||||
# Return message in chronological order
|
||||
return messages[::-1]
|
||||
|
||||
|
||||
def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) -> list[ChatMessage]:
|
||||
def truncate_messages(
|
||||
messages: list[ChatMessage], max_prompt_size, model_name: str, tokenizer_name=None
|
||||
) -> list[ChatMessage]:
|
||||
"""Truncate messages to fit within max prompt size supported by model"""
|
||||
|
||||
if "llama" in model_name:
|
||||
encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name])
|
||||
else:
|
||||
encoder = tiktoken.encoding_for_model(model_name)
|
||||
try:
|
||||
if model_name.startswith("gpt-"):
|
||||
encoder = tiktoken.encoding_for_model(model_name)
|
||||
else:
|
||||
encoder = AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name])
|
||||
except:
|
||||
default_tokenizer = "hf-internal-testing/llama-tokenizer"
|
||||
encoder = AutoTokenizer.from_pretrained(default_tokenizer)
|
||||
logger.warning(
|
||||
f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
|
||||
)
|
||||
|
||||
system_message = messages.pop()
|
||||
system_message_tokens = len(encoder.encode(system_message.content))
|
||||
|
|
|
@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl):
|
|||
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
|
||||
tmp_file = f"tmp_pdf_file.pdf"
|
||||
with open(f"{tmp_file}", "wb") as f:
|
||||
bytes = base64.b64decode(pdf_files[pdf_file])
|
||||
bytes = pdf_files[pdf_file]
|
||||
f.write(bytes)
|
||||
loader = PyMuPDFLoader(f"{tmp_file}")
|
||||
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||
|
|
|
@ -30,6 +30,7 @@ from khoj.utils.rawconfig import (
|
|||
GithubContentConfig,
|
||||
NotionContentConfig,
|
||||
ConversationProcessorConfig,
|
||||
OfflineChatProcessorConfig,
|
||||
)
|
||||
from khoj.utils.helpers import resolve_absolute_path
|
||||
from khoj.utils.state import SearchType
|
||||
|
@ -185,6 +186,10 @@ if not state.demo:
|
|||
state.content_index.markdown = None
|
||||
elif content_type == "org":
|
||||
state.content_index.org = None
|
||||
elif content_type == "plaintext":
|
||||
state.content_index.plaintext = None
|
||||
else:
|
||||
logger.warning(f"Request to delete unknown content type: {content_type} via API")
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
|
@ -284,10 +289,11 @@ if not state.demo:
|
|||
except Exception as e:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/config/data/processor/conversation/enable_offline_chat", status_code=200)
|
||||
@api.post("/config/data/processor/conversation/offline_chat", status_code=200)
|
||||
async def set_processor_enable_offline_chat_config_data(
|
||||
request: Request,
|
||||
enable_offline_chat: bool,
|
||||
offline_chat_model: Optional[str] = None,
|
||||
client: Optional[str] = None,
|
||||
):
|
||||
_initialize_config()
|
||||
|
@ -301,7 +307,12 @@ if not state.demo:
|
|||
state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore
|
||||
|
||||
assert state.config.processor.conversation is not None
|
||||
state.config.processor.conversation.enable_offline_chat = enable_offline_chat
|
||||
if state.config.processor.conversation.offline_chat is None:
|
||||
state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig()
|
||||
|
||||
state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat
|
||||
if offline_chat_model is not None:
|
||||
state.config.processor.conversation.offline_chat.chat_model = offline_chat_model
|
||||
state.processor_config = configure_processor(state.config.processor, state.processor_config)
|
||||
|
||||
update_telemetry_state(
|
||||
|
@ -322,7 +333,7 @@ if not state.demo:
|
|||
# Create Routes
|
||||
@api.get("/config/data/default")
|
||||
def get_default_config_data():
|
||||
return constants.default_config
|
||||
return constants.empty_config
|
||||
|
||||
|
||||
@api.get("/config/types", response_model=List[str])
|
||||
|
@ -387,7 +398,7 @@ async def search(
|
|||
# Encode query with filter terms removed
|
||||
defiltered_query = user_query
|
||||
for filter in [DateFilter(), WordFilter(), FileFilter()]:
|
||||
defiltered_query = filter.defilter(user_query)
|
||||
defiltered_query = filter.defilter(defiltered_query)
|
||||
|
||||
encoded_asymmetric_query = None
|
||||
if t == SearchType.All or t != SearchType.Image:
|
||||
|
@ -622,7 +633,7 @@ def update(
|
|||
if state.processor_config:
|
||||
components.append("Conversation processor")
|
||||
components_msg = ", ".join(components)
|
||||
logger.info(f"📬 {components_msg} updated via API")
|
||||
logger.info(f"📪 {components_msg} updated via API")
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
|
@ -702,12 +713,18 @@ async def chat(
|
|||
) -> Response:
|
||||
perform_chat_checks()
|
||||
conversation_command = get_conversation_command(query=q, any_references=True)
|
||||
|
||||
q = q.replace(f"/{conversation_command.value}", "").strip()
|
||||
|
||||
compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
|
||||
request, q, (n or 5), conversation_command
|
||||
)
|
||||
conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references))
|
||||
|
||||
if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references):
|
||||
conversation_command = ConversationCommand.General
|
||||
|
||||
if conversation_command == ConversationCommand.Help:
|
||||
model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai"
|
||||
model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai"
|
||||
formatted_help = help_message.format(model=model_type, version=state.khoj_version)
|
||||
return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200)
|
||||
|
||||
|
@ -768,23 +785,21 @@ async def extract_references_and_questions(
|
|||
logger.warning(
|
||||
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
|
||||
)
|
||||
return compiled_references, inferred_queries
|
||||
return compiled_references, inferred_queries, q
|
||||
|
||||
if conversation_type == ConversationCommand.General:
|
||||
return compiled_references, inferred_queries, q
|
||||
|
||||
# Extract filter terms from user message
|
||||
defiltered_query = q
|
||||
filter_terms = []
|
||||
for filter in [DateFilter(), WordFilter(), FileFilter()]:
|
||||
filter_terms += filter.get_filter_terms(q)
|
||||
defiltered_query = filter.defilter(q)
|
||||
filters_in_query = " ".join(filter_terms)
|
||||
defiltered_query = filter.defilter(defiltered_query)
|
||||
filters_in_query = q.replace(defiltered_query, "").strip()
|
||||
|
||||
# Infer search queries from user message
|
||||
with timer("Extracting search queries took", logger):
|
||||
# If we've reached here, either the user has enabled offline chat or the openai model is enabled.
|
||||
if state.processor_config.conversation.enable_offline_chat:
|
||||
if state.processor_config.conversation.offline_chat.enable_offline_chat:
|
||||
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
|
||||
inferred_queries = extract_questions_offline(
|
||||
defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False
|
||||
|
@ -800,7 +815,7 @@ async def extract_references_and_questions(
|
|||
with timer("Searching knowledge base took", logger):
|
||||
result_list = []
|
||||
for query in inferred_queries:
|
||||
n_items = min(n, 3) if state.processor_config.conversation.enable_offline_chat else n
|
||||
n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n
|
||||
result_list.extend(
|
||||
await search(
|
||||
f"{query} {filters_in_query}",
|
||||
|
|
|
@ -113,7 +113,7 @@ def generate_chat_response(
|
|||
meta_log=meta_log,
|
||||
)
|
||||
|
||||
if state.processor_config.conversation.enable_offline_chat:
|
||||
if state.processor_config.conversation.offline_chat.enable_offline_chat:
|
||||
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
|
||||
chat_response = converse_offline(
|
||||
references=compiled_references,
|
||||
|
@ -122,6 +122,9 @@ def generate_chat_response(
|
|||
conversation_log=meta_log,
|
||||
completion_func=partial_completion,
|
||||
conversation_command=conversation_command,
|
||||
model=state.processor_config.conversation.offline_chat.chat_model,
|
||||
max_prompt_size=state.processor_config.conversation.max_prompt_size,
|
||||
tokenizer_name=state.processor_config.conversation.tokenizer,
|
||||
)
|
||||
|
||||
elif state.processor_config.conversation.openai_model:
|
||||
|
@ -135,6 +138,8 @@ def generate_chat_response(
|
|||
api_key=api_key,
|
||||
completion_func=partial_completion,
|
||||
conversation_command=conversation_command,
|
||||
max_prompt_size=state.processor_config.conversation.max_prompt_size,
|
||||
tokenizer_name=state.processor_config.conversation.tokenizer,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# Standard Packages
|
||||
import logging
|
||||
import sys
|
||||
from typing import Optional, Union, Dict
|
||||
|
||||
# External Packages
|
||||
from fastapi import APIRouter, HTTPException, Header, Request, Body, Response
|
||||
from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile
|
||||
from pydantic import BaseModel
|
||||
from khoj.routers.helpers import update_telemetry_state
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils import state, constants
|
||||
|
@ -56,42 +56,30 @@ class IndexerInput(BaseModel):
|
|||
plaintext: Optional[dict[str, str]] = None
|
||||
|
||||
|
||||
@indexer.post("/batch")
|
||||
async def index_batch(
|
||||
@indexer.post("/update")
|
||||
async def update(
|
||||
request: Request,
|
||||
files: list[UploadFile],
|
||||
x_api_key: str = Header(None),
|
||||
regenerate: bool = False,
|
||||
search_type: Optional[Union[state.SearchType, str]] = None,
|
||||
force: bool = False,
|
||||
t: Optional[Union[state.SearchType, str]] = None,
|
||||
client: Optional[str] = None,
|
||||
user_agent: Optional[str] = Header(None),
|
||||
referer: Optional[str] = Header(None),
|
||||
host: Optional[str] = Header(None),
|
||||
):
|
||||
if x_api_key != "secret":
|
||||
raise HTTPException(status_code=401, detail="Invalid API Key")
|
||||
state.config_lock.acquire()
|
||||
try:
|
||||
logger.info(f"Received batch indexing request")
|
||||
index_batch_request_acc = b""
|
||||
async for chunk in request.stream():
|
||||
index_batch_request_acc += chunk
|
||||
data_bytes = sys.getsizeof(index_batch_request_acc)
|
||||
unit = "KB"
|
||||
data_size = data_bytes / 1024
|
||||
if data_size > 1000:
|
||||
unit = "MB"
|
||||
data_size = data_size / 1024
|
||||
if data_size > 1000:
|
||||
unit = "GB"
|
||||
data_size = data_size / 1024
|
||||
data_size_metric = f"{data_size:.2f} {unit}"
|
||||
logger.info(f"Received {data_size_metric} of data")
|
||||
index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc)
|
||||
logger.info(f"Received {len(index_batch_request.files)} files")
|
||||
|
||||
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||
org_files: Dict[str, str] = {}
|
||||
markdown_files: Dict[str, str] = {}
|
||||
pdf_files: Dict[str, str] = {}
|
||||
plaintext_files: Dict[str, str] = {}
|
||||
|
||||
for file in index_batch_request.files:
|
||||
file_type = get_file_type(file.path)
|
||||
for file in files:
|
||||
file_type, encoding = get_file_type(file.content_type)
|
||||
dict_to_update = None
|
||||
if file_type == "org":
|
||||
dict_to_update = org_files
|
||||
|
@ -103,9 +91,11 @@ async def index_batch(
|
|||
dict_to_update = plaintext_files
|
||||
|
||||
if dict_to_update is not None:
|
||||
dict_to_update[file.path] = file.content
|
||||
dict_to_update[file.filename] = (
|
||||
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
|
||||
)
|
||||
else:
|
||||
logger.info(f"Skipping unsupported streamed file: {file.path}")
|
||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||
|
||||
indexer_input = IndexerInput(
|
||||
org=org_files,
|
||||
|
@ -115,7 +105,7 @@ async def index_batch(
|
|||
)
|
||||
|
||||
if state.config == None:
|
||||
logger.info("First run, initializing state.")
|
||||
logger.info("📬 Initializing content index on first run.")
|
||||
default_full_config = FullConfig(
|
||||
content_type=None,
|
||||
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
|
||||
|
@ -142,15 +132,30 @@ async def index_batch(
|
|||
state.config.content_type,
|
||||
indexer_input.dict(),
|
||||
state.search_models,
|
||||
regenerate=regenerate,
|
||||
t=search_type,
|
||||
regenerate=force,
|
||||
t=t,
|
||||
full_corpus=False,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process batch indexing request: {e}", exc_info=True)
|
||||
logger.error(
|
||||
f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}",
|
||||
exc_info=True,
|
||||
)
|
||||
finally:
|
||||
state.config_lock.release()
|
||||
|
||||
update_telemetry_state(
|
||||
request=request,
|
||||
telemetry_type="api",
|
||||
api="index/update",
|
||||
client=client,
|
||||
user_agent=user_agent,
|
||||
referer=referer,
|
||||
host=host,
|
||||
)
|
||||
|
||||
logger.info(f"📪 Content index updated via API call by {client} client")
|
||||
return Response(content="OK", status_code=200)
|
||||
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ from khoj.utils.yaml import parse_config_from_file
|
|||
from khoj.migrations.migrate_version import migrate_config_to_version
|
||||
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
|
||||
from khoj.migrations.migrate_offline_model import migrate_offline_model
|
||||
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
|
||||
|
||||
|
||||
def cli(args=None):
|
||||
|
@ -55,7 +56,12 @@ def cli(args=None):
|
|||
|
||||
|
||||
def run_migrations(args):
|
||||
migrations = [migrate_config_to_version, migrate_processor_conversation_schema, migrate_offline_model]
|
||||
migrations = [
|
||||
migrate_config_to_version,
|
||||
migrate_processor_conversation_schema,
|
||||
migrate_offline_model,
|
||||
migrate_offline_chat_schema,
|
||||
]
|
||||
for migration in migrations:
|
||||
args = migration(args)
|
||||
return args
|
||||
|
|
|
@ -12,6 +12,8 @@ from khoj.processor.conversation.gpt4all.utils import download_model
|
|||
# External Packages
|
||||
import torch
|
||||
|
||||
from khoj.utils.rawconfig import OfflineChatProcessorConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Internal Packages
|
||||
|
@ -84,7 +86,6 @@ class SearchModels:
|
|||
|
||||
@dataclass
|
||||
class GPT4AllProcessorConfig:
|
||||
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
|
||||
loaded_model: Union[Any, None] = None
|
||||
|
||||
|
||||
|
@ -95,18 +96,20 @@ class ConversationProcessorConfigModel:
|
|||
):
|
||||
self.openai_model = conversation_config.openai
|
||||
self.gpt4all_model = GPT4AllProcessorConfig()
|
||||
self.enable_offline_chat = conversation_config.enable_offline_chat
|
||||
self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig()
|
||||
self.max_prompt_size = conversation_config.max_prompt_size
|
||||
self.tokenizer = conversation_config.tokenizer
|
||||
self.conversation_logfile = Path(conversation_config.conversation_logfile)
|
||||
self.chat_session: List[str] = []
|
||||
self.meta_log: dict = {}
|
||||
|
||||
if self.enable_offline_chat:
|
||||
if self.offline_chat.enable_offline_chat:
|
||||
try:
|
||||
self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model)
|
||||
except ValueError as e:
|
||||
self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model)
|
||||
except Exception as e:
|
||||
self.offline_chat.enable_offline_chat = False
|
||||
self.gpt4all_model.loaded_model = None
|
||||
logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
|
||||
self.enable_offline_chat = False
|
||||
else:
|
||||
self.gpt4all_model.loaded_model = None
|
||||
|
||||
|
|
|
@ -6,6 +6,64 @@ empty_escape_sequences = "\n|\r|\t| "
|
|||
app_env_filepath = "~/.khoj/env"
|
||||
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
|
||||
|
||||
empty_config = {
|
||||
"content-type": {
|
||||
"org": {
|
||||
"input-files": None,
|
||||
"input-filter": None,
|
||||
"compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/org/org_embeddings.pt",
|
||||
"index-heading-entries": False,
|
||||
},
|
||||
"markdown": {
|
||||
"input-files": None,
|
||||
"input-filter": None,
|
||||
"compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt",
|
||||
},
|
||||
"pdf": {
|
||||
"input-files": None,
|
||||
"input-filter": None,
|
||||
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
|
||||
},
|
||||
"plaintext": {
|
||||
"input-files": None,
|
||||
"input-filter": None,
|
||||
"compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt",
|
||||
},
|
||||
},
|
||||
"search-type": {
|
||||
"symmetric": {
|
||||
"encoder": "sentence-transformers/all-MiniLM-L6-v2",
|
||||
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
"model_directory": "~/.khoj/search/symmetric/",
|
||||
},
|
||||
"asymmetric": {
|
||||
"encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
|
||||
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||
"model_directory": "~/.khoj/search/asymmetric/",
|
||||
},
|
||||
"image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"},
|
||||
},
|
||||
"processor": {
|
||||
"conversation": {
|
||||
"openai": {
|
||||
"api-key": None,
|
||||
"chat-model": "gpt-3.5-turbo",
|
||||
},
|
||||
"offline-chat": {
|
||||
"enable-offline-chat": False,
|
||||
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||
},
|
||||
"tokenizer": None,
|
||||
"max-prompt-size": None,
|
||||
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
# default app config to use
|
||||
default_config = {
|
||||
"content-type": {
|
||||
|
@ -72,7 +130,12 @@ default_config = {
|
|||
"api-key": None,
|
||||
"chat-model": "gpt-3.5-turbo",
|
||||
},
|
||||
"enable-offline-chat": False,
|
||||
"offline-chat": {
|
||||
"enable-offline-chat": False,
|
||||
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||
},
|
||||
"tokenizer": None,
|
||||
"max-prompt-size": None,
|
||||
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
|
||||
}
|
||||
},
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import logging
|
||||
import glob
|
||||
import base64
|
||||
import os
|
||||
from typing import Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
@ -39,13 +39,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||
return soup.get_text(strip=True, separator="\n")
|
||||
|
||||
# Extract required fields from config
|
||||
input_files, input_filter = (
|
||||
input_files, input_filters = (
|
||||
config.input_files,
|
||||
config.input_filter,
|
||||
)
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(input_files) and is_none_or_empty(input_filter):
|
||||
if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
|
||||
logger.debug("At least one of input-files or input-file-filter is required to be specified")
|
||||
return {}
|
||||
|
||||
|
@ -53,11 +53,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||
absolute_plaintext_files, filtered_plaintext_files = set(), set()
|
||||
if input_files:
|
||||
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
|
||||
if input_filter:
|
||||
if input_filters:
|
||||
filtered_plaintext_files = {
|
||||
filtered_file
|
||||
for jsonl_file_filter in input_filter
|
||||
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
|
||||
for plaintext_file_filter in input_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
|
||||
if os.path.isfile(filtered_file)
|
||||
}
|
||||
|
||||
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
|
||||
|
@ -73,12 +74,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||
|
||||
filename_to_content_map = {}
|
||||
for file in all_target_files:
|
||||
with open(file, "r") as f:
|
||||
with open(file, "r", encoding="utf8") as f:
|
||||
try:
|
||||
plaintext_content = f.read()
|
||||
if file.endswith(("html", "htm", "xml")):
|
||||
plaintext_content = extract_html_content(plaintext_content)
|
||||
filename_to_content_map[file] = f.read()
|
||||
filename_to_content_map[file] = plaintext_content
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
|
||||
logger.warning(e, exc_info=True)
|
||||
|
@ -88,13 +89,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||
|
||||
def get_org_files(config: TextContentConfig):
|
||||
# Extract required fields from config
|
||||
org_files, org_file_filter = (
|
||||
org_files, org_file_filters = (
|
||||
config.input_files,
|
||||
config.input_filter,
|
||||
)
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
|
||||
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
|
||||
logger.debug("At least one of org-files or org-file-filter is required to be specified")
|
||||
return {}
|
||||
|
||||
|
@ -102,11 +103,12 @@ def get_org_files(config: TextContentConfig):
|
|||
absolute_org_files, filtered_org_files = set(), set()
|
||||
if org_files:
|
||||
absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
|
||||
if org_file_filter:
|
||||
if org_file_filters:
|
||||
filtered_org_files = {
|
||||
filtered_file
|
||||
for org_file_filter in org_file_filter
|
||||
for org_file_filter in org_file_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
|
||||
if os.path.isfile(filtered_file)
|
||||
}
|
||||
|
||||
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
||||
|
@ -119,7 +121,7 @@ def get_org_files(config: TextContentConfig):
|
|||
|
||||
filename_to_content_map = {}
|
||||
for file in all_org_files:
|
||||
with open(file, "r") as f:
|
||||
with open(file, "r", encoding="utf8") as f:
|
||||
try:
|
||||
filename_to_content_map[file] = f.read()
|
||||
except Exception as e:
|
||||
|
@ -131,26 +133,27 @@ def get_org_files(config: TextContentConfig):
|
|||
|
||||
def get_markdown_files(config: TextContentConfig):
|
||||
# Extract required fields from config
|
||||
markdown_files, markdown_file_filter = (
|
||||
markdown_files, markdown_file_filters = (
|
||||
config.input_files,
|
||||
config.input_filter,
|
||||
)
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
|
||||
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
|
||||
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
|
||||
return {}
|
||||
|
||||
"Get Markdown files to process"
|
||||
# Get markdown files to process
|
||||
absolute_markdown_files, filtered_markdown_files = set(), set()
|
||||
if markdown_files:
|
||||
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
|
||||
|
||||
if markdown_file_filter:
|
||||
if markdown_file_filters:
|
||||
filtered_markdown_files = {
|
||||
filtered_file
|
||||
for markdown_file_filter in markdown_file_filter
|
||||
for markdown_file_filter in markdown_file_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
|
||||
if os.path.isfile(filtered_file)
|
||||
}
|
||||
|
||||
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
||||
|
@ -168,7 +171,7 @@ def get_markdown_files(config: TextContentConfig):
|
|||
|
||||
filename_to_content_map = {}
|
||||
for file in all_markdown_files:
|
||||
with open(file, "r") as f:
|
||||
with open(file, "r", encoding="utf8") as f:
|
||||
try:
|
||||
filename_to_content_map[file] = f.read()
|
||||
except Exception as e:
|
||||
|
@ -180,13 +183,13 @@ def get_markdown_files(config: TextContentConfig):
|
|||
|
||||
def get_pdf_files(config: TextContentConfig):
|
||||
# Extract required fields from config
|
||||
pdf_files, pdf_file_filter = (
|
||||
pdf_files, pdf_file_filters = (
|
||||
config.input_files,
|
||||
config.input_filter,
|
||||
)
|
||||
|
||||
# Input Validation
|
||||
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
|
||||
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
|
||||
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
|
||||
return {}
|
||||
|
||||
|
@ -194,11 +197,12 @@ def get_pdf_files(config: TextContentConfig):
|
|||
absolute_pdf_files, filtered_pdf_files = set(), set()
|
||||
if pdf_files:
|
||||
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
|
||||
if pdf_file_filter:
|
||||
if pdf_file_filters:
|
||||
filtered_pdf_files = {
|
||||
filtered_file
|
||||
for pdf_file_filter in pdf_file_filter
|
||||
for pdf_file_filter in pdf_file_filters
|
||||
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
|
||||
if os.path.isfile(filtered_file)
|
||||
}
|
||||
|
||||
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
|
||||
|
@ -214,7 +218,7 @@ def get_pdf_files(config: TextContentConfig):
|
|||
for file in all_pdf_files:
|
||||
with open(file, "rb") as f:
|
||||
try:
|
||||
filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8")
|
||||
filename_to_content_map[file] = f.read()
|
||||
except Exception as e:
|
||||
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
|
||||
logger.warning(e, exc_info=True)
|
||||
|
|
|
@ -66,20 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
|
|||
return merged_dict
|
||||
|
||||
|
||||
def get_file_type(file_type: str) -> "tuple[str, str | None]":
    """Map a MIME content type to a content category and text encoding.

    Args:
        file_type: MIME type, optionally followed by ``;``-separated
            parameters, e.g. ``"text/markdown"`` or
            ``"text/plain; charset=utf-8"``.

    Returns:
        A ``(content_type, encoding)`` tuple. ``content_type`` is one of
        "markdown", "org", "pdf", "jpeg", "png", "plaintext" or "other".
        ``encoding`` is the lower-cased value of the ``charset`` parameter,
        or None when no charset is given.
    """
    # Split "type/subtype; key=value; ..." into the media type and its parameters
    media_type, *parameters = file_type.split(";")
    media_type = media_type.strip().lower()

    # Only the charset parameter names an encoding.
    # Parameters without "=" (e.g. a trailing ";") are ignored instead of raising.
    encoding = None
    for parameter in parameters:
        key, separator, value = parameter.partition("=")
        if separator and key.strip().lower() == "charset":
            encoding = value.strip().strip("\"'").lower() or None
            break

    mime_to_content_type = {
        "text/markdown": "markdown",
        "text/org": "org",
        "application/pdf": "pdf",
        "image/jpeg": "jpeg",
        "image/png": "png",
        "text/plain": "plaintext",
        "text/html": "plaintext",
        "application/xml": "plaintext",
        "text/x-rst": "plaintext",
    }
    return mime_to_content_type.get(media_type, "other"), encoding
|
||||
|
||||
|
||||
def load_model(
|
||||
|
|
|
@ -91,10 +91,17 @@ class OpenAIProcessorConfig(ConfigBase):
|
|||
chat_model: Optional[str] = "gpt-3.5-turbo"
|
||||
|
||||
|
||||
class OfflineChatProcessorConfig(ConfigBase):
|
||||
enable_offline_chat: Optional[bool] = False
|
||||
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
|
||||
|
||||
|
||||
class ConversationProcessorConfig(ConfigBase):
|
||||
conversation_logfile: Path
|
||||
openai: Optional[OpenAIProcessorConfig]
|
||||
enable_offline_chat: Optional[bool] = False
|
||||
offline_chat: Optional[OfflineChatProcessorConfig]
|
||||
max_prompt_size: Optional[int]
|
||||
tokenizer: Optional[str]
|
||||
|
||||
|
||||
class ProcessorConfig(ConfigBase):
|
||||
|
|
|
@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path
|
|||
from khoj.utils.rawconfig import (
|
||||
ContentConfig,
|
||||
ConversationProcessorConfig,
|
||||
OfflineChatProcessorConfig,
|
||||
OpenAIProcessorConfig,
|
||||
ProcessorConfig,
|
||||
TextContentConfig,
|
||||
|
@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory):
|
|||
|
||||
# Setup conversation processor
|
||||
processor_config = ProcessorConfig()
|
||||
offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
|
||||
processor_config.conversation = ConversationProcessorConfig(
|
||||
enable_offline_chat=True,
|
||||
offline_chat=offline_chat,
|
||||
conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
|
||||
)
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ from urllib.parse import quote
|
|||
|
||||
# External Packages
|
||||
from fastapi.testclient import TestClient
|
||||
import pytest
|
||||
|
||||
# Internal Packages
|
||||
from app.main import app
|
||||
|
@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):
|
|||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_index_batch(client):
|
||||
def test_index_update(client):
|
||||
# Arrange
|
||||
request_body = get_sample_files_data()
|
||||
files = get_sample_files_data()
|
||||
headers = {"x-api-key": "secret"}
|
||||
|
||||
# Act
|
||||
response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
|
||||
response = client.post("/api/v1/index/update", files=files, headers=headers)
|
||||
|
||||
# Assert
|
||||
assert response.status_code == 200
|
||||
|
@ -76,12 +77,11 @@ def test_index_batch(client):
|
|||
def test_regenerate_with_valid_content_type(client):
|
||||
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
|
||||
# Arrange
|
||||
request_body = get_sample_files_data()
|
||||
|
||||
files = get_sample_files_data()
|
||||
headers = {"x-api-key": "secret"}
|
||||
|
||||
# Act
|
||||
response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
|
||||
response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
|
||||
# Assert
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
||||
|
||||
|
@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
|
|||
response = client.get(f"/api/update?force=true&t=github")
|
||||
|
||||
# Arrange
|
||||
request_body = get_sample_files_data()
|
||||
|
||||
files = get_sample_files_data()
|
||||
headers = {"x-api-key": "secret"}
|
||||
|
||||
# Act
|
||||
response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
|
||||
response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
|
||||
# Assert
|
||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
@pytest.mark.skip(reason="Flaky test on parallel test runs")
|
||||
def test_get_configured_types_via_api(client):
|
||||
# Act
|
||||
response = client.get(f"/api/config/types")
|
||||
|
@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(
|
|||
|
||||
def get_sample_files_data():
    """Build sample files to upload as multipart form data in the API tests.

    Returns a list of ``("files", (filename, content, mime_type))`` tuples
    rather than a dict. Every upload shares the "files" form field, and a
    dict literal with a repeated key keeps only its last entry, which would
    reduce the sample to a single file.
    """
    return [
        ("files", ("path/to/filename.org", "* practicing piano", "text/org")),
        ("files", ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org")),
        ("files", ("path/to/filename2.org", "* how to build a search engine", "text/org")),
        ("files", ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf")),
        ("files", ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf")),
        ("files", ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf")),
        ("files", ("path/to/filename.txt", "data,column,value", "text/plain")),
        ("files", ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain")),
        ("files", ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain")),
        ("files", ("path/to/filename.md", "# Notes from client call", "text/markdown")),
        (
            "files",
            (
                "path/to/filename1.md",
                "## Studying anthropological records from the Fatimid caliphate",
                "text/markdown",
            ),
        ),
        ("files", ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown")),
    ]
|
||||
|
|
|
@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
|
|||
|
||||
from khoj.processor.conversation.utils import message_to_log
|
||||
|
||||
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
|
||||
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|
@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model):
|
|||
@pytest.mark.chatquality
|
||||
def test_extract_multiple_implicit_questions_from_message(loaded_model):
|
||||
# Act
|
||||
response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model)
|
||||
response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)
|
||||
|
||||
# Assert
|
||||
expected_responses = ["height", "taller", "shorter", "heights"]
|
||||
expected_responses = ["height", "taller", "shorter", "heights", "who"]
|
||||
assert len(response) <= 3
|
||||
|
||||
for question in response:
|
||||
assert any([expected_response in question.lower() for expected_response in expected_responses]), (
|
||||
"Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question
|
||||
"Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
|
||||
)
|
||||
|
||||
|
||||
|
@ -145,7 +145,7 @@ def test_extract_multiple_implicit_questions_from_message(loaded_model):
|
|||
def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
||||
# Arrange
|
||||
message_list = [
|
||||
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
|
||||
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
|
||||
]
|
||||
|
||||
# Act
|
||||
|
@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
|||
use_history=True,
|
||||
)
|
||||
|
||||
expected_responses = [
|
||||
"Vader",
|
||||
"sons",
|
||||
all_expected_in_response = [
|
||||
"Anderson",
|
||||
]
|
||||
|
||||
any_expected_in_response = [
|
||||
"son",
|
||||
"Darth",
|
||||
"sons",
|
||||
"children",
|
||||
]
|
||||
|
||||
# Assert
|
||||
assert len(response) >= 1
|
||||
assert any([expected_response in response[0] for expected_response in expected_responses]), (
|
||||
assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
|
||||
"Expected chat actor to ask for clarification in response, but got: " + response[0]
|
||||
)
|
||||
assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
|
||||
"Expected chat actor to ask for clarification in response, but got: " + response[0]
|
||||
)
|
||||
|
||||
|
@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
|||
def test_generate_search_query_using_answer_from_chat_history(loaded_model):
|
||||
# Arrange
|
||||
message_list = [
|
||||
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
|
||||
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
|
||||
]
|
||||
|
||||
# Act
|
||||
response = extract_questions_offline(
|
||||
"Is she a Jedi?",
|
||||
"Is she a Doctor?",
|
||||
conversation_log=populate_chat_history(message_list),
|
||||
loaded_model=loaded_model,
|
||||
use_history=True,
|
||||
)
|
||||
|
||||
expected_responses = [
|
||||
"Leia",
|
||||
"Vader",
|
||||
"Barbara",
|
||||
"Robert",
|
||||
"daughter",
|
||||
]
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# Standard Packages
|
||||
import json
|
||||
import os
|
||||
import base64
|
||||
|
||||
# Internal Packages
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
|
@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
|
|||
# Extract Entries from specified Pdf files
|
||||
# Read singlepage.pdf into memory as bytes
|
||||
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
||||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
|
@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
|
|||
# Act
|
||||
# Extract Entries from specified Pdf files
|
||||
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
||||
pdf_bytes = f.read()
|
||||
|
||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||
|
|
|
@ -1,26 +1,25 @@
|
|||
# System Packages
|
||||
import logging
|
||||
import locale
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
# External Packages
|
||||
import pytest
|
||||
from khoj.utils.config import SearchModels
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.state import content_index, search_models
|
||||
from khoj.search_type import text_search
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||
from khoj.utils.config import SearchModels
|
||||
from khoj.utils.fs_syncer import get_org_files
|
||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
||||
|
||||
|
||||
# Test
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_text_search_setup_with_missing_file_raises_error(
|
||||
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
||||
):
|
||||
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
|
||||
# Arrange
|
||||
# Ensure file mentioned in org.input-files is missing
|
||||
single_new_file = Path(org_config_with_only_new_file.input_files[0])
|
||||
|
@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error(
|
|||
# Act
|
||||
# Generate notes embeddings during asymmetric setup
|
||||
with pytest.raises(FileNotFoundError):
|
||||
data = get_org_files(org_config_with_only_new_file)
|
||||
get_org_files(org_config_with_only_new_file)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
    """Check an org file inside a directory named "directory.org" is read without error."""
    # Arrange
    org_content = "* Heading\n- List item\n"
    org_suffixed_dir = tmp_path / "directory.org"
    org_suffixed_dir.mkdir()
    orgfile = org_suffixed_dir / "file.org"
    orgfile.write_text(org_content)
    org_content_config = TextContentConfig(
        input_filter=[f"{tmp_path}/**/*"],
        compressed_jsonl="test.jsonl",
        embeddings_file="test.pt",
    )

    # Act
    org_files = get_org_files(org_content_config)

    # Assert
    # Should not raise IsADirectoryError; only the org file itself is returned
    assert org_files == {str(orgfile): org_content}
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------
|
||||
|
@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error(
|
|||
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
||||
# Arrange
|
||||
data = get_org_files(content_config.org)
|
||||
|
||||
# Act
|
||||
# Regenerate notes embeddings during asymmetric setup
|
||||
notes_model = text_search.setup(
|
||||
|
|
|
@ -24,5 +24,6 @@
|
|||
"0.12.0": "0.15.0",
|
||||
"0.12.1": "0.15.0",
|
||||
"0.12.2": "0.15.0",
|
||||
"0.12.3": "0.15.0"
|
||||
"0.12.3": "0.15.0",
|
||||
"0.13.0": "0.15.0"
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue