mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Resolve merge conflicts
This commit is contained in:
commit
963cd165eb
42 changed files with 941 additions and 590 deletions
11
docs/chat.md
11
docs/chat.md
|
@ -7,18 +7,21 @@
|
||||||
|
|
||||||
### Setup
|
### Setup
|
||||||
#### Offline Chat
|
#### Offline Chat
|
||||||
Offline chat works without internet but it is slower, lower quality and more compute intensive.
|
Offline chat stays completely private and works without internet. But it is slower, lower quality and more compute intensive.
|
||||||
|
|
||||||
!> **Warning**: This will download a 3Gb+ Llama v2 chat model which can take some time
|
> **System Requirements**:
|
||||||
|
> - Machine with at least **6 GB of RAM** and **4 GB of Disk** available
|
||||||
|
> - A CPU supporting [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) is required
|
||||||
|
> - A Mac M1+ or [Vulkan-supported GPU](https://vulkan.gpuinfo.org/) should significantly speed up chat response times
|
||||||
|
|
||||||
- Open your [Khoj settings](http://localhost:42110/config/), click *Enable* on the Offline Chat card
|
- Open your [Khoj settings](http://localhost:42110/config/) and click *Enable* on the Offline Chat card
|
||||||
|
|
||||||
![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4')
|
![Configure offline chat](https://user-images.githubusercontent.com/6413477/257021364-8a2029f5-dc21-4de8-9af9-9ba6100d695c.mp4 ':include :type=mp4')
|
||||||
|
|
||||||
#### Online Chat
|
#### Online Chat
|
||||||
Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive.
|
Online chat requires internet to use ChatGPT but is faster, higher quality and less compute intensive.
|
||||||
|
|
||||||
!> **Warning**: This will enable Khoj to send your chat queries and notes to OpenAI for processing
|
!> **Warning**: This will enable Khoj to send your chat queries and query-relevant notes to OpenAI for processing
|
||||||
|
|
||||||
1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)
|
1. Get your [OpenAI API Key](https://platform.openai.com/account/api-keys)
|
||||||
2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
|
2. Open your [Khoj Online Chat settings](http://localhost:42110/config/processor/conversation), add your OpenAI API key, and click *Save*. Then go to your [Khoj settings](http://localhost:42110/config) and click `Configure`. This will refresh Khoj with your OpenAI API key.
|
||||||
|
|
|
@ -46,7 +46,7 @@ Indexes your org-agenda files, by default.
|
||||||
(use-package khoj
|
(use-package khoj
|
||||||
:ensure t
|
:ensure t
|
||||||
:pin melpa-stable
|
:pin melpa-stable
|
||||||
:bind ("C-c s" . 'khoj)
|
:bind ("C-c s" . 'khoj))
|
||||||
```
|
```
|
||||||
|
|
||||||
- Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj
|
- Note: Install `khoj.el` from MELPA (instead of MELPA Stable) if you installed the pre-release version of khoj
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"id": "khoj",
|
"id": "khoj",
|
||||||
"name": "Khoj",
|
"name": "Khoj",
|
||||||
"version": "0.12.3",
|
"version": "0.13.0",
|
||||||
"minAppVersion": "0.15.0",
|
"minAppVersion": "0.15.0",
|
||||||
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
|
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
|
||||||
"author": "Khoj Inc.",
|
"author": "Khoj Inc.",
|
||||||
|
|
|
@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "khoj-assistant"
|
name = "khoj-assistant"
|
||||||
description = "An AI personal assistant for your Digital Brain"
|
description = "An AI copilot for your Second Brain"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = "GPL-3.0-or-later"
|
license = "GPL-3.0-or-later"
|
||||||
requires-python = ">=3.8"
|
requires-python = ">=3.8"
|
||||||
|
@ -40,8 +40,9 @@ dependencies = [
|
||||||
"dateparser >= 1.1.1",
|
"dateparser >= 1.1.1",
|
||||||
"defusedxml == 0.7.1",
|
"defusedxml == 0.7.1",
|
||||||
"fastapi == 0.77.1",
|
"fastapi == 0.77.1",
|
||||||
|
"python-multipart >= 0.0.5",
|
||||||
"jinja2 == 3.1.2",
|
"jinja2 == 3.1.2",
|
||||||
"openai >= 0.27.0",
|
"openai >= 0.27.0, < 1.0.0",
|
||||||
"tiktoken >= 0.3.2",
|
"tiktoken >= 0.3.2",
|
||||||
"tenacity >= 8.2.2",
|
"tenacity >= 8.2.2",
|
||||||
"pillow == 9.3.0",
|
"pillow == 9.3.0",
|
||||||
|
@ -83,6 +84,7 @@ test = [
|
||||||
"freezegun >= 1.2.0",
|
"freezegun >= 1.2.0",
|
||||||
"factory-boy >= 3.2.1",
|
"factory-boy >= 3.2.1",
|
||||||
"trio >= 0.22.0",
|
"trio >= 0.22.0",
|
||||||
|
"pytest-xdist",
|
||||||
]
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"khoj-assistant[test]",
|
"khoj-assistant[test]",
|
||||||
|
|
|
@ -9,6 +9,10 @@ do
|
||||||
# Get current project version
|
# Get current project version
|
||||||
current_version=$OPTARG
|
current_version=$OPTARG
|
||||||
|
|
||||||
|
# Bump Desktop app to current version
|
||||||
|
cd $project_root/src/interface/desktop
|
||||||
|
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
|
||||||
|
|
||||||
# Bump Obsidian plugin to current version
|
# Bump Obsidian plugin to current version
|
||||||
cd $project_root/src/interface/obsidian
|
cd $project_root/src/interface/obsidian
|
||||||
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
|
sed -E -i.bak "s/version\": \"(.*)\",/version\": \"$current_version\",/" package.json
|
||||||
|
|
|
@ -14,10 +14,11 @@ warnings.filterwarnings("ignore", message=r"legacy way to download files from th
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import uvicorn
|
import uvicorn
|
||||||
import django
|
|
||||||
import schedule
|
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
import schedule
|
||||||
|
import django
|
||||||
|
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from rich.logging import RichHandler
|
from rich.logging import RichHandler
|
||||||
from django.core.asgi import get_asgi_application
|
from django.core.asgi import get_asgi_application
|
||||||
|
@ -41,6 +42,15 @@ app = FastAPI()
|
||||||
# Get Django Application
|
# Get Django Application
|
||||||
django_app = get_asgi_application()
|
django_app = get_asgi_application()
|
||||||
|
|
||||||
|
# Add CORS middleware
|
||||||
|
app.add_middleware(
|
||||||
|
CORSMiddleware,
|
||||||
|
allow_origins=["app://obsidian.md", "http://localhost:*", "https://app.khoj.dev/*", "app://khoj.dev"],
|
||||||
|
allow_credentials=True,
|
||||||
|
allow_methods=["*"],
|
||||||
|
allow_headers=["*"],
|
||||||
|
)
|
||||||
|
|
||||||
# Set Locale
|
# Set Locale
|
||||||
locale.setlocale(locale.LC_ALL, "")
|
locale.setlocale(locale.LC_ALL, "")
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,6 @@ const {dialog} = require('electron');
|
||||||
|
|
||||||
const cron = require('cron').CronJob;
|
const cron = require('cron').CronJob;
|
||||||
const axios = require('axios');
|
const axios = require('axios');
|
||||||
const { Readable } = require('stream');
|
|
||||||
|
|
||||||
const KHOJ_URL = 'http://127.0.0.1:42110'
|
const KHOJ_URL = 'http://127.0.0.1:42110'
|
||||||
|
|
||||||
|
@ -65,7 +64,7 @@ const schema = {
|
||||||
|
|
||||||
var state = {}
|
var state = {}
|
||||||
|
|
||||||
const store = new Store({schema});
|
const store = new Store({ schema });
|
||||||
|
|
||||||
console.log(store);
|
console.log(store);
|
||||||
|
|
||||||
|
@ -86,57 +85,65 @@ function handleSetTitle (event, title) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function filenameToMimeType (filename) {
|
||||||
|
const extension = filename.split('.').pop();
|
||||||
|
switch (extension) {
|
||||||
|
case 'pdf':
|
||||||
|
return 'application/pdf';
|
||||||
|
case 'png':
|
||||||
|
return 'image/png';
|
||||||
|
case 'jpg':
|
||||||
|
case 'jpeg':
|
||||||
|
return 'image/jpeg';
|
||||||
|
case 'md':
|
||||||
|
case 'markdown':
|
||||||
|
return 'text/markdown';
|
||||||
|
case 'org':
|
||||||
|
return 'text/org';
|
||||||
|
default:
|
||||||
|
return 'text/plain';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
function pushDataToKhoj (regenerate = false) {
|
function pushDataToKhoj (regenerate = false) {
|
||||||
let filesToPush = [];
|
let filesToPush = [];
|
||||||
const files = store.get('files');
|
const files = store.get('files') || [];
|
||||||
const folders = store.get('folders');
|
const folders = store.get('folders') || [];
|
||||||
state = {
|
state = { completed: true }
|
||||||
completed: true
|
|
||||||
|
// Collect paths of all configured files to index
|
||||||
|
for (const file of files) {
|
||||||
|
filesToPush.push(file.path);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (files) {
|
// Collect paths of all indexable files in configured folders
|
||||||
for (file of files) {
|
for (const folder of folders) {
|
||||||
filesToPush.push(file.path);
|
const files = fs.readdirSync(folder.path, { withFileTypes: true });
|
||||||
}
|
for (const file of files) {
|
||||||
}
|
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
|
||||||
if (folders) {
|
filesToPush.push(path.join(folder.path, file.name));
|
||||||
for (folder of folders) {
|
|
||||||
const files = fs.readdirSync(folder.path, { withFileTypes: true });
|
|
||||||
for (file of files) {
|
|
||||||
if (file.isFile() && validFileTypes.includes(file.name.split('.').pop())) {
|
|
||||||
filesToPush.push(path.join(folder.path, file.name));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let data = {
|
|
||||||
files: []
|
|
||||||
}
|
|
||||||
|
|
||||||
const lastSync = store.get('lastSync') || [];
|
const lastSync = store.get('lastSync') || [];
|
||||||
|
const formData = new FormData();
|
||||||
for (file of filesToPush) {
|
for (const file of filesToPush) {
|
||||||
const stats = fs.statSync(file);
|
const stats = fs.statSync(file);
|
||||||
if (!regenerate) {
|
if (!regenerate) {
|
||||||
|
// Only push files that have been modified since last sync
|
||||||
if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) {
|
if (stats.mtime.toISOString() < lastSync.find((syncedFile) => syncedFile.path === file)?.datetime) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Collect all updated or newly created files since last sync to index on Khoj server
|
||||||
try {
|
try {
|
||||||
let rawData;
|
let encoding = binaryFileTypes.includes(file.split('.').pop()) ? "binary" : "utf8";
|
||||||
// If the file is a PDF or IMG file, read it as a binary file
|
let mimeType = filenameToMimeType(file) + (encoding === "utf8" ? "; charset=UTF-8" : "");
|
||||||
if (binaryFileTypes.includes(file.split('.').pop())) {
|
let fileContent = Buffer.from(fs.readFileSync(file, { encoding: encoding }), encoding);
|
||||||
rawData = fs.readFileSync(file).toString('base64');
|
let fileObj = new Blob([fileContent], { type: mimeType });
|
||||||
} else {
|
formData.append('files', fileObj, file);
|
||||||
rawData = fs.readFileSync(file, 'utf8');
|
|
||||||
}
|
|
||||||
|
|
||||||
data.files.push({
|
|
||||||
path: file,
|
|
||||||
content: rawData
|
|
||||||
});
|
|
||||||
state[file] = {
|
state[file] = {
|
||||||
success: true,
|
success: true,
|
||||||
}
|
}
|
||||||
|
@ -149,46 +156,46 @@ function pushDataToKhoj (regenerate = false) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Mark deleted files for removal from index on Khoj server
|
||||||
for (const syncedFile of lastSync) {
|
for (const syncedFile of lastSync) {
|
||||||
if (!filesToPush.includes(syncedFile.path)) {
|
if (!filesToPush.includes(syncedFile.path)) {
|
||||||
data.files.push({
|
fileObj = new Blob([""], { type: filenameToMimeType(syncedFile.path) });
|
||||||
path: syncedFile.path,
|
formData.append('files', fileObj, syncedFile.path);
|
||||||
content: ""
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const headers = { 'x-api-key': 'secret', 'Content-Type': 'application/json' };
|
// Send collected files to Khoj server for indexing
|
||||||
|
if (!!formData?.entries()?.next().value) {
|
||||||
const stream = new Readable({
|
const hostURL = store.get('hostURL') || KHOJ_URL;
|
||||||
read() {
|
const headers = {
|
||||||
this.push(JSON.stringify(data));
|
'x-api-key': 'secret'
|
||||||
this.push(null);
|
};
|
||||||
}
|
axios.post(`${hostURL}/api/v1/index/update?force=${regenerate}&client=desktop`, formData, { headers })
|
||||||
});
|
.then(response => {
|
||||||
|
console.log(response.data);
|
||||||
const hostURL = store.get('hostURL') || KHOJ_URL;
|
let lastSync = [];
|
||||||
|
for (const file of filesToPush) {
|
||||||
axios.post(`${hostURL}/v1/indexer/batch?regenerate=${regenerate}`, stream, { headers })
|
lastSync.push({
|
||||||
.then(response => {
|
path: file,
|
||||||
console.log(response.data);
|
datetime: new Date().toISOString()
|
||||||
const win = BrowserWindow.getAllWindows()[0];
|
});
|
||||||
win.webContents.send('update-state', state);
|
}
|
||||||
let lastSync = [];
|
store.set('lastSync', lastSync);
|
||||||
for (const file of filesToPush) {
|
})
|
||||||
lastSync.push({
|
.catch(error => {
|
||||||
path: file,
|
console.error(error);
|
||||||
datetime: new Date().toISOString()
|
state['completed'] = false
|
||||||
});
|
})
|
||||||
}
|
.finally(() => {
|
||||||
store.set('lastSync', lastSync);
|
// Syncing complete
|
||||||
})
|
const win = BrowserWindow.getAllWindows()[0];
|
||||||
.catch(error => {
|
if (win) win.webContents.send('update-state', state);
|
||||||
console.error(error);
|
});
|
||||||
state['completed'] = false
|
} else {
|
||||||
const win = BrowserWindow.getAllWindows()[0];
|
// Syncing complete
|
||||||
win.webContents.send('update-state', state);
|
const win = BrowserWindow.getAllWindows()[0];
|
||||||
});
|
if (win) win.webContents.send('update-state', state);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pushDataToKhoj();
|
pushDataToKhoj();
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
{
|
{
|
||||||
"name": "Khoj",
|
"name": "Khoj",
|
||||||
"homepage": ".",
|
"version": "0.13.0",
|
||||||
"productName": "Khoj",
|
"description": "An AI copilot for your Second Brain",
|
||||||
"version": "1.0.2",
|
"author": "Saba Imran, Debanjum Singh Solanky <team@khoj.dev>",
|
||||||
"description": "Scaffolding for the desktop entrypoint to Khoj",
|
"license": "GPL-3.0-or-later",
|
||||||
"main": "main.js",
|
"homepage": "https://khoj.dev",
|
||||||
"repository": "\"https://github.com/khoj-ai/khoj\"",
|
"repository": "\"https://github.com/khoj-ai/khoj\"",
|
||||||
"author": "Khoj <team@khoj.dev>",
|
"productName": "Khoj",
|
||||||
"license": "MIT",
|
"main": "main.js",
|
||||||
"private": false,
|
"private": false,
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"electron": "25.8.1"
|
"electron": "25.8.1"
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
;;; khoj.el --- AI personal assistant for your digital brain -*- lexical-binding: t -*-
|
;;; khoj.el --- AI copilot for your Second Brain -*- lexical-binding: t -*-
|
||||||
|
|
||||||
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
|
;; Copyright (C) 2021-2023 Khoj Inc.
|
||||||
|
|
||||||
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
|
;; Author: Debanjum Singh Solanky <debanjum@khoj.dev>
|
||||||
;; Description: An AI personal assistant for your digital brain
|
;; Saba Imran <saba@khoj.dev>
|
||||||
|
;; Description: An AI copilot for your Second Brain
|
||||||
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
|
;; Keywords: search, chat, org-mode, outlines, markdown, pdf, image
|
||||||
;; Version: 0.12.3
|
;; Version: 0.13.0
|
||||||
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
|
;; Package-Requires: ((emacs "27.1") (transient "0.3.0") (dash "2.19.1"))
|
||||||
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
|
;; URL: https://github.com/khoj-ai/khoj/tree/master/src/interface/emacs
|
||||||
|
|
||||||
|
@ -28,8 +29,8 @@
|
||||||
|
|
||||||
;;; Commentary:
|
;;; Commentary:
|
||||||
|
|
||||||
;; Create an AI personal assistant for your `org-mode', `markdown' notes,
|
;; Create an AI copilot for your `org-mode', `markdown' notes,
|
||||||
;; PDFs and images. The assistant exposes 2 modes, search and chat:
|
;; PDFs and images. The copilot exposes 2 modes, search and chat:
|
||||||
;;
|
;;
|
||||||
;; Chat provides faster answers, iterative discovery and assisted
|
;; Chat provides faster answers, iterative discovery and assisted
|
||||||
;; creativity. It requires your OpenAI API key to access GPT models
|
;; creativity. It requires your OpenAI API key to access GPT models
|
||||||
|
@ -87,6 +88,21 @@
|
||||||
:group 'khoj
|
:group 'khoj
|
||||||
:type 'integer)
|
:type 'integer)
|
||||||
|
|
||||||
|
(defcustom khoj-search-on-idle-time 0.3
|
||||||
|
"Idle time (in seconds) to wait before triggering search."
|
||||||
|
:group 'khoj
|
||||||
|
:type 'number)
|
||||||
|
|
||||||
|
(defcustom khoj-server-api-key "secret"
|
||||||
|
"API Key to Khoj server."
|
||||||
|
:group 'khoj
|
||||||
|
:type 'string)
|
||||||
|
|
||||||
|
(defcustom khoj-index-interval 3600
|
||||||
|
"Interval (in seconds) to wait before updating content index."
|
||||||
|
:group 'khoj
|
||||||
|
:type 'number)
|
||||||
|
|
||||||
(defcustom khoj-default-content-type "org"
|
(defcustom khoj-default-content-type "org"
|
||||||
"The default content type to perform search on."
|
"The default content type to perform search on."
|
||||||
:group 'khoj
|
:group 'khoj
|
||||||
|
@ -115,6 +131,15 @@
|
||||||
(defvar khoj--content-type "org"
|
(defvar khoj--content-type "org"
|
||||||
"The type of content to perform search on.")
|
"The type of content to perform search on.")
|
||||||
|
|
||||||
|
(defvar khoj--search-on-idle-timer nil
|
||||||
|
"Idle timer to trigger incremental search.")
|
||||||
|
|
||||||
|
(defvar khoj--index-timer nil
|
||||||
|
"Timer to trigger content indexing.")
|
||||||
|
|
||||||
|
(defvar khoj--indexed-files '()
|
||||||
|
"Files that were indexed in previous content indexing run.")
|
||||||
|
|
||||||
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
|
(declare-function org-element-property "org-mode" (PROPERTY ELEMENT))
|
||||||
(declare-function org-element-type "org-mode" (ELEMENT))
|
(declare-function org-element-type "org-mode" (ELEMENT))
|
||||||
(declare-function markdown-mode "markdown-mode" ())
|
(declare-function markdown-mode "markdown-mode" ())
|
||||||
|
@ -236,6 +261,11 @@ for example), set this to the full interpreter path."
|
||||||
:type 'boolean
|
:type 'boolean
|
||||||
:group 'khoj)
|
:group 'khoj)
|
||||||
|
|
||||||
|
(defcustom khoj-offline-chat-model nil
|
||||||
|
"Specify chat model to use for offline chat with khoj."
|
||||||
|
:type 'string
|
||||||
|
:group 'khoj)
|
||||||
|
|
||||||
(defcustom khoj-auto-setup t
|
(defcustom khoj-auto-setup t
|
||||||
"Automate install, configure and start of khoj server.
|
"Automate install, configure and start of khoj server.
|
||||||
Auto invokes setup steps on calling main entrypoint."
|
Auto invokes setup steps on calling main entrypoint."
|
||||||
|
@ -365,9 +395,9 @@ CONFIG is json obtained from Khoj config API."
|
||||||
(string-join "/"))))
|
(string-join "/"))))
|
||||||
|
|
||||||
(defun khoj--server-configure ()
|
(defun khoj--server-configure ()
|
||||||
"Configure the the Khoj server for search and chat."
|
"Configure the Khoj server for search and chat."
|
||||||
(interactive)
|
(interactive)
|
||||||
(let* ((org-directory-regexes (or (mapcar (lambda (dir) (format "%s/**/*.org" dir)) khoj-org-directories) json-null))
|
(let* ((url-request-method "GET")
|
||||||
(current-config
|
(current-config
|
||||||
(with-temp-buffer
|
(with-temp-buffer
|
||||||
(url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
|
(url-insert-file-contents (format "%s/api/config/data" khoj-server-url))
|
||||||
|
@ -376,56 +406,12 @@ CONFIG is json obtained from Khoj config API."
|
||||||
(with-temp-buffer
|
(with-temp-buffer
|
||||||
(url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url))
|
(url-insert-file-contents (format "%s/api/config/data/default" khoj-server-url))
|
||||||
(ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
|
(ignore-error json-end-of-file (json-parse-buffer :object-type 'alist :array-type 'list :null-object json-null :false-object json-false))))
|
||||||
(default-index-dir (khoj--get-directory-from-config default-config '(content-type org embeddings-file)))
|
|
||||||
(default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
|
(default-chat-dir (khoj--get-directory-from-config default-config '(processor conversation conversation-logfile)))
|
||||||
(chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config))))))
|
(chat-model (or khoj-chat-model (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor default-config))))))
|
||||||
(default-model (alist-get 'model (alist-get 'conversation (alist-get 'processor default-config))))
|
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
|
||||||
(enable-offline-chat (or khoj-chat-offline (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor default-config)))))
|
(offline-chat-model (or khoj-offline-chat-model (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor default-config))))))
|
||||||
(config (or current-config default-config)))
|
(config (or current-config default-config)))
|
||||||
|
|
||||||
;; Configure content types
|
|
||||||
(cond
|
|
||||||
;; If khoj backend is not configured yet
|
|
||||||
((not current-config)
|
|
||||||
(message "khoj.el: Server not configured yet.")
|
|
||||||
(setq config (delq (assoc 'content-type config) config))
|
|
||||||
(cl-pushnew `(content-type . ((org . ((input-files . ,khoj-org-files)
|
|
||||||
(input-filter . ,org-directory-regexes)
|
|
||||||
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
|
|
||||||
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
|
|
||||||
(index-heading-entries . ,json-false)))))
|
|
||||||
config))
|
|
||||||
|
|
||||||
;; Else if khoj config has no org content config
|
|
||||||
((not (alist-get 'org (alist-get 'content-type config)))
|
|
||||||
(message "khoj.el: Org-mode content on server not configured yet.")
|
|
||||||
(let ((new-content-type (alist-get 'content-type config)))
|
|
||||||
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
|
|
||||||
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
|
|
||||||
(input-filter . ,org-directory-regexes)
|
|
||||||
(compressed-jsonl . ,(format "%s/org.jsonl.gz" default-index-dir))
|
|
||||||
(embeddings-file . ,(format "%s/org.pt" default-index-dir))
|
|
||||||
(index-heading-entries . ,json-false)))
|
|
||||||
new-content-type)
|
|
||||||
(setq config (delq (assoc 'content-type config) config))
|
|
||||||
(cl-pushnew `(content-type . ,new-content-type) config)))
|
|
||||||
|
|
||||||
;; Else if khoj is not configured to index specified org files
|
|
||||||
((not (and (equal (alist-get 'input-files (alist-get 'org (alist-get 'content-type config))) khoj-org-files)
|
|
||||||
(equal (alist-get 'input-filter (alist-get 'org (alist-get 'content-type config))) org-directory-regexes)))
|
|
||||||
(message "khoj.el: Org-mode content on server is stale.")
|
|
||||||
(let* ((index-directory (khoj--get-directory-from-config config '(content-type org embeddings-file)))
|
|
||||||
(new-content-type (alist-get 'content-type config)))
|
|
||||||
(setq new-content-type (delq (assoc 'org new-content-type) new-content-type))
|
|
||||||
(cl-pushnew `(org . ((input-files . ,khoj-org-files)
|
|
||||||
(input-filter . ,org-directory-regexes)
|
|
||||||
(compressed-jsonl . ,(format "%s/org.jsonl.gz" index-directory))
|
|
||||||
(embeddings-file . ,(format "%s/org.pt" index-directory))
|
|
||||||
(index-heading-entries . ,json-false)))
|
|
||||||
new-content-type)
|
|
||||||
(setq config (delq (assoc 'content-type config) config))
|
|
||||||
(cl-pushnew `(content-type . ,new-content-type) config))))
|
|
||||||
|
|
||||||
;; Configure processors
|
;; Configure processors
|
||||||
(cond
|
(cond
|
||||||
((not khoj-openai-api-key)
|
((not khoj-openai-api-key)
|
||||||
|
@ -441,10 +427,11 @@ CONFIG is json obtained from Khoj config API."
|
||||||
|
|
||||||
;; If khoj backend isn't configured yet
|
;; If khoj backend isn't configured yet
|
||||||
((not current-config)
|
((not current-config)
|
||||||
(message "khoj.el: Chat not configured yet.")
|
(message "khoj.el: Khoj not configured yet.")
|
||||||
(setq config (delq (assoc 'processor config) config))
|
(setq config (delq (assoc 'processor config) config))
|
||||||
(cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
|
(cl-pushnew `(processor . ((conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
|
||||||
(enable-offline-chat . ,enable-offline-chat)
|
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
|
||||||
|
(chat-model . ,offline-chat-model)))
|
||||||
(openai . ((chat-model . ,chat-model)
|
(openai . ((chat-model . ,chat-model)
|
||||||
(api-key . ,khoj-openai-api-key)))))))
|
(api-key . ,khoj-openai-api-key)))))))
|
||||||
config))
|
config))
|
||||||
|
@ -455,7 +442,8 @@ CONFIG is json obtained from Khoj config API."
|
||||||
(let ((new-processor-type (alist-get 'processor config)))
|
(let ((new-processor-type (alist-get 'processor config)))
|
||||||
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
|
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
|
||||||
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
|
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" default-chat-dir))
|
||||||
(enable-offline-chat . ,enable-offline-chat)
|
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
|
||||||
|
(chat-model . ,offline-chat-model)))
|
||||||
(openai . ((chat-model . ,chat-model)
|
(openai . ((chat-model . ,chat-model)
|
||||||
(api-key . ,khoj-openai-api-key)))))
|
(api-key . ,khoj-openai-api-key)))))
|
||||||
new-processor-type)
|
new-processor-type)
|
||||||
|
@ -465,13 +453,15 @@ CONFIG is json obtained from Khoj config API."
|
||||||
;; Else if chat configuration in khoj backend has gone stale
|
;; Else if chat configuration in khoj backend has gone stale
|
||||||
((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key)
|
((not (and (equal (alist-get 'api-key (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-openai-api-key)
|
||||||
(equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model)
|
(equal (alist-get 'chat-model (alist-get 'openai (alist-get 'conversation (alist-get 'processor config)))) khoj-chat-model)
|
||||||
(equal (alist-get 'enable-offline-chat (alist-get 'conversation (alist-get 'processor config))) enable-offline-chat)))
|
(equal (alist-get 'enable-offline-chat (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) enable-offline-chat)
|
||||||
|
(equal (alist-get 'chat-model (alist-get 'offline-chat (alist-get 'conversation (alist-get 'processor config)))) offline-chat-model)))
|
||||||
(message "khoj.el: Chat configuration has gone stale.")
|
(message "khoj.el: Chat configuration has gone stale.")
|
||||||
(let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
|
(let* ((chat-directory (khoj--get-directory-from-config config '(processor conversation conversation-logfile)))
|
||||||
(new-processor-type (alist-get 'processor config)))
|
(new-processor-type (alist-get 'processor config)))
|
||||||
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
|
(setq new-processor-type (delq (assoc 'conversation new-processor-type) new-processor-type))
|
||||||
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
|
(cl-pushnew `(conversation . ((conversation-logfile . ,(format "%s/conversation.json" chat-directory))
|
||||||
(enable-offline-chat . ,enable-offline-chat)
|
(offline-chat . ((enable-offline-chat . ,enable-offline-chat)
|
||||||
|
(chat-model . ,offline-chat-model)))
|
||||||
(openai . ((chat-model . ,khoj-chat-model)
|
(openai . ((chat-model . ,khoj-chat-model)
|
||||||
(api-key . ,khoj-openai-api-key)))))
|
(api-key . ,khoj-openai-api-key)))))
|
||||||
new-processor-type)
|
new-processor-type)
|
||||||
|
@ -509,9 +499,75 @@ CONFIG is json obtained from Khoj config API."
|
||||||
(khoj--server-configure))))
|
(khoj--server-configure))))
|
||||||
|
|
||||||
|
|
||||||
;; -----------------------------------------------
|
;; -------------------
|
||||||
;; Extract and Render Entries of each Content Type
|
;; Khoj Index Content
|
||||||
;; -----------------------------------------------
|
;; -------------------
|
||||||
|
|
||||||
|
(defun khoj--server-index-files (&optional force content-type file-paths)
  "Send files at `FILE-PATHS' to the Khoj server to index for search and chat.
`FORCE' re-indexes all files of `CONTENT-TYPE' even if they are already indexed.
`FORCE' may be nil, t or one of the strings \"true\" and \"false\"; nil and
\"false\" both mean no forced re-index.
When `FILE-PATHS' is nil, the org files found under `khoj-org-directories'
along with `khoj-org-files' are sent instead."
  (interactive)
  (let ((boundary (format "-------------------------%d" (random (expt 10 10))))
        (files-to-index (or file-paths
                            (append (mapcan (lambda (dir) (directory-files-recursively dir "\\.org$")) khoj-org-directories) khoj-org-files)))
        ;; no content type filter is sent when indexing all content types
        (type-query (if (or (equal content-type "all") (not content-type)) "" (format "t=%s" content-type)))
        ;; the update command passes FORCE as the string "true" or "false",
        ;; so a plain non-nil check would treat "false" as a forced update
        (force-update (and force (not (equal force "false"))))
        (inhibit-message t)
        (message-log-max nil))
    (let ((url-request-method "POST")
          ;; previously indexed files are passed along so files no longer present get an empty part
          (url-request-data (khoj--render-files-as-request-body files-to-index khoj--indexed-files boundary))
          (url-request-extra-headers `(("content-type" . ,(format "multipart/form-data; boundary=%s" boundary))
                                       ("x-api-key" . ,khoj-server-api-key))))
      (url-retrieve (format "%s/api/v1/index/update?%s&force=%s&client=emacs" khoj-server-url type-query (if force-update "true" "false"))
                    ;; render response from indexing API endpoint on server
                    (lambda (status)
                      (if (not status)
                          (message "khoj.el: %scontent index %supdated" (if content-type (format "%s " content-type) "") (if force-update "force " ""))
                        ;; move past the blank line that separates the response headers from its body.
                        ;; `goto-char' needs a buffer position, so search for the separator instead
                        (goto-char (point-min))
                        (search-forward "\n\n" nil t)
                        (message "khoj.el: Failed to %supdate %s content index. Status: %s. Response: %s"
                                 (if force-update "force " "")
                                 content-type
                                 status
                                 (string-trim (buffer-substring-no-properties (point) (point-max))))))
                    nil t t))
    ;; remember files sent in this request to detect deleted files on the next request
    (setq khoj--indexed-files files-to-index)))
|
||||||
|
|
||||||
|
(defun khoj--render-files-as-request-body (files-to-index previously-indexed-files boundary)
  "Return multi-part form body for `FILES-TO-INDEX' as a string.
Each file in `FILES-TO-INDEX' becomes a form part holding its literal contents.
Each file in `PREVIOUSLY-INDEXED-FILES' that is not in `FILES-TO-INDEX' becomes
a form part with an empty body.  Parts are delimited using `BOUNDARY'."
  (let ((part-delimiter (format "--%s\r\n" boundary))
        (part-content-type "Content-Type: text/org\r\n\r\n"))
    (with-temp-buffer
      (set-buffer-multibyte nil)
      (insert "\n")
      ;; one part per file to index, carrying the file contents
      (dolist (path files-to-index)
        (insert part-delimiter
                (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" path)
                part-content-type
                (with-temp-buffer
                  (insert-file-contents-literally path)
                  (buffer-string))
                "\r\n"))
      ;; one empty part per previously indexed file that is no longer in the files to index
      (dolist (stale-path previously-indexed-files)
        (unless (member stale-path files-to-index)
          (insert part-delimiter
                  (format "Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n" stale-path)
                  part-content-type
                  "\r\n")))
      ;; closing delimiter ends the multi-part body
      (insert (format "--%s--\r\n" boundary))
      (buffer-string))))
|
||||||
|
|
||||||
|
;; Stop a previously scheduled indexing timer, if any, before scheduling a new one
(if khoj--index-timer
    (cancel-timer khoj--index-timer))
;; Index files on the server 60 seconds from now, then every `khoj-index-interval' seconds
(setq khoj--index-timer (run-with-timer 60
                                        khoj-index-interval
                                        'khoj--server-index-files))
|
||||||
|
|
||||||
|
|
||||||
|
;; -------------------------------------------
|
||||||
|
;; Render Response from Khoj server for Emacs
|
||||||
|
;; -------------------------------------------
|
||||||
|
|
||||||
(defun khoj--extract-entries-as-markdown (json-response query)
|
(defun khoj--extract-entries-as-markdown (json-response query)
|
||||||
"Convert JSON-RESPONSE, QUERY from API to markdown entries."
|
"Convert JSON-RESPONSE, QUERY from API to markdown entries."
|
||||||
|
@ -920,6 +976,9 @@ RECEIVE-DATE is the message receive date."
|
||||||
(message "khoj.el: Teardown Incremental Search")
|
(message "khoj.el: Teardown Incremental Search")
|
||||||
;; unset khoj minibuffer window
|
;; unset khoj minibuffer window
|
||||||
(setq khoj--minibuffer-window nil)
|
(setq khoj--minibuffer-window nil)
|
||||||
|
(when (and khoj--search-on-idle-timer
|
||||||
|
(timerp khoj--search-on-idle-timer))
|
||||||
|
(cancel-timer khoj--search-on-idle-timer))
|
||||||
;; delete open connections to khoj server
|
;; delete open connections to khoj server
|
||||||
(khoj--delete-open-network-connections-to-server)
|
(khoj--delete-open-network-connections-to-server)
|
||||||
;; remove hooks for khoj incremental query and self
|
;; remove hooks for khoj incremental query and self
|
||||||
|
@ -942,8 +1001,10 @@ RECEIVE-DATE is the message receive date."
|
||||||
;; set current (mini-)buffer entered as khoj minibuffer
|
;; set current (mini-)buffer entered as khoj minibuffer
|
||||||
;; used to query khoj API only when user in khoj minibuffer
|
;; used to query khoj API only when user in khoj minibuffer
|
||||||
(setq khoj--minibuffer-window (current-buffer))
|
(setq khoj--minibuffer-window (current-buffer))
|
||||||
(add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action
|
; do khoj incremental search after idle time
|
||||||
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit
|
(setq khoj--search-on-idle-timer (run-with-idle-timer khoj-search-on-idle-time t #'khoj--incremental-search))
|
||||||
|
; teardown khoj incremental search on minibuffer exit
|
||||||
|
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search))
|
||||||
(read-string khoj--query-prompt))))
|
(read-string khoj--query-prompt))))
|
||||||
|
|
||||||
|
|
||||||
|
@ -1014,17 +1075,20 @@ Paragraph only starts at first text after blank line."
|
||||||
;; Khoj Menu
|
;; Khoj Menu
|
||||||
;; ---------
|
;; ---------
|
||||||
|
|
||||||
(transient-define-argument khoj--content-type-switch ()
|
(defun khoj--setup-and-show-menu ()
|
||||||
:class 'transient-switches
|
"Create Transient menu for khoj and show it."
|
||||||
:argument-format "--content-type=%s"
|
;; Create the Khoj Transient menu
|
||||||
:argument-regexp ".+"
|
(transient-define-argument khoj--content-type-switch ()
|
||||||
;; set content type to: last used > based on current buffer > default type
|
:class 'transient-switches
|
||||||
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
|
:argument-format "--content-type=%s"
|
||||||
;; dynamically set choices to content types enabled on khoj backend
|
:argument-regexp ".+"
|
||||||
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
|
;; set content type to: last used > based on current buffer > default type
|
||||||
|
:init-value (lambda (obj) (oset obj value (format "--content-type=%s" (or khoj--content-type (khoj--buffer-name-to-content-type (buffer-name))))))
|
||||||
|
;; dynamically set choices to content types enabled on khoj backend
|
||||||
|
:choices (or (ignore-errors (mapcar #'symbol-name (khoj--get-enabled-content-types))) '("all" "org" "markdown" "pdf" "image")))
|
||||||
|
|
||||||
(transient-define-suffix khoj--search-command (&optional args)
|
(transient-define-suffix khoj--search-command (&optional args)
|
||||||
(interactive (list (transient-args transient-current-command)))
|
(interactive (list (transient-args transient-current-command)))
|
||||||
(progn
|
(progn
|
||||||
;; set content type to: specified > last used > based on current buffer > default type
|
;; set content type to: specified > last used > based on current buffer > default type
|
||||||
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||||
|
@ -1033,9 +1097,9 @@ Paragraph only starts at first text after blank line."
|
||||||
;; trigger incremental search
|
;; trigger incremental search
|
||||||
(call-interactively #'khoj-incremental)))
|
(call-interactively #'khoj-incremental)))
|
||||||
|
|
||||||
(transient-define-suffix khoj--find-similar-command (&optional args)
|
(transient-define-suffix khoj--find-similar-command (&optional args)
|
||||||
"Find items similar to current item at point."
|
"Find items similar to current item at point."
|
||||||
(interactive (list (transient-args transient-current-command)))
|
(interactive (list (transient-args transient-current-command)))
|
||||||
(progn
|
(progn
|
||||||
;; set content type to: specified > last used > based on current buffer > default type
|
;; set content type to: specified > last used > based on current buffer > default type
|
||||||
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
(setq khoj--content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||||
|
@ -1043,37 +1107,38 @@ Paragraph only starts at first text after blank line."
|
||||||
(setq khoj-results-count (or (transient-arg-value "--results-count=" args) khoj-results-count))
|
(setq khoj-results-count (or (transient-arg-value "--results-count=" args) khoj-results-count))
|
||||||
(khoj--find-similar khoj--content-type)))
|
(khoj--find-similar khoj--content-type)))
|
||||||
|
|
||||||
(transient-define-suffix khoj--update-command (&optional args)
|
(transient-define-suffix khoj--update-command (&optional args)
|
||||||
"Call khoj API to update index of specified content type."
|
"Call khoj API to update index of specified content type."
|
||||||
(interactive (list (transient-args transient-current-command)))
|
(interactive (list (transient-args transient-current-command)))
|
||||||
(let* ((force-update (if (member "--force-update" args) "true" "false"))
|
(let* ((force-update (if (member "--force-update" args) "true" "false"))
|
||||||
;; set content type to: specified > last used > based on current buffer > default type
|
;; set content type to: specified > last used > based on current buffer > default type
|
||||||
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
(content-type (or (transient-arg-value "--content-type=" args) (khoj--buffer-name-to-content-type (buffer-name))))
|
||||||
(type-query (if (equal content-type "all") "" (format "t=%s" content-type)))
|
(url-request-method "GET"))
|
||||||
(update-url (format "%s/api/update?%s&force=%s&client=emacs" khoj-server-url type-query force-update))
|
(progn
|
||||||
(url-request-method "GET"))
|
(setq khoj--content-type content-type)
|
||||||
(progn
|
(khoj--server-index-files force-update content-type))))
|
||||||
(setq khoj--content-type content-type)
|
|
||||||
(url-retrieve update-url (lambda (_) (message "khoj.el: %s index %supdated!" content-type (if (member "--force-update" args) "force " "")))))))
|
|
||||||
|
|
||||||
(transient-define-suffix khoj--chat-command (&optional _)
|
(transient-define-suffix khoj--chat-command (&optional _)
|
||||||
"Command to Chat with Khoj."
|
"Command to Chat with Khoj."
|
||||||
(interactive (list (transient-args transient-current-command)))
|
(interactive (list (transient-args transient-current-command)))
|
||||||
(khoj--chat))
|
(khoj--chat))
|
||||||
|
|
||||||
(transient-define-prefix khoj--menu ()
|
(transient-define-prefix khoj--menu ()
|
||||||
"Create Khoj Menu to Configure and Execute Commands."
|
"Create Khoj Menu to Configure and Execute Commands."
|
||||||
[["Configure Search"
|
[["Configure Search"
|
||||||
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
|
("n" "Results Count" "--results-count=" :init-value (lambda (obj) (oset obj value (format "%s" khoj-results-count))))
|
||||||
("t" "Content Type" khoj--content-type-switch)]
|
("t" "Content Type" khoj--content-type-switch)]
|
||||||
["Configure Update"
|
["Configure Update"
|
||||||
("-f" "Force Update" "--force-update")]]
|
("-f" "Force Update" "--force-update")]]
|
||||||
[["Act"
|
[["Act"
|
||||||
("c" "Chat" khoj--chat-command)
|
("c" "Chat" khoj--chat-command)
|
||||||
("s" "Search" khoj--search-command)
|
("s" "Search" khoj--search-command)
|
||||||
("f" "Find Similar" khoj--find-similar-command)
|
("f" "Find Similar" khoj--find-similar-command)
|
||||||
("u" "Update" khoj--update-command)
|
("u" "Update" khoj--update-command)
|
||||||
("q" "Quit" transient-quit-one)]])
|
("q" "Quit" transient-quit-one)]])
|
||||||
|
|
||||||
|
;; Show the Khoj Transient menu
|
||||||
|
(khoj--menu))
|
||||||
|
|
||||||
|
|
||||||
;; ----------
|
;; ----------
|
||||||
|
@ -1086,7 +1151,7 @@ Paragraph only starts at first text after blank line."
|
||||||
(interactive)
|
(interactive)
|
||||||
(when khoj-auto-setup
|
(when khoj-auto-setup
|
||||||
(khoj-setup t))
|
(khoj-setup t))
|
||||||
(khoj--menu))
|
(khoj--setup-and-show-menu))
|
||||||
|
|
||||||
(provide 'khoj)
|
(provide 'khoj)
|
||||||
|
|
||||||
|
|
|
@ -206,6 +206,64 @@ Rule everything\n")
|
||||||
"Rule everything"))
|
"Rule everything"))
|
||||||
))
|
))
|
||||||
|
|
||||||
|
|
||||||
|
;; -------------------------------------
|
||||||
|
;; Test Helpers to Index Content
|
||||||
|
;; -------------------------------------
|
||||||
|
|
||||||
|
(ert-deftest khoj-tests--render-files-to-add-request-body ()
  "Test files are formatted into a multi-part http request body"
  ;; create two temporary org files; their paths and contents should appear in the rendered body
  (let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
        (act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
    (unwind-protect
        (progn
          (should
           (equal
            ;; no previously indexed files, so only parts for the files to index are expected
            (khoj--render-files-as-request-body (list upgrade-file act-file) '() "khoj")
            ;; in the expected string a trailing backslash continues the literal without adding a newline.
            ;; a line ending in \r without a trailing backslash uses the literal newline to complete \r\n
            (format
             "\n--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
# Become God\n\
## Upgrade\n\n\
Penance to Immortality\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
## Act\n\n\
Rule everything\n\n\r\n\
--khoj--\r\n" upgrade-file act-file))))
      ;; always remove the temporary files, even if the assertion fails
      (delete-file upgrade-file)
      (delete-file act-file))))
|
||||||
|
|
||||||
|
(ert-deftest khoj-tests--render-files-to-add-delete-in-request-body ()
  "Test files to add and delete are formatted into a multi-part http request body"
  ;; create two temporary org files; their paths and contents should appear in the rendered body
  (let ((upgrade-file (make-temp-file "upgrade" nil ".org" "# Become God\n## Upgrade\n\nPenance to Immortality\n\n"))
        (act-file (make-temp-file "act" nil ".org" "## Act\n\nRule everything\n\n")))
    (unwind-protect
        (progn
          (should
           (equal
            ;; "/tmp/deleted-file.org" was previously indexed but is not in the files to index,
            ;; so it is expected as a final part with an empty body
            (khoj--render-files-as-request-body (list upgrade-file act-file) (list upgrade-file act-file "/tmp/deleted-file.org") "khoj")
            ;; in the expected string a trailing backslash continues the literal without adding a newline.
            ;; a line ending in \r without a trailing backslash uses the literal newline to complete \r\n
            (format
             "\n--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
# Become God\n\
## Upgrade\n\n\
Penance to Immortality\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
## Act\n\n\
Rule everything\n\n\r
--khoj\r\n\
Content-Disposition: form-data; name=\"files\"; filename=\"%s\"\r\n\
Content-Type: text/org\r\n\r\n\
\r
--khoj--\r\n" upgrade-file act-file "/tmp/deleted-file.org"))))
      ;; always remove the temporary files, even if the assertion fails
      (delete-file upgrade-file)
      (delete-file act-file))))
|
||||||
|
|
||||||
(provide 'khoj-tests)
|
(provide 'khoj-tests)
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"id": "khoj",
|
"id": "khoj",
|
||||||
"name": "Khoj",
|
"name": "Khoj",
|
||||||
"version": "0.12.3",
|
"version": "0.13.0",
|
||||||
"minAppVersion": "0.15.0",
|
"minAppVersion": "0.15.0",
|
||||||
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
|
"description": "An Open-Source AI Personal Assistant for your Digital Brain",
|
||||||
"author": "Khoj Inc.",
|
"author": "Khoj Inc.",
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
{
|
{
|
||||||
"name": "Khoj",
|
"name": "Khoj",
|
||||||
"version": "0.12.3",
|
"version": "0.13.0",
|
||||||
"description": "An AI Personal Assistant for your Digital Brain",
|
"description": "An AI copilot for your Second Brain",
|
||||||
|
"author": "Debanjum Singh Solanky, Saba Imran <team@khoj.dev>",
|
||||||
|
"license": "GPL-3.0-or-later",
|
||||||
"main": "src/main.js",
|
"main": "src/main.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "node esbuild.config.mjs",
|
"dev": "node esbuild.config.mjs",
|
||||||
|
@ -14,8 +16,6 @@
|
||||||
"AI",
|
"AI",
|
||||||
"assistant"
|
"assistant"
|
||||||
],
|
],
|
||||||
"author": "Debanjum Singh Solanky",
|
|
||||||
"license": "GPL-3.0-or-later",
|
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^16.11.6",
|
"@types/node": "^16.11.6",
|
||||||
"@typescript-eslint/eslint-plugin": "5.29.0",
|
"@typescript-eslint/eslint-plugin": "5.29.0",
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
import { Notice, Plugin } from 'obsidian';
|
import { Notice, Plugin, TFile } from 'obsidian';
|
||||||
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
|
import { KhojSetting, KhojSettingTab, DEFAULT_SETTINGS } from 'src/settings'
|
||||||
import { KhojSearchModal } from 'src/search_modal'
|
import { KhojSearchModal } from 'src/search_modal'
|
||||||
import { KhojChatModal } from 'src/chat_modal'
|
import { KhojChatModal } from 'src/chat_modal'
|
||||||
import { configureKhojBackend } from './utils';
|
import { configureKhojBackend, updateContentIndex } from './utils';
|
||||||
|
|
||||||
|
|
||||||
export default class Khoj extends Plugin {
|
export default class Khoj extends Plugin {
|
||||||
settings: KhojSetting;
|
settings: KhojSetting;
|
||||||
|
indexingTimer: NodeJS.Timeout;
|
||||||
|
|
||||||
async onload() {
|
async onload() {
|
||||||
await this.loadSettings();
|
await this.loadSettings();
|
||||||
|
@ -54,6 +55,15 @@ export default class Khoj extends Plugin {
|
||||||
|
|
||||||
// Add a settings tab so the user can configure khoj
|
// Add a settings tab so the user can configure khoj
|
||||||
this.addSettingTab(new KhojSettingTab(this.app, this));
|
this.addSettingTab(new KhojSettingTab(this.app, this));
|
||||||
|
|
||||||
|
// Add scheduled job to update index every 60 minutes
|
||||||
|
this.indexingTimer = setInterval(async () => {
|
||||||
|
if (this.settings.autoConfigure) {
|
||||||
|
this.settings.lastSyncedFiles = await updateContentIndex(
|
||||||
|
this.app.vault, this.settings, this.settings.lastSyncedFiles
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}, 60 * 60 * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
async loadSettings() {
|
async loadSettings() {
|
||||||
|
@ -72,4 +82,12 @@ export default class Khoj extends Plugin {
|
||||||
}
|
}
|
||||||
this.saveData(this.settings);
|
this.saveData(this.settings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up when the plugin is unloaded: stop the scheduled index update job,
// then call unload() on the plugin.
async onunload() {
    // Remove scheduled job to update index at regular cadence
    if (this.indexingTimer)
        clearInterval(this.indexingTimer);

    // NOTE(review): unload() is inherited from a base class not visible here — confirm calling it from onunload is intended
    this.unload();
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import { App, Notice, PluginSettingTab, request, Setting } from 'obsidian';
|
import { App, Notice, PluginSettingTab, Setting, TFile } from 'obsidian';
|
||||||
import Khoj from 'src/main';
|
import Khoj from 'src/main';
|
||||||
|
import { updateContentIndex } from './utils';
|
||||||
|
|
||||||
export interface KhojSetting {
|
export interface KhojSetting {
|
||||||
enableOfflineChat: boolean;
|
enableOfflineChat: boolean;
|
||||||
|
@ -8,6 +9,7 @@ export interface KhojSetting {
|
||||||
khojUrl: string;
|
khojUrl: string;
|
||||||
connectedToBackend: boolean;
|
connectedToBackend: boolean;
|
||||||
autoConfigure: boolean;
|
autoConfigure: boolean;
|
||||||
|
lastSyncedFiles: TFile[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export const DEFAULT_SETTINGS: KhojSetting = {
|
export const DEFAULT_SETTINGS: KhojSetting = {
|
||||||
|
@ -17,6 +19,7 @@ export const DEFAULT_SETTINGS: KhojSetting = {
|
||||||
connectedToBackend: false,
|
connectedToBackend: false,
|
||||||
autoConfigure: true,
|
autoConfigure: true,
|
||||||
openaiApiKey: '',
|
openaiApiKey: '',
|
||||||
|
lastSyncedFiles: []
|
||||||
}
|
}
|
||||||
|
|
||||||
export class KhojSettingTab extends PluginSettingTab {
|
export class KhojSettingTab extends PluginSettingTab {
|
||||||
|
@ -118,8 +121,9 @@ export class KhojSettingTab extends PluginSettingTab {
|
||||||
}, 300);
|
}, 300);
|
||||||
this.plugin.registerInterval(progress_indicator);
|
this.plugin.registerInterval(progress_indicator);
|
||||||
|
|
||||||
await request(`${this.plugin.settings.khojUrl}/api/update?t=markdown&force=true&client=obsidian`);
|
this.plugin.settings.lastSyncedFiles = await updateContentIndex(
|
||||||
await request(`${this.plugin.settings.khojUrl}/api/update?t=pdf&force=true&client=obsidian`);
|
this.app.vault, this.plugin.settings, this.plugin.settings.lastSyncedFiles, true
|
||||||
|
);
|
||||||
new Notice('✅ Updated Khoj index.');
|
new Notice('✅ Updated Khoj index.');
|
||||||
|
|
||||||
// Reset button once index is updated
|
// Reset button once index is updated
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal } from 'obsidian';
|
import { FileSystemAdapter, Notice, RequestUrlParam, request, Vault, Modal, TFile } from 'obsidian';
|
||||||
import { KhojSetting } from 'src/settings'
|
import { KhojSetting } from 'src/settings'
|
||||||
|
|
||||||
export function getVaultAbsolutePath(vault: Vault): string {
|
export function getVaultAbsolutePath(vault: Vault): string {
|
||||||
|
@ -14,18 +14,85 @@ type OpenAIType = null | {
|
||||||
"api-key": string;
|
"api-key": string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Shape of the "offline-chat" entry in the conversation processor config.
// The entry may be null; otherwise it names the chat model and says whether offline chat is enabled.
type OfflineChatType = null | {
    "chat-model": string;
    "enable-offline-chat": boolean;
};
|
||||||
|
|
||||||
interface ProcessorData {
|
interface ProcessorData {
|
||||||
conversation: {
|
conversation: {
|
||||||
"conversation-logfile": string;
|
"conversation-logfile": string;
|
||||||
openai: OpenAIType;
|
openai: OpenAIType;
|
||||||
"enable-offline-chat": boolean;
|
"offline-chat": OfflineChatType;
|
||||||
|
"tokenizer": null | string;
|
||||||
|
"max-prompt-size": null | number;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Map a file extension (without the leading dot) to the MIME type used when uploading the file.
// Extensions that are not listed fall back to 'text/plain'.
function fileExtensionToMimeType (extension: string): string {
    // A Map is used rather than a plain object so only the listed extensions ever match
    const mimeTypeByExtension = new Map<string, string>([
        ['pdf', 'application/pdf'],
        ['png', 'image/png'],
        ['jpg', 'image/jpeg'],
        ['jpeg', 'image/jpeg'],
        ['md', 'text/markdown'],
        ['markdown', 'text/markdown'],
        ['org', 'text/org'],
    ]);
    return mimeTypeByExtension.get(extension) ?? 'text/plain';
}
|
||||||
|
|
||||||
|
// Send all markdown and pdf files in the vault to the Khoj server to update its content index.
//
// vault:           vault whose files are indexed
// setting:         plugin settings; only `khojUrl` is read here
// lastSyncedFiles: files sent on the previous successful update; any of them no longer
//                  among the files to index is sent as an empty part
// regenerate:      passed to the server as the `force` query parameter
//
// Returns the list of files to remember as last synced: the files just sent when the
// server accepted the update, else `lastSyncedFiles` unchanged so that pending deletions
// are sent again on the next update.
export async function updateContentIndex(vault: Vault, setting: KhojSetting, lastSyncedFiles: TFile[], regenerate: boolean = false): Promise<TFile[]> {
    // Get all markdown, pdf files in the vault
    console.log(`Khoj: Updating Khoj content index...`)
    const files = vault.getFiles().filter(file => file.extension === 'md' || file.extension === 'pdf');
    const binaryFileTypes = ['pdf', 'png', 'jpg', 'jpeg']
    let countOfFilesToIndex = 0;
    let countOfFilesToDelete = 0;

    // Add all files to index as multipart form data
    const formData = new FormData();
    for (const file of files) {
        countOfFilesToIndex++;
        const encoding = binaryFileTypes.includes(file.extension) ? "binary" : "utf8";
        const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
        const fileContent = encoding === 'binary' ? await vault.readBinary(file) : await vault.read(file);
        formData.append('files', new Blob([fileContent], { type: mimeType }), file.path);
    }

    // Add any previously synced files to be deleted to multipart form data.
    // Files are matched by path rather than by object identity, so a previously synced
    // file is still recognized when it is represented by a different object.
    // The set also avoids rescanning the whole file list for every previously synced file.
    const pathsToIndex = new Set(files.map(file => file.path));
    for (const lastSyncedFile of lastSyncedFiles) {
        if (!pathsToIndex.has(lastSyncedFile.path)) {
            countOfFilesToDelete++;
            formData.append('files', new Blob([]), lastSyncedFile.path);
        }
    }

    // Call Khoj backend to update index with all markdown, pdf files
    const response = await fetch(`${setting.khojUrl}/api/v1/index/update?force=${regenerate}&client=obsidian`, {
        method: 'POST',
        headers: {
            'x-api-key': 'secret',
        },
        body: formData,
    });

    if (!response.ok) {
        new Notice(`❗️Failed to update Khoj content index. Ensure Khoj server connected or raise issue on Khoj Discord/Github\nError: ${response.statusText}`);
        // The server did not apply this update, so keep the previous list.
        // Otherwise files deleted since the last successful update would never be sent for deletion.
        return lastSyncedFiles;
    }

    console.log(`✅ Refreshed Khoj content index. Updated: ${countOfFilesToIndex} files, Deleted: ${countOfFilesToDelete} files.`);
    return files;
}
|
||||||
|
|
||||||
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
|
export async function configureKhojBackend(vault: Vault, setting: KhojSetting, notify: boolean = true) {
|
||||||
let vaultPath = getVaultAbsolutePath(vault);
|
|
||||||
let mdInVault = `${vaultPath}/**/*.md`;
|
|
||||||
let pdfInVault = `${vaultPath}/**/*.pdf`;
|
|
||||||
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
|
let khojConfigUrl = `${setting.khojUrl}/api/config/data`;
|
||||||
|
|
||||||
// Check if khoj backend is configured, note if cannot connect to backend
|
// Check if khoj backend is configured, note if cannot connect to backend
|
||||||
|
@ -43,124 +110,33 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||||
if (!setting.connectedToBackend) return;
|
if (!setting.connectedToBackend) return;
|
||||||
|
|
||||||
// Set index name from the path of the current vault
|
// Set index name from the path of the current vault
|
||||||
let indexName = vaultPath.replace(/\//g, '_').replace(/\\/g, '_').replace(/ /g, '_').replace(/:/g, '_');
|
|
||||||
// Get default config fields from khoj backend
|
// Get default config fields from khoj backend
|
||||||
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
|
let defaultConfig = await request(`${khojConfigUrl}/default`).then(response => JSON.parse(response));
|
||||||
let khojDefaultMdIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["markdown"]["embeddings-file"]);
|
|
||||||
let khojDefaultPdfIndexDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["content-type"]["pdf"]["embeddings-file"]);
|
|
||||||
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
|
let khojDefaultChatDirectory = getIndexDirectoryFromBackendConfig(defaultConfig["processor"]["conversation"]["conversation-logfile"]);
|
||||||
let khojDefaultChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
|
let khojDefaultOpenAIChatModelName = defaultConfig["processor"]["conversation"]["openai"]["chat-model"];
|
||||||
|
let khojDefaultOfflineChatModelName = defaultConfig["processor"]["conversation"]["offline-chat"]["chat-model"];
|
||||||
|
|
||||||
// Get current config if khoj backend configured, else get default config from khoj backend
|
// Get current config if khoj backend configured, else get default config from khoj backend
|
||||||
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
|
await request(khoj_already_configured ? khojConfigUrl : `${khojConfigUrl}/default`)
|
||||||
.then(response => JSON.parse(response))
|
.then(response => JSON.parse(response))
|
||||||
.then(data => {
|
.then(data => {
|
||||||
khoj_already_configured = data["content-type"] != null;
|
|
||||||
// If khoj backend not configured yet
|
|
||||||
if (!khoj_already_configured) {
|
|
||||||
// Create khoj content-type config with only markdown configured
|
|
||||||
data["content-type"] = {
|
|
||||||
"markdown": {
|
|
||||||
"input-filter": [mdInVault],
|
|
||||||
"input-files": null,
|
|
||||||
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
|
|
||||||
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
|
||||||
|
|
||||||
if (hasPdfFiles) {
|
|
||||||
data["content-type"]["pdf"] = {
|
|
||||||
"input-filter": [pdfInVault],
|
|
||||||
"input-files": null,
|
|
||||||
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
|
|
||||||
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Else if khoj config has no markdown content config
|
|
||||||
else if (!data["content-type"]["markdown"]) {
|
|
||||||
// Add markdown config to khoj content-type config
|
|
||||||
// Set markdown config to index markdown files in configured obsidian vault
|
|
||||||
data["content-type"]["markdown"] = {
|
|
||||||
"input-filter": [mdInVault],
|
|
||||||
"input-files": null,
|
|
||||||
"embeddings-file": `${khojDefaultMdIndexDirectory}/${indexName}.pt`,
|
|
||||||
"compressed-jsonl": `${khojDefaultMdIndexDirectory}/${indexName}.jsonl.gz`,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Else if khoj is not configured to index markdown files in configured obsidian vault
|
|
||||||
else if (
|
|
||||||
data["content-type"]["markdown"]["input-files"] != null ||
|
|
||||||
data["content-type"]["markdown"]["input-filter"] == null ||
|
|
||||||
data["content-type"]["markdown"]["input-filter"].length != 1 ||
|
|
||||||
data["content-type"]["markdown"]["input-filter"][0] !== mdInVault) {
|
|
||||||
// Update markdown config in khoj content-type config
|
|
||||||
// Set markdown config to only index markdown files in configured obsidian vault
|
|
||||||
let khojMdIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["markdown"]["embeddings-file"]);
|
|
||||||
data["content-type"]["markdown"] = {
|
|
||||||
"input-filter": [mdInVault],
|
|
||||||
"input-files": null,
|
|
||||||
"embeddings-file": `${khojMdIndexDirectory}/${indexName}.pt`,
|
|
||||||
"compressed-jsonl": `${khojMdIndexDirectory}/${indexName}.jsonl.gz`,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (khoj_already_configured && !data["content-type"]["pdf"]) {
|
|
||||||
const hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
|
||||||
|
|
||||||
if (hasPdfFiles) {
|
|
||||||
data["content-type"]["pdf"] = {
|
|
||||||
"input-filter": [pdfInVault],
|
|
||||||
"input-files": null,
|
|
||||||
"embeddings-file": `${khojDefaultPdfIndexDirectory}/${indexName}.pt`,
|
|
||||||
"compressed-jsonl": `${khojDefaultPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
data["content-type"]["pdf"] = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Else if khoj is not configured to index pdf files in configured obsidian vault
|
|
||||||
else if (khoj_already_configured &&
|
|
||||||
(
|
|
||||||
data["content-type"]["pdf"]["input-files"] != null ||
|
|
||||||
data["content-type"]["pdf"]["input-filter"] == null ||
|
|
||||||
data["content-type"]["pdf"]["input-filter"].length != 1 ||
|
|
||||||
data["content-type"]["pdf"]["input-filter"][0] !== pdfInVault)) {
|
|
||||||
|
|
||||||
let hasPdfFiles = app.vault.getFiles().some(file => file.extension === 'pdf');
|
|
||||||
|
|
||||||
if (hasPdfFiles) {
|
|
||||||
// Update pdf config in khoj content-type config
|
|
||||||
// Set pdf config to only index pdf files in configured obsidian vault
|
|
||||||
let khojPdfIndexDirectory = getIndexDirectoryFromBackendConfig(data["content-type"]["pdf"]["embeddings-file"]);
|
|
||||||
data["content-type"]["pdf"] = {
|
|
||||||
"input-filter": [pdfInVault],
|
|
||||||
"input-files": null,
|
|
||||||
"embeddings-file": `${khojPdfIndexDirectory}/${indexName}.pt`,
|
|
||||||
"compressed-jsonl": `${khojPdfIndexDirectory}/${indexName}.jsonl.gz`,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
data["content-type"]["pdf"] = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
|
let conversationLogFile = data?.["processor"]?.["conversation"]?.["conversation-logfile"] ?? `${khojDefaultChatDirectory}/conversation.json`;
|
||||||
|
|
||||||
let processorData: ProcessorData = {
|
let processorData: ProcessorData = {
|
||||||
"conversation": {
|
"conversation": {
|
||||||
"conversation-logfile": conversationLogFile,
|
"conversation-logfile": conversationLogFile,
|
||||||
"openai": null,
|
"openai": null,
|
||||||
"enable-offline-chat": setting.enableOfflineChat,
|
"offline-chat": {
|
||||||
|
"chat-model": khojDefaultOfflineChatModelName,
|
||||||
|
"enable-offline-chat": setting.enableOfflineChat,
|
||||||
|
},
|
||||||
|
"tokenizer": null,
|
||||||
|
"max-prompt-size": null,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the Open AI API Key was configured in the plugin settings
|
// If the Open AI API Key was configured in the plugin settings
|
||||||
if (!!setting.openaiApiKey) {
|
if (!!setting.openaiApiKey) {
|
||||||
|
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultOpenAIChatModelName;
|
||||||
let openAIChatModel = data?.["processor"]?.["conversation"]?.["openai"]?.["chat-model"] ?? khojDefaultChatModelName;
|
|
||||||
|
|
||||||
processorData = {
|
processorData = {
|
||||||
"conversation": {
|
"conversation": {
|
||||||
"conversation-logfile": conversationLogFile,
|
"conversation-logfile": conversationLogFile,
|
||||||
|
@ -168,7 +144,12 @@ export async function configureKhojBackend(vault: Vault, setting: KhojSetting, n
|
||||||
"chat-model": openAIChatModel,
|
"chat-model": openAIChatModel,
|
||||||
"api-key": setting.openaiApiKey,
|
"api-key": setting.openaiApiKey,
|
||||||
},
|
},
|
||||||
"enable-offline-chat": setting.enableOfflineChat,
|
"offline-chat": {
|
||||||
|
"chat-model": khojDefaultOfflineChatModelName,
|
||||||
|
"enable-offline-chat": setting.enableOfflineChat,
|
||||||
|
},
|
||||||
|
"tokenizer": null,
|
||||||
|
"max-prompt-size": null,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -197,12 +178,8 @@ export async function updateKhojBackend(khojUrl: string, khojConfig: Object) {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
contentType: 'application/json',
|
contentType: 'application/json',
|
||||||
};
|
};
|
||||||
|
|
||||||
// Save khojConfig on khoj backend at khojConfigUrl
|
// Save khojConfig on khoj backend at khojConfigUrl
|
||||||
await request(requestContent)
|
request(requestContent);
|
||||||
// Refresh khoj search index after updating config
|
|
||||||
.then(_ => request(`${khojUrl}/api/update?t=markdown`))
|
|
||||||
.then(_ => request(`${khojUrl}/api/update?t=pdf`));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function getIndexDirectoryFromBackendConfig(filepath: string) {
|
function getIndexDirectoryFromBackendConfig(filepath: string) {
|
||||||
|
|
|
@ -24,5 +24,6 @@
|
||||||
"0.12.0": "0.15.0",
|
"0.12.0": "0.15.0",
|
||||||
"0.12.1": "0.15.0",
|
"0.12.1": "0.15.0",
|
||||||
"0.12.2": "0.15.0",
|
"0.12.2": "0.15.0",
|
||||||
"0.12.3": "0.15.0"
|
"0.12.3": "0.15.0",
|
||||||
|
"0.13.0": "0.15.0"
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,7 +28,7 @@ from khoj.utils.config import (
|
||||||
)
|
)
|
||||||
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
|
from khoj.utils.helpers import resolve_absolute_path, merge_dicts
|
||||||
from khoj.utils.fs_syncer import collect_files
|
from khoj.utils.fs_syncer import collect_files
|
||||||
from khoj.utils.rawconfig import FullConfig, ProcessorConfig, ConversationProcessorConfig
|
from khoj.utils.rawconfig import FullConfig, OfflineChatProcessorConfig, ProcessorConfig, ConversationProcessorConfig
|
||||||
from khoj.routers.indexer import configure_content, load_content, configure_search
|
from khoj.routers.indexer import configure_content, load_content, configure_search
|
||||||
|
|
||||||
|
|
||||||
|
@ -136,7 +136,7 @@ def configure_routes(app):
|
||||||
|
|
||||||
app.include_router(api, prefix="/api")
|
app.include_router(api, prefix="/api")
|
||||||
app.include_router(api_beta, prefix="/api/beta")
|
app.include_router(api_beta, prefix="/api/beta")
|
||||||
app.include_router(indexer, prefix="/v1/indexer")
|
app.include_router(indexer, prefix="/api/v1/index")
|
||||||
app.include_router(web_client)
|
app.include_router(web_client)
|
||||||
app.include_router(auth_router, prefix="/auth")
|
app.include_router(auth_router, prefix="/auth")
|
||||||
|
|
||||||
|
@ -156,7 +156,7 @@ if not state.demo:
|
||||||
state.content_index = configure_content(
|
state.content_index = configure_content(
|
||||||
state.content_index, state.config.content_type, all_files, state.search_models
|
state.content_index, state.config.content_type, all_files, state.search_models
|
||||||
)
|
)
|
||||||
logger.info("📬 Content index updated via Scheduler")
|
logger.info("📪 Content index updated via Scheduler")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True)
|
logger.error(f"🚨 Error updating content index via Scheduler: {e}", exc_info=True)
|
||||||
|
|
||||||
|
@ -207,9 +207,7 @@ def configure_conversation_processor(
|
||||||
conversation_config=ConversationProcessorConfig(
|
conversation_config=ConversationProcessorConfig(
|
||||||
conversation_logfile=conversation_logfile,
|
conversation_logfile=conversation_logfile,
|
||||||
openai=(conversation_config.openai if (conversation_config is not None) else None),
|
openai=(conversation_config.openai if (conversation_config is not None) else None),
|
||||||
enable_offline_chat=(
|
offline_chat=conversation_config.offline_chat if conversation_config else OfflineChatProcessorConfig(),
|
||||||
conversation_config.enable_offline_chat if (conversation_config is not None) else False
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -236,7 +236,7 @@
|
||||||
</h3>
|
</h3>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-description-row">
|
<div class="card-description-row">
|
||||||
<p class="card-description">Setup chat using OpenAI</p>
|
<p class="card-description">Setup online chat using OpenAI</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-action-row">
|
<div class="card-action-row">
|
||||||
<a class="card-button" href="/config/processor/conversation/openai">
|
<a class="card-button" href="/config/processor/conversation/openai">
|
||||||
|
@ -261,21 +261,21 @@
|
||||||
<img class="card-icon" src="/static/assets/icons/chat.svg" alt="Chat">
|
<img class="card-icon" src="/static/assets/icons/chat.svg" alt="Chat">
|
||||||
<h3 class="card-title">
|
<h3 class="card-title">
|
||||||
Offline Chat
|
Offline Chat
|
||||||
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
|
<img id="configured-icon-conversation-enable-offline-chat" class="configured-icon {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and current_model_state.conversation_gpt4all %}enabled{% else %}disabled{% endif %}" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
|
||||||
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat and not current_model_state.conversation_gpt4all %}
|
{% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat and not current_model_state.conversation_gpt4all %}
|
||||||
<img id="misconfigured-icon-conversation-enable-offline-chat" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="The model was not downloaded as expected.">
|
<img id="misconfigured-icon-conversation-enable-offline-chat" class="configured-icon" src="/static/assets/icons/question-mark-icon.svg" alt="Not Configured" title="The model was not downloaded as expected.">
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</h3>
|
</h3>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-description-row">
|
<div class="card-description-row">
|
||||||
<p class="card-description">Setup offline chat (Llama V2)</p>
|
<p class="card-description">Setup offline chat</p>
|
||||||
</div>
|
</div>
|
||||||
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
|
<div id="clear-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}enabled{% else %}disabled{% endif %}">
|
||||||
<button class="card-button" onclick="toggleEnableLocalLLLM(false)">
|
<button class="card-button" onclick="toggleEnableLocalLLLM(false)">
|
||||||
Disable
|
Disable
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
|
<div id="set-enable-offline-chat" class="card-action-row {% if current_config.processor and current_config.processor.conversation and current_config.processor.conversation.offline_chat.enable_offline_chat %}disabled{% else %}enabled{% endif %}">
|
||||||
<button class="card-button happy" onclick="toggleEnableLocalLLLM(true)">
|
<button class="card-button happy" onclick="toggleEnableLocalLLLM(true)">
|
||||||
Enable
|
Enable
|
||||||
</button>
|
</button>
|
||||||
|
@ -346,7 +346,7 @@
|
||||||
featuresHintText.classList.add("show");
|
featuresHintText.classList.add("show");
|
||||||
}
|
}
|
||||||
|
|
||||||
fetch('/api/config/data/processor/conversation/enable_offline_chat' + '?enable_offline_chat=' + enable, {
|
fetch('/api/config/data/processor/conversation/offline_chat' + '?enable_offline_chat=' + enable, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}">
|
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}">
|
||||||
{% else %}
|
{% else %}
|
||||||
{% for input_filter in current_config['input_filter'] %}
|
{% for input_filter in current_config['input_filter'] %}
|
||||||
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter.split('/*')[0] }}">
|
<input type="text" id="input-filter" name="input-filter" placeholder="~/Documents/{{content_type}}" value="{{ input_filter }}">
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
|
@ -106,17 +106,18 @@
|
||||||
|
|
||||||
submit.addEventListener("click", function(event) {
|
submit.addEventListener("click", function(event) {
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
let globFormat = "**/*."
|
let globFormat = "**/*"
|
||||||
let suffixes = [];
|
let suffixes = [];
|
||||||
if ('{{content_type}}' == "markdown")
|
if ('{{content_type}}' == "markdown")
|
||||||
suffixes = ["md", "markdown"]
|
suffixes = [".md", ".markdown"]
|
||||||
else if ('{{content_type}}' == "org")
|
else if ('{{content_type}}' == "org")
|
||||||
suffixes = ["org"]
|
suffixes = [".org"]
|
||||||
else if ('{{content_type}}' === "pdf")
|
else if ('{{content_type}}' === "pdf")
|
||||||
suffixes = ["pdf"]
|
suffixes = [".pdf"]
|
||||||
else if ('{{content_type}}' === "plaintext")
|
else if ('{{content_type}}' === "plaintext")
|
||||||
suffixes = ['*']
|
suffixes = ['.*']
|
||||||
|
|
||||||
|
let globs = suffixes.map(x => `${globFormat}${x}`)
|
||||||
var inputFileNodes = document.getElementsByName("input-files");
|
var inputFileNodes = document.getElementsByName("input-files");
|
||||||
var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
|
var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
|
||||||
|
|
||||||
|
@ -124,10 +125,19 @@
|
||||||
|
|
||||||
var inputFilter = [];
|
var inputFilter = [];
|
||||||
var nodes = getValidInputNodes(inputFilterNodes);
|
var nodes = getValidInputNodes(inputFilterNodes);
|
||||||
|
|
||||||
|
// A regex that checks for globs in the path. If they exist,
|
||||||
|
// we are going to just not add our own globbing. If they don't,
|
||||||
|
// then we will assume globbing should be done.
|
||||||
|
const glob_regex = /([*?\[\]])/;
|
||||||
if (nodes.length > 0) {
|
if (nodes.length > 0) {
|
||||||
for (var i = 0; i < nodes.length; i++) {
|
for (var i = 0; i < nodes.length; i++) {
|
||||||
for (var j = 0; j < suffixes.length; j++) {
|
for (var j = 0; j < globs.length; j++) {
|
||||||
inputFilter.push(nodes[i].value + globFormat + suffixes[j]);
|
if (glob_regex.test(nodes[i].value)) {
|
||||||
|
inputFilter.push(nodes[i].value);
|
||||||
|
} else {
|
||||||
|
inputFilter.push(nodes[i].value + globs[j]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
83
src/khoj/migrations/migrate_offline_chat_schema.py
Normal file
83
src/khoj/migrations/migrate_offline_chat_schema.py
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
"""
|
||||||
|
Current format of khoj.yml
|
||||||
|
---
|
||||||
|
app:
|
||||||
|
...
|
||||||
|
content-type:
|
||||||
|
...
|
||||||
|
processor:
|
||||||
|
conversation:
|
||||||
|
enable-offline-chat: false
|
||||||
|
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
|
||||||
|
openai:
|
||||||
|
...
|
||||||
|
search-type:
|
||||||
|
...
|
||||||
|
|
||||||
|
New format of khoj.yml
|
||||||
|
---
|
||||||
|
app:
|
||||||
|
...
|
||||||
|
content-type:
|
||||||
|
...
|
||||||
|
processor:
|
||||||
|
conversation:
|
||||||
|
offline-chat:
|
||||||
|
enable-offline-chat: false
|
||||||
|
chat-model: llama-2-7b-chat.ggmlv3.q4_0.bin
|
||||||
|
tokenizer: null
|
||||||
|
max-prompt-size: null
|
||||||
|
conversation-logfile: ~/.khoj/processor/conversation/conversation_logs.json
|
||||||
|
openai:
|
||||||
|
...
|
||||||
|
search-type:
|
||||||
|
...
|
||||||
|
"""
|
||||||
|
import logging
|
||||||
|
from packaging import version
|
||||||
|
|
||||||
|
from khoj.utils.yaml import load_config_from_file, save_config_to_file
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_offline_chat_schema(args):
    """Migrate the khoj config file to the nested ``offline-chat`` schema.

    Loads the config at ``args.config_file`` and, when its ``version`` is
    missing or older than 0.12.3:

    - sets ``version`` to 0.12.3,
    - adds ``max-prompt-size`` and ``tokenizer`` (both ``None``) to the
      conversation processor config,
    - moves the flat ``enable-offline-chat`` flag into a new ``offline-chat``
      section, alongside a ``chat-model`` entry,
    - writes the updated config back to ``args.config_file``.

    Configs without a (non-null) ``processor.conversation`` section are left
    untouched.

    Args:
        args: Parsed command line arguments; only ``args.config_file`` is read.

    Returns:
        The same ``args`` object, unchanged.
    """
    schema_version = "0.12.3"
    default_offline_chat_model = "llama-2-7b-chat.ggmlv3.q4_0.bin"

    raw_config = load_config_from_file(args.config_file)
    previous_version = raw_config.get("version")

    # Nothing to migrate unless a conversation processor is configured.
    # A key that is present but null (e.g. `conversation:` with no value)
    # is treated like a missing one instead of crashing further below.
    conversation_config = (raw_config.get("processor") or {}).get("conversation")
    if conversation_config is None:
        return args

    if previous_version is None or version.parse(previous_version) < version.parse(schema_version):
        logger.info(
            "Upgrading config schema to %s from %s to make (offline) chat more configurable",
            schema_version,
            previous_version,
        )
        raw_config["version"] = schema_version

        # Create max-prompt-size and tokenizer fields in conversation processor schema
        conversation_config["max-prompt-size"] = None
        conversation_config["tokenizer"] = None

        # Keep an already configured offline chat model, if any.
        # `or {}` also covers an `offline-chat` key that is present but null.
        existing_offline_chat = conversation_config.get("offline-chat") or {}
        offline_chat_model = existing_offline_chat.get("chat-model", default_offline_chat_model)

        # Create offline chat schema from the old flat enable-offline-chat field.
        # pop() reads the old flag and deletes it from the conversation schema.
        conversation_config["offline-chat"] = {
            "enable-offline-chat": conversation_config.pop("enable-offline-chat", False),
            "chat-model": offline_chat_model,
        }

        save_config_to_file(raw_config, args.config_file)

    return args
|
|
@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def extract_questions_offline(
|
def extract_questions_offline(
|
||||||
text: str,
|
text: str,
|
||||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||||
loaded_model: Union[Any, None] = None,
|
loaded_model: Union[Any, None] = None,
|
||||||
conversation_log={},
|
conversation_log={},
|
||||||
use_history: bool = True,
|
use_history: bool = True,
|
||||||
|
@ -113,7 +113,7 @@ def filter_questions(questions: List[str]):
|
||||||
]
|
]
|
||||||
filtered_questions = []
|
filtered_questions = []
|
||||||
for q in questions:
|
for q in questions:
|
||||||
if not any([word in q.lower() for word in hint_words]):
|
if not any([word in q.lower() for word in hint_words]) and not is_none_or_empty(q):
|
||||||
filtered_questions.append(q)
|
filtered_questions.append(q)
|
||||||
|
|
||||||
return filtered_questions
|
return filtered_questions
|
||||||
|
@ -123,10 +123,12 @@ def converse_offline(
|
||||||
references,
|
references,
|
||||||
user_query,
|
user_query,
|
||||||
conversation_log={},
|
conversation_log={},
|
||||||
model: str = "llama-2-7b-chat.ggmlv3.q4_K_S.bin",
|
model: str = "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||||
loaded_model: Union[Any, None] = None,
|
loaded_model: Union[Any, None] = None,
|
||||||
completion_func=None,
|
completion_func=None,
|
||||||
conversation_command=ConversationCommand.Default,
|
conversation_command=ConversationCommand.Default,
|
||||||
|
max_prompt_size=None,
|
||||||
|
tokenizer_name=None,
|
||||||
) -> Union[ThreadedGenerator, Iterator[str]]:
|
) -> Union[ThreadedGenerator, Iterator[str]]:
|
||||||
"""
|
"""
|
||||||
Converse with user using Llama
|
Converse with user using Llama
|
||||||
|
@ -158,6 +160,8 @@ def converse_offline(
|
||||||
prompts.system_prompt_message_llamav2,
|
prompts.system_prompt_message_llamav2,
|
||||||
conversation_log,
|
conversation_log,
|
||||||
model_name=model,
|
model_name=model,
|
||||||
|
max_prompt_size=max_prompt_size,
|
||||||
|
tokenizer_name=tokenizer_name,
|
||||||
)
|
)
|
||||||
|
|
||||||
g = ThreadedGenerator(references, completion_func=completion_func)
|
g = ThreadedGenerator(references, completion_func=completion_func)
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
model_name_to_url = {
|
|
||||||
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_S.bin"
|
|
||||||
}
|
|
|
@ -1,24 +1,8 @@
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
import requests
|
|
||||||
import hashlib
|
|
||||||
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from khoj.processor.conversation.gpt4all import model_metadata
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
expected_checksum = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "cfa87b15d92fb15a2d7c354b0098578b"}
|
|
||||||
|
|
||||||
|
|
||||||
def get_md5_checksum(filename: str):
|
|
||||||
hash_md5 = hashlib.md5()
|
|
||||||
with open(filename, "rb") as f:
|
|
||||||
for chunk in iter(lambda: f.read(8192), b""):
|
|
||||||
hash_md5.update(chunk)
|
|
||||||
return hash_md5.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def download_model(model_name: str):
|
def download_model(model_name: str):
|
||||||
try:
|
try:
|
||||||
|
@ -27,57 +11,12 @@ def download_model(model_name: str):
|
||||||
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
logger.info("There was an error importing GPT4All. Please run pip install gpt4all in order to install it.")
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
url = model_metadata.model_name_to_url.get(model_name)
|
# Use GPU for Chat Model, if available
|
||||||
model_path = os.path.expanduser(f"~/.cache/gpt4all/")
|
|
||||||
if not url:
|
|
||||||
logger.debug(f"Model {model_name} not found in model metadata. Skipping download.")
|
|
||||||
return GPT4All(model_name=model_name, model_path=model_path)
|
|
||||||
|
|
||||||
filename = os.path.expanduser(f"~/.cache/gpt4all/{model_name}")
|
|
||||||
if os.path.exists(filename):
|
|
||||||
# Check if the user is connected to the internet
|
|
||||||
try:
|
|
||||||
requests.get("https://www.google.com/", timeout=5)
|
|
||||||
except:
|
|
||||||
logger.debug("User is offline. Disabling allowed download flag")
|
|
||||||
return GPT4All(model_name=model_name, model_path=model_path, allow_download=False)
|
|
||||||
return GPT4All(model_name=model_name, model_path=model_path)
|
|
||||||
|
|
||||||
# Download the model to a tmp file. Once the download is completed, move the tmp file to the actual file
|
|
||||||
tmp_filename = filename + ".tmp"
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
os.makedirs(os.path.dirname(tmp_filename), exist_ok=True)
|
model = GPT4All(model_name=model_name, device="gpu")
|
||||||
logger.debug(f"Downloading model {model_name} from {url} to {filename}...")
|
logger.debug("Loaded chat model to GPU.")
|
||||||
with requests.get(url, stream=True) as r:
|
except ValueError:
|
||||||
r.raise_for_status()
|
model = GPT4All(model_name=model_name)
|
||||||
total_size = int(r.headers.get("content-length", 0))
|
logger.debug("Loaded chat model to CPU.")
|
||||||
with open(tmp_filename, "wb") as f, tqdm(
|
|
||||||
unit="B", # unit string to be displayed.
|
|
||||||
unit_scale=True, # let tqdm to determine the scale in kilo, mega..etc.
|
|
||||||
unit_divisor=1024, # is used when unit_scale is true
|
|
||||||
total=total_size, # the total iteration.
|
|
||||||
desc=model_name, # prefix to be displayed on progress bar.
|
|
||||||
) as progress_bar:
|
|
||||||
for chunk in r.iter_content(chunk_size=8192):
|
|
||||||
f.write(chunk)
|
|
||||||
progress_bar.update(len(chunk))
|
|
||||||
|
|
||||||
# Verify the checksum
|
return model
|
||||||
if expected_checksum.get(model_name) != get_md5_checksum(tmp_filename):
|
|
||||||
logger.error(
|
|
||||||
f"Checksum verification failed for {filename}. Removing the tmp file. Offline model will not be available."
|
|
||||||
)
|
|
||||||
os.remove(tmp_filename)
|
|
||||||
raise ValueError(f"Checksum verification failed for downloading {model_name} from {url}.")
|
|
||||||
|
|
||||||
# Move the tmp file to the actual file
|
|
||||||
os.rename(tmp_filename, filename)
|
|
||||||
logger.debug(f"Successfully downloaded model {model_name} from {url} to {filename}")
|
|
||||||
return GPT4All(model_name)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to download model {model_name} from {url} to {filename}. Error: {e}", exc_info=True)
|
|
||||||
# Remove the tmp file if it exists
|
|
||||||
if os.path.exists(tmp_filename):
|
|
||||||
os.remove(tmp_filename)
|
|
||||||
return None
|
|
||||||
|
|
|
@ -116,6 +116,8 @@ def converse(
|
||||||
temperature: float = 0.2,
|
temperature: float = 0.2,
|
||||||
completion_func=None,
|
completion_func=None,
|
||||||
conversation_command=ConversationCommand.Default,
|
conversation_command=ConversationCommand.Default,
|
||||||
|
max_prompt_size=None,
|
||||||
|
tokenizer_name=None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Converse with user using OpenAI's ChatGPT
|
Converse with user using OpenAI's ChatGPT
|
||||||
|
@ -141,6 +143,8 @@ def converse(
|
||||||
prompts.personality.format(),
|
prompts.personality.format(),
|
||||||
conversation_log,
|
conversation_log,
|
||||||
model,
|
model,
|
||||||
|
max_prompt_size,
|
||||||
|
tokenizer_name,
|
||||||
)
|
)
|
||||||
truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
|
truncated_messages = "\n".join({f"{message.content[:40]}..." for message in messages})
|
||||||
logger.debug(f"Conversation Context for GPT: {truncated_messages}")
|
logger.debug(f"Conversation Context for GPT: {truncated_messages}")
|
||||||
|
|
|
@ -23,7 +23,7 @@ no_notes_found = PromptTemplate.from_template(
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
system_prompt_message_llamav2 = f"""You are Khoj, a friendly, smart and helpful personal assistant.
|
system_prompt_message_llamav2 = f"""You are Khoj, a smart, inquisitive and helpful personal assistant.
|
||||||
Using your general knowledge and our past conversations as context, answer the following question.
|
Using your general knowledge and our past conversations as context, answer the following question.
|
||||||
If you do not know the answer, say 'I don't know.'"""
|
If you do not know the answer, say 'I don't know.'"""
|
||||||
|
|
||||||
|
@ -51,13 +51,13 @@ extract_questions_system_prompt_llamav2 = PromptTemplate.from_template(
|
||||||
|
|
||||||
general_conversation_llamav2 = PromptTemplate.from_template(
|
general_conversation_llamav2 = PromptTemplate.from_template(
|
||||||
"""
|
"""
|
||||||
<s>[INST]{query}[/INST]
|
<s>[INST] {query} [/INST]
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
chat_history_llamav2_from_user = PromptTemplate.from_template(
|
chat_history_llamav2_from_user = PromptTemplate.from_template(
|
||||||
"""
|
"""
|
||||||
<s>[INST]{message}[/INST]
|
<s>[INST] {message} [/INST]
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -69,7 +69,7 @@ chat_history_llamav2_from_assistant = PromptTemplate.from_template(
|
||||||
|
|
||||||
conversation_llamav2 = PromptTemplate.from_template(
|
conversation_llamav2 = PromptTemplate.from_template(
|
||||||
"""
|
"""
|
||||||
<s>[INST]{query}[/INST]
|
<s>[INST] {query} [/INST]
|
||||||
""".strip()
|
""".strip()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -91,7 +91,7 @@ Question: {query}
|
||||||
|
|
||||||
notes_conversation_llamav2 = PromptTemplate.from_template(
|
notes_conversation_llamav2 = PromptTemplate.from_template(
|
||||||
"""
|
"""
|
||||||
Notes:
|
User's Notes:
|
||||||
{references}
|
{references}
|
||||||
Question: {query}
|
Question: {query}
|
||||||
""".strip()
|
""".strip()
|
||||||
|
@ -134,19 +134,25 @@ Answer (in second person):"""
|
||||||
|
|
||||||
extract_questions_llamav2_sample = PromptTemplate.from_template(
|
extract_questions_llamav2_sample = PromptTemplate.from_template(
|
||||||
"""
|
"""
|
||||||
<s>[INST]<<SYS>>Current Date: {current_date}<</SYS>>[/INST]</s>
|
<s>[INST] <<SYS>>Current Date: {current_date}<</SYS>> [/INST]</s>
|
||||||
<s>[INST]How was my trip to Cambodia?[/INST][]</s>
|
<s>[INST] How was my trip to Cambodia? [/INST]
|
||||||
<s>[INST]Who did I visit the temple with on that trip?[/INST]Who did I visit the temple with in Cambodia?</s>
|
How was my trip to Cambodia?</s>
|
||||||
<s>[INST]How should I take care of my plants?[/INST]What kind of plants do I have? What issues do my plants have?</s>
|
<s>[INST] Who did I visit the temple with on that trip? [/INST]
|
||||||
<s>[INST]How many tennis balls fit in the back of a 2002 Honda Civic?[/INST]What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
|
Who did I visit the temple with in Cambodia?</s>
|
||||||
<s>[INST]What did I do for Christmas last year?[/INST]What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
|
<s>[INST] How should I take care of my plants? [/INST]
|
||||||
<s>[INST]How are you feeling today?[/INST]</s>
|
What kind of plants do I have? What issues do my plants have?</s>
|
||||||
<s>[INST]Is Alice older than Bob?[/INST]When was Alice born? What is Bob's age?</s>
|
<s>[INST] How many tennis balls fit in the back of a 2002 Honda Civic? [/INST]
|
||||||
<s>[INST]<<SYS>>
|
What is the size of a tennis ball? What is the trunk size of a 2002 Honda Civic?</s>
|
||||||
|
<s>[INST] What did I do for Christmas last year? [/INST]
|
||||||
|
What did I do for Christmas {last_year} dt>='{last_christmas_date}' dt<'{next_christmas_date}'</s>
|
||||||
|
<s>[INST] How are you feeling today? [/INST]</s>
|
||||||
|
<s>[INST] Is Alice older than Bob? [/INST]
|
||||||
|
When was Alice born? What is Bob's age?</s>
|
||||||
|
<s>[INST] <<SYS>>
|
||||||
Use these notes from the user's previous conversations to provide a response:
|
Use these notes from the user's previous conversations to provide a response:
|
||||||
{chat_history}
|
{chat_history}
|
||||||
<</SYS>>[/INST]</s>
|
<</SYS>> [/INST]</s>
|
||||||
<s>[INST]{query}[/INST]
|
<s>[INST] {query} [/INST]
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -3,24 +3,27 @@ import logging
|
||||||
from time import perf_counter
|
from time import perf_counter
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import queue
|
||||||
import tiktoken
|
import tiktoken
|
||||||
|
|
||||||
# External packages
|
# External packages
|
||||||
from langchain.schema import ChatMessage
|
from langchain.schema import ChatMessage
|
||||||
from transformers import LlamaTokenizerFast
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
import queue
|
|
||||||
from khoj.utils.helpers import merge_dicts
|
from khoj.utils.helpers import merge_dicts
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
max_prompt_size = {
|
model_to_prompt_size = {
|
||||||
"gpt-3.5-turbo": 4096,
|
"gpt-3.5-turbo": 4096,
|
||||||
"gpt-4": 8192,
|
"gpt-4": 8192,
|
||||||
"llama-2-7b-chat.ggmlv3.q4_K_S.bin": 1548,
|
"llama-2-7b-chat.ggmlv3.q4_0.bin": 1548,
|
||||||
"gpt-3.5-turbo-16k": 15000,
|
"gpt-3.5-turbo-16k": 15000,
|
||||||
}
|
}
|
||||||
tokenizer = {"llama-2-7b-chat.ggmlv3.q4_K_S.bin": "hf-internal-testing/llama-tokenizer"}
|
model_to_tokenizer = {
|
||||||
|
"llama-2-7b-chat.ggmlv3.q4_0.bin": "hf-internal-testing/llama-tokenizer",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class ThreadedGenerator:
|
class ThreadedGenerator:
|
||||||
|
@ -82,9 +85,26 @@ def message_to_log(
|
||||||
|
|
||||||
|
|
||||||
def generate_chatml_messages_with_context(
|
def generate_chatml_messages_with_context(
|
||||||
user_message, system_message, conversation_log={}, model_name="gpt-3.5-turbo", lookback_turns=2
|
user_message,
|
||||||
|
system_message,
|
||||||
|
conversation_log={},
|
||||||
|
model_name="gpt-3.5-turbo",
|
||||||
|
max_prompt_size=None,
|
||||||
|
tokenizer_name=None,
|
||||||
):
|
):
|
||||||
"""Generate messages for ChatGPT with context from previous conversation"""
|
"""Generate messages for ChatGPT with context from previous conversation"""
|
||||||
|
# Set max prompt size from user config, pre-configured for model or to default prompt size
|
||||||
|
try:
|
||||||
|
max_prompt_size = max_prompt_size or model_to_prompt_size[model_name]
|
||||||
|
except:
|
||||||
|
max_prompt_size = 2000
|
||||||
|
logger.warning(
|
||||||
|
f"Fallback to default prompt size: {max_prompt_size}.\nConfigure max_prompt_size for unsupported model: {model_name} in Khoj settings to longer context window."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Scale lookback turns proportional to max prompt size supported by model
|
||||||
|
lookback_turns = max_prompt_size // 750
|
||||||
|
|
||||||
# Extract Chat History for Context
|
# Extract Chat History for Context
|
||||||
chat_logs = []
|
chat_logs = []
|
||||||
for chat in conversation_log.get("chat", []):
|
for chat in conversation_log.get("chat", []):
|
||||||
|
@ -105,19 +125,28 @@ def generate_chatml_messages_with_context(
|
||||||
messages = user_chatml_message + rest_backnforths + system_chatml_message
|
messages = user_chatml_message + rest_backnforths + system_chatml_message
|
||||||
|
|
||||||
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
# Truncate oldest messages from conversation history until under max supported prompt size by model
|
||||||
messages = truncate_messages(messages, max_prompt_size[model_name], model_name)
|
messages = truncate_messages(messages, max_prompt_size, model_name, tokenizer_name)
|
||||||
|
|
||||||
# Return message in chronological order
|
# Return message in chronological order
|
||||||
return messages[::-1]
|
return messages[::-1]
|
||||||
|
|
||||||
|
|
||||||
def truncate_messages(messages: list[ChatMessage], max_prompt_size, model_name) -> list[ChatMessage]:
|
def truncate_messages(
|
||||||
|
messages: list[ChatMessage], max_prompt_size, model_name: str, tokenizer_name=None
|
||||||
|
) -> list[ChatMessage]:
|
||||||
"""Truncate messages to fit within max prompt size supported by model"""
|
"""Truncate messages to fit within max prompt size supported by model"""
|
||||||
|
|
||||||
if "llama" in model_name:
|
try:
|
||||||
encoder = LlamaTokenizerFast.from_pretrained(tokenizer[model_name])
|
if model_name.startswith("gpt-"):
|
||||||
else:
|
encoder = tiktoken.encoding_for_model(model_name)
|
||||||
encoder = tiktoken.encoding_for_model(model_name)
|
else:
|
||||||
|
encoder = AutoTokenizer.from_pretrained(tokenizer_name or model_to_tokenizer[model_name])
|
||||||
|
except:
|
||||||
|
default_tokenizer = "hf-internal-testing/llama-tokenizer"
|
||||||
|
encoder = AutoTokenizer.from_pretrained(default_tokenizer)
|
||||||
|
logger.warning(
|
||||||
|
f"Fallback to default chat model tokenizer: {default_tokenizer}.\nConfigure tokenizer for unsupported model: {model_name} in Khoj settings to improve context stuffing."
|
||||||
|
)
|
||||||
|
|
||||||
system_message = messages.pop()
|
system_message = messages.pop()
|
||||||
system_message_tokens = len(encoder.encode(system_message.content))
|
system_message_tokens = len(encoder.encode(system_message.content))
|
||||||
|
|
|
@ -65,7 +65,7 @@ class PdfToJsonl(TextToJsonl):
|
||||||
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
|
# Write the PDF file to a temporary file, as it is stored in byte format in the pdf_file object and the PyPDFLoader expects a file path
|
||||||
tmp_file = f"tmp_pdf_file.pdf"
|
tmp_file = f"tmp_pdf_file.pdf"
|
||||||
with open(f"{tmp_file}", "wb") as f:
|
with open(f"{tmp_file}", "wb") as f:
|
||||||
bytes = base64.b64decode(pdf_files[pdf_file])
|
bytes = pdf_files[pdf_file]
|
||||||
f.write(bytes)
|
f.write(bytes)
|
||||||
loader = PyMuPDFLoader(f"{tmp_file}")
|
loader = PyMuPDFLoader(f"{tmp_file}")
|
||||||
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
pdf_entries_per_file = [page.page_content for page in loader.load()]
|
||||||
|
|
|
@ -30,6 +30,7 @@ from khoj.utils.rawconfig import (
|
||||||
GithubContentConfig,
|
GithubContentConfig,
|
||||||
NotionContentConfig,
|
NotionContentConfig,
|
||||||
ConversationProcessorConfig,
|
ConversationProcessorConfig,
|
||||||
|
OfflineChatProcessorConfig,
|
||||||
)
|
)
|
||||||
from khoj.utils.helpers import resolve_absolute_path
|
from khoj.utils.helpers import resolve_absolute_path
|
||||||
from khoj.utils.state import SearchType
|
from khoj.utils.state import SearchType
|
||||||
|
@ -185,6 +186,10 @@ if not state.demo:
|
||||||
state.content_index.markdown = None
|
state.content_index.markdown = None
|
||||||
elif content_type == "org":
|
elif content_type == "org":
|
||||||
state.content_index.org = None
|
state.content_index.org = None
|
||||||
|
elif content_type == "plaintext":
|
||||||
|
state.content_index.plaintext = None
|
||||||
|
else:
|
||||||
|
logger.warning(f"Request to delete unknown content type: {content_type} via API")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
save_config_to_file_updated_state()
|
save_config_to_file_updated_state()
|
||||||
|
@ -284,10 +289,11 @@ if not state.demo:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"status": "error", "message": str(e)}
|
return {"status": "error", "message": str(e)}
|
||||||
|
|
||||||
@api.post("/config/data/processor/conversation/enable_offline_chat", status_code=200)
|
@api.post("/config/data/processor/conversation/offline_chat", status_code=200)
|
||||||
async def set_processor_enable_offline_chat_config_data(
|
async def set_processor_enable_offline_chat_config_data(
|
||||||
request: Request,
|
request: Request,
|
||||||
enable_offline_chat: bool,
|
enable_offline_chat: bool,
|
||||||
|
offline_chat_model: Optional[str] = None,
|
||||||
client: Optional[str] = None,
|
client: Optional[str] = None,
|
||||||
):
|
):
|
||||||
_initialize_config()
|
_initialize_config()
|
||||||
|
@ -301,7 +307,12 @@ if not state.demo:
|
||||||
state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore
|
state.config.processor = ProcessorConfig(conversation=ConversationProcessorConfig(conversation_logfile=conversation_logfile)) # type: ignore
|
||||||
|
|
||||||
assert state.config.processor.conversation is not None
|
assert state.config.processor.conversation is not None
|
||||||
state.config.processor.conversation.enable_offline_chat = enable_offline_chat
|
if state.config.processor.conversation.offline_chat is None:
|
||||||
|
state.config.processor.conversation.offline_chat = OfflineChatProcessorConfig()
|
||||||
|
|
||||||
|
state.config.processor.conversation.offline_chat.enable_offline_chat = enable_offline_chat
|
||||||
|
if offline_chat_model is not None:
|
||||||
|
state.config.processor.conversation.offline_chat.chat_model = offline_chat_model
|
||||||
state.processor_config = configure_processor(state.config.processor, state.processor_config)
|
state.processor_config = configure_processor(state.config.processor, state.processor_config)
|
||||||
|
|
||||||
update_telemetry_state(
|
update_telemetry_state(
|
||||||
|
@ -322,7 +333,7 @@ if not state.demo:
|
||||||
# Create Routes
|
# Create Routes
|
||||||
@api.get("/config/data/default")
|
@api.get("/config/data/default")
|
||||||
def get_default_config_data():
|
def get_default_config_data():
|
||||||
return constants.default_config
|
return constants.empty_config
|
||||||
|
|
||||||
|
|
||||||
@api.get("/config/types", response_model=List[str])
|
@api.get("/config/types", response_model=List[str])
|
||||||
|
@ -387,7 +398,7 @@ async def search(
|
||||||
# Encode query with filter terms removed
|
# Encode query with filter terms removed
|
||||||
defiltered_query = user_query
|
defiltered_query = user_query
|
||||||
for filter in [DateFilter(), WordFilter(), FileFilter()]:
|
for filter in [DateFilter(), WordFilter(), FileFilter()]:
|
||||||
defiltered_query = filter.defilter(user_query)
|
defiltered_query = filter.defilter(defiltered_query)
|
||||||
|
|
||||||
encoded_asymmetric_query = None
|
encoded_asymmetric_query = None
|
||||||
if t == SearchType.All or t != SearchType.Image:
|
if t == SearchType.All or t != SearchType.Image:
|
||||||
|
@ -622,7 +633,7 @@ def update(
|
||||||
if state.processor_config:
|
if state.processor_config:
|
||||||
components.append("Conversation processor")
|
components.append("Conversation processor")
|
||||||
components_msg = ", ".join(components)
|
components_msg = ", ".join(components)
|
||||||
logger.info(f"📬 {components_msg} updated via API")
|
logger.info(f"📪 {components_msg} updated via API")
|
||||||
|
|
||||||
update_telemetry_state(
|
update_telemetry_state(
|
||||||
request=request,
|
request=request,
|
||||||
|
@ -702,12 +713,18 @@ async def chat(
|
||||||
) -> Response:
|
) -> Response:
|
||||||
perform_chat_checks()
|
perform_chat_checks()
|
||||||
conversation_command = get_conversation_command(query=q, any_references=True)
|
conversation_command = get_conversation_command(query=q, any_references=True)
|
||||||
|
|
||||||
|
q = q.replace(f"/{conversation_command.value}", "").strip()
|
||||||
|
|
||||||
compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
|
compiled_references, inferred_queries, defiltered_query = await extract_references_and_questions(
|
||||||
request, q, (n or 5), conversation_command
|
request, q, (n or 5), conversation_command
|
||||||
)
|
)
|
||||||
conversation_command = get_conversation_command(query=q, any_references=not is_none_or_empty(compiled_references))
|
|
||||||
|
if conversation_command == ConversationCommand.Default and is_none_or_empty(compiled_references):
|
||||||
|
conversation_command = ConversationCommand.General
|
||||||
|
|
||||||
if conversation_command == ConversationCommand.Help:
|
if conversation_command == ConversationCommand.Help:
|
||||||
model_type = "offline" if state.processor_config.conversation.enable_offline_chat else "openai"
|
model_type = "offline" if state.processor_config.conversation.offline_chat.enable_offline_chat else "openai"
|
||||||
formatted_help = help_message.format(model=model_type, version=state.khoj_version)
|
formatted_help = help_message.format(model=model_type, version=state.khoj_version)
|
||||||
return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200)
|
return StreamingResponse(iter([formatted_help]), media_type="text/event-stream", status_code=200)
|
||||||
|
|
||||||
|
@ -768,23 +785,21 @@ async def extract_references_and_questions(
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
|
"No content index loaded, so cannot extract references from knowledge base. Please configure your data sources and update the index to chat with your notes."
|
||||||
)
|
)
|
||||||
return compiled_references, inferred_queries
|
return compiled_references, inferred_queries, q
|
||||||
|
|
||||||
if conversation_type == ConversationCommand.General:
|
if conversation_type == ConversationCommand.General:
|
||||||
return compiled_references, inferred_queries, q
|
return compiled_references, inferred_queries, q
|
||||||
|
|
||||||
# Extract filter terms from user message
|
# Extract filter terms from user message
|
||||||
defiltered_query = q
|
defiltered_query = q
|
||||||
filter_terms = []
|
|
||||||
for filter in [DateFilter(), WordFilter(), FileFilter()]:
|
for filter in [DateFilter(), WordFilter(), FileFilter()]:
|
||||||
filter_terms += filter.get_filter_terms(q)
|
defiltered_query = filter.defilter(defiltered_query)
|
||||||
defiltered_query = filter.defilter(q)
|
filters_in_query = q.replace(defiltered_query, "").strip()
|
||||||
filters_in_query = " ".join(filter_terms)
|
|
||||||
|
|
||||||
# Infer search queries from user message
|
# Infer search queries from user message
|
||||||
with timer("Extracting search queries took", logger):
|
with timer("Extracting search queries took", logger):
|
||||||
# If we've reached here, either the user has enabled offline chat or the openai model is enabled.
|
# If we've reached here, either the user has enabled offline chat or the openai model is enabled.
|
||||||
if state.processor_config.conversation.enable_offline_chat:
|
if state.processor_config.conversation.offline_chat.enable_offline_chat:
|
||||||
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
|
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
|
||||||
inferred_queries = extract_questions_offline(
|
inferred_queries = extract_questions_offline(
|
||||||
defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False
|
defiltered_query, loaded_model=loaded_model, conversation_log=meta_log, should_extract_questions=False
|
||||||
|
@ -800,7 +815,7 @@ async def extract_references_and_questions(
|
||||||
with timer("Searching knowledge base took", logger):
|
with timer("Searching knowledge base took", logger):
|
||||||
result_list = []
|
result_list = []
|
||||||
for query in inferred_queries:
|
for query in inferred_queries:
|
||||||
n_items = min(n, 3) if state.processor_config.conversation.enable_offline_chat else n
|
n_items = min(n, 3) if state.processor_config.conversation.offline_chat.enable_offline_chat else n
|
||||||
result_list.extend(
|
result_list.extend(
|
||||||
await search(
|
await search(
|
||||||
f"{query} {filters_in_query}",
|
f"{query} {filters_in_query}",
|
||||||
|
|
|
@ -113,7 +113,7 @@ def generate_chat_response(
|
||||||
meta_log=meta_log,
|
meta_log=meta_log,
|
||||||
)
|
)
|
||||||
|
|
||||||
if state.processor_config.conversation.enable_offline_chat:
|
if state.processor_config.conversation.offline_chat.enable_offline_chat:
|
||||||
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
|
loaded_model = state.processor_config.conversation.gpt4all_model.loaded_model
|
||||||
chat_response = converse_offline(
|
chat_response = converse_offline(
|
||||||
references=compiled_references,
|
references=compiled_references,
|
||||||
|
@ -122,6 +122,9 @@ def generate_chat_response(
|
||||||
conversation_log=meta_log,
|
conversation_log=meta_log,
|
||||||
completion_func=partial_completion,
|
completion_func=partial_completion,
|
||||||
conversation_command=conversation_command,
|
conversation_command=conversation_command,
|
||||||
|
model=state.processor_config.conversation.offline_chat.chat_model,
|
||||||
|
max_prompt_size=state.processor_config.conversation.max_prompt_size,
|
||||||
|
tokenizer_name=state.processor_config.conversation.tokenizer,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif state.processor_config.conversation.openai_model:
|
elif state.processor_config.conversation.openai_model:
|
||||||
|
@ -135,6 +138,8 @@ def generate_chat_response(
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
completion_func=partial_completion,
|
completion_func=partial_completion,
|
||||||
conversation_command=conversation_command,
|
conversation_command=conversation_command,
|
||||||
|
max_prompt_size=state.processor_config.conversation.max_prompt_size,
|
||||||
|
tokenizer_name=state.processor_config.conversation.tokenizer,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
from typing import Optional, Union, Dict
|
from typing import Optional, Union, Dict
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
from fastapi import APIRouter, HTTPException, Header, Request, Body, Response
|
from fastapi import APIRouter, HTTPException, Header, Request, Response, UploadFile
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
from khoj.routers.helpers import update_telemetry_state
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils import state, constants
|
from khoj.utils import state, constants
|
||||||
|
@ -56,42 +56,30 @@ class IndexerInput(BaseModel):
|
||||||
plaintext: Optional[dict[str, str]] = None
|
plaintext: Optional[dict[str, str]] = None
|
||||||
|
|
||||||
|
|
||||||
@indexer.post("/batch")
|
@indexer.post("/update")
|
||||||
async def index_batch(
|
async def update(
|
||||||
request: Request,
|
request: Request,
|
||||||
|
files: list[UploadFile],
|
||||||
x_api_key: str = Header(None),
|
x_api_key: str = Header(None),
|
||||||
regenerate: bool = False,
|
force: bool = False,
|
||||||
search_type: Optional[Union[state.SearchType, str]] = None,
|
t: Optional[Union[state.SearchType, str]] = None,
|
||||||
|
client: Optional[str] = None,
|
||||||
|
user_agent: Optional[str] = Header(None),
|
||||||
|
referer: Optional[str] = Header(None),
|
||||||
|
host: Optional[str] = Header(None),
|
||||||
):
|
):
|
||||||
if x_api_key != "secret":
|
if x_api_key != "secret":
|
||||||
raise HTTPException(status_code=401, detail="Invalid API Key")
|
raise HTTPException(status_code=401, detail="Invalid API Key")
|
||||||
state.config_lock.acquire()
|
state.config_lock.acquire()
|
||||||
try:
|
try:
|
||||||
logger.info(f"Received batch indexing request")
|
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||||
index_batch_request_acc = b""
|
|
||||||
async for chunk in request.stream():
|
|
||||||
index_batch_request_acc += chunk
|
|
||||||
data_bytes = sys.getsizeof(index_batch_request_acc)
|
|
||||||
unit = "KB"
|
|
||||||
data_size = data_bytes / 1024
|
|
||||||
if data_size > 1000:
|
|
||||||
unit = "MB"
|
|
||||||
data_size = data_size / 1024
|
|
||||||
if data_size > 1000:
|
|
||||||
unit = "GB"
|
|
||||||
data_size = data_size / 1024
|
|
||||||
data_size_metric = f"{data_size:.2f} {unit}"
|
|
||||||
logger.info(f"Received {data_size_metric} of data")
|
|
||||||
index_batch_request = IndexBatchRequest.parse_raw(index_batch_request_acc)
|
|
||||||
logger.info(f"Received {len(index_batch_request.files)} files")
|
|
||||||
|
|
||||||
org_files: Dict[str, str] = {}
|
org_files: Dict[str, str] = {}
|
||||||
markdown_files: Dict[str, str] = {}
|
markdown_files: Dict[str, str] = {}
|
||||||
pdf_files: Dict[str, str] = {}
|
pdf_files: Dict[str, str] = {}
|
||||||
plaintext_files: Dict[str, str] = {}
|
plaintext_files: Dict[str, str] = {}
|
||||||
|
|
||||||
for file in index_batch_request.files:
|
for file in files:
|
||||||
file_type = get_file_type(file.path)
|
file_type, encoding = get_file_type(file.content_type)
|
||||||
dict_to_update = None
|
dict_to_update = None
|
||||||
if file_type == "org":
|
if file_type == "org":
|
||||||
dict_to_update = org_files
|
dict_to_update = org_files
|
||||||
|
@ -103,9 +91,11 @@ async def index_batch(
|
||||||
dict_to_update = plaintext_files
|
dict_to_update = plaintext_files
|
||||||
|
|
||||||
if dict_to_update is not None:
|
if dict_to_update is not None:
|
||||||
dict_to_update[file.path] = file.content
|
dict_to_update[file.filename] = (
|
||||||
|
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.info(f"Skipping unsupported streamed file: {file.path}")
|
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||||
|
|
||||||
indexer_input = IndexerInput(
|
indexer_input = IndexerInput(
|
||||||
org=org_files,
|
org=org_files,
|
||||||
|
@ -115,7 +105,7 @@ async def index_batch(
|
||||||
)
|
)
|
||||||
|
|
||||||
if state.config == None:
|
if state.config == None:
|
||||||
logger.info("First run, initializing state.")
|
logger.info("📬 Initializing content index on first run.")
|
||||||
default_full_config = FullConfig(
|
default_full_config = FullConfig(
|
||||||
content_type=None,
|
content_type=None,
|
||||||
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
|
search_type=SearchConfig.parse_obj(constants.default_config["search-type"]),
|
||||||
|
@ -142,15 +132,30 @@ async def index_batch(
|
||||||
state.config.content_type,
|
state.config.content_type,
|
||||||
indexer_input.dict(),
|
indexer_input.dict(),
|
||||||
state.search_models,
|
state.search_models,
|
||||||
regenerate=regenerate,
|
regenerate=force,
|
||||||
t=search_type,
|
t=t,
|
||||||
full_corpus=False,
|
full_corpus=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to process batch indexing request: {e}", exc_info=True)
|
logger.error(
|
||||||
|
f"🚨 Failed to {force} update {t} content index triggered via API call by {client} client: {e}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
state.config_lock.release()
|
state.config_lock.release()
|
||||||
|
|
||||||
|
update_telemetry_state(
|
||||||
|
request=request,
|
||||||
|
telemetry_type="api",
|
||||||
|
api="index/update",
|
||||||
|
client=client,
|
||||||
|
user_agent=user_agent,
|
||||||
|
referer=referer,
|
||||||
|
host=host,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"📪 Content index updated via API call by {client} client")
|
||||||
return Response(content="OK", status_code=200)
|
return Response(content="OK", status_code=200)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from khoj.utils.yaml import parse_config_from_file
|
||||||
from khoj.migrations.migrate_version import migrate_config_to_version
|
from khoj.migrations.migrate_version import migrate_config_to_version
|
||||||
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
|
from khoj.migrations.migrate_processor_config_openai import migrate_processor_conversation_schema
|
||||||
from khoj.migrations.migrate_offline_model import migrate_offline_model
|
from khoj.migrations.migrate_offline_model import migrate_offline_model
|
||||||
|
from khoj.migrations.migrate_offline_chat_schema import migrate_offline_chat_schema
|
||||||
|
|
||||||
|
|
||||||
def cli(args=None):
|
def cli(args=None):
|
||||||
|
@ -55,7 +56,12 @@ def cli(args=None):
|
||||||
|
|
||||||
|
|
||||||
def run_migrations(args):
|
def run_migrations(args):
|
||||||
migrations = [migrate_config_to_version, migrate_processor_conversation_schema, migrate_offline_model]
|
migrations = [
|
||||||
|
migrate_config_to_version,
|
||||||
|
migrate_processor_conversation_schema,
|
||||||
|
migrate_offline_model,
|
||||||
|
migrate_offline_chat_schema,
|
||||||
|
]
|
||||||
for migration in migrations:
|
for migration in migrations:
|
||||||
args = migration(args)
|
args = migration(args)
|
||||||
return args
|
return args
|
||||||
|
|
|
@ -12,6 +12,8 @@ from khoj.processor.conversation.gpt4all.utils import download_model
|
||||||
# External Packages
|
# External Packages
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from khoj.utils.rawconfig import OfflineChatProcessorConfig
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
|
@ -84,7 +86,6 @@ class SearchModels:
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GPT4AllProcessorConfig:
|
class GPT4AllProcessorConfig:
|
||||||
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
|
|
||||||
loaded_model: Union[Any, None] = None
|
loaded_model: Union[Any, None] = None
|
||||||
|
|
||||||
|
|
||||||
|
@ -95,18 +96,20 @@ class ConversationProcessorConfigModel:
|
||||||
):
|
):
|
||||||
self.openai_model = conversation_config.openai
|
self.openai_model = conversation_config.openai
|
||||||
self.gpt4all_model = GPT4AllProcessorConfig()
|
self.gpt4all_model = GPT4AllProcessorConfig()
|
||||||
self.enable_offline_chat = conversation_config.enable_offline_chat
|
self.offline_chat = conversation_config.offline_chat or OfflineChatProcessorConfig()
|
||||||
|
self.max_prompt_size = conversation_config.max_prompt_size
|
||||||
|
self.tokenizer = conversation_config.tokenizer
|
||||||
self.conversation_logfile = Path(conversation_config.conversation_logfile)
|
self.conversation_logfile = Path(conversation_config.conversation_logfile)
|
||||||
self.chat_session: List[str] = []
|
self.chat_session: List[str] = []
|
||||||
self.meta_log: dict = {}
|
self.meta_log: dict = {}
|
||||||
|
|
||||||
if self.enable_offline_chat:
|
if self.offline_chat.enable_offline_chat:
|
||||||
try:
|
try:
|
||||||
self.gpt4all_model.loaded_model = download_model(self.gpt4all_model.chat_model)
|
self.gpt4all_model.loaded_model = download_model(self.offline_chat.chat_model)
|
||||||
except ValueError as e:
|
except Exception as e:
|
||||||
|
self.offline_chat.enable_offline_chat = False
|
||||||
self.gpt4all_model.loaded_model = None
|
self.gpt4all_model.loaded_model = None
|
||||||
logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
|
logger.error(f"Error while loading offline chat model: {e}", exc_info=True)
|
||||||
self.enable_offline_chat = False
|
|
||||||
else:
|
else:
|
||||||
self.gpt4all_model.loaded_model = None
|
self.gpt4all_model.loaded_model = None
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,64 @@ empty_escape_sequences = "\n|\r|\t| "
|
||||||
app_env_filepath = "~/.khoj/env"
|
app_env_filepath = "~/.khoj/env"
|
||||||
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
|
telemetry_server = "https://khoj.beta.haletic.com/v1/telemetry"
|
||||||
|
|
||||||
|
empty_config = {
|
||||||
|
"content-type": {
|
||||||
|
"org": {
|
||||||
|
"input-files": None,
|
||||||
|
"input-filter": None,
|
||||||
|
"compressed-jsonl": "~/.khoj/content/org/org.jsonl.gz",
|
||||||
|
"embeddings-file": "~/.khoj/content/org/org_embeddings.pt",
|
||||||
|
"index-heading-entries": False,
|
||||||
|
},
|
||||||
|
"markdown": {
|
||||||
|
"input-files": None,
|
||||||
|
"input-filter": None,
|
||||||
|
"compressed-jsonl": "~/.khoj/content/markdown/markdown.jsonl.gz",
|
||||||
|
"embeddings-file": "~/.khoj/content/markdown/markdown_embeddings.pt",
|
||||||
|
},
|
||||||
|
"pdf": {
|
||||||
|
"input-files": None,
|
||||||
|
"input-filter": None,
|
||||||
|
"compressed-jsonl": "~/.khoj/content/pdf/pdf.jsonl.gz",
|
||||||
|
"embeddings-file": "~/.khoj/content/pdf/pdf_embeddings.pt",
|
||||||
|
},
|
||||||
|
"plaintext": {
|
||||||
|
"input-files": None,
|
||||||
|
"input-filter": None,
|
||||||
|
"compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz",
|
||||||
|
"embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"search-type": {
|
||||||
|
"symmetric": {
|
||||||
|
"encoder": "sentence-transformers/all-MiniLM-L6-v2",
|
||||||
|
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||||
|
"model_directory": "~/.khoj/search/symmetric/",
|
||||||
|
},
|
||||||
|
"asymmetric": {
|
||||||
|
"encoder": "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
|
||||||
|
"cross-encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
|
||||||
|
"model_directory": "~/.khoj/search/asymmetric/",
|
||||||
|
},
|
||||||
|
"image": {"encoder": "sentence-transformers/clip-ViT-B-32", "model_directory": "~/.khoj/search/image/"},
|
||||||
|
},
|
||||||
|
"processor": {
|
||||||
|
"conversation": {
|
||||||
|
"openai": {
|
||||||
|
"api-key": None,
|
||||||
|
"chat-model": "gpt-3.5-turbo",
|
||||||
|
},
|
||||||
|
"offline-chat": {
|
||||||
|
"enable-offline-chat": False,
|
||||||
|
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||||
|
},
|
||||||
|
"tokenizer": None,
|
||||||
|
"max-prompt-size": None,
|
||||||
|
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
# default app config to use
|
# default app config to use
|
||||||
default_config = {
|
default_config = {
|
||||||
"content-type": {
|
"content-type": {
|
||||||
|
@ -72,7 +130,12 @@ default_config = {
|
||||||
"api-key": None,
|
"api-key": None,
|
||||||
"chat-model": "gpt-3.5-turbo",
|
"chat-model": "gpt-3.5-turbo",
|
||||||
},
|
},
|
||||||
"enable-offline-chat": False,
|
"offline-chat": {
|
||||||
|
"enable-offline-chat": False,
|
||||||
|
"chat-model": "llama-2-7b-chat.ggmlv3.q4_0.bin",
|
||||||
|
},
|
||||||
|
"tokenizer": None,
|
||||||
|
"max-prompt-size": None,
|
||||||
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
|
"conversation-logfile": "~/.khoj/processor/conversation/conversation_logs.json",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import logging
|
import logging
|
||||||
import glob
|
import glob
|
||||||
import base64
|
import os
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
@ -39,13 +39,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
||||||
return soup.get_text(strip=True, separator="\n")
|
return soup.get_text(strip=True, separator="\n")
|
||||||
|
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
input_files, input_filter = (
|
input_files, input_filters = (
|
||||||
config.input_files,
|
config.input_files,
|
||||||
config.input_filter,
|
config.input_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Input Validation
|
# Input Validation
|
||||||
if is_none_or_empty(input_files) and is_none_or_empty(input_filter):
|
if is_none_or_empty(input_files) and is_none_or_empty(input_filters):
|
||||||
logger.debug("At least one of input-files or input-file-filter is required to be specified")
|
logger.debug("At least one of input-files or input-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -53,11 +53,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
||||||
absolute_plaintext_files, filtered_plaintext_files = set(), set()
|
absolute_plaintext_files, filtered_plaintext_files = set(), set()
|
||||||
if input_files:
|
if input_files:
|
||||||
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
|
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
|
||||||
if input_filter:
|
if input_filters:
|
||||||
filtered_plaintext_files = {
|
filtered_plaintext_files = {
|
||||||
filtered_file
|
filtered_file
|
||||||
for jsonl_file_filter in input_filter
|
for plaintext_file_filter in input_filters
|
||||||
for filtered_file in glob.glob(get_absolute_path(jsonl_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
|
all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
|
||||||
|
@ -73,12 +74,12 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
||||||
|
|
||||||
filename_to_content_map = {}
|
filename_to_content_map = {}
|
||||||
for file in all_target_files:
|
for file in all_target_files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r", encoding="utf8") as f:
|
||||||
try:
|
try:
|
||||||
plaintext_content = f.read()
|
plaintext_content = f.read()
|
||||||
if file.endswith(("html", "htm", "xml")):
|
if file.endswith(("html", "htm", "xml")):
|
||||||
plaintext_content = extract_html_content(plaintext_content)
|
plaintext_content = extract_html_content(plaintext_content)
|
||||||
filename_to_content_map[file] = f.read()
|
filename_to_content_map[file] = plaintext_content
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
|
logger.warning(f"Unable to read file: {file} as plaintext. Skipping file.")
|
||||||
logger.warning(e, exc_info=True)
|
logger.warning(e, exc_info=True)
|
||||||
|
@ -88,13 +89,13 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
||||||
|
|
||||||
def get_org_files(config: TextContentConfig):
|
def get_org_files(config: TextContentConfig):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
org_files, org_file_filter = (
|
org_files, org_file_filters = (
|
||||||
config.input_files,
|
config.input_files,
|
||||||
config.input_filter,
|
config.input_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Input Validation
|
# Input Validation
|
||||||
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
|
if is_none_or_empty(org_files) and is_none_or_empty(org_file_filters):
|
||||||
logger.debug("At least one of org-files or org-file-filter is required to be specified")
|
logger.debug("At least one of org-files or org-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -102,11 +103,12 @@ def get_org_files(config: TextContentConfig):
|
||||||
absolute_org_files, filtered_org_files = set(), set()
|
absolute_org_files, filtered_org_files = set(), set()
|
||||||
if org_files:
|
if org_files:
|
||||||
absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
|
absolute_org_files = {get_absolute_path(org_file) for org_file in org_files}
|
||||||
if org_file_filter:
|
if org_file_filters:
|
||||||
filtered_org_files = {
|
filtered_org_files = {
|
||||||
filtered_file
|
filtered_file
|
||||||
for org_file_filter in org_file_filter
|
for org_file_filter in org_file_filters
|
||||||
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(org_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
||||||
|
@ -119,7 +121,7 @@ def get_org_files(config: TextContentConfig):
|
||||||
|
|
||||||
filename_to_content_map = {}
|
filename_to_content_map = {}
|
||||||
for file in all_org_files:
|
for file in all_org_files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r", encoding="utf8") as f:
|
||||||
try:
|
try:
|
||||||
filename_to_content_map[file] = f.read()
|
filename_to_content_map[file] = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -131,26 +133,27 @@ def get_org_files(config: TextContentConfig):
|
||||||
|
|
||||||
def get_markdown_files(config: TextContentConfig):
|
def get_markdown_files(config: TextContentConfig):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
markdown_files, markdown_file_filter = (
|
markdown_files, markdown_file_filters = (
|
||||||
config.input_files,
|
config.input_files,
|
||||||
config.input_filter,
|
config.input_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Input Validation
|
# Input Validation
|
||||||
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
|
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filters):
|
||||||
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
|
logger.debug("At least one of markdown-files or markdown-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
"Get Markdown files to process"
|
# Get markdown files to process
|
||||||
absolute_markdown_files, filtered_markdown_files = set(), set()
|
absolute_markdown_files, filtered_markdown_files = set(), set()
|
||||||
if markdown_files:
|
if markdown_files:
|
||||||
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
|
absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
|
||||||
|
|
||||||
if markdown_file_filter:
|
if markdown_file_filters:
|
||||||
filtered_markdown_files = {
|
filtered_markdown_files = {
|
||||||
filtered_file
|
filtered_file
|
||||||
for markdown_file_filter in markdown_file_filter
|
for markdown_file_filter in markdown_file_filters
|
||||||
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
||||||
|
@ -168,7 +171,7 @@ def get_markdown_files(config: TextContentConfig):
|
||||||
|
|
||||||
filename_to_content_map = {}
|
filename_to_content_map = {}
|
||||||
for file in all_markdown_files:
|
for file in all_markdown_files:
|
||||||
with open(file, "r") as f:
|
with open(file, "r", encoding="utf8") as f:
|
||||||
try:
|
try:
|
||||||
filename_to_content_map[file] = f.read()
|
filename_to_content_map[file] = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
@ -180,13 +183,13 @@ def get_markdown_files(config: TextContentConfig):
|
||||||
|
|
||||||
def get_pdf_files(config: TextContentConfig):
|
def get_pdf_files(config: TextContentConfig):
|
||||||
# Extract required fields from config
|
# Extract required fields from config
|
||||||
pdf_files, pdf_file_filter = (
|
pdf_files, pdf_file_filters = (
|
||||||
config.input_files,
|
config.input_files,
|
||||||
config.input_filter,
|
config.input_filter,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Input Validation
|
# Input Validation
|
||||||
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filter):
|
if is_none_or_empty(pdf_files) and is_none_or_empty(pdf_file_filters):
|
||||||
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
|
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
@ -194,11 +197,12 @@ def get_pdf_files(config: TextContentConfig):
|
||||||
absolute_pdf_files, filtered_pdf_files = set(), set()
|
absolute_pdf_files, filtered_pdf_files = set(), set()
|
||||||
if pdf_files:
|
if pdf_files:
|
||||||
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
|
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
|
||||||
if pdf_file_filter:
|
if pdf_file_filters:
|
||||||
filtered_pdf_files = {
|
filtered_pdf_files = {
|
||||||
filtered_file
|
filtered_file
|
||||||
for pdf_file_filter in pdf_file_filter
|
for pdf_file_filter in pdf_file_filters
|
||||||
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
|
for filtered_file in glob.glob(get_absolute_path(pdf_file_filter), recursive=True)
|
||||||
|
if os.path.isfile(filtered_file)
|
||||||
}
|
}
|
||||||
|
|
||||||
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
|
all_pdf_files = sorted(absolute_pdf_files | filtered_pdf_files)
|
||||||
|
@ -214,7 +218,7 @@ def get_pdf_files(config: TextContentConfig):
|
||||||
for file in all_pdf_files:
|
for file in all_pdf_files:
|
||||||
with open(file, "rb") as f:
|
with open(file, "rb") as f:
|
||||||
try:
|
try:
|
||||||
filename_to_content_map[file] = base64.b64encode(f.read()).decode("utf-8")
|
filename_to_content_map[file] = f.read()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
|
logger.warning(f"Unable to read file: {file} as PDF. Skipping file.")
|
||||||
logger.warning(e, exc_info=True)
|
logger.warning(e, exc_info=True)
|
||||||
|
|
|
@ -66,20 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
|
||||||
return merged_dict
|
return merged_dict
|
||||||
|
|
||||||
|
|
||||||
def get_file_type(filepath: str) -> str:
|
def get_file_type(file_type: str) -> tuple[str, str]:
|
||||||
"Get file type from file path"
|
"Get file type from file mime type"
|
||||||
file_type = Path(filepath).suffix[1:]
|
|
||||||
|
|
||||||
if file_type in ["md", "markdown"]:
|
encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None
|
||||||
return "markdown"
|
file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
|
||||||
elif file_type in ["org", "orgmode"]:
|
if file_type in ["text/markdown"]:
|
||||||
return "org"
|
return "markdown", encoding
|
||||||
elif file_type in ["txt", "text", "html", "xml", "htm", "rst"]:
|
elif file_type in ["text/org"]:
|
||||||
return "plaintext"
|
return "org", encoding
|
||||||
elif file_type in ["pdf"]:
|
elif file_type in ["application/pdf"]:
|
||||||
return "pdf"
|
return "pdf", encoding
|
||||||
|
elif file_type in ["image/jpeg"]:
|
||||||
return file_type
|
return "jpeg", encoding
|
||||||
|
elif file_type in ["image/png"]:
|
||||||
|
return "png", encoding
|
||||||
|
elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
|
||||||
|
return "plaintext", encoding
|
||||||
|
else:
|
||||||
|
return "other", encoding
|
||||||
|
|
||||||
|
|
||||||
def load_model(
|
def load_model(
|
||||||
|
|
|
@ -91,10 +91,17 @@ class OpenAIProcessorConfig(ConfigBase):
|
||||||
chat_model: Optional[str] = "gpt-3.5-turbo"
|
chat_model: Optional[str] = "gpt-3.5-turbo"
|
||||||
|
|
||||||
|
|
||||||
|
class OfflineChatProcessorConfig(ConfigBase):
|
||||||
|
enable_offline_chat: Optional[bool] = False
|
||||||
|
chat_model: Optional[str] = "llama-2-7b-chat.ggmlv3.q4_0.bin"
|
||||||
|
|
||||||
|
|
||||||
class ConversationProcessorConfig(ConfigBase):
|
class ConversationProcessorConfig(ConfigBase):
|
||||||
conversation_logfile: Path
|
conversation_logfile: Path
|
||||||
openai: Optional[OpenAIProcessorConfig]
|
openai: Optional[OpenAIProcessorConfig]
|
||||||
enable_offline_chat: Optional[bool] = False
|
offline_chat: Optional[OfflineChatProcessorConfig]
|
||||||
|
max_prompt_size: Optional[int]
|
||||||
|
tokenizer: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
class ProcessorConfig(ConfigBase):
|
class ProcessorConfig(ConfigBase):
|
||||||
|
|
|
@ -18,6 +18,7 @@ from khoj.utils.helpers import resolve_absolute_path
|
||||||
from khoj.utils.rawconfig import (
|
from khoj.utils.rawconfig import (
|
||||||
ContentConfig,
|
ContentConfig,
|
||||||
ConversationProcessorConfig,
|
ConversationProcessorConfig,
|
||||||
|
OfflineChatProcessorConfig,
|
||||||
OpenAIProcessorConfig,
|
OpenAIProcessorConfig,
|
||||||
ProcessorConfig,
|
ProcessorConfig,
|
||||||
TextContentConfig,
|
TextContentConfig,
|
||||||
|
@ -207,8 +208,9 @@ def processor_config_offline_chat(tmp_path_factory):
|
||||||
|
|
||||||
# Setup conversation processor
|
# Setup conversation processor
|
||||||
processor_config = ProcessorConfig()
|
processor_config = ProcessorConfig()
|
||||||
|
offline_chat = OfflineChatProcessorConfig(enable_offline_chat=True)
|
||||||
processor_config.conversation = ConversationProcessorConfig(
|
processor_config.conversation = ConversationProcessorConfig(
|
||||||
enable_offline_chat=True,
|
offline_chat=offline_chat,
|
||||||
conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
|
conversation_logfile=processor_dir.joinpath("conversation_logs.json"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ from urllib.parse import quote
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
|
import pytest
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from app.main import app
|
from app.main import app
|
||||||
|
@ -60,13 +61,13 @@ def test_regenerate_with_invalid_content_type(client):
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_index_batch(client):
|
def test_index_update(client):
|
||||||
# Arrange
|
# Arrange
|
||||||
request_body = get_sample_files_data()
|
files = get_sample_files_data()
|
||||||
headers = {"x-api-key": "secret"}
|
headers = {"x-api-key": "secret"}
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
response = client.post("/v1/indexer/batch", json=request_body, headers=headers)
|
response = client.post("/api/v1/index/update", files=files, headers=headers)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
|
@ -76,12 +77,11 @@ def test_index_batch(client):
|
||||||
def test_regenerate_with_valid_content_type(client):
|
def test_regenerate_with_valid_content_type(client):
|
||||||
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
|
for content_type in ["all", "org", "markdown", "image", "pdf", "notion", "plugin1"]:
|
||||||
# Arrange
|
# Arrange
|
||||||
request_body = get_sample_files_data()
|
files = get_sample_files_data()
|
||||||
|
|
||||||
headers = {"x-api-key": "secret"}
|
headers = {"x-api-key": "secret"}
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
response = client.post(f"/v1/indexer/batch?search_type={content_type}", json=request_body, headers=headers)
|
response = client.post(f"/api/v1/index/update?t={content_type}", files=files, headers=headers)
|
||||||
# Assert
|
# Assert
|
||||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: {content_type}"
|
||||||
|
|
||||||
|
@ -92,17 +92,17 @@ def test_regenerate_with_github_fails_without_pat(client):
|
||||||
response = client.get(f"/api/update?force=true&t=github")
|
response = client.get(f"/api/update?force=true&t=github")
|
||||||
|
|
||||||
# Arrange
|
# Arrange
|
||||||
request_body = get_sample_files_data()
|
files = get_sample_files_data()
|
||||||
|
|
||||||
headers = {"x-api-key": "secret"}
|
headers = {"x-api-key": "secret"}
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
response = client.post(f"/v1/indexer/batch?search_type=github", json=request_body, headers=headers)
|
response = client.post(f"/api/v1/index/update?t=github", files=files, headers=headers)
|
||||||
# Assert
|
# Assert
|
||||||
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
|
assert response.status_code == 200, f"Returned status: {response.status_code} for content type: github"
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
@pytest.mark.skip(reason="Flaky test on parallel test runs")
|
||||||
def test_get_configured_types_via_api(client):
|
def test_get_configured_types_via_api(client):
|
||||||
# Act
|
# Act
|
||||||
response = client.get(f"/api/config/types")
|
response = client.get(f"/api/config/types")
|
||||||
|
@ -288,24 +288,20 @@ def test_notes_search_with_exclude_filter(
|
||||||
|
|
||||||
def get_sample_files_data():
|
def get_sample_files_data():
|
||||||
return {
|
return {
|
||||||
"org": {
|
"files": ("path/to/filename.org", "* practicing piano", "text/org"),
|
||||||
"path/to/filename.org": "* practicing piano",
|
"files": ("path/to/filename1.org", "** top 3 reasons why I moved to SF", "text/org"),
|
||||||
"path/to/filename1.org": "** top 3 reasons why I moved to SF",
|
"files": ("path/to/filename2.org", "* how to build a search engine", "text/org"),
|
||||||
"path/to/filename2.org": "* how to build a search engine",
|
"files": ("path/to/filename.pdf", "Moore's law does not apply to consumer hardware", "application/pdf"),
|
||||||
},
|
"files": ("path/to/filename1.pdf", "The sun is a ball of helium", "application/pdf"),
|
||||||
"pdf": {
|
"files": ("path/to/filename2.pdf", "Effect of sunshine on baseline human happiness", "application/pdf"),
|
||||||
"path/to/filename.pdf": "Moore's law does not apply to consumer hardware",
|
"files": ("path/to/filename.txt", "data,column,value", "text/plain"),
|
||||||
"path/to/filename1.pdf": "The sun is a ball of helium",
|
"files": ("path/to/filename1.txt", "<html>my first web page</html>", "text/plain"),
|
||||||
"path/to/filename2.pdf": "Effect of sunshine on baseline human happiness",
|
"files": ("path/to/filename2.txt", "2021-02-02 Journal Entry", "text/plain"),
|
||||||
},
|
"files": ("path/to/filename.md", "# Notes from client call", "text/markdown"),
|
||||||
"plaintext": {
|
"files": (
|
||||||
"path/to/filename.txt": "data,column,value",
|
"path/to/filename1.md",
|
||||||
"path/to/filename1.txt": "<html>my first web page</html>",
|
"## Studying anthropological records from the Fatimid caliphate",
|
||||||
"path/to/filename2.txt": "2021-02-02 Journal Entry",
|
"text/markdown",
|
||||||
},
|
),
|
||||||
"markdown": {
|
"files": ("path/to/filename2.md", "**Understanding science through the lens of art**", "text/markdown"),
|
||||||
"path/to/filename.md": "# Notes from client call",
|
|
||||||
"path/to/filename1.md": "## Studying anthropological records from the Fatimid caliphate",
|
|
||||||
"path/to/filename2.md": "**Understanding science through the lens of art**",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,7 +24,7 @@ from khoj.processor.conversation.gpt4all.utils import download_model
|
||||||
|
|
||||||
from khoj.processor.conversation.utils import message_to_log
|
from khoj.processor.conversation.utils import message_to_log
|
||||||
|
|
||||||
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_K_S.bin"
|
MODEL_NAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
|
@ -128,15 +128,15 @@ def test_extract_multiple_explicit_questions_from_message(loaded_model):
|
||||||
@pytest.mark.chatquality
|
@pytest.mark.chatquality
|
||||||
def test_extract_multiple_implicit_questions_from_message(loaded_model):
|
def test_extract_multiple_implicit_questions_from_message(loaded_model):
|
||||||
# Act
|
# Act
|
||||||
response = extract_questions_offline("Is Morpheus taller than Neo?", loaded_model=loaded_model)
|
response = extract_questions_offline("Is Carl taller than Ross?", loaded_model=loaded_model)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
expected_responses = ["height", "taller", "shorter", "heights"]
|
expected_responses = ["height", "taller", "shorter", "heights", "who"]
|
||||||
assert len(response) <= 3
|
assert len(response) <= 3
|
||||||
|
|
||||||
for question in response:
|
for question in response:
|
||||||
assert any([expected_response in question.lower() for expected_response in expected_responses]), (
|
assert any([expected_response in question.lower() for expected_response in expected_responses]), (
|
||||||
"Expected chat actor to ask follow-up questions about Morpheus and Neo, but got: " + question
|
"Expected chat actor to ask follow-up questions about Carl and Ross, but got: " + question
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -145,7 +145,7 @@ def test_extract_multiple_implicit_questions_from_message(loaded_model):
|
||||||
def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
||||||
# Arrange
|
# Arrange
|
||||||
message_list = [
|
message_list = [
|
||||||
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
|
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
@ -156,17 +156,22 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
||||||
use_history=True,
|
use_history=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
expected_responses = [
|
all_expected_in_response = [
|
||||||
"Vader",
|
"Anderson",
|
||||||
"sons",
|
]
|
||||||
|
|
||||||
|
any_expected_in_response = [
|
||||||
"son",
|
"son",
|
||||||
"Darth",
|
"sons",
|
||||||
"children",
|
"children",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(response) >= 1
|
assert len(response) >= 1
|
||||||
assert any([expected_response in response[0] for expected_response in expected_responses]), (
|
assert all([expected_response in response[0] for expected_response in all_expected_in_response]), (
|
||||||
|
"Expected chat actor to ask for clarification in response, but got: " + response[0]
|
||||||
|
)
|
||||||
|
assert any([expected_response in response[0] for expected_response in any_expected_in_response]), (
|
||||||
"Expected chat actor to ask for clarification in response, but got: " + response[0]
|
"Expected chat actor to ask for clarification in response, but got: " + response[0]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -176,20 +181,20 @@ def test_generate_search_query_using_question_from_chat_history(loaded_model):
|
||||||
def test_generate_search_query_using_answer_from_chat_history(loaded_model):
|
def test_generate_search_query_using_answer_from_chat_history(loaded_model):
|
||||||
# Arrange
|
# Arrange
|
||||||
message_list = [
|
message_list = [
|
||||||
("What is the name of Mr. Vader's daughter?", "Princess Leia", []),
|
("What is the name of Mr. Anderson's daughter?", "Miss Barbara", []),
|
||||||
]
|
]
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
response = extract_questions_offline(
|
response = extract_questions_offline(
|
||||||
"Is she a Jedi?",
|
"Is she a Doctor?",
|
||||||
conversation_log=populate_chat_history(message_list),
|
conversation_log=populate_chat_history(message_list),
|
||||||
loaded_model=loaded_model,
|
loaded_model=loaded_model,
|
||||||
use_history=True,
|
use_history=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
expected_responses = [
|
expected_responses = [
|
||||||
"Leia",
|
"Barbara",
|
||||||
"Vader",
|
"Robert",
|
||||||
"daughter",
|
"daughter",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import base64
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||||
|
@ -16,7 +15,7 @@ def test_single_page_pdf_to_jsonl():
|
||||||
# Extract Entries from specified Pdf files
|
# Extract Entries from specified Pdf files
|
||||||
# Read singlepage.pdf into memory as bytes
|
# Read singlepage.pdf into memory as bytes
|
||||||
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
with open("tests/data/pdf/singlepage.pdf", "rb") as f:
|
||||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
pdf_bytes = f.read()
|
||||||
|
|
||||||
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
|
||||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||||
|
@ -36,7 +35,7 @@ def test_multi_page_pdf_to_jsonl():
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Pdf files
|
# Extract Entries from specified Pdf files
|
||||||
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
with open("tests/data/pdf/multipage.pdf", "rb") as f:
|
||||||
pdf_bytes = base64.b64encode(f.read()).decode("utf-8")
|
pdf_bytes = f.read()
|
||||||
|
|
||||||
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
|
||||||
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
|
||||||
|
|
|
@ -1,26 +1,25 @@
|
||||||
# System Packages
|
# System Packages
|
||||||
import logging
|
import logging
|
||||||
|
import locale
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import pytest
|
import pytest
|
||||||
from khoj.utils.config import SearchModels
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.utils.state import content_index, search_models
|
from khoj.utils.state import content_index, search_models
|
||||||
from khoj.search_type import text_search
|
from khoj.search_type import text_search
|
||||||
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
|
||||||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||||
|
from khoj.utils.config import SearchModels
|
||||||
from khoj.utils.fs_syncer import get_org_files
|
from khoj.utils.fs_syncer import get_org_files
|
||||||
|
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
|
||||||
|
|
||||||
|
|
||||||
# Test
|
# Test
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_text_search_setup_with_missing_file_raises_error(
|
def test_text_search_setup_with_missing_file_raises_error(org_config_with_only_new_file: TextContentConfig):
|
||||||
org_config_with_only_new_file: TextContentConfig, search_config: SearchConfig
|
|
||||||
):
|
|
||||||
# Arrange
|
# Arrange
|
||||||
# Ensure file mentioned in org.input-files is missing
|
# Ensure file mentioned in org.input-files is missing
|
||||||
single_new_file = Path(org_config_with_only_new_file.input_files[0])
|
single_new_file = Path(org_config_with_only_new_file.input_files[0])
|
||||||
|
@ -29,7 +28,23 @@ def test_text_search_setup_with_missing_file_raises_error(
|
||||||
# Act
|
# Act
|
||||||
# Generate notes embeddings during asymmetric setup
|
# Generate notes embeddings during asymmetric setup
|
||||||
with pytest.raises(FileNotFoundError):
|
with pytest.raises(FileNotFoundError):
|
||||||
data = get_org_files(org_config_with_only_new_file)
|
get_org_files(org_config_with_only_new_file)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
def test_get_org_files_with_org_suffixed_dir_doesnt_raise_error(tmp_path: Path):
|
||||||
|
# Arrange
|
||||||
|
orgfile = tmp_path / "directory.org" / "file.org"
|
||||||
|
orgfile.parent.mkdir()
|
||||||
|
with open(orgfile, "w") as f:
|
||||||
|
f.write("* Heading\n- List item\n")
|
||||||
|
org_content_config = TextContentConfig(
|
||||||
|
input_filter=[f"{tmp_path}/**/*"], compressed_jsonl="test.jsonl", embeddings_file="test.pt"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# should not raise IsADirectoryError and return orgfile
|
||||||
|
assert get_org_files(org_content_config) == {f"{orgfile}": "* Heading\n- List item\n"}
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
|
@ -48,6 +63,7 @@ def test_text_search_setup_with_empty_file_raises_error(
|
||||||
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
def test_text_search_setup(content_config: ContentConfig, search_models: SearchModels):
|
||||||
# Arrange
|
# Arrange
|
||||||
data = get_org_files(content_config.org)
|
data = get_org_files(content_config.org)
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Regenerate notes embeddings during asymmetric setup
|
# Regenerate notes embeddings during asymmetric setup
|
||||||
notes_model = text_search.setup(
|
notes_model = text_search.setup(
|
||||||
|
|
|
@ -24,5 +24,6 @@
|
||||||
"0.12.0": "0.15.0",
|
"0.12.0": "0.15.0",
|
||||||
"0.12.1": "0.15.0",
|
"0.12.1": "0.15.0",
|
||||||
"0.12.2": "0.15.0",
|
"0.12.2": "0.15.0",
|
||||||
"0.12.3": "0.15.0"
|
"0.12.3": "0.15.0",
|
||||||
|
"0.13.0": "0.15.0"
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue