Address merge conflicts from master branch

This commit is contained in:
Saba 2022-09-14 14:23:17 +03:00
commit ea62d47aa5
41 changed files with 1599 additions and 515 deletions

View file

@ -1,14 +1,16 @@
name: publish
on:
pull_request:
push:
tags:
- v*
branches:
- 'master'
paths:
- src/**
- setup.py
- .github/workflows/publish.yml
push:
pull_request:
branches:
- 'master'
paths:
@ -38,11 +40,16 @@ jobs:
pip install --upgrade .
- name: Publish Release to PyPI
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
if: startsWith(github.ref, 'refs/tags')
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_KEY }}
run: |
# Setup Environment for Reproducible Builds
export PYTHONHASHSEED=42
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
# Build and Upload PyPi Package
rm -rf dist
python -m build
twine check dist/*
@ -54,7 +61,14 @@ jobs:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_KEY }}
run: |
# Set Pre-Release Version
sed -E -i "s/version=(.*)',/version=\1a$(date +%s)',/g" setup.py
# Setup Environment for Reproducible Builds
export PYTHONHASHSEED=42
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
# Build and Upload PyPi Package
rm -rf dist
python -m build
twine check dist/*
@ -67,7 +81,14 @@ jobs:
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_KEY }}
PULL_REQUEST_NUMBER: ${{ github.event.number }}
run: |
# Set Development Release Version
sed -E -i "s/version=(.*)',/version=\1.dev$PULL_REQUEST_NUMBER$(date +%s)',/g" setup.py
# Setup Environment for Reproducible Builds
export PYTHONHASHSEED=42
export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
# Build and Upload PyPi Package
rm -rf dist
python -m build
twine check dist/*

117
.github/workflows/release.yml vendored Normal file
View file

@ -0,0 +1,117 @@
# Package the Khoj desktop app with PyInstaller for Linux (deb), macOS (dmg)
# and Windows (exe), upload each installer as a build artifact and, on tag
# builds, attach it to the GitHub release.
name: release

on:
  # Manual run; the supplied version is used for the Debian package
  workflow_dispatch:
    inputs:
      version:
        description: 'Version Number'
        required: true
        type: string
  # Automatic run whenever a version tag is pushed
  push:
    tags:
      - 'v*'

jobs:
  publish:
    strategy:
      matrix:
        # One installer format per operating system
        include:
          - os: ubuntu-latest
            extension: deb
          - os: macos-latest
            extension: dmg
          - os: windows-latest
            extension: exe
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install Dependencies
        shell: bash
        run: |
          if [ "$RUNNER_OS" == "Linux" ]; then
            # Refresh the package index first so the install does not fail on stale package lists
            sudo apt update
            sudo apt install libegl1 libxcb-xinerama0 python3-tk -y
          fi
          python -m pip install --upgrade pip
          pip install pyinstaller

      - name: Install Khoj App
        run: |
          pip install --upgrade .

      - name: Package Khoj App
        shell: bash
        run: |
          # Setup Environment for Reproducible Builds
          export PYTHONHASHSEED=42
          export SOURCE_DATE_EPOCH=$(git log -1 --pretty=%ct)
          pyinstaller --noconfirm Khoj.spec
          if [ "$RUNNER_OS" == "Windows" ]; then
            mv dist/Khoj.exe "dist/khoj_${GITHUB_REF_NAME}_amd64.exe"
          fi

      - name: Create Mac App DMG
        if: matrix.os == 'macos-latest'
        run: |
          # Install Mac DMG Creator
          brew install create-dmg
          # Copy app to separate dmg folder
          mkdir -p dist/dmg && cp -r dist/Khoj.app dist/dmg
          # Create disk image with the app
          create-dmg \
            --volname "Khoj" \
            --volicon "src/interface/web/assets/icons/favicon.icns" \
            --window-pos 200 120 \
            --window-size 600 300 \
            --icon-size 100 \
            --icon "Khoj.app" 175 120 \
            --hide-extension "Khoj.app" \
            --app-drop-link 425 120 \
            "dist/khoj_${GITHUB_REF_NAME}_amd64.dmg" \
            "dist/dmg/"

      # Ruby is needed for the fpm gem used to build the Debian package
      - uses: ruby/setup-ruby@v1
        if: matrix.os == 'ubuntu-latest'
        with:
          ruby-version: '3.0'

      - name: Create Debian Package
        if: matrix.os == 'ubuntu-latest'
        shell: bash
        env:
          # Empty unless the workflow was started manually with a version input
          DEBIAN_PACKAGE_VERSION: ${{ inputs.version }}
        run: |
          # Install Debian Packager
          gem install fpm
          # Copy app files into expected output directory structure
          mkdir -p package/opt package/usr/share/applications package/usr/share/icons/hicolor/128x128/apps
          cp -r dist/Khoj package/opt/Khoj
          cp src/interface/web/assets/icons/favicon-128x128.png package/usr/share/icons/hicolor/128x128/apps/Khoj.png
          cp Khoj.desktop package/usr/share/applications
          # Fix permissions to be usable by non-root users
          find package/usr/share -type f -exec chmod 644 -- {} +
          chmod 755 package/opt/Khoj
          # Package the app
          if [ -z "$DEBIAN_PACKAGE_VERSION" ]; then
            # No explicit version given: use the ref name without its leading "v"
            DEBIAN_PACKAGE_VERSION="${GITHUB_REF_NAME#v}"
          fi
          fpm -C package -s dir -t deb -n Khoj --version "$DEBIAN_PACKAGE_VERSION" -p "dist/khoj_${GITHUB_REF_NAME}_amd64.deb"

      - uses: actions/upload-artifact@v3
        with:
          name: khoj_${{github.ref_name}}_amd64.${{matrix.extension}}
          path: dist/khoj_${{github.ref_name}}_amd64.${{matrix.extension}}

      - name: Release
        uses: softprops/action-gh-release@v1
        # Only tag builds have a release to attach the installer to
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: dist/khoj_${{github.ref_name}}_amd64.${{matrix.extension}}

View file

@ -34,6 +34,7 @@ jobs:
- name: Install Dependencies
run: |
sudo apt install libegl1 -y
python -m pip install --upgrade pip
pip install pytest

7
Khoj.desktop Normal file
View file

@ -0,0 +1,7 @@
# Desktop entry for the Khoj app; the release workflow copies this file into
# usr/share/applications of the Debian package.
[Desktop Entry]
Type=Application
Name=Khoj
Comment=A natural language search engine for your personal notes, transactions and images.
# Working directory the app is started in
Path=/opt
# The Debian package installs the packaged app to /opt/Khoj
Exec=/opt/Khoj
# Icon name; the Debian package ships it as hicolor/128x128/apps/Khoj.png
Icon=Khoj

115
Khoj.spec Normal file
View file

@ -0,0 +1,115 @@
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec describing how to package the Khoj desktop app.
# Analysis, PYZ, EXE, Splash and BUNDLE are used without being imported here;
# presumably PyInstaller provides them when it evaluates this spec.
from os.path import join
from platform import system
from PyInstaller.utils.hooks import copy_metadata
import sysconfig

# Data files to bundle: the web interface and the installed transformers package
datas = [
    ('src/interface/web', 'src/interface/web'),
    (f'{sysconfig.get_paths()["purelib"]}/transformers', 'transformers')
]

# Bundle the distribution metadata of these packages as well
for package_name in ('tqdm', 'regex', 'requests', 'packaging', 'filelock', 'numpy', 'tokenizers'):
    datas += copy_metadata(package_name)

block_cipher = None

a = Analysis(
    ['src/main.py'],
    pathex=[],
    binaries=[],
    datas=datas,
    hiddenimports=['huggingface_hub.repository'],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)

# Filter out unused and/or duplicate shared libs
torch_lib_paths = {
    join('torch', 'lib', 'libtorch_cuda.so'),
    join('torch', 'lib', 'libtorch_cpu.so'),
}
os_path_separator = '\\' if system() == 'Windows' else '/'
unwanted_name_fragments = (
    f'torch{os_path_separator}_C.cp',
    f'torch{os_path_separator}_dl.cp',
)
a.datas = [
    entry
    for entry in a.datas
    if entry[0] not in torch_lib_paths
    and not any(fragment in entry[0] for fragment in unwanted_name_fragments)
]

pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)

# Executable options shared by every platform; only the icon differs
exe_options = dict(
    name='Khoj',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=False,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch='x86_64',
    codesign_identity=None,
    entitlements_file=None,
)

if system() != 'Darwin':
    # Add Splash screen to show on app launch
    splash = Splash(
        'src/interface/web/assets/icons/favicon-144x144.png',
        binaries=a.binaries,
        datas=a.datas,
        text_pos=(10, 160),
        text_size=12,
        text_color='black',
        minify_script=True,
        always_on_top=True
    )

    exe = EXE(
        pyz,
        a.scripts,
        a.binaries,
        a.zipfiles,
        a.datas,
        splash,
        splash.binaries,
        [],
        icon='src/interface/web/assets/icons/favicon-144x144.ico',
        **exe_options,
    )
else:
    # No splash screen is configured for the macOS build
    exe = EXE(
        pyz,
        a.scripts,
        a.binaries,
        a.zipfiles,
        a.datas,
        [],
        icon='src/interface/web/assets/icons/favicon.icns',
        **exe_options,
    )

    # Wrap the executable in a Khoj.app bundle
    # NOTE(review): original nesting was lost in this view; BUNDLE is assumed to belong to the macOS branch - confirm
    app = BUNDLE(
        exe,
        name='Khoj.app',
        icon='src/interface/web/assets/icons/favicon.icns',
        bundle_identifier=None,
    )

145
Readme.md
View file

@ -2,6 +2,7 @@
[![build](https://github.com/debanjum/khoj/actions/workflows/build.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/build.yml)
[![test](https://github.com/debanjum/khoj/actions/workflows/test.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/test.yml)
[![publish](https://github.com/debanjum/khoj/actions/workflows/publish.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/publish.yml)
[![release](https://github.com/debanjum/khoj/actions/workflows/release.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/release.yml)
*A natural language search engine for your personal notes, transactions and images*
@ -11,6 +12,7 @@
- [Demo](#Demo)
- [Description](#Description)
- [Analysis](#Analysis)
- [Interfaces](#Interfaces)
- [Architecture](#Architecture)
- [Setup](#Setup)
- [Install](#1-Install)
@ -34,7 +36,7 @@
## Features
- **Natural**: Advanced Natural language understanding using Transformer based ML Models
- **Natural**: Advanced natural language understanding using Transformer based ML Models
- **Local**: Your personal data stays local. All search and indexing is done on your machine[\*](https://github.com/debanjum/khoj#miscellaneous)
- **Incremental**: Incremental search for a fast, search-as-you-type experience
- **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
@ -43,12 +45,14 @@
## Demo
<https://user-images.githubusercontent.com/6413477/181664862-31565b0a-0e64-47e1-a79a-599dfc486c74.mp4>
https://user-images.githubusercontent.com/6413477/184735169-92c78bf1-d827-4663-9087-a1ea194b8f4b.mp4
### Description
- User searches for \"*Setup editor*\"
- The demo looks for the most relevant section in this readme and the [khoj.el readme](https://github.com/debanjum/khoj/tree/master/src/interface/emacs)
- Install Khoj via pip
- Start Khoj app
- Add this readme and [khoj.el readme](https://github.com/debanjum/khoj/tree/master/src/interface/emacs) as org-mode for Khoj to index
- Search \"*Setup editor*\" on the Web and Emacs. Re-rank the results for better accuracy
- Top result is what we are looking for, the [section to Install Khoj.el on Emacs](https://github.com/debanjum/khoj/tree/master/src/interface/emacs#installation)
### Analysis
@ -56,7 +60,11 @@
- The results do not have any words used in the query
- *Based on the top result it seems the re-ranking model understands that Emacs is an editor?*
- The results incrementally update as the query is entered
- The results are re-ranked, for better accuracy, once user is idle
- The results are re-ranked, for better accuracy, once user hits enter
### Interfaces
![](https://github.com/debanjum/khoj/blob/master/docs/interfaces.png)
## Architecture
@ -64,56 +72,58 @@
## Setup
### 1. Install
``` shell
pip install khoj-assistant
```
### 2. Configure
- Set `input-files` or `input-filter` in each relevant `content-type` section of [khoj_sample.yml](./config/khoj_sample.yml)
- Set `input-directories` field in `content-type.image` section
- Delete `content-type`, `processor` sub-sections irrelevant for your use-case
```shell
pip install khoj-assistant
```
### 3. Run
``` shell
khoj config/khoj_sample.yml -vv
```
Loads ML model, generates embeddings and exposes API to search notes, images, transactions etc specified in config YAML
### 2. Start App
```shell
khoj
```
### 3. Configure
1. Enable content types and point to files to search in the First Run Screen that pops up on app start
2. Click configure and wait. The app will load the ML model, generate embeddings and expose the search API
## Use
- **Khoj via Web**
- Open <http://localhost:8000/>
- Open <http://localhost:8000/> via desktop interface or directly
- **Khoj via Emacs**
- [Install](https://github.com/debanjum/khoj/tree/master/src/interface/emacs#installation) [khoj.el](./src/interface/emacs/khoj.el)
- Run `M-x khoj <user-query>`
- **Khoj via API**
- See [Khoj FastAPI Docs](http://localhost:8000/docs), [Khoj FastAPI ReDocs](http://localhost:8000/redocs)
- See the Khoj FastAPI [Swagger Docs](http://localhost:8000/docs), [ReDocs](http://localhost:8000/redocs)
## Upgrade
``` shell
```shell
pip install --upgrade khoj-assistant
```
## Troubleshoot
- Symptom: Errors out complaining about Tensors mismatch, null etc
- Mitigation: Delete `content-type` > `image` section from `khoj_sample.yml`
- Mitigation: Disable `image` search on the desktop GUI
- Symptom: Errors out with \"Killed\" in error message in Docker
- Fix: Increase RAM available to Docker Containers in Docker Settings
- Refer: [StackOverflow Solution](https://stackoverflow.com/a/50770267), [Configure Resources on Docker for Mac](https://docs.docker.com/desktop/mac/#resources)
## Miscellaneous
- The experimental [chat](localhost:8000/chat) API endpoint uses the [OpenAI API](https://openai.com/api/)
- It is disabled by default
- To use it add your `openai-api-key` to config.yml
- The beta [chat](http://localhost:8000/beta/chat) and [search](http://localhost:8000/beta/search) API endpoints use [OpenAI API](https://openai.com/api/)
- It is disabled by default
- To use it add your `openai-api-key` via the app configure screen
- Warning: *If you use the above beta APIs, your query and top result(s) will be sent to OpenAI for processing*
## Performance
### Query performance
- Semantic search using the bi-encoder is fairly fast at \<5 ms
- Semantic search using the bi-encoder is fairly fast at \<50 ms
- Reranking using the cross-encoder is slower at \<2s on 15 results. Tweak `top_k` to trade off speed for accuracy of results
- Applying explicit filters is very slow currently at \~6s. This is because the filters are rudimentary. Considerable speed-ups can be achieved using indexes etc
@ -133,39 +143,48 @@ pip install --upgrade khoj-assistant
### Setup
#### Using Pip
##### 1. Install
``` shell
git clone https://github.com/debanjum/khoj && cd khoj
python -m venv .venv && source .venv/bin/activate
pip install
```
```shell
git clone https://github.com/debanjum/khoj && cd khoj
python3 -m venv .venv && source .venv/bin/activate
pip install -e .
```
##### 2. Configure
- Set `input-files` or `input-filter` in each relevant `content-type` section of `khoj_sample.yml`
- Set `input-directories` field in `image` `content-type` section
- Delete `content-type`, `processor` sub-sections irrelevant for your use-case
- Copy the `config/khoj_sample.yml` to `~/.khoj/khoj.yml`
- Set `input-files` or `input-filter` in each relevant `content-type` section of `~/.khoj/khoj.yml`
- Set `input-directories` field in `image` `content-type` section
- Delete `content-type` and `processor` sub-section(s) irrelevant for your use-case
##### 3. Run
``` shell
khoj config/khoj_sample.yml -vv
```
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
```shell
khoj -vv
```
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
##### 4. Upgrade
```shell
# To Upgrade To Latest Stable Release
# Maps to the latest tagged version of khoj on master branch
pip install --upgrade khoj-assistant
# To Upgrade To Latest Pre-Release
# Maps to the latest commit on the master branch
pip install --upgrade --pre khoj-assistant
# To Upgrade To Specific Development Release
pip install -r testpypi khoj-assistant==0.1.5.dev491659577806
# To Upgrade To Specific Development Release.
# Useful to test, review a PR.
# Note: khoj-assistant is published to test PyPi on creating a PR
pip install -i https://test.pypi.org/simple/ khoj-assistant==0.1.5.dev57166025766
```
#### Using Docker
##### 1. Clone
``` shell
```shell
git clone https://github.com/debanjum/khoj && cd khoj
```
@ -176,7 +195,7 @@ git clone https://github.com/debanjum/khoj && cd khoj
##### 3. Run
``` shell
```shell
docker-compose up -d
```
@ -184,38 +203,39 @@ docker-compose up -d
##### 4. Upgrade
``` shell
```shell
docker-compose build --pull
```
#### Using Conda
##### 1. Install Dependencies
- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\]
- Install Exiftool \[Optional\]
``` shell
sudo apt -y install libimage-exiftool-perl
```
- [Install Conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) \[Required\]
- Install Exiftool \[Optional\]
``` shell
sudo apt -y install libimage-exiftool-perl
```
##### 2. Install Khoj
``` shell
git clone https://github.com/debanjum/khoj && cd khoj
conda env create -f config/environment.yml
conda activate khoj
```
```shell
git clone https://github.com/debanjum/khoj && cd khoj
conda env create -f config/environment.yml
conda activate khoj
```
##### 3. Configure
- Set `input-files` or `input-filter` in each relevant `content-type` section of `khoj_sample.yml`
- Set `input-directories` field in `image` `content-type` section
- Delete `content-type`, `processor` sub-sections irrelevant for your use-case
- Copy the `config/khoj_sample.yml` to `~/.khoj/khoj.yml`
- Set `input-files` or `input-filter` in each relevant `content-type` section of `~/.khoj/khoj.yml`
- Set `input-directories` field in `image` `content-type` section
- Delete `content-type`, `processor` sub-sections irrelevant for your use-case
##### 4. Run
``` shell
python3 -m src.main config/khoj_sample.yml -vv
```
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
```shell
python3 -m src.main -vv
```
Load ML model, generate embeddings and expose API to query notes, images, transactions etc specified in config YAML
##### 5. Upgrade
``` shell
```shell
cd khoj
git pull origin master
conda deactivate khoj
@ -224,8 +244,7 @@ conda activate khoj
```
### Test
``` shell
```shell
pytest
```

Binary file not shown.

BIN
docs/interfaces.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 606 KiB

View file

@ -7,7 +7,7 @@ this_directory = Path(__file__).parent
setup(
name='khoj-assistant',
version='0.1.5',
version='0.1.6',
description="A natural language search engine for your personal notes, transactions and images",
long_description=(this_directory / "Readme.md").read_text(encoding="utf-8"),
long_description_content_type="text/markdown",
@ -24,8 +24,8 @@ setup(
),
install_requires=[
"numpy == 1.22.4",
"torch == 1.11.0",
"torchvision == 0.12.0",
"torch == 1.12.1",
"torchvision == 0.13.1",
"transformers == 4.21.0",
"sentence-transformers == 2.1.0",
"openai == 0.20.0",
@ -36,9 +36,10 @@ setup(
"jinja2 == 3.1.2",
"pyyaml == 6.0",
"pytest == 7.1.2",
"pillow >= 9.0.1",
"pillow == 9.2.0",
"aiofiles == 0.8.0",
"dateparser == 1.1.1",
"pyqt6 == 6.3.1",
],
include_package_data=True,
entry_points={"console_scripts": ["khoj = src.main:run"]},
@ -47,9 +48,6 @@ setup(
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",

102
src/configure.py Normal file
View file

@ -0,0 +1,102 @@
# System Packages
import sys
# External Packages
import torch
import json
# Internal Packages
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.search_type import image_search, text_search
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
from src.utils import state
from src.utils.helpers import get_absolute_path
from src.utils.rawconfig import FullConfig, ProcessorConfig
def configure_server(args, required=False):
    """Store the app config in shared state and set up search models and processors.

    When `args.config` is None nothing is configured: the function returns,
    unless `required` is set, in which case it prints a message and exits
    the process with status 1.
    """
    if args.config is None:
        if not required:
            return
        print('Exiting as Khoj is not configured. Configure the application to use it.')
        sys.exit(1)

    state.config = args.config

    # Initialize the search model from Config
    state.model = configure_search(state.model, state.config, args.regenerate, verbose=state.verbose)

    # Initialize Processor from Config
    state.processor_config = configure_processor(args.config.processor, verbose=state.verbose)
def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None, verbose: int = 0):
    """Set up the search model of each content type present in the config.

    Args:
        model: Container whose `*_search` attributes are assigned here.
        config: App config; `config.content_type` selects what to set up and
            `config.search_type` supplies the search settings.
        regenerate: Passed through to each search type's `setup`.
        t: Restrict setup to this search type. None sets up every configured type.
        verbose: Passed through to each search type's `setup`.

    Returns:
        The same `model` object with the selected search types assigned.
    """
    def is_selected(search_type: SearchType) -> bool:
        # `is None` rather than `== None`: identity is the correct test for the singleton
        return t is None or t == search_type

    # Initialize Org Notes Search
    if is_selected(SearchType.Org) and config.content_type.org:
        # Extract Entries, Generate Notes Embeddings
        model.orgmode_search = text_search.setup(org_to_jsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)

    # Initialize Org Music Search
    if is_selected(SearchType.Music) and config.content_type.music:
        # Extract Entries, Generate Music Embeddings
        model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)

    # Initialize Markdown Search
    if is_selected(SearchType.Markdown) and config.content_type.markdown:
        # Extract Entries, Generate Markdown Embeddings
        model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)

    # Initialize Panchayat Search
    if is_selected(SearchType.Panchayat) and config.content_type.panchayat:
        # Extract Entries, Generate Yaml Embeddings
        model.panchayat_search = text_search.setup(panchayat_to_jsonl, config.content_type.panchayat, search_config=config.search_type.asymmetric, regenerate=regenerate, verbose=verbose)

    # Initialize Ledger Search
    if is_selected(SearchType.Ledger) and config.content_type.ledger:
        # Extract Entries, Generate Ledger Embeddings
        model.ledger_search = text_search.setup(beancount_to_jsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate, verbose=verbose)

    # Initialize Image Search
    if is_selected(SearchType.Image) and config.content_type.image:
        # Extract Entries, Generate Image Embeddings
        model.image_search = image_search.setup(config.content_type.image, search_config=config.search_type.image, regenerate=regenerate, verbose=verbose)

    return model
def configure_processor(processor_config: ProcessorConfig, verbose: int):
    """Build the processor model from the processor section of the config.

    Returns None when `processor_config` is falsy. Otherwise returns a
    ProcessorConfigModel, with its conversation processor set when the
    config has a conversation section.
    """
    if processor_config:
        processor = ProcessorConfigModel()

        # Initialize Conversation Processor
        conversation_config = processor_config.conversation
        if conversation_config:
            processor.conversation = configure_conversation_processor(conversation_config, verbose)

        return processor

    return None
def configure_conversation_processor(conversation_processor_config, verbose: int):
    """Create the conversation processor and load its conversation logs.

    If the configured conversation logfile exists, its JSON content becomes
    the processor's `meta_log`. Otherwise `meta_log` and `chat_session` are
    initialised empty.
    """
    conversation_processor = ConversationProcessorConfigModel(conversation_processor_config, verbose)
    logfile = conversation_processor.conversation_logfile

    if conversation_processor.verbose:
        print('INFO:\tLoading conversation logs from disk...')

    if not logfile.expanduser().absolute().is_file():
        # Initialize Conversation Logs
        conversation_processor.meta_log = {}
        conversation_processor.chat_session = ""
        return conversation_processor

    # Load Metadata Logs from Conversation Logfile
    with open(get_absolute_path(logfile), 'r') as f:
        conversation_processor.meta_log = json.load(f)
    print('INFO:\tConversation logs loaded from disk.')

    return conversation_processor

View file

View file

@ -0,0 +1,72 @@
# External Packages
from PyQt6 import QtWidgets
from PyQt6.QtCore import QDir
# Internal Packages
from src.utils.config import SearchType
from src.utils.helpers import is_none_or_empty
class FileBrowser(QtWidgets.QWidget):
    """Row widget with a label, an editable list of paths and an 'Add' button.

    Paths are shown one per line in a plain text field. The 'Add' button opens
    a folder picker for the image search type and a file picker, filtered by
    the search type's file extensions, for every other search type.
    """

    def __init__(self, title, search_type: SearchType=None, default_files: list=None):
        """
        Args:
            title: Text of the label shown before the path list.
            search_type: Search type whose files are picked; selects the dialog kind and filter.
            default_files: Paths to show initially. None is treated as no paths.
        """
        QtWidgets.QWidget.__init__(self)
        layout = QtWidgets.QHBoxLayout()
        self.setLayout(layout)
        self.search_type = search_type

        self.filter_name = self.getFileFilter(search_type)
        self.dirpath = QDir.homePath()

        self.label = QtWidgets.QLabel()
        self.label.setText(title)
        self.label.setFixedWidth(95)
        self.label.setWordWrap(True)
        layout.addWidget(self.label)

        self.lineEdit = QtWidgets.QPlainTextEdit(self)
        self.lineEdit.setFixedWidth(330)
        self.setFiles(default_files)
        # Size the field for the initial paths, then keep it in sync on every edit
        self.updateFieldHeight()
        self.lineEdit.textChanged.connect(self.updateFieldHeight)
        layout.addWidget(self.lineEdit)

        self.button = QtWidgets.QPushButton('Add')
        self.button.clicked.connect(self.storeFilesSelectedInFileDialog)
        layout.addWidget(self.button)
        layout.addStretch()

    def getFileFilter(self, search_type):
        "Return the file dialog name filter for the search type, or None if it has none"
        # NOTE(review): '*.jp[e]g' looks intended to cover .jpg and .jpeg, but as a
        # wildcard pattern '[e]' would require the 'e' - confirm. Kept unchanged.
        file_filters = {
            SearchType.Org: 'Org-Mode Files (*.org)',
            SearchType.Ledger: 'Beancount Files (*.bean *.beancount)',
            SearchType.Markdown: 'Markdown Files (*.md *.markdown)',
            SearchType.Music: 'Org-Music Files (*.org)',
            SearchType.Image: 'Images (*.jp[e]g)',
        }
        return file_filters.get(search_type)

    def storeFilesSelectedInFileDialog(self):
        "Open the picker dialog and append the user's selection to the listed paths"
        filepaths = self.getPaths()
        if self.search_type == SearchType.Image:
            filepaths.append(QtWidgets.QFileDialog.getExistingDirectory(self, caption='Choose Folder',
                                                                        directory=self.dirpath))
        else:
            filepaths.extend(QtWidgets.QFileDialog.getOpenFileNames(self, caption='Choose Files',
                                                                    directory=self.dirpath,
                                                                    filter=self.filter_name)[0])
        # setFiles drops empty entries
        self.setFiles(filepaths)

    def setFiles(self, paths:list):
        "Show the given paths in the text field, skipping None or empty entries"
        # The config may hold None instead of a list of paths; treat it as no paths
        self.filepaths = [path for path in (paths or []) if not is_none_or_empty(path)]
        self.lineEdit.setPlainText("\n".join(self.filepaths))

    def getPaths(self) -> list:
        "Return the lines of the text field as a list; an empty field gives an empty list"
        text = self.lineEdit.toPlainText()
        return text.split('\n') if text != '' else []

    def updateFieldHeight(self):
        "Fit the text field height to its number of lines, capped at 90"
        line_count = len(self.lineEdit.toPlainText().split('\n'))
        self.lineEdit.setFixedHeight(min(7+20*line_count,90))

View file

@ -0,0 +1,27 @@
# External Packages
from PyQt6 import QtWidgets
# Internal Packages
from src.utils.config import ProcessorType
class LabelledTextField(QtWidgets.QWidget):
    """Row widget pairing a fixed-width label with a single-line-sized text input."""

    def __init__(self, title, processor_type: ProcessorType=None, default_value: str=None):
        """
        Args:
            title: Text of the label shown before the input field.
            processor_type: Processor type this field belongs to; stored on the widget.
            default_value: Text the input field is initialised with.
        """
        QtWidgets.QWidget.__init__(self)
        self.processor_type = processor_type

        row = QtWidgets.QHBoxLayout()
        self.setLayout(row)

        # Label
        self.label = QtWidgets.QLabel()
        self.label.setText(title)
        self.label.setFixedWidth(95)
        self.label.setWordWrap(True)
        row.addWidget(self.label)

        # Input field
        self.input_field = QtWidgets.QTextEdit(self)
        self.input_field.setFixedWidth(410)
        self.input_field.setFixedHeight(27)
        self.input_field.setText(default_value)
        row.addWidget(self.input_field)

        row.addStretch()

View file

@ -0,0 +1,318 @@
# Standard Packages
from enum import Enum
from pathlib import Path
from copy import deepcopy
import webbrowser
# External Packages
from PyQt6 import QtGui, QtWidgets
from PyQt6.QtCore import Qt, QThread, QObject, pyqtSignal
# Internal Packages
from src.configure import configure_server
from src.interface.desktop.file_browser import FileBrowser
from src.interface.desktop.labelled_text_field import LabelledTextField
from src.utils import constants, state, yaml as yaml_utils
from src.utils.cli import cli
from src.utils.config import SearchType, ProcessorType
from src.utils.helpers import merge_dicts, resolve_absolute_path
class MainWindow(QtWidgets.QMainWindow):
"""Create Window to Configure Khoj
Allow user to
1. Configure content types to search
2. Configure conversation processor
3. Save the configuration to khoj.yml
"""
def __init__(self, config_file: Path):
    "Load the current or default config and build the configure window from it"
    super().__init__()
    self.config_file = config_file

    # Set regenerate flag to regenerate embeddings every time user clicks configure
    if state.cli_args:
        state.cli_args += ['--regenerate']
    else:
        state.cli_args = ['--regenerate']

    # Load config from existing config, if exists, else load from default config
    self.first_run = not resolve_absolute_path(self.config_file).exists()
    if self.first_run:
        self.current_config = deepcopy(constants.default_config)
    else:
        self.current_config = yaml_utils.load_config_from_file(self.config_file)
    # new_config refers to the same object as current_config
    self.new_config = self.current_config

    # Initialize Configure Window
    self.setWindowTitle("Khoj")
    self.setFixedWidth(600)

    # Set Window Icon
    icon_path = constants.web_directory / 'assets/icons/favicon-144x144.png'
    self.setWindowIcon(QtGui.QIcon(f'{icon_path.absolute()}'))

    # Initialize Configure Window Layout
    self.layout = QtWidgets.QVBoxLayout()

    # Add Settings Panels for each Search Type to Configure Window Layout
    self.search_settings_panels = []
    content_configs = self.current_config['content-type']
    for search_type in SearchType:
        panel = self.add_settings_panel(content_configs.get(search_type, {}), search_type)
        self.search_settings_panels.append(panel)

    # Add Conversation Processor Panel to Configure Screen
    self.processor_settings_panels = []
    conversation_type = ProcessorType.Conversation
    conversation_config = self.current_config['processor'].get(conversation_type, {})
    self.processor_settings_panels.append(self.add_processor_panel(conversation_config, conversation_type))

    # Add Action Buttons Panel
    self.add_action_panel()

    # Set the central widget of the Window. Widget will expand
    # to take up all the space in the window by default.
    self.config_window = QtWidgets.QWidget()
    self.config_window.setLayout(self.layout)
    self.setCentralWidget(self.config_window)
    self.position_window()
def add_settings_panel(self, current_content_config: dict, search_type: SearchType):
    "Add Settings Panel for specified Search Type. Toggle Editable Search Types"
    # Image search is configured with folders, every other search type with files
    if search_type == SearchType.Image:
        config_key, file_input_text = 'input-directories', f'{search_type.name} Folders'
    else:
        config_key, file_input_text = 'input-files', f'{search_type.name} Files'
    current_content_files = current_content_config.get(config_key, [])

    # Panel holding an enable checkbox above the file browser
    search_type_settings = QtWidgets.QWidget()
    search_type_layout = QtWidgets.QVBoxLayout(search_type_settings)
    enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type)
    input_files = FileBrowser(file_input_text, search_type, current_content_files)

    # The search type starts enabled only if it already has files configured
    has_files = current_content_files is not None and len(current_content_files) > 0
    enable_search_type.setChecked(has_files)
    input_files.setEnabled(enable_search_type.isChecked())

    def sync_file_browser(_):
        # The file browser is editable only while its search type is enabled
        input_files.setEnabled(enable_search_type.isChecked())

    enable_search_type.stateChanged.connect(sync_file_browser)

    # Add setting widgets for given search type to panel
    for widget in (enable_search_type, input_files):
        search_type_layout.addWidget(widget)
    self.layout.addWidget(search_type_settings)

    return search_type_settings
def add_processor_panel(self, current_conversation_config: dict, processor_type: ProcessorType):
    "Add Conversation Processor Panel"
    # Get current settings from config for given processor type
    current_openai_api_key = current_conversation_config.get('openai-api-key', None)

    # Panel holding an enable checkbox above the API key text field
    processor_type_settings = QtWidgets.QWidget()
    processor_type_layout = QtWidgets.QVBoxLayout(processor_type_settings)
    enable_conversation = ProcessorCheckBox("Conversation", processor_type)
    input_field = LabelledTextField("OpenAI API Key", processor_type, current_openai_api_key)

    # Conversation starts enabled only if an API key is already configured
    enable_conversation.setChecked(current_openai_api_key is not None)
    input_field.setEnabled(enable_conversation.isChecked())

    def sync_input_field(_):
        # The key field is editable only while conversation is enabled
        input_field.setEnabled(enable_conversation.isChecked())

    enable_conversation.stateChanged.connect(sync_input_field)

    # Add setting widgets for given processor type to panel
    for widget in (enable_conversation, input_field):
        processor_type_layout.addWidget(widget)
    self.layout.addWidget(processor_type_settings)

    return processor_type_settings
def add_action_panel(self):
    "Add Action Panel"
    action_bar = QtWidgets.QWidget()
    action_bar_layout = QtWidgets.QHBoxLayout(action_bar)

    # Button to save settings and (re)configure the app
    self.configure_button = QtWidgets.QPushButton("Configure", clicked=self.configure_app)

    # Button to open the web interface in the browser; disabled on first run
    self.search_button = QtWidgets.QPushButton("Search", clicked=lambda: webbrowser.open(f'http://{state.host}:{state.port}/'))
    self.search_button.setEnabled(not self.first_run)

    for button in (self.configure_button, self.search_button):
        action_bar_layout.addWidget(button)
    self.layout.addWidget(action_bar)
def get_default_config(self, search_type:SearchType=None, processor_type:ProcessorType=None):
    """Return the default config, optionally narrowed to one search or processor type.

    `search_type` takes precedence over `processor_type` when both are given.
    With neither, the whole default config is returned.
    """
    defaults = constants.default_config
    if search_type:
        return defaults['content-type'][search_type]
    if processor_type:
        return defaults['processor'][processor_type]
    return defaults
def add_error_message(self, message: str):
    """Replace any error label on the configure screen with `message`.

    Existing labels whose text starts with one of the ErrorType values are removed.
    A falsy `message` only clears the old labels.
    """
    # Drop stale error labels. Walk backwards so removals don't shift pending indices
    error_prefixes = tuple(error_type.value for error_type in ErrorType)
    for index in reversed(range(self.layout.count())):
        widget = self.layout.itemAt(index).widget()
        if not isinstance(widget, QtWidgets.QLabel):
            continue
        if widget.text().startswith(error_prefixes):
            self.layout.removeWidget(widget)
            widget.deleteLater()

    # Nothing to show
    if not message:
        return

    # Show the new error in red, wrapped to the window width
    error_label = QtWidgets.QLabel()
    error_label.setWordWrap(True)
    error_label.setText(message)
    error_label.setStyleSheet("color: red")
    self.layout.addWidget(error_label)
def update_search_settings(self):
    """Update `self.new_config` with the search settings currently shown in the UI.

    For every search settings panel:
      - an unchecked SearchCheckBox removes that search type from the config,
      - a checked one (re)creates its entry by merging the current config with the defaults,
      - a FileBrowser writes its paths into the entry, if the search type is enabled.
        Image search stores them under 'input-directories', all others under 'input-files'.
        An empty path list is stored as None.
    """
    for settings_panel in self.search_settings_panels:
        for child in settings_panel.children():
            if isinstance(child, SearchCheckBox):
                # Entries are written under the enum's value, so read and delete by it too
                search_key = child.search_type.value
                if child.isChecked():
                    # Search Type (re)-Enabled
                    current_search_config = self.current_config['content-type'].get(search_key, {})
                    default_search_config = self.get_default_config(search_type=child.search_type)
                    self.new_config['content-type'][search_key] = merge_dicts(current_search_config, default_search_config)
                else:
                    # Search Type Disabled
                    self.new_config['content-type'].pop(search_key, None)
            elif isinstance(child, FileBrowser):
                search_key = child.search_type.value
                if search_key not in self.new_config['content-type']:
                    continue
                # Read the paths once; an empty selection is stored as None
                selected_paths = child.getPaths()
                selected_paths = selected_paths if selected_paths != [] else None
                # Compare enum member to enum member, not its value to a member
                paths_field = 'input-directories' if child.search_type == SearchType.Image else 'input-files'
                self.new_config['content-type'][search_key][paths_field] = selected_paths
def update_processor_settings(self):
    """Update `self.new_config` with the processor settings currently shown in the UI.

    For every processor settings panel:
      - an unchecked ProcessorCheckBox removes that processor from the config,
      - a checked one (re)creates its entry by merging the current config with the defaults,
      - a LabelledTextField for the conversation processor writes its text to
        'openai-api-key', if the processor is enabled. Empty text is stored as None.
    """
    for settings_panel in self.processor_settings_panels:
        for child in settings_panel.children():
            if isinstance(child, ProcessorCheckBox):
                # Entries are written under the enum's value, so read and delete by it too
                processor_key = child.processor_type.value
                if child.isChecked():
                    # Processor Type (re)-Enabled
                    current_processor_config = self.current_config['processor'].get(processor_key, {})
                    default_processor_config = self.get_default_config(processor_type=child.processor_type)
                    self.new_config['processor'][processor_key] = merge_dicts(current_processor_config, default_processor_config)
                else:
                    # Processor Type Disabled
                    self.new_config['processor'].pop(processor_key, None)
            elif isinstance(child, LabelledTextField):
                processor_key = child.processor_type.value
                if processor_key not in self.new_config['processor']:
                    continue
                if child.processor_type == ProcessorType.Conversation:
                    # Read the field once; empty text is stored as None
                    api_key = child.input_field.toPlainText()
                    self.new_config['processor'][processor_key]['openai-api-key'] = api_key if api_key != '' else None
def save_settings_to_file(self) -> bool:
    """Validate `self.new_config` and write it to the app config file.

    Returns True when the config was saved. On a validation failure the error
    is printed, shown on the configure screen, and False is returned.
    """
    try:
        # Only a config that parses cleanly is written to disk
        yaml_utils.parse_config_from_string(self.new_config)
    except Exception as validation_error:
        print(f"Error validating config: {validation_error}")
        self.add_error_message(f"{ErrorType.ConfigValidationError.value}: {validation_error}")
        saved = False
    else:
        # Clear any previous error before saving
        self.add_error_message(None)
        yaml_utils.save_config_to_file(self.new_config, self.config_file)
        saved = True
    return saved
def load_updated_settings(self):
    """Hot swap the app over to the config that was just saved."""
    # Re-run CLI parsing with the original arguments to load the config from file
    reloaded_args = cli(state.cli_args)

    # The saved settings become the baseline for the next edit
    self.current_config = self.new_config

    # Reconfigure the server from the reloaded arguments
    configure_server(reloaded_args, required=True)
def configure_app(self):
    """Save the new settings to khoj.yml. Reload app with updated settings.

    Settings are collected from the UI and validated. If saving succeeds they
    are loaded on a background QThread while the buttons are disabled.
    The buttons are restored when the thread finishes.
    """
    self.update_search_settings()
    self.update_processor_settings()
    if not self.save_settings_to_file():
        return

    # Setup thread to load updated settings in background
    self.thread = QThread()
    self.settings_loader = SettingsLoader(self.load_updated_settings)
    self.settings_loader.moveToThread(self.thread)

    # Connect every slot BEFORE starting the thread, so a loader that finishes
    # quickly cannot emit `finished` before the UI-reset slots are attached
    self.thread.started.connect(self.settings_loader.run)
    self.settings_loader.finished.connect(self.thread.quit)
    self.settings_loader.finished.connect(self.settings_loader.deleteLater)
    self.settings_loader.error.connect(self.add_error_message)
    self.thread.finished.connect(self.thread.deleteLater)
    # Reset UI once loading is done
    self.thread.finished.connect(lambda: self.configure_button.setText("Configure"))
    self.thread.finished.connect(lambda: self.configure_button.setEnabled(True))
    self.thread.finished.connect(lambda: self.search_button.setEnabled(True))

    # Disable buttons while the settings are being loaded
    self.search_button.setEnabled(False)
    self.configure_button.setEnabled(False)
    self.configure_button.setText("Configuring...")

    # Start thread
    self.thread.start()
def position_window(self):
    """Center the window horizontally on its screen and place it 25px from the top."""
    # Move a copy of the window's rectangle to the screen center to find the target x
    centered_frame = self.geometry()
    centered_frame.moveCenter(self.screen().availableGeometry().center())
    # Only the x coordinate is taken from the centered rectangle; y is fixed
    target_x = centered_frame.topLeft().x()
    self.move(target_x, 25)
def show_on_top(self):
    """Show the window and bring it in front of other windows."""
    self.show()
    # Mark the window as the active one
    self.setWindowState(Qt.WindowState.WindowActive)
    # Needed to bring the window to the top on Windows
    self.activateWindow()
    # Needed to bring the window to the top from the minimized state on OSX
    self.raise_()
class SettingsLoader(QObject):
    """Worker object that runs a settings-loading function and reports the outcome via signals."""
    # Emitted once run() is done, whether or not loading failed
    finished = pyqtSignal()
    # Emitted with the error text on failure, or None on success
    error = pyqtSignal(str)

    def __init__(self, load_settings_func):
        """Store `load_settings_func`, the zero-argument callable invoked by run()."""
        super().__init__()
        self.load_settings_func = load_settings_func

    def run(self):
        """Call the load function, then emit `error` followed by `finished`.

        Only FileNotFoundError is reported through `error`; other exceptions propagate.
        """
        failure_message = None
        try:
            self.load_settings_func()
        except FileNotFoundError as missing_file:
            failure_message = f"{ErrorType.ConfigLoadingError.value}: {missing_file}"
        self.error.emit(failure_message)
        self.finished.emit()
class SearchCheckBox(QtWidgets.QCheckBox):
    """Checkbox tagged with the search type it enables or disables."""

    def __init__(self, text, search_type: SearchType, parent=None):
        """Create a checkbox labelled `text` that remembers its `search_type`."""
        # Tag is read back when the settings are collected from the UI
        self.search_type = search_type
        super().__init__(text, parent=parent)
class ProcessorCheckBox(QtWidgets.QCheckBox):
    """Checkbox tagged with the processor type it enables or disables."""

    def __init__(self, text, processor_type: ProcessorType, parent=None):
        """Create a checkbox labelled `text` that remembers its `processor_type`."""
        # Tag is read back when the settings are collected from the UI
        self.processor_type = processor_type
        super().__init__(text, parent=parent)
class ErrorType(Enum):
    """Kinds of error shown on the configure screen.

    Each member's value is the human-readable prefix of the displayed message.
    """
    # Loading the config failed
    ConfigLoadingError = "Config Loading Error"
    # The edited config failed validation
    ConfigValidationError = "Config Validation Error"

View file

@ -0,0 +1,41 @@
# Standard Packages
import webbrowser
# External Packages
from PyQt6 import QtGui, QtWidgets
# Internal Packages
from src.utils import constants, state
def create_system_tray(gui: QtWidgets.QApplication, main_window: QtWidgets.QMainWindow):
    """Create the system tray icon and its context menu, and return the tray.

    The menu offers, in order:
      1. Search: open the search page of the web interface
      2. Configure: bring the app configuration screen to the front
      3. Quit: quit the application
    """
    # Tray icon uses the web interface's favicon
    favicon_path = constants.web_directory / 'assets/icons/favicon-144x144.png'
    tray = QtWidgets.QSystemTrayIcon(QtGui.QIcon(str(favicon_path.absolute())))
    tray.setVisible(True)

    def open_search_page():
        "Open the web interface in the default browser"
        webbrowser.open(f'http://{state.host}:{state.port}/')

    # Build the context menu, one action per (label, handler) pair
    menu = QtWidgets.QMenu()
    entries = (
        ('Search', open_search_page),
        ('Configure', main_window.show_on_top),
        ('Quit', gui.quit),
    )
    for label, handler in entries:
        entry = QtGui.QAction(label, menu)
        entry.triggered.connect(handler)
        menu.addAction(entry)

    # Attach the menu to the tray
    tray.setContextMenu(menu)
    return tray

View file

@ -1,10 +1,12 @@
;;; khoj.el --- Natural, Incremental Search via Emacs
;;; khoj.el --- Natural, Incremental Search for your Second Brain -*- lexical-binding: t -*-
;; Copyright (C) 2021-2022 Debanjum Singh Solanky
;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
;; Version: 2.0
;; Keywords: search, org-mode, outlines, markdown, image
;; Description: Natural, Incremental Search for your Second Brain
;; Keywords: search, org-mode, outlines, markdown, beancount, ledger, image
;; Version: 0.1.6
;; Package-Requires: ((emacs "27.1"))
;; URL: http://github.com/debanjum/khoj/interface/emacs
;; This file is NOT part of GNU Emacs.
@ -27,9 +29,20 @@
;;; Commentary:
;; This package provides a natural, incremental search interface to your
;; org-mode notes, markdown files, beancount transactions and images.
;; It is a wrapper that interfaces with transformer based ML models.
;; The models search capabilities are exposed via the Khoj HTTP API.
;; `org-mode' notes, `markdown' files, `beancount' transactions and images.
;; It is a wrapper that interfaces with the Khoj server.
;; The server exposes an API for advanced search using transformer ML models.
;; The Khoj server needs to be running to use this package.
;; See the repository docs for detailed setup of the Khoj server.
;;
;; Quickstart
;; -------------
;; 1. Install Khoj Server
;; pip install khoj-assistant
;; 2. Start, Configure Khoj Server
;; khoj
;; 3. Install khoj.el
;; (use-package khoj :bind ("C-c s" . 'khoj))
;;; Code:
@ -51,11 +64,6 @@
:group 'khoj
:type 'integer)
(defcustom khoj-rerank-after-idle-time 2.0
"Idle time (in seconds) to trigger cross-encoder to rerank incremental search results."
:group 'khoj
:type 'float)
(defcustom khoj-results-count 5
"Number of results to get from Khoj API for each query."
:group 'khoj
@ -69,9 +77,6 @@
(const "ledger")
(const "music")))
(defvar khoj--rerank-timer nil
"Idle timer to make cross-encoder re-rank incremental search results if user idle.")
(defvar khoj--minibuffer-window nil
"Minibuffer window being used by user to enter query.")
@ -85,6 +90,7 @@
"The type of content to perform search on.")
(defun khoj--keybindings-info-message ()
"Show available khoj keybindings in-context, when user invokes Khoj."
(let ((enabled-content-types (khoj--get-enabled-content-types)))
(concat
"
@ -101,15 +107,18 @@
(when (member 'music enabled-content-types)
"C-x M | music\n"))))
(defun khoj--search-markdown () (interactive) (setq khoj--search-type "markdown"))
(defun khoj--search-org () (interactive) (setq khoj--search-type "org"))
(defun khoj--search-ledger () (interactive) (setq khoj--search-type "ledger"))
(defun khoj--search-images () (interactive) (setq khoj--search-type "image"))
(defun khoj--search-music () (interactive) (setq khoj--search-type "music"))
(defvar khoj--rerank nil "Track when re-rank of results triggered")
(defun khoj--search-markdown () "Set search-type to 'markdown'." (interactive) (setq khoj--search-type "markdown"))
(defun khoj--search-org () "Set search-type to 'org-mode'." (interactive) (setq khoj--search-type "org"))
(defun khoj--search-ledger () "Set search-type to 'ledger'." (interactive) (setq khoj--search-type "ledger"))
(defun khoj--search-images () "Set search-type to image." (interactive) (setq khoj--search-type "image"))
(defun khoj--search-music () "Set search-type to music." (interactive) (setq khoj--search-type "music"))
(defun khoj--improve-rank () "Use cross-encoder to rerank search results." (interactive) (khoj--incremental-search t))
(defun khoj--make-search-keymap (&optional existing-keymap)
"Setup keymap to configure Khoj search"
"Setup keymap to configure Khoj search. Build of EXISTING-KEYMAP when passed."
(let ((enabled-content-types (khoj--get-enabled-content-types))
(kmap (or existing-keymap (make-sparse-keymap))))
(define-key kmap (kbd "C-c RET") #'khoj--improve-rank)
(when (member 'markdown enabled-content-types)
(define-key kmap (kbd "C-x m") #'khoj--search-markdown))
(when (member 'org enabled-content-types)
@ -121,6 +130,8 @@
(when (member 'music enabled-content-types)
(define-key kmap (kbd "C-x M") #'khoj--search-music))
kmap))
(defvar khoj--keymap nil "Track Khoj keymap in this variable.")
(defun khoj--display-keybinding-info ()
"Display information on keybindings to customize khoj search.
Use `which-key` if available, else display simple message in echo area"
@ -132,7 +143,7 @@ Use `which-key` if available, else display simple message in echo area"
(message "%s" (khoj--keybindings-info-message))))
(defun khoj--extract-entries-as-markdown (json-response query)
"Convert json response from API to markdown entries"
"Convert JSON-RESPONSE, QUERY from API to markdown entries."
;; remove leading (, ) or SPC from extracted entries string
(replace-regexp-in-string
"^[\(\) ]" ""
@ -147,12 +158,12 @@ Use `which-key` if available, else display simple message in echo area"
json-response))))
(defun khoj--extract-entries-as-org (json-response query)
"Convert json response from API to org-mode entries"
"Convert JSON-RESPONSE, QUERY from API to 'org-mode' entries."
;; remove leading (, ) or SPC from extracted entries string
(replace-regexp-in-string
"^[\(\) ]" ""
;; extract entries from response as single string and convert to entries
(format "#+STARTUP: showall hidestars inlineimages\n* %s\n%s"
(format "* %s\n%s\n#+STARTUP: showall hidestars inlineimages"
query
(mapcar
(lambda (args)
@ -162,7 +173,7 @@ Use `which-key` if available, else display simple message in echo area"
json-response))))
(defun khoj--extract-entries-as-images (json-response query)
"Convert json response from API to html with images"
"Convert JSON-RESPONSE, QUERY from API to html with images."
;; remove leading (, ) or SPC from extracted entries string
(replace-regexp-in-string
"[\(\) ]$" ""
@ -188,7 +199,7 @@ Use `which-key` if available, else display simple message in echo area"
json-response)))))
(defun khoj--extract-entries-as-ledger (json-response query)
"Convert json response from API to ledger entries"
"Convert JSON-RESPONSE, QUERY from API to ledger entries."
;; remove leading (, ) or SPC from extracted entries string
(replace-regexp-in-string
"[\(\) ]$" ""
@ -203,6 +214,7 @@ Use `which-key` if available, else display simple message in echo area"
json-response)))))
(defun khoj--buffer-name-to-search-type (buffer-name)
"Infer search type based on BUFFER-NAME."
(let ((enabled-content-types (khoj--get-enabled-content-types))
(file-extension (file-name-extension buffer-name)))
(cond
@ -213,7 +225,7 @@ Use `which-key` if available, else display simple message in echo area"
(t khoj-default-search-type))))
(defun khoj--get-enabled-content-types ()
"Get content types enabled for search from API"
"Get content types enabled for search from API."
(let ((config-url (format "%s/config/data" khoj-server-url)))
(with-temp-buffer
(erase-buffer)
@ -228,11 +240,14 @@ Use `which-key` if available, else display simple message in echo area"
content-type))))))
(defun khoj--construct-api-query (query search-type &optional rerank)
  "Build the Khoj search API URL for QUERY and SEARCH-TYPE.
QUERY is URL-encoded before being embedded.  RERANK is passed through as
the `r' parameter and defaults to the string \"false\" when nil."
  (let* ((rerank-param (if rerank rerank "false"))
         (hexified-query (url-hexify-string query))
         (url-template "%s/search?q=%s&t=%s&r=%s&n=%s"))
    (format url-template khoj-server-url hexified-query search-type rerank-param khoj-results-count)))
(defun khoj--query-api-and-render-results (query search-type query-url buffer-name)
"Query Khoj API using QUERY, SEARCH-TYPE, QUERY-URL.
Render results in BUFFER-NAME."
;; get json response from api
(with-current-buffer buffer-name
(let ((inhibit-read-only t))
@ -260,8 +275,8 @@ Use `which-key` if available, else display simple message in echo area"
(read-only-mode t)))
;; Incremental Search on Khoj
(defun khoj--incremental-search (&optional rerank)
"Perform Incremental Search on Khoj. Allow optional RERANK of results."
(let* ((rerank-str (cond (rerank "true") (t "false")))
(khoj-buffer-name (get-buffer-create khoj--buffer-name))
(query (minibuffer-contents-no-properties))
@ -271,18 +286,27 @@ Use `which-key` if available, else display simple message in echo area"
;; 1. user hasn't started typing query
;; 2. during recursive edits
;; 3. with contents of other buffers user may jump to
(when (and (not (equal query "")) (active-minibuffer-window) (equal (current-buffer) khoj--minibuffer-window))
;; 4. search not triggered right after rerank
;; ignore to not overwrite reranked results before the user even sees them
(if khoj--rerank
(setq khoj--rerank nil)
(when
(and
(not (equal query ""))
(active-minibuffer-window)
(equal (current-buffer) khoj--minibuffer-window))
(progn
(when rerank
(setq khoj--rerank t)
(message "Khoj: Rerank Results"))
(khoj--query-api-and-render-results
query
khoj--search-type
query-url
khoj-buffer-name)))))
khoj-buffer-name))))))
(defun delete-open-network-connections-to-khoj ()
"Delete all network connections to khoj server"
(defun khoj--delete-open-network-connections-to-server ()
"Delete all network connections to khoj server."
(dolist (proc (process-list))
(let ((proc-buf (buffer-name (process-buffer proc)))
(khoj-network-proc-buf (string-join (split-string khoj-server-url "://") " ")))
@ -290,33 +314,24 @@ Use `which-key` if available, else display simple message in echo area"
(delete-process proc)))))
(defun khoj--teardown-incremental-search ()
"Teardown hooks used for incremental search."
(message "Khoj: Teardown Incremental Search")
;; remove advice to rerank results on normal exit from minibuffer
(advice-remove 'exit-minibuffer #'khoj--minibuffer-exit-advice)
;; unset khoj minibuffer window
(setq khoj--minibuffer-window nil)
;; cancel rerank timer
(when (timerp khoj--rerank-timer)
(cancel-timer khoj--rerank-timer))
;; delete open connections to khoj
(delete-open-network-connections-to-khoj)
;; delete open connections to khoj server
(khoj--delete-open-network-connections-to-server)
;; remove hooks for khoj incremental query and self
(remove-hook 'post-command-hook #'khoj--incremental-search)
(remove-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search))
(defun khoj--minibuffer-exit-advice (&rest _args)
(khoj--incremental-search t))
;;;###autoload
(defun khoj ()
"Natural, Incremental Search for your personal notes, transactions and music using Khoj"
"Natural, Incremental Search for your personal notes, transactions and music."
(interactive)
(let* ((khoj-buffer-name (get-buffer-create khoj--buffer-name)))
;; set khoj search type to last used or based on current buffer
(setq khoj--search-type (or khoj--search-type (khoj--buffer-name-to-search-type (buffer-name))))
;; setup rerank to improve results once user idle for KHOJ-RERANK-AFTER-IDLE-TIME seconds
(setq khoj--rerank-timer (run-with-idle-timer khoj-rerank-after-idle-time t 'khoj--incremental-search t))
;; switch to khoj results buffer
(switch-to-buffer khoj-buffer-name)
;; open and setup minibuffer for incremental search
@ -329,15 +344,13 @@ Use `which-key` if available, else display simple message in echo area"
;; set current (mini-)buffer entered as khoj minibuffer
;; used to query khoj API only when user in khoj minibuffer
(setq khoj--minibuffer-window (current-buffer))
;; rerank results on normal exit from minibuffer
(advice-add 'exit-minibuffer :before #'khoj--minibuffer-exit-advice)
(add-hook 'post-command-hook #'khoj--incremental-search) ; do khoj incremental search after every user action
(add-hook 'minibuffer-exit-hook #'khoj--teardown-incremental-search)) ; teardown khoj incremental search on minibuffer exit
(read-string khoj--query-prompt))))
;;;###autoload
(defun khoj-simple (query)
"Natural Search for QUERY in your personal notes, transactions, music and images using Khoj"
"Natural Search for QUERY on your personal notes, transactions, music and images."
(interactive "s🦅Khoj: ")
(let* ((rerank "true")
(default-type (khoj--buffer-name-to-search-type (buffer-name)))

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 159 KiB

Binary file not shown.

View file

@ -8,8 +8,8 @@
<link rel="icon" type="image/png" sizes="144x144" href="/static/assets/icons/favicon-144x144.png">
<link rel="manifest" href="/static/khoj.webmanifest">
</head>
<script type="text/javascript" src="static/assets/org.js"></script>
<script type="text/javascript" src="static/assets/markdown-it.js"></script>
<script type="text/javascript" src="static/assets/org.min.js"></script>
<script type="text/javascript" src="static/assets/markdown-it.min.js"></script>
<script>
function render_image(item) {
@ -38,6 +38,12 @@
}).join("\n"));
}
function render_ledger(query, data) {
    // Wrap every ledger entry in its own paragraph inside the #results-ledger container.
    // `query` is accepted like in the other render_* helpers but is not used here.
    const paragraphs = [];
    for (const item of data) {
        paragraphs.push(`<p>${item.entry}</p>`);
    }
    return '<div id="results-ledger">' + paragraphs.join("\n") + '</div>';
}
function render_json(data, query, type) {
if (type === "markdown") {
return render_markdown(query, data);
@ -47,6 +53,8 @@
return render_org(query, data, "music-");
} else if (type === "image") {
return data.map(render_image).join('');
} else if (type === "ledger") {
return render_ledger(query, data);
} else {
return `<pre id="json">${JSON.stringify(data, null, 2)}</pre>`;
}
@ -223,7 +231,11 @@
#json {
white-space: pre-wrap;
}
#results-markdown {
#results-ledger {
white-space: pre-line;
text-align: left;
}
#results-markdown {
text-align: left;
}
#results-music,

View file

@ -1,344 +1,124 @@
# Standard Packages
import sys, json, yaml
import time
from typing import Optional
from pathlib import Path
from functools import lru_cache
import os
import signal
import sys
from platform import system
# External Packages
import uvicorn
import torch
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, FileResponse
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from PyQt6 import QtWidgets
from PyQt6.QtCore import QThread, QTimer
# Internal Packages
from src.search_type import image_search, text_search
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
from src.processor.panchayat.panchayat_to_jsonl import panchayat_to_jsonl
from src.utils.helpers import get_absolute_path, get_from_dict
from src.configure import configure_server
from src.router import router
from src.utils import constants, state
from src.utils.cli import cli
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
from src.utils.rawconfig import FullConfig
from src.processor.conversation.gpt import converse, extract_search_type, message_to_log, message_to_prompt, understand, summarize
from src.search_filter.explicit_filter import ExplicitFilter
from src.search_filter.date_filter import DateFilter
from src.interface.desktop.main_window import MainWindow
from src.interface.desktop.system_tray import create_system_tray
# Application Global State
config = FullConfig()
model = SearchModels()
processor_config = ProcessorConfigModel()
config_file = ""
verbose = 0
# Initialize the Application Server
app = FastAPI()
this_directory = Path(__file__).parent
web_directory = this_directory / 'interface/web/'
app.mount("/static", StaticFiles(directory=web_directory), name="static")
templates = Jinja2Templates(directory=web_directory)
# Controllers
@app.get("/", response_class=FileResponse)
def index():
    "Serve the landing page of the web interface"
    landing_page = web_directory / "index.html"
    return FileResponse(landing_page)
@app.get('/config', response_class=HTMLResponse)
def config(request: Request):
    "Render the config page from the config.html template"
    # NOTE(review): this handler shares its name with the module-level `config`
    # state object and rebinds it when the module loads; confirm that is intended
    template_context = {'request': request}
    return templates.TemplateResponse("config.html", context=template_context)
@app.get('/config/data', response_model=FullConfig)
def config_data():
    "Return the application's current config"
    return config
@app.post('/config/data')
async def config_data(updated_config: FullConfig):
    """Replace the application's config with `updated_config` and persist it.

    The config is serialized to JSON using field aliases, re-parsed and dumped
    as YAML to `config_file`. Returns the updated config.
    """
    global config
    config = updated_config
    # The with-block closes the file on exit, so no explicit close() is needed
    with open(config_file, 'w') as outfile:
        yaml.dump(yaml.safe_load(config.json(by_alias=True)), outfile)
    return config
@app.get('/search')
@lru_cache(maxsize=100)
def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Optional[bool] = False):
    """Search the configured content for the query `q`.

    Parameters:
      q: user query. An empty query returns {} without searching.
      n: number of results to collate.
      t: search type to restrict the search to. When None, every configured
         search type is queried in turn and the results of the last one that
         ran are returned.
      r: passed to text search as `rank_results`.

    Responses are memoized by lru_cache on the (q, n, t, r) arguments.
    NOTE(review): nothing visible here clears that cache when the search
    models are reloaded or regenerated; confirm stale results are acceptable.
    """
    if q is None or q == '':
        print('No query param (q) passed in API call to initiate search')
        return {}

    # initialize variables
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    user_query = q
    results_count = n
    results = {}
    timings = None

    # (search type, attribute on `model` holding its search model, filter classes in order).
    # Each search type is gated on its OWN model; markdown was previously gated on the org model.
    text_searches = [
        (SearchType.Org, 'orgmode_search', (DateFilter, ExplicitFilter)),
        (SearchType.Music, 'music_search', (DateFilter, ExplicitFilter)),
        (SearchType.Markdown, 'markdown_search', (ExplicitFilter, DateFilter)),
        (SearchType.Panchayat, 'panchayat_search', (ExplicitFilter, DateFilter)),
        (SearchType.Ledger, 'ledger_search', (ExplicitFilter, DateFilter)),
    ]
    for search_type, model_attribute, filter_types in text_searches:
        if t is not None and t != search_type:
            continue
        # Look the model up lazily, only for search types that were requested
        search_model = getattr(model, model_attribute)
        if not search_model:
            continue
        filters = [filter_type() for filter_type in filter_types]
        results, timings = _search_text(user_query, search_model, r, device, filters, results_count)

    if (t == SearchType.Image or t is None) and model.image_search:
        # query images
        query_start = time.time()
        hits = image_search.query(user_query, results_count, model.image_search)
        output_directory = web_directory / 'images'
        query_end = time.time()

        # collate and return results
        collate_start = time.time()
        results = image_search.collate_results(
            hits,
            image_names=model.image_search.image_names,
            output_directory=output_directory,
            image_files_url='/static/images',
            count=results_count)
        collate_end = time.time()
        timings = (query_start, query_end, collate_start, collate_end)

    # Report timings of the last search that ran
    if verbose > 1 and timings:
        query_start, query_end, collate_start, collate_end = timings
        print(f"Query took {query_end - query_start:.3f} seconds")
        print(f"Collating results took {collate_end - collate_start:.3f} seconds")

    return results


def _search_text(user_query, search_model, rank_results, device, filters, results_count):
    "Query one text search model and collate its hits. Returns (results, timings)"
    query_start = time.time()
    hits, entries = text_search.query(user_query, search_model, rank_results=rank_results, device=device, filters=filters, verbose=verbose)
    query_end = time.time()

    collate_start = time.time()
    results = text_search.collate_results(hits, entries, results_count)
    collate_end = time.time()

    return results, (query_start, query_end, collate_start, collate_end)
@app.get('/reload')
def reload(t: Optional[SearchType] = None):
    "Re-initialize the search models without regenerating. `t` limits it to one search type"
    global model
    # Prefer the first GPU when one is available
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    model = initialize_search(config, regenerate=False, t=t, device=device)
    return {'status': 'ok', 'message': 'reload completed'}
@app.get('/regenerate')
def regenerate(t: Optional[SearchType] = None):
    "Re-initialize the search models with regeneration on. `t` limits it to one search type"
    global model
    # Prefer the first GPU when one is available
    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    model = initialize_search(config, regenerate=True, t=t, device=device)
    return {'status': 'ok', 'message': 'regeneration completed'}
@app.get('/beta/search')
def search_beta(q: str, n: Optional[int] = 1):
    "Search for `q` after letting GPT pick the search type"
    # Ask GPT which search type the query is meant for
    openai_api_key = processor_config.conversation.openai_api_key
    inferred_metadata = extract_search_type(q, api_key=openai_api_key, verbose=verbose)
    inferred_type = get_from_dict(inferred_metadata, "search-type")

    # Run the regular search restricted to that type
    matches = search(q, n=n, t=SearchType(inferred_type))

    return {'status': 'ok', 'result': matches, 'type': inferred_type}
@app.get('/chat')
def chat(q: str):
    """Respond to the chat message `q` using OpenAI GPT.

    When the understood intent has memory-type "notes", the top org-mode search
    result is summarized as the answer. Otherwise GPT converses using the chat
    session so far. The chat session and chat log are updated with the exchange.
    """
    # Load conversation state
    conversation = processor_config.conversation
    chat_session = conversation.chat_session
    meta_log = conversation.meta_log
    api_key = conversation.openai_api_key

    # Work out what the user is asking for
    metadata = understand(q, api_key=api_key, verbose=verbose)
    if verbose > 1:
        print(f'Understood: {get_from_dict(metadata, "intent")}')

    wants_notes = get_from_dict(metadata, "intent", "memory-type") == "notes"
    if wants_notes:
        # Answer from the most relevant note
        query = get_from_dict(metadata, "intent", "query")
        result_list = search(query, n=1, t=SearchType.Org)
        collated_result = "\n".join([item["entry"] for item in result_list])
        if verbose > 1:
            print(f'Semantically Similar Notes:\n{collated_result}')
        gpt_response = summarize(collated_result, summary_type="notes", user_query=q, api_key=api_key)
    else:
        # Plain conversation
        gpt_response = converse(q, chat_session, api_key=api_key)

    # Record the exchange in the conversation history
    conversation.chat_session = message_to_prompt(q, chat_session, gpt_message=gpt_response)
    conversation.meta_log['chat'] = message_to_log(q, metadata, gpt_response, meta_log.get('chat', []))

    return {'status': 'ok', 'response': gpt_response}
def initialize_search(config: FullConfig, regenerate: bool, t: SearchType = None, device=torch.device("cpu")):
    """Set up the search models for the content types present in `config`.

    `t` restricts setup to a single search type; None sets up every configured
    type. The models are stored on the module-level `model`, which is returned.
    """
    def requested(search_type):
        "True when `t` selects this search type, or no type was given"
        return t is None or t == search_type

    # Org notes
    if requested(SearchType.Org) and config.content_type.org:
        model.orgmode_search = text_search.setup(
            org_to_jsonl,
            config.content_type.org,
            search_config=config.search_type.asymmetric,
            regenerate=regenerate,
            device=device,
            verbose=verbose)

    # Org music
    if requested(SearchType.Music) and config.content_type.music:
        model.music_search = text_search.setup(
            org_to_jsonl,
            config.content_type.music,
            search_config=config.search_type.asymmetric,
            regenerate=regenerate,
            device=device,
            verbose=verbose)

    # Markdown
    if requested(SearchType.Markdown) and config.content_type.markdown:
        model.markdown_search = text_search.setup(
            markdown_to_jsonl,
            config.content_type.markdown,
            search_config=config.search_type.asymmetric,
            regenerate=regenerate,
            device=device,
            verbose=verbose)

    # Panchayat
    if requested(SearchType.Panchayat) and config.content_type.panchayat:
        model.panchayat_search = text_search.setup(
            panchayat_to_jsonl,
            config.content_type.panchayat,
            search_config=config.search_type.asymmetric,
            regenerate=regenerate,
            device=device,
            verbose=verbose)

    # Ledger. Uses the symmetric search config and is not passed a device
    if requested(SearchType.Ledger) and config.content_type.ledger:
        model.ledger_search = text_search.setup(
            beancount_to_jsonl,
            config.content_type.ledger,
            search_config=config.search_type.symmetric,
            regenerate=regenerate,
            verbose=verbose)

    # Images
    if requested(SearchType.Image) and config.content_type.image:
        model.image_search = image_search.setup(
            config.content_type.image,
            search_config=config.search_type.image,
            regenerate=regenerate,
            verbose=verbose)

    return model
def initialize_processor(config: FullConfig):
    """Create the processor config for `config` and load its conversation logs.

    Returns None when `config` has no processor section. Otherwise returns a new
    ProcessorConfigModel whose conversation meta_log is read from the
    conversation logfile if it exists, or starts out empty if it does not.
    """
    if not config.processor:
        return

    processor_config = ProcessorConfigModel()

    # Conversation processor
    processor_config.conversation = ConversationProcessorConfigModel(config.processor.conversation, verbose)
    conversation = processor_config.conversation
    conversation_logfile = conversation.conversation_logfile

    if conversation.verbose:
        print('INFO:\tLoading conversation logs from disk...')

    logfile_exists = conversation_logfile.expanduser().absolute().is_file()
    if logfile_exists:
        # Restore the metadata logs of earlier conversations
        with open(get_absolute_path(conversation_logfile), 'r') as f:
            conversation.meta_log = json.load(f)
        print('INFO:\tConversation logs loaded from disk.')
    else:
        # Start with empty logs and an empty chat session
        conversation.meta_log = {}
        conversation.chat_session = ""

    return processor_config
@app.on_event('shutdown')
def shutdown_event():
    """Summarize the current chat session and save the conversation log on shutdown.

    Does nothing when there is no processor config, no conversation processor
    or an empty conversation log, so that no empty log file is created.
    """
    # No need to create empty log file
    if not (processor_config and processor_config.conversation and processor_config.conversation.meta_log):
        return
    elif processor_config.conversation.verbose:
        print('INFO:\tSaving conversation logs to disk...')

    # Summarize Conversation Logs for this Session
    chat_session = processor_config.conversation.chat_session
    openai_api_key = processor_config.conversation.openai_api_key
    conversation_log = processor_config.conversation.meta_log

    # A log without previous sessions (key missing or empty list) starts at 0
    previous_sessions = conversation_log.get("session") or [{"session-end": 0}]
    session = {
        "summary": summarize(chat_session, summary_type="chat", api_key=openai_api_key),
        "session-start": previous_sessions[-1]["session-end"],
        # A log without any chat messages yet has length 0 instead of raising KeyError
        "session-end": len(conversation_log.get("chat", []))
    }
    if conversation_log.get('session'):
        conversation_log['session'].append(session)
    else:
        conversation_log['session'] = [session]

    # Save Conversation Metadata Logs to Disk
    conversation_logfile = get_absolute_path(processor_config.conversation.conversation_logfile)
    with open(conversation_logfile, "w+", encoding='utf-8') as logfile:
        json.dump(conversation_log, logfile)

    print('INFO:\tConversation logs saved to disk.')
# Serve the files in the web interface directory under the /static URL prefix
app.mount("/static", StaticFiles(directory=constants.web_directory), name="static")
# Register the API routes declared on `router` with the application
app.include_router(router)
def run():
# Application entry point: parses the command line, then starts the server
# and, unless --no-gui is passed, the desktop GUI.
# NOTE(review): this span appears to interleave the pre- and post-change lines
# of a diff (the CLI is parsed twice in a row, and the server is started in
# more than one place). Confirm which statements are current before relying
# on the flow as written.
# Turn Tokenizers Parallelism Off. App does not support it.
os.environ["TOKENIZERS_PARALLELISM"] = 'false'
# Load config from CLI
args = cli(sys.argv[1:])
# Keep the raw CLI arguments on the shared state, then copy the parsed values there
state.cli_args = sys.argv[1:]
args = cli(state.cli_args)
set_state(args)
# Stores the file path to the config file.
global config_file
config_file = args.config_file
# Store the raw config data.
global config
config = args.config
# Store the verbose flag
global verbose
verbose = args.verbose
# Set device to GPU if available
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Initialize the search model from Config
global model
model = initialize_search(args.config, args.regenerate, device=device)
# Initialize Processor from Config
global processor_config
processor_config = initialize_processor(args.config)
# Start Application Server
if args.socket:
uvicorn.run(app, proxy_headers=True, uds=args.socket)
if args.no_gui:
# Start Server
configure_server(args, required=True)
start_server(app, host=args.host, port=args.port, socket=args.socket)
else:
uvicorn.run(app, host=args.host, port=args.port)
# Setup GUI
gui = QtWidgets.QApplication([])
main_window = MainWindow(args.config_file)
# System tray is only available on Windows, MacOS.
# On Linux (Gnome) the System tray is not supported.
# Since only the Main Window is available
# Quitting it should quit the application
if system() in ['Windows', 'Darwin']:
gui.setQuitOnLastWindowClosed(False)
tray = create_system_tray(gui, main_window)
tray.show()
# Setup Server
configure_server(args, required=False)
server = ServerThread(app, args.host, args.port, args.socket)
# Show Main Window on First Run Experience or if on Linux
if args.config is None or system() not in ['Windows', 'Darwin']:
main_window.show()
# Setup Signal Handlers
signal.signal(signal.SIGINT, sigint_handler)
# Invoke python Interpreter every 500ms to handle signals
timer = QTimer()
timer.start(500)
timer.timeout.connect(lambda: None)
# Start Application
server.start()
# Terminate the server thread when the GUI is about to quit
gui.aboutToQuit.connect(server.terminate)
# Close Splash Screen if still open
if system() != 'Darwin':
# Best effort: any failure to import or drive pyi_splash is ignored
try:
import pyi_splash
# Update the text on the splash screen
pyi_splash.update_text("Khoj setup complete")
# Close Splash Screen
pyi_splash.close()
except:
pass
# Enter the GUI event loop
gui.exec()
def sigint_handler(*args):
    "Signal handler that announces shutdown and quits the Qt application"
    shutdown_message = "\nShutting down Khoj..."
    print(shutdown_message)
    QtWidgets.QApplication.quit()
def set_state(args):
    "Copy the parsed command line arguments onto the shared application state"
    # Same fields, same order as assigning them one by one
    for field in ('config_file', 'config', 'verbose', 'host', 'port'):
        setattr(state, field, getattr(args, field))
def start_server(app, host=None, port=None, socket=None):
    "Serve `app` with uvicorn over the unix domain socket when one is given, else over host and port"
    if not socket:
        uvicorn.run(app, host=host, port=port)
        return

    uvicorn.run(app, proxy_headers=True, uds=socket)
class ServerThread(QThread):
    "Qt thread that runs the application server"

    def __init__(self, app, host=None, port=None, socket=None):
        super().__init__()
        # Remember what to serve and where to serve it
        self.app, self.host, self.port, self.socket = app, host, port, socket

    def __del__(self):
        # Wait on the thread before the object goes away
        self.wait()

    def run(self):
        "Thread body: start the server with the stored settings"
        start_server(self.app, host=self.host, port=self.port, socket=self.socket)
# Start the application when this file is executed as a script.
# NOTE(review): the repeated run() below looks like the pre- and post-change
# copies of one line from a diff rather than two intended calls; confirm.
if __name__ == '__main__':
run()
run()

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
# Standard Packages
import re
import json
import argparse
import pathlib
@ -71,14 +72,14 @@ def extract_org_entries(org_files):
return entries
def convert_org_entries_to_jsonl(entries, verbose=0):
def convert_org_entries_to_jsonl(entries, verbose=0) -> str:
"Convert each Org-Mode entries to JSON and collate as JSONL"
jsonl = ''
for entry in entries:
entry_dict = dict()
# Ignore title notes i.e notes with just headings and empty body
if not entry.Body() or entry.Body().strip(empty_escape_sequences) == "":
if not entry.Body() or re.sub(r'\n|\t|\r| ', '', entry.Body()) == "":
continue
entry_dict["compiled"] = f'{entry.Heading()}.'

View file

@ -38,6 +38,8 @@ import datetime
from pathlib import Path
from os.path import relpath
indent_regex = re.compile(r'^\s*')
def normalize_filename(filename):
file_relative_to_home = f'~/{relpath(filename, start=Path.home())}'
escaped_filename = f'{file_relative_to_home}'.replace("[","\[").replace("]","\]")
@ -370,7 +372,9 @@ class Orgnode(object):
n = ''
for _ in range(0, self.level):
n = n + '*'
n = n + ' ' + self.todo + ' '
n = n + ' '
if self.todo:
n = n + self.todo + ' '
if self.prty:
n = n + '[#' + self.prty + '] '
n = n + self.headline
@ -382,7 +386,12 @@ class Orgnode(object):
n = n + closecolon
n = n + "\n"
# Get body indentation from first line of body
indent = indent_regex.match(self.body).group()
# Output Closed Date, Scheduled Date, Deadline Date
if self.closed or self.scheduled or self.deadline:
n = n + indent
if self.closed:
n = n + f'CLOSED: [{self.closed.strftime("%Y-%m-%d %a")}] '
if self.scheduled:
@ -393,10 +402,10 @@ class Orgnode(object):
n = n + '\n'
# Ouput Property Drawer
n = n + ":PROPERTIES:\n"
n = n + indent + ":PROPERTIES:\n"
for key, value in self.properties.items():
n = n + f":{key}: {value}\n"
n = n + ":END:\n"
n = n + indent + f":{key}: {value}\n"
n = n + indent + ":END:\n"
n = n + self.body

223
src/router.py Normal file
View file

@ -0,0 +1,223 @@
# Standard Packages
import yaml
import json
import time
from typing import Optional
from functools import lru_cache
# External Packages
from fastapi import APIRouter
from fastapi import Request
from fastapi.responses import HTMLResponse, FileResponse
from fastapi.templating import Jinja2Templates
# Internal Packages
from src.configure import configure_search
from src.search_type import image_search, text_search
from src.processor.conversation.gpt import converse, extract_search_type, message_to_log, message_to_prompt, understand, summarize
from src.search_filter.explicit_filter import ExplicitFilter
from src.search_filter.date_filter import DateFilter
from src.utils.rawconfig import FullConfig
from src.utils.config import SearchType
from src.utils.helpers import get_absolute_path, get_from_dict
from src.utils import state, constants
# Router on which the endpoints in this module are registered
router = APIRouter()
# Page templates are looked up in the web interface directory
templates = Jinja2Templates(directory=constants.web_directory)
@router.get("/", response_class=FileResponse)
def index():
    "Serve index.html from the web interface directory"
    landing_page = constants.web_directory / "index.html"
    return FileResponse(landing_page)
@router.get('/config', response_class=HTMLResponse)
def config_page(request: Request):
    "Render the config.html template for the incoming request"
    template_context = {'request': request}
    return templates.TemplateResponse("config.html", context=template_context)
@router.get('/config/data', response_model=FullConfig)
def config_data():
    "Return the application config currently held in the shared state"
    current_config = state.config
    return current_config
@router.post('/config/data')
async def config_data(updated_config: FullConfig):
    """Replace the application config and write it to the config file.

    updated_config: the new config, already validated as a FullConfig
    Returns the config now held in the shared state.
    """
    state.config = updated_config
    # The `with` block closes the file itself, so no explicit close() is needed.
    # Written as UTF-8, the encoding save_config_to_file uses for config files.
    with open(state.config_file, 'w', encoding='utf-8') as outfile:
        # Round-trip through JSON so the model is dumped as plain types keyed by alias
        yaml.dump(yaml.safe_load(state.config.json(by_alias=True)), outfile)
    return state.config
@router.get('/search')
@lru_cache(maxsize=100)
def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Optional[bool] = False):
    """Run query `q` against the configured search types and return the collated results.

    q: the search query; an empty query returns an empty dict
    n: number of results to collate
    t: the search type to query; when None every configured search type is queried in turn
    r: passed to the text searches as `rank_results`

    Every search type that runs overwrites the results (and timings) of the one
    before it, so the results of the last search type that ran are returned.
    """
    if q is None or q == '':
        print(f'No query param (q) passed in API call to initiate search')
        return {}

    # Text search types in the order they are queried: the search type, the
    # attribute on state.model holding its search model, and the filter
    # classes applied to the query (in this order)
    text_search_types = (
        (SearchType.Org, 'orgmode_search', (DateFilter, ExplicitFilter)),
        (SearchType.Music, 'music_search', (DateFilter, ExplicitFilter)),
        (SearchType.Markdown, 'markdown_search', (ExplicitFilter, DateFilter)),
        (SearchType.Ledger, 'ledger_search', (ExplicitFilter, DateFilter)),
        (SearchType.Panchayat, 'panchayat_search', (ExplicitFilter, DateFilter)),
    )

    # initialize variables
    results = {}
    query_start = query_end = collate_start = collate_end = None

    for search_type, model_attribute, filter_classes in text_search_types:
        # Skip search types that were not asked for or are not configured
        if not ((t == search_type or t is None) and getattr(state.model, model_attribute)):
            continue

        # query the entries of this search type
        query_start = time.time()
        hits, entries = text_search.query(
            q,
            getattr(state.model, model_attribute),
            rank_results=r,
            filters=[filter_class() for filter_class in filter_classes],
            verbose=state.verbose)
        query_end = time.time()

        # collate and return results
        collate_start = time.time()
        results = text_search.collate_results(hits, entries, n)
        collate_end = time.time()

    if (t == SearchType.Image or t is None) and state.model.image_search:
        # query images
        query_start = time.time()
        hits = image_search.query(q, n, state.model.image_search)
        output_directory = constants.web_directory / 'images'
        query_end = time.time()

        # collate and return results
        collate_start = time.time()
        results = image_search.collate_results(
            hits,
            image_names=state.model.image_search.image_names,
            output_directory=output_directory,
            image_files_url='/static/images',
            count=n)
        collate_end = time.time()

    # Report timings of the last search that ran
    if state.verbose > 1:
        if query_start and query_end:
            print(f"Query took {query_end - query_start:.3f} seconds")
        if collate_start and collate_end:
            print(f"Collating results took {collate_end - collate_start:.3f} seconds")

    return results
@router.get('/reload')
def reload(t: Optional[SearchType] = None):
    """Reconfigure the search models from the current config without regenerating.

    t: the search type passed on to configure_search; None is passed through as is
    """
    state.model = configure_search(state.model, state.config, regenerate=False, t=t)
    # search() memoizes its results with lru_cache. Drop them so that repeated
    # queries are answered by the reloaded model instead of stale cached results.
    search.cache_clear()
    return {'status': 'ok', 'message': 'reload completed'}
@router.get('/regenerate')
def regenerate(t: Optional[SearchType] = None):
    """Reconfigure the search models from the current config with regenerate=True.

    t: the search type passed on to configure_search; None is passed through as is
    """
    state.model = configure_search(state.model, state.config, regenerate=True, t=t)
    # search() memoizes its results with lru_cache. Drop them so that repeated
    # queries are answered by the regenerated model instead of stale cached results.
    search.cache_clear()
    return {'status': 'ok', 'message': 'regeneration completed'}
@router.get('/beta/search')
def search_beta(q: str, n: Optional[int] = 1):
    """Search with the search type extracted from the query itself.

    q: the search query
    n: number of results to return
    """
    # Extract Search Type using GPT
    api_key = state.processor_config.conversation.openai_api_key
    metadata = extract_search_type(q, api_key=api_key, verbose=state.verbose)
    search_type = get_from_dict(metadata, "search-type")

    # Search, then return response
    return {
        'status': 'ok',
        'result': search(q, n=n, t=SearchType(search_type)),
        'type': search_type,
    }
@router.get('/beta/chat')
def chat(q: str):
    """Answer the message `q` and record the exchange in the conversation history.

    When the understood intent has memory-type "notes", the top org search
    result for the intent's query is summarized as the answer. Otherwise the
    answer comes from conversing with the chat session so far.
    """
    # Load Conversation History
    conversation = state.processor_config.conversation
    chat_session = conversation.chat_session
    meta_log = conversation.meta_log
    api_key = conversation.openai_api_key

    # Converse with OpenAI GPT
    metadata = understand(q, api_key=api_key, verbose=state.verbose)
    if state.verbose > 1:
        print(f'Understood: {get_from_dict(metadata, "intent")}')

    about_notes = get_from_dict(metadata, "intent", "memory-type") == "notes"
    if not about_notes:
        gpt_response = converse(q, chat_session, api_key=api_key)
    else:
        query = get_from_dict(metadata, "intent", "query")
        result_list = search(query, n=1, t=SearchType.Org)
        collated_result = "\n".join(item["entry"] for item in result_list)
        if state.verbose > 1:
            print(f'Semantically Similar Notes:\n{collated_result}')
        gpt_response = summarize(collated_result, summary_type="notes", user_query=q, api_key=api_key)

    # Update Conversation History
    conversation.chat_session = message_to_prompt(q, chat_session, gpt_message=gpt_response)
    conversation.meta_log['chat'] = message_to_log(q, metadata, gpt_response, meta_log.get('chat', []))

    return {'status': 'ok', 'response': gpt_response}
@router.on_event('shutdown')
def shutdown_event():
    """Summarize the current chat session and save the conversation log on shutdown.

    Does nothing when there is no processor config, no conversation processor
    or an empty conversation log, so that no empty log file is created.
    """
    # No need to create empty log file
    if not (state.processor_config and state.processor_config.conversation and state.processor_config.conversation.meta_log):
        return
    elif state.processor_config.conversation.verbose:
        print('INFO:\tSaving conversation logs to disk...')

    # Summarize Conversation Logs for this Session
    chat_session = state.processor_config.conversation.chat_session
    openai_api_key = state.processor_config.conversation.openai_api_key
    conversation_log = state.processor_config.conversation.meta_log

    # A log without previous sessions (key missing or empty list) starts at 0
    previous_sessions = conversation_log.get("session") or [{"session-end": 0}]
    session = {
        "summary": summarize(chat_session, summary_type="chat", api_key=openai_api_key),
        "session-start": previous_sessions[-1]["session-end"],
        # A log without any chat messages yet has length 0 instead of raising KeyError
        "session-end": len(conversation_log.get("chat", []))
    }
    if conversation_log.get('session'):
        conversation_log['session'].append(session)
    else:
        conversation_log['session'] = [session]

    # Save Conversation Metadata Logs to Disk
    conversation_logfile = get_absolute_path(state.processor_config.conversation.conversation_logfile)
    with open(conversation_logfile, "w+", encoding='utf-8') as logfile:
        json.dump(conversation_log, logfile)

    print('INFO:\tConversation logs saved to disk.')

View file

@ -211,9 +211,9 @@ def collate_results(hits, image_names, output_directory, image_files_url, count=
# Add the image metadata to the results
results += [{
"entry": f'{image_files_url}/{target_image_name}',
"score": f"{hit['score']:.3f}",
"image_score": f"{hit['image_score']:.3f}",
"metadata_score": f"{hit['metadata_score']:.3f}",
"score": f"{hit['score']:.9f}",
"image_score": f"{hit['image_score']:.9f}",
"metadata_score": f"{hit['metadata_score']:.9f}",
}]
return results

View file

@ -9,6 +9,7 @@ import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# Internal Packages
from src.utils import state
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
from src.utils.config import TextSearchModel
from src.utils.rawconfig import TextSearchConfig, TextContentConfig
@ -32,13 +33,15 @@ def initialize_model(search_config: TextSearchConfig):
bi_encoder = load_model(
model_dir = search_config.model_directory,
model_name = search_config.encoder,
model_type = SentenceTransformer)
model_type = SentenceTransformer,
device=f'{state.device}')
# The cross-encoder re-ranks the results to improve quality
cross_encoder = load_model(
model_dir = search_config.model_directory,
model_name = search_config.cross_encoder,
model_type = CrossEncoder)
model_type = CrossEncoder,
device=f'{state.device}')
return bi_encoder, cross_encoder, top_k
@ -50,17 +53,16 @@ def extract_entries(jsonl_file, verbose=0):
in load_jsonl(jsonl_file, verbose=verbose)]
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, device='cpu', verbose=0):
def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
"Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
# Load pre-computed embeddings from file if exists
if embeddings_file.exists() and not regenerate:
corpus_embeddings = torch.load(get_absolute_path(embeddings_file))
corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
if verbose > 0:
print(f"Loaded embeddings from {embeddings_file}")
else: # Else compute the corpus_embeddings from scratch, which can take a while
corpus_embeddings = bi_encoder.encode([entry['compiled'] for entry in entries], convert_to_tensor=True, show_progress_bar=True)
corpus_embeddings.to(device)
corpus_embeddings = bi_encoder.encode([entry['compiled'] for entry in entries], convert_to_tensor=True, device=state.device, show_progress_bar=True)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
torch.save(corpus_embeddings, embeddings_file)
if verbose > 0:
@ -69,7 +71,7 @@ def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, d
return corpus_embeddings
def query(raw_query: str, model: TextSearchModel, rank_results=False, device='cpu', filters: list = [], verbose=0):
def query(raw_query: str, model: TextSearchModel, rank_results=False, filters: list = [], verbose=0):
"Search for entries that answer the query"
query = raw_query
@ -99,19 +101,18 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False, device='cp
# Encode the query using the bi-encoder
start = time.time()
question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True)
question_embedding.to(device)
question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True, device=state.device)
question_embedding = util.normalize_embeddings(question_embedding)
end = time.time()
if verbose > 1:
print(f"Query Encode Time: {end - start:.3f} seconds")
print(f"Query Encode Time: {end - start:.3f} seconds on device: {state.device}")
# Find relevant entries for the query
start = time.time()
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=model.top_k, score_function=util.dot_score)[0]
end = time.time()
if verbose > 1:
print(f"Search Time: {end - start:.3f} seconds")
print(f"Search Time: {end - start:.3f} seconds on device: {state.device}")
# Score all retrieved entries using the cross-encoder
if rank_results:
@ -120,7 +121,7 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False, device='cp
cross_scores = model.cross_encoder.predict(cross_inp)
end = time.time()
if verbose > 1:
print(f"Cross-Encoder Predict Time: {end - start:.3f} seconds")
print(f"Cross-Encoder Predict Time: {end - start:.3f} seconds on device: {state.device}")
# Store cross-encoder scores in results dictionary for ranking
for idx in range(len(cross_scores)):
@ -133,7 +134,7 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False, device='cp
hits.sort(key=lambda x: x['cross-score'], reverse=True) # sort by cross-encoder score
end = time.time()
if verbose > 1:
print(f"Rank Time: {end - start:.3f} seconds")
print(f"Rank Time: {end - start:.3f} seconds on device: {state.device}")
return hits, entries
@ -166,7 +167,7 @@ def collate_results(hits, entries, count=5):
in hits[0:count]]
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, device='cpu', verbose: bool=False) -> TextSearchModel:
def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, verbose: bool=False) -> TextSearchModel:
# Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config)
@ -181,7 +182,7 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon
# Compute or Load Embeddings
config.embeddings_file = resolve_absolute_path(config.embeddings_file)
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, device=device, verbose=verbose)
corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate, verbose=verbose)
return TextSearchModel(entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose=verbose)

View file

@ -2,17 +2,15 @@
import argparse
import pathlib
# External Packages
import yaml
# Internal Packages
from src.utils.helpers import is_none_or_empty, get_absolute_path, resolve_absolute_path
from src.utils.rawconfig import FullConfig
from src.utils.helpers import resolve_absolute_path
from src.utils.yaml import parse_config_from_file
def cli(args=None):
# Setup Argument Parser for the Commandline Interface
parser = argparse.ArgumentParser(description="Start Khoj; A Natural Language Search Engine for your personal Notes, Transactions and Photos")
parser.add_argument('config_file', type=pathlib.Path, help="YAML file to configure Khoj")
parser.add_argument('--config-file', '-c', default='~/.khoj/khoj.yml', type=pathlib.Path, help="YAML file to configure Khoj")
parser.add_argument('--no-gui', action='store_true', default=False, help="Do not show native desktop GUI. Default: false")
parser.add_argument('--regenerate', action='store_true', default=False, help="Regenerate model embeddings from source files. Default: false")
parser.add_argument('--verbose', '-v', action='count', default=0, help="Show verbose conversion logs. Default: 0")
parser.add_argument('--host', type=str, default='127.0.0.1', help="Host address of the server. Default: 127.0.0.1")
@ -21,15 +19,12 @@ def cli(args=None):
args = parser.parse_args(args)
if not resolve_absolute_path(args.config_file).exists():
raise ValueError(f"Config file {args.config_file} does not exist")
# Normalize config_file path to absolute path
args.config_file = resolve_absolute_path(args.config_file)
# Read Config from YML file
config_from_file = None
with open(get_absolute_path(args.config_file), 'r', encoding='utf-8') as config_file:
config_from_file = yaml.safe_load(config_file)
if not args.config_file.exists():
args.config = None
else:
args.config = parse_config_from_file(args.config_file)
# Parse, Validate Config in YML file
args.config = FullConfig.parse_obj(config_from_file)
return args
return args

View file

@ -16,6 +16,10 @@ class SearchType(str, Enum):
Image = "image"
class ProcessorType(str, Enum):
    "Names of the processor types; each member is also a plain string"
    Conversation = "conversation"
class TextSearchModel():
def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose):
self.entries = entries

View file

@ -1 +1,64 @@
empty_escape_sequences = r'\n|\r\t '
from pathlib import Path
# Directory three levels above this file
app_root_directory = Path(__file__).parent.parent.parent
# Directory of the web interface files, relative to the app root
web_directory = app_root_directory / 'src/interface/web/'
# Characters that count as empty content.
# NOTE(review): this is written like a regex alternation, but the caller visible
# in this change passes it to str.strip, which treats it as a set of characters.
# Confirm the intended form.
empty_escape_sequences = r'\n|\r\t '
# Default app config to use.
# Top-level sections: what content to index ('content-type'), which models to
# search with ('search-type') and which processors to run ('processor').
default_config = {
    # Per content type: where its input comes from and where its index is stored
    'content-type': {
        'org': {
            'input-files': None,
            'input-filter': None,
            'compressed-jsonl': '~/.khoj/content/org/org.jsonl.gz',
            'embeddings-file': '~/.khoj/content/org/org_embeddings.pt',
        },
        'markdown': {
            'input-files': None,
            'input-filter': None,
            'compressed-jsonl': '~/.khoj/content/markdown/markdown.jsonl.gz',
            'embeddings-file': '~/.khoj/content/markdown/markdown_embeddings.pt',
        },
        'ledger': {
            'input-files': None,
            'input-filter': None,
            'compressed-jsonl': '~/.khoj/content/ledger/ledger.jsonl.gz',
            'embeddings-file': '~/.khoj/content/ledger/ledger_embeddings.pt',
        },
        'image': {
            'input-directories': None,
            'input-filter': None,
            'embeddings-file': '~/.khoj/content/image/image_embeddings.pt',
            'batch-size': 50,
            'use-xmp-metadata': False,
        },
        'music': {
            'input-files': None,
            'input-filter': None,
            'compressed-jsonl': '~/.khoj/content/music/music.jsonl.gz',
            'embeddings-file': '~/.khoj/content/music/music_embeddings.pt',
        },
    },
    # Per search type: the models to use and the directory they are kept in
    'search-type': {
        'symmetric': {
            'encoder': 'sentence-transformers/all-MiniLM-L6-v2',
            'cross-encoder': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
            'model_directory': '~/.khoj/search/symmetric/',
        },
        'asymmetric': {
            'encoder': 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
            'cross-encoder': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
            'model_directory': '~/.khoj/search/asymmetric/',
        },
        'image': {
            'encoder': 'sentence-transformers/clip-ViT-B-32',
            'model_directory': '~/.khoj/search/image/',
        },
    },
    # Per processor: its credentials and where it keeps its logs
    'processor': {
        'conversation': {
            'openai-api-key': None,
            'conversation-logfile': '~/.khoj/processor/conversation/conversation_logs.json',
        },
    },
}

View file

@ -1,10 +1,11 @@
# Standard Packages
import pathlib
import sys
from os.path import join
def is_none_or_empty(item):
# Reports whether `item` is None or is an object exposing __iter__ whose length is 0.
# NOTE(review): the two return statements below look like the pre- and
# post-change versions of one line from a diff; as written only the first can run.
return item == None or (hasattr(item, '__iter__') and len(item) == 0)
return item == None or (hasattr(item, '__iter__') and len(item) == 0) or item == ''
def to_snake_case_from_dash(item: str):
@ -40,18 +41,23 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
return merged_dict
# NOTE(review): this span appears to interleave the pre- and post-change lines
# of a diff: two signatures, two constructor calls per branch, two returns.
# The later line of each pair passes `device` on to the model constructor.
def load_model(model_name, model_dir, model_type):
def load_model(model_name, model_dir, model_type, device:str=None):
"Load model from disk or huggingface"
# Construct model path
# "/" in the model name is replaced by "_"; no path is built when model_dir is None
model_path = join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None
# Load model from model_path if it exists there
if model_path is not None and resolve_absolute_path(model_path).exists():
model = model_type(get_absolute_path(model_path))
model = model_type(get_absolute_path(model_path), device=device)
# Else load the model from the model_name
else:
model = model_type(model_name)
model = model_type(model_name, device=device)
# Save the freshly loaded model to model_path, when there is one
if model_path is not None:
model.save(model_path)
return model
return model
def is_pyinstaller_app():
    "Returns true if the app is running from Native GUI created by PyInstaller"
    frozen = getattr(sys, 'frozen', False)
    if not frozen:
        # Hand back the falsy `frozen` value itself, as a plain `and` would
        return frozen
    return hasattr(sys, '_MEIPASS')

28
src/utils/state.py Normal file
View file

@ -0,0 +1,28 @@
# Standard Packages
from packaging import version
# External Packages
import torch
from pathlib import Path
# Internal Packages
from src.utils.config import SearchModels, ProcessorConfigModel
from src.utils.rawconfig import FullConfig
# Application Global State
# Module-level values shared across the application; each starts out empty or unset here.
config = FullConfig()
model = SearchModels()
processor_config = ProcessorConfigModel()
# NOTE(review): annotated as Path but initialised with an empty string
config_file: Path = ""
verbose: int = 0
# NOTE(review): host and port are annotated str/int but start out as None
host: str = None
port: int = None
cli_args = None
# Pick the torch device: CUDA GPU first, then Apple Metal (MPS), else CPU
device = torch.device("cpu")
if torch.cuda.is_available():
    # Use CUDA GPU
    device = torch.device("cuda:0")
elif version.parse(torch.__version__) >= version.parse("1.13.0.dev") and torch.backends.mps.is_available():
    # Use Apple M1 Metal Acceleration
    # MPS availability is only checked on torch 1.13.0.dev or newer
    device = torch.device("mps")

38
src/utils/yaml.py Normal file
View file

@ -0,0 +1,38 @@
# Standard Packages
from pathlib import Path
# External Packages
import yaml
# Internal Packages
from src.utils.helpers import get_absolute_path, resolve_absolute_path
from src.utils.rawconfig import FullConfig
# Do not emit tags when dumping to YAML
# This replaces Emitter.process_tag with a no-op on the yaml emitter class itself,
# so it is not limited to calls made from this module.
yaml.emitter.Emitter.process_tag = lambda self, *args, **kwargs: None
def save_config_to_file(yaml_config: dict, yaml_config_file: Path):
    "Write config to YML file"
    # Create output directory, if it doesn't exist
    output_directory = yaml_config_file.parent
    output_directory.mkdir(parents=True, exist_ok=True)

    # Dump as UTF-8 and keep non-ASCII characters unescaped
    with open(yaml_config_file, mode='w', encoding='utf-8') as output_file:
        yaml.safe_dump(yaml_config, output_file, allow_unicode=True)
def load_config_from_file(yaml_config_file: Path) -> dict:
    "Read config from YML file"
    with open(yaml_config_file, 'r', encoding='utf-8') as input_file:
        return yaml.safe_load(input_file)
def parse_config_from_string(yaml_config: dict) -> FullConfig:
    "Parse and validate an already loaded config mapping into a FullConfig"
    parsed_config = FullConfig.parse_obj(yaml_config)
    return parsed_config
def parse_config_from_file(yaml_config_file):
    "Load the YML file, then parse and validate its content as config"
    raw_config = load_config_from_file(yaml_config_file)
    return parse_config_from_string(raw_config)

View file

@ -1,11 +1,11 @@
# Standard Packages
import pytest
import torch
# Internal Packages
from src.search_type import image_search, text_search
from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.utils import state
@pytest.fixture(scope='session')
@ -37,7 +37,6 @@ def search_config(tmp_path_factory):
@pytest.fixture(scope='session')
def model_dir(search_config):
model_dir = search_config.asymmetric.model_directory
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Generate Image Embeddings from Test Images
content_config = ContentConfig()
@ -56,7 +55,7 @@ def model_dir(search_config):
compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
embeddings_file = model_dir.joinpath('note_embeddings.pt'))
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, device=device, verbose=True)
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, verbose=True)
return model_dir
@ -73,7 +72,7 @@ def content_config(model_dir):
content_config.image = ImageContentConfig(
input_directories = ['tests/data/images'],
embeddings_file = model_dir.joinpath('image_embeddings.pt'),
batch_size = 10,
batch_size = 1,
use_xmp_metadata = False)
return content_config

View file

@ -7,34 +7,44 @@ import pytest
# Internal Packages
from src.utils.cli import cli
from src.utils.helpers import resolve_absolute_path
# Test
# ----------------------------------------------------------------------------------------------------
def test_cli_minimal_default():
# Checks the values the CLI parser returns for a minimal invocation.
# NOTE(review): the paired `actual_args = ...` and `config_file` lines below look
# like the pre- and post-change versions of the same statements from a diff.
# Act
actual_args = cli(['tests/data/config.yml'])
actual_args = cli([])
# Assert
assert actual_args.config_file == Path('tests/data/config.yml')
assert actual_args.config_file == resolve_absolute_path(Path('~/.khoj/khoj.yml'))
assert actual_args.regenerate == False
assert actual_args.no_gui == False
assert actual_args.verbose == 0
# ----------------------------------------------------------------------------------------------------
def test_cli_invalid_config_file_path():
# Checks how the CLI parser handles a config file path that does not exist.
# NOTE(review): the `pytest.raises(ValueError)` pair and the `-c=` call below look
# like the pre- and post-change versions of the Act step from a diff; the
# assertions after them expect a returned result with `config` set to None.
# Arrange
non_existent_config_file = f"non-existent-khoj-{random()}.yml"
# Act
with pytest.raises(ValueError):
cli([f"non-existent-khoj-{random()}.yml"])
actual_args = cli([f'-c={non_existent_config_file}'])
# Assert
assert actual_args.config_file == resolve_absolute_path(non_existent_config_file)
assert actual_args.config == None
# ----------------------------------------------------------------------------------------------------
def test_cli_config_from_file():
# Checks that flags and the content of the given config file end up on the parsed args.
# NOTE(review): the two `actual_args = cli([...` openers and the two `config_file`
# assertions below look like pre- and post-change versions from a diff.
# Act
actual_args = cli(['tests/data/config.yml',
actual_args = cli(['-c=tests/data/config.yml',
'--regenerate',
'--no-gui',
'-vvv'])
# Assert
assert actual_args.config_file == Path('tests/data/config.yml')
assert actual_args.config_file == resolve_absolute_path(Path('tests/data/config.yml'))
assert actual_args.no_gui == True
assert actual_args.regenerate == True
assert actual_args.config is not None
assert actual_args.config.content_type.org.input_files == [Path('~/first_from_config.org'), Path('~/second_from_config.org')]

View file

@ -7,7 +7,8 @@ from fastapi.testclient import TestClient
import pytest
# Internal Packages
from src.main import app, model, config
from src.main import app
from src.utils.state import model, config
from src.search_type import text_search, image_search
from src.utils.rawconfig import ContentConfig, SearchConfig
from src.processor.org_mode import org_to_jsonl
@ -37,7 +38,7 @@ def test_search_with_valid_content_type(content_config: ContentConfig, search_co
config.search_type = search_config
# config.content_type.image = search_config.image
for content_type in ["org", "markdown", "ledger", "music", "image"]:
for content_type in ["org", "markdown", "ledger", "music"]:
# Act
response = client.get(f"/search?q=random&t={content_type}")
# Assert
@ -59,7 +60,7 @@ def test_reload_with_valid_content_type(content_config: ContentConfig, search_co
config.content_type = content_config
config.search_type = search_config
for content_type in ["org", "markdown", "ledger", "music", "image"]:
for content_type in ["org", "markdown", "ledger", "music"]:
# Act
response = client.get(f"/reload?t={content_type}")
# Assert
@ -89,7 +90,6 @@ def test_regenerate_with_valid_content_type(content_config: ContentConfig, searc
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="Flaky test. Search doesn't always return expected image path.")
def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
config.content_type = content_config

View file

@ -6,7 +6,8 @@ from PIL import Image
import pytest
# Internal Packages
from src.main import model, web_directory
from src.utils.state import model
from src.utils.constants import web_directory
from src.search_type import image_search
from src.utils.helpers import resolve_absolute_path
from src.utils.rawconfig import ContentConfig, SearchConfig
@ -25,7 +26,6 @@ def test_image_search_setup(content_config: ContentConfig, search_config: Search
# ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="results inconsistent currently")
def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange
output_directory = resolve_absolute_path(web_directory)

View file

@ -0,0 +1,62 @@
# Standard Packages
import json
from posixpath import split
# Internal Packages
from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, extract_org_entries
from src.utils.helpers import is_none_or_empty
def test_entry_with_empty_body_line_to_jsonl(tmp_path):
    '''Ensure entries with empty body are ignored.
    Property drawers are not considered Body. Control characters are ignored when evaluating if Body is empty.'''
    # Arrange
    # Plain string literal: there are no placeholders, so no f-prefix is needed.
    # The body below the property drawer holds only a tab, carriage return and newlines.
    entry = '''*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t\r\n
'''
    orgfile = create_file(tmp_path, entry)

    # Act
    # Extract Entries from specified Org files
    entries = extract_org_entries(org_files=[orgfile])

    # Process Each Entry from All Notes Files
    jsonl_data = convert_org_entries_to_jsonl(entries)

    # Assert
    # No entry should have been serialized
    assert is_none_or_empty(jsonl_data)
def test_entry_with_body_to_jsonl(tmp_path):
    '''Ensure entries with valid body text are loaded.
    The body starts with control characters followed by one line of real text.'''
    # Arrange
    # Plain string literal: there are no placeholders, so no f-prefix is needed.
    entry = '''*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t\r\nBody Line 1\n
'''
    orgfile = create_file(tmp_path, entry)

    # Act
    # Extract Entries from specified Org files
    entries = extract_org_entries(org_files=[orgfile])

    # Process Each Entry from All Notes Files
    jsonl_string = convert_org_entries_to_jsonl(entries)
    # Each line of the output is parsed as one JSON document
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1
# Helper Functions
def create_file(tmp_path, entry, filename="test.org"):
    '''Write `entry` to `<tmp_path>/notes/<filename>` and return the path to that file.

    tmp_path: base directory as a pathlib.Path (e.g. pytest's tmp_path fixture)
    entry: text content to write into the file
    filename: name of the file to create inside the notes directory'''
    org_file = tmp_path / f"notes/{filename}"
    # exist_ok allows several files to be created under the same tmp_path;
    # a bare mkdir() raises FileExistsError on the second call.
    org_file.parent.mkdir(parents=True, exist_ok=True)
    # write_text creates the file if needed, so no separate touch() is required
    org_file.write_text(entry)
    return org_file

View file

@ -37,7 +37,7 @@ def test_parse_complete_entry(tmp_path):
"Test parsing of entry with all important fields"
# Arrange
entry = f'''
*** [#A] Heading :Tag1:TAG2:tag3:
*** DONE [#A] Heading :Tag1:TAG2:tag3:
CLOSED: [1984-04-01 Sun 12:00] SCHEDULED: <1984-04-01 Sun 09:00> DEADLINE: <1984-04-01 Sun>
:PROPERTIES:
:ID: 123-456-789-4234-1231
@ -56,6 +56,7 @@ Body Line 2'''
# Assert
assert len(entries) == 1
assert entries[0].Heading() == "Heading"
assert entries[0].Todo() == "DONE"
assert entries[0].Tags() == {"Tag1", "TAG2", "tag3"}
assert entries[0].Body() == "- Clocked Log 1\nBody Line 1\nBody Line 2"
assert entries[0].Priority() == "A"
@ -124,7 +125,7 @@ def test_parse_multiple_entries(tmp_path):
"Test parsing of multiple entries"
# Arrange
content = f'''
*** [#A] Heading1 :tag1:
*** FAILED [#A] Heading1 :tag1:
CLOSED: [1984-04-01 Sun 12:00] SCHEDULED: <1984-04-01 Sun 09:00> DEADLINE: <1984-04-01 Sun>
:PROPERTIES:
:ID: 123-456-789-4234-0001
@ -135,7 +136,7 @@ CLOCK: [1984-04-01 Sun 09:00]--[1984-04-01 Sun 12:00] => 3:00
:END:
Body 1
*** [#A] Heading2 :tag2:
*** CANCELLED [#A] Heading2 :tag2:
CLOSED: [1984-04-02 Sun 12:00] SCHEDULED: <1984-04-02 Sun 09:00> DEADLINE: <1984-04-02 Sun>
:PROPERTIES:
:ID: 123-456-789-4234-0002
@ -156,6 +157,7 @@ Body 2
assert len(entries) == 2
for index, entry in enumerate(entries):
assert entry.Heading() == f"Heading{index+1}"
# Parenthesize the expected value: unparenthesized, this parses as
# `(entry.Todo() == "FAILED") if index == 0 else "CANCELLED"`, which asserts a
# non-empty string (always truthy) for the second entry and so checks nothing.
assert entry.Todo() == ("FAILED" if index == 0 else "CANCELLED")
assert entry.Tags() == {f"tag{index+1}"}
assert entry.Body() == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
assert entry.Priority() == "A"

View file

@ -2,7 +2,7 @@
from pathlib import Path
# Internal Packages
from src.main import model
from src.utils.state import model
from src.search_type import text_search
from src.utils.rawconfig import ContentConfig, SearchConfig
from src.processor.org_mode.org_to_jsonl import org_to_jsonl