Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-23 23:48:56 +01:00

Merge branch 'master' into support-incremental-updates-of-embeddings

This commit is contained in: commit ebd5039bd1

20 changed files with 225 additions and 70 deletions

.github/workflows/release.yml (vendored): 6 changes
@@ -1,15 +1,15 @@
 name: release
 
 on:
+  push:
+    tags:
+      - v*
   workflow_dispatch:
     inputs:
       version:
         description: 'Version Number'
         required: true
         type: string
-  push:
-    tags:
-      - v*
 
 jobs:
   publish:
@@ -4,7 +4,7 @@ LABEL org.opencontainers.image.source https://github.com/debanjum/khoj
 
 # Install System Dependencies
 RUN apt-get update -y && \
-    apt-get -y install libimage-exiftool-perl
+    apt-get -y install libimage-exiftool-perl python3-pyqt5
 
 # Copy Application to Container
 COPY . /app
@@ -2,7 +2,6 @@
 [![build](https://github.com/debanjum/khoj/actions/workflows/build.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/build.yml)
 [![test](https://github.com/debanjum/khoj/actions/workflows/test.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/test.yml)
 [![publish](https://github.com/debanjum/khoj/actions/workflows/publish.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/publish.yml)
-[![release](https://github.com/debanjum/khoj/actions/workflows/release.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/release.yml)
 
 *A natural language search engine for your personal notes, transactions and images*
 

@@ -107,7 +106,7 @@ pip install --upgrade khoj-assistant
 ## Troubleshoot
 
 - Symptom: Errors out complaining about Tensors mismatch, null etc
-  - Mitigation: Disable `image` search on the desktop GUI
+  - Mitigation: Disable `image` search using the desktop GUI
 - Symptom: Errors out with "Killed" in error message in Docker
   - Fix: Increase RAM available to Docker Containers in Docker Settings
   - Refer: [StackOverflow Solution](https://stackoverflow.com/a/50770267), [Configure Resources on Docker for Mac](https://docs.docker.com/desktop/mac/#resources)

@@ -125,11 +124,12 @@ pip install --upgrade khoj-assistant
 
 - Semantic search using the bi-encoder is fairly fast at <50 ms
 - Reranking using the cross-encoder is slower at <2s on 15 results. Tweak `top_k` to tradeoff speed for accuracy of results
+- Filters in query (e.g by file, word or date) usually add <20ms to query latency
 
 ### Indexing performance
 
 - Indexing is more strongly impacted by the size of the source data
-- Indexing 100K+ line corpus of notes takes 6 minutes
+- Indexing 100K+ line corpus of notes takes about 10 minutes
 - Indexing 4000+ images takes about 15 minutes and more than 8Gb of RAM
 - Note: *It should only take this long on the first run* as the index is incrementally updated
 
@@ -20,11 +20,11 @@ content-type:
     compressed-jsonl: /data/embeddings/transactions.jsonl.gz
     embeddings-file: /data/embeddings/transaction_embeddings.pt
 
-  # image:
-  #   input-directories: ["/data/images/"]
-  #   embeddings-file: "/data/embeddings/image_embeddings.pt"
-  #   batch-size: 50
-  #   use-xmp-metadata: true
+  image:
+    input-directories: ["/data/images/"]
+    embeddings-file: "/data/embeddings/image_embeddings.pt"
+    batch-size: 50
+    use-xmp-metadata: false
 
   music:
     input-files: ["/data/music/music.org"]

@@ -26,4 +26,4 @@ services:
       - ./tests/data/embeddings/:/data/embeddings/
       - ./tests/data/models/:/data/models/
     # Use 0.0.0.0 to explicitly set the host ip for the service on the container. https://pythonspeed.com/articles/docker-connection-refused/
-    command: --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv
+    command: --no-gui --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv
setup.py: 2 changes

@@ -7,7 +7,7 @@ this_directory = Path(__file__).parent
 
 setup(
     name='khoj-assistant',
-    version='0.1.6',
+    version='0.1.9',
     description="A natural language search engine for your personal notes, transactions and images",
     long_description=(this_directory / "Readme.md").read_text(encoding="utf-8"),
     long_description_content_type="text/markdown",
@@ -92,7 +92,7 @@ class MainWindow(QtWidgets.QMainWindow):
             search_type_layout = QtWidgets.QVBoxLayout(search_type_settings)
             enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type)
             # Add file browser to set input files for given search type
-            input_files = FileBrowser(file_input_text, search_type, current_content_files)
+            input_files = FileBrowser(file_input_text, search_type, current_content_files or [])
 
             # Set enabled/disabled based on checkbox state
             enable_search_type.setChecked(current_content_files is not None and len(current_content_files) > 0)

@@ -5,7 +5,7 @@
 ;; Author: Debanjum Singh Solanky <debanjum@gmail.com>
 ;; Description: Natural, Incremental Search for your Second Brain
 ;; Keywords: search, org-mode, outlines, markdown, beancount, ledger, image
-;; Version: 0.1.6
+;; Version: 0.1.9
 ;; Package-Requires: ((emacs "27.1"))
 ;; URL: http://github.com/debanjum/khoj/interface/emacs
 
@@ -3,8 +3,12 @@ import os
 import signal
 import sys
 import logging
+import warnings
 from platform import system
 
+# Ignore non-actionable warnings
+warnings.filterwarnings("ignore", message=r'snapshot_download.py has been made private', category=FutureWarning)
+
 # External Packages
 import uvicorn
 from fastapi import FastAPI
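
Aside: a minimal, self-contained sketch of the message-based warning filter used above; standard library only, and the message pattern is matched against the start of the warning text.

    import warnings

    # Hide only the known, non-actionable FutureWarning by matching its message prefix
    warnings.filterwarnings("ignore", message=r"snapshot_download.py has been made private", category=FutureWarning)

    # A warning whose message starts with that prefix is silently dropped
    warnings.warn("snapshot_download.py has been made private, use hf_hub_download instead", FutureWarning)
    # Unrelated warnings still surface
    warnings.warn("some other deprecation", FutureWarning)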

@@ -63,6 +67,9 @@ def run():
     args = cli(state.cli_args)
     set_state(args)
 
+    # Create app directory, if it doesn't exist
+    state.config_file.parent.mkdir(parents=True, exist_ok=True)
+
     # Setup Logger
     if args.verbose == 0:
         logger.setLevel(logging.WARN)
@@ -41,8 +41,13 @@ from os.path import relpath
 indent_regex = re.compile(r'^\s*')
 
 def normalize_filename(filename):
-    file_relative_to_home = f'~/{relpath(filename, start=Path.home())}'
-    escaped_filename = f'{file_relative_to_home}'.replace("[","\[").replace("]","\]")
+    "Normalize and escape filename for rendering"
+    if not Path(filename).is_absolute():
+        # Normalize relative filename to be relative to current directory
+        normalized_filename = f'~/{relpath(filename, start=Path.home())}'
+    else:
+        normalized_filename = filename
+    escaped_filename = f'{normalized_filename}'.replace("[","\[").replace("]","\]")
     return escaped_filename
 
 def makelist(filename):

@@ -61,7 +66,7 @@ def makelist(filename):
 
     todos = { "TODO": "", "WAITING": "", "ACTIVE": "",
               "DONE": "", "CANCELLED": "", "FAILED": ""} # populated from #+SEQ_TODO line
-    level = 0
+    level = ""
     heading = ""
     bodytext = ""
     tags = list() # set of all tags in headline

@@ -73,6 +78,7 @@ def makelist(filename):
     propdict = dict()
     in_properties_drawer = False
     in_logbook_drawer = False
+    file_title = f'{filename}'
 
     for line in f:
         ctr += 1
@@ -111,6 +117,16 @@ def makelist(filename):
            kwlist = re.findall(r'([A-Z]+)\(', line)
            for kw in kwlist: todos[kw] = ""
 
+        # Set file title to TITLE property, if it exists
+        title_search = re.search(r'^#\+TITLE:\s*(.*)$', line)
+        if title_search and title_search.group(1).strip() != '':
+            title_text = title_search.group(1).strip()
+            if file_title == f'{filename}':
+                file_title = title_text
+            else:
+                file_title += f' {title_text}'
+            continue
+
        # Ignore Properties Drawers Completely
        if re.search(':PROPERTIES:', line):
            in_properties_drawer=True
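
Aside: a hypothetical use of the new `#+TITLE` handling, mirroring the tests added further below; the file path here is purely illustrative.

    # Sketch: a headless org file whose #+TITLE becomes the heading of its single entry
    from src.processor.org_mode import orgnode

    with open("/tmp/titled.org", "w") as f:
        f.write("#+TITLE: Weekly Plan\nBody Line 1\n")

    entries = orgnode.makelist("/tmp/titled.org")
    print(entries[0].Heading())  # expected: Weekly Plan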

@@ -167,7 +183,7 @@ def makelist(filename):
            bodytext = bodytext + line
 
    # write out last node
-    thisNode = Orgnode(level, heading, bodytext, tags)
+    thisNode = Orgnode(level, heading or file_title, bodytext, tags)
    thisNode.setProperties(propdict)
    if sched_date:
        thisNode.setScheduled(sched_date)

@@ -196,6 +212,10 @@ def makelist(filename):
            n.setHeading(prtysrch.group(2))
 
        # Set SOURCE property to a file+heading based org-mode link to the entry
-        escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]")
-        n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]'
+        if n.Level() == 0:
+            n.properties['LINE'] = f'file:{normalize_filename(filename)}::0'
+            n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}]]'
+        else:
+            escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]")
+            n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]'
@@ -27,7 +27,7 @@ class WordFilter(BaseFilter):
 
     def load(self, entries, regenerate=False):
         start = time.time()
-        self.cache = {} # Clear cache on reload of filter
+        self.cache = {} # Clear cache on filter (re-)load
         entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
         # Create map of words to entries they exist in
         for entry_index, entry in enumerate(entries):

@@ -11,7 +11,7 @@ from src.search_filter.base_filter import BaseFilter
 
 # Internal Packages
 from src.utils import state
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
+from src.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import TextSearchConfig, TextContentConfig
 from src.utils.jsonl import load_jsonl

@@ -187,6 +187,8 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon
 
     # Extract Updated Entries
     entries = extract_entries(config.compressed_jsonl)
+    if is_none_or_empty(entries):
+        raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}")
     top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus
 
     # Compute or Load Embeddings
@@ -1,6 +1,7 @@
 # Standard Packages
 import argparse
 import pathlib
+from importlib.metadata import version
 
 # Internal Packages
 from src.utils.helpers import resolve_absolute_path

@@ -17,9 +18,15 @@ def cli(args=None):
     parser.add_argument('--host', type=str, default='127.0.0.1', help="Host address of the server. Default: 127.0.0.1")
     parser.add_argument('--port', '-p', type=int, default=8000, help="Port of the server. Default: 8000")
     parser.add_argument('--socket', type=pathlib.Path, help="Path to UNIX socket for server. Use to run server behind reverse proxy. Default: /tmp/uvicorn.sock")
+    parser.add_argument('--version', '-V', action='store_true', help="Print the installed Khoj version and exit")
 
     args = parser.parse_args(args)
 
+    if args.version:
+        # Show version of khoj installed and exit
+        print(version('khoj-assistant'))
+        exit(0)
+
     # Normalize config_file path to absolute path
     args.config_file = resolve_absolute_path(args.config_file)
 
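
Aside: a small usage sketch of the new flag; hypothetical invocation of the `cli` helper shown above. It prints the installed khoj-assistant version (0.1.9 per setup.py above) and exits with status 0; the same check works from a shell by passing --version to the installed console script.

    from src.utils.cli import cli

    # Prints the installed khoj-assistant version, then exits via exit(0)
    cli(["--version"])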
@@ -44,7 +44,7 @@ def dump_jsonl(jsonl_data, output_path):
     with open(output_path, 'w', encoding='utf-8') as f:
         f.write(jsonl_data)
 
-    logger.info(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
+    logger.info(f'Wrote jsonl data to {output_path}')
 
 
 def compress_jsonl_data(jsonl_data, output_path):
@@ -4,6 +4,7 @@ import pytest
 # Internal Packages
 from src.search_type import image_search, text_search
 from src.utils.config import SearchType
+from src.utils.helpers import resolve_absolute_path
 from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.search_filter.date_filter import DateFilter

@@ -12,41 +13,41 @@ from src.search_filter.file_filter import FileFilter
 
 
 @pytest.fixture(scope='session')
-def search_config(tmp_path_factory) -> SearchConfig:
-    model_dir = tmp_path_factory.mktemp('data')
+def search_config() -> SearchConfig:
+    model_dir = resolve_absolute_path('~/.khoj/search')
+    model_dir.mkdir(parents=True, exist_ok=True)
     search_config = SearchConfig()
 
     search_config.symmetric = TextSearchConfig(
         encoder = "sentence-transformers/all-MiniLM-L6-v2",
         cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
-        model_directory = model_dir
+        model_directory = model_dir / 'symmetric/'
     )
 
     search_config.asymmetric = TextSearchConfig(
         encoder = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1",
         cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
-        model_directory = model_dir
+        model_directory = model_dir / 'asymmetric/'
     )
 
     search_config.image = ImageSearchConfig(
         encoder = "sentence-transformers/clip-ViT-B-32",
-        model_directory = model_dir
+        model_directory = model_dir / 'image/'
     )
 
     return search_config
 
 
 @pytest.fixture(scope='session')
-def model_dir(search_config: SearchConfig):
-    model_dir = search_config.asymmetric.model_directory
+def content_config(tmp_path_factory, search_config: SearchConfig):
+    content_dir = tmp_path_factory.mktemp('content')
 
     # Generate Image Embeddings from Test Images
     content_config = ContentConfig()
     content_config.image = ImageContentConfig(
         input_directories = ['tests/data/images'],
-        embeddings_file = model_dir.joinpath('image_embeddings.pt'),
-        batch_size = 10,
+        embeddings_file = content_dir.joinpath('image_embeddings.pt'),
+        batch_size = 1,
         use_xmp_metadata = False)
 
     image_search.setup(content_config.image, search_config.image, regenerate=False)

@@ -55,28 +56,10 @@ def model_dir(search_config: SearchConfig):
     content_config.org = TextContentConfig(
         input_files = None,
         input_filter = 'tests/data/org/*.org',
-        compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('note_embeddings.pt'))
+        compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'),
+        embeddings_file = content_dir.joinpath('note_embeddings.pt'))
 
     filters = [DateFilter(), WordFilter(), FileFilter()]
     text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
 
-    return model_dir
-
-
-@pytest.fixture(scope='session')
-def content_config(model_dir) -> ContentConfig:
-    content_config = ContentConfig()
-    content_config.org = TextContentConfig(
-        input_files = None,
-        input_filter = 'tests/data/org/*.org',
-        compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
-        embeddings_file = model_dir.joinpath('note_embeddings.pt'))
-
-    content_config.image = ImageContentConfig(
-        input_directories = ['tests/data/images'],
-        embeddings_file = model_dir.joinpath('image_embeddings.pt'),
-        batch_size = 1,
-        use_xmp_metadata = False)
-
     return content_config
@@ -2,9 +2,6 @@
 from pathlib import Path
 from random import random
 
-# External Modules
-import pytest
-
 # Internal Packages
 from src.utils.cli import cli
 from src.utils.helpers import resolve_absolute_path
@@ -48,8 +48,13 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
         image_files_url='/static/images',
         count=1)
 
-    actual_image = Image.open(output_directory.joinpath(Path(results[0]["entry"]).name))
+    actual_image_path = output_directory.joinpath(Path(results[0]["entry"]).name)
+    actual_image = Image.open(actual_image_path)
     expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))
 
     # Assert
     assert expected_image == actual_image
+
+    # Cleanup
+    # Delete the image files copied to results directory
+    actual_image_path.unlink()
@@ -1,6 +1,5 @@
 # Standard Packages
 import json
-from posixpath import split
 
 # Internal Packages
 from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries

@@ -15,7 +14,7 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
 :PROPERTIES:
 :ID: 42-42-42
 :END:
-\t\r\n
+\t\r
 '''
     orgfile = create_file(tmp_path, entry)
 

@@ -38,7 +37,29 @@ def test_entry_with_body_to_jsonl(tmp_path):
 :PROPERTIES:
 :ID: 42-42-42
 :END:
-\t\r\nBody Line 1\n
+\t\r
+Body Line 1
+'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Org files
+    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
+
+    # Process Each Entry from All Notes Files
+    jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+
+
+def test_file_with_no_headings_to_jsonl(tmp_path):
+    "Ensure files with no heading, only body text are loaded."
+    # Arrange
+    entry = f'''
+- Bullet point 1
+- Bullet point 2
 '''
     orgfile = create_file(tmp_path, entry)
 
@@ -8,6 +8,28 @@ from src.processor.org_mode import orgnode
 
 
 # Test
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_no_headings(tmp_path):
+    "Test parsing of entry with minimal fields"
+    # Arrange
+    entry = f'''Body Line 1'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == f'{orgfile}'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_parse_minimal_entry(tmp_path):
     "Test parsing of entry with minimal fields"
@@ -81,18 +103,17 @@ Body Line 1
 Body Line 2
 '''
     orgfile = create_file(tmp_path, entry)
-    normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}'
 
     # Act
     entries = orgnode.makelist(orgfile)
 
     # Assert
     # SOURCE link rendered with Heading
-    assert f':SOURCE: [[file:{normalized_orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
+    assert f':SOURCE: [[file:{orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}'
     # ID link rendered with ID
     assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}'
     # LINE link rendered with line number
-    assert f':LINE: file:{normalized_orgfile}::2' in f'{entries[0]}'
+    assert f':LINE: file:{orgfile}::2' in f'{entries[0]}'
 
 
 # ----------------------------------------------------------------------------------------------------
@@ -115,8 +136,7 @@ Body Line 1'''
     # parsed heading from entry
     assert entries[0].Heading() == "Heading[1]"
     # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
-    normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}'
-    escaped_orgfile = f'{normalized_orgfile}'.replace("[1]", "\\[1\\]")
+    escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]")
     assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}'
 
 
@@ -168,6 +188,76 @@ Body 2
         assert entry.Logbook() == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))]
 
 
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_empty_title(tmp_path):
+    "Test parsing of entry with minimal fields"
+    # Arrange
+    entry = f'''#+TITLE:
+Body Line 1'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == f'{orgfile}'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_title_and_no_headings(tmp_path):
+    "Test parsing of entry with minimal fields"
+    # Arrange
+    entry = f'''#+TITLE: test
+Body Line 1'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == 'test'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path):
+    "Test parsing of entry with minimal fields"
+    # Arrange
+    entry = f'''#+TITLE: title1
+Body Line 1
+#+TITLE: title2 '''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    entries = orgnode.makelist(orgfile)
+
+    # Assert
+    assert len(entries) == 1
+    assert entries[0].Heading() == 'title1 title2'
+    assert entries[0].Tags() == list()
+    assert entries[0].Body() == "Body Line 1\n"
+    assert entries[0].Priority() == ""
+    assert entries[0].Property("ID") == ""
+    assert entries[0].Closed() == ""
+    assert entries[0].Scheduled() == ""
+    assert entries[0].Deadline() == ""
+
+
 # Helper Functions
 def create_file(tmp_path, entry, filename="test.org"):
     org_file = tmp_path / f"notes/{filename}"
@@ -1,6 +1,10 @@
 # System Packages
+from copy import deepcopy
 from pathlib import Path
 
+# External Packages
+import pytest
+
 # Internal Packages
 from src.utils.state import model
 from src.search_type import text_search

@@ -9,6 +13,25 @@ from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 
 
 # Test
+# ----------------------------------------------------------------------------------------------------
+def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentConfig, search_config: SearchConfig):
+    # Arrange
+    file_to_index = Path(content_config.org.input_filter).parent / "new_file_to_index.org"
+    file_to_index.touch()
+    new_org_content_config = deepcopy(content_config.org)
+    new_org_content_config.input_files = [f'{file_to_index}']
+    new_org_content_config.input_filter = None
+
+    # Act
+    # Generate notes embeddings during asymmetric setup
+    with pytest.raises(ValueError, match=r'^No valid entries found*'):
+        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
+
+    # Cleanup
+    # delete created test file
+    file_to_index.unlink()
+
+
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
     # Act

@@ -23,7 +46,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
     query = "How to git install application?"
 
     # Act