diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a664f12e..9d642f2b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,15 +1,15 @@ name: release on: + push: + tags: + - v* workflow_dispatch: inputs: version: description: 'Version Number' required: true type: string - push: - tags: - - v* jobs: publish: diff --git a/Dockerfile b/Dockerfile index d38a39af..e1cd9321 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL org.opencontainers.image.source https://github.com/debanjum/khoj # Install System Dependencies RUN apt-get update -y && \ - apt-get -y install libimage-exiftool-perl + apt-get -y install libimage-exiftool-perl python3-pyqt5 # Copy Application to Container COPY . /app diff --git a/Readme.md b/Readme.md index 18158585..c47e07a5 100644 --- a/Readme.md +++ b/Readme.md @@ -2,7 +2,6 @@ [![build](https://github.com/debanjum/khoj/actions/workflows/build.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/build.yml) [![test](https://github.com/debanjum/khoj/actions/workflows/test.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/test.yml) [![publish](https://github.com/debanjum/khoj/actions/workflows/publish.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/publish.yml) -[![release](https://github.com/debanjum/khoj/actions/workflows/release.yml/badge.svg)](https://github.com/debanjum/khoj/actions/workflows/release.yml) *A natural language search engine for your personal notes, transactions and images* @@ -107,7 +106,7 @@ pip install --upgrade khoj-assistant ## Troubleshoot - Symptom: Errors out complaining about Tensors mismatch, null etc - - Mitigation: Disable `image` search on the desktop GUI + - Mitigation: Disable `image` search using the desktop GUI - Symptom: Errors out with \"Killed\" in error message in Docker - Fix: Increase RAM available to Docker Containers in Docker Settings - Refer: [StackOverflow 
Solution](https://stackoverflow.com/a/50770267), [Configure Resources on Docker for Mac](https://docs.docker.com/desktop/mac/#resources) @@ -125,11 +124,12 @@ pip install --upgrade khoj-assistant - Semantic search using the bi-encoder is fairly fast at \<50 ms - Reranking using the cross-encoder is slower at \<2s on 15 results. Tweak `top_k` to tradeoff speed for accuracy of results +- Filters in query (e.g. by file, word or date) usually add \<20ms to query latency ### Indexing performance - Indexing is more strongly impacted by the size of the source data -- Indexing 100K+ line corpus of notes takes 6 minutes +- Indexing 100K+ line corpus of notes takes about 10 minutes - Indexing 4000+ images takes about 15 minutes and more than 8Gb of RAM - Note: *It should only take this long on the first run* as the index is incrementally updated diff --git a/config/khoj_docker.yml b/config/khoj_docker.yml index b1ef7d5f..7e3fe8b4 100644 --- a/config/khoj_docker.yml +++ b/config/khoj_docker.yml @@ -20,11 +20,11 @@ content-type: compressed-jsonl: /data/embeddings/transactions.jsonl.gz embeddings-file: /data/embeddings/transaction_embeddings.pt -# image: -# input-directories: ["/data/images/"] -# embeddings-file: "/data/embeddings/image_embeddings.pt" -# batch-size: 50 -# use-xmp-metadata: true + image: + input-directories: ["/data/images/"] + embeddings-file: "/data/embeddings/image_embeddings.pt" + batch-size: 50 + use-xmp-metadata: false music: input-files: ["/data/music/music.org"] diff --git a/docker-compose.yml b/docker-compose.yml index 022463f2..42f2e617 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,4 +26,4 @@ services: - ./tests/data/embeddings/:/data/embeddings/ - ./tests/data/models/:/data/models/ # Use 0.0.0.0 to explicitly set the host ip for the service on the container. 
https://pythonspeed.com/articles/docker-connection-refused/ - command: --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv + command: --no-gui --host="0.0.0.0" --port=8000 -c=config/khoj_docker.yml -vv diff --git a/setup.py b/setup.py index bf9f3ce9..dc0ba0f3 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ this_directory = Path(__file__).parent setup( name='khoj-assistant', - version='0.1.6', + version='0.1.9', description="A natural language search engine for your personal notes, transactions and images", long_description=(this_directory / "Readme.md").read_text(encoding="utf-8"), long_description_content_type="text/markdown", diff --git a/src/interface/desktop/main_window.py b/src/interface/desktop/main_window.py index 97bbb54a..25287d27 100644 --- a/src/interface/desktop/main_window.py +++ b/src/interface/desktop/main_window.py @@ -92,7 +92,7 @@ class MainWindow(QtWidgets.QMainWindow): search_type_layout = QtWidgets.QVBoxLayout(search_type_settings) enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type) # Add file browser to set input files for given search type - input_files = FileBrowser(file_input_text, search_type, current_content_files) + input_files = FileBrowser(file_input_text, search_type, current_content_files or []) # Set enabled/disabled based on checkbox state enable_search_type.setChecked(current_content_files is not None and len(current_content_files) > 0) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index 5ab00d0f..c34fb88e 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -5,7 +5,7 @@ ;; Author: Debanjum Singh Solanky ;; Description: Natural, Incremental Search for your Second Brain ;; Keywords: search, org-mode, outlines, markdown, beancount, ledger, image -;; Version: 0.1.6 +;; Version: 0.1.9 ;; Package-Requires: ((emacs "27.1")) ;; URL: http://github.com/debanjum/khoj/interface/emacs diff --git a/src/main.py b/src/main.py index 72a05884..e7dbba5d 100644 
--- a/src/main.py +++ b/src/main.py @@ -3,8 +3,12 @@ import os import signal import sys import logging +import warnings from platform import system +# Ignore non-actionable warnings +warnings.filterwarnings("ignore", message=r'snapshot_download.py has been made private', category=FutureWarning) + # External Packages import uvicorn from fastapi import FastAPI @@ -63,6 +67,9 @@ def run(): args = cli(state.cli_args) set_state(args) + # Create app directory, if it doesn't exist + state.config_file.parent.mkdir(parents=True, exist_ok=True) + # Setup Logger if args.verbose == 0: logger.setLevel(logging.WARN) diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py index 433a367f..31cedbb9 100644 --- a/src/processor/org_mode/orgnode.py +++ b/src/processor/org_mode/orgnode.py @@ -41,8 +41,13 @@ from os.path import relpath indent_regex = re.compile(r'^\s*') def normalize_filename(filename): - file_relative_to_home = f'~/{relpath(filename, start=Path.home())}' - escaped_filename = f'{file_relative_to_home}'.replace("[","\[").replace("]","\]") + "Normalize and escape filename for rendering" + if not Path(filename).is_absolute(): + # Render relative filename as a path relative to the user's home directory (~/...) + normalized_filename = f'~/{relpath(filename, start=Path.home())}' + else: + normalized_filename = filename + escaped_filename = f'{normalized_filename}'.replace("[","\[").replace("]","\]") return escaped_filename def makelist(filename): @@ -61,7 +66,7 @@ def makelist(filename): todos = { "TODO": "", "WAITING": "", "ACTIVE": "", "DONE": "", "CANCELLED": "", "FAILED": ""} # populated from #+SEQ_TODO line - level = 0 + level = "" heading = "" bodytext = "" tags = list() # set of all tags in headline @@ -73,6 +78,7 @@ def makelist(filename): propdict = dict() in_properties_drawer = False in_logbook_drawer = False + file_title = f'{filename}' for line in f: ctr += 1 @@ -111,6 +117,16 @@ def makelist(filename): kwlist = re.findall(r'([A-Z]+)\(', line) for kw in 
kwlist: todos[kw] = "" + # Set file title to TITLE property, if it exists + title_search = re.search(r'^#\+TITLE:\s*(.*)$', line) + if title_search and title_search.group(1).strip() != '': + title_text = title_search.group(1).strip() + if file_title == f'{filename}': + file_title = title_text + else: + file_title += f' {title_text}' + continue + # Ignore Properties Drawers Completely if re.search(':PROPERTIES:', line): in_properties_drawer=True @@ -167,7 +183,7 @@ def makelist(filename): bodytext = bodytext + line # write out last node - thisNode = Orgnode(level, heading, bodytext, tags) + thisNode = Orgnode(level, heading or file_title, bodytext, tags) thisNode.setProperties(propdict) if sched_date: thisNode.setScheduled(sched_date) @@ -196,8 +212,12 @@ def makelist(filename): n.setHeading(prtysrch.group(2)) # Set SOURCE property to a file+heading based org-mode link to the entry - escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]") - n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]' + if n.Level() == 0: + n.properties['LINE'] = f'file:{normalize_filename(filename)}::0' + n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}]]' + else: + escaped_heading = n.Heading().replace("[","\\[").replace("]","\\]") + n.properties['SOURCE'] = f'[[file:{normalize_filename(filename)}::*{escaped_heading}]]' return nodelist diff --git a/src/search_filter/word_filter.py b/src/search_filter/word_filter.py index 6fe0b31e..dd376cbb 100644 --- a/src/search_filter/word_filter.py +++ b/src/search_filter/word_filter.py @@ -27,7 +27,7 @@ class WordFilter(BaseFilter): def load(self, entries, regenerate=False): start = time.time() - self.cache = {} # Clear cache on reload of filter + self.cache = {} # Clear cache on filter (re-)load entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\'' # Create map of words to entries they exist in for entry_index, entry in 
enumerate(entries): diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index b5e647e2..238c4736 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -11,7 +11,7 @@ from src.search_filter.base_filter import BaseFilter # Internal Packages from src.utils import state -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model +from src.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model from src.utils.config import TextSearchModel from src.utils.rawconfig import TextSearchConfig, TextContentConfig from src.utils.jsonl import load_jsonl @@ -187,6 +187,8 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon # Extract Updated Entries entries = extract_entries(config.compressed_jsonl) + if is_none_or_empty(entries): + raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}") top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus # Compute or Load Embeddings diff --git a/src/utils/cli.py b/src/utils/cli.py index a140bb46..1f66a757 100644 --- a/src/utils/cli.py +++ b/src/utils/cli.py @@ -1,6 +1,7 @@ # Standard Packages import argparse import pathlib +from importlib.metadata import version # Internal Packages from src.utils.helpers import resolve_absolute_path @@ -17,9 +18,15 @@ def cli(args=None): parser.add_argument('--host', type=str, default='127.0.0.1', help="Host address of the server. Default: 127.0.0.1") parser.add_argument('--port', '-p', type=int, default=8000, help="Port of the server. Default: 8000") parser.add_argument('--socket', type=pathlib.Path, help="Path to UNIX socket for server. Use to run server behind reverse proxy. 
Default: /tmp/uvicorn.sock") + parser.add_argument('--version', '-V', action='store_true', help="Print the installed Khoj version and exit") args = parser.parse_args(args) + if args.version: + # Show version of khoj installed and exit + print(version('khoj-assistant')) + exit(0) + # Normalize config_file path to absolute path args.config_file = resolve_absolute_path(args.config_file) diff --git a/src/utils/jsonl.py b/src/utils/jsonl.py index 91eac555..77b5af11 100644 --- a/src/utils/jsonl.py +++ b/src/utils/jsonl.py @@ -44,7 +44,7 @@ def dump_jsonl(jsonl_data, output_path): with open(output_path, 'w', encoding='utf-8') as f: f.write(jsonl_data) - logger.info(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}') + logger.info(f'Wrote jsonl data to {output_path}') def compress_jsonl_data(jsonl_data, output_path): diff --git a/tests/conftest.py b/tests/conftest.py index ab2703da..2a725919 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import pytest # Internal Packages from src.search_type import image_search, text_search from src.utils.config import SearchType +from src.utils.helpers import resolve_absolute_path from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig from src.processor.org_mode.org_to_jsonl import org_to_jsonl from src.search_filter.date_filter import DateFilter @@ -12,41 +13,41 @@ from src.search_filter.file_filter import FileFilter @pytest.fixture(scope='session') -def search_config(tmp_path_factory) -> SearchConfig: - model_dir = tmp_path_factory.mktemp('data') - +def search_config() -> SearchConfig: + model_dir = resolve_absolute_path('~/.khoj/search') + model_dir.mkdir(parents=True, exist_ok=True) search_config = SearchConfig() search_config.symmetric = TextSearchConfig( encoder = "sentence-transformers/all-MiniLM-L6-v2", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", - model_directory = model_dir + model_directory = model_dir / 
'symmetric/' ) search_config.asymmetric = TextSearchConfig( encoder = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", - model_directory = model_dir + model_directory = model_dir / 'asymmetric/' ) search_config.image = ImageSearchConfig( encoder = "sentence-transformers/clip-ViT-B-32", - model_directory = model_dir + model_directory = model_dir / 'image/' ) return search_config @pytest.fixture(scope='session') -def model_dir(search_config: SearchConfig): - model_dir = search_config.asymmetric.model_directory +def content_config(tmp_path_factory, search_config: SearchConfig): + content_dir = tmp_path_factory.mktemp('content') # Generate Image Embeddings from Test Images content_config = ContentConfig() content_config.image = ImageContentConfig( input_directories = ['tests/data/images'], - embeddings_file = model_dir.joinpath('image_embeddings.pt'), - batch_size = 10, + embeddings_file = content_dir.joinpath('image_embeddings.pt'), + batch_size = 1, use_xmp_metadata = False) image_search.setup(content_config.image, search_config.image, regenerate=False) @@ -55,28 +56,10 @@ def model_dir(search_config: SearchConfig): content_config.org = TextContentConfig( input_files = None, input_filter = 'tests/data/org/*.org', - compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'), - embeddings_file = model_dir.joinpath('note_embeddings.pt')) + compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'), + embeddings_file = content_dir.joinpath('note_embeddings.pt')) filters = [DateFilter(), WordFilter(), FileFilter()] text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) - return model_dir - - -@pytest.fixture(scope='session') -def content_config(model_dir) -> ContentConfig: - content_config = ContentConfig() - content_config.org = TextContentConfig( - input_files = None, - input_filter = 'tests/data/org/*.org', - compressed_jsonl = 
model_dir.joinpath('notes.jsonl.gz'), - embeddings_file = model_dir.joinpath('note_embeddings.pt')) - - content_config.image = ImageContentConfig( - input_directories = ['tests/data/images'], - embeddings_file = model_dir.joinpath('image_embeddings.pt'), - batch_size = 1, - use_xmp_metadata = False) - - return content_config \ No newline at end of file + return content_config diff --git a/tests/test_cli.py b/tests/test_cli.py index 4cbf1209..3c99f424 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,9 +2,6 @@ from pathlib import Path from random import random -# External Modules -import pytest - # Internal Packages from src.utils.cli import cli from src.utils.helpers import resolve_absolute_path diff --git a/tests/test_image_search.py b/tests/test_image_search.py index 80c4fdf6..ad374da1 100644 --- a/tests/test_image_search.py +++ b/tests/test_image_search.py @@ -48,8 +48,13 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig image_files_url='/static/images', count=1) - actual_image = Image.open(output_directory.joinpath(Path(results[0]["entry"]).name)) + actual_image_path = output_directory.joinpath(Path(results[0]["entry"]).name) + actual_image = Image.open(actual_image_path) expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name)) # Assert assert expected_image == actual_image + + # Cleanup + # Delete the image files copied to results directory + actual_image_path.unlink() diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 594c954f..eaac5ef8 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -1,6 +1,5 @@ # Standard Packages import json -from posixpath import split # Internal Packages from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries @@ -15,7 +14,7 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path): :PROPERTIES: :ID: 42-42-42 :END: - \t\r\n + 
\t\r ''' orgfile = create_file(tmp_path, entry) @@ -38,7 +37,29 @@ def test_entry_with_body_to_jsonl(tmp_path): :PROPERTIES: :ID: 42-42-42 :END: - \t\r\nBody Line 1\n + \t\r + Body Line 1 ''' + orgfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Org files + entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_file_with_no_headings_to_jsonl(tmp_path): + "Ensure files with no heading, only body text are loaded." + # Arrange + entry = f''' + - Bullet point 1 + - Bullet point 2 ''' orgfile = create_file(tmp_path, entry) diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py index a81f1cc3..c1e0aaa9 100644 --- a/tests/test_orgnode.py +++ b/tests/test_orgnode.py @@ -8,6 +8,28 @@ from src.processor.org_mode import orgnode # Test +# ---------------------------------------------------------------------------------------------------- +def test_parse_entry_with_no_headings(tmp_path): + "Test parsing of file with only body text and no headings" + # Arrange + entry = f'''Body Line 1''' + orgfile = create_file(tmp_path, entry) + + # Act + entries = orgnode.makelist(orgfile) + + # Assert + assert len(entries) == 1 + assert entries[0].Heading() == f'{orgfile}' + assert entries[0].Tags() == list() + assert entries[0].Body() == "Body Line 1" + assert entries[0].Priority() == "" + assert entries[0].Property("ID") == "" + assert entries[0].Closed() == "" + assert entries[0].Scheduled() == "" + assert entries[0].Deadline() == "" + + # ---------------------------------------------------------------------------------------------------- def test_parse_minimal_entry(tmp_path): "Test parsing of entry with minimal fields" @@ -81,18 +103,17 @@ Body Line 1 Body Line 
2 ''' orgfile = create_file(tmp_path, entry) - normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}' # Act entries = orgnode.makelist(orgfile) # Assert # SOURCE link rendered with Heading - assert f':SOURCE: [[file:{normalized_orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}' + assert f':SOURCE: [[file:{orgfile}::*{entries[0].Heading()}]]' in f'{entries[0]}' # ID link rendered with ID assert f':ID: id:123-456-789-4234-1231' in f'{entries[0]}' # LINE link rendered with line number - assert f':LINE: file:{normalized_orgfile}::2' in f'{entries[0]}' + assert f':LINE: file:{orgfile}::2' in f'{entries[0]}' # ---------------------------------------------------------------------------------------------------- @@ -115,8 +136,7 @@ Body Line 1''' # parsed heading from entry assert entries[0].Heading() == "Heading[1]" # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries - normalized_orgfile = f'~/{relpath(orgfile, start=Path.home())}' - escaped_orgfile = f'{normalized_orgfile}'.replace("[1]", "\\[1\\]") + escaped_orgfile = f'{orgfile}'.replace("[1]", "\\[1\\]") assert f':SOURCE: [[file:{escaped_orgfile}::*Heading\[1\]' in f'{entries[0]}' @@ -168,10 +188,80 @@ Body 2 assert entry.Logbook() == [(datetime.datetime(1984,4,index+1,9,0,0), datetime.datetime(1984,4,index+1,12,0,0))] +# ---------------------------------------------------------------------------------------------------- +def test_parse_entry_with_empty_title(tmp_path): + "Test parsing of file with empty TITLE and no headings falls back to filename as heading" + # Arrange + entry = f'''#+TITLE: +Body Line 1''' + orgfile = create_file(tmp_path, entry) + + # Act + entries = orgnode.makelist(orgfile) + + # Assert + assert len(entries) == 1 + assert entries[0].Heading() == f'{orgfile}' + assert entries[0].Tags() == list() + assert entries[0].Body() == "Body Line 1" + assert entries[0].Priority() == "" + assert entries[0].Property("ID") == "" + assert entries[0].Closed() == "" + assert entries[0].Scheduled() == "" + 
assert entries[0].Deadline() == "" + + +# ---------------------------------------------------------------------------------------------------- +def test_parse_entry_with_title_and_no_headings(tmp_path): + "Test parsing of file with TITLE and no headings uses title as heading" + # Arrange + entry = f'''#+TITLE: test +Body Line 1''' + orgfile = create_file(tmp_path, entry) + + # Act + entries = orgnode.makelist(orgfile) + + # Assert + assert len(entries) == 1 + assert entries[0].Heading() == 'test' + assert entries[0].Tags() == list() + assert entries[0].Body() == "Body Line 1" + assert entries[0].Priority() == "" + assert entries[0].Property("ID") == "" + assert entries[0].Closed() == "" + assert entries[0].Scheduled() == "" + assert entries[0].Deadline() == "" + + +# ---------------------------------------------------------------------------------------------------- +def test_parse_entry_with_multiple_titles_and_no_headings(tmp_path): + "Test parsing of file with multiple TITLE lines and no headings joins titles as heading" + # Arrange + entry = f'''#+TITLE: title1 +Body Line 1 +#+TITLE: title2 ''' + orgfile = create_file(tmp_path, entry) + + # Act + entries = orgnode.makelist(orgfile) + + # Assert + assert len(entries) == 1 + assert entries[0].Heading() == 'title1 title2' + assert entries[0].Tags() == list() + assert entries[0].Body() == "Body Line 1\n" + assert entries[0].Priority() == "" + assert entries[0].Property("ID") == "" + assert entries[0].Closed() == "" + assert entries[0].Scheduled() == "" + assert entries[0].Deadline() == "" + + # Helper Functions def create_file(tmp_path, entry, filename="test.org"): org_file = tmp_path / f"notes/{filename}" org_file.parent.mkdir() org_file.touch() org_file.write_text(entry) - return org_file \ No newline at end of file + return org_file diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 5c2bd1c9..dce1070a 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -1,6 +1,10 @@ # System Packages +from copy import deepcopy from pathlib import Path +# 
External Packages +import pytest + # Internal Packages from src.utils.state import model from src.search_type import text_search @@ -9,6 +13,25 @@ from src.processor.org_mode.org_to_jsonl import org_to_jsonl # Test +# ---------------------------------------------------------------------------------------------------- +def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentConfig, search_config: SearchConfig): + # Arrange + file_to_index = Path(content_config.org.input_filter).parent / "new_file_to_index.org" + file_to_index.touch() + new_org_content_config = deepcopy(content_config.org) + new_org_content_config.input_files = [f'{file_to_index}'] + new_org_content_config.input_filter = None + + # Act + # Generate notes embeddings during asymmetric setup + with pytest.raises(ValueError, match=r'^No valid entries found*'): + text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True) + + # Cleanup + # delete created test file + file_to_index.unlink() + + # ---------------------------------------------------------------------------------------------------- def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig): # Act @@ -23,7 +46,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo # ---------------------------------------------------------------------------------------------------- def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig): # Arrange - model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) query = "How to git install application?" # Act