From c17a0fd05bd893d379d3489e5cfdd032226ce9ae Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 02:43:58 +0300 Subject: [PATCH] Do not store word filters index to file. Not necessary for now - It's more of a hassle to not let word filter go stale on entry updates - Generating index on 120K lines of notes takes 1s. Loading from file takes 0.2s. For less content load time difference will be even smaller - Let go of startup time improvement for simplicity for now --- src/configure.py | 18 +++----------- src/search_filter/word_filter.py | 41 +++++++++++--------------------- tests/conftest.py | 2 +- tests/test_client.py | 4 ++-- tests/test_word_filter.py | 30 ++++++++++------------- 5 files changed, 33 insertions(+), 62 deletions(-) diff --git a/src/configure.py b/src/configure.py index f6476951..ed46af37 100644 --- a/src/configure.py +++ b/src/configure.py @@ -48,11 +48,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, - filters=[ - DateFilter(), - WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org), - FileFilter(), - ]) + filters=[DateFilter(), WordFilter(), FileFilter()]) # Initialize Org Music Search if (t == SearchType.Music or t == None) and config.content_type.music: @@ -71,11 +67,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, - filters=[ - DateFilter(), - WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown), - FileFilter(), - ]) + filters=[DateFilter(), WordFilter(), FileFilter()]) # Initialize Ledger Search if (t == SearchType.Ledger or t == None) and config.content_type.ledger: @@ -85,11 +77,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate, - filters=[ - DateFilter(), - WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger), - FileFilter(), - ]) + filters=[DateFilter(), WordFilter(), FileFilter()]) # Initialize Image Search if (t == SearchType.Image or t == None) and config.content_type.image: diff --git a/src/search_filter/word_filter.py b/src/search_filter/word_filter.py index c7c5d059..6fe0b31e 100644 --- a/src/search_filter/word_filter.py +++ b/src/search_filter/word_filter.py @@ -3,6 +3,7 @@ import re import time import pickle import logging +from collections import defaultdict # Internal Packages from src.search_filter.base_filter import BaseFilter @@ -18,38 +19,24 @@ class WordFilter(BaseFilter): required_regex = r'\+"(\w+)" ?' blocked_regex = r'\-"(\w+)" ?' - def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'): - self.filter_file = resolve_absolute_path(filter_directory / f"word_filter_{search_type.name.lower()}_index.pkl") + def __init__(self, entry_key='raw'): self.entry_key = entry_key - self.search_type = search_type - self.word_to_entry_index = dict() + self.word_to_entry_index = defaultdict(set) self.cache = LRU() def load(self, entries, regenerate=False): - if self.filter_file.exists() and not regenerate: - start = time.time() - with self.filter_file.open('rb') as f: - self.word_to_entry_index = pickle.load(f) - end = time.time() - logger.debug(f"Load word filter index for {self.search_type} from {self.filter_file}: {end - start} seconds") - else: - start = time.time() - self.cache = {} # Clear cache on (re-)generating entries_by_word_set - entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' - # Create map of words to entries they exist in - for entry_index, entry in enumerate(entries): - for word in re.split(entry_splitter, entry[self.entry_key].lower()): - if word == '': - continue - if word not in self.word_to_entry_index: - self.word_to_entry_index[word] = set() - self.word_to_entry_index[word].add(entry_index) - - with self.filter_file.open('wb') as f: - pickle.dump(self.word_to_entry_index, f) - end = time.time() - logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds") + start = time.time() + self.cache = {} # Clear cache on reload of filter + entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\'' + # Create map of words to entries they exist in + for entry_index, entry in enumerate(entries): + for word in re.split(entry_splitter, entry[self.entry_key].lower()): + if word == '': + continue + self.word_to_entry_index[word].add(entry_index) + end = time.time() + logger.debug(f"Created word filter index: {end - start} seconds") return self.word_to_entry_index diff --git a/tests/conftest.py b/tests/conftest.py index 7545527f..ab2703da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,7 +58,7 @@ def model_dir(search_config: SearchConfig): compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'), embeddings_file = model_dir.joinpath('note_embeddings.pt')) - filters = [DateFilter(), WordFilter(model_dir, search_type=SearchType.Org), FileFilter()] + filters = [DateFilter(), WordFilter(), FileFilter()] text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) return model_dir diff --git a/tests/test_client.py b/tests/test_client.py index 578c789c..b167bce0 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -132,7 +132,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig # ---------------------------------------------------------------------------------------------------- def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange - filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)] + filters = [WordFilter()] model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = 'How to git install application? +"Emacs"' @@ -149,7 +149,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_ # ---------------------------------------------------------------------------------------------------- def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange - filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)] + filters = [WordFilter()] model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = 'How to git install application? -"clone"' diff --git a/tests/test_word_filter.py b/tests/test_word_filter.py index 3efe8ed9..db23c2c6 100644 --- a/tests/test_word_filter.py +++ b/tests/test_word_filter.py @@ -1,15 +1,12 @@ -# External Packages -import torch - # Application Packages from src.search_filter.word_filter import WordFilter from src.utils.config import SearchType -def test_no_word_filter(tmp_path): +def test_no_word_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() q_with_no_filter = 'head tail' # Act @@ -22,10 +19,10 @@ def test_no_word_filter(tmp_path): assert entry_indices == {0, 1, 2, 3} -def test_word_exclude_filter(tmp_path): +def test_word_exclude_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() q_with_exclude_filter = 'head -"exclude_word" tail' # Act @@ -38,10 +35,10 @@ def test_word_exclude_filter(tmp_path): assert entry_indices == {0, 2} -def test_word_include_filter(tmp_path): +def test_word_include_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() query_with_include_filter = 'head +"include_word" tail' # Act @@ -54,10 +51,10 @@ def test_word_include_filter(tmp_path): assert entry_indices == {2, 3} -def test_word_include_and_exclude_filter(tmp_path): +def test_word_include_and_exclude_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail' # Act @@ -71,11 +68,10 @@ def test_word_include_and_exclude_filter(tmp_path): def arrange_content(): - embeddings = torch.randn(4, 10) entries = [ {'compiled': '', 'raw': 'Minimal Entry'}, {'compiled': '', 'raw': 'Entry with exclude_word'}, {'compiled': '', 'raw': 'Entry with include_word'}, {'compiled': '', 'raw': 'Entry with include_word and exclude_word'}] - return embeddings, entries + return entries