From f93032435042d4fd3129cca6ae1f37da83a422be Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 4 Sep 2022 17:18:47 +0300 Subject: [PATCH] Rename explicit filter to word filter to be more specific --- Readme.md | 1 - src/router.py | 2 -- .../{explicit_filter.py => word_filter.py} | 15 ++++---- src/search_type/text_search.py | 4 +-- tests/test_client.py | 4 +-- ...explicit_filter.py => test_word_filter.py} | 34 +++++++++---------- 6 files changed, 28 insertions(+), 32 deletions(-) rename src/search_filter/{explicit_filter.py => word_filter.py} (88%) rename tests/{test_explicit_filter.py => test_word_filter.py} (56%) diff --git a/Readme.md b/Readme.md index 76182560..628ce458 100644 --- a/Readme.md +++ b/Readme.md @@ -125,7 +125,6 @@ pip install --upgrade khoj-assistant - Semantic search using the bi-encoder is fairly fast at \<50 ms - Reranking using the cross-encoder is slower at \<2s on 15 results. Tweak `top_k` to tradeoff speed for accuracy of results -- Applying explicit filters is very slow currently at \~6s. This is because the filters are rudimentary. 
Considerable speed-ups can be achieved using indexes etc ### Indexing performance diff --git a/src/router.py b/src/router.py index 127623c6..a4bd2f84 100644 --- a/src/router.py +++ b/src/router.py @@ -16,8 +16,6 @@ from fastapi.templating import Jinja2Templates from src.configure import configure_search from src.search_type import image_search, text_search from src.processor.conversation.gpt import converse, extract_search_type, message_to_log, message_to_prompt, understand, summarize -from src.search_filter.explicit_filter import ExplicitFilter -from src.search_filter.date_filter import DateFilter from src.utils.rawconfig import FullConfig from src.utils.config import SearchType from src.utils.helpers import get_absolute_path, get_from_dict diff --git a/src/search_filter/explicit_filter.py b/src/search_filter/word_filter.py similarity index 88% rename from src/search_filter/explicit_filter.py rename to src/search_filter/word_filter.py index 7a26f830..f47ae6b7 100644 --- a/src/search_filter/explicit_filter.py +++ b/src/search_filter/word_filter.py @@ -15,13 +15,13 @@ from src.utils.config import SearchType logger = logging.getLogger(__name__) -class ExplicitFilter: +class WordFilter: # Filter Regex required_regex = r'\+"(\w+)" ?' blocked_regex = r'\-"(\w+)" ?' 
def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'): - self.filter_file = resolve_absolute_path(filter_directory / f"{search_type.name.lower()}_explicit_filter_entry_word_sets.pkl") + self.filter_file = resolve_absolute_path(filter_directory / f"word_filter_{search_type.name.lower()}_index.pkl") self.entry_key = entry_key self.search_type = search_type self.word_to_entry_index = dict() @@ -34,7 +34,7 @@ class ExplicitFilter: with self.filter_file.open('rb') as f: self.word_to_entry_index = pickle.load(f) end = time.time() - logger.debug(f"Load {self.search_type} entries by word set from file: {end - start} seconds") + logger.debug(f"Load word filter index for {self.search_type} from {self.filter_file}: {end - start} seconds") else: start = time.time() self.cache = {} # Clear cache on (re-)generating entries_by_word_set @@ -51,14 +51,13 @@ class ExplicitFilter: with self.filter_file.open('wb') as f: pickle.dump(self.word_to_entry_index, f) end = time.time() - logger.debug(f"Convert all {self.search_type} entries to word sets: {end - start} seconds") + logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds") return self.word_to_entry_index def can_filter(self, raw_query): - "Check if query contains explicit filters" - # Extract explicit query portion with required, blocked words to filter from natural query + "Check if query contains word filters" required_words = re.findall(self.required_regex, raw_query) blocked_words = re.findall(self.blocked_regex, raw_query) @@ -67,7 +66,7 @@ class ExplicitFilter: def apply(self, raw_query, raw_entries, raw_embeddings): "Find entries containing required and not blocked words specified in query" - # Separate natural query from explicit required, blocked words filters + # Separate natural query from required, blocked words filters start = time.time() required_words = set([word.lower() for word in re.findall(self.required_regex, raw_query)]) @@ -83,7 +82,7 @@ 
class ExplicitFilter: # Return item from cache if exists cache_key = tuple(sorted(required_words)), tuple(sorted(blocked_words)) if cache_key in self.cache: - logger.info(f"Explicit filter results from cache") + logger.info(f"Return word filter results from cache") entries, embeddings = self.cache[cache_key] return query, entries, embeddings diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 742ff5ed..a674d712 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -8,7 +8,7 @@ import time import torch from sentence_transformers import SentenceTransformer, CrossEncoder, util from src.search_filter.date_filter import DateFilter -from src.search_filter.explicit_filter import ExplicitFilter +from src.search_filter.word_filter import WordFilter # Internal Packages from src.utils import state @@ -171,7 +171,7 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate) filter_directory = resolve_absolute_path(config.compressed_jsonl.parent) - filters = [DateFilter(), ExplicitFilter(filter_directory, search_type=search_type)] + filters = [DateFilter(), WordFilter(filter_directory, search_type=search_type)] for filter in filters: filter.load(entries, regenerate=regenerate) diff --git a/tests/test_client.py b/tests/test_client.py index e9b632be..e7ddac33 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -140,7 +140,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_ # Assert assert response.status_code == 200 - # assert actual_data contains explicitly included word "Emacs" + # assert actual_data contains word "Emacs" search_result = response.json()[0]["entry"] assert "Emacs" in search_result @@ -156,6 +156,6 @@ def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_ # Assert assert response.status_code == 200 - # 
assert actual_data does not contains explicitly excluded word "Emacs" + # assert actual_data does not contain word "Emacs" search_result = response.json()[0]["entry"] assert "clone" not in search_result diff --git a/tests/test_explicit_filter.py b/tests/test_word_filter.py similarity index 56% rename from tests/test_explicit_filter.py rename to tests/test_word_filter.py index 5f34b0ac..3d584077 100644 --- a/tests/test_explicit_filter.py +++ b/tests/test_word_filter.py @@ -2,19 +2,19 @@ import torch # Application Packages -from src.search_filter.explicit_filter import ExplicitFilter +from src.search_filter.word_filter import WordFilter from src.utils.config import SearchType -def test_no_explicit_filter(tmp_path): +def test_no_word_filter(tmp_path): # Arrange - explicit_filter = ExplicitFilter(tmp_path, SearchType.Org) + word_filter = WordFilter(tmp_path, SearchType.Org) embeddings, entries = arrange_content() q_with_no_filter = 'head tail' # Act - can_filter = explicit_filter.can_filter(q_with_no_filter) - ret_query, ret_entries, ret_emb = explicit_filter.apply(q_with_no_filter, entries.copy(), embeddings) + can_filter = word_filter.can_filter(q_with_no_filter) + ret_query, ret_entries, ret_emb = word_filter.apply(q_with_no_filter, entries.copy(), embeddings) # Assert assert can_filter == False @@ -23,15 +23,15 @@ def test_no_explicit_filter(tmp_path): assert ret_entries == entries -def test_explicit_exclude_filter(tmp_path): +def test_word_exclude_filter(tmp_path): # Arrange - explicit_filter = ExplicitFilter(tmp_path, SearchType.Org) + word_filter = WordFilter(tmp_path, SearchType.Org) embeddings, entries = arrange_content() q_with_exclude_filter = 'head -"exclude_word" tail' # Act - can_filter = explicit_filter.can_filter(q_with_exclude_filter) - ret_query, ret_entries, ret_emb = explicit_filter.apply(q_with_exclude_filter, entries.copy(), embeddings) + can_filter = word_filter.can_filter(q_with_exclude_filter) + ret_query, ret_entries, ret_emb = 
word_filter.apply(q_with_exclude_filter, entries.copy(), embeddings) # Assert assert can_filter == True @@ -40,15 +40,15 @@ def test_explicit_exclude_filter(tmp_path): assert ret_entries == [entries[0], entries[2]] -def test_explicit_include_filter(tmp_path): +def test_word_include_filter(tmp_path): # Arrange - explicit_filter = ExplicitFilter(tmp_path, SearchType.Org) + word_filter = WordFilter(tmp_path, SearchType.Org) embeddings, entries = arrange_content() query_with_include_filter = 'head +"include_word" tail' # Act - can_filter = explicit_filter.can_filter(query_with_include_filter) - ret_query, ret_entries, ret_emb = explicit_filter.apply(query_with_include_filter, entries.copy(), embeddings) + can_filter = word_filter.can_filter(query_with_include_filter) + ret_query, ret_entries, ret_emb = word_filter.apply(query_with_include_filter, entries.copy(), embeddings) # Assert assert can_filter == True @@ -57,15 +57,15 @@ def test_explicit_include_filter(tmp_path): assert ret_entries == [entries[2], entries[3]] -def test_explicit_include_and_exclude_filter(tmp_path): +def test_word_include_and_exclude_filter(tmp_path): # Arrange - explicit_filter = ExplicitFilter(tmp_path, SearchType.Org) + word_filter = WordFilter(tmp_path, SearchType.Org) embeddings, entries = arrange_content() query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail' # Act - can_filter = explicit_filter.can_filter(query_with_include_and_exclude_filter) - ret_query, ret_entries, ret_emb = explicit_filter.apply(query_with_include_and_exclude_filter, entries.copy(), embeddings) + can_filter = word_filter.can_filter(query_with_include_and_exclude_filter) + ret_query, ret_entries, ret_emb = word_filter.apply(query_with_include_and_exclude_filter, entries.copy(), embeddings) # Assert assert can_filter == True