mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Do not store the word filter index to file. Not necessary for now
- Keeping a stored word filter index from going stale on entry updates is more of a hassle than it is worth
- Generating the index on 120K lines of notes takes 1s. Loading it from file takes 0.2s. For less content, the load time difference will be even smaller
- Let go of the startup time improvement for simplicity for now
This commit is contained in:
parent
91d11ccb49
commit
c17a0fd05b
5 changed files with 33 additions and 62 deletions
|
@ -48,11 +48,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||||
config.content_type.org,
|
config.content_type.org,
|
||||||
search_config=config.search_type.asymmetric,
|
search_config=config.search_type.asymmetric,
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
filters=[
|
filters=[DateFilter(), WordFilter(), FileFilter()])
|
||||||
DateFilter(),
|
|
||||||
WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org),
|
|
||||||
FileFilter(),
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize Org Music Search
|
# Initialize Org Music Search
|
||||||
if (t == SearchType.Music or t == None) and config.content_type.music:
|
if (t == SearchType.Music or t == None) and config.content_type.music:
|
||||||
|
@ -71,11 +67,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||||
config.content_type.markdown,
|
config.content_type.markdown,
|
||||||
search_config=config.search_type.asymmetric,
|
search_config=config.search_type.asymmetric,
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
filters=[
|
filters=[DateFilter(), WordFilter(), FileFilter()])
|
||||||
DateFilter(),
|
|
||||||
WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown),
|
|
||||||
FileFilter(),
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize Ledger Search
|
# Initialize Ledger Search
|
||||||
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
|
||||||
|
@ -85,11 +77,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
||||||
config.content_type.ledger,
|
config.content_type.ledger,
|
||||||
search_config=config.search_type.symmetric,
|
search_config=config.search_type.symmetric,
|
||||||
regenerate=regenerate,
|
regenerate=regenerate,
|
||||||
filters=[
|
filters=[DateFilter(), WordFilter(), FileFilter()])
|
||||||
DateFilter(),
|
|
||||||
WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger),
|
|
||||||
FileFilter(),
|
|
||||||
])
|
|
||||||
|
|
||||||
# Initialize Image Search
|
# Initialize Image Search
|
||||||
if (t == SearchType.Image or t == None) and config.content_type.image:
|
if (t == SearchType.Image or t == None) and config.content_type.image:
|
||||||
|
|
|
@ -3,6 +3,7 @@ import re
|
||||||
import time
|
import time
|
||||||
import pickle
|
import pickle
|
||||||
import logging
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.search_filter.base_filter import BaseFilter
|
from src.search_filter.base_filter import BaseFilter
|
||||||
|
@ -18,38 +19,24 @@ class WordFilter(BaseFilter):
|
||||||
required_regex = r'\+"(\w+)" ?'
|
required_regex = r'\+"(\w+)" ?'
|
||||||
blocked_regex = r'\-"(\w+)" ?'
|
blocked_regex = r'\-"(\w+)" ?'
|
||||||
|
|
||||||
def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'):
|
def __init__(self, entry_key='raw'):
|
||||||
self.filter_file = resolve_absolute_path(filter_directory / f"word_filter_{search_type.name.lower()}_index.pkl")
|
|
||||||
self.entry_key = entry_key
|
self.entry_key = entry_key
|
||||||
self.search_type = search_type
|
self.word_to_entry_index = defaultdict(set)
|
||||||
self.word_to_entry_index = dict()
|
|
||||||
self.cache = LRU()
|
self.cache = LRU()
|
||||||
|
|
||||||
|
|
||||||
def load(self, entries, regenerate=False):
|
def load(self, entries, regenerate=False):
|
||||||
if self.filter_file.exists() and not regenerate:
|
start = time.time()
|
||||||
start = time.time()
|
self.cache = {} # Clear cache on reload of filter
|
||||||
with self.filter_file.open('rb') as f:
|
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
|
||||||
self.word_to_entry_index = pickle.load(f)
|
# Create map of words to entries they exist in
|
||||||
end = time.time()
|
for entry_index, entry in enumerate(entries):
|
||||||
logger.debug(f"Load word filter index for {self.search_type} from {self.filter_file}: {end - start} seconds")
|
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
||||||
else:
|
if word == '':
|
||||||
start = time.time()
|
continue
|
||||||
self.cache = {} # Clear cache on (re-)generating entries_by_word_set
|
self.word_to_entry_index[word].add(entry_index)
|
||||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
end = time.time()
|
||||||
# Create map of words to entries they exist in
|
logger.debug(f"Created word filter index: {end - start} seconds")
|
||||||
for entry_index, entry in enumerate(entries):
|
|
||||||
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
|
||||||
if word == '':
|
|
||||||
continue
|
|
||||||
if word not in self.word_to_entry_index:
|
|
||||||
self.word_to_entry_index[word] = set()
|
|
||||||
self.word_to_entry_index[word].add(entry_index)
|
|
||||||
|
|
||||||
with self.filter_file.open('wb') as f:
|
|
||||||
pickle.dump(self.word_to_entry_index, f)
|
|
||||||
end = time.time()
|
|
||||||
logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds")
|
|
||||||
|
|
||||||
return self.word_to_entry_index
|
return self.word_to_entry_index
|
||||||
|
|
||||||
|
|
|
@ -58,7 +58,7 @@ def model_dir(search_config: SearchConfig):
|
||||||
compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
|
compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'),
|
||||||
embeddings_file = model_dir.joinpath('note_embeddings.pt'))
|
embeddings_file = model_dir.joinpath('note_embeddings.pt'))
|
||||||
|
|
||||||
filters = [DateFilter(), WordFilter(model_dir, search_type=SearchType.Org), FileFilter()]
|
filters = [DateFilter(), WordFilter(), FileFilter()]
|
||||||
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||||
|
|
||||||
return model_dir
|
return model_dir
|
||||||
|
|
|
@ -132,7 +132,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
|
def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||||
# Arrange
|
# Arrange
|
||||||
filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)]
|
filters = [WordFilter()]
|
||||||
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||||
user_query = 'How to git install application? +"Emacs"'
|
user_query = 'How to git install application? +"Emacs"'
|
||||||
|
|
||||||
|
@ -149,7 +149,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
|
||||||
# ----------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------
|
||||||
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
|
def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
|
||||||
# Arrange
|
# Arrange
|
||||||
filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)]
|
filters = [WordFilter()]
|
||||||
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
|
||||||
user_query = 'How to git install application? -"clone"'
|
user_query = 'How to git install application? -"clone"'
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,12 @@
|
||||||
# External Packages
|
|
||||||
import torch
|
|
||||||
|
|
||||||
# Application Packages
|
# Application Packages
|
||||||
from src.search_filter.word_filter import WordFilter
|
from src.search_filter.word_filter import WordFilter
|
||||||
from src.utils.config import SearchType
|
from src.utils.config import SearchType
|
||||||
|
|
||||||
|
|
||||||
def test_no_word_filter(tmp_path):
|
def test_no_word_filter():
|
||||||
# Arrange
|
# Arrange
|
||||||
word_filter = WordFilter(tmp_path, SearchType.Org)
|
word_filter = WordFilter()
|
||||||
embeddings, entries = arrange_content()
|
entries = arrange_content()
|
||||||
q_with_no_filter = 'head tail'
|
q_with_no_filter = 'head tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
@ -22,10 +19,10 @@ def test_no_word_filter(tmp_path):
|
||||||
assert entry_indices == {0, 1, 2, 3}
|
assert entry_indices == {0, 1, 2, 3}
|
||||||
|
|
||||||
|
|
||||||
def test_word_exclude_filter(tmp_path):
|
def test_word_exclude_filter():
|
||||||
# Arrange
|
# Arrange
|
||||||
word_filter = WordFilter(tmp_path, SearchType.Org)
|
word_filter = WordFilter()
|
||||||
embeddings, entries = arrange_content()
|
entries = arrange_content()
|
||||||
q_with_exclude_filter = 'head -"exclude_word" tail'
|
q_with_exclude_filter = 'head -"exclude_word" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
@ -38,10 +35,10 @@ def test_word_exclude_filter(tmp_path):
|
||||||
assert entry_indices == {0, 2}
|
assert entry_indices == {0, 2}
|
||||||
|
|
||||||
|
|
||||||
def test_word_include_filter(tmp_path):
|
def test_word_include_filter():
|
||||||
# Arrange
|
# Arrange
|
||||||
word_filter = WordFilter(tmp_path, SearchType.Org)
|
word_filter = WordFilter()
|
||||||
embeddings, entries = arrange_content()
|
entries = arrange_content()
|
||||||
query_with_include_filter = 'head +"include_word" tail'
|
query_with_include_filter = 'head +"include_word" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
@ -54,10 +51,10 @@ def test_word_include_filter(tmp_path):
|
||||||
assert entry_indices == {2, 3}
|
assert entry_indices == {2, 3}
|
||||||
|
|
||||||
|
|
||||||
def test_word_include_and_exclude_filter(tmp_path):
|
def test_word_include_and_exclude_filter():
|
||||||
# Arrange
|
# Arrange
|
||||||
word_filter = WordFilter(tmp_path, SearchType.Org)
|
word_filter = WordFilter()
|
||||||
embeddings, entries = arrange_content()
|
entries = arrange_content()
|
||||||
query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail'
|
query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
|
@ -71,11 +68,10 @@ def test_word_include_and_exclude_filter(tmp_path):
|
||||||
|
|
||||||
|
|
||||||
def arrange_content():
|
def arrange_content():
|
||||||
embeddings = torch.randn(4, 10)
|
|
||||||
entries = [
|
entries = [
|
||||||
{'compiled': '', 'raw': 'Minimal Entry'},
|
{'compiled': '', 'raw': 'Minimal Entry'},
|
||||||
{'compiled': '', 'raw': 'Entry with exclude_word'},
|
{'compiled': '', 'raw': 'Entry with exclude_word'},
|
||||||
{'compiled': '', 'raw': 'Entry with include_word'},
|
{'compiled': '', 'raw': 'Entry with include_word'},
|
||||||
{'compiled': '', 'raw': 'Entry with include_word and exclude_word'}]
|
{'compiled': '', 'raw': 'Entry with include_word and exclude_word'}]
|
||||||
|
|
||||||
return embeddings, entries
|
return entries
|
||||||
|
|
Loading…
Reference in a new issue