mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Initialize the word filter dictionary as a defaultdict(set) to simplify the code
This commit is contained in:
parent
4d776d9c7a
commit
e00bb53336
1 changed file with 3 additions and 3 deletions
|
@ -3,6 +3,7 @@ import re
|
||||||
import time
|
import time
|
||||||
import pickle
|
import pickle
|
||||||
import logging
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.search_filter.base_filter import BaseFilter
|
from src.search_filter.base_filter import BaseFilter
|
||||||
|
@ -37,19 +38,18 @@ class WordFilter(BaseFilter):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
self.cache = {} # Clear cache on (re-)generating entries_by_word_set
|
self.cache = {} # Clear cache on (re-)generating entries_by_word_set
|
||||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
||||||
|
self.word_to_entry_index = defaultdict(set)
|
||||||
# Create map of words to entries they exist in
|
# Create map of words to entries they exist in
|
||||||
for entry_index, entry in enumerate(entries):
|
for entry_index, entry in enumerate(entries):
|
||||||
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
||||||
if word == '':
|
if word == '':
|
||||||
continue
|
continue
|
||||||
if word not in self.word_to_entry_index:
|
|
||||||
self.word_to_entry_index[word] = set()
|
|
||||||
self.word_to_entry_index[word].add(entry_index)
|
self.word_to_entry_index[word].add(entry_index)
|
||||||
|
|
||||||
with self.filter_file.open('wb') as f:
|
with self.filter_file.open('wb') as f:
|
||||||
pickle.dump(self.word_to_entry_index, f)
|
pickle.dump(self.word_to_entry_index, f)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds")
|
logger.debug(f"Indexed {len(self.word_to_entry_index)} words of {self.search_type} type for word filter to {self.filter_file}: {end - start} seconds")
|
||||||
|
|
||||||
return self.word_to_entry_index
|
return self.word_to_entry_index
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue