mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Init word filter dictionary with default value as set to simplify code
This commit is contained in:
parent
4d776d9c7a
commit
e00bb53336
1 changed file with 3 additions and 3 deletions
|
@@ -3,6 +3,7 @@ import re
|
|||
import time
|
||||
import pickle
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
|
||||
# Internal Packages
|
||||
from src.search_filter.base_filter import BaseFilter
|
||||
|
@@ -37,19 +38,18 @@ class WordFilter(BaseFilter):
|
|||
start = time.time()
|
||||
self.cache = {} # Clear cache on (re-)generating entries_by_word_set
|
||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
||||
self.word_to_entry_index = defaultdict(set)
|
||||
# Create map of words to entries they exist in
|
||||
for entry_index, entry in enumerate(entries):
|
||||
for word in re.split(entry_splitter, entry[self.entry_key].lower()):
|
||||
if word == '':
|
||||
continue
|
||||
if word not in self.word_to_entry_index:
|
||||
self.word_to_entry_index[word] = set()
|
||||
self.word_to_entry_index[word].add(entry_index)
|
||||
|
||||
with self.filter_file.open('wb') as f:
|
||||
pickle.dump(self.word_to_entry_index, f)
|
||||
end = time.time()
|
||||
logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds")
|
||||
logger.debug(f"Indexed {len(self.word_to_entry_index)} words of {self.search_type} type for word filter to {self.filter_file}: {end - start} seconds")
|
||||
|
||||
return self.word_to_entry_index
|
||||
|
||||
|
|
Loading…
Reference in a new issue