mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-12-18 02:27:10 +00:00
Use Python Logging Framework to Time Performance of Explicit Filter
This commit is contained in:
parent
30c3eb372a
commit
ffb8e3988e
1 changed file with 9 additions and 5 deletions
|
@@ -2,6 +2,7 @@
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import pickle
|
import pickle
|
||||||
|
import logging
|
||||||
|
|
||||||
# External Packages
|
# External Packages
|
||||||
import torch
|
import torch
|
||||||
|
@@ -11,6 +12,9 @@ from src.utils.helpers import resolve_absolute_path
|
||||||
from src.utils.config import SearchType
|
from src.utils.config import SearchType
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ExplicitFilter:
|
class ExplicitFilter:
|
||||||
def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'):
|
def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'):
|
||||||
self.filter_file = resolve_absolute_path(filter_directory / f"{search_type.name.lower()}_explicit_filter_entry_word_sets.pkl")
|
self.filter_file = resolve_absolute_path(filter_directory / f"{search_type.name.lower()}_explicit_filter_entry_word_sets.pkl")
|
||||||
|
@@ -24,7 +28,7 @@ class ExplicitFilter:
|
||||||
with self.filter_file.open('rb') as f:
|
with self.filter_file.open('rb') as f:
|
||||||
entries_by_word_set = pickle.load(f)
|
entries_by_word_set = pickle.load(f)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(f"Load {self.search_type} entries by word set from file: {end - start} seconds")
|
logger.debug(f"Load {self.search_type} entries by word set from file: {end - start} seconds")
|
||||||
else:
|
else:
|
||||||
start = time.time()
|
start = time.time()
|
||||||
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
|
||||||
|
@@ -36,7 +40,7 @@ class ExplicitFilter:
|
||||||
with self.filter_file.open('wb') as f:
|
with self.filter_file.open('wb') as f:
|
||||||
pickle.dump(entries_by_word_set, f)
|
pickle.dump(entries_by_word_set, f)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(f"Convert all {self.search_type} entries to word sets: {end - start} seconds")
|
logger.debug(f"Convert all {self.search_type} entries to word sets: {end - start} seconds")
|
||||||
|
|
||||||
return entries_by_word_set
|
return entries_by_word_set
|
||||||
|
|
||||||
|
@@ -58,7 +62,7 @@ class ExplicitFilter:
|
||||||
required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
|
required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
|
||||||
blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")])
|
blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")])
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(f"Time to extract required, blocked words: {end - start} seconds")
|
logger.debug(f"Time to extract required, blocked words: {end - start} seconds")
|
||||||
|
|
||||||
if len(required_words) == 0 and len(blocked_words) == 0:
|
if len(required_words) == 0 and len(blocked_words) == 0:
|
||||||
return query, entries, embeddings
|
return query, entries, embeddings
|
||||||
|
@@ -82,7 +86,7 @@ class ExplicitFilter:
|
||||||
if words_in_entry.intersection(blocked_words):
|
if words_in_entry.intersection(blocked_words):
|
||||||
entries_to_exclude.add(id)
|
entries_to_exclude.add(id)
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(f"Mark entries to filter: {end - start} seconds")
|
logger.debug(f"Mark entries to filter: {end - start} seconds")
|
||||||
|
|
||||||
# delete entries (and their embeddings) marked for exclusion
|
# delete entries (and their embeddings) marked for exclusion
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
@@ -90,6 +94,6 @@ class ExplicitFilter:
|
||||||
del entries[id]
|
del entries[id]
|
||||||
embeddings = torch.cat((embeddings[:id], embeddings[id+1:]))
|
embeddings = torch.cat((embeddings[:id], embeddings[id+1:]))
|
||||||
end = time.time()
|
end = time.time()
|
||||||
print(f"Remove entries to filter from embeddings: {end - start} seconds")
|
logger.debug(f"Remove entries to filter from embeddings: {end - start} seconds")
|
||||||
|
|
||||||
return query, entries, embeddings
|
return query, entries, embeddings
|
||||||
|
|
Loading…
Reference in a new issue