Use Python Logging Framework to Time Performance of Explicit Filter

This commit is contained in:
Debanjum Singh Solanky 2022-09-03 22:14:37 +03:00
parent 30c3eb372a
commit ffb8e3988e

View file

@@ -2,6 +2,7 @@
import re import re
import time import time
import pickle import pickle
import logging
# External Packages # External Packages
import torch import torch
@@ -11,6 +12,9 @@ from src.utils.helpers import resolve_absolute_path
from src.utils.config import SearchType from src.utils.config import SearchType
logger = logging.getLogger(__name__)
class ExplicitFilter: class ExplicitFilter:
def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'): def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'):
self.filter_file = resolve_absolute_path(filter_directory / f"{search_type.name.lower()}_explicit_filter_entry_word_sets.pkl") self.filter_file = resolve_absolute_path(filter_directory / f"{search_type.name.lower()}_explicit_filter_entry_word_sets.pkl")
@@ -24,7 +28,7 @@ class ExplicitFilter:
with self.filter_file.open('rb') as f: with self.filter_file.open('rb') as f:
entries_by_word_set = pickle.load(f) entries_by_word_set = pickle.load(f)
end = time.time() end = time.time()
print(f"Load {self.search_type} entries by word set from file: {end - start} seconds") logger.debug(f"Load {self.search_type} entries by word set from file: {end - start} seconds")
else: else:
start = time.time() start = time.time()
entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
@@ -36,7 +40,7 @@ class ExplicitFilter:
with self.filter_file.open('wb') as f: with self.filter_file.open('wb') as f:
pickle.dump(entries_by_word_set, f) pickle.dump(entries_by_word_set, f)
end = time.time() end = time.time()
print(f"Convert all {self.search_type} entries to word sets: {end - start} seconds") logger.debug(f"Convert all {self.search_type} entries to word sets: {end - start} seconds")
return entries_by_word_set return entries_by_word_set
@@ -58,7 +62,7 @@ class ExplicitFilter:
required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")]) required_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("+")])
blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")]) blocked_words = set([word[1:].lower() for word in raw_query.split() if word.startswith("-")])
end = time.time() end = time.time()
print(f"Time to extract required, blocked words: {end - start} seconds") logger.debug(f"Time to extract required, blocked words: {end - start} seconds")
if len(required_words) == 0 and len(blocked_words) == 0: if len(required_words) == 0 and len(blocked_words) == 0:
return query, entries, embeddings return query, entries, embeddings
@@ -82,7 +86,7 @@ class ExplicitFilter:
if words_in_entry.intersection(blocked_words): if words_in_entry.intersection(blocked_words):
entries_to_exclude.add(id) entries_to_exclude.add(id)
end = time.time() end = time.time()
print(f"Mark entries to filter: {end - start} seconds") logger.debug(f"Mark entries to filter: {end - start} seconds")
# delete entries (and their embeddings) marked for exclusion # delete entries (and their embeddings) marked for exclusion
start = time.time() start = time.time()
@@ -90,6 +94,6 @@ class ExplicitFilter:
del entries[id] del entries[id]
embeddings = torch.cat((embeddings[:id], embeddings[id+1:])) embeddings = torch.cat((embeddings[:id], embeddings[id+1:]))
end = time.time() end = time.time()
print(f"Remove entries to filter from embeddings: {end - start} seconds") logger.debug(f"Remove entries to filter from embeddings: {end - start} seconds")
return query, entries, embeddings return query, entries, embeddings