mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Support exclusion file filters (#826)
### Overview
Support exclude file filters in user search queries

### Details
- All of the exclude file filter terms need to be satisfied
- Any one of the include file filter terms should be satisfied

### Example
- **Search Query**: *what happened yesterday? -file:"tasks.org" -file:"work.md" file:"diary.org" file:"journal.org"*
- **Behavior**: Query will try to find relevant notes in any of `journal.org` or `diary.org` and not in `tasks.org` and not in `work.md`

### Details
* Add support for exclusion file filters
* Translate file filter to valid Django DB entry filter regex
* Exclude all files when there are multiple exclude file filters in the query
  Previously we were applying an "Or" filter, which would exclude any file mentioned in a query with multiple exclude file filters. This is not what we naturally mean when we ask to exclude a file in a query
* Rename, rearrange, deduplicate and add file filter tests

Closes #728

---------

Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
This commit is contained in:
parent
7815e02dd4
commit
05c0aa3882
3 changed files with 77 additions and 29 deletions
|
@ -1048,7 +1048,7 @@ class FileObjectAdapters:
|
||||||
|
|
||||||
|
|
||||||
class EntryAdapters:
|
class EntryAdapters:
|
||||||
word_filer = WordFilter()
|
word_filter = WordFilter()
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
date_filter = DateFilter()
|
date_filter = DateFilter()
|
||||||
|
|
||||||
|
@ -1150,14 +1150,14 @@ class EntryAdapters:
|
||||||
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
|
def apply_filters(user: KhojUser, query: str, file_type_filter: str = None):
|
||||||
q_filter_terms = Q()
|
q_filter_terms = Q()
|
||||||
|
|
||||||
explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query)
|
word_filters = EntryAdapters.word_filter.get_filter_terms(query)
|
||||||
file_filters = EntryAdapters.file_filter.get_filter_terms(query)
|
file_filters = EntryAdapters.file_filter.get_filter_terms(query)
|
||||||
date_filters = EntryAdapters.date_filter.get_query_date_range(query)
|
date_filters = EntryAdapters.date_filter.get_query_date_range(query)
|
||||||
|
|
||||||
if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
|
if len(word_filters) == 0 and len(file_filters) == 0 and len(date_filters) == 0:
|
||||||
return Entry.objects.filter(user=user)
|
return Entry.objects.filter(user=user)
|
||||||
|
|
||||||
for term in explicit_word_terms:
|
for term in word_filters:
|
||||||
if term.startswith("+"):
|
if term.startswith("+"):
|
||||||
q_filter_terms &= Q(raw__icontains=term[1:])
|
q_filter_terms &= Q(raw__icontains=term[1:])
|
||||||
elif term.startswith("-"):
|
elif term.startswith("-"):
|
||||||
|
@ -1167,7 +1167,16 @@ class EntryAdapters:
|
||||||
|
|
||||||
if len(file_filters) > 0:
|
if len(file_filters) > 0:
|
||||||
for term in file_filters:
|
for term in file_filters:
|
||||||
q_file_filter_terms |= Q(file_path__regex=term)
|
if term.startswith("-"):
|
||||||
|
# Convert the glob term to a regex pattern
|
||||||
|
regex_term = re.escape(term[1:]).replace(r"\*", ".*").replace(r"\?", ".")
|
||||||
|
# Exclude all files that match the regex term
|
||||||
|
q_file_filter_terms &= ~Q(file_path__regex=regex_term)
|
||||||
|
else:
|
||||||
|
# Convert the glob term to a regex pattern
|
||||||
|
regex_term = re.escape(term).replace(r"\*", ".*").replace(r"\?", ".")
|
||||||
|
# Include any files that match the regex term
|
||||||
|
q_file_filter_terms |= Q(file_path__regex=regex_term)
|
||||||
|
|
||||||
q_filter_terms &= q_file_filter_terms
|
q_filter_terms &= q_file_filter_terms
|
||||||
|
|
||||||
|
@ -1182,9 +1191,7 @@ class EntryAdapters:
|
||||||
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
|
formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d")
|
||||||
q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date)
|
q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date)
|
||||||
|
|
||||||
relevant_entries = Entry.objects.filter(user=user).filter(
|
relevant_entries = Entry.objects.filter(user=user).filter(q_filter_terms)
|
||||||
q_filter_terms,
|
|
||||||
)
|
|
||||||
if file_type_filter:
|
if file_type_filter:
|
||||||
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
relevant_entries = relevant_entries.filter(file_type=file_type_filter)
|
||||||
return relevant_entries
|
return relevant_entries
|
||||||
|
|
|
@ -11,7 +11,8 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class FileFilter(BaseFilter):
|
class FileFilter(BaseFilter):
|
||||||
file_filter_regex = r'file:"(.+?)" ?'
|
file_filter_regex = r'(?<!-)file:"(.+?)" ?'
|
||||||
|
excluded_file_filter_regex = r'-file:"(.+?)" ?'
|
||||||
|
|
||||||
def __init__(self, entry_key="file"):
|
def __init__(self, entry_key="file"):
|
||||||
self.entry_key = entry_key
|
self.entry_key = entry_key
|
||||||
|
@ -20,7 +21,9 @@ class FileFilter(BaseFilter):
|
||||||
|
|
||||||
def get_filter_terms(self, query: str) -> List[str]:
|
def get_filter_terms(self, query: str) -> List[str]:
|
||||||
"Get all filter terms in query"
|
"Get all filter terms in query"
|
||||||
return [f"{self.convert_to_regex(term)}" for term in re.findall(self.file_filter_regex, query)]
|
required_files = [f"{required_file}" for required_file in re.findall(self.file_filter_regex, query)]
|
||||||
|
excluded_files = [f"-{excluded_file}" for excluded_file in re.findall(self.excluded_file_filter_regex, query)]
|
||||||
|
return required_files + excluded_files
|
||||||
|
|
||||||
def convert_to_regex(self, file_filter: str) -> str:
|
def convert_to_regex(self, file_filter: str) -> str:
|
||||||
"Convert file filter to regex"
|
"Convert file filter to regex"
|
||||||
|
|
|
@ -3,7 +3,7 @@ from khoj.search_filter.file_filter import FileFilter
|
||||||
from khoj.utils.rawconfig import Entry
|
from khoj.utils.rawconfig import Entry
|
||||||
|
|
||||||
|
|
||||||
def test_no_file_filter():
|
def test_can_filter_no_file_filter():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_no_filter = "head tail"
|
q_with_no_filter = "head tail"
|
||||||
|
@ -15,76 +15,114 @@ def test_no_file_filter():
|
||||||
assert can_filter == False
|
assert can_filter == False
|
||||||
|
|
||||||
|
|
||||||
def test_file_filter_with_non_existent_file():
|
def test_can_filter_non_existent_file():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_no_filter = 'head file:"nonexistent.org" tail'
|
q_with_filter = 'head file:"nonexistent.org" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
can_filter = file_filter.can_filter(q_with_no_filter)
|
can_filter = file_filter.can_filter(q_with_filter)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert can_filter == True
|
assert can_filter == True
|
||||||
|
|
||||||
|
|
||||||
def test_single_file_filter():
|
def test_can_filter_single_file_include():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_no_filter = 'head file:"file 1.org" tail'
|
q_with_filter = 'head file:"file 1.org" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
can_filter = file_filter.can_filter(q_with_no_filter)
|
can_filter = file_filter.can_filter(q_with_filter)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert can_filter == True
|
assert can_filter == True
|
||||||
|
|
||||||
|
|
||||||
def test_file_filter_with_partial_match():
|
def test_can_filter_single_file_exclude():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_no_filter = 'head file:"1.org" tail'
|
q_with_filter = 'head -file:"1.org" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
can_filter = file_filter.can_filter(q_with_no_filter)
|
can_filter = file_filter.can_filter(q_with_filter)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert can_filter == True
|
assert can_filter == True
|
||||||
|
|
||||||
|
|
||||||
def test_file_filter_with_regex_match():
|
def test_can_filter_file_with_regex_match():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_no_filter = 'head file:"*.org" tail'
|
q_with_filter = 'head file:"*.org" tail'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
can_filter = file_filter.can_filter(q_with_no_filter)
|
can_filter = file_filter.can_filter(q_with_filter)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert can_filter == True
|
assert can_filter == True
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_file_filter():
|
def test_can_filter_multiple_file_includes():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_no_filter = 'head tail file:"file 1.org" file:"file2.org"'
|
q_with_filter = 'head tail file:"file 1.org" file:"file2.org"'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
can_filter = file_filter.can_filter(q_with_no_filter)
|
can_filter = file_filter.can_filter(q_with_filter)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert can_filter == True
|
assert can_filter == True
|
||||||
|
|
||||||
|
|
||||||
def test_get_file_filter_terms():
|
def test_get_single_include_file_filter_terms():
|
||||||
# Arrange
|
# Arrange
|
||||||
file_filter = FileFilter()
|
file_filter = FileFilter()
|
||||||
q_with_filter_terms = 'head tail file:"file 1.org" file:"/path/to/dir/*.org"'
|
q_with_filter_terms = 'head tail file:"/path/to/dir/*.org"'
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
|
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert filter_terms == ["file 1\\.org", "/path/to/dir/.*\\.org"]
|
assert filter_terms == ["/path/to/dir/*.org"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_single_exclude_file_filter_terms():
|
||||||
|
# Arrange
|
||||||
|
file_filter = FileFilter()
|
||||||
|
q_with_filter_terms = 'head tail -file:"file 1.org"'
|
||||||
|
|
||||||
|
# Act
|
||||||
|
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert filter_terms == ["-file 1.org"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_single_include_exclude_file_filter_terms():
|
||||||
|
# Arrange
|
||||||
|
file_filter = FileFilter()
|
||||||
|
q_with_filter_terms = 'head tail -file:"file 1.org" file:"/path/to/dir/*.org"'
|
||||||
|
|
||||||
|
# Act
|
||||||
|
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert filter_terms == ["/path/to/dir/*.org", "-file 1.org"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_multiple_include_exclude_file_filter_terms():
|
||||||
|
# Arrange
|
||||||
|
file_filter = FileFilter()
|
||||||
|
q_with_filter_terms = (
|
||||||
|
'head -file:"file 1.org" file:"file 1.org" file:"/path/to/dir/.*.org" -file:"/path/to/dir/*.org" tail'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
filter_terms = file_filter.get_filter_terms(q_with_filter_terms)
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert filter_terms == ["file 1.org", "/path/to/dir/.*.org", "-file 1.org", "-/path/to/dir/*.org"]
|
||||||
|
|
||||||
|
|
||||||
def arrange_content():
|
def arrange_content():
|
||||||
|
|
Loading…
Reference in a new issue