diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index b62c2360..dba42094 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1048,7 +1048,7 @@ class FileObjectAdapters: class EntryAdapters: - word_filer = WordFilter() + word_filter = WordFilter() file_filter = FileFilter() date_filter = DateFilter() @@ -1150,14 +1150,14 @@ class EntryAdapters: def apply_filters(user: KhojUser, query: str, file_type_filter: str = None): q_filter_terms = Q() - explicit_word_terms = EntryAdapters.word_filer.get_filter_terms(query) + word_filters = EntryAdapters.word_filter.get_filter_terms(query) file_filters = EntryAdapters.file_filter.get_filter_terms(query) date_filters = EntryAdapters.date_filter.get_query_date_range(query) - if len(explicit_word_terms) == 0 and len(file_filters) == 0 and len(date_filters) == 0: + if len(word_filters) == 0 and len(file_filters) == 0 and len(date_filters) == 0: return Entry.objects.filter(user=user) - for term in explicit_word_terms: + for term in word_filters: if term.startswith("+"): q_filter_terms &= Q(raw__icontains=term[1:]) elif term.startswith("-"): @@ -1167,7 +1167,16 @@ class EntryAdapters: if len(file_filters) > 0: for term in file_filters: - q_file_filter_terms |= Q(file_path__regex=term) + if term.startswith("-"): + # Convert the glob term to a regex pattern + regex_term = re.escape(term[1:]).replace(r"\*", ".*").replace(r"\?", ".") + # Exclude all files that match the regex term + q_file_filter_terms &= ~Q(file_path__regex=regex_term) + else: + # Convert the glob term to a regex pattern + regex_term = re.escape(term).replace(r"\*", ".*").replace(r"\?", ".") + # Include any files that match the regex term + q_file_filter_terms |= Q(file_path__regex=regex_term) q_filter_terms &= q_file_filter_terms @@ -1182,9 +1191,7 @@ class EntryAdapters: formatted_max_date = date.fromtimestamp(max_date).strftime("%Y-%m-%d") q_filter_terms &= Q(embeddings_dates__date__lte=formatted_max_date) - relevant_entries = Entry.objects.filter(user=user).filter( - q_filter_terms, - ) + relevant_entries = Entry.objects.filter(user=user).filter(q_filter_terms) if file_type_filter: relevant_entries = relevant_entries.filter(file_type=file_type_filter) return relevant_entries diff --git a/src/khoj/search_filter/file_filter.py b/src/khoj/search_filter/file_filter.py index b3c3e0b8..9883ea70 100644 --- a/src/khoj/search_filter/file_filter.py +++ b/src/khoj/search_filter/file_filter.py @@ -11,7 +11,8 @@ logger = logging.getLogger(__name__) class FileFilter(BaseFilter): - file_filter_regex = r'file:"(.+?)" ?' + file_filter_regex = r'(? List[str]: "Get all filter terms in query" - return [f"{self.convert_to_regex(term)}" for term in re.findall(self.file_filter_regex, query)] + required_files = [f"{required_file}" for required_file in re.findall(self.file_filter_regex, query)] + excluded_files = [f"-{excluded_file}" for excluded_file in re.findall(self.excluded_file_filter_regex, query)] + return required_files + excluded_files def convert_to_regex(self, file_filter: str) -> str: "Convert file filter to regex" diff --git a/tests/test_file_filter.py b/tests/test_file_filter.py index f5a903f8..9a36bd57 100644 --- a/tests/test_file_filter.py +++ b/tests/test_file_filter.py @@ -3,7 +3,7 @@ from khoj.search_filter.file_filter import FileFilter from khoj.utils.rawconfig import Entry -def test_no_file_filter(): +def test_can_filter_no_file_filter(): # Arrange file_filter = FileFilter() q_with_no_filter = "head tail" @@ -15,76 +15,114 @@ def test_no_file_filter(): assert can_filter == False -def test_file_filter_with_non_existent_file(): +def test_can_filter_non_existent_file(): # Arrange file_filter = FileFilter() - q_with_no_filter = 'head file:"nonexistent.org" tail' + q_with_filter = 'head file:"nonexistent.org" tail' # Act - can_filter = file_filter.can_filter(q_with_no_filter) + can_filter = file_filter.can_filter(q_with_filter) # Assert assert can_filter == True -def test_single_file_filter(): +def test_can_filter_single_file_include(): # Arrange file_filter = FileFilter() - q_with_no_filter = 'head file:"file 1.org" tail' + q_with_filter = 'head file:"file 1.org" tail' # Act - can_filter = file_filter.can_filter(q_with_no_filter) + can_filter = file_filter.can_filter(q_with_filter) # Assert assert can_filter == True -def test_file_filter_with_partial_match(): +def test_can_filter_single_file_exclude(): # Arrange file_filter = FileFilter() - q_with_no_filter = 'head file:"1.org" tail' + q_with_filter = 'head -file:"1.org" tail' # Act - can_filter = file_filter.can_filter(q_with_no_filter) + can_filter = file_filter.can_filter(q_with_filter) # Assert assert can_filter == True -def test_file_filter_with_regex_match(): +def test_can_filter_file_with_regex_match(): # Arrange file_filter = FileFilter() - q_with_no_filter = 'head file:"*.org" tail' + q_with_filter = 'head file:"*.org" tail' # Act - can_filter = file_filter.can_filter(q_with_no_filter) + can_filter = file_filter.can_filter(q_with_filter) # Assert assert can_filter == True -def test_multiple_file_filter(): +def test_can_filter_multiple_file_includes(): # Arrange file_filter = FileFilter() - q_with_no_filter = 'head tail file:"file 1.org" file:"file2.org"' + q_with_filter = 'head tail file:"file 1.org" file:"file2.org"' # Act - can_filter = file_filter.can_filter(q_with_no_filter) + can_filter = file_filter.can_filter(q_with_filter) # Assert assert can_filter == True -def test_get_file_filter_terms(): +def test_get_single_include_file_filter_terms(): # Arrange file_filter = FileFilter() - q_with_filter_terms = 'head tail file:"file 1.org" file:"/path/to/dir/*.org"' + q_with_filter_terms = 'head tail file:"/path/to/dir/*.org"' # Act filter_terms = file_filter.get_filter_terms(q_with_filter_terms) # Assert - assert filter_terms == ["file 1\\.org", "/path/to/dir/.*\\.org"] + assert filter_terms == ["/path/to/dir/*.org"] + + +def test_get_single_exclude_file_filter_terms(): + # Arrange + file_filter = FileFilter() + q_with_filter_terms = 'head tail -file:"file 1.org"' + + # Act + filter_terms = file_filter.get_filter_terms(q_with_filter_terms) + + # Assert + assert filter_terms == ["-file 1.org"] + + +def test_get_single_include_exclude_file_filter_terms(): + # Arrange + file_filter = FileFilter() + q_with_filter_terms = 'head tail -file:"file 1.org" file:"/path/to/dir/*.org"' + + # Act + filter_terms = file_filter.get_filter_terms(q_with_filter_terms) + + # Assert + assert filter_terms == ["/path/to/dir/*.org", "-file 1.org"] + + +def test_get_multiple_include_exclude_file_filter_terms(): + # Arrange + file_filter = FileFilter() + q_with_filter_terms = ( + 'head -file:"file 1.org" file:"file 1.org" file:"/path/to/dir/.*.org" -file:"/path/to/dir/*.org" tail' + ) + + # Act + filter_terms = file_filter.get_filter_terms(q_with_filter_terms) + + # Assert + assert filter_terms == ["file 1.org", "/path/to/dir/.*.org", "-file 1.org", "-/path/to/dir/*.org"] def arrange_content():