Merge branch 'master' of github.com:khoj-ai/khoj into features/internet-enabled-search

This commit is contained in:
sabaimran 2023-11-19 16:20:08 -08:00
commit b8e6883a81
14 changed files with 276 additions and 43 deletions

View file

@ -5,6 +5,15 @@
<title>Document</title> <title>Document</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" /> <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
<meta name="description" content="Description"> <meta name="description" content="Description">
<!-- Open Graph metadata -->
<meta property="og:title" content="Khoj Documentation">
<meta property="og:type" content="website">
<meta property="og:site_name" content="Khoj Documentation">
<meta property="og:description" content="Quickly get started with using or self-hosting Khoj">
<meta property="og:image" content="https://khoj-web-bucket.s3.amazonaws.com/link_preview_docs.png">
<meta property="og:url" content="https://docs.khoj.dev">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0">
<link rel="stylesheet" href="//cdn.jsdelivr.net/npm/docsify/lib/themes/buble.css" /> <link rel="stylesheet" href="//cdn.jsdelivr.net/npm/docsify/lib/themes/buble.css" />
<link rel="icon" href="./assets/favicon-128x128.ico"> <link rel="icon" href="./assets/favicon-128x128.ico">

View file

@ -0,0 +1,27 @@
# Generated by Django 4.2.7 on 2023-11-19 22:20
from django.db import migrations, models
class Migration(migrations.Migration):
    """Alter the GoogleUser name columns to allow blank and NULL values.

    `family_name`, `given_name` and `name` each become a 200 character
    CharField with `blank=True`, `null=True` and a default of `None`.
    """

    dependencies = [
        ("database", "0018_searchmodelconfig_delete_searchmodel"),
    ]

    # The three columns receive the exact same field definition, so build
    # one AlterField operation (with its own field instance) per column.
    operations = [
        migrations.AlterField(
            model_name="googleuser",
            name=column,
            field=models.CharField(blank=True, default=None, max_length=200, null=True),
        )
        for column in ("family_name", "given_name", "name")
    ]

View file

@ -27,9 +27,9 @@ class GoogleUser(models.Model):
sub = models.CharField(max_length=200) sub = models.CharField(max_length=200)
azp = models.CharField(max_length=200) azp = models.CharField(max_length=200)
email = models.CharField(max_length=200) email = models.CharField(max_length=200)
name = models.CharField(max_length=200) name = models.CharField(max_length=200, null=True, default=None, blank=True)
given_name = models.CharField(max_length=200) given_name = models.CharField(max_length=200, null=True, default=None, blank=True)
family_name = models.CharField(max_length=200) family_name = models.CharField(max_length=200, null=True, default=None, blank=True)
picture = models.CharField(max_length=200, null=True, default=None) picture = models.CharField(max_length=200, null=True, default=None)
locale = models.CharField(max_length=200) locale = models.CharField(max_length=200)

View file

@ -100,16 +100,17 @@ class OrgToEntries(TextToEntries):
continue continue
todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else "" todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else ""
# Prepend filename as top heading to entry
filename = Path(entry_to_file_map[parsed_entry]).stem # Prepend ancestor headings, filename as top heading to entry for context
ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry])
if parsed_entry.heading: if parsed_entry.heading:
heading = f"* {filename}\n** {todo_str}{parsed_entry.heading}." heading = f"* Path: {ancestors_trail}\n** {todo_str}{parsed_entry.heading}."
else: else:
heading = f"* {filename}." heading = f"* Path: {ancestors_trail}."
compiled = heading compiled = heading
if state.verbose > 2: if state.verbose > 2:
logger.debug(f"Title: {parsed_entry.heading}") logger.debug(f"Title: {heading}")
if parsed_entry.tags: if parsed_entry.tags:
tags_str = " ".join(parsed_entry.tags) tags_str = " ".join(parsed_entry.tags)

View file

@ -80,6 +80,7 @@ def makelist(file, filename):
} # populated from #+SEQ_TODO line } # populated from #+SEQ_TODO line
level = "" level = ""
heading = "" heading = ""
ancestor_headings = []
bodytext = "" bodytext = ""
introtext = "" introtext = ""
tags = list() # set of all tags in headline tags = list() # set of all tags in headline
@ -98,7 +99,7 @@ def makelist(file, filename):
heading_search = re.search(r"^(\*+)\s(.*?)\s*$", line) heading_search = re.search(r"^(\*+)\s(.*?)\s*$", line)
if heading_search: # we are processing a heading line if heading_search: # we are processing a heading line
if heading: # if we have are on second heading, append first heading to headings list if heading: # if we have are on second heading, append first heading to headings list
thisNode = Orgnode(level, heading, bodytext, tags) thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings)
if closed_date: if closed_date:
thisNode.closed = closed_date thisNode.closed = closed_date
closed_date = "" closed_date = ""
@ -114,6 +115,8 @@ def makelist(file, filename):
thisNode.properties = property_map thisNode.properties = property_map
nodelist.append(thisNode) nodelist.append(thisNode)
property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"} property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"}
previous_level = level
previous_heading = heading
level = heading_search.group(1) level = heading_search.group(1)
heading = heading_search.group(2) heading = heading_search.group(2)
bodytext = "" bodytext = ""
@ -126,6 +129,17 @@ def makelist(file, filename):
for parsedtag in parsedtags.split(":"): for parsedtag in parsedtags.split(":"):
if parsedtag != "": if parsedtag != "":
tags.append(parsedtag) tags.append(parsedtag)
# Add previous heading to ancestors if current heading is deeper than previous level
if len(level) > len(previous_level) and previous_heading:
ancestor_headings.append(previous_heading)
# Remove last ancestor(s) if current heading is shallower than previous level
elif len(level) < len(previous_level):
for _ in range(len(level), len(previous_level)):
if not ancestor_headings or len(ancestor_headings) == 0:
break
ancestor_headings.pop()
else: # we are processing a non-heading line else: # we are processing a non-heading line
if line[:10] == "#+SEQ_TODO": if line[:10] == "#+SEQ_TODO":
kwlist = re.findall(r"([A-Z]+)\(", line) kwlist = re.findall(r"([A-Z]+)\(", line)
@ -216,7 +230,7 @@ def makelist(file, filename):
nodelist = [thisNode] + nodelist nodelist = [thisNode] + nodelist
# write out last heading node # write out last heading node
if heading: if heading:
thisNode = Orgnode(level, heading, bodytext, tags) thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings)
thisNode.properties = property_map thisNode.properties = property_map
if sched_date: if sched_date:
thisNode.scheduled = sched_date thisNode.scheduled = sched_date
@ -243,6 +257,9 @@ def makelist(file, filename):
n.priority = priority_search.group(1) n.priority = priority_search.group(1)
n.heading = priority_search.group(2) n.heading = priority_search.group(2)
# Prefix filepath/title to ancestors
n.ancestors = [file_title] + n.ancestors
# Set SOURCE property to a file+heading based org-mode link to the entry # Set SOURCE property to a file+heading based org-mode link to the entry
if n.level == 0: if n.level == 0:
n.properties["LINE"] = f"file:{normalize_filename(filename)}::0" n.properties["LINE"] = f"file:{normalize_filename(filename)}::0"
@ -261,7 +278,7 @@ class Orgnode(object):
with the headline. with the headline.
""" """
def __init__(self, level, headline, body, tags): def __init__(self, level, headline, body, tags, ancestor_headings=[]):
""" """
Create an Orgnode object given the parameters of level (as the Create an Orgnode object given the parameters of level (as the
raw asterisks), headline text (including the TODO tag), and raw asterisks), headline text (including the TODO tag), and
@ -279,8 +296,21 @@ class Orgnode(object):
self._closed = "" # Closed date self._closed = "" # Closed date
self._properties = dict() self._properties = dict()
self._logbook = list() # List of clock-in, clock-out tuples representing logbook entries self._logbook = list() # List of clock-in, clock-out tuples representing logbook entries
self._ancestor_headings = ancestor_headings.copy()
# Look for priority in headline and transfer to prty field @property
def ancestors(self) -> List[str]:
"""
Return the ancestor headings of the node
"""
return self._ancestor_headings
@ancestors.setter
def ancestors(self, new_ancestors):
"""
Update the ancestor headings of the node
"""
self._ancestor_headings = new_ancestors
@property @property
def heading(self): def heading(self):

View file

@ -384,6 +384,45 @@ def sample_org_data():
def get_sample_data(type): def get_sample_data(type):
sample_data = { sample_data = {
"org": { "org": {
"elisp.org": """
* Emacs Khoj
/An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation
*** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp
;; Khoj Package
(use-package khoj
:load-path "~/.emacs.d/lisp/khoj.el"
:bind ("C-c s" . 'khoj))
#+end_src
*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp
;; Khoj Package
(use-package khoj
:quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
:bind ("C-c s" . 'khoj))
#+end_src
** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language
e.g "What is the meaning of life?" "What are my life goals?"
3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query
e.g "What is the meaning of life? -god +none"
""",
"readme.org": """ "readme.org": """
* Khoj * Khoj
/Allow natural language search on user content like notes, images using transformer based models/ /Allow natural language search on user content like notes, images using transformer based models/
@ -399,7 +438,7 @@ def get_sample_data(type):
git clone https://github.com/khoj-ai/khoj && cd khoj git clone https://github.com/khoj-ai/khoj && cd khoj
conda env create -f environment.yml conda env create -f environment.yml
conda activate khoj conda activate khoj
#+end_src""" #+end_src""",
}, },
"markdown": { "markdown": {
"readme.markdown": """ "readme.markdown": """

View file

@ -4,10 +4,9 @@
** Requirements ** Requirements
- Install and Run [[https://github.com/khoj-ai/khoj][khoj]] - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
** Installation ** Install
- Direct Install *** Direct
- Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp - Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp
- Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
#+begin_src elisp #+begin_src elisp
;; Khoj Package ;; Khoj Package
@ -16,7 +15,7 @@
:bind ("C-c s" . 'khoj)) :bind ("C-c s" . 'khoj))
#+end_src #+end_src
- Use [[https://github.com/quelpa/quelpa#installation][Quelpa]] *** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
- Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
- Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it. - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
#+begin_src elisp #+begin_src elisp
@ -28,17 +27,10 @@
** Usage ** Usage
1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~ 1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
2. Enter Query in Natural Language 2. Enter Query in Natural Language
e.g "What is the meaning of life?" "What are my life goals?" e.g "What is the meaning of life?" "What are my life goals?"
3. Wait for results 3. Wait for results
*Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files* *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
4. (Optional) Narrow down results further 4. (Optional) Narrow down results further
Include/Exclude specific words from results by adding to query Include/Exclude specific words from results by adding to query
e.g "What is the meaning of life? -god +none" e.g "What is the meaning of life? -god +none"

View file

@ -22,16 +22,16 @@
#+end_src #+end_src
** Use ** Use
- *Khoj via Emacs* *** *Khoj via Emacs*
- [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]] - [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]]
- Run ~M-x khoj <user-query>~ or Call ~C-c C-s~ - Run ~M-x khoj <user-query>~ or Call ~C-c C-s~
- *Khoj via API* *** *Khoj via API*
- Query: ~GET~ [[http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:42110/api/search?q="What is the meaning of life"]] - Query: ~GET~ [[http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:42110/api/search?q="What is the meaning of life"]]
- Update Index: ~GET~ [[http://localhost:42110/api/update][http://localhost:42110/api/update]] - Update Index: ~GET~ [[http://localhost:42110/api/update][http://localhost:42110/api/update]]
- [[http://localhost:42110/docs][Khoj API Docs]] - [[http://localhost:42110/docs][Khoj API Docs]]
- *Call Khoj via Python Script Directly* *** *Call Khoj via Python Script Directly*
#+begin_src shell #+begin_src shell
python3 search_types/asymmetric.py \ python3 search_types/asymmetric.py \
--compressed-jsonl .notes.jsonl.gz \ --compressed-jsonl .notes.jsonl.gz \

View file

@ -321,7 +321,7 @@ def test_notes_search_with_include_filter(client, sample_org_data, default_user:
assert response.status_code == 200 assert response.status_code == 200
# assert actual_data contains word "Emacs" # assert actual_data contains word "Emacs"
search_result = response.json()[0]["entry"] search_result = response.json()[0]["entry"]
assert "Emacs" in search_result assert "emacs" in search_result
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -347,6 +347,27 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
assert "clone" not in search_result assert "clone" not in search_result
# ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
def test_notes_search_requires_parent_context(
    client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
):
    """Querying "Install Khoj on Emacs" returns one org entry mentioning "Emacs load path"."""
    # Arrange
    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    auth_headers = {"Authorization": "Bearer kk-secret"}
    encoded_query = quote("Install Khoj on Emacs")
    search_url = f"/api/search?q={encoded_query}&n=1&t=org&r=true&max_distance=0.18"

    # Act
    response = client.get(search_url, headers=auth_headers)

    # Assert
    assert response.status_code == 200
    results = response.json()
    assert len(results) == 1, "Expected only 1 result"
    top_entry = results[0]["entry"]
    assert "Emacs load path" in top_entry, "Expected 'Emacs load path' in search result"
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True) @pytest.mark.django_db(transaction=True)
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser): def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):

View file

@ -69,7 +69,7 @@ def test_index_update_with_user2_inaccessible_user1(client, api_user2: KhojApiUs
# Assert # Assert
assert update_response.status_code == 200 assert update_response.status_code == 200
assert len(results) == 4 assert len(results) == 5
for result in results: for result in results:
assert result["additional"]["file"] not in source_file_symbol assert result["additional"]["file"] not in source_file_symbol

View file

@ -45,9 +45,10 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
assert is_none_or_empty(jsonl_data) assert is_none_or_empty(jsonl_data)
def test_entry_split_when_exceeds_max_words(tmp_path): def test_entry_split_when_exceeds_max_words():
"Ensure entries with compiled words exceeding max_words are split." "Ensure entries with compiled words exceeding max_words are split."
# Arrange # Arrange
tmp_path = "/tmp/test.org"
entry = f"""*** Heading entry = f"""*** Heading
\t\r \t\r
Body Line Body Line
@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
data = { data = {
f"{tmp_path}": entry, f"{tmp_path}": entry,
} }
expected_heading = f"* {tmp_path.stem}\n** Heading" expected_heading = f"* Path: {tmp_path}\n** Heading"
# Act # Act
# Extract Entries from specified Org files # Extract Entries from specified Org files

View file

@ -161,6 +161,8 @@ Body Line 1"""
assert len(entries) == 1 assert len(entries) == 1
# parsed heading from entry # parsed heading from entry
assert entries[0].heading == "Heading[1]" assert entries[0].heading == "Heading[1]"
# track ancestors of entry
assert entries[0].ancestors == [f"{orgfile}"]
# ensure SOURCE link has square brackets in filename, heading escaped in rendered entries # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]") escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]")
assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}" assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}"
@ -260,6 +262,7 @@ Body Line 1"""
assert entries[0].closed == "" assert entries[0].closed == ""
assert entries[0].scheduled == "" assert entries[0].scheduled == ""
assert entries[0].deadline == "" assert entries[0].deadline == ""
assert entries[0].ancestors == ["test"]
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -284,6 +287,7 @@ Body Line 1
assert entries[0].closed == "" assert entries[0].closed == ""
assert entries[0].scheduled == "" assert entries[0].scheduled == ""
assert entries[0].deadline == "" assert entries[0].deadline == ""
assert entries[0].ancestors == ["title1 title2"]
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -304,8 +308,10 @@ entry body
assert len(entries) == 2 assert len(entries) == 2
assert entries[0].heading == "Title" assert entries[0].heading == "Title"
assert entries[0].body == "intro body\n" assert entries[0].body == "intro body\n"
assert entries[0].ancestors == ["Title"]
assert entries[1].heading == "Entry Heading" assert entries[1].heading == "Entry Heading"
assert entries[1].body == "entry body\n\n" assert entries[1].body == "entry body\n\n"
assert entries[1].ancestors == ["Title"]
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -326,8 +332,93 @@ entry body
assert len(entries) == 2 assert len(entries) == 2
assert entries[0].heading == "Title1 Title2" assert entries[0].heading == "Title1 Title2"
assert entries[0].body == "intro body\n" assert entries[0].body == "intro body\n"
assert entries[0].ancestors == ["Title1 Title2"]
assert entries[1].heading == "Entry Heading" assert entries[1].heading == "Entry Heading"
assert entries[1].body == "entry body\n\n" assert entries[1].body == "entry body\n\n"
assert entries[0].ancestors == ["Title1 Title2"]
# ----------------------------------------------------------------------------------------------------
def test_parse_org_with_single_ancestor_heading(tmp_path):
    """A sub heading lists the org file path, then its parent heading, as ancestors."""
    # Arrange
    org_text = """
* Heading 1
body 1
** Sub Heading 1
"""
    orgfile = create_file(tmp_path, org_text)
    file_ancestor = str(orgfile)
    # (heading, ancestors) expected for each parsed entry, in document order
    expected = [
        ("Heading 1", [file_ancestor]),
        ("Sub Heading 1", [file_ancestor, "Heading 1"]),
    ]

    # Act
    entries = orgnode.makelist_with_filepath(orgfile)

    # Assert
    assert len(entries) == 2
    for entry, (heading, ancestors) in zip(entries, expected):
        assert entry.heading == heading
        assert entry.ancestors == ancestors
# ----------------------------------------------------------------------------------------------------
def test_parse_org_with_multiple_ancestor_headings(tmp_path):
    """Each nesting level appends its parent heading to the entry's ancestors."""
    # Arrange
    org_text = """
* Heading 1
body 1
** Sub Heading 1
*** Sub Sub Heading 1
sub sub body 1
"""
    orgfile = create_file(tmp_path, org_text)
    file_ancestor = str(orgfile)
    # (heading, ancestors) expected for each parsed entry, in document order
    expected = [
        ("Heading 1", [file_ancestor]),
        ("Sub Heading 1", [file_ancestor, "Heading 1"]),
        ("Sub Sub Heading 1", [file_ancestor, "Heading 1", "Sub Heading 1"]),
    ]

    # Act
    entries = orgnode.makelist_with_filepath(orgfile)

    # Assert
    assert len(entries) == 3
    for entry, (heading, ancestors) in zip(entries, expected):
        assert entry.heading == heading
        assert entry.ancestors == ancestors
# ----------------------------------------------------------------------------------------------------
def test_parse_org_with_multiple_ancestor_headings_of_siblings(tmp_path):
    """Sibling headings share ancestors; a shallower heading drops the deeper ones."""
    # Arrange
    org_text = """
* Heading 1
body 1
** Sub Heading 1
*** Sub Sub Heading 1
sub sub body 1
*** Sub Sub Heading 2
** Sub Heading 2
*** Sub Sub Heading 3
"""
    orgfile = create_file(tmp_path, org_text)
    file_ancestor = str(orgfile)
    # (heading, ancestors) expected for each parsed entry, in document order
    expected = [
        ("Heading 1", [file_ancestor]),
        ("Sub Heading 1", [file_ancestor, "Heading 1"]),
        ("Sub Sub Heading 1", [file_ancestor, "Heading 1", "Sub Heading 1"]),
        ("Sub Sub Heading 2", [file_ancestor, "Heading 1", "Sub Heading 1"]),
        ("Sub Heading 2", [file_ancestor, "Heading 1"]),
        ("Sub Sub Heading 3", [file_ancestor, "Heading 1", "Sub Heading 2"]),
    ]

    # Act
    entries = orgnode.makelist_with_filepath(orgfile)

    # Assert
    assert len(entries) == 6
    for entry, (heading, ancestors) in zip(entries, expected):
        assert entry.heading == heading
        assert entry.ancestors == ancestors
# Helper Functions # Helper Functions

View file

@ -70,7 +70,7 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert # Assert
assert "Deleted 3 entries. Created 0 new entries for user " in caplog.records[-1].message assert "Deleted 8 entries. Created 0 new entries for user " in caplog.records[-1].message
verify_embeddings(0, default_user) verify_embeddings(0, default_user)
@ -90,7 +90,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
# Assert # Assert
assert "Deleting all entries for file type org" in caplog.text assert "Deleting all entries for file type org" in caplog.text
assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -106,7 +106,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
text_search.setup(OrgToEntries, data, regenerate=True, user=default_user) text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
# Assert # Assert
assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -161,7 +161,7 @@ async def test_text_search(search_config: SearchConfig):
default_user, default_user,
) )
query = "How to git install application?" query = "Load Khoj on Emacs?"
# Act # Act
hits = await text_search.query(default_user, query) hits = await text_search.query(default_user, query)
@ -170,7 +170,7 @@ async def test_text_search(search_config: SearchConfig):
# Assert # Assert
search_result = results[0].entry search_result = results[0].entry
assert "git clone" in search_result, 'search result did not contain "git clone" entry' assert "Emacs load path" in search_result, 'Expected "Emacs load path" in entry'
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -284,9 +284,9 @@ def test_regenerate_index_with_new_entry(
final_logs = caplog.text final_logs = caplog.text
# Assert # Assert
assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs
assert "Deleted 10 entries. Created 11 new entries for user " in final_logs assert "Deleted 13 entries. Created 14 new entries for user " in final_logs
verify_embeddings(11, default_user) verify_embeddings(14, default_user)
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@ -320,7 +320,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
# Assert # Assert
# verify only 1 entry added even if there are multiple duplicate entries # verify only 1 entry added even if there are multiple duplicate entries
assert "Deleted 3 entries. Created 1 new entries for user " in initial_logs assert "Deleted 8 entries. Created 1 new entries for user " in initial_logs
assert "Deleted 0 entries. Created 0 new entries for user " in final_logs assert "Deleted 0 entries. Created 0 new entries for user " in final_logs
verify_embeddings(1, default_user) verify_embeddings(1, default_user)
@ -357,7 +357,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
# Assert # Assert
# verify only 1 entry added even if there are multiple duplicate entries # verify only 1 entry added even if there are multiple duplicate entries
assert "Deleted 3 entries. Created 2 new entries for user " in initial_logs assert "Deleted 8 entries. Created 2 new entries for user " in initial_logs
assert "Deleted 1 entries. Created 0 new entries for user " in final_logs assert "Deleted 1 entries. Created 0 new entries for user " in final_logs
verify_embeddings(1, default_user) verify_embeddings(1, default_user)
@ -388,9 +388,9 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
final_logs = caplog.text final_logs = caplog.text
# Assert # Assert
assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs
assert "Deleted 0 entries. Created 1 new entries for user " in final_logs assert "Deleted 0 entries. Created 1 new entries for user " in final_logs
verify_embeddings(11, default_user) verify_embeddings(14, default_user)
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------

View file

@ -3,6 +3,25 @@ from khoj.search_filter.word_filter import WordFilter
from khoj.utils.rawconfig import Entry from khoj.utils.rawconfig import Entry
# Test
# ----------------------------------------------------------------------------------------------------
def test_no_word_filter():
    """A query of plain words is not filterable and yields no filter terms."""
    # Arrange
    filter_under_test = WordFilter()
    plain_query = "head tail"

    # Act
    is_filterable = filter_under_test.can_filter(plain_query)
    extracted_terms = filter_under_test.get_filter_terms(plain_query)

    # Assert
    assert is_filterable == False
    assert extracted_terms == []
# ----------------------------------------------------------------------------------------------------
def test_word_exclude_filter(): def test_word_exclude_filter():
# Arrange # Arrange
word_filter = WordFilter() word_filter = WordFilter()
@ -15,6 +34,7 @@ def test_word_exclude_filter():
assert can_filter == True assert can_filter == True
# ----------------------------------------------------------------------------------------------------
def test_word_include_filter(): def test_word_include_filter():
# Arrange # Arrange
word_filter = WordFilter() word_filter = WordFilter()
@ -27,6 +47,7 @@ def test_word_include_filter():
assert can_filter == True assert can_filter == True
# ----------------------------------------------------------------------------------------------------
def test_word_include_and_exclude_filter(): def test_word_include_and_exclude_filter():
# Arrange # Arrange
word_filter = WordFilter() word_filter = WordFilter()
@ -39,6 +60,7 @@ def test_word_include_and_exclude_filter():
assert can_filter == True assert can_filter == True
# ----------------------------------------------------------------------------------------------------
def test_get_word_filter_terms(): def test_get_word_filter_terms():
# Arrange # Arrange
word_filter = WordFilter() word_filter = WordFilter()