Index Parent Headings of Org-Mode Entries to Improve Search Context (#548)

### Overview
The parent hierarchy of org-mode entries can store important context. This change updates OrgNode to track the parent headings of each org entry and adds the parent outline of each entry to the index.

### Details
- Test search uses ancestor headings as context for improved results
- Add ancestor headings of each org-mode entry to their compiled form
- Track ancestor headings for each org-mode entry in the org-node parser

Resolves #85
2024-11-27 17:35:07 +01:00 · 2023-11-19 13:18:19 -08:00 · 2023-11-19 13:18:19 -08:00 · 71799add0b
commit 71799add0b
parent cfd76b8472 33ad9b8e64
10 changed files with 215 additions and 40 deletions
--- a/src/khoj/processor/org_mode/org_to_entries.py
+++ b/src/khoj/processor/org_mode/org_to_entries.py
@ -100,16 +100,17 @@ class OrgToEntries(TextToEntries):
                continue
            todo_str = f"{parsed_entry.todo} " if parsed_entry.todo else ""
-            # Prepend filename as top heading to entry
+
-            filename = Path(entry_to_file_map[parsed_entry]).stem
+            # Prepend ancestor headings, filename as top heading to entry for context
            ancestors_trail = " / ".join(parsed_entry.ancestors) or Path(entry_to_file_map[parsed_entry])
            if parsed_entry.heading:
-                heading = f"* {filename}\n** {todo_str}{parsed_entry.heading}."
+                heading = f"* Path: {ancestors_trail}\n** {todo_str}{parsed_entry.heading}."
            else:
-                heading = f"* {filename}."
+                heading = f"* Path: {ancestors_trail}."
            compiled = heading
            if state.verbose > 2:
-                logger.debug(f"Title: {parsed_entry.heading}")
+                logger.debug(f"Title: {heading}")
            if parsed_entry.tags:
                tags_str = " ".join(parsed_entry.tags)
--- a/src/khoj/processor/org_mode/orgnode.py
+++ b/src/khoj/processor/org_mode/orgnode.py
@ -80,6 +80,7 @@ def makelist(file, filename):
    }  # populated from #+SEQ_TODO line
    level = ""
    heading = ""
    ancestor_headings = []
    bodytext = ""
    introtext = ""
    tags = list()  # set of all tags in headline
@ -98,7 +99,7 @@ def makelist(file, filename):
        heading_search = re.search(r"^(\*+)\s(.*?)\s*$", line)
        if heading_search:  # we are processing a heading line
            if heading:  # if we have are on second heading, append first heading to headings list
-                thisNode = Orgnode(level, heading, bodytext, tags)
+                thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings)
                if closed_date:
                    thisNode.closed = closed_date
                    closed_date = ""
@ -114,6 +115,8 @@ def makelist(file, filename):
                thisNode.properties = property_map
                nodelist.append(thisNode)
            property_map = {"LINE": f"file:{normalize_filename(filename)}::{ctr}"}
            previous_level = level
            previous_heading = heading
            level = heading_search.group(1)
            heading = heading_search.group(2)
            bodytext = ""
@ -126,6 +129,17 @@ def makelist(file, filename):
                    for parsedtag in parsedtags.split(":"):
                        if parsedtag != "":
                            tags.append(parsedtag)
            # Add previous heading to ancestors if current heading is deeper than previous level
            if len(level) > len(previous_level) and previous_heading:
                ancestor_headings.append(previous_heading)
            # Remove last ancestor(s) if current heading is shallower than previous level.
            # One ancestor is dropped per level climbed, stopping early once none are left.
            # NOTE(review): a single ancestor is pushed per descent, however many levels were
            # skipped. So for `* A`, `*** B`, `** C` the pop below leaves C without its parent A.
            # Confirm whether org files with skipped heading levels need to be supported.
            elif len(level) < len(previous_level):
                for _ in range(len(level), len(previous_level)):
                    if not ancestor_headings:
                        break
                    ancestor_headings.pop()
        else:  # we are processing a non-heading line
            if line[:10] == "#+SEQ_TODO":
                kwlist = re.findall(r"([A-Z]+)\(", line)
@ -216,7 +230,7 @@ def makelist(file, filename):
        nodelist = [thisNode] + nodelist
    # write out last heading node
    if heading:
-        thisNode = Orgnode(level, heading, bodytext, tags)
+        thisNode = Orgnode(level, heading, bodytext, tags, ancestor_headings)
        thisNode.properties = property_map
        if sched_date:
            thisNode.scheduled = sched_date
@ -243,6 +257,9 @@ def makelist(file, filename):
            n.priority = priority_search.group(1)
            n.heading = priority_search.group(2)
        # Prefix filepath/title to ancestors
        n.ancestors = [file_title] + n.ancestors
        # Set SOURCE property to a file+heading based org-mode link to the entry
        if n.level == 0:
            n.properties["LINE"] = f"file:{normalize_filename(filename)}::0"
@ -261,7 +278,7 @@ class Orgnode(object):
    with the headline.
    """
-    def __init__(self, level, headline, body, tags):
+    def __init__(self, level, headline, body, tags, ancestor_headings=[]):
        """
        Create an Orgnode object given the parameters of level (as the
        raw asterisks), headline text (including the TODO tag), and
@ -279,8 +296,21 @@ class Orgnode(object):
        self._closed = ""  # Closed date
        self._properties = dict()
        self._logbook = list()  # List of clock-in, clock-out tuples representing logbook entries
        self._ancestor_headings = ancestor_headings.copy()
-        # Look for priority in headline and transfer to prty field
+    @property
    def ancestors(self) -> List[str]:
        """
        Return the ancestor headings of the node

        The headings are ordered from the outermost ancestor to the immediate parent.
        The node's internal list is returned as is, not a copy.
        """
        return self._ancestor_headings
    @ancestors.setter
    def ancestors(self, new_ancestors: List[str]) -> None:
        """
        Update the ancestor headings of the node

        The given list replaces the current ancestors and is stored as is, not copied.
        """
        self._ancestor_headings = new_ancestors
    @property
    def heading(self):
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -384,6 +384,45 @@ def sample_org_data():
 def get_sample_data(type):
    sample_data = {
        "org": {
            "elisp.org": """
 * Emacs Khoj
  /An Emacs interface for [[https://github.com/khoj-ai/khoj][khoj]]/
 ** Requirements
   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
 ** Installation
 *** Direct
     - Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp
     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
       #+begin_src elisp
         ;; Khoj Package
         (use-package khoj
           :load-path "~/.emacs.d/lisp/khoj.el"
           :bind ("C-c s" . 'khoj))
       #+end_src
 *** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
       #+begin_src elisp
         ;; Khoj Package
         (use-package khoj
           :quelpa (khoj :fetcher url :url "https://raw.githubusercontent.com/khoj-ai/khoj/master/interface/emacs/khoj.el")
           :bind ("C-c s" . 'khoj))
       #+end_src
 ** Usage
   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
   2. Enter Query in Natural Language
      e.g "What is the meaning of life?" "What are my life goals?"
   3. Wait for results
      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
   4. (Optional) Narrow down results further
      Include/Exclude specific words from results by adding to query
      e.g "What is the meaning of life? -god +none"
 """,
            "readme.org": """
 * Khoj
  /Allow natural language search on user content like notes, images using transformer based models/
@ -399,7 +438,7 @@ def get_sample_data(type):
   git clone https://github.com/khoj-ai/khoj && cd khoj
   conda env create -f environment.yml
   conda activate khoj
-   #+end_src"""
+   #+end_src""",
        },
        "markdown": {
            "readme.markdown": """
--- a/tests/data/org/interface_emacs_readme.org
+++ b/tests/data/org/interface_emacs_readme.org
@ -4,10 +4,9 @@
 ** Requirements
   - Install and Run [[https://github.com/khoj-ai/khoj][khoj]]
-** Installation
+** Install
-   - Direct Install
+*** Direct
     - Put ~khoj.el~ in your Emacs load path. For e.g ~/.emacs.d/lisp
     - Load via ~use-package~ in your ~/.emacs.d/init.el or .emacs file by adding below snippet
       #+begin_src elisp
         ;; Khoj Package
@ -16,7 +15,7 @@
           :bind ("C-c s" . 'khoj))
       #+end_src
-   - Use [[https://github.com/quelpa/quelpa#installation][Quelpa]]
+*** Using [[https://github.com/quelpa/quelpa#installation][Quelpa]]
     - Ensure [[https://github.com/quelpa/quelpa#installation][Quelpa]], [[https://github.com/quelpa/quelpa-use-package#installation][quelpa-use-package]] are installed
     - Add below snippet to your ~/.emacs.d/init.el or .emacs config file and execute it.
       #+begin_src elisp
@ -28,17 +27,10 @@
 ** Usage
   1. Call ~khoj~ using keybinding ~C-c s~ or ~M-x khoj~
   2. Enter Query in Natural Language
      e.g "What is the meaning of life?" "What are my life goals?"
   3. Wait for results
      *Note: It takes about 15s on a Mac M1 and a ~100K lines corpus of org-mode files*
   4. (Optional) Narrow down results further
      Include/Exclude specific words from results by adding to query
      e.g "What is the meaning of life? -god +none"
--- a/tests/data/org/main_readme.org
+++ b/tests/data/org/main_readme.org
@ -22,16 +22,16 @@
   #+end_src
 ** Use
-   - *Khoj via Emacs*
+*** *Khoj via Emacs*
     - [[https://github.com/khoj-ai/khoj/tree/master/interface/emacs#installation][Install]] [[./interface/emacs/khoj.el][khoj.el]]
     - Run ~M-x khoj <user-query>~ or Call ~C-c C-s~
-   - *Khoj via API*
+*** *Khoj via API*
     - Query: ~GET~ [[http://localhost:42110/api/search?q=%22what%20is%20the%20meaning%20of%20life%22][http://localhost:42110/api/search?q="What is the meaning of life"]]
     - Update Index: ~GET~ [[http://localhost:42110/api/update][http://localhost:42110/api/update]]
     - [[http://localhost:42110/docs][Khoj API Docs]]
-   - *Call Khoj via Python Script Directly*
+*** *Call Khoj via Python Script Directly*
     #+begin_src shell
     python3 search_types/asymmetric.py \
     --compressed-jsonl .notes.jsonl.gz \
--- a/tests/test_client.py
+++ b/tests/test_client.py
@ -321,7 +321,7 @@ def test_notes_search_with_include_filter(client, sample_org_data, default_user:
    assert response.status_code == 200
    # assert actual_data contains word "Emacs"
    search_result = response.json()[0]["entry"]
-    assert "Emacs" in search_result
+    assert "emacs" in search_result
 # ----------------------------------------------------------------------------------------------------
@ -347,6 +347,27 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
    assert "clone" not in search_result
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
 def test_notes_search_requires_parent_context(
    client, search_config: SearchConfig, sample_org_data, default_user: KhojUser
 ):
    "Ensure the single top result for an install query is the entry describing the Emacs load path"
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    # presumably the expected entry sits under nested installation headings in the sample org data,
    # so its own heading lacks most of the query terms; verify against the sample_org_data fixture
    user_query = quote("Install Khoj on Emacs")
    # Act
    # n=1 requests a single result; NOTE(review): the max_distance=0.18 threshold looks hand-tuned, confirm
    response = client.get(f"/api/search?q={user_query}&n=1&t=org&r=true&max_distance=0.18", headers=headers)
    # Assert
    assert response.status_code == 200
    assert len(response.json()) == 1, "Expected only 1 result"
    search_result = response.json()[0]["entry"]
    assert "Emacs load path" in search_result, "Expected 'Emacs load path' in search result"
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
 def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
--- a/tests/test_multiple_users.py
+++ b/tests/test_multiple_users.py
@ -69,7 +69,7 @@ def test_index_update_with_user2_inaccessible_user1(client, api_user2: KhojApiUs
    # Assert
    assert update_response.status_code == 200
-    assert len(results) == 4
+    assert len(results) == 5
    for result in results:
        assert result["additional"]["file"] not in source_file_symbol
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@ -45,9 +45,10 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
            assert is_none_or_empty(jsonl_data)
-def test_entry_split_when_exceeds_max_words(tmp_path):
+def test_entry_split_when_exceeds_max_words():
    "Ensure entries with compiled words exceeding max_words are split."
    # Arrange
    tmp_path = "/tmp/test.org"
    entry = f"""*** Heading
    \t\r
    Body Line
@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
    data = {
        f"{tmp_path}": entry,
    }
-    expected_heading = f"* {tmp_path.stem}\n** Heading"
+    expected_heading = f"* Path: {tmp_path}\n** Heading"
    # Act
    # Extract Entries from specified Org files
--- a/tests/test_orgnode.py
+++ b/tests/test_orgnode.py
@ -161,6 +161,8 @@ Body Line 1"""
    assert len(entries) == 1
    # parsed heading from entry
    assert entries[0].heading == "Heading[1]"
    # track ancestors of entry
    assert entries[0].ancestors == [f"{orgfile}"]
    # ensure SOURCE link has square brackets in filename, heading escaped in rendered entries
    escaped_orgfile = f"{orgfile}".replace("[1]", "\\[1\\]")
    assert f":SOURCE: [[file:{escaped_orgfile}::*Heading\\[1\\]" in f"{entries[0]}"
@ -260,6 +262,7 @@ Body Line 1"""
    assert entries[0].closed == ""
    assert entries[0].scheduled == ""
    assert entries[0].deadline == ""
    assert entries[0].ancestors == ["test"]
 # ----------------------------------------------------------------------------------------------------
@ -284,6 +287,7 @@ Body Line 1
    assert entries[0].closed == ""
    assert entries[0].scheduled == ""
    assert entries[0].deadline == ""
    assert entries[0].ancestors == ["title1 title2"]
 # ----------------------------------------------------------------------------------------------------
@ -304,8 +308,10 @@ entry body
    assert len(entries) == 2
    assert entries[0].heading == "Title"
    assert entries[0].body == "intro body\n"
    assert entries[0].ancestors == ["Title"]
    assert entries[1].heading == "Entry Heading"
    assert entries[1].body == "entry body\n\n"
    assert entries[1].ancestors == ["Title"]
 # ----------------------------------------------------------------------------------------------------
@ -326,8 +332,93 @@ entry body
    assert len(entries) == 2
    # first entry is built from the file titles and intro text; it has the combined title as its ancestor
    assert entries[0].heading == "Title1 Title2"
    assert entries[0].body == "intro body\n"
    assert entries[0].ancestors == ["Title1 Title2"]
    # second entry is the heading entry; its only ancestor is the combined file title
    assert entries[1].heading == "Entry Heading"
    assert entries[1].body == "entry body\n\n"
    assert entries[1].ancestors == ["Title1 Title2"]
 # ----------------------------------------------------------------------------------------------------
 def test_parse_org_with_single_ancestor_heading(tmp_path):
    "Parse org entries where a sub-heading has a single parent heading as context"
    # Arrange
    body = f"""
 * Heading 1
 body 1
 ** Sub Heading 1
 """
    orgfile = create_file(tmp_path, body)
    # Act
    entries = orgnode.makelist_with_filepath(orgfile)
    # Assert
    assert len(entries) == 2
    # top-level heading only has the org file as its ancestor
    assert entries[0].heading == "Heading 1"
    assert entries[0].ancestors == [f"{orgfile}"]
    # sub-heading has the org file, then its parent heading as ancestors
    assert entries[1].heading == "Sub Heading 1"
    assert entries[1].ancestors == [f"{orgfile}", "Heading 1"]
 # ----------------------------------------------------------------------------------------------------
 def test_parse_org_with_multiple_ancestor_headings(tmp_path):
    "Parse org entries where a nested heading has a chain of parent headings as context"
    # Arrange
    body = f"""
 * Heading 1
 body 1
 ** Sub Heading 1
 *** Sub Sub Heading 1
 sub sub body 1
 """
    orgfile = create_file(tmp_path, body)
    # Act
    entries = orgnode.makelist_with_filepath(orgfile)
    # Assert
    assert len(entries) == 3
    # top-level heading only has the org file as its ancestor
    assert entries[0].heading == "Heading 1"
    assert entries[0].ancestors == [f"{orgfile}"]
    # each deeper heading adds its parent heading to the end of the ancestors
    assert entries[1].heading == "Sub Heading 1"
    assert entries[1].ancestors == [f"{orgfile}", "Heading 1"]
    assert entries[2].heading == "Sub Sub Heading 1"
    assert entries[2].ancestors == [f"{orgfile}", "Heading 1", "Sub Heading 1"]
 # ----------------------------------------------------------------------------------------------------
 def test_parse_org_with_multiple_ancestor_headings_of_siblings(tmp_path):
    "Parse org entries where sibling headings share, and later branches replace, parent headings"
    # Arrange
    body = f"""
 * Heading 1
 body 1
 ** Sub Heading 1
 *** Sub Sub Heading 1
 sub sub body 1
 *** Sub Sub Heading 2
 ** Sub Heading 2
 *** Sub Sub Heading 3
 """
    orgfile = create_file(tmp_path, body)
    # Act
    entries = orgnode.makelist_with_filepath(orgfile)
    # Assert
    assert len(entries) == 6
    # top-level heading only has the org file as its ancestor
    assert entries[0].heading == "Heading 1"
    assert entries[0].ancestors == [f"{orgfile}"]
    assert entries[1].heading == "Sub Heading 1"
    assert entries[1].ancestors == [f"{orgfile}", "Heading 1"]
    # sibling headings at the same level share the same ancestors
    assert entries[2].heading == "Sub Sub Heading 1"
    assert entries[2].ancestors == [f"{orgfile}", "Heading 1", "Sub Heading 1"]
    assert entries[3].heading == "Sub Sub Heading 2"
    assert entries[3].ancestors == [f"{orgfile}", "Heading 1", "Sub Heading 1"]
    # moving back up a level drops the previous branch's parent heading
    assert entries[4].heading == "Sub Heading 2"
    assert entries[4].ancestors == [f"{orgfile}", "Heading 1"]
    # headings in the new branch get the new parent heading as ancestor
    assert entries[5].heading == "Sub Sub Heading 3"
    assert entries[5].ancestors == [f"{orgfile}", "Heading 1", "Sub Heading 2"]
 # Helper Functions
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@ -70,7 +70,7 @@ def test_text_search_setup_with_empty_file_creates_no_entries(
        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    # Assert
-    assert "Deleted 3 entries. Created 0 new entries for user " in caplog.records[-1].message
+    assert "Deleted 8 entries. Created 0 new entries for user " in caplog.records[-1].message
    verify_embeddings(0, default_user)
@ -90,7 +90,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
    # Assert
    assert "Deleting all entries for file type org" in caplog.text
-    assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message
+    assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message
 # ----------------------------------------------------------------------------------------------------
@ -106,7 +106,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    # Assert
-    assert "Deleted 3 entries. Created 10 new entries for user " in caplog.records[-1].message
+    assert "Deleted 8 entries. Created 13 new entries for user " in caplog.records[-1].message
 # ----------------------------------------------------------------------------------------------------
@ -161,7 +161,7 @@ async def test_text_search(search_config: SearchConfig):
        default_user,
    )
-    query = "How to git install application?"
+    query = "Load Khoj on Emacs?"
    # Act
    hits = await text_search.query(default_user, query)
@ -170,7 +170,7 @@ async def test_text_search(search_config: SearchConfig):
    # Assert
    search_result = results[0].entry
-    assert "git clone" in search_result, 'search result did not contain "git clone" entry'
+    assert "Emacs load path" in search_result, 'Expected "Emacs load path" in entry'
 # ----------------------------------------------------------------------------------------------------
@ -284,9 +284,9 @@ def test_regenerate_index_with_new_entry(
    final_logs = caplog.text
    # Assert
-    assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs
+    assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs
-    assert "Deleted 10 entries. Created 11 new entries for user " in final_logs
+    assert "Deleted 13 entries. Created 14 new entries for user " in final_logs
-    verify_embeddings(11, default_user)
+    verify_embeddings(14, default_user)
 # ----------------------------------------------------------------------------------------------------
@ -320,7 +320,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
    # Assert
    # verify only 1 entry added even if there are multiple duplicate entries
-    assert "Deleted 3 entries. Created 1 new entries for user " in initial_logs
+    assert "Deleted 8 entries. Created 1 new entries for user " in initial_logs
    assert "Deleted 0 entries. Created 0 new entries for user " in final_logs
    verify_embeddings(1, default_user)
@ -357,7 +357,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
    # Assert
    # verify only 1 entry added even if there are multiple duplicate entries
-    assert "Deleted 3 entries. Created 2 new entries for user " in initial_logs
+    assert "Deleted 8 entries. Created 2 new entries for user " in initial_logs
    assert "Deleted 1 entries. Created 0 new entries for user " in final_logs
    verify_embeddings(1, default_user)
@ -388,9 +388,9 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
    final_logs = caplog.text
    # Assert
-    assert "Deleted 3 entries. Created 10 new entries for user " in initial_logs
+    assert "Deleted 8 entries. Created 13 new entries for user " in initial_logs
    assert "Deleted 0 entries. Created 1 new entries for user " in final_logs
-    verify_embeddings(11, default_user)
+    verify_embeddings(14, default_user)
 # ----------------------------------------------------------------------------------------------------