Split current section once by heading to resolve org-mode indexing bug

- Split only once by the heading (first_non_empty_line) to extract the
  current section body. Otherwise child headings with the same prefix as the
  current heading also get split on, sending the section split into an
  infinite loop (see the short str.split sketch below)
- Also add a check to prevent getting into a recursive loop while trying
  to split an entry into sub-sections
Author: Debanjum Singh Solanky
Date:   2024-07-06 16:58:05 +05:30
Parent: 6a135b1ed7
Commit: 010486fb36
2 changed files with 39 additions and 3 deletions
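
For context, a minimal sketch in plain Python of the first fix (nothing from the khoj codebase is assumed; the org snippet is the one used by the new test below):

section = "** 1\n*** 1.1\n**** 1.1.2"
heading = "** 1"  # first non-empty line of the section

# Without a maxsplit, str.split also matches "** 1" inside the child headings
# "*** 1.1" and "**** 1.1.2", so the pieces come back mangled:
print(section.split(heading))     # ['', '\n*', '.1\n**', '.1.2']

# Splitting only once, as the fix does, keeps the child headings intact:
print(section.split(heading, 1))  # ['', '\n*** 1.1\n**** 1.1.2']

Joining everything after the first piece of the unbounded split yields "\n*\n.1\n**\n.1.2", i.e. a section body whose child headings have been destroyed.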


@@ -115,14 +115,20 @@ class OrgToEntries(TextToEntries):
             return entries, entry_to_file_map

         # Split this entry tree into sections by the next heading level in it
-        # Increment heading level until able to split entry into sections
+        # Increment heading level until able to split entry into sections or reach max heading level
         # A successful split will result in at least 2 sections
+        max_heading_level = 100
         next_heading_level = len(ancestry)
         sections: List[str] = []
-        while len(sections) < 2:
+        while len(sections) < 2 and next_heading_level < max_heading_level:
             next_heading_level += 1
             sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)

+        # If unable to split entry into sections, log error and skip indexing it
+        if next_heading_level == max_heading_level:
+            logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.")
+            return entries, entry_to_file_map
+
         # Recurse down each non-empty section after parsing its body, heading and ancestry
         for section in sections:
             # Skip empty sections
@@ -135,7 +141,7 @@ class OrgToEntries(TextToEntries):
             # If first non-empty line is a heading with expected heading level
             if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
                 # Extract the section body without the heading
-                current_section_body = "\n".join(section.split(first_non_empty_line)[1:])
+                current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
                 # Parse the section heading into current section ancestry
                 current_section_title = first_non_empty_line[next_heading_level:].strip()
                 current_ancestry[next_heading_level] = current_section_title
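
The max_heading_level guard added above matters because re.split always returns at least one element: on a chunk with no line matching the heading pattern, the old loop would increment next_heading_level forever. A standalone sketch of that failure mode, using only the standard library and a made-up chunk for illustration:

import re

# A chunk with no line that matches "[*]{n} .+" for any heading level n,
# e.g. a section body mangled by the prefix-splitting bug sketched earlier.
org_content = "*\n.1\n**\n.1.2"

max_heading_level = 100  # same cap as in the diff above
next_heading_level, sections = 0, []  # the real code starts at len(ancestry)
while len(sections) < 2 and next_heading_level < max_heading_level:
    next_heading_level += 1
    sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)

# Without the cap this loop would never exit; with it, the caller can log an
# error and skip indexing the chunk, as the new code above does.
print(next_heading_level, len(sections))  # prints: 100 1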


@@ -1,5 +1,6 @@
 import os
 import re
+import time

 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
@@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path):
     assert is_none_or_empty(entries[1])


+def test_extract_entries_when_child_headings_have_same_prefix():
+    """Extract org entries from entries having child headings with same prefix.
+    Prevents regressions like the one fixed in PR #840.
+    """
+    # Arrange
+    tmp_path = "tests/data/org/same_prefix_headings.org"
+    entry: str = """
+** 1
+*** 1.1
+**** 1.1.2
+""".strip()
+    data = {
+        f"{tmp_path}": entry,
+    }
+
+    # Act
+    # Extract Entries from specified Org files
+    start = time.time()
+    entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2)
+    end = time.time()
+    indexing_time = end - start
+
+    # Assert
+    explanation_msg = (
+        "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop."
+    )
+    assert indexing_time < 6 * len(entries), explanation_msg
+
+
 def test_entry_split_when_exceeds_max_tokens():
     "Ensure entries with compiled words exceeding max_tokens are split."
     # Arrange