From 010486fb3617349a0f29eadaa0077c1b3d5ffbef Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 6 Jul 2024 16:58:05 +0530
Subject: [PATCH] Split current section once by heading to resolve org-mode
 indexing bug

- Split once by heading (=first_non_empty) to extract current section body
  Otherwise child headings with same prefix as current heading will
  cause the section split to go into infinite loop
- Also add check to prevent getting into recursive loop while trying
  to split entry into sub sections
---
 .../content/org_mode/org_to_entries.py        | 12 ++++++--
 tests/test_org_to_entries.py                  | 30 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py
index af85a6bd..c528244d 100644
--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -115,14 +115,20 @@ class OrgToEntries(TextToEntries):
             return entries, entry_to_file_map
 
         # Split this entry tree into sections by the next heading level in it
-        # Increment heading level until able to split entry into sections
+        # Increment heading level until able to split entry into sections or reach max heading level
         # A successful split will result in at least 2 sections
+        max_heading_level = 100
         next_heading_level = len(ancestry)
         sections: List[str] = []
-        while len(sections) < 2:
+        while len(sections) < 2 and next_heading_level < max_heading_level:
             next_heading_level += 1
             sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
 
+        # If unable to split entry into sections, log error and skip indexing it
+        if next_heading_level == max_heading_level:
+            logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.")
+            return entries, entry_to_file_map
+
         # Recurse down each non-empty section after parsing its body, heading and ancestry
         for section in sections:
             # Skip empty sections
@@ -135,7 +141,7 @@ class OrgToEntries(TextToEntries):
             # If first non-empty line is a heading with expected heading level
             if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
                 # Extract the section body without the heading
-                current_section_body = "\n".join(section.split(first_non_empty_line)[1:])
+                current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
                 # Parse the section heading into current section ancestry
                 current_section_title = first_non_empty_line[next_heading_level:].strip()
                 current_ancestry[next_heading_level] = current_section_title
diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py
index e8940269..a84fe6e8 100644
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -1,5 +1,6 @@
 import os
 import re
+import time
 
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
@@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path):
             assert is_none_or_empty(entries[1])
 
 
+def test_extract_entries_when_child_headings_have_same_prefix():
+    """Extract org entries from entries having child headings with same prefix.
+    Prevents regressions like the one fixed in PR #840.
+    """
+    # Arrange
+    tmp_path = "tests/data/org/same_prefix_headings.org"
+    entry: str = """
+** 1
+*** 1.1
+**** 1.1.2
+""".strip()
+    data = {
+        f"{tmp_path}": entry,
+    }
+
+    # Act
+    # Extract Entries from specified Org files
+    start = time.time()
+    entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2)
+    end = time.time()
+    indexing_time = end - start
+
+    # Assert
+    explanation_msg = (
+        "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop."
+    )
+    assert indexing_time < 6 * len(entries), explanation_msg
+
+
 def test_entry_split_when_exceeds_max_tokens():
     "Ensure entries with compiled words exceeding max_tokens are split."
     # Arrange