From 010486fb3617349a0f29eadaa0077c1b3d5ffbef Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 16:58:05 +0530 Subject: [PATCH] Split current section once by heading to resolve org-mode indexing bug - Split once by heading (=first_non_empty) to extract current section body Otherwise child headings with same prefix as current heading will cause the section split to go into infinite loop - Also add check to prevent getting into recursive loop while trying to split entry into sub sections --- .../content/org_mode/org_to_entries.py | 12 ++++++-- tests/test_org_to_entries.py | 30 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index af85a6bd..c528244d 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -115,14 +115,20 @@ class OrgToEntries(TextToEntries): return entries, entry_to_file_map # Split this entry tree into sections by the next heading level in it - # Increment heading level until able to split entry into sections + # Increment heading level until able to split entry into sections or reach max heading level # A successful split will result in at least 2 sections + max_heading_level = 100 next_heading_level = len(ancestry) sections: List[str] = [] - while len(sections) < 2: + while len(sections) < 2 and next_heading_level < max_heading_level: next_heading_level += 1 sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE) + # If unable to split entry into sections, log error and skip indexing it + if next_heading_level == max_heading_level: + logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.") + return entries, entry_to_file_map + # Recurse down each non-empty section after parsing its body, heading and ancestry for section in sections: # Skip empty sections @@ -135,7 +141,7 @@ class OrgToEntries(TextToEntries): # If first non-empty line is a heading with expected heading level if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line): # Extract the section body without the heading - current_section_body = "\n".join(section.split(first_non_empty_line)[1:]) + current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:]) # Parse the section heading into current section ancestry current_section_title = first_non_empty_line[next_heading_level:].strip() current_ancestry[next_heading_level] = current_section_title diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index e8940269..a84fe6e8 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -1,5 +1,6 @@ import os import re +import time from khoj.processor.content.org_mode.org_to_entries import OrgToEntries from khoj.processor.content.text_to_entries import TextToEntries @@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path): assert is_none_or_empty(entries[1]) +def test_extract_entries_when_child_headings_have_same_prefix(): + """Extract org entries from entries having child headings with same prefix. + Prevents regressions like the one fixed in PR #840. + """ + # Arrange + tmp_path = "tests/data/org/same_prefix_headings.org" + entry: str = """ +** 1 +*** 1.1 +**** 1.1.2 +""".strip() + data = { + f"{tmp_path}": entry, + } + + # Act + # Extract Entries from specified Org files + start = time.time() + entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2) + end = time.time() + indexing_time = end - start + + # Assert + explanation_msg = ( + "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop." + ) + assert indexing_time < 6 * len(entries), explanation_msg + + def test_entry_split_when_exceeds_max_tokens(): "Ensure entries with compiled words exceeding max_tokens are split." # Arrange