mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Split current section once by heading to resolve org-mode indexing bug
- Split once by heading (=first_non_empty) to extract current section body Otherwise child headings with same prefix as current heading will cause the section split to go into infinite loop - Also add check to prevent getting into recursive loop while trying to split entry into sub sections
This commit is contained in:
parent
6a135b1ed7
commit
010486fb36
2 changed files with 39 additions and 3 deletions
|
@ -115,14 +115,20 @@ class OrgToEntries(TextToEntries):
|
||||||
return entries, entry_to_file_map
|
return entries, entry_to_file_map
|
||||||
|
|
||||||
# Split this entry tree into sections by the next heading level in it
|
# Split this entry tree into sections by the next heading level in it
|
||||||
# Increment heading level until able to split entry into sections
|
# Increment heading level until able to split entry into sections or reach max heading level
|
||||||
# A successful split will result in at least 2 sections
|
# A successful split will result in at least 2 sections
|
||||||
|
max_heading_level = 100
|
||||||
next_heading_level = len(ancestry)
|
next_heading_level = len(ancestry)
|
||||||
sections: List[str] = []
|
sections: List[str] = []
|
||||||
while len(sections) < 2:
|
while len(sections) < 2 and next_heading_level < max_heading_level:
|
||||||
next_heading_level += 1
|
next_heading_level += 1
|
||||||
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
|
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
|
||||||
|
|
||||||
|
# If unable to split entry into sections, log error and skip indexing it
|
||||||
|
if next_heading_level == max_heading_level:
|
||||||
|
logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.")
|
||||||
|
return entries, entry_to_file_map
|
||||||
|
|
||||||
# Recurse down each non-empty section after parsing its body, heading and ancestry
|
# Recurse down each non-empty section after parsing its body, heading and ancestry
|
||||||
for section in sections:
|
for section in sections:
|
||||||
# Skip empty sections
|
# Skip empty sections
|
||||||
|
@ -135,7 +141,7 @@ class OrgToEntries(TextToEntries):
|
||||||
# If first non-empty line is a heading with expected heading level
|
# If first non-empty line is a heading with expected heading level
|
||||||
if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
|
if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
|
||||||
# Extract the section body without the heading
|
# Extract the section body without the heading
|
||||||
current_section_body = "\n".join(section.split(first_non_empty_line)[1:])
|
current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
|
||||||
# Parse the section heading into current section ancestry
|
# Parse the section heading into current section ancestry
|
||||||
current_section_title = first_non_empty_line[next_heading_level:].strip()
|
current_section_title = first_non_empty_line[next_heading_level:].strip()
|
||||||
current_ancestry[next_heading_level] = current_section_title
|
current_ancestry[next_heading_level] = current_section_title
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
|
||||||
from khoj.processor.content.text_to_entries import TextToEntries
|
from khoj.processor.content.text_to_entries import TextToEntries
|
||||||
|
@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path):
|
||||||
assert is_none_or_empty(entries[1])
|
assert is_none_or_empty(entries[1])
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_entries_when_child_headings_have_same_prefix():
|
||||||
|
"""Extract org entries from entries having child headings with same prefix.
|
||||||
|
Prevents regressions like the one fixed in PR #840.
|
||||||
|
"""
|
||||||
|
# Arrange
|
||||||
|
tmp_path = "tests/data/org/same_prefix_headings.org"
|
||||||
|
entry: str = """
|
||||||
|
** 1
|
||||||
|
*** 1.1
|
||||||
|
**** 1.1.2
|
||||||
|
""".strip()
|
||||||
|
data = {
|
||||||
|
f"{tmp_path}": entry,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# Extract Entries from specified Org files
|
||||||
|
start = time.time()
|
||||||
|
entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2)
|
||||||
|
end = time.time()
|
||||||
|
indexing_time = end - start
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
explanation_msg = (
|
||||||
|
"It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop."
|
||||||
|
)
|
||||||
|
assert indexing_time < 6 * len(entries), explanation_msg
|
||||||
|
|
||||||
|
|
||||||
def test_entry_split_when_exceeds_max_tokens():
|
def test_entry_split_when_exceeds_max_tokens():
|
||||||
"Ensure entries with compiled words exceeding max_tokens are split."
|
"Ensure entries with compiled words exceeding max_tokens are split."
|
||||||
# Arrange
|
# Arrange
|
||||||
|
|
Loading…
Reference in a new issue