mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-30 10:53:02 +01:00
Fix extracting Markdown Entries with Top Level Headings
- Previously, top-level headings would get stripped of the space between the heading text and the prefix # symbols. That is, `# Top Level Heading' would get converted to `#Top Level Heading' - This would mess up their rendering as a heading in search results - Add unit tests to text_to_jsonl processors to prevent regression
This commit is contained in:
parent
1a296518c5
commit
7b4f78776c
3 changed files with 44 additions and 4 deletions
|
@ -98,10 +98,12 @@ class MarkdownToJsonl(TextToJsonl):
|
||||||
for markdown_file in markdown_files:
|
for markdown_file in markdown_files:
|
||||||
with open(markdown_file) as f:
|
with open(markdown_file) as f:
|
||||||
markdown_content = f.read()
|
markdown_content = f.read()
|
||||||
markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
|
markdown_entries_per_file = []
|
||||||
for entry
|
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
||||||
in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
prefix = '#' if entry.startswith('#') else '# '
|
||||||
if entry.strip(empty_escape_sequences) != '']
|
if entry.strip(empty_escape_sequences) != '':
|
||||||
|
markdown_entries_per_file.append(f'{prefix}{entry.strip(empty_escape_sequences)}')
|
||||||
|
|
||||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
|
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
|
||||||
entries.extend(markdown_entries_per_file)
|
entries.extend(markdown_entries_per_file)
|
||||||
|
|
||||||
|
|
|
@ -103,6 +103,25 @@ def test_get_markdown_files(tmp_path):
|
||||||
assert extracted_org_files == expected_files
|
assert extracted_org_files == expected_files
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_entries_with_different_level_headings(tmp_path):
|
||||||
|
"Extract markdown entries with different level headings."
|
||||||
|
# Arrange
|
||||||
|
entry = f'''
|
||||||
|
# Heading 1
|
||||||
|
## Heading 2
|
||||||
|
'''
|
||||||
|
markdownfile = create_file(tmp_path, entry)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# Extract Entries from specified Markdown files
|
||||||
|
entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(entries) == 2
|
||||||
|
assert entries[0] == "# Heading 1"
|
||||||
|
assert entries[1] == "## Heading 2"
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path, entry=None, filename="test.md"):
|
def create_file(tmp_path, entry=None, filename="test.md"):
|
||||||
markdown_file = tmp_path / filename
|
markdown_file = tmp_path / filename
|
||||||
|
|
|
@ -154,6 +154,25 @@ def test_get_org_files(tmp_path):
|
||||||
assert extracted_org_files == expected_files
|
assert extracted_org_files == expected_files
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_entries_with_different_level_headings(tmp_path):
|
||||||
|
"Extract org entries with different level headings."
|
||||||
|
# Arrange
|
||||||
|
entry = f'''
|
||||||
|
* Heading 1
|
||||||
|
** Heading 2
|
||||||
|
'''
|
||||||
|
orgfile = create_file(tmp_path, entry)
|
||||||
|
|
||||||
|
# Act
|
||||||
|
# Extract Entries from specified Org files
|
||||||
|
entries, _ = OrgToJsonl.extract_org_entries(org_files=[orgfile])
|
||||||
|
|
||||||
|
# Assert
|
||||||
|
assert len(entries) == 2
|
||||||
|
assert f'{entries[0]}'.startswith("* Heading 1")
|
||||||
|
assert f'{entries[1]}'.startswith("** Heading 2")
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path, entry=None, filename="test.org"):
|
def create_file(tmp_path, entry=None, filename="test.org"):
|
||||||
org_file = tmp_path / filename
|
org_file = tmp_path / filename
|
||||||
|
|
Loading…
Reference in a new issue