From 7b4f78776c037d9fe4d8220720e9802378998713 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Jan 2023 12:42:36 -0300 Subject: [PATCH] Fix extracting Markdown Entries with Top Level Headings - Previously top level headings would get stripped of the space between heading text and the prefix # symbols. That is, `# Top Level Heading' would get converted to `#Top Level Heading' - This would mess up their rendering as a heading in search results - Add unit tests to text_to_jsonl processors to prevent regression --- src/processor/markdown/markdown_to_jsonl.py | 10 ++++++---- tests/test_markdown_to_jsonl.py | 19 +++++++++++++++++++ tests/test_org_to_jsonl.py | 19 +++++++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 189de84e..773d6dfc 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -98,10 +98,12 @@ class MarkdownToJsonl(TextToJsonl): for markdown_file in markdown_files: with open(markdown_file) as f: markdown_content = f.read() - markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}' - for entry - in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE) - if entry.strip(empty_escape_sequences) != ''] + markdown_entries_per_file = [] + for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE): + prefix = '#' if entry.startswith('#') else '# ' + if entry.strip(empty_escape_sequences) != '': + markdown_entries_per_file.append(f'{prefix}{entry.strip(empty_escape_sequences)}') + entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index c4c72688..e7cae1ae 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -103,6 
+103,25 @@ def test_get_markdown_files(tmp_path): assert extracted_org_files == expected_files +def test_extract_entries_with_different_level_headings(tmp_path): + "Extract markdown entries with different level headings." + # Arrange + entry = f''' +# Heading 1 +## Heading 2 +''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) + + # Assert + assert len(entries) == 2 + assert entries[0] == "# Heading 1" + assert entries[1] == "## Heading 2" + + # Helper Functions def create_file(tmp_path, entry=None, filename="test.md"): markdown_file = tmp_path / filename diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 3f30b7fc..c1e816f4 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -154,6 +154,25 @@ def test_get_org_files(tmp_path): assert extracted_org_files == expected_files +def test_extract_entries_with_different_level_headings(tmp_path): + "Extract org entries with different level headings." + # Arrange + entry = f''' +* Heading 1 +** Heading 2 +''' + orgfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Org files + entries, _ = OrgToJsonl.extract_org_entries(org_files=[orgfile]) + + # Assert + assert len(entries) == 2 + assert f'{entries[0]}'.startswith("* Heading 1") + assert f'{entries[1]}'.startswith("** Heading 2") + + # Helper Functions def create_file(tmp_path, entry=None, filename="test.org"): org_file = tmp_path / filename