From 7b4f78776c037d9fe4d8220720e9802378998713 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Tue, 17 Jan 2023 12:42:36 -0300
Subject: [PATCH] Fix extracting Markdown Entries with Top Level Headings

- Previously top level headings would get stripped of the
  space between heading text and the prefix # symbols. That is,
  `# Top Level Heading' would get converted to `#Top Level Heading'
- This would mess up their rendering as a heading in search results

- Add unit tests to text_to_jsonl processors to prevent regression
---
 src/processor/markdown/markdown_to_jsonl.py | 10 ++++++----
 tests/test_markdown_to_jsonl.py             | 19 +++++++++++++++++++
 tests/test_org_to_jsonl.py                  | 19 +++++++++++++++++++
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py
index 189de84e..773d6dfc 100644
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -98,10 +98,12 @@ class MarkdownToJsonl(TextToJsonl):
         for markdown_file in markdown_files:
             with open(markdown_file) as f:
                 markdown_content = f.read()
-                markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
-                for entry
-                in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
-                if entry.strip(empty_escape_sequences) != '']
+                markdown_entries_per_file = []
+                for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
+                    prefix = '#' if entry.startswith('#') else '# '
+                    if entry.strip(empty_escape_sequences) != '':
+                        markdown_entries_per_file.append(f'{prefix}{entry.strip(empty_escape_sequences)}')
+
                 entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
                 entries.extend(markdown_entries_per_file)
 
diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py
index c4c72688..e7cae1ae 100644
--- a/tests/test_markdown_to_jsonl.py
+++ b/tests/test_markdown_to_jsonl.py
@@ -103,6 +103,25 @@ def test_get_markdown_files(tmp_path):
     assert extracted_org_files == expected_files
 
 
+def test_extract_entries_with_different_level_headings(tmp_path):
+    "Extract markdown entries with different level headings."
+    # Arrange
+    entry = f'''
+# Heading 1
+## Heading 2
+'''
+    markdownfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Markdown files
+    entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
+
+    # Assert
+    assert len(entries) == 2
+    assert entries[0] == "# Heading 1"
+    assert entries[1] == "## Heading 2"
+
+
 # Helper Functions
 def create_file(tmp_path, entry=None, filename="test.md"):
     markdown_file = tmp_path / filename
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index 3f30b7fc..c1e816f4 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -154,6 +154,25 @@ def test_get_org_files(tmp_path):
     assert extracted_org_files == expected_files
 
 
+def test_extract_entries_with_different_level_headings(tmp_path):
+    "Extract org entries with different level headings."
+    # Arrange
+    entry = f'''
+* Heading 1
+** Heading 2
+'''
+    orgfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Org files
+    entries, _ = OrgToJsonl.extract_org_entries(org_files=[orgfile])
+
+    # Assert
+    assert len(entries) == 2
+    assert f'{entries[0]}'.startswith("* Heading 1")
+    assert f'{entries[1]}'.startswith("** Heading 2")
+
+
 # Helper Functions
 def create_file(tmp_path, entry=None, filename="test.org"):
     org_file = tmp_path / filename