Entries with no markdown headings should not get a heading prefix prepended

Previously, entries from files with no headings would be prefixed
with a markdown heading prefix (#).
This commit is contained in:
Debanjum Singh Solanky 2023-05-03 18:18:48 +08:00
parent 45a991d75c
commit 0e3fb59e09
2 changed files with 8 additions and 4 deletions

View file

@ -2,7 +2,6 @@
import glob
import logging
import re
import time
from pathlib import Path
from typing import List
@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
with open(markdown_file, "r", encoding="utf8") as f:
markdown_content = f.read()
markdown_entries_per_file = []
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
prefix = "#" if entry.startswith("#") else "# "
if entry.strip(empty_escape_sequences) != "":
markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
# Add heading level as the regex split removed it from entries with headings
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
stripped_entry = entry.strip(empty_escape_sequences)
if stripped_entry != "":
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
entries.extend(markdown_entries_per_file)

View file

@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
# Assert
assert len(jsonl_data) == 1
# Ensure entries with no headings do not get heading prefix prepended
assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
def test_single_markdown_entry_to_jsonl(tmp_path):