mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Entries with no markdown headings should not get a heading prefix prepended
Previously, the entry from a file with no headings would be prefixed with a markdown heading prefix (#)
This commit is contained in:
parent
45a991d75c
commit
0e3fb59e09
2 changed files with 8 additions and 4 deletions
|
@ -2,7 +2,6 @@
|
|||
import glob
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
|
@ -110,10 +109,13 @@ class MarkdownToJsonl(TextToJsonl):
|
|||
with open(markdown_file, "r", encoding="utf8") as f:
|
||||
markdown_content = f.read()
|
||||
markdown_entries_per_file = []
|
||||
any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
|
||||
for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
|
||||
prefix = "#" if entry.startswith("#") else "# "
|
||||
if entry.strip(empty_escape_sequences) != "":
|
||||
markdown_entries_per_file.append(f"{prefix}{entry.strip(empty_escape_sequences)}")
|
||||
# Add heading level as the regex split removed it from entries with headings
|
||||
prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
|
||||
stripped_entry = entry.strip(empty_escape_sequences)
|
||||
if stripped_entry != "":
|
||||
markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
|
||||
|
||||
entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
|
||||
entries.extend(markdown_entries_per_file)
|
||||
|
|
|
@ -27,6 +27,8 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
|||
|
||||
# Assert
|
||||
assert len(jsonl_data) == 1
|
||||
# Ensure entries with no headings do not get heading prefix prepended
|
||||
assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
|
||||
|
||||
|
||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||
|
|
Loading…
Add table
Reference in a new issue