diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 9e08ae89..20cf9b2c 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -129,8 +129,9 @@ class MarkdownToJsonl(TextToJsonl): for parsed_entry in parsed_entries: entry_filename = Path(entry_to_file_map[parsed_entry]) # Append base filename to compiled entry for context to model - compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" - entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) + # Increment heading level for heading entries and make filename as its top level heading + prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" + compiled_entry = f"{prefix}{parsed_entry}" logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index ca22f359..87a1a07e 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): - Bullet point 2 """ markdownfile = create_file(tmp_path, entry) + expected_heading = "# " + markdownfile.stem # Act # Extract Entries from specified Markdown files @@ -27,8 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Assert assert len(jsonl_data) == 1 - # Ensure entries with no headings do not get heading prefix prepended - assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#") + # Ensure raw entry with no headings do not get heading prefix prepended + assert not jsonl_data[0]["raw"].startswith("#") + # Ensure compiled entry has filename prepended as top level heading + assert jsonl_data[0]["compiled"].startswith(expected_heading) def test_single_markdown_entry_to_jsonl(tmp_path): @@ -130,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Helper Functions -def create_file(tmp_path, entry=None, filename="test.md"): +def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename markdown_file.touch() if entry: