From 5de04621b5f038be59f869ddecd7dd9fce3fd89c Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 18:55:56 +0800 Subject: [PATCH] Set filename as top heading of md entries for better search context Previously, the filename was appended to the end of the compiled entry. This didn't provide appropriately structured context. Test that the filename gets prepended as a heading to the compiled entry. --- src/khoj/processor/markdown/markdown_to_jsonl.py | 5 +++-- tests/test_markdown_to_jsonl.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 9e08ae89..20cf9b2c 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -129,8 +129,9 @@ class MarkdownToJsonl(TextToJsonl): for parsed_entry in parsed_entries: entry_filename = Path(entry_to_file_map[parsed_entry]) # Append base filename to compiled entry for context to model - compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" - entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) + # Increment heading level for heading entries and make filename as its top level heading + prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" + compiled_entry = f"{prefix}{parsed_entry}" logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index ca22f359..87a1a07e 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): - Bullet point 2 """ markdownfile = create_file(tmp_path, entry) + expected_heading = "# " + markdownfile.stem # Act # Extract Entries from specified Markdown files @@ -27,8 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Assert
assert len(jsonl_data) == 1 - # Ensure entries with no headings do not get heading prefix prepended - assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#") + # Ensure raw entry with no headings do not get heading prefix prepended + assert not jsonl_data[0]["raw"].startswith("#") + # Ensure compiled entry has filename prepended as top level heading + assert jsonl_data[0]["compiled"].startswith(expected_heading) def test_single_markdown_entry_to_jsonl(tmp_path): @@ -130,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path): # Helper Functions -def create_file(tmp_path, entry=None, filename="test.md"): +def create_file(tmp_path: Path, entry=None, filename="test.md"): markdown_file = tmp_path / filename markdown_file.touch() if entry: