mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Set filename as top heading of md entries for better search context
Previously, the filename was appended to the end of the compiled entry. This didn't provide appropriately structured context. Test that the filename gets prepended as a heading to the compiled entry.
This commit is contained in:
parent
0e3fb59e09
commit
5de04621b5
2 changed files with 9 additions and 5 deletions
|
@ -129,8 +129,9 @@ class MarkdownToJsonl(TextToJsonl):
|
||||||
for parsed_entry in parsed_entries:
|
for parsed_entry in parsed_entries:
|
||||||
entry_filename = Path(entry_to_file_map[parsed_entry])
|
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||||
# Append base filename to compiled entry for context to model
|
# Append base filename to compiled entry for context to model
|
||||||
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
|
# Increment heading level for heading entries and make filename as its top level heading
|
||||||
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
|
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
|
||||||
|
compiled_entry = f"{prefix}{parsed_entry}"
|
||||||
|
|
||||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
||||||
- Bullet point 2
|
- Bullet point 2
|
||||||
"""
|
"""
|
||||||
markdownfile = create_file(tmp_path, entry)
|
markdownfile = create_file(tmp_path, entry)
|
||||||
|
expected_heading = "# " + markdownfile.stem
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
|
@ -27,8 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 1
|
assert len(jsonl_data) == 1
|
||||||
# Ensure entries with no headings do not get heading prefix prepended
|
# Ensure raw entry with no headings do not get heading prefix prepended
|
||||||
assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#")
|
assert not jsonl_data[0]["raw"].startswith("#")
|
||||||
|
# Ensure compiled entry has filename prepended as top level heading
|
||||||
|
assert jsonl_data[0]["compiled"].startswith(expected_heading)
|
||||||
|
|
||||||
|
|
||||||
def test_single_markdown_entry_to_jsonl(tmp_path):
|
def test_single_markdown_entry_to_jsonl(tmp_path):
|
||||||
|
@ -130,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
|
||||||
|
|
||||||
|
|
||||||
# Helper Functions
|
# Helper Functions
|
||||||
def create_file(tmp_path, entry=None, filename="test.md"):
|
def create_file(tmp_path: Path, entry=None, filename="test.md"):
|
||||||
markdown_file = tmp_path / filename
|
markdown_file = tmp_path / filename
|
||||||
markdown_file.touch()
|
markdown_file.touch()
|
||||||
if entry:
|
if entry:
|
||||||
|
|
Loading…
Reference in a new issue