Set filename as top heading of md entries for better search context

Previously the filename was appended to the end of the compiled entry.
This didn't provide appropriately structured context.

Test that the filename gets prepended as a heading to the compiled entry.
This commit is contained in:
Debanjum Singh Solanky 2023-05-03 18:55:56 +08:00
parent 0e3fb59e09
commit 5de04621b5
2 changed files with 9 additions and 5 deletions

View file

@ -129,8 +129,9 @@ class MarkdownToJsonl(TextToJsonl):
for parsed_entry in parsed_entries: for parsed_entry in parsed_entries:
entry_filename = Path(entry_to_file_map[parsed_entry]) entry_filename = Path(entry_to_file_map[parsed_entry])
# Append base filename to compiled entry for context to model # Append base filename to compiled entry for context to model
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" # Increment heading level for heading entries and make filename as its top level heading
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
compiled_entry = f"{prefix}{parsed_entry}"
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")

View file

@ -14,6 +14,7 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
- Bullet point 2 - Bullet point 2
""" """
markdownfile = create_file(tmp_path, entry) markdownfile = create_file(tmp_path, entry)
expected_heading = "# " + markdownfile.stem
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
@ -27,8 +28,10 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
# Assert # Assert
assert len(jsonl_data) == 1 assert len(jsonl_data) == 1
# Ensure entries with no headings do not get heading prefix prepended # Ensure raw entry with no headings do not get heading prefix prepended
assert not jsonl_data[0]["compiled"].startswith("#") and not jsonl_data[0]["raw"].startswith("#") assert not jsonl_data[0]["raw"].startswith("#")
# Ensure compiled entry has filename prepended as top level heading
assert jsonl_data[0]["compiled"].startswith(expected_heading)
def test_single_markdown_entry_to_jsonl(tmp_path): def test_single_markdown_entry_to_jsonl(tmp_path):
@ -130,7 +133,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
# Helper Functions # Helper Functions
def create_file(tmp_path, entry=None, filename="test.md"): def create_file(tmp_path: Path, entry=None, filename="test.md"):
markdown_file = tmp_path / filename markdown_file = tmp_path / filename
markdown_file.touch() markdown_file.touch()
if entry: if entry: