Set filename as top heading of org entries for better search context

Previously, the filename was only being prepended to markdown entries.

Test that the filename gets prepended to the compiled entry as a heading.
This commit is contained in:
Debanjum Singh Solanky 2023-05-03 19:51:25 +08:00
parent 94825a70b9
commit 02aeee60aa
2 changed files with 10 additions and 4 deletions

View file

@ -1,6 +1,7 @@
# Standard Packages
import glob
import logging
from pathlib import Path
from typing import Iterable, List
# Internal Packages
@ -112,7 +113,11 @@ class OrgToJsonl(TextToJsonl):
# Ignore title notes i.e notes with just headings and empty body
continue
compiled = f"{parsed_entry.heading}."
# Prepend filename as top heading to entry
filename = Path(entry_to_file_map[parsed_entry]).stem
heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
compiled = heading
if state.verbose > 2:
logger.debug(f"Title: {parsed_entry.heading}")
@ -142,7 +147,7 @@ class OrgToJsonl(TextToJsonl):
Entry(
compiled=compiled,
raw=f"{parsed_entry}",
heading=f"{parsed_entry.heading}",
heading=f"{heading}",
file=f"{entry_to_file_map[parsed_entry]}",
)
)

View file

@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
Body Line
"""
orgfile = create_file(tmp_path, entry)
expected_heading = f"* {orgfile.stem}\n** Heading"
# Act
# Extract Entries from specified Org files
@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Split each entry from specified Org files by max words
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
TextToJsonl.split_entries_by_max_tokens(
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@ -63,7 +64,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
# Assert
assert len(jsonl_data) == 2
# Ensure compiled entries split by max_words start with entry heading (for search context)
assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data)
assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])
def test_entry_split_drops_large_words():