From 02aeee60aaea3ce828be80e7a2732a2c0d28ecb8 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 3 May 2023 19:51:25 +0800 Subject: [PATCH] Set filename as top heading of org entries for better search context Previously filename was only being appended to markdown entries. Test filename getting prepended to compiled entry as heading --- src/khoj/processor/org_mode/org_to_jsonl.py | 9 +++++++-- tests/test_org_to_jsonl.py | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index ed3be1d0..e5ec7cc6 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -1,6 +1,7 @@ # Standard Packages import glob import logging +from pathlib import Path from typing import Iterable, List # Internal Packages @@ -112,7 +113,11 @@ class OrgToJsonl(TextToJsonl): # Ignore title notes i.e notes with just headings and empty body continue - compiled = f"{parsed_entry.heading}." + # Prepend filename as top heading to entry + filename = Path(entry_to_file_map[parsed_entry]).stem + heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}." + + compiled = heading if state.verbose > 2: logger.debug(f"Title: {parsed_entry.heading}") @@ -142,7 +147,7 @@ class OrgToJsonl(TextToJsonl): Entry( compiled=compiled, raw=f"{parsed_entry}", - heading=f"{parsed_entry.heading}", + heading=f"{heading}", file=f"{entry_to_file_map[parsed_entry]}", ) ) diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 15dd368a..171037c0 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): Body Line """ orgfile = create_file(tmp_path, entry) + expected_heading = f"* {orgfile.stem}\n** Heading" # Act # Extract Entries from specified Org files @@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Split each entry from specified Org files by max words jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl( TextToJsonl.split_entries_by_max_tokens( - OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2 + OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4 ) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -63,7 +64,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path): # Assert assert len(jsonl_data) == 2 # Ensure compiled entries split by max_words start with entry heading (for search context) - assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data) + assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data]) def test_entry_split_drops_large_words():