mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 09:25:06 +01:00
Set filename as top heading of org entries for better search context
Previously, the filename was only being added to markdown entries. Also test that the filename gets prepended to the compiled entry as a heading.
This commit is contained in:
parent
94825a70b9
commit
02aeee60aa
2 changed files with 10 additions and 4 deletions
|
@ -1,6 +1,7 @@
|
|||
# Standard Packages
|
||||
import glob
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List
|
||||
|
||||
# Internal Packages
|
||||
|
@ -112,7 +113,11 @@ class OrgToJsonl(TextToJsonl):
|
|||
# Ignore title notes i.e notes with just headings and empty body
|
||||
continue
|
||||
|
||||
compiled = f"{parsed_entry.heading}."
|
||||
# Prepend filename as top heading to entry
|
||||
filename = Path(entry_to_file_map[parsed_entry]).stem
|
||||
heading = f"* {filename}\n** {parsed_entry.heading}." if parsed_entry.heading else f"* {filename}."
|
||||
|
||||
compiled = heading
|
||||
if state.verbose > 2:
|
||||
logger.debug(f"Title: {parsed_entry.heading}")
|
||||
|
||||
|
@ -142,7 +147,7 @@ class OrgToJsonl(TextToJsonl):
|
|||
Entry(
|
||||
compiled=compiled,
|
||||
raw=f"{parsed_entry}",
|
||||
heading=f"{parsed_entry.heading}",
|
||||
heading=f"{heading}",
|
||||
file=f"{entry_to_file_map[parsed_entry]}",
|
||||
)
|
||||
)
|
||||
|
|
|
@ -47,6 +47,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||
Body Line
|
||||
"""
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
expected_heading = f"* {orgfile.stem}\n** Heading"
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
|
@ -55,7 +56,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||
# Split each entry from specified Org files by max words
|
||||
jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
|
||||
TextToJsonl.split_entries_by_max_tokens(
|
||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=2
|
||||
OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
|
||||
)
|
||||
)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
@ -63,7 +64,7 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
|
|||
# Assert
|
||||
assert len(jsonl_data) == 2
|
||||
# Ensure compiled entries split by max_words start with entry heading (for search context)
|
||||
assert all(entry["compiled"].startswith("Heading") for entry in jsonl_data)
|
||||
assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data])
|
||||
|
||||
|
||||
def test_entry_split_drops_large_words():
|
||||
|
|
Loading…
Reference in a new issue