mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Set heading of md entries to improve search context for long entries
Otherwise, if a markdown entry is longer than max_tokens, the split entries (apart from the first one) do not get their heading context set.
This commit is contained in:
parent
5de04621b5
commit
94825a70b9
1 changed file with 9 additions and 0 deletions
|
@@ -128,10 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
|
|||
entries = []
|
||||
for parsed_entry in parsed_entries:
|
||||
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||
heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
|
||||
# Append base filename to compiled entry for context to model
|
||||
# Increment heading level for heading entries and make filename as its top level heading
|
||||
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
|
||||
compiled_entry = f"{prefix}{parsed_entry}"
|
||||
entries.append(
|
||||
Entry(
|
||||
compiled=compiled_entry,
|
||||
raw=parsed_entry,
|
||||
heading=f"{prefix}{heading}",
|
||||
file=f"{entry_filename}",
|
||||
)
|
||||
)
|
||||
|
||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||
|
||||
|
|
Loading…
Reference in a new issue