mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-30 19:03:01 +01:00
Set heading of md entries to improve search context for long entries
Otherwise, if a markdown entry is longer than max_tokens, the split entries (apart from the first one) do not get their heading context set.
This commit is contained in:
parent
5de04621b5
commit
94825a70b9
1 changed file with 9 additions and 0 deletions
|
@ -128,10 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
|
||||||
entries = []
|
entries = []
|
||||||
for parsed_entry in parsed_entries:
|
for parsed_entry in parsed_entries:
|
||||||
entry_filename = Path(entry_to_file_map[parsed_entry])
|
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||||
|
heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
|
||||||
# Append base filename to compiled entry for context to model
|
# Append base filename to compiled entry for context to model
|
||||||
# Increment heading level for heading entries and make filename as its top level heading
|
# Increment heading level for heading entries and make filename as its top level heading
|
||||||
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
|
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
|
||||||
compiled_entry = f"{prefix}{parsed_entry}"
|
compiled_entry = f"{prefix}{parsed_entry}"
|
||||||
|
entries.append(
|
||||||
|
Entry(
|
||||||
|
compiled=compiled_entry,
|
||||||
|
raw=parsed_entry,
|
||||||
|
heading=f"{prefix}{heading}",
|
||||||
|
file=f"{entry_filename}",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue