Set heading of md entries to improve search context for long entries

Otherwise, if a markdown entry is longer than max_tokens, the split
entries (apart from the first one) do not get their heading context set.
This commit is contained in:
Debanjum Singh Solanky 2023-05-03 18:58:37 +08:00
parent 5de04621b5
commit 94825a70b9

View file

@ -128,10 +128,19 @@ class MarkdownToJsonl(TextToJsonl):
entries = [] entries = []
for parsed_entry in parsed_entries: for parsed_entry in parsed_entries:
entry_filename = Path(entry_to_file_map[parsed_entry]) entry_filename = Path(entry_to_file_map[parsed_entry])
heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
# Append base filename to compiled entry for context to model # Append base filename to compiled entry for context to model
# Increment heading level for heading entries and make filename as its top level heading # Increment heading level for heading entries and make filename as its top level heading
prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n" prefix = f"# {entry_filename.stem}\n#" if heading else f"# {entry_filename.stem}\n"
compiled_entry = f"{prefix}{parsed_entry}" compiled_entry = f"{prefix}{parsed_entry}"
entries.append(
Entry(
compiled=compiled_entry,
raw=parsed_entry,
heading=f"{prefix}{heading}",
file=f"{entry_filename}",
)
)
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")