Include filename of markdown entries for search indexing

Append the originating filename to the compiled string of each entry to
improve search quality by providing more context to the model

Update markdown_to_jsonl tests to ensure the filename is being added

Resolves #142
This commit is contained in:
Debanjum Singh Solanky 2023-03-30 12:30:25 +07:00
parent 67129964a7
commit a2ab68a7a2
2 changed files with 12 additions and 6 deletions

View file

@ -1,8 +1,9 @@
# Standard Packages # Standard Packages
import glob import glob
import re
import logging import logging
import re
import time import time
from pathlib import Path
from typing import List from typing import List
# Internal Packages # Internal Packages
@ -124,7 +125,10 @@ class MarkdownToJsonl(TextToJsonl):
"Convert each Markdown entries into a dictionary" "Convert each Markdown entries into a dictionary"
entries = [] entries = []
for parsed_entry in parsed_entries: for parsed_entry in parsed_entries:
entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{entry_to_file_map[parsed_entry]}")) entry_filename = Path(entry_to_file_map[parsed_entry])
# Append base filename to compiled entry for context to model
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")

View file

@ -1,5 +1,6 @@
# Standard Packages # Standard Packages
import json import json
from pathlib import Path
# Internal Packages # Internal Packages
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
@ -66,16 +67,17 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
# Act # Act
# Extract Entries from specified Markdown files # Extract Entries from specified Markdown files
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
# Process Each Entry from All Notes Files # Process Each Entry from All Notes Files
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
)
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
# Assert # Assert
assert len(jsonl_data) == 2 assert len(jsonl_data) == 2
# Ensure entry compiled strings include the markdown files they originate from
assert all([markdownfile.stem in entry.compiled for entry in entries])
def test_get_markdown_files(tmp_path): def test_get_markdown_files(tmp_path):