From a2ab68a7a2f0d7e0397051a54b5fd67a4052d907 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 30 Mar 2023 12:30:25 +0700 Subject: [PATCH] Include filename of markdown entries for search indexing Append the originating filename to the compiled string of each entry for better search quality by providing more context to the model Update markdown_to_jsonl tests to ensure the filename is added Resolves #142 --- src/khoj/processor/markdown/markdown_to_jsonl.py | 8 ++++++-- tests/test_markdown_to_jsonl.py | 10 ++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index 045eea65..a1e4d0c1 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -1,8 +1,9 @@ # Standard Packages import glob -import re import logging +import re import time +from pathlib import Path from typing import List # Internal Packages @@ -124,7 +125,10 @@ class MarkdownToJsonl(TextToJsonl): "Convert each Markdown entries into a dictionary" entries = [] for parsed_entry in parsed_entries: - entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{entry_to_file_map[parsed_entry]}")) + entry_filename = Path(entry_to_file_map[parsed_entry]) + # Append base filename to compiled entry for context to model + compiled_entry = f"{parsed_entry}\n{entry_filename.stem}" + entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}")) logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index 16f19ab1..dfb42fed 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -1,5 +1,6 @@ # Standard Packages import json +from pathlib import Path # Internal Packages from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl @@ -66,16 +67,17 @@ def 
test_multiple_markdown_entries_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) + entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) + entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( - MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map) - ) + jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert assert len(jsonl_data) == 2 + # Ensure entry compiled strings include the markdown files they originate from + assert all([markdownfile.stem in entry.compiled for entry in entries]) def test_get_markdown_files(tmp_path):