mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Include filename of markdown entries for search indexing
Append the originating filename to the compiled string of each entry for better search quality, by providing more context to the model. Update the markdown_to_jsonl tests to ensure the filename is being added. Resolves #142.
This commit is contained in:
parent
67129964a7
commit
a2ab68a7a2
2 changed files with 12 additions and 6 deletions
|
@ -1,8 +1,9 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import glob
|
import glob
|
||||||
import re
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
|
@ -124,7 +125,10 @@ class MarkdownToJsonl(TextToJsonl):
|
||||||
"Convert each Markdown entries into a dictionary"
|
"Convert each Markdown entries into a dictionary"
|
||||||
entries = []
|
entries = []
|
||||||
for parsed_entry in parsed_entries:
|
for parsed_entry in parsed_entries:
|
||||||
entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{entry_to_file_map[parsed_entry]}"))
|
entry_filename = Path(entry_to_file_map[parsed_entry])
|
||||||
|
# Append base filename to compiled entry for context to model
|
||||||
|
compiled_entry = f"{parsed_entry}\n{entry_filename.stem}"
|
||||||
|
entries.append(Entry(compiled=compiled_entry, raw=parsed_entry, file=f"{entry_filename}"))
|
||||||
|
|
||||||
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import json
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||||
|
@ -66,16 +67,17 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
|
||||||
|
|
||||||
# Act
|
# Act
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])
|
||||||
|
entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
|
jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
|
||||||
MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
|
|
||||||
)
|
|
||||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
assert len(jsonl_data) == 2
|
assert len(jsonl_data) == 2
|
||||||
|
# Ensure entry compiled strings include the markdown files they originate from
|
||||||
|
assert all([markdownfile.stem in entry.compiled for entry in entries])
|
||||||
|
|
||||||
|
|
||||||
def test_get_markdown_files(tmp_path):
|
def test_get_markdown_files(tmp_path):
|
||||||
|
|
Loading…
Reference in a new issue