mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-30 19:03:01 +01:00
Support incremental update of Markdown entries, embeddings
This commit is contained in:
parent
91aac83c6a
commit
030fab9bb2
1 changed files with 26 additions and 13 deletions
|
@ -9,7 +9,7 @@ import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
|
||||||
from src.utils.constants import empty_escape_sequences
|
from src.utils.constants import empty_escape_sequences
|
||||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Define Functions
|
# Define Functions
|
||||||
def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
|
def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None):
|
||||||
# Input Validation
|
# Input Validation
|
||||||
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
|
if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
|
||||||
print("At least one of markdown-files or markdown-file-filter is required to be specified")
|
print("At least one of markdown-files or markdown-file-filter is required to be specified")
|
||||||
|
@ -28,10 +28,20 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
|
||||||
markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
|
markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
|
||||||
|
|
||||||
# Extract Entries from specified Markdown files
|
# Extract Entries from specified Markdown files
|
||||||
entries, entry_to_file_map = extract_markdown_entries(markdown_files)
|
extracted_entries, entry_to_file_map = extract_markdown_entries(markdown_files)
|
||||||
|
|
||||||
|
# Convert Extracted Entries to Dictionaries
|
||||||
|
current_entries = convert_markdown_entries_to_maps(extracted_entries, entry_to_file_map)
|
||||||
|
|
||||||
|
# Identify, mark and merge any new entries with previous entries
|
||||||
|
if not previous_entries:
|
||||||
|
entries_with_ids = list(enumerate(current_entries))
|
||||||
|
else:
|
||||||
|
entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
|
||||||
|
|
||||||
# Process Each Entry from All Notes Files
|
# Process Each Entry from All Notes Files
|
||||||
jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
|
entries = list(map(lambda entry: entry[1], entries_with_ids))
|
||||||
|
jsonl_data = convert_markdown_maps_to_jsonl(entries, entry_to_file_map)
|
||||||
|
|
||||||
# Compress JSONL formatted Data
|
# Compress JSONL formatted Data
|
||||||
if output_file.suffix == ".gz":
|
if output_file.suffix == ".gz":
|
||||||
|
@ -39,7 +49,7 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
|
||||||
elif output_file.suffix == ".jsonl":
|
elif output_file.suffix == ".jsonl":
|
||||||
dump_jsonl(jsonl_data, output_file)
|
dump_jsonl(jsonl_data, output_file)
|
||||||
|
|
||||||
return list(enumerate(entries))
|
return entries_with_ids
|
||||||
|
|
||||||
|
|
||||||
def get_markdown_files(markdown_files=None, markdown_file_filter=None):
|
def get_markdown_files(markdown_files=None, markdown_file_filter=None):
|
||||||
|
@ -87,17 +97,20 @@ def extract_markdown_entries(markdown_files):
|
||||||
return entries, entry_to_file_map
|
return entries, entry_to_file_map
|
||||||
|
|
||||||
|
|
||||||
def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
|
def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
|
||||||
"Convert each Markdown entries to JSON and collate as JSONL"
|
"Convert each Markdown entries into a dictionary"
|
||||||
jsonl = ''
|
entry_maps = []
|
||||||
for entry_id, entry in enumerate(entries):
|
for entry_id, entry in enumerate(entries):
|
||||||
entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
|
entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'})
|
||||||
# Convert Dictionary to JSON and Append to JSONL string
|
|
||||||
jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
|
|
||||||
|
|
||||||
logger.info(f"Converted {len(entries)} to jsonl format")
|
logger.info(f"Converted {len(entries)} markdown entries to dictionaries")
|
||||||
|
|
||||||
return jsonl
|
return entry_maps
|
||||||
|
|
||||||
|
|
||||||
|
def convert_markdown_maps_to_jsonl(entries, entry_to_file_map=None):
    """Serialize Markdown entry dictionaries into a single JSONL string.

    Args:
        entries: Iterable of JSON-serializable entry dictionaries.
        entry_to_file_map: Unused. Accepted (and ignored) so that callers
            passing it as a second positional argument do not raise
            TypeError; each entry dictionary is serialized as given.

    Returns:
        One JSON document per entry, each terminated by a newline. An empty
        string when there are no entries.
    """
    # ensure_ascii=False writes non-ASCII text verbatim rather than as \u escapes
    return ''.join(f'{json.dumps(entry, ensure_ascii=False)}\n' for entry in entries)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in a new issue