From 030fab9bb2c1d4076335dec4ecd53c6457e27d36 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sat, 10 Sep 2022 21:30:04 +0300
Subject: [PATCH] Support incremental update of Markdown entries, embeddings

---
 src/processor/markdown/markdown_to_jsonl.py | 39 ++++++++++++++-------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py
index ce022358..d910ff49 100644
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -9,7 +9,7 @@
 import re
 import logging
 
 # Internal Packages
-from src.utils.helpers import get_absolute_path, is_none_or_empty
+from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
@@ -18,7 +18,7 @@
 logger = logging.getLogger(__name__)
 
 # Define Functions
-def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
+def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None):
     # Input Validation
     if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
         print("At least one of markdown-files or markdown-file-filter is required to be specified")
@@ -28,10 +28,20 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
 
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files)
+    extracted_entries, entry_to_file_map = extract_markdown_entries(markdown_files)
+
+    # Convert Extracted Transactions to Dictionaries
+    current_entries = convert_markdown_entries_to_maps(extracted_entries, entry_to_file_map)
+
+    # Identify, mark and merge any new entries with previous entries
+    if not previous_entries:
+        entries_with_ids = list(enumerate(current_entries))
+    else:
+        entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
+    entries = list(map(lambda entry: entry[1], entries_with_ids))
+    jsonl_data = convert_markdown_maps_to_jsonl(entries)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -39,7 +49,7 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
 
-    return list(enumerate(entries))
+    return entries_with_ids
 
 
 def get_markdown_files(markdown_files=None, markdown_file_filter=None):
@@ -87,17 +97,20 @@ def extract_markdown_entries(markdown_files):
     return entries, entry_to_file_map
 
 
-def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
-    "Convert each Markdown entries to JSON and collate as JSONL"
-    jsonl = ''
+def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+    "Convert each Markdown entries into a dictionary"
+    entry_maps = []
     for entry_id, entry in enumerate(entries):
-        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+        entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'})
 
-    logger.info(f"Converted {len(entries)} to jsonl format")
+    logger.info(f"Converted {len(entries)} markdown entries to dictionaries")
 
-    return jsonl
+    return entry_maps
+
+
+def convert_markdown_maps_to_jsonl(entries):
+    "Convert each Markdown entries to JSON and collate as JSONL"
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
 
 
 if __name__ == '__main__':