From 030fab9bb2c1d4076335dec4ecd53c6457e27d36 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sat, 10 Sep 2022 21:30:04 +0300
Subject: [PATCH] Support incremental update of Markdown entries, embeddings

---
 src/processor/markdown/markdown_to_jsonl.py | 39 ++++++++++++++-------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py
index ce022358..d910ff49 100644
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -9,7 +9,7 @@
 import re
 import logging
 
 # Internal Packages
-from src.utils.helpers import get_absolute_path, is_none_or_empty
+from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
@@ -18,7 +18,7 @@
 logger = logging.getLogger(__name__)
 
 # Define Functions
-def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
+def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None):
     # Input Validation
     if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
         print("At least one of markdown-files or markdown-file-filter is required to be specified")
@@ -28,10 +28,20 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
 
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files)
+    extracted_entries, entry_to_file_map = extract_markdown_entries(markdown_files)
+
+    # Convert Extracted Transactions to Dictionaries
+    current_entries = convert_markdown_entries_to_maps(extracted_entries, entry_to_file_map)
+
+    # Identify, mark and merge any new entries with previous entries
+    if not previous_entries:
+        entries_with_ids = list(enumerate(current_entries))
+    else:
+        entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
+    entries = list(map(lambda entry: entry[1], entries_with_ids))
+    jsonl_data = convert_markdown_maps_to_jsonl(entries)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -39,7 +49,7 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
 
-    return list(enumerate(entries))
+    return entries_with_ids
 
 
 def get_markdown_files(markdown_files=None, markdown_file_filter=None):
@@ -87,17 +97,20 @@ def extract_markdown_entries(markdown_files):
     return entries, entry_to_file_map
 
 
-def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
-    "Convert each Markdown entries to JSON and collate as JSONL"
-    jsonl = ''
+def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+    "Convert each Markdown entries into a dictionary"
+    entry_maps = []
     for entry_id, entry in enumerate(entries):
-        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+        entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'})
 
-    logger.info(f"Converted {len(entries)} to jsonl format")
+    logger.info(f"Converted {len(entries)} markdown entries to dictionaries")
 
-    return jsonl
+    return entry_maps
+
+
+def convert_markdown_maps_to_jsonl(entries):
+    "Convert each Markdown entries to JSON and collate as JSONL"
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
 
 
 if __name__ == '__main__':