Hash only the compiled form of entries to identify new/updated entries

- Comparing compiled entries is the appropriately narrow target for
  identifying entries whose embedding vectors need re-encoding, given we
  pass the compiled form of the entry to the model for encoding

- Hashing the whole entry along with its raw form was resulting in a
  bunch of entries being spuriously marked for update, as
  LINE: <entry_line_no> is a string added to each entry's raw form

- This meant an update to a single entry resulted in all entries below
  it in the file being marked for update (as all their line numbers had
  changed); see the sketch after this list

- Log performance metrics for each step of converting org entries to jsonl
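
A minimal sketch of the failure mode, using hypothetical before/after
entries (not code from this commit): with the old whole-entry hash, a
shifted line number changes the hash even though the content is identical,
while the compiled-form hash stays stable.

    import hashlib
    import json

    def whole_entry_hash(entry: dict) -> str:
        # Old approach: hash the whole entry, raw form (with its LINE: string) included
        return hashlib.md5(bytes(json.dumps(entry), encoding='utf-8')).hexdigest()

    def compiled_hash(entry: dict) -> str:
        # New approach: hash only the compiled form passed to the model for encoding
        return hashlib.md5(bytes(entry['compiled'], encoding='utf-8')).hexdigest()

    # Hypothetical entry whose line number shifted from 10 to 11 after an
    # edit higher up in the org file; its content is otherwise unchanged
    before = {'compiled': '* Heading\n  Body', 'raw': 'LINE: 10\n* Heading\n  Body'}
    after = {'compiled': '* Heading\n  Body', 'raw': 'LINE: 11\n* Heading\n  Body'}

    assert whole_entry_hash(before) != whole_entry_hash(after)  # spuriously marked for update
    assert compiled_hash(before) == compiled_hash(after)        # correctly seen as unchanged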
Author: Debanjum Singh Solanky
Date:   2022-09-07 02:36:38 +03:00
parent  b9a6e80629
commit  91d11ccb49


@@ -8,6 +8,7 @@ import pathlib
 import glob
 import logging
 import hashlib
+import time

 # Internal Packages
 from src.processor.org_mode import orgnode
@@ -27,20 +28,32 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None)
         exit(1)

     # Get Org Files to Process
+    start = time.time()
     org_files = get_org_files(org_files, org_file_filter)

     # Extract Entries from specified Org files
+    start = time.time()
     entry_nodes, file_to_entries = extract_org_entries(org_files)
+    end = time.time()
+    logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")

+    start = time.time()
     current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    end = time.time()
+    logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")

     # Identify, mark and merge any new entries with previous entries
     if not previous_entries:
         entries_with_ids = list(enumerate(current_entries))
     else:
         # Hash all current and previous entries to identify new entries
-        current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(json.dumps(e), encoding='utf-8')).hexdigest(), current_entries))
-        previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(json.dumps(e), encoding='utf-8')).hexdigest(), previous_entries))
+        start = time.time()
+        current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e['compiled'], encoding='utf-8')).hexdigest(), current_entries))
+        previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e['compiled'], encoding='utf-8')).hexdigest(), previous_entries))
+        end = time.time()
+        logger.debug(f"Hash previous, current entries: {end - start} seconds")

+        start = time.time()
         hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
         hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
@@ -59,10 +72,14 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None)
             (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
             for entry_hash in existing_entry_hashes
         ]
         existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
         entries_with_ids = existing_entries_sorted + new_entries
+        end = time.time()
+        logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds")

     # Process Each Entry from All Notes Files
+    start = time.time()
     entries = map(lambda entry: entry[1], entries_with_ids)
     jsonl_data = convert_org_entries_to_jsonl(entries)
@@ -71,6 +88,8 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None)
         compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
+    end = time.time()
+    logger.debug(f"Write org entries to JSONL file: {end - start} seconds")

     return entries_with_ids
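
The instrumentation above repeats the same start/end/logger.debug pattern
for every step. A context manager could factor this out; the timer helper
below is a hypothetical sketch, not code from this commit:

    import logging
    import time
    from contextlib import contextmanager

    logger = logging.getLogger(__name__)

    @contextmanager
    def timer(message: str):
        # Time the wrapped block and emit the same style of debug log as the diff
        start = time.time()
        yield
        end = time.time()
        logger.debug(f"{message}: {end - start} seconds")

    # Usage, equivalent to the timing added around each conversion step:
    # with timer("Parse entries from org files into OrgNode objects"):
    #     entry_nodes, file_to_entries = extract_org_entries(org_files)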