From 2f7a6af56a424bc90b841c8415cabdc94f305012 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 00:16:48 +0300 Subject: [PATCH 01/15] Support incremental update of org-mode entries and embeddings - What - Hash the entries and compare to find new/updated entries - Reuse embeddings encoded for existing entries - Only encode embeddings for updated or new entries - Merge the existing and new entries and embeddings to get the updated entries, embeddings - Why - Given most note text entries are expected to be unchanged across time. Reusing their earlier encoded embeddings should significantly speed up embeddings updates - Previously we were regenerating embeddings for all entries, even if they had existed in previous runs --- src/processor/ledger/beancount_to_jsonl.py | 4 +- src/processor/markdown/markdown_to_jsonl.py | 2 +- src/processor/org_mode/org_to_jsonl.py | 63 ++++++++++++++++----- src/search_type/text_search.py | 29 +++++++--- tests/test_org_to_jsonl.py | 12 ++-- 5 files changed, 80 insertions(+), 30 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index c0136bc6..a1792735 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) # Define Functions -def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file): +def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None): # Input Validation if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter): print("At least one of beancount-files or beancount-file-filter is required to be specified") @@ -39,7 +39,7 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file): elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) - return entries + return list(enumerate(entries)) def get_beancount_files(beancount_files=None, beancount_file_filter=None): diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index a0903fcb..ce022358 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -39,7 +39,7 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file): elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) - return entries + return list(enumerate(entries)) def get_markdown_files(markdown_files=None, markdown_file_filter=None): diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index f1531797..1c77acf8 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -7,6 +7,7 @@ import argparse import pathlib import glob import logging +import hashlib # Internal Packages from src.processor.org_mode import orgnode @@ -19,7 +20,7 @@ logger = logging.getLogger(__name__) # Define Functions -def org_to_jsonl(org_files, org_file_filter, output_file): +def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None): # Input Validation if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): print("At least one of org-files or org-file-filter is required to be specified") @@ -29,10 +30,41 @@ def org_to_jsonl(org_files, org_file_filter, output_file): org_files = get_org_files(org_files, org_file_filter) # Extract Entries from specified Org files - entries, file_to_entries = extract_org_entries(org_files) + 
entry_nodes, file_to_entries = extract_org_entries(org_files) + current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + + # Identify, mark and merge any new entries with previous entries + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + # Hash all current and previous entries to identify new entries + current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(json.dumps(e), encoding='utf-8')).hexdigest(), current_entries)) + previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(json.dumps(e), encoding='utf-8')).hexdigest(), previous_entries)) + + hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) + hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) + + # All entries that did not exist in the previous set are to be added + new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes) + # All entries that exist in both current and previous sets are kept + existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) + + # Mark new entries with no ids for later embeddings generation + new_entries = [ + (None, hash_to_current_entries[entry_hash]) + for entry_hash in new_entry_hashes + ] + # Set id of existing entries to their previous ids to reuse their existing encoded embeddings + existing_entries = [ + (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) + for entry_hash in existing_entry_hashes + ] + existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) + entries_with_ids = existing_entries_sorted + new_entries # Process Each Entry from All Notes Files - jsonl_data = convert_org_entries_to_jsonl(entries, file_to_entries) + entries = map(lambda entry: entry[1], entries_with_ids) + jsonl_data = convert_org_entries_to_jsonl(entries) # Compress JSONL formatted Data if output_file.suffix == ".gz": @@ -40,7 +72,7 @@ def org_to_jsonl(org_files, org_file_filter, output_file): elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) - return entries + return entries_with_ids def get_org_files(org_files=None, org_file_filter=None): @@ -70,16 +102,16 @@ def extract_org_entries(org_files): entry_to_file_map = [] for org_file in org_files: org_file_entries = orgnode.makelist(str(org_file)) - entry_to_file_map += [org_file]*len(org_file_entries) + entry_to_file_map += zip(org_file_entries, [org_file]*len(org_file_entries)) entries.extend(org_file_entries) - return entries, entry_to_file_map + return entries, dict(entry_to_file_map) -def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str: - "Convert each Org-Mode entries to JSON and collate as JSONL" - jsonl = '' - for entry_id, entry in enumerate(entries): +def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]: + "Convert Org-Mode entries into list of dictionary" + entry_maps = [] + for entry in entries: entry_dict = dict() # Ignore title notes i.e notes with just headings and empty body @@ -113,14 +145,17 @@ def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str: if entry_dict: entry_dict["raw"] = f'{entry}' - entry_dict["file"] = f'{entry_to_file_map[entry_id]}' + entry_dict["file"] = f'{entry_to_file_map[entry]}' # Convert Dictionary to JSON and Append to JSONL string - jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' + entry_maps.append(entry_dict) - logger.info(f"Converted {len(entries)} to jsonl format") + return entry_maps - return jsonl + +def 
convert_org_entries_to_jsonl(entries) -> str: + "Convert each Org-Mode entry to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) if __name__ == '__main__': diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 8666056c..b5e647e2 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -55,15 +55,28 @@ def extract_entries(jsonl_file): return load_jsonl(jsonl_file) -def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False): +def compute_embeddings(entries_with_ids, bi_encoder, embeddings_file, regenerate=False): "Compute (and Save) Embeddings or Load Pre-Computed Embeddings" - # Load pre-computed embeddings from file if exists + new_entries = [] + # Load pre-computed embeddings from file if exists and update them if required if embeddings_file.exists() and not regenerate: corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device) logger.info(f"Loaded embeddings from {embeddings_file}") - else: # Else compute the corpus_embeddings from scratch, which can take a while - corpus_embeddings = bi_encoder.encode([entry['compiled'] for entry in entries], convert_to_tensor=True, device=state.device, show_progress_bar=True) + # Encode any new entries in the corpus and update corpus embeddings + new_entries = [entry['compiled'] for id, entry in entries_with_ids if id is None] + if new_entries: + new_embeddings = bi_encoder.encode(new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True) + existing_entry_ids = [id for id, _ in entries_with_ids if id is not None] + existing_embeddings = torch.index_select(corpus_embeddings, 0, torch.tensor(existing_entry_ids)) if existing_entry_ids else torch.Tensor() + corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0) + # Else compute the corpus embeddings from scratch + else: + new_entries = [entry['compiled'] for _, entry in entries_with_ids] + corpus_embeddings = bi_encoder.encode(new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True) + + # Save regenerated or updated embeddings to file + if new_entries: corpus_embeddings = util.normalize_embeddings(corpus_embeddings) torch.save(corpus_embeddings, embeddings_file) logger.info(f"Computed embeddings and saved them to {embeddings_file}") @@ -169,16 +182,16 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) - if not config.compressed_jsonl.exists() or regenerate: - text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl) + previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() else None + entries_with_indices = text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, previous_entries) - # Extract Entries + # Extract Updated Entries entries = extract_entries(config.compressed_jsonl) top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus # Compute or Load Embeddings config.embeddings_file = resolve_absolute_path(config.embeddings_file) - corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate) + corpus_embeddings = compute_embeddings(entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate) for filter in 
filters: filter.load(entries, regenerate=regenerate) diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 6a626299..594c954f 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -3,7 +3,7 @@ import json from posixpath import split # Internal Packages -from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, extract_org_entries +from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries from src.utils.helpers import is_none_or_empty @@ -21,10 +21,11 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) + entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map) + entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_data = convert_org_entries_to_jsonl(entries) # Assert assert is_none_or_empty(jsonl_data) @@ -43,10 +44,11 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) + entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map) + entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_string = convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert From b9a6e8062919ebb4e2efb82ce5b861dabf490cee Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 01:38:30 +0300 Subject: [PATCH 02/15] Make OrgNode tags stable sorted to find new entries for incremental updates - Having Tags as sets was returning them in a different order everytime - This resulted in spuriously identifying existing entries as new because their tags ordering changed - Converting tags to list fixes the issue and identifies updated new entries for incremental update correctly --- src/processor/org_mode/orgnode.py | 14 +++++++------- tests/test_orgnode.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/processor/org_mode/orgnode.py b/src/processor/org_mode/orgnode.py index 39c67731..433a367f 100644 --- a/src/processor/org_mode/orgnode.py +++ b/src/processor/org_mode/orgnode.py @@ -64,7 +64,7 @@ def makelist(filename): level = 0 heading = "" bodytext = "" - tags = set() # set of all tags in headline + tags = list() # set of all tags in headline closed_date = '' sched_date = '' deadline_date = '' @@ -98,14 +98,14 @@ def makelist(filename): level = hdng.group(1) heading = hdng.group(2) bodytext = "" - tags = set() # set of all tags in headline + tags = list() # set of all tags in headline tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading) if tagsrch: heading = tagsrch.group(1) parsedtags = tagsrch.group(2) if parsedtags: for parsedtag in parsedtags.split(':'): - if parsedtag != '': tags.add(parsedtag) + if parsedtag != '': tags.append(parsedtag) else: # we are processing a non-heading line if line[:10] == '#+SEQ_TODO': kwlist = re.findall(r'([A-Z]+)\(', line) @@ -217,7 +217,7 @@ class Orgnode(object): self.level = len(level) self.headline = headline self.body = body - self.tags = set(tags) # All tags in the headline + self.tags = 
tags           # All tags in the headline
         self.todo = ""
         self.prty = ""            # empty of A, B or C
         self.scheduled = ""       # Scheduled date
@@ -270,8 +270,8 @@ class Orgnode(object):

     def Tags(self):
         """
-        Returns the set of all tags
-        For example, :HOME:COMPUTER: would return {'HOME', 'COMPUTER'}
+        Returns the list of all tags
+        For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
         """
         return self.tags

@@ -287,7 +287,7 @@ class Orgnode(object):
         """
         Store all the tags found in the headline.
         """
-        self.tags = set(newtags)
+        self.tags = newtags

     def Todo(self):
         """
diff --git a/tests/test_orgnode.py b/tests/test_orgnode.py
index 186eaaec..a81f1cc3 100644
--- a/tests/test_orgnode.py
+++ b/tests/test_orgnode.py
@@ -23,7 +23,7 @@ Body Line 1'''
     # Assert
     assert len(entries) == 1
     assert entries[0].Heading() == "Heading"
-    assert entries[0].Tags() == set()
+    assert entries[0].Tags() == list()
     assert entries[0].Body() == "Body Line 1"
     assert entries[0].Priority() == ""
     assert entries[0].Property("ID") == ""
@@ -57,7 +57,7 @@ Body Line 2'''
     assert len(entries) == 1
     assert entries[0].Heading() == "Heading"
     assert entries[0].Todo() == "DONE"
-    assert entries[0].Tags() == {"Tag1", "TAG2", "tag3"}
+    assert entries[0].Tags() == ["Tag1", "TAG2", "tag3"]
     assert entries[0].Body() == "- Clocked Log 1\nBody Line 1\nBody Line 2"
     assert entries[0].Priority() == "A"
     assert entries[0].Property("ID") == "id:123-456-789-4234-1231"
@@ -158,7 +158,7 @@ Body 2
     for index, entry in enumerate(entries):
         assert entry.Heading() == f"Heading{index+1}"
         assert entry.Todo() == "FAILED" if index == 0 else "CANCELLED"
-        assert entry.Tags() == {f"tag{index+1}"}
+        assert entry.Tags() == [f"tag{index+1}"]
         assert entry.Body() == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
         assert entry.Priority() == "A"
         assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}"

From 91d11ccb491fdd553d36198322cb4611cf52e994 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Wed, 7 Sep 2022 02:36:38 +0300
Subject: [PATCH 03/15] Only hash compiled entry to identify new/updated entries to update

- Comparing compiled entries is the appropriately narrow target to identify entries that need to encode their embedding vectors, given we pass the compiled form of the entry to the model for encoding
- Hashing the whole entry along with its raw form was resulting in a bunch of entries being marked for update, as LINE: is a string added to each entry's raw format.
- This results in an update to a single entry resulting in all entries below it in the file being marked for update (as all their line numbers have changed) - Log performance metrics for steps to convert org entries to jsonl --- src/processor/org_mode/org_to_jsonl.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 1c77acf8..88a4cd5e 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -8,6 +8,7 @@ import pathlib import glob import logging import hashlib +import time # Internal Packages from src.processor.org_mode import orgnode @@ -27,20 +28,32 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None) exit(1) # Get Org Files to Process + start = time.time() org_files = get_org_files(org_files, org_file_filter) # Extract Entries from specified Org files + start = time.time() entry_nodes, file_to_entries = extract_org_entries(org_files) + end = time.time() + logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") + + start = time.time() current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) + end = time.time() + logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") # Identify, mark and merge any new entries with previous entries if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: # Hash all current and previous entries to identify new entries - current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(json.dumps(e), encoding='utf-8')).hexdigest(), current_entries)) - previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(json.dumps(e), encoding='utf-8')).hexdigest(), previous_entries)) + start = time.time() + current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e['compiled'], encoding='utf-8')).hexdigest(), current_entries)) + previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e['compiled'], encoding='utf-8')).hexdigest(), previous_entries)) + end = time.time() + logger.debug(f"Hash previous, current entries: {end - start} seconds") + start = time.time() hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) @@ -59,10 +72,14 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None) (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) for entry_hash in existing_entry_hashes ] + existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) entries_with_ids = existing_entries_sorted + new_entries + end = time.time() + logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds") # Process Each Entry from All Notes Files + start = time.time() entries = map(lambda entry: entry[1], entries_with_ids) jsonl_data = convert_org_entries_to_jsonl(entries) @@ -71,6 +88,8 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None) compress_jsonl_data(jsonl_data, output_file) elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write org entries to JSONL file: {end - start} seconds") return entries_with_ids From c17a0fd05bd893d379d3489e5cfdd032226ce9ae Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 02:43:58 +0300 Subject: [PATCH 04/15] Do not store word filters index to file. 
Not necessary for now

- It's more of a hassle to keep the word filter index from going stale on entry updates
- Generating the index on 120K lines of notes takes 1s. Loading it from file takes 0.2s. For less content, the load time difference will be even smaller
- Let go of the startup time improvement for simplicity, for now
---
 src/configure.py                 | 18 +++-----------
 src/search_filter/word_filter.py | 41 +++++++++++---------------------
 tests/conftest.py                |  2 +-
 tests/test_client.py             |  4 ++--
 tests/test_word_filter.py        | 30 ++++++++++--------------
 5 files changed, 33 insertions(+), 62 deletions(-)

diff --git a/src/configure.py b/src/configure.py
index f6476951..ed46af37 100644
--- a/src/configure.py
+++ b/src/configure.py
@@ -48,11 +48,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.org,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
-            filters=[
-                DateFilter(),
-                WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org),
-                FileFilter(),
-            ])
+            filters=[DateFilter(), WordFilter(), FileFilter()])

     # Initialize Org Music Search
     if (t == SearchType.Music or t == None) and config.content_type.music:
@@ -71,11 +67,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.markdown,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
-            filters=[
-                DateFilter(),
-                WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown),
-                FileFilter(),
-            ])
+            filters=[DateFilter(), WordFilter(), FileFilter()])

     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
@@ -85,11 +77,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.ledger,
             search_config=config.search_type.symmetric,
             regenerate=regenerate,
-            filters=[
-                DateFilter(),
-                WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger),
-                FileFilter(),
-            ])
+            filters=[DateFilter(), WordFilter(), FileFilter()])

     # Initialize Image Search
     if (t == SearchType.Image or t == None) and config.content_type.image:
diff --git a/src/search_filter/word_filter.py b/src/search_filter/word_filter.py
index c7c5d059..6fe0b31e 100644
--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -3,6 +3,7 @@ import re
 import time
 import pickle
 import logging
+from collections import defaultdict

 # Internal Packages
 from src.search_filter.base_filter import BaseFilter
@@ -18,38 +19,24 @@ class WordFilter(BaseFilter):
     required_regex = r'\+"(\w+)" ?'
     blocked_regex = r'\-"(\w+)" ?'

- def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'): - self.filter_file = resolve_absolute_path(filter_directory / f"word_filter_{search_type.name.lower()}_index.pkl") + def __init__(self, entry_key='raw'): self.entry_key = entry_key - self.search_type = search_type - self.word_to_entry_index = dict() + self.word_to_entry_index = defaultdict(set) self.cache = LRU() def load(self, entries, regenerate=False): - if self.filter_file.exists() and not regenerate: - start = time.time() - with self.filter_file.open('rb') as f: - self.word_to_entry_index = pickle.load(f) - end = time.time() - logger.debug(f"Load word filter index for {self.search_type} from {self.filter_file}: {end - start} seconds") - else: - start = time.time() - self.cache = {} # Clear cache on (re-)generating entries_by_word_set - entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:' - # Create map of words to entries they exist in - for entry_index, entry in enumerate(entries): - for word in re.split(entry_splitter, entry[self.entry_key].lower()): - if word == '': - continue - if word not in self.word_to_entry_index: - self.word_to_entry_index[word] = set() - self.word_to_entry_index[word].add(entry_index) - - with self.filter_file.open('wb') as f: - pickle.dump(self.word_to_entry_index, f) - end = time.time() - logger.debug(f"Index {self.search_type} for word filter to {self.filter_file}: {end - start} seconds") + start = time.time() + self.cache = {} # Clear cache on reload of filter + entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\'' + # Create map of words to entries they exist in + for entry_index, entry in enumerate(entries): + for word in re.split(entry_splitter, entry[self.entry_key].lower()): + if word == '': + continue + self.word_to_entry_index[word].add(entry_index) + end = time.time() + logger.debug(f"Created word filter index: {end - start} seconds") return self.word_to_entry_index diff --git a/tests/conftest.py b/tests/conftest.py index 7545527f..ab2703da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,7 +58,7 @@ def model_dir(search_config: SearchConfig): compressed_jsonl = model_dir.joinpath('notes.jsonl.gz'), embeddings_file = model_dir.joinpath('note_embeddings.pt')) - filters = [DateFilter(), WordFilter(model_dir, search_type=SearchType.Org), FileFilter()] + filters = [DateFilter(), WordFilter(), FileFilter()] text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) return model_dir diff --git a/tests/test_client.py b/tests/test_client.py index 578c789c..b167bce0 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -132,7 +132,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig # ---------------------------------------------------------------------------------------------------- def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange - filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)] + filters = [WordFilter()] model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = 'How to git install application? 
+"Emacs"' @@ -149,7 +149,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_ # ---------------------------------------------------------------------------------------------------- def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange - filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)] + filters = [WordFilter()] model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = 'How to git install application? -"clone"' diff --git a/tests/test_word_filter.py b/tests/test_word_filter.py index 3efe8ed9..db23c2c6 100644 --- a/tests/test_word_filter.py +++ b/tests/test_word_filter.py @@ -1,15 +1,12 @@ -# External Packages -import torch - # Application Packages from src.search_filter.word_filter import WordFilter from src.utils.config import SearchType -def test_no_word_filter(tmp_path): +def test_no_word_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() q_with_no_filter = 'head tail' # Act @@ -22,10 +19,10 @@ def test_no_word_filter(tmp_path): assert entry_indices == {0, 1, 2, 3} -def test_word_exclude_filter(tmp_path): +def test_word_exclude_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() q_with_exclude_filter = 'head -"exclude_word" tail' # Act @@ -38,10 +35,10 @@ def test_word_exclude_filter(tmp_path): assert entry_indices == {0, 2} -def test_word_include_filter(tmp_path): +def test_word_include_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() query_with_include_filter = 'head +"include_word" tail' # Act @@ -54,10 +51,10 @@ def test_word_include_filter(tmp_path): assert entry_indices == {2, 3} -def test_word_include_and_exclude_filter(tmp_path): +def test_word_include_and_exclude_filter(): # Arrange - word_filter = WordFilter(tmp_path, SearchType.Org) - embeddings, entries = arrange_content() + word_filter = WordFilter() + entries = arrange_content() query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail' # Act @@ -71,11 +68,10 @@ def test_word_include_and_exclude_filter(tmp_path): def arrange_content(): - embeddings = torch.randn(4, 10) entries = [ {'compiled': '', 'raw': 'Minimal Entry'}, {'compiled': '', 'raw': 'Entry with exclude_word'}, {'compiled': '', 'raw': 'Entry with include_word'}, {'compiled': '', 'raw': 'Entry with include_word and exclude_word'}] - return embeddings, entries + return entries From 899bfc5c3e95f20918fa2483fdadde02ed1b1292 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 03:06:29 +0300 Subject: [PATCH 05/15] Test incremental update triggered on calling text_search.setup - Previously updates to index required explicitly setting `regenerate=True` - Now incremental update check made everytime on `text_search.setup` now - Test if index automatically updates when call `text_search.setup` with new content even with `regenerate=False` --- tests/test_text_search.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 39fed92e..5c2bd1c9 100644 --- 
a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -77,3 +77,32 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC # delete reload test file added content_config.org.input_files = [] file_to_add_on_reload.unlink() + + +# ---------------------------------------------------------------------------------------------------- +def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig): + # Arrange + initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) + + assert len(initial_notes_model.entries) == 10 + assert len(initial_notes_model.corpus_embeddings) == 10 + + file_to_add_on_update = Path(content_config.org.input_filter).parent / "update.org" + content_config.org.input_files = [f'{file_to_add_on_update}'] + + # append Org-Mode Entry to first Org Input File in Config + with open(file_to_add_on_update, "w") as f: + f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") + + # Act + # update embeddings, entries with the newly added note + initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + + # verify new entry added in updated embeddings, entries + assert len(initial_notes_model.entries) == 11 + assert len(initial_notes_model.corpus_embeddings) == 11 + + # Cleanup + # delete file added for update testing + content_config.org.input_files = [] + file_to_add_on_update.unlink() From b01b4d7daa955e27aad200a0d98eca700b0dc587 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 03:31:48 +0300 Subject: [PATCH 06/15] Extract logic to mark entries for embeddings update into helper function - This could be re-used by other text_to_jsonl converters like markdown, beancount --- src/processor/org_mode/org_to_jsonl.py | 35 ++---------------------- src/utils/helpers.py | 38 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 88a4cd5e..47dc54fa 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -7,12 +7,11 @@ import argparse import pathlib import glob import logging -import hashlib import time # Internal Packages from src.processor.org_mode import orgnode -from src.utils.helpers import get_absolute_path, is_none_or_empty +from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils import state @@ -46,37 +45,7 @@ def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None) if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: - # Hash all current and previous entries to identify new entries - start = time.time() - current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e['compiled'], encoding='utf-8')).hexdigest(), current_entries)) - previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e['compiled'], encoding='utf-8')).hexdigest(), previous_entries)) - end = time.time() - logger.debug(f"Hash previous, current entries: {end - start} seconds") - - start = time.time() - hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) - hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) - - # All entries that did not exist in the previous set are to be added - 
new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes) - # All entries that exist in both current and previous sets are kept - existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) - - # Mark new entries with no ids for later embeddings generation - new_entries = [ - (None, hash_to_current_entries[entry_hash]) - for entry_hash in new_entry_hashes - ] - # Set id of existing entries to their previous ids to reuse their existing encoded embeddings - existing_entries = [ - (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) - for entry_hash in existing_entry_hashes - ] - - existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) - entries_with_ids = existing_entries_sorted + new_entries - end = time.time() - logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds") + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) # Process Each Entry from All Notes Files start = time.time() diff --git a/src/utils/helpers.py b/src/utils/helpers.py index 7ea6580c..e8b2b8ca 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -1,6 +1,8 @@ # Standard Packages import pathlib import sys +import time +import hashlib from os.path import join from collections import OrderedDict @@ -79,3 +81,39 @@ class LRU(OrderedDict): if len(self) > self.capacity: oldest = next(iter(self)) del self[oldest] + + +def mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=None): + # Hash all current and previous entries to identify new entries + start = time.time() + current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), current_entries)) + previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), previous_entries)) + end = time.time() + logger.debug(f"Hash previous, current entries: {end - start} seconds") + + start = time.time() + hash_to_current_entries = dict(zip(current_entry_hashes, current_entries)) + hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries)) + + # All entries that did not exist in the previous set are to be added + new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes) + # All entries that exist in both current and previous sets are kept + existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes) + + # Mark new entries with no ids for later embeddings generation + new_entries = [ + (None, hash_to_current_entries[entry_hash]) + for entry_hash in new_entry_hashes + ] + # Set id of existing entries to their previous ids to reuse their existing encoded embeddings + existing_entries = [ + (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash]) + for entry_hash in existing_entry_hashes + ] + + existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0]) + entries_with_ids = existing_entries_sorted + new_entries + end = time.time() + logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds") + + return entries_with_ids \ No newline at end of file From cfaf7aa6f47e645ec167350d44011c9332a41aac Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 7 Sep 2022 14:10:38 +0300 Subject: [PATCH 07/15] Update Indexing Performance Section in Readme --- Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Readme.md b/Readme.md index 628ce458..18158585 100644 --- a/Readme.md +++ 
b/Readme.md @@ -131,7 +131,7 @@ pip install --upgrade khoj-assistant - Indexing is more strongly impacted by the size of the source data - Indexing 100K+ line corpus of notes takes 6 minutes - Indexing 4000+ images takes about 15 minutes and more than 8Gb of RAM -- Once is implemented, it should only take this long on first run +- Note: *It should only take this long on the first run* as the index is incrementally updated ### Miscellaneous From 91aac83c6a7eaf9d4483b85484ecf10f195b320e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 20:55:32 +0300 Subject: [PATCH 08/15] Support incremental update of Beancount transactions, embeddings --- src/processor/ledger/beancount_to_jsonl.py | 37 +++++++++++++++------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index a1792735..fc40e544 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -9,7 +9,7 @@ import re import logging # Internal Packages -from src.utils.helpers import get_absolute_path, is_none_or_empty +from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data @@ -28,10 +28,20 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, prev beancount_files = get_beancount_files(beancount_files, beancount_file_filter) # Extract Entries from specified Beancount files - entries, transaction_to_file_map = extract_beancount_entries(beancount_files) + extracted_transactions, transaction_to_file_map = extract_beancount_entries(beancount_files) + + # Convert Extracted Transactions to Dictionaries + current_entries = convert_transactions_to_maps(extracted_transactions, transaction_to_file_map) + + # Identify, mark and merge any new entries with previous entries + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) # Process Each Entry from All Notes Files - jsonl_data = convert_beancount_entries_to_jsonl(entries, transaction_to_file_map) + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = convert_transaction_maps_to_jsonl(entries) # Compress JSONL formatted Data if output_file.suffix == ".gz": @@ -39,7 +49,7 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, prev elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) - return list(enumerate(entries)) + return entries_with_ids def get_beancount_files(beancount_files=None, beancount_file_filter=None): @@ -87,17 +97,20 @@ def extract_beancount_entries(beancount_files): return entries, transaction_to_file_map -def convert_beancount_entries_to_jsonl(entries, transaction_to_file_map): - "Convert each Beancount transaction to JSON and collate as JSONL" - jsonl = '' +def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: + "Convert each Beancount transaction into a dictionary" + entry_maps = [] for entry_id, entry in enumerate(entries): - entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'} - # Convert Dictionary to JSON and Append to JSONL string - jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' + entry_maps.append({'compiled': entry, 'raw': entry, 
'file': f'{transaction_to_file_map[entry_id]}'}) - logger.info(f"Converted {len(entries)} to jsonl format") + logger.info(f"Converted {len(entries)} transactions to dictionaries") - return jsonl + return entry_maps + + +def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str: + "Convert each Beancount transaction dictionary to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) if __name__ == '__main__': From 030fab9bb2c1d4076335dec4ecd53c6457e27d36 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 21:30:04 +0300 Subject: [PATCH 09/15] Support incremental update of Markdown entries, embeddings --- src/processor/markdown/markdown_to_jsonl.py | 39 ++++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index ce022358..d910ff49 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -9,7 +9,7 @@ import re import logging # Internal Packages -from src.utils.helpers import get_absolute_path, is_none_or_empty +from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) # Define Functions -def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file): +def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None): # Input Validation if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): print("At least one of markdown-files or markdown-file-filter is required to be specified") @@ -28,10 +28,20 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file): markdown_files = get_markdown_files(markdown_files, markdown_file_filter) # Extract Entries from specified Markdown files - entries, entry_to_file_map = extract_markdown_entries(markdown_files) + extracted_entries, entry_to_file_map = extract_markdown_entries(markdown_files) + + # Convert Extracted Transactions to Dictionaries + current_entries = convert_markdown_entries_to_maps(extracted_entries, entry_to_file_map) + + # Identify, mark and merge any new entries with previous entries + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) # Process Each Entry from All Notes Files - jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map) + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = convert_markdown_maps_to_jsonl(entries, entry_to_file_map) # Compress JSONL formatted Data if output_file.suffix == ".gz": @@ -39,7 +49,7 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file): elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) - return list(enumerate(entries)) + return entries_with_ids def get_markdown_files(markdown_files=None, markdown_file_filter=None): @@ -87,17 +97,20 @@ def extract_markdown_entries(markdown_files): return entries, entry_to_file_map -def convert_markdown_entries_to_jsonl(entries, entry_to_file_map): - "Convert each Markdown entries to JSON and collate as JSONL" - jsonl = '' +def convert_markdown_entries_to_maps(entries: list[str], 
entry_to_file_map) -> list[dict]: + "Convert each Markdown entries into a dictionary" + entry_maps = [] for entry_id, entry in enumerate(entries): - entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'} - # Convert Dictionary to JSON and Append to JSONL string - jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' + entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}) - logger.info(f"Converted {len(entries)} to jsonl format") + logger.info(f"Converted {len(entries)} markdown entries to dictionaries") - return jsonl + return entry_maps + + +def convert_markdown_maps_to_jsonl(entries): + "Convert each Markdown entries to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) if __name__ == '__main__': From 4eb84c7f519a468df48fc7256093141e989edfd0 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 22:47:54 +0300 Subject: [PATCH 10/15] Log performance metrics for beancount, markdown to jsonl conversion --- src/processor/ledger/beancount_to_jsonl.py | 12 ++++++++++-- src/processor/markdown/markdown_to_jsonl.py | 14 +++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index fc40e544..8a376663 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -7,6 +7,7 @@ import pathlib import glob import re import logging +import time # Internal Packages from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update @@ -28,18 +29,23 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, prev beancount_files = get_beancount_files(beancount_files, beancount_file_filter) # Extract Entries from specified Beancount files + start = time.time() extracted_transactions, transaction_to_file_map = extract_beancount_entries(beancount_files) - - # Convert Extracted Transactions to Dictionaries current_entries = convert_transactions_to_maps(extracted_transactions, transaction_to_file_map) + end = time.time() + logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") # Identify, mark and merge any new entries with previous entries + start = time.time() if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) + end = time.time() + logger.debug(f"Identify new or updated transaction: {end - start} seconds") # Process Each Entry from All Notes Files + start = time.time() entries = list(map(lambda entry: entry[1], entries_with_ids)) jsonl_data = convert_transaction_maps_to_jsonl(entries) @@ -48,6 +54,8 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, prev compress_jsonl_data(jsonl_data, output_file) elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write transactions to JSONL file: {end - start} seconds") return entries_with_ids diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index d910ff49..ebfc9472 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -7,6 +7,7 @@ import pathlib import glob import re import logging +import time # Internal Packages from 
src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update @@ -28,26 +29,33 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previou markdown_files = get_markdown_files(markdown_files, markdown_file_filter) # Extract Entries from specified Markdown files + start = time.time() extracted_entries, entry_to_file_map = extract_markdown_entries(markdown_files) - - # Convert Extracted Transactions to Dictionaries current_entries = convert_markdown_entries_to_maps(extracted_entries, entry_to_file_map) + end = time.time() + logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") # Identify, mark and merge any new entries with previous entries + start = time.time() if not previous_entries: entries_with_ids = list(enumerate(current_entries)) else: entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) + end = time.time() + logger.debug(f"Identify new or updated entries: {end - start} seconds") # Process Each Entry from All Notes Files + start = time.time() entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = convert_markdown_maps_to_jsonl(entries, entry_to_file_map) + jsonl_data = convert_markdown_maps_to_jsonl(entries) # Compress JSONL formatted Data if output_file.suffix == ".gz": compress_jsonl_data(jsonl_data, output_file) elif output_file.suffix == ".jsonl": dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write markdown entries to JSONL file: {end - start} seconds") return entries_with_ids From 3e1323971b9c059620ad5a77c54ecc3433003234 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 22:56:06 +0300 Subject: [PATCH 11/15] Stack function calls in jsonl converters to avoid unneeded variables --- src/processor/ledger/beancount_to_jsonl.py | 5 ++--- src/processor/markdown/markdown_to_jsonl.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 8a376663..3baa2d54 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -30,8 +30,7 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, prev # Extract Entries from specified Beancount files start = time.time() - extracted_transactions, transaction_to_file_map = extract_beancount_entries(beancount_files) - current_entries = convert_transactions_to_maps(extracted_transactions, transaction_to_file_map) + current_entries = convert_transactions_to_maps(*extract_beancount_transactions(beancount_files)) end = time.time() logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") @@ -84,7 +83,7 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None): return all_beancount_files -def extract_beancount_entries(beancount_files): +def extract_beancount_transactions(beancount_files): "Extract entries from specified Beancount files" # Initialize Regex for extracting Beancount Entries diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index ebfc9472..118e66dd 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -30,8 +30,7 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previou # Extract Entries from specified Markdown files start = time.time() - extracted_entries, 
entry_to_file_map = extract_markdown_entries(markdown_files) - current_entries = convert_markdown_entries_to_maps(extracted_entries, entry_to_file_map) + current_entries = convert_markdown_entries_to_maps(*extract_markdown_entries(markdown_files)) end = time.time() logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") From a7cf6c845884a4f45c40f9575f0d6486203eccf3 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 23:08:30 +0300 Subject: [PATCH 12/15] Use dictionary instead of list to track entry to file maps --- src/processor/ledger/beancount_to_jsonl.py | 8 ++++---- src/processor/markdown/markdown_to_jsonl.py | 8 ++++---- src/processor/org_mode/org_to_jsonl.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 3baa2d54..b2f26b7b 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -99,16 +99,16 @@ def extract_beancount_transactions(beancount_files): for entry in re.split(empty_newline, ledger_content, flags=re.MULTILINE) if re.match(transaction_regex, entry)] - transaction_to_file_map += [beancount_file]*len(transactions_per_file) + transaction_to_file_map += zip(transactions_per_file, [beancount_file]*len(transactions_per_file)) entries.extend(transactions_per_file) - return entries, transaction_to_file_map + return entries, dict(transaction_to_file_map) def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: "Convert each Beancount transaction into a dictionary" entry_maps = [] - for entry_id, entry in enumerate(entries): - entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'}) + for entry in entries: + entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry]}'}) logger.info(f"Converted {len(entries)} transactions to dictionaries") diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 118e66dd..7a91533b 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -98,17 +98,17 @@ def extract_markdown_entries(markdown_files): markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}' for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)] - entry_to_file_map += [markdown_file]*len(markdown_entries_per_file) + entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) - return entries, entry_to_file_map + return entries, dict(entry_to_file_map) def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]: "Convert each Markdown entries into a dictionary" entry_maps = [] - for entry_id, entry in enumerate(entries): - entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}) + for entry in entries: + entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'}) logger.info(f"Converted {len(entries)} markdown entries to dictionaries") diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 47dc54fa..f166810f 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -141,7 +141,7 @@ def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], 
entry_to_file_m return entry_maps -def convert_org_entries_to_jsonl(entries) -> str: +def convert_org_entries_to_jsonl(entries: list[dict]) -> str: "Convert each Org-Mode entry to JSON and collate as JSONL" return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) From 2e1bbe0cac9381685142890572bbaac1284d146e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 23:55:09 +0300 Subject: [PATCH 13/15] Fix striping empty escape sequences from strings - Fix log message on jsonl write --- src/processor/ledger/beancount_to_jsonl.py | 2 +- src/processor/markdown/markdown_to_jsonl.py | 3 ++- src/utils/constants.py | 2 +- src/utils/jsonl.py | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index b2f26b7b..3af45c7a 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -88,7 +88,7 @@ def extract_beancount_transactions(beancount_files): # Initialize Regex for extracting Beancount Entries transaction_regex = r'^\n?\d{4}-\d{2}-\d{2} [\*|\!] ' - empty_newline = f'^[{empty_escape_sequences}]*$' + empty_newline = f'^[\n\r\t\ ]*$' entries = [] transaction_to_file_map = [] diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 7a91533b..e7fc2779 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -97,7 +97,8 @@ def extract_markdown_entries(markdown_files): markdown_content = f.read() markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}' for entry - in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)] + in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE) + if entry.strip(empty_escape_sequences) != ''] entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file)) entries.extend(markdown_entries_per_file) diff --git a/src/utils/constants.py b/src/utils/constants.py index 84c3dfbb..8b443944 100644 --- a/src/utils/constants.py +++ b/src/utils/constants.py @@ -2,7 +2,7 @@ from pathlib import Path app_root_directory = Path(__file__).parent.parent.parent web_directory = app_root_directory / 'src/interface/web/' -empty_escape_sequences = r'\n|\r\t ' +empty_escape_sequences = '\n|\r|\t| ' # default app config to use default_config = { diff --git a/src/utils/jsonl.py b/src/utils/jsonl.py index 77b5af11..8a034acd 100644 --- a/src/utils/jsonl.py +++ b/src/utils/jsonl.py @@ -54,4 +54,4 @@ def compress_jsonl_data(jsonl_data, output_path): with gzip.open(output_path, 'wt') as gzip_file: gzip_file.write(jsonl_data) - logger.info(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}') \ No newline at end of file + logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}') \ No newline at end of file From d3267554ae7a7190dba7ebc471e27a23df90bf9a Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 10 Sep 2022 23:57:17 +0300 Subject: [PATCH 14/15] Add basic tests for markdown to jsonl conversion --- tests/test_markdown_to_jsonl.py | 81 +++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 tests/test_markdown_to_jsonl.py diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py new file mode 100644 index 00000000..712053e8 --- /dev/null +++ b/tests/test_markdown_to_jsonl.py @@ -0,0 +1,81 @@ +# Standard 
Packages +import json + +# Internal Packages +from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps + + +def test_markdown_file_with_no_headings_to_jsonl(tmp_path): + "Convert files with no heading to jsonl." + # Arrange + entry = f''' + - Bullet point 1 + - Bullet point 2 + ''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_single_markdown_entry_to_jsonl(tmp_path): + "Convert markdown entry from single file to jsonl." + # Arrange + entry = f'''### Heading + \t\r + Body Line 1 + ''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_multiple_markdown_entries_to_jsonl(tmp_path): + "Convert multiple markdown entries from single file to jsonl." + # Arrange + entry = f''' +### Heading 1 + \t\r + Heading 1 Body Line 1 +### Heading 2 + \t\r + Heading 2 Body Line 2 + ''' + markdownfile = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Markdown files + entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile]) + + # Process Each Entry from All Notes Files + jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + +# Helper Functions +def create_file(tmp_path, entry, filename="test.md"): + markdown_file = tmp_path / f"notes/{filename}" + markdown_file.parent.mkdir() + markdown_file.touch() + markdown_file.write_text(entry) + return markdown_file From 9b2845de06d1c2464dc88f7b1e2df92146604837 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 11 Sep 2022 00:16:02 +0300 Subject: [PATCH 15/15] Add basic tests for beancount to jsonl conversion --- tests/test_beancount_to_jsonl.py | 84 ++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tests/test_beancount_to_jsonl.py diff --git a/tests/test_beancount_to_jsonl.py b/tests/test_beancount_to_jsonl.py new file mode 100644 index 00000000..e368e194 --- /dev/null +++ b/tests/test_beancount_to_jsonl.py @@ -0,0 +1,84 @@ +# Standard Packages +import json + +# Internal Packages +from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl + + +def test_no_transactions_in_file(tmp_path): + "Handle file with no transactions." 
+ # Arrange + entry = f''' + - Bullet point 1 + - Bullet point 2 + ''' + beancount_file = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Beancount files + entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file]) + + # Process Each Entry from All Beancount Files + jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 0 + + +def test_single_beancount_transaction_to_jsonl(tmp_path): + "Convert transaction from single file to jsonl." + # Arrange + entry = f''' +1984-04-01 * "Payee" "Narration" +Expenses:Test:Test 1.00 KES +Assets:Test:Test -1.00 KES + ''' + beancount_file = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Beancount files + entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file]) + + # Process Each Entry from All Beancount Files + jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 1 + + +def test_multiple_transactions_to_jsonl(tmp_path): + "Convert multiple transactions from single file to jsonl." + # Arrange + entry = f''' +1984-04-01 * "Payee" "Narration" +Expenses:Test:Test 1.00 KES +Assets:Test:Test -1.00 KES +\t\r +1984-04-01 * "Payee" "Narration" +Expenses:Test:Test 1.00 KES +Assets:Test:Test -1.00 KES +''' + + beancount_file = create_file(tmp_path, entry) + + # Act + # Extract Entries from specified Beancount files + entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file]) + + # Process Each Entry from All Beancount Files + jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map)) + jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + + # Assert + assert len(jsonl_data) == 2 + + +# Helper Functions +def create_file(tmp_path, entry, filename="ledger.beancount"): + beancount_file = tmp_path / f"notes/{filename}" + beancount_file.parent.mkdir() + beancount_file.touch() + beancount_file.write_text(entry) + return beancount_file
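
For reference, below is a condensed, dependency-free sketch of the incremental update flow this patch series builds up: hash only the compiled form of each entry, keep the previous ids of unchanged entries, and encode embeddings only for new entries. The names mirror mark_entries_for_update and compute_embeddings from the patches above, but the list-based embeddings and the toy encode callable are illustrative stand-ins for the torch tensors and bi-encoder used in the actual code.

import hashlib


def hash_entry(entry: dict) -> str:
    "Hash only the compiled form of an entry, so raw-form changes like shifted LINE: numbers do not mark it as updated"
    return hashlib.md5(bytes(entry['compiled'], encoding='utf-8')).hexdigest()


def mark_entries_for_update(current_entries, previous_entries):
    "Return (previous_id, entry) pairs; an id of None marks a new entry whose embedding still needs to be encoded"
    current_hashes = list(map(hash_entry, current_entries))
    previous_hashes = list(map(hash_entry, previous_entries))
    hash_to_current = dict(zip(current_hashes, current_entries))
    hash_to_previous = dict(zip(previous_hashes, previous_entries))

    new_hashes = set(current_hashes) - set(previous_hashes)
    existing_hashes = set(current_hashes) & set(previous_hashes)

    new_entries = [(None, hash_to_current[h]) for h in new_hashes]
    existing_entries = sorted(
        [(previous_hashes.index(h), hash_to_previous[h]) for h in existing_hashes],
        key=lambda e: e[0])
    return existing_entries + new_entries


def update_embeddings(entries_with_ids, previous_embeddings, encode):
    "Reuse embeddings of unchanged entries and encode only the new ones"
    reused = [previous_embeddings[id] for id, _ in entries_with_ids if id is not None]
    encoded = [encode(entry['compiled']) for id, entry in entries_with_ids if id is None]
    return reused + encoded


# Toy usage: the unchanged note keeps its old embedding, the new note gets encoded
previous = [{'compiled': 'unchanged note'}, {'compiled': 'deleted note'}]
current = [{'compiled': 'unchanged note'}, {'compiled': 'new note'}]
entries_with_ids = mark_entries_for_update(current, previous)
embeddings = update_embeddings(entries_with_ids, previous_embeddings=[[0.1], [0.2]], encode=lambda text: [float(len(text))])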