From 76cd63f4bd79f59f5e240c457b2cb5ba28475013 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 26 Feb 2022 16:54:08 -0500 Subject: [PATCH 1/5] Fix count of processed jsonl entries shown to user by ledger processor Count lines not chars --- src/processor/ledger/beancount_to_jsonl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 0347d34b..1cac6b85 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -43,7 +43,8 @@ def dump_jsonl(jsonl_data, output_path, verbose=0): f.write(jsonl_data) if verbose > 0: - print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}') + jsonl_entries = len(jsonl_data.split('\n')) + print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}') def compress_jsonl_data(jsonl_data, output_path, verbose=0): @@ -51,7 +52,8 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0): gzip_file.write(jsonl_data) if verbose > 0: - print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}') + jsonl_entries = len(jsonl_data.split('\n')) + print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}') def load_jsonl(input_path, verbose=0): From 248aa632c0061f471b152dd68ef30797b9f976d4 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 26 Feb 2022 16:56:13 -0500 Subject: [PATCH 2/5] Do not throw warning for beancount files with .beancount extension --- src/processor/ledger/beancount_to_jsonl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 1cac6b85..21dbcfb9 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -81,7 +81,10 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos all_beancount_files = 
absolute_beancount_files | filtered_beancount_files - files_with_non_beancount_extensions = {beancount_file for beancount_file in all_beancount_files if not beancount_file.endswith(".bean")} + files_with_non_beancount_extensions = {beancount_file + for beancount_file + in all_beancount_files + if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")} if any(files_with_non_beancount_extensions): print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}") From 502c68d4f8cc67fb52d11853125d69d56772e99b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 26 Feb 2022 17:23:02 -0500 Subject: [PATCH 3/5] Remove trailing escape sequence in ledger search response entries - Fix loading entries from jsonl in extract_entries method - Only extract Title from jsonl of each entry This is the only thing written to the jsonl for symmetric ledger - This fixes the trailing escape seq in loaded entries - Remove the need for semantic-search.el response reader to do pointless complicated cleanup - Make symmetric_ledger:extract_entries use beancount_to_jsonl:load_jsonl Both methods were doing similar work - Make load_jsonl handle loading entries from both gzip and uncompressed jsonl --- src/processor/ledger/beancount_to_jsonl.py | 20 +++++++++++++++++--- src/search_type/symmetric_ledger.py | 17 ++++------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 21dbcfb9..99c9d5d5 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -58,11 +58,25 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0): def load_jsonl(input_path, verbose=0): "Read List of JSON objects from JSON line file" + # Initialize Variables data = [] - with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f: - for line in f: - 
data.append(json.loads(line.rstrip('\n|\r'))) + jsonl_file = None + escape_sequences = '\n|\r\t ' + # Open JSONL file + if input_path.suffix == ".gz": + jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8') + elif input_path.suffix == ".jsonl": + jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8') + + # Read JSONL file + for line in jsonl_file: + data.append(json.loads(line.strip(escape_sequences))) + + # Close JSONL file + jsonl_file.close() + + # Log JSONL entries loaded if verbose > 0: print(f'Loaded {len(data)} records from {input_path}') diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py index f63a1c98..5243c1aa 100644 --- a/src/search_type/symmetric_ledger.py +++ b/src/search_type/symmetric_ledger.py @@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util # Internal Packages from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model -from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl +from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl from src.utils.config import TextSearchModel from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig @@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig): def extract_entries(notesfile, verbose=0): "Load entries from compressed jsonl" - entries = [] - with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl: - for line in jsonl: - note = json.loads(line.strip()) - - note_string = f'{note["Title"]} \t {note["Tags"] if "Tags" in note else ""} \n {note["Body"] if "Body" in note else ""}' - entries.extend([note_string]) - - if verbose > 0: - print(f"Loaded {len(entries)} entries from {notesfile}") - - return entries + return [f'{entry["Title"]}' + for entry + in load_jsonl(notesfile, verbose=verbose)] def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, 
verbose=0): From b3ac2dd7300b6273fadbfeace53c7cfe5c01ddcb Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 26 Feb 2022 17:33:10 -0500 Subject: [PATCH 4/5] Improve Results Rendered on Emacs from Semantic Search on Ledger - Add search query to top of buffer as Beancount comment - Remove trailing ) from response - Separate entries by empty line - Load beancount-mode in semantic search on ledger buffer --- src/interface/emacs/semantic-search.el | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/interface/emacs/semantic-search.el b/src/interface/emacs/semantic-search.el index 0b961262..753ff635 100644 --- a/src/interface/emacs/semantic-search.el +++ b/src/interface/emacs/semantic-search.el @@ -69,16 +69,20 @@ (lambda (args) (format "\n** \n [[%s]]" (cdr (assoc 'Entry args)))) json-response)))) -(defun semantic-search--extract-entries-as-ledger (json-response) +(defun semantic-search--extract-entries-as-ledger (json-response query) "Convert json response from API to ledger entries" ;; remove leading (, ) or SPC from extracted entries string (replace-regexp-in-string - "^[\(\) ]" "" - ;; extract entries from response as single string and convert to entries - (format "%s" - (mapcar - (lambda (args) (format "* %s" (cdr (assoc 'Entry args)))) - json-response)))) + "[\(\) ]$" "" + (replace-regexp-in-string + "^[\(\) ]" "" + ;; extract entries from response as single string and convert to entries + (format ";; %s\n\n%s\n" + query + (mapcar + (lambda (args) + (format "%s\n\n" (cdr (assoc 'Entry args)))) + json-response))))) (defun semantic-search--buffer-name-to-search-type (buffer-name) (let ((file-extension (file-name-extension buffer-name))) @@ -112,10 +116,11 @@ (erase-buffer) (insert (cond ((or (equal search-type "notes") (equal search-type "music")) (semantic-search--extract-entries-as-org json-response query)) - ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response)) + ((equal 
search-type "ledger") (semantic-search--extract-entries-as-ledger json-response query)) ((equal search-type "image") (semantic-search--extract-entries-as-images json-response query)) (t (format "%s" json-response))))) (cond ((equal search-type "notes") (org-mode)) + ((equal search-type "ledger") (beancount-mode)) ((equal search-type "music") (progn (org-mode) (org-music-mode))) ((equal search-type "image") (progn (org-mode) From b68558651bbe115b9452266760f7472e150f8542 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 26 Feb 2022 17:36:30 -0500 Subject: [PATCH 5/5] Improve Extraction of Beancount Entries - Only extract entries starting with YYYY-MM-DD from Beancount - Strip Trailing Escape Sequences from Entries --- src/processor/ledger/beancount_to_jsonl.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 99c9d5d5..aa09ffa3 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -6,6 +6,7 @@ import argparse import pathlib import glob import gzip +import re # Internal Packages from src.processor.org_mode import orgnode @@ -110,11 +111,19 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos def extract_beancount_entries(beancount_files): "Extract entries from specified Beancount files" + + # Initialize Regex for extracting Beancount Entries + date_regex = r'^\n?\d{4}-\d{2}-\d{2}' + empty_newline = r'^[\n\r\t ]*$' + entries = [] for beancount_file in beancount_files: with open(beancount_file) as f: - entries.extend( - f.read().split('\n\n')) + ledger_content = f.read() + entries.extend([entry.strip('\n|\r|\t| ') + for entry + in re.split(empty_newline, ledger_content, flags=re.MULTILINE) + if re.match(date_regex, entry)]) return entries