From 76cd63f4bd79f59f5e240c457b2cb5ba28475013 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 26 Feb 2022 16:54:08 -0500
Subject: [PATCH 1/5] Fix count of processed jsonl entries shown to user by
 ledger processor

Count lines not chars
---
 src/processor/ledger/beancount_to_jsonl.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 0347d34b..1cac6b85 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -43,7 +43,8 @@ def dump_jsonl(jsonl_data, output_path, verbose=0):
         f.write(jsonl_data)
 
     if verbose > 0:
-        print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
+        jsonl_entries = len(jsonl_data.split('\n'))
+        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')
 
 
 def compress_jsonl_data(jsonl_data, output_path, verbose=0):
@@ -51,7 +52,8 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
         gzip_file.write(jsonl_data)
 
     if verbose > 0:
-        print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
+        jsonl_entries = len(jsonl_data.split('\n'))
+        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
 
 
 def load_jsonl(input_path, verbose=0):

From 248aa632c0061f471b152dd68ef30797b9f976d4 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 26 Feb 2022 16:56:13 -0500
Subject: [PATCH 2/5] Do not throw warning for beancount files with .beancount
 extension

---
 src/processor/ledger/beancount_to_jsonl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 1cac6b85..21dbcfb9 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -81,7 +81,10 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos
 
     all_beancount_files = absolute_beancount_files | filtered_beancount_files
 
-    files_with_non_beancount_extensions = {beancount_file for beancount_file in all_beancount_files if not beancount_file.endswith(".bean")}
+    files_with_non_beancount_extensions = {beancount_file
+                                    for beancount_file
+                                    in all_beancount_files
+                                    if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")}
     if any(files_with_non_beancount_extensions):
         print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
 

From 502c68d4f8cc67fb52d11853125d69d56772e99b Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 26 Feb 2022 17:23:02 -0500
Subject: [PATCH 3/5] Remove trailing escape sequence in ledger search
 response entries

- Fix loading entries from jsonl in extract_entries method
  - Only extract Title from jsonl of each entry
    This is the only thing written to the jsonl for symmetric ledger
  - This fixes the trailing escape sequence in loaded entries
  - Remove the need for semantic-search.el response reader to do pointless complicated cleanup

- Make symmetric_ledger:extract_entries use beancount_to_jsonl:load_jsonl
  Both methods were doing similar work

- Make load_jsonl handle loading entries from both gzip and uncompressed jsonl
---
 src/processor/ledger/beancount_to_jsonl.py | 20 +++++++++++++++++---
 src/search_type/symmetric_ledger.py        | 17 ++++-------------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 21dbcfb9..99c9d5d5 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -58,11 +58,25 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
 
 def load_jsonl(input_path, verbose=0):
     "Read List of JSON objects from JSON line file"
+    # Initialize Variables
     data = []
-    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f:
-        for line in f:
-            data.append(json.loads(line.rstrip('\n|\r')))
+    jsonl_file = None
+    escape_sequences = '\n|\r\t '
 
+    # Open JSONL file
+    if input_path.suffix == ".gz":
+        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
+    elif input_path.suffix == ".jsonl":
+        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
+
+    # Read JSONL file
+    for line in jsonl_file:
+        data.append(json.loads(line.strip(escape_sequences)))
+
+    # Close JSONL file
+    jsonl_file.close()
+
+    # Log JSONL entries loaded
     if verbose > 0:
         print(f'Loaded {len(data)} records from {input_path}')
 
diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py
index f63a1c98..5243c1aa 100644
--- a/src/search_type/symmetric_ledger.py
+++ b/src/search_type/symmetric_ledger.py
@@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
 
 # Internal Packages
 from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
 
@@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig):
 
 def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
-    entries = []
-    with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
-        for line in jsonl:
-            note = json.loads(line.strip())
-
-            note_string = f'{note["Title"]} \t {note["Tags"] if "Tags" in note else ""} \n {note["Body"] if "Body" in note else ""}'
-            entries.extend([note_string])
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [f'{entry["Title"]}'
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]
 
 
 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):

From b3ac2dd7300b6273fadbfeace53c7cfe5c01ddcb Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 26 Feb 2022 17:33:10 -0500
Subject: [PATCH 4/5] Improve Results Rendered on Emacs from Semantic Search on
 Ledger

- Add search query to top of buffer as Beancount comment
- Remove trailing ) from response
- Separate entries by empty line
- Load beancount-mode in semantic search on ledger buffer
---
 src/interface/emacs/semantic-search.el | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/interface/emacs/semantic-search.el b/src/interface/emacs/semantic-search.el
index 0b961262..753ff635 100644
--- a/src/interface/emacs/semantic-search.el
+++ b/src/interface/emacs/semantic-search.el
@@ -69,16 +69,20 @@
             (lambda (args) (format "\n** \n  [[%s]]" (cdr (assoc 'Entry args))))
             json-response))))
 
-(defun semantic-search--extract-entries-as-ledger (json-response)
+(defun semantic-search--extract-entries-as-ledger (json-response query)
   "Convert json response from API to ledger entries"
   ;; remove leading (, ) or SPC from extracted entries string
   (replace-regexp-in-string
-   "^[\(\) ]" ""
-   ;; extract entries from response as single string and convert to entries
-   (format "%s"
-           (mapcar
-            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
-            json-response))))
+   "[\(\) ]$" ""
+   (replace-regexp-in-string
+    "^[\(\) ]" ""
+    ;; extract entries from response as single string and convert to entries
+    (format ";; %s\n\n%s\n"
+            query
+            (mapcar
+             (lambda (args)
+               (format "%s\n\n" (cdr (assoc 'Entry args))))
+             json-response)))))
 
 (defun semantic-search--buffer-name-to-search-type (buffer-name)
   (let ((file-extension (file-name-extension buffer-name)))
@@ -112,10 +116,11 @@
         (erase-buffer)
         (insert
          (cond ((or (equal search-type "notes") (equal search-type "music")) (semantic-search--extract-entries-as-org json-response query))
-               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response))
+               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response query))
                ((equal search-type "image") (semantic-search--extract-entries-as-images json-response query))
                (t (format "%s" json-response)))))
       (cond ((equal search-type "notes") (org-mode))
+            ((equal search-type "ledger") (beancount-mode))
             ((equal search-type "music") (progn (org-mode)
                                                 (org-music-mode)))
             ((equal search-type "image") (progn (org-mode)

From b68558651bbe115b9452266760f7472e150f8542 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Sat, 26 Feb 2022 17:36:30 -0500
Subject: [PATCH 5/5] Improve Extraction of Beancount Entries

- Only extract entries starting with YYYY-MM-DD from Beancount
- Strip Trailing Escape Sequences from Entries
---
 src/processor/ledger/beancount_to_jsonl.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 99c9d5d5..aa09ffa3 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -6,6 +6,7 @@ import argparse
 import pathlib
 import glob
 import gzip
+import re
 
 # Internal Packages
 from src.processor.org_mode import orgnode
@@ -110,11 +111,19 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos
 
 def extract_beancount_entries(beancount_files):
     "Extract entries from specified Beancount files"
+
+    # Initialize Regex for extracting Beancount Entries
+    date_regex = r'^\n?\d{4}-\d{2}-\d{2}'
+    empty_newline = r'^[\n\r\t ]*$'
+
     entries = []
     for beancount_file in beancount_files:
         with open(beancount_file) as f:
-            entries.extend(
-                f.read().split('\n\n'))
+            ledger_content = f.read()
+            entries.extend([entry.strip('\n|\r|\t| ')
+               for entry
+               in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
+               if re.match(date_regex, entry)])
 
     return entries