mirror of https://github.com/khoj-ai/khoj.git, synced 2025-02-17 08:04:21 +00:00

Merge pull request #25 from debanjum/users/debanjum/improve-semantic-search-on-ledger

Improve Extraction and Rendering of Semantic Search on Ledger

Commit 6a84ca965a: 3 changed files with 53 additions and 29 deletions
@@ -69,16 +69,20 @@
    (lambda (args) (format "\n** \n [[%s]]" (cdr (assoc 'Entry args))))
    json-response))))

-(defun semantic-search--extract-entries-as-ledger (json-response)
+(defun semantic-search--extract-entries-as-ledger (json-response query)
   "Convert json response from API to ledger entries"
   ;; remove leading (, ) or SPC from extracted entries string
+  (replace-regexp-in-string
+   "[\(\) ]$" ""
   (replace-regexp-in-string
    "^[\(\) ]" ""
    ;; extract entries from response as single string and convert to entries
-   (format "%s"
+   (format ";; %s\n\n%s\n"
+           query
            (mapcar
-            (lambda (args) (format "* %s" (cdr (assoc 'Entry args))))
-            json-response))))
+            (lambda (args)
+              (format "%s\n\n" (cdr (assoc 'Entry args))))
+            json-response)))))

 (defun semantic-search--buffer-name-to-search-type (buffer-name)
   (let ((file-extension (file-name-extension buffer-name)))
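The change above passes the search query through to the ledger extractor: results are now rendered with the query as a leading ;; comment and each entry separated by blank lines, instead of org-style "* entry" headings. As a rough, non-authoritative sketch (in Python, purely for illustration), assuming the API response is a list of objects with an Entry field, the rendering amounts to:

# Illustrative Python sketch of the rendering the updated elisp function produces.
# Assumes json_response is a list of dicts like {"Entry": "2021-08-01 * ..."} (hypothetical data).
def render_ledger_results(json_response, query):
    header = f";; {query}\n\n"                                    # query shown as a ledger comment
    body = "".join(f"{item['Entry']}\n\n" for item in json_response)
    return header + body

The elisp version additionally strips stray parentheses and spaces left over from printing the mapcar result, which the sketch above does not need.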
@@ -112,10 +116,11 @@
         (erase-buffer)
         (insert
          (cond ((or (equal search-type "notes") (equal search-type "music")) (semantic-search--extract-entries-as-org json-response query))
-               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response))
+               ((equal search-type "ledger") (semantic-search--extract-entries-as-ledger json-response query))
                ((equal search-type "image") (semantic-search--extract-entries-as-images json-response query))
                (t (format "%s" json-response)))))
         (cond ((equal search-type "notes") (org-mode))
+              ((equal search-type "ledger") (beancount-mode))
               ((equal search-type "music") (progn (org-mode)
                                                   (org-music-mode)))
               ((equal search-type "image") (progn (org-mode)
@@ -6,6 +6,7 @@ import argparse
 import pathlib
 import glob
 import gzip
+import re

 # Internal Packages
 from src.processor.org_mode import orgnode
@@ -43,7 +44,8 @@ def dump_jsonl(jsonl_data, output_path, verbose=0):
         f.write(jsonl_data)

     if verbose > 0:
-        print(f'Wrote {len(jsonl_data)} lines to jsonl at {output_path}')
+        jsonl_entries = len(jsonl_data.split('\n'))
+        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')


 def compress_jsonl_data(jsonl_data, output_path, verbose=0):
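The logging tweak above corrects what gets counted: len(jsonl_data) is the length of the serialized string in characters, while the number of JSONL records is the number of newline-separated lines. A minimal illustration:

# Minimal sketch of the difference between the old and new log counts.
jsonl_data = '{"a": 1}\n{"a": 2}'
print(len(jsonl_data))              # 17 -> characters in the string, not a record count
print(len(jsonl_data.split('\n')))  # 2  -> number of JSONL lines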
@@ -51,16 +53,31 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
         gzip_file.write(jsonl_data)

     if verbose > 0:
-        print(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
+        jsonl_entries = len(jsonl_data.split('\n'))
+        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')


 def load_jsonl(input_path, verbose=0):
     "Read List of JSON objects from JSON line file"
+    # Initialize Variables
     data = []
-    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f:
-        for line in f:
-            data.append(json.loads(line.rstrip('\n|\r')))
+    jsonl_file = None
+    escape_sequences = '\n|\r\t '
+
+    # Open JSONL file
+    if input_path.suffix == ".gz":
+        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
+    elif input_path.suffix == ".jsonl":
+        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
+
+    # Read JSONL file
+    for line in jsonl_file:
+        data.append(json.loads(line.strip(escape_sequences)))
+
+    # Close JSONL file
+    jsonl_file.close()
+
+    # Log JSONL entries loaded
     if verbose > 0:
         print(f'Loaded {len(data)} records from {input_path}')
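load_jsonl now chooses its opener from the file suffix, so plain and gzip-compressed JSONL go through one code path. Note it relies on input_path having a .suffix attribute, i.e. a pathlib.Path rather than a plain string; anything other than .jsonl or .gz leaves jsonl_file as None and fails on iteration. A usage sketch with hypothetical paths:

from pathlib import Path

# Hypothetical file paths; load_jsonl dispatches on the suffix.
plain_entries = load_jsonl(Path('ledger.jsonl'))        # read with open()
gzipped_entries = load_jsonl(Path('ledger.jsonl.gz'))   # read with gzip.open()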
@@ -79,7 +96,10 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos

     all_beancount_files = absolute_beancount_files | filtered_beancount_files

-    files_with_non_beancount_extensions = {beancount_file for beancount_file in all_beancount_files if not beancount_file.endswith(".bean")}
+    files_with_non_beancount_extensions = {beancount_file
+                                           for beancount_file
+                                           in all_beancount_files
+                                           if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")}
     if any(files_with_non_beancount_extensions):
         print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
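The warning now treats both .bean and .beancount as valid ledger extensions before flagging anything else. For example:

# Illustrative file names showing what the updated check flags.
all_beancount_files = {'finances.bean', 'personal.beancount', 'notes.org'}
flagged = {f for f in all_beancount_files
           if not f.endswith(".bean") and not f.endswith(".beancount")}
print(flagged)  # {'notes.org'}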
@@ -91,11 +111,19 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbos

 def extract_beancount_entries(beancount_files):
     "Extract entries from specified Beancount files"

+    # Initialize Regex for extracting Beancount Entries
+    date_regex = r'^\n?\d{4}-\d{2}-\d{2}'
+    empty_newline = r'^[\n\r\t ]*$'
+
     entries = []
     for beancount_file in beancount_files:
         with open(beancount_file) as f:
-            entries.extend(
-                f.read().split('\n\n'))
+            ledger_content = f.read()
+            entries.extend([entry.strip('\n|\r|\t| ')
+                            for entry
+                            in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
+                            if re.match(date_regex, entry)])

     return entries
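Rather than splitting on literal double newlines, entries are now split on any blank (whitespace-only) line and kept only if they begin with a date, which filters out non-transaction content such as option directives. A small sketch of the two regexes on made-up ledger text (relies on Python 3.7+, where re.split handles the zero-width blank-line matches):

import re

date_regex = r'^\n?\d{4}-\d{2}-\d{2}'
empty_newline = r'^[\n\r\t ]*$'

# Made-up ledger content: one option directive and two dated transactions.
ledger_content = (
    'option "title" "Personal"\n'
    '\n'
    '2021-08-01 * "Grocer" "Weekly shop"\n'
    '  Expenses:Food  20.00 USD\n'
    '\n'
    '2021-08-02 * "Landlord" "Rent"\n'
    '  Expenses:Rent  800.00 USD\n'
)

entries = [entry.strip('\n|\r|\t| ')
           for entry in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
           if re.match(date_regex, entry)]
print(len(entries))  # 2 -> only the dated transaction blocks are kept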
@@ -11,7 +11,7 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util

 # Internal Packages
 from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
@@ -40,18 +40,9 @@ def initialize_model(search_config: SymmetricSearchConfig):

 def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
-    entries = []
-    with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
-        for line in jsonl:
-            note = json.loads(line.strip())
-
-            note_string = f'{note["Title"]} \t {note["Tags"] if "Tags" in note else ""} \n {note["Body"] if "Body" in note else ""}'
-            entries.extend([note_string])
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [f'{entry["Title"]}'
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]


 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, verbose=0):
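With file handling delegated to load_jsonl, extract_entries in the ledger search module reduces to projecting the Title field out of each loaded record. Usage, with a hypothetical path:

from pathlib import Path

# Hypothetical compressed jsonl produced by the beancount processor.
entries = extract_entries(Path('ledger.jsonl.gz'), verbose=1)
# -> a list of the Title strings, one per JSONL record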