From 5aad297286bd95247123d05b0c66f5a58630a870 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Thu, 21 Jul 2022 02:53:18 +0400
Subject: [PATCH] Reuse logic to extract entries across symmetric, asymmetric
 search

Now that the logic to compile entries is in the processor layer, the
extract_entries method is standard across (text) search types.

Extract the load_jsonl function into a shared utility helper.
Use it in both the symmetric and asymmetric search types.
---
 src/processor/ledger/beancount_to_jsonl.py | 27 ------------------
 src/search_type/asymmetric.py              | 31 ++++----------------
 src/search_type/symmetric_ledger.py        |  4 +--
 src/utils/helpers.py                       | 33 +++++++++++++++++++++-
 4 files changed, 39 insertions(+), 56 deletions(-)

diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index aeb93653..ede42686 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -9,7 +9,6 @@ import gzip
 import re
 
 # Internal Packages
-from src.processor.org_mode import orgnode
 from src.utils.helpers import get_absolute_path, is_none_or_empty
 from src.utils.constants import empty_escape_sequences
 
@@ -58,32 +57,6 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
         print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
 
 
-def load_jsonl(input_path, verbose=0):
-    "Read List of JSON objects from JSON line file"
-    # Initialize Variables
-    data = []
-    jsonl_file = None
-
-    # Open JSONL file
-    if input_path.suffix == ".gz":
-        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
-    elif input_path.suffix == ".jsonl":
-        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
-
-    # Read JSONL file
-    for line in jsonl_file:
-        data.append(json.loads(line.strip(empty_escape_sequences)))
-
-    # Close JSONL file
-    jsonl_file.close()
-
-    # Log JSONL entries loaded
-    if verbose > 0:
-        print(f'Loaded {len(data)} records from {input_path}')
-
-    return data
-
-
 def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0):
     "Get Beancount files to process"
     absolute_beancount_files, filtered_beancount_files = set(), set()
diff --git a/src/search_type/asymmetric.py b/src/search_type/asymmetric.py
index 8e6e9db4..da2f34dc 100644
--- a/src/search_type/asymmetric.py
+++ b/src/search_type/asymmetric.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
 
 # Standard Packages
-import json
-import gzip
 import argparse
 import pathlib
 from copy import deepcopy
@@ -12,11 +10,10 @@ import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
 
 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
+from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
-from src.utils.constants import empty_escape_sequences
 
 
 def initialize_model(search_config: AsymmetricSearchConfig):
@@ -43,27 +40,9 @@ def initialize_model(search_config: AsymmetricSearchConfig):
 
 def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
-    entries = []
-    jsonl_file = None
-
-    # Open File
-    if notesfile.suffix == ".gz":
-        jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
-    elif notesfile.suffix == ".jsonl":
-        jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
-
-    # Read File
-    for line in jsonl_file:
-        note = json.loads(line.strip(empty_escape_sequences))
-        entries.append({'compiled': note['compiled'], 'raw': note["raw"]})
-
-    # Close File
-    jsonl_file.close()
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]
 
 
 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, device='cpu', verbose=0):
@@ -194,4 +173,4 @@ if __name__ == '__main__':
         hits = query(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k)
 
         # render results
-        render_results(hits, entries, count=args.results_count)
+        render_results(hits, entries, count=args.results_count)
\ No newline at end of file
diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py
index 1e0e4033..616a86e7 100644
--- a/src/search_type/symmetric_ledger.py
+++ b/src/search_type/symmetric_ledger.py
@@ -8,8 +8,8 @@ import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
 
 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
+from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
 
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
index 3c19b935..b19deb6f 100644
--- a/src/utils/helpers.py
+++ b/src/utils/helpers.py
@@ -1,7 +1,12 @@
 # Standard Packages
+import json
+import gzip
 import pathlib
 from os.path import join
 
+# Internal Packages
+from src.utils.constants import empty_escape_sequences
+
 
 def is_none_or_empty(item):
     return item == None or (hasattr(item, '__iter__') and len(item) == 0)
@@ -52,4 +57,30 @@ def load_model(model_name, model_dir, model_type):
         if model_path is not None:
             model.save(model_path)
 
-    return model
\ No newline at end of file
+    return model
+
+
+def load_jsonl(input_path, verbose=0):
+    "Read List of JSON objects from JSON line file"
+    # Initialize Variables
+    data = []
+    jsonl_file = None
+
+    # Open JSONL file
+    if input_path.suffix == ".gz":
+        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
+    elif input_path.suffix == ".jsonl":
+        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
+
+    # Read JSONL file
+    for line in jsonl_file:
+        data.append(json.loads(line.strip(empty_escape_sequences)))
+
+    # Close JSONL file
+    jsonl_file.close()
+
+    # Log JSONL entries loaded
+    if verbose > 0:
+        print(f'Loaded {len(data)} records from {input_path}')
+
+    return data
\ No newline at end of file