mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-28 01:45:07 +01:00
Consolidate jsonl helper methods in a single file under utils module
This commit is contained in:
parent
de726c4b6c
commit
0917f1574d
6 changed files with 57 additions and 84 deletions
|
@ -5,12 +5,12 @@ import json
|
|||
import argparse
|
||||
import pathlib
|
||||
import glob
|
||||
import gzip
|
||||
import re
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
|
||||
|
||||
# Define Functions
|
||||
|
@ -38,25 +38,6 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, verb
|
|||
return entries
|
||||
|
||||
|
||||
def dump_jsonl(jsonl_data, output_path, verbose=0):
    "Write List of JSON objects to JSON line file"
    jsonl_path = get_absolute_path(output_path)
    with open(jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.write(jsonl_data)

    if verbose > 0:
        # Entries are newline-separated, so counting splits gives the line count
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')
|
||||
|
||||
|
||||
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    "Write JSONL string to a gzip-compressed file at output_path"
    # Pass encoding explicitly: gzip.open in text mode otherwise uses the
    # locale's preferred encoding, unlike dump_jsonl which writes utf-8
    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose > 0:
        # Entries are newline-separated, so counting splits gives the line count
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
|
||||
|
||||
|
||||
def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0):
|
||||
"Get Beancount files to process"
|
||||
absolute_beancount_files, filtered_beancount_files = set(), set()
|
||||
|
|
|
@ -5,12 +5,12 @@ import json
|
|||
import argparse
|
||||
import pathlib
|
||||
import glob
|
||||
import gzip
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.org_mode import orgnode
|
||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
|
||||
|
||||
# Define Functions
|
||||
|
@ -37,35 +37,6 @@ def org_to_jsonl(org_files, org_file_filter, output_file, verbose=0):
|
|||
|
||||
return entries
|
||||
|
||||
def dump_jsonl(jsonl_data, output_path, verbose=0):
    "Write List of JSON objects to JSON line file"
    with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f:
        f.write(jsonl_data)

    if verbose > 0:
        # Count newline-separated entries; len(jsonl_data) alone would report
        # the character count, not the number of lines written
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')
|
||||
|
||||
|
||||
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    "Write JSONL string to a gzip-compressed file at output_path"
    # Pass encoding explicitly: gzip.open in text mode otherwise uses the
    # locale's preferred encoding, unlike dump_jsonl which writes utf-8
    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose > 0:
        # Count newline-separated entries; len(jsonl_data) alone would report
        # the character count, not the number of lines written
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
|
||||
|
||||
|
||||
def load_jsonl(input_path, verbose=0):
    "Read List of JSON objects from JSON line file"
    data = []
    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f:
        for line in f:
            # rstrip takes a set of characters; '\r\n' removes trailing line
            # terminators. The previous '\n|\r' also stripped '|' by accident.
            data.append(json.loads(line.rstrip('\r\n')))

    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
||||
|
||||
|
||||
def get_org_files(org_files=None, org_file_filter=None, verbose=0):
|
||||
"Get Org files to process"
|
||||
|
|
|
@ -10,10 +10,11 @@ import torch
|
|||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.utils.config import TextSearchModel
|
||||
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
||||
from src.utils.jsonl import load_jsonl
|
||||
|
||||
|
||||
def initialize_model(search_config: AsymmetricSearchConfig):
|
||||
|
|
|
@ -8,10 +8,11 @@ import torch
|
|||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
|
||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||
from src.utils.config import TextSearchModel
|
||||
from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
|
||||
from src.utils.jsonl import load_jsonl
|
||||
|
||||
|
||||
def initialize_model(search_config: SymmetricSearchConfig):
|
||||
|
|
|
@ -1,12 +1,7 @@
|
|||
# Standard Packages
|
||||
import json
|
||||
import gzip
|
||||
import pathlib
|
||||
from os.path import join
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
|
||||
|
||||
def is_none_or_empty(item):
    "Return True if item is None or an empty iterable (e.g. '', [], {})"
    # Use identity comparison for None (PEP 8); '== None' can be fooled by
    # objects overriding __eq__
    return item is None or (hasattr(item, '__iter__') and len(item) == 0)
|
||||
|
@ -58,29 +53,3 @@ def load_model(model_name, model_dir, model_type):
|
|||
model.save(model_path)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def load_jsonl(input_path, verbose=0):
    "Read List of JSON objects from (optionally gzipped) JSON line file"
    # Initialize Variables
    data = []

    # Open JSONL file, transparently handling gzip compression by extension
    if input_path.suffix == ".gz":
        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
    elif input_path.suffix == ".jsonl":
        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
    else:
        # Previously an unrecognized suffix left jsonl_file as None and
        # triggered an opaque TypeError below; fail with a clear error instead
        raise ValueError(f'Unsupported file extension for jsonl file: {input_path}')

    # Read JSONL file; context manager guarantees the file is closed even if
    # a line fails to parse
    with jsonl_file:
        for line in jsonl_file:
            data.append(json.loads(line.strip(empty_escape_sequences)))

    # Log JSONL entries loaded
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
50
src/utils/jsonl.py
Normal file
50
src/utils/jsonl.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
# Standard Packages
|
||||
import json
|
||||
import gzip
|
||||
|
||||
# Internal Packages
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
from src.utils.helpers import get_absolute_path
|
||||
|
||||
|
||||
def load_jsonl(input_path, verbose=0):
    "Read List of JSON objects from (optionally gzipped) JSON line file"
    # Initialize Variables
    data = []

    # Open JSONL file, transparently handling gzip compression by extension
    if input_path.suffix == ".gz":
        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
    elif input_path.suffix == ".jsonl":
        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
    else:
        # Previously an unrecognized suffix left jsonl_file as None and
        # triggered an opaque TypeError below; fail with a clear error instead
        raise ValueError(f'Unsupported file extension for jsonl file: {input_path}')

    # Read JSONL file; context manager guarantees the file is closed even if
    # a line fails to parse
    with jsonl_file:
        for line in jsonl_file:
            data.append(json.loads(line.strip(empty_escape_sequences)))

    # Log JSONL entries loaded
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
||||
|
||||
|
||||
def dump_jsonl(jsonl_data, output_path, verbose=0):
    "Write List of JSON objects to JSON line file"
    with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f:
        f.write(jsonl_data)

    if verbose > 0:
        # Count newline-separated entries; len(jsonl_data) alone would report
        # the character count, not the number of lines written
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')
|
||||
|
||||
|
||||
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    "Write JSONL string to a gzip-compressed file at output_path"
    # Pass encoding explicitly: gzip.open in text mode otherwise uses the
    # locale's preferred encoding, unlike dump_jsonl which writes utf-8
    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose > 0:
        # Count newline-separated entries; len(jsonl_data) alone would report
        # the character count, not the number of lines written
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
|
Loading…
Reference in a new issue