From 5aad297286bd95247123d05b0c66f5a58630a870 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 21 Jul 2022 02:53:18 +0400 Subject: [PATCH] Reuse logic to extract entries across symmetric, asymmetric search Now that the logic to compile entries is in the processor layer, the extract_entries method is standard across (text) search_types Extract the load_jsonl method as a utility helper method. Use it in (a)symmetric search types --- src/processor/ledger/beancount_to_jsonl.py | 27 ------------------ src/search_type/asymmetric.py | 31 ++++---------------- src/search_type/symmetric_ledger.py | 4 +-- src/utils/helpers.py | 33 +++++++++++++++++++++- 4 files changed, 39 insertions(+), 56 deletions(-) diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index aeb93653..ede42686 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -9,7 +9,6 @@ import gzip import re # Internal Packages -from src.processor.org_mode import orgnode from src.utils.helpers import get_absolute_path, is_none_or_empty from src.utils.constants import empty_escape_sequences @@ -58,32 +57,6 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0): print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}') -def load_jsonl(input_path, verbose=0): - "Read List of JSON objects from JSON line file" - # Initialize Variables - data = [] - jsonl_file = None - - # Open JSONL file - if input_path.suffix == ".gz": - jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8') - elif input_path.suffix == ".jsonl": - jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8') - - # Read JSONL file - for line in jsonl_file: - data.append(json.loads(line.strip(empty_escape_sequences))) - - # Close JSONL file - jsonl_file.close() - - # Log JSONL entries loaded - if verbose > 0: - print(f'Loaded {len(data)} records from {input_path}') - - return data - - def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0): "Get Beancount files to process" absolute_beancount_files, filtered_beancount_files = set(), set() diff --git a/src/search_type/asymmetric.py b/src/search_type/asymmetric.py index 8e6e9db4..da2f34dc 100644 --- a/src/search_type/asymmetric.py +++ b/src/search_type/asymmetric.py @@ -1,8 +1,6 @@ #!/usr/bin/env python # Standard Packages -import json -import gzip import argparse import pathlib from copy import deepcopy @@ -12,11 +10,10 @@ import torch from sentence_transformers import SentenceTransformer, CrossEncoder, util # Internal Packages -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model +from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl from src.processor.org_mode.org_to_jsonl import org_to_jsonl from src.utils.config import TextSearchModel from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig -from src.utils.constants import empty_escape_sequences def initialize_model(search_config: AsymmetricSearchConfig): @@ -43,27 +40,9 @@ def initialize_model(search_config: AsymmetricSearchConfig): def extract_entries(notesfile, verbose=0): "Load entries from compressed jsonl" - entries = [] - jsonl_file = None - - # Open File - if notesfile.suffix == ".gz": - jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8') - elif notesfile.suffix == ".jsonl": - jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8') - - # Read File - for line in jsonl_file: - note = json.loads(line.strip(empty_escape_sequences)) - entries.append({'compiled': note['compiled'], 'raw': note["raw"]}) - - # Close File - jsonl_file.close() - - if verbose > 0: - print(f"Loaded {len(entries)} entries from {notesfile}") - - return entries + return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'} + for entry + in load_jsonl(notesfile, verbose=verbose)] def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, device='cpu', verbose=0): @@ -194,4 +173,4 @@ if __name__ == '__main__': hits = query(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k) # render results - render_results(hits, entries, count=args.results_count) + render_results(hits, entries, count=args.results_count) \ No newline at end of file diff --git a/src/search_type/symmetric_ledger.py b/src/search_type/symmetric_ledger.py index 1e0e4033..616a86e7 100644 --- a/src/search_type/symmetric_ledger.py +++ b/src/search_type/symmetric_ledger.py @@ -8,8 +8,8 @@ import torch from sentence_transformers import SentenceTransformer, CrossEncoder, util # Internal Packages -from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model -from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl +from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl +from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl from src.utils.config import TextSearchModel from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig diff --git a/src/utils/helpers.py b/src/utils/helpers.py index 3c19b935..b19deb6f 100644 --- a/src/utils/helpers.py +++ b/src/utils/helpers.py @@ -1,7 +1,12 @@ # Standard Packages +import json +import gzip import pathlib from os.path import join +# Internal Packages +from src.utils.constants import empty_escape_sequences + def is_none_or_empty(item): return item == None or (hasattr(item, '__iter__') and len(item) == 0) @@ -52,4 +57,30 @@ def load_model(model_name, model_dir, model_type): if model_path is not None: model.save(model_path) - return model \ No newline at end of file + return model + + +def load_jsonl(input_path, verbose=0): + "Read List of JSON objects from JSON line file" + # Initialize Variables + data = [] + jsonl_file = None + + # Open JSONL file + if input_path.suffix == ".gz": + jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8') + elif input_path.suffix == ".jsonl": + jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8') + + # Read JSONL file + for line in jsonl_file: + data.append(json.loads(line.strip(empty_escape_sequences))) + + # Close JSONL file + jsonl_file.close() + + # Log JSONL entries loaded + if verbose > 0: + print(f'Loaded {len(data)} records from {input_path}') + + return data \ No newline at end of file