Reuse logic to extract entries across symmetric, asymmetric search

Now that the logic to compile entries is in the processor layer, the
extract_entries method is the same across the (text) search types.

Extract the load_jsonl method into the utility helpers.
Use it in the (a)symmetric search types.
Debanjum Singh Solanky 2022-07-21 02:53:18 +04:00
parent e220ecc00b
commit 5aad297286
4 changed files with 39 additions and 56 deletions
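
To make the reuse concrete, here is a minimal sketch of the shared flow. The extract_entries body is the one this commit introduces for asymmetric search; the example JSONL line, and the assumption that the symmetric ledger search ends up with an identical body, are illustrative and not shown in these hunks.

# Each processor (e.g. org_to_jsonl, beancount_to_jsonl) writes one JSON object per line,
# each with at least a 'compiled' and a 'raw' field. A hypothetical line:
#   {"compiled": "* Heading\nBody", "raw": "* Heading\nBody"}
#
# With load_jsonl living in src.utils.helpers, every text search type can load
# those entries the same way:
from src.utils.helpers import load_jsonl

def extract_entries(notesfile, verbose=0):
    "Load entries from compressed jsonl"
    return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
            for entry in load_jsonl(notesfile, verbose=verbose)]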

View file

@@ -9,7 +9,6 @@ import gzip
 import re

 # Internal Packages
-from src.processor.org_mode import orgnode
 from src.utils.helpers import get_absolute_path, is_none_or_empty
 from src.utils.constants import empty_escape_sequences
@@ -58,32 +57,6 @@ def compress_jsonl_data(jsonl_data, output_path, verbose=0):
     print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')

-def load_jsonl(input_path, verbose=0):
-    "Read List of JSON objects from JSON line file"
-    # Initialize Variables
-    data = []
-    jsonl_file = None
-
-    # Open JSONL file
-    if input_path.suffix == ".gz":
-        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
-    elif input_path.suffix == ".jsonl":
-        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
-
-    # Read JSONL file
-    for line in jsonl_file:
-        data.append(json.loads(line.strip(empty_escape_sequences)))
-
-    # Close JSONL file
-    jsonl_file.close()
-
-    # Log JSONL entries loaded
-    if verbose > 0:
-        print(f'Loaded {len(data)} records from {input_path}')
-
-    return data
-

 def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0):
     "Get Beancount files to process"
     absolute_beancount_files, filtered_beancount_files = set(), set()

View file

@@ -1,8 +1,6 @@
 #!/usr/bin/env python

 # Standard Packages
-import json
-import gzip
 import argparse
 import pathlib
 from copy import deepcopy
@@ -12,11 +10,10 @@ import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util

 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
+from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
 from src.processor.org_mode.org_to_jsonl import org_to_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
-from src.utils.constants import empty_escape_sequences


 def initialize_model(search_config: AsymmetricSearchConfig):
@@ -43,27 +40,9 @@ def initialize_model(search_config: AsymmetricSearchConfig):


 def extract_entries(notesfile, verbose=0):
     "Load entries from compressed jsonl"
-    entries = []
-    jsonl_file = None
-
-    # Open File
-    if notesfile.suffix == ".gz":
-        jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
-    elif notesfile.suffix == ".jsonl":
-        jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
-
-    # Read File
-    for line in jsonl_file:
-        note = json.loads(line.strip(empty_escape_sequences))
-        entries.append({'compiled': note['compiled'], 'raw': note["raw"]})
-
-    # Close File
-    jsonl_file.close()
-
-    if verbose > 0:
-        print(f"Loaded {len(entries)} entries from {notesfile}")
-
-    return entries
+    return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
+            for entry
+            in load_jsonl(notesfile, verbose=verbose)]


 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False, device='cpu', verbose=0):
@@ -194,4 +173,4 @@ if __name__ == '__main__':
     hits = query(user_query, corpus_embeddings, entries, bi_encoder, cross_encoder, top_k)

     # render results
     render_results(hits, entries, count=args.results_count)

View file

@@ -8,8 +8,8 @@ import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util

 # Internal Packages
-from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl, load_jsonl
+from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
+from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
 from src.utils.config import TextSearchModel
 from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig

View file

@@ -1,7 +1,12 @@
 # Standard Packages
+import json
+import gzip
 import pathlib
 from os.path import join

+# Internal Packages
+from src.utils.constants import empty_escape_sequences


 def is_none_or_empty(item):
     return item == None or (hasattr(item, '__iter__') and len(item) == 0)
@@ -52,4 +57,30 @@ def load_model(model_name, model_dir, model_type):
     if model_path is not None:
         model.save(model_path)

     return model
+
+
+def load_jsonl(input_path, verbose=0):
+    "Read List of JSON objects from JSON line file"
+    # Initialize Variables
+    data = []
+    jsonl_file = None
+
+    # Open JSONL file
+    if input_path.suffix == ".gz":
+        jsonl_file = gzip.open(get_absolute_path(input_path), 'rt', encoding='utf-8')
+    elif input_path.suffix == ".jsonl":
+        jsonl_file = open(get_absolute_path(input_path), 'r', encoding='utf-8')
+
+    # Read JSONL file
+    for line in jsonl_file:
+        data.append(json.loads(line.strip(empty_escape_sequences)))
+
+    # Close JSONL file
+    jsonl_file.close()
+
+    # Log JSONL entries loaded
+    if verbose > 0:
+        print(f'Loaded {len(data)} records from {input_path}')
+
+    return data
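
For reference, a minimal usage sketch of the relocated helper. The file path is hypothetical; load_jsonl expects a pathlib.Path whose suffix is either ".gz" or ".jsonl".

from pathlib import Path
from src.utils.helpers import load_jsonl

# Hypothetical compressed jsonl produced by one of the processors
entries = load_jsonl(Path("notes.jsonl.gz"), verbose=1)
print(entries[0] if entries else "no entries loaded")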