Update asymmetric extract_entries method to handle uncompressed jsonl

This is similar to what was done earlier for the symmetric
extract_entries method.
This commit is contained in:
Debanjum Singh Solanky 2022-02-27 19:03:31 -05:00
parent 3d8a07f252
commit 1c3a1420f8

View file

@ -16,6 +16,7 @@ from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_mod
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.utils.config import TextSearchModel
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
from src.utils.constants import empty_escape_sequences
def initialize_model(search_config: AsymmetricSearchConfig):
@ -43,17 +44,28 @@ def initialize_model(search_config: AsymmetricSearchConfig):
def extract_entries(notesfile, verbose=0):
"Load entries from compressed jsonl"
entries = []
with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
for line in jsonl:
note = json.loads(line.strip())
jsonl_file = None
# Open File
if notesfile.suffix == ".gz":
jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
elif notesfile.suffix == ".jsonl":
jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
# Read File
for line in jsonl_file:
note = json.loads(line.strip(empty_escape_sequences))
# Ignore title notes i.e notes with just headings and empty body
if not "Body" in note or note["Body"].strip() == "":
if not "Body" in note or note["Body"].strip(empty_escape_sequences) == "":
continue
note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
entries.append([note_string, note["Raw"]])
# Close File
jsonl_file.close()
if verbose > 0:
print(f"Loaded {len(entries)} entries from {notesfile}")