mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Update asymmetric extract_entries method to handle uncompressed jsonl
This is similar to what was done earlier for the symmetric extract_entries method.
This commit is contained in:
parent
3d8a07f252
commit
1c3a1420f8
1 changed file with 20 additions and 8 deletions
|
@ -16,6 +16,7 @@ from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_mod
|
|||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||
from src.utils.config import TextSearchModel
|
||||
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
||||
from src.utils.constants import empty_escape_sequences
|
||||
|
||||
|
||||
def initialize_model(search_config: AsymmetricSearchConfig):
|
||||
|
@ -43,17 +44,28 @@ def initialize_model(search_config: AsymmetricSearchConfig):
|
|||
def extract_entries(notesfile, verbose=0):
|
||||
"Load entries from compressed jsonl"
|
||||
entries = []
|
||||
with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
|
||||
for line in jsonl:
|
||||
note = json.loads(line.strip())
|
||||
jsonl_file = None
|
||||
|
||||
# Open File
|
||||
if notesfile.suffix == ".gz":
|
||||
jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
|
||||
elif notesfile.suffix == ".jsonl":
|
||||
jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
|
||||
|
||||
# Read File
|
||||
for line in jsonl_file:
|
||||
note = json.loads(line.strip(empty_escape_sequences))
|
||||
|
||||
# Ignore title notes i.e notes with just headings and empty body
|
||||
if not "Body" in note or note["Body"].strip() == "":
|
||||
if not "Body" in note or note["Body"].strip(empty_escape_sequences) == "":
|
||||
continue
|
||||
|
||||
note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
|
||||
entries.append([note_string, note["Raw"]])
|
||||
|
||||
# Close File
|
||||
jsonl_file.close()
|
||||
|
||||
if verbose > 0:
|
||||
print(f"Loaded {len(entries)} entries from {notesfile}")
|
||||
|
||||
|
|
Loading…
Reference in a new issue