mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Update asymmetric extract_entries method to handle uncompressed JSONL files
This mirrors the change made earlier to the symmetric extract_entries method.
This commit is contained in:
parent
3d8a07f252
commit
1c3a1420f8
1 changed file with 20 additions and 8 deletions
|
@ -16,6 +16,7 @@ from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_mod
|
||||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||||
from src.utils.config import TextSearchModel
|
from src.utils.config import TextSearchModel
|
||||||
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
||||||
|
from src.utils.constants import empty_escape_sequences
|
||||||
|
|
||||||
|
|
||||||
def initialize_model(search_config: AsymmetricSearchConfig):
|
def initialize_model(search_config: AsymmetricSearchConfig):
|
||||||
|
@ -43,16 +44,27 @@ def initialize_model(search_config: AsymmetricSearchConfig):
|
||||||
def extract_entries(notesfile, verbose=0):
|
def extract_entries(notesfile, verbose=0):
|
||||||
"Load entries from compressed jsonl"
|
"Load entries from compressed jsonl"
|
||||||
entries = []
|
entries = []
|
||||||
with gzip.open(get_absolute_path(notesfile), 'rt', encoding='utf8') as jsonl:
|
jsonl_file = None
|
||||||
for line in jsonl:
|
|
||||||
note = json.loads(line.strip())
|
|
||||||
|
|
||||||
# Ignore title notes i.e notes with just headings and empty body
|
# Open File
|
||||||
if not "Body" in note or note["Body"].strip() == "":
|
if notesfile.suffix == ".gz":
|
||||||
continue
|
jsonl_file = gzip.open(get_absolute_path(notesfile), "rt", encoding='utf8')
|
||||||
|
elif notesfile.suffix == ".jsonl":
|
||||||
|
jsonl_file = open(get_absolute_path(notesfile), "r", encoding='utf8')
|
||||||
|
|
||||||
note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
|
# Read File
|
||||||
entries.append([note_string, note["Raw"]])
|
for line in jsonl_file:
|
||||||
|
note = json.loads(line.strip(empty_escape_sequences))
|
||||||
|
|
||||||
|
# Ignore title notes i.e notes with just headings and empty body
|
||||||
|
if not "Body" in note or note["Body"].strip(empty_escape_sequences) == "":
|
||||||
|
continue
|
||||||
|
|
||||||
|
note_string = f'{note["Title"]}\t{note["Tags"] if "Tags" in note else ""}\n{note["Body"] if "Body" in note else ""}'
|
||||||
|
entries.append([note_string, note["Raw"]])
|
||||||
|
|
||||||
|
# Close File
|
||||||
|
jsonl_file.close()
|
||||||
|
|
||||||
if verbose > 0:
|
if verbose > 0:
|
||||||
print(f"Loaded {len(entries)} entries from {notesfile}")
|
print(f"Loaded {len(entries)} entries from {notesfile}")
|
||||||
|
|
Loading…
Add table
Reference in a new issue