mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 16:14:21 +00:00
Consolidate jsonl helper methods in a single file under utils module
This commit is contained in:
parent
de726c4b6c
commit
0917f1574d
6 changed files with 57 additions and 84 deletions
|
@ -5,12 +5,12 @@ import json
|
||||||
import argparse
|
import argparse
|
||||||
import pathlib
|
import pathlib
|
||||||
import glob
|
import glob
|
||||||
import gzip
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||||
from src.utils.constants import empty_escape_sequences
|
from src.utils.constants import empty_escape_sequences
|
||||||
|
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
|
|
||||||
|
|
||||||
# Define Functions
|
# Define Functions
|
||||||
|
@ -38,25 +38,6 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, verb
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
|
|
||||||
def dump_jsonl(jsonl_data, output_path, verbose=0):
    """Write serialized JSON Lines text to a plain-text file.

    Args:
        jsonl_data: JSON Lines text; it is written to the file verbatim.
        output_path: Destination path, resolved through get_absolute_path.
        verbose: When greater than 0, print the number of lines written.
    """
    target_path = get_absolute_path(output_path)
    with open(target_path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.write(jsonl_data)

    if verbose > 0:
        # Report the number of newline-separated segments in the written text
        line_count = len(jsonl_data.split('\n'))
        print(f'Wrote {line_count} lines to jsonl at {output_path}')
|
|
||||||
|
|
||||||
|
|
||||||
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    """Write serialized JSON Lines text to a gzip-compressed file.

    Args:
        jsonl_data: JSON Lines text; it is written to the archive verbatim.
        output_path: Destination path, resolved through get_absolute_path.
        verbose: When greater than 0, print the number of lines written.
    """
    # Encode explicitly as UTF-8 so the archive content does not depend on
    # the locale's default text encoding.
    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose > 0:
        jsonl_entries = len(jsonl_data.split('\n'))
        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
|
|
||||||
|
|
||||||
|
|
||||||
def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0):
|
def get_beancount_files(beancount_files=None, beancount_file_filter=None, verbose=0):
|
||||||
"Get Beancount files to process"
|
"Get Beancount files to process"
|
||||||
absolute_beancount_files, filtered_beancount_files = set(), set()
|
absolute_beancount_files, filtered_beancount_files = set(), set()
|
||||||
|
|
|
@ -5,12 +5,12 @@ import json
|
||||||
import argparse
|
import argparse
|
||||||
import pathlib
|
import pathlib
|
||||||
import glob
|
import glob
|
||||||
import gzip
|
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.processor.org_mode import orgnode
|
from src.processor.org_mode import orgnode
|
||||||
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
from src.utils.helpers import get_absolute_path, is_none_or_empty
|
||||||
from src.utils.constants import empty_escape_sequences
|
from src.utils.constants import empty_escape_sequences
|
||||||
|
from src.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||||
|
|
||||||
|
|
||||||
# Define Functions
|
# Define Functions
|
||||||
|
@ -37,35 +37,6 @@ def org_to_jsonl(org_files, org_file_filter, output_file, verbose=0):
|
||||||
|
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
def dump_jsonl(jsonl_data, output_path, verbose=0):
    """Write serialized JSON Lines text to a plain-text file.

    Args:
        jsonl_data: JSON Lines text; it is written to the file verbatim.
        output_path: Destination path, resolved through get_absolute_path.
        verbose: When greater than 0, print the number of lines written.
    """
    with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f:
        f.write(jsonl_data)

    if verbose > 0:
        # jsonl_data is text, so len() would report characters, not lines.
        # Count newline terminators, plus one for an unterminated last line.
        jsonl_entries = jsonl_data.count('\n')
        if jsonl_data and not jsonl_data.endswith('\n'):
            jsonl_entries += 1
        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')
|
|
||||||
|
|
||||||
|
|
||||||
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    """Write serialized JSON Lines text to a gzip-compressed file.

    Args:
        jsonl_data: JSON Lines text; it is written to the archive verbatim.
        output_path: Destination path, resolved through get_absolute_path.
        verbose: When greater than 0, print the number of lines written.
    """
    # Encode explicitly as UTF-8 so the archive content does not depend on
    # the locale's default text encoding.
    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose > 0:
        # jsonl_data is text, so len() would report characters, not lines.
        # Count newline terminators, plus one for an unterminated last line.
        jsonl_entries = jsonl_data.count('\n')
        if jsonl_data and not jsonl_data.endswith('\n'):
            jsonl_entries += 1
        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
|
|
||||||
|
|
||||||
|
|
||||||
def load_jsonl(input_path, verbose=0):
    """Read a JSON Lines file into a list of parsed JSON values.

    Args:
        input_path: Path of the file to read, resolved through
            get_absolute_path. The file is read as UTF-8 text.
        verbose: When greater than 0, print the number of records loaded.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    data = []
    with open(get_absolute_path(input_path), 'r', encoding='utf-8') as f:
        for line in f:
            # str.rstrip takes a set of characters, not a pattern, so list
            # only the line terminators (a '|' here would be stripped too).
            line = line.rstrip('\r\n')

            # Skip blank lines, e.g. an extra newline at the end of the file
            if not line.strip():
                continue

            data.append(json.loads(line))

    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
|
||||||
|
|
||||||
|
|
||||||
def get_org_files(org_files=None, org_file_filter=None, verbose=0):
|
def get_org_files(org_files=None, org_file_filter=None, verbose=0):
|
||||||
"Get Org files to process"
|
"Get Org files to process"
|
||||||
|
|
|
@ -10,10 +10,11 @@ import torch
|
||||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
|
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||||
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
|
||||||
from src.utils.config import TextSearchModel
|
from src.utils.config import TextSearchModel
|
||||||
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
|
||||||
|
from src.utils.jsonl import load_jsonl
|
||||||
|
|
||||||
|
|
||||||
def initialize_model(search_config: AsymmetricSearchConfig):
|
def initialize_model(search_config: AsymmetricSearchConfig):
|
||||||
|
|
|
@ -8,10 +8,11 @@ import torch
|
||||||
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
||||||
|
|
||||||
# Internal Packages
|
# Internal Packages
|
||||||
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model, load_jsonl
|
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
|
||||||
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
|
||||||
from src.utils.config import TextSearchModel
|
from src.utils.config import TextSearchModel
|
||||||
from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
|
from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
|
||||||
|
from src.utils.jsonl import load_jsonl
|
||||||
|
|
||||||
|
|
||||||
def initialize_model(search_config: SymmetricSearchConfig):
|
def initialize_model(search_config: SymmetricSearchConfig):
|
||||||
|
|
|
@ -1,12 +1,7 @@
|
||||||
# Standard Packages
|
# Standard Packages
|
||||||
import json
|
|
||||||
import gzip
|
|
||||||
import pathlib
|
import pathlib
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
|
||||||
# Internal Packages
|
|
||||||
from src.utils.constants import empty_escape_sequences
|
|
||||||
|
|
||||||
|
|
||||||
def is_none_or_empty(item):
    """Return True when item is None or an empty sized iterable.

    Args:
        item: Any object.

    Returns:
        True if item is None, or if it defines __iter__ and has length 0.
        False otherwise, including for iterables that have no length
        (generators, iterators), whose emptiness cannot be checked without
        consuming them.
    """
    # Identity check: '== None' can be overridden by the object's __eq__
    if item is None:
        return True

    if not hasattr(item, '__iter__'):
        return False

    try:
        return len(item) == 0
    except TypeError:
        # Iterable without __len__ (e.g. a generator)
        return False
|
||||||
|
@ -58,29 +53,3 @@ def load_model(model_name, model_dir, model_type):
|
||||||
model.save(model_path)
|
model.save(model_path)
|
||||||
|
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def load_jsonl(input_path, verbose=0):
    """Read a plain or gzip-compressed JSON Lines file into a list.

    Args:
        input_path: Path object exposing a `.suffix` attribute (for example
            a pathlib.Path). A ".gz" suffix is read as gzip-compressed text,
            a ".jsonl" suffix as plain text; both are decoded as UTF-8.
        verbose: When greater than 0, print the number of records loaded.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        ValueError: If the suffix is neither ".gz" nor ".jsonl".
    """
    # Pick the opener from the file extension
    if input_path.suffix == ".gz":
        open_jsonl, mode = gzip.open, 'rt'
    elif input_path.suffix == ".jsonl":
        open_jsonl, mode = open, 'r'
    else:
        # Fail with a clear message instead of iterating over None below
        raise ValueError(
            f'Unsupported jsonl file extension {input_path.suffix!r} for {input_path}. '
            'Expected ".jsonl" or ".gz"')

    # Read JSONL file; the context manager closes it even if parsing fails
    data = []
    with open_jsonl(get_absolute_path(input_path), mode, encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip(empty_escape_sequences)

            # Skip blank lines, e.g. an extra newline at the end of the file
            if not line.strip():
                continue

            data.append(json.loads(line))

    # Log JSONL entries loaded
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
|
50
src/utils/jsonl.py
Normal file
50
src/utils/jsonl.py
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# Standard Packages
|
||||||
|
import json
|
||||||
|
import gzip
|
||||||
|
|
||||||
|
# Internal Packages
|
||||||
|
from src.utils.constants import empty_escape_sequences
|
||||||
|
from src.utils.helpers import get_absolute_path
|
||||||
|
|
||||||
|
|
||||||
|
def load_jsonl(input_path, verbose=0):
    """Read a plain or gzip-compressed JSON Lines file into a list.

    Args:
        input_path: Path object exposing a `.suffix` attribute (for example
            a pathlib.Path). A ".gz" suffix is read as gzip-compressed text,
            a ".jsonl" suffix as plain text; both are decoded as UTF-8.
        verbose: When greater than 0, print the number of records loaded.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        ValueError: If the suffix is neither ".gz" nor ".jsonl".
    """
    # Pick the opener from the file extension
    if input_path.suffix == ".gz":
        open_jsonl, mode = gzip.open, 'rt'
    elif input_path.suffix == ".jsonl":
        open_jsonl, mode = open, 'r'
    else:
        # Fail with a clear message instead of iterating over None below
        raise ValueError(
            f'Unsupported jsonl file extension {input_path.suffix!r} for {input_path}. '
            'Expected ".jsonl" or ".gz"')

    # Read JSONL file; the context manager closes it even if parsing fails
    data = []
    with open_jsonl(get_absolute_path(input_path), mode, encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip(empty_escape_sequences)

            # Skip blank lines, e.g. an extra newline at the end of the file
            if not line.strip():
                continue

            data.append(json.loads(line))

    # Log JSONL entries loaded
    if verbose > 0:
        print(f'Loaded {len(data)} records from {input_path}')

    return data
|
||||||
|
|
||||||
|
|
||||||
|
def dump_jsonl(jsonl_data, output_path, verbose=0):
    """Write serialized JSON Lines text to a plain-text file.

    Args:
        jsonl_data: JSON Lines text; it is written to the file verbatim.
        output_path: Destination path, resolved through get_absolute_path.
        verbose: When greater than 0, print the number of lines written.
    """
    with open(get_absolute_path(output_path), 'w', encoding='utf-8') as f:
        f.write(jsonl_data)

    if verbose > 0:
        # jsonl_data is text, so len() would report characters, not lines.
        # Count newline terminators, plus one for an unterminated last line.
        jsonl_entries = jsonl_data.count('\n')
        if jsonl_data and not jsonl_data.endswith('\n'):
            jsonl_entries += 1
        print(f'Wrote {jsonl_entries} lines to jsonl at {output_path}')
|
||||||
|
|
||||||
|
|
||||||
|
def compress_jsonl_data(jsonl_data, output_path, verbose=0):
    """Write serialized JSON Lines text to a gzip-compressed file.

    Args:
        jsonl_data: JSON Lines text; it is written to the archive verbatim.
        output_path: Destination path, resolved through get_absolute_path.
        verbose: When greater than 0, print the number of lines written.
    """
    # Encode explicitly as UTF-8, matching the encoding load_jsonl uses to
    # read ".gz" files, instead of relying on the locale's default encoding.
    with gzip.open(get_absolute_path(output_path), 'wt', encoding='utf-8') as gzip_file:
        gzip_file.write(jsonl_data)

    if verbose > 0:
        # jsonl_data is text, so len() would report characters, not lines.
        # Count newline terminators, plus one for an unterminated last line.
        jsonl_entries = jsonl_data.count('\n')
        if jsonl_data and not jsonl_data.endswith('\n'):
            jsonl_entries += 1
        print(f'Wrote {jsonl_entries} lines to gzip compressed jsonl at {output_path}')
|
Loading…
Add table
Reference in a new issue