Mirror of https://github.com/khoj-ai/khoj.git
Add file of each entry to entry dict in org_to_jsonl converter

- This will help filter queries to the org content type using the file filter
- Do not explicitly specify the items extracted from each entry's JSON in text_search, as not all text search content types set the file field in their JSONL converters
This commit is contained in:
parent
7e083d3e96
commit
7606724dbc
3 changed files with 16 additions and 16 deletions
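With this change, each org entry's JSON records the file it came from, so a file filter can scope queries to the org content type. A minimal sketch of the new output shape (the heading and path below are hypothetical, not from this repo):

    import json

    # Hypothetical JSONL line after this change; the "compiled" and "raw"
    # keys already existed, "file" is the new addition.
    line = '{"compiled": "* Idea\\nBody", "raw": "* Idea\\nBody", "file": "notes/ideas.org"}'
    entry = json.loads(line)

    # A file filter can now match entries against their source file.
    assert entry["file"].endswith(".org")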
@@ -28,10 +28,10 @@ def org_to_jsonl(org_files, org_file_filter, output_file):
     org_files = get_org_files(org_files, org_file_filter)
 
     # Extract Entries from specified Org files
-    entries = extract_org_entries(org_files)
+    entries, file_to_entries = extract_org_entries(org_files)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_org_entries_to_jsonl(entries)
+    jsonl_data = convert_org_entries_to_jsonl(entries, file_to_entries)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -66,18 +66,19 @@ def get_org_files(org_files=None, org_file_filter=None):
 def extract_org_entries(org_files):
     "Extract entries from specified Org files"
     entries = []
+    entry_to_file_map = []
     for org_file in org_files:
-        entries.extend(
-            orgnode.makelist(
-                str(org_file)))
+        org_file_entries = orgnode.makelist(str(org_file))
+        entry_to_file_map += [org_file]*len(org_file_entries)
+        entries.extend(org_file_entries)
 
-    return entries
+    return entries, entry_to_file_map
 
 
-def convert_org_entries_to_jsonl(entries) -> str:
+def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str:
     "Convert each Org-Mode entries to JSON and collate as JSONL"
     jsonl = ''
-    for entry in entries:
+    for entry_id, entry in enumerate(entries):
         entry_dict = dict()
 
         # Ignore title notes i.e notes with just headings and empty body
@@ -106,6 +107,7 @@ def convert_org_entries_to_jsonl(entries) -> str:
 
         if entry_dict:
             entry_dict["raw"] = f'{entry}'
+            entry_dict["file"] = f'{entry_to_file_map[entry_id]}'
 
             # Convert Dictionary to JSON and Append to JSONL string
             jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
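Note how extract_org_entries keeps entries and entry_to_file_map index-aligned: it appends the file once per entry extracted from it, so entry_to_file_map[entry_id] is always the source file of entries[entry_id]. A standalone sketch of that alignment (the file names and entry lists are made-up stand-ins for orgnode.makelist output):

    # Hypothetical stand-ins for what orgnode.makelist would return per file.
    files_to_entries = {
        "work.org": ["* Standup", "* Retro"],
        "home.org": ["* Groceries"],
    }

    entries, entry_to_file_map = [], []
    for org_file, org_file_entries in files_to_entries.items():
        entry_to_file_map += [org_file] * len(org_file_entries)
        entries.extend(org_file_entries)

    # entries[i] and entry_to_file_map[i] stay in lockstep, which is why
    # convert_org_entries_to_jsonl can index the map by entry_id.
    for entry_id, entry in enumerate(entries):
        print(f"{entry} <- {entry_to_file_map[entry_id]}")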
@@ -52,9 +52,7 @@ def initialize_model(search_config: TextSearchConfig):
 
 def extract_entries(jsonl_file):
     "Load entries from compressed jsonl"
-    return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
-            for entry
-            in load_jsonl(jsonl_file)]
+    return load_jsonl(jsonl_file)
 
 
 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False):
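extract_entries now returns load_jsonl's output unchanged instead of cherry-picking the compiled and raw keys, since (per the commit message) not every content type's JSONL converter sets the same fields; only the org converter sets file so far. For reference, a hedged stand-in for what load_jsonl is assumed to do, i.e. read one JSON object per line, handling the gzip-compressed files org_to_jsonl can emit:

    import gzip
    import json
    from pathlib import Path

    def load_jsonl_sketch(jsonl_file: Path) -> list:
        "Assumed behavior of load_jsonl: one JSON object per line, gzip-aware."
        opener = gzip.open if jsonl_file.suffix == ".gz" else open
        with opener(jsonl_file, "rt", encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]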
@@ -83,7 +81,7 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
     for filter in filters_in_query:
         query, entries, corpus_embeddings = filter.apply(query, entries, corpus_embeddings)
     end = time.time()
-    logger.debug(f"Filter Time: {end - start:.3f} seconds")
+    logger.debug(f"Total Filter Time: {end - start:.3f} seconds")
 
     if entries is None or len(entries) == 0:
         return [], []
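The filter interface used above is apply(query, entries, corpus_embeddings). A hypothetical minimal filter over the new file key could look like the sketch below; the FileFilterSketch name, the file:"..." query syntax, and the list-based embedding filtering are assumptions, not this repo's actual implementation:

    import fnmatch
    import re

    class FileFilterSketch:
        "Hypothetical filter keyed on the new 'file' entry field."
        pattern = re.compile(r'file:"(.+?)"')

        def apply(self, query, entries, corpus_embeddings):
            match = self.pattern.search(query)
            if not match:
                return query, entries, corpus_embeddings
            file_glob = match.group(1)
            query = self.pattern.sub("", query).strip()
            keep = [i for i, entry in enumerate(entries)
                    if fnmatch.fnmatch(entry.get("file", ""), file_glob)]
            entries = [entries[i] for i in keep]
            corpus_embeddings = [corpus_embeddings[i] for i in keep]
            return query, entries, corpus_embeddings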
@@ -21,10 +21,10 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):
 
     # Act
     # Extract Entries from specified Org files
-    entries = extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_org_entries_to_jsonl(entries)
+    jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map)
 
     # Assert
     assert is_none_or_empty(jsonl_data)
@@ -43,10 +43,10 @@ def test_entry_with_body_to_jsonl(tmp_path):
 
     # Act
     # Extract Entries from specified Org files
-    entries = extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
 
     # Process Each Entry from All Notes Files
-    jsonl_string = convert_org_entries_to_jsonl(entries)
+    jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
 
     # Assert
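A hypothetical follow-on check, not part of this diff: with the new field in place, the second test could additionally assert that every converted entry records its source org file.

    import json

    # Hypothetical extra assertion (not in this diff): every JSONL entry
    # produced from a single org file should carry that file's path.
    def assert_entries_tagged_with_file(jsonl_string: str, orgfile) -> None:
        for line in jsonl_string.splitlines():
            assert json.loads(line)["file"] == str(orgfile)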