diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index a41705e0..131ef919 100644 --- a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -28,10 +28,10 @@ def org_to_jsonl(org_files, org_file_filter, output_file): org_files = get_org_files(org_files, org_file_filter) # Extract Entries from specified Org files - entries = extract_org_entries(org_files) + entries, file_to_entries = extract_org_entries(org_files) # Process Each Entry from All Notes Files - jsonl_data = convert_org_entries_to_jsonl(entries) + jsonl_data = convert_org_entries_to_jsonl(entries, file_to_entries) # Compress JSONL formatted Data if output_file.suffix == ".gz": @@ -66,18 +66,19 @@ def get_org_files(org_files=None, org_file_filter=None): def extract_org_entries(org_files): "Extract entries from specified Org files" entries = [] + entry_to_file_map = [] for org_file in org_files: - entries.extend( - orgnode.makelist( - str(org_file))) + org_file_entries = orgnode.makelist(str(org_file)) + entry_to_file_map += [org_file]*len(org_file_entries) + entries.extend(org_file_entries) - return entries + return entries, entry_to_file_map -def convert_org_entries_to_jsonl(entries) -> str: +def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str: "Convert each Org-Mode entries to JSON and collate as JSONL" jsonl = '' - for entry in entries: + for entry_id, entry in enumerate(entries): entry_dict = dict() # Ignore title notes i.e notes with just headings and empty body @@ -106,6 +107,7 @@ def convert_org_entries_to_jsonl(entries) -> str: if entry_dict: entry_dict["raw"] = f'{entry}' + entry_dict["file"] = f'{entry_to_file_map[entry_id]}' # Convert Dictionary to JSON and Append to JSONL string jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n' diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index 1dc30ef2..3b050bf0 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -52,9 +52,7 @@ def initialize_model(search_config: TextSearchConfig): def extract_entries(jsonl_file): "Load entries from compressed jsonl" - return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'} - for entry - in load_jsonl(jsonl_file)] + return load_jsonl(jsonl_file) def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False): @@ -83,7 +81,7 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False): for filter in filters_in_query: query, entries, corpus_embeddings = filter.apply(query, entries, corpus_embeddings) end = time.time() - logger.debug(f"Filter Time: {end - start:.3f} seconds") + logger.debug(f"Total Filter Time: {end - start:.3f} seconds") if entries is None or len(entries) == 0: return [], [] diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index cadd4a6a..6a626299 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -21,10 +21,10 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries = extract_org_entries(org_files=[orgfile]) + entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_data = convert_org_entries_to_jsonl(entries) + jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map) # Assert assert is_none_or_empty(jsonl_data) @@ -43,10 +43,10 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries = extract_org_entries(org_files=[orgfile]) + entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_org_entries_to_jsonl(entries) + jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert