Add file of each entry to entry dict in org_to_jsonl converter

- This will help filter queries to the org content type using the file filter
- Do not explicitly specify the items being extracted from the json of each entry in text_search, as not all text search content types have file set in their jsonl converters
2024-11-27 09:25:06 +01:00 · 2022-09-05 01:57:17 +03:00 · 2022-09-05 01:57:17 +03:00 · 7606724dbc
commit 7606724dbc
parent 7e083d3e96
3 changed files with 16 additions and 16 deletions
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@@ -28,10 +28,10 @@ def org_to_jsonl(org_files, org_file_filter, output_file):
    org_files = get_org_files(org_files, org_file_filter)

    # Extract Entries from specified Org files
-    entries = extract_org_entries(org_files)
+    entries, file_to_entries = extract_org_entries(org_files)

    # Process Each Entry from All Notes Files
-    jsonl_data = convert_org_entries_to_jsonl(entries)
+    jsonl_data = convert_org_entries_to_jsonl(entries, file_to_entries)

    # Compress JSONL formatted Data
    if output_file.suffix == ".gz":
@@ -66,18 +66,19 @@ def get_org_files(org_files=None, org_file_filter=None):
 def extract_org_entries(org_files):
    "Extract entries from specified Org files"
    entries = []
+    entry_to_file_map = []
    for org_file in org_files:
-        entries.extend(
-            orgnode.makelist(
-                str(org_file)))
+        org_file_entries = orgnode.makelist(str(org_file))
+        entry_to_file_map += [org_file]*len(org_file_entries)
+        entries.extend(org_file_entries)

-    return entries
+    return entries, entry_to_file_map


-def convert_org_entries_to_jsonl(entries) -> str:
+def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str:
    "Convert each Org-Mode entries to JSON and collate as JSONL"
    jsonl = ''
-    for entry in entries:
+    for entry_id, entry in enumerate(entries):
        entry_dict = dict()

        # Ignore title notes i.e notes with just headings and empty body
@@ -106,6 +107,7 @@ def convert_org_entries_to_jsonl(entries) -> str:

        if entry_dict:
            entry_dict["raw"] = f'{entry}'
+            entry_dict["file"] = f'{entry_to_file_map[entry_id]}'

            # Convert Dictionary to JSON and Append to JSONL string
            jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
--- a/src/search_type/text_search.py
+++ b/src/search_type/text_search.py
@@ -52,9 +52,7 @@ def initialize_model(search_config: TextSearchConfig):

 def extract_entries(jsonl_file):
    "Load entries from compressed jsonl"
-    return [{'compiled': f'{entry["compiled"]}', 'raw': f'{entry["raw"]}'}
-            for entry
-            in load_jsonl(jsonl_file)]
+    return load_jsonl(jsonl_file)


 def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False):
@@ -83,7 +81,7 @@ def query(raw_query: str, model: TextSearchModel, rank_results=False):
    for filter in filters_in_query:
        query, entries, corpus_embeddings = filter.apply(query, entries, corpus_embeddings)
    end = time.time()
-    logger.debug(f"Filter Time: {end - start:.3f} seconds")
+    logger.debug(f"Total Filter Time: {end - start:.3f} seconds")

    if entries is None or len(entries) == 0:
        return [], []
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -21,10 +21,10 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Org files
-    entries = extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])

    # Process Each Entry from All Notes Files
-    jsonl_data = convert_org_entries_to_jsonl(entries)
+    jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map)

    # Assert
    assert is_none_or_empty(jsonl_data)
@@ -43,10 +43,10 @@ def test_entry_with_body_to_jsonl(tmp_path):

    # Act
    # Extract Entries from specified Org files
-    entries = extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])

    # Process Each Entry from All Notes Files
-    jsonl_string = convert_org_entries_to_jsonl(entries)
+    jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] 

    # Assert