From 490157cafacd927496f727d60883078c9cbccdee Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Tue, 6 Sep 2022 15:27:31 +0300
Subject: [PATCH] Setup File Filter for Markdown and Ledger content types

- Pass the file associated with each entry to the markdown, beancount
  to JSONL converters
- Add File, Word, Date Filters to the Ledger and Markdown content types
  - Word, Date Filters were accidentally removed from the above types
    yesterday
  - File Filter is the only newly added filter
---
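Reviewer note: below is a minimal, standalone sketch of the entry-to-file
mapping this patch introduces, mirroring the patched extract_markdown_entries
and convert_markdown_entries_to_jsonl. The file name, sample content, and
whitespace constant are illustrative stand-ins, not the module's exact values.

    import json
    import re

    markdown_heading_regex = r'^#'
    whitespace_chars = '\n\r\t '  # stand-in for the module's empty_escape_sequences

    def extract_entries(content_by_file):
        "Split each file at markdown headings, recording each entry's source file"
        entries, entry_to_file_map = [], []
        for markdown_file, content in content_by_file.items():
            entries_per_file = [f'#{entry.strip(whitespace_chars)}'
                                for entry
                                in re.split(markdown_heading_regex, content, flags=re.MULTILINE)]
            # One file reference per extracted entry, at the matching index
            entry_to_file_map += [markdown_file] * len(entries_per_file)
            entries.extend(entries_per_file)
        return entries, entry_to_file_map

    def convert_to_jsonl(entries, entry_to_file_map):
        "Tag each entry with its originating file, emit one JSON object per line"
        return ''.join(
            json.dumps({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'},
                       ensure_ascii=False) + '\n'
            for entry_id, entry in enumerate(entries))

    entries, entry_to_file_map = extract_entries({'notes.md': '# Alpha\nbody a\n# Beta\nbody b'})
    print(convert_to_jsonl(entries, entry_to_file_map))
    # {"compiled": "#", "raw": "#", "file": "notes.md"}
    # {"compiled": "#Alpha\nbody a", "raw": "#Alpha\nbody a", "file": "notes.md"}
    # {"compiled": "#Beta\nbody b", "raw": "#Beta\nbody b", "file": "notes.md"}
    # (the bare "#" entry comes from re.split yielding a leading empty string,
    # matching the patched code's behavior)

The mapping is kept as a list parallel to entries, rather than as (entry, file)
tuples, so downstream consumers can keep treating entries as a plain list of
strings; the new 'file' key in each JSONL object is presumably what FileFilter
keys on to restrict results to matching source files.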
 src/configure.py                            | 46 +++++++++++++++++----
 src/processor/ledger/beancount_to_jsonl.py  | 20 +++++----
 src/processor/markdown/markdown_to_jsonl.py | 19 +++++----
 src/search_filter/date_filter.py            |  2 +-
 src/search_filter/word_filter.py            |  3 --
 5 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/src/configure.py b/src/configure.py
index 3ee594b4..f6476951 100644
--- a/src/configure.py
+++ b/src/configure.py
@@ -42,30 +42,62 @@ def configure_server(args, required=False):
 
 def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None):
     # Initialize Org Notes Search
     if (t == SearchType.Org or t == None) and config.content_type.org:
-        filter_directory = resolve_absolute_path(config.content_type.org.compressed_jsonl.parent)
-        filters = [DateFilter(), WordFilter(filter_directory, search_type=SearchType.Org), FileFilter()]
         # Extract Entries, Generate Notes Embeddings
-        model.orgmode_search = text_search.setup(org_to_jsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, filters=filters)
+        model.orgmode_search = text_search.setup(
+            org_to_jsonl,
+            config.content_type.org,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[
+                DateFilter(),
+                WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org),
+                FileFilter(),
+            ])
 
     # Initialize Org Music Search
     if (t == SearchType.Music or t == None) and config.content_type.music:
         # Extract Entries, Generate Music Embeddings
-        model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate)
+        model.music_search = text_search.setup(
+            org_to_jsonl,
+            config.content_type.music,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate)
 
     # Initialize Markdown Search
     if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
         # Extract Entries, Generate Markdown Embeddings
-        model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate)
+        model.markdown_search = text_search.setup(
+            markdown_to_jsonl,
+            config.content_type.markdown,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[
+                DateFilter(),
+                WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown),
+                FileFilter(),
+            ])
 
     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
         # Extract Entries, Generate Ledger Embeddings
-        model.ledger_search = text_search.setup(beancount_to_jsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate)
+        model.ledger_search = text_search.setup(
+            beancount_to_jsonl,
+            config.content_type.ledger,
+            search_config=config.search_type.symmetric,
+            regenerate=regenerate,
+            filters=[
+                DateFilter(),
+                WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger),
+                FileFilter(),
+            ])
 
     # Initialize Image Search
     if (t == SearchType.Image or t == None) and config.content_type.image:
         # Extract Entries, Generate Image Embeddings
-        model.image_search = image_search.setup(config.content_type.image, search_config=config.search_type.image, regenerate=regenerate)
+        model.image_search = image_search.setup(
+            config.content_type.image,
+            search_config=config.search_type.image,
+            regenerate=regenerate)
 
     return model
diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 4c1034df..c0136bc6 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -28,10 +28,10 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
     beancount_files = get_beancount_files(beancount_files, beancount_file_filter)
 
     # Extract Entries from specified Beancount files
-    entries = extract_beancount_entries(beancount_files)
+    entries, transaction_to_file_map = extract_beancount_entries(beancount_files)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_beancount_entries_to_jsonl(entries)
+    jsonl_data = convert_beancount_entries_to_jsonl(entries, transaction_to_file_map)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -74,22 +74,24 @@ def extract_beancount_entries(beancount_files):
     empty_newline = f'^[{empty_escape_sequences}]*$'
 
     entries = []
+    transaction_to_file_map = []
     for beancount_file in beancount_files:
         with open(beancount_file) as f:
             ledger_content = f.read()
-            entries.extend([entry.strip(empty_escape_sequences)
+            transactions_per_file = [entry.strip(empty_escape_sequences)
                 for entry
                 in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
-                if re.match(transaction_regex, entry)])
-
-    return entries
+                if re.match(transaction_regex, entry)]
+            transaction_to_file_map += [beancount_file]*len(transactions_per_file)
+            entries.extend(transactions_per_file)
+    return entries, transaction_to_file_map
 
 
-def convert_beancount_entries_to_jsonl(entries):
+def convert_beancount_entries_to_jsonl(entries, transaction_to_file_map):
     "Convert each Beancount transaction to JSON and collate as JSONL"
     jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry}
+    for entry_id, entry in enumerate(entries):
+        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'}
 
         # Convert Dictionary to JSON and Append to JSONL string
         jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py
index 3b4bb371..a0903fcb 100644
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -28,10 +28,10 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
 
     # Extract Entries from specified Markdown files
-    entries = extract_markdown_entries(markdown_files)
+    entries, entry_to_file_map = extract_markdown_entries(markdown_files)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_markdown_entries_to_jsonl(entries)
+    jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -74,21 +74,24 @@ def extract_markdown_entries(markdown_files):
     markdown_heading_regex = r'^#'
 
     entries = []
+    entry_to_file_map = []
     for markdown_file in markdown_files:
         with open(markdown_file) as f:
             markdown_content = f.read()
-            entries.extend([f'#{entry.strip(empty_escape_sequences)}'
+            markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
                 for entry
-                in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)])
+                in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)]
+            entry_to_file_map += [markdown_file]*len(markdown_entries_per_file)
+            entries.extend(markdown_entries_per_file)
 
-    return entries
+    return entries, entry_to_file_map
 
 
-def convert_markdown_entries_to_jsonl(entries):
+def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
     "Convert each Markdown entries to JSON and collate as JSONL"
     jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry}
+    for entry_id, entry in enumerate(entries):
+        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
 
         # Convert Dictionary to JSON and Append to JSONL string
         jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py
index 53c7b266..22a66068 100644
--- a/src/search_filter/date_filter.py
+++ b/src/search_filter/date_filter.py
@@ -45,7 +45,7 @@ class DateFilter(BaseFilter):
                 continue
             self.date_to_entry_ids[date_in_entry].add(id)
         end = time.time()
-        logger.debug(f"Created file filter index: {end - start} seconds")
+        logger.debug(f"Created date filter index: {end - start} seconds")
 
 
     def can_filter(self, raw_query):
diff --git a/src/search_filter/word_filter.py b/src/search_filter/word_filter.py
index dcf9ca6b..c7c5d059 100644
--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -4,9 +4,6 @@ import time
 import pickle
 import logging
 
-# External Packages
-import torch
-
 # Internal Packages
 from src.search_filter.base_filter import BaseFilter
 from src.utils.helpers import LRU, resolve_absolute_path