From 490157cafacd927496f727d60883078c9cbccdee Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Tue, 6 Sep 2022 15:27:31 +0300
Subject: [PATCH] Setup File Filter for Markdown and Ledger content types

- Pass the file associated with each entry to the markdown, beancount
  to JSONL converters
- Add File, Word, Date Filters to the Ledger and Markdown content types
  - Word, Date Filters were accidentally removed from the above types
    yesterday
  - File Filter is the only newly added filter
---
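Reviewer note: below is a minimal, standalone sketch of the entry-to-file
mapping this patch introduces, mirroring the patched extract_markdown_entries
and convert_markdown_entries_to_jsonl. The file name, sample content, and
whitespace constant are illustrative stand-ins, not the module's exact values.

    import json
    import re

    markdown_heading_regex = r'^#'
    whitespace_chars = '\n\r\t '  # stand-in for the module's empty_escape_sequences

    def extract_entries(content_by_file):
        "Split each file at markdown headings, recording each entry's source file"
        entries, entry_to_file_map = [], []
        for markdown_file, content in content_by_file.items():
            entries_per_file = [f'#{entry.strip(whitespace_chars)}'
                                for entry
                                in re.split(markdown_heading_regex, content, flags=re.MULTILINE)]
            # One file reference per extracted entry, at the matching index
            entry_to_file_map += [markdown_file] * len(entries_per_file)
            entries.extend(entries_per_file)
        return entries, entry_to_file_map

    def convert_to_jsonl(entries, entry_to_file_map):
        "Tag each entry with its originating file, emit one JSON object per line"
        return ''.join(
            json.dumps({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'},
                       ensure_ascii=False) + '\n'
            for entry_id, entry in enumerate(entries))

    entries, entry_to_file_map = extract_entries({'notes.md': '# Alpha\nbody a\n# Beta\nbody b'})
    print(convert_to_jsonl(entries, entry_to_file_map))
    # {"compiled": "#", "raw": "#", "file": "notes.md"}
    # {"compiled": "#Alpha\nbody a", "raw": "#Alpha\nbody a", "file": "notes.md"}
    # {"compiled": "#Beta\nbody b", "raw": "#Beta\nbody b", "file": "notes.md"}
    # (the bare "#" entry comes from re.split yielding a leading empty string,
    # matching the patched code's behavior)

The mapping is kept as a list parallel to entries, rather than as (entry, file)
tuples, so downstream consumers can keep treating entries as a plain list of
strings; the new 'file' key in each JSONL object is presumably what FileFilter
keys on to restrict results to matching source files.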
 src/configure.py                            | 46 +++++++++++++++++----
 src/processor/ledger/beancount_to_jsonl.py  | 20 +++++----
 src/processor/markdown/markdown_to_jsonl.py | 19 +++++----
 src/search_filter/date_filter.py            |  2 +-
 src/search_filter/word_filter.py            |  3 --
 5 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/src/configure.py b/src/configure.py
index 3ee594b4..f6476951 100644
--- a/src/configure.py
+++ b/src/configure.py
@@ -42,30 +42,62 @@ def configure_server(args, required=False):
 
 def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None):
     # Initialize Org Notes Search
     if (t == SearchType.Org or t == None) and config.content_type.org:
-        filter_directory = resolve_absolute_path(config.content_type.org.compressed_jsonl.parent)
-        filters = [DateFilter(), WordFilter(filter_directory, search_type=SearchType.Org), FileFilter()]
         # Extract Entries, Generate Notes Embeddings
-        model.orgmode_search = text_search.setup(org_to_jsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, filters=filters)
+        model.orgmode_search = text_search.setup(
+            org_to_jsonl,
+            config.content_type.org,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[
+                DateFilter(),
+                WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org),
+                FileFilter(),
+            ])
 
     # Initialize Org Music Search
     if (t == SearchType.Music or t == None) and config.content_type.music:
         # Extract Entries, Generate Music Embeddings
-        model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate)
+        model.music_search = text_search.setup(
+            org_to_jsonl,
+            config.content_type.music,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate)
 
     # Initialize Markdown Search
     if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
         # Extract Entries, Generate Markdown Embeddings
-        model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate)
+        model.markdown_search = text_search.setup(
+            markdown_to_jsonl,
+            config.content_type.markdown,
+            search_config=config.search_type.asymmetric,
+            regenerate=regenerate,
+            filters=[
+                DateFilter(),
+                WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown),
+                FileFilter(),
+            ])
 
     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
         # Extract Entries, Generate Ledger Embeddings
-        model.ledger_search = text_search.setup(beancount_to_jsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate)
+        model.ledger_search = text_search.setup(
+            beancount_to_jsonl,
+            config.content_type.ledger,
+            search_config=config.search_type.symmetric,
+            regenerate=regenerate,
+            filters=[
+                DateFilter(),
+                WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger),
+                FileFilter(),
+            ])
 
     # Initialize Image Search
     if (t == SearchType.Image or t == None) and config.content_type.image:
         # Extract Entries, Generate Image Embeddings
-        model.image_search = image_search.setup(config.content_type.image, search_config=config.search_type.image, regenerate=regenerate)
+        model.image_search = image_search.setup(
+            config.content_type.image,
+            search_config=config.search_type.image,
+            regenerate=regenerate)
 
     return model
diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py
index 4c1034df..c0136bc6 100644
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@@ -28,10 +28,10 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
     beancount_files = get_beancount_files(beancount_files, beancount_file_filter)
 
     # Extract Entries from specified Beancount files
-    entries = extract_beancount_entries(beancount_files)
+    entries, transaction_to_file_map = extract_beancount_entries(beancount_files)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_beancount_entries_to_jsonl(entries)
+    jsonl_data = convert_beancount_entries_to_jsonl(entries, transaction_to_file_map)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -74,22 +74,24 @@ def extract_beancount_entries(beancount_files):
     empty_newline = f'^[{empty_escape_sequences}]*$'
 
     entries = []
+    transaction_to_file_map = []
     for beancount_file in beancount_files:
         with open(beancount_file) as f:
             ledger_content = f.read()
-            entries.extend([entry.strip(empty_escape_sequences)
+            transactions_per_file = [entry.strip(empty_escape_sequences)
                 for entry
                 in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
-                if re.match(transaction_regex, entry)])
-
-    return entries
+                if re.match(transaction_regex, entry)]
+            transaction_to_file_map += [beancount_file]*len(transactions_per_file)
+            entries.extend(transactions_per_file)
+    return entries, transaction_to_file_map
 
 
-def convert_beancount_entries_to_jsonl(entries):
+def convert_beancount_entries_to_jsonl(entries, transaction_to_file_map):
     "Convert each Beancount transaction to JSON and collate as JSONL"
     jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry}
+    for entry_id, entry in enumerate(entries):
+        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'}
 
         # Convert Dictionary to JSON and Append to JSONL string
         jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py
index 3b4bb371..a0903fcb 100644
--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@@ -28,10 +28,10 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
 
     # Extract Entries from specified Markdown files
-    entries = extract_markdown_entries(markdown_files)
+    entries, entry_to_file_map = extract_markdown_entries(markdown_files)
 
     # Process Each Entry from All Notes Files
-    jsonl_data = convert_markdown_entries_to_jsonl(entries)
+    jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
 
     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
@@ -74,21 +74,24 @@ def extract_markdown_entries(markdown_files):
     markdown_heading_regex = r'^#'
 
     entries = []
+    entry_to_file_map = []
     for markdown_file in markdown_files:
         with open(markdown_file) as f:
             markdown_content = f.read()
-            entries.extend([f'#{entry.strip(empty_escape_sequences)}'
+            markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
                 for entry
-                in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)])
+                in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)]
+            entry_to_file_map += [markdown_file]*len(markdown_entries_per_file)
+            entries.extend(markdown_entries_per_file)
 
-    return entries
+    return entries, entry_to_file_map
 
 
-def convert_markdown_entries_to_jsonl(entries):
+def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
     "Convert each Markdown entries to JSON and collate as JSONL"
     jsonl = ''
-    for entry in entries:
-        entry_dict = {'compiled': entry, 'raw': entry}
+    for entry_id, entry in enumerate(entries):
+        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
 
         # Convert Dictionary to JSON and Append to JSONL string
         jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
diff --git a/src/search_filter/date_filter.py b/src/search_filter/date_filter.py
index 53c7b266..22a66068 100644
--- a/src/search_filter/date_filter.py
+++ b/src/search_filter/date_filter.py
@@ -45,7 +45,7 @@ class DateFilter(BaseFilter):
                 continue
             self.date_to_entry_ids[date_in_entry].add(id)
         end = time.time()
-        logger.debug(f"Created file filter index: {end - start} seconds")
+        logger.debug(f"Created date filter index: {end - start} seconds")
 
 
     def can_filter(self, raw_query):
diff --git a/src/search_filter/word_filter.py b/src/search_filter/word_filter.py
index dcf9ca6b..c7c5d059 100644
--- a/src/search_filter/word_filter.py
+++ b/src/search_filter/word_filter.py
@@ -4,9 +4,6 @@ import time
 import pickle
 import logging
 
-# External Packages
-import torch
-
 # Internal Packages
 from src.search_filter.base_filter import BaseFilter
 from src.utils.helpers import LRU, resolve_absolute_path