diff --git a/src/configure.py b/src/configure.py index d920a614..495ac313 100644 --- a/src/configure.py +++ b/src/configure.py @@ -6,9 +6,9 @@ import logging import json # Internal Packages -from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl -from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl -from src.processor.org_mode.org_to_jsonl import org_to_jsonl +from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl +from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl +from src.processor.org_mode.org_to_jsonl import OrgToJsonl from src.search_type import image_search, text_search from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel from src.utils import state @@ -44,7 +44,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, if (t == SearchType.Org or t == None) and config.content_type.org: # Extract Entries, Generate Notes Embeddings model.orgmode_search = text_search.setup( - org_to_jsonl, + OrgToJsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, @@ -54,7 +54,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, if (t == SearchType.Music or t == None) and config.content_type.music: # Extract Entries, Generate Music Embeddings model.music_search = text_search.setup( - org_to_jsonl, + OrgToJsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, @@ -64,7 +64,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, if (t == SearchType.Markdown or t == None) and config.content_type.markdown: # Extract Entries, Generate Markdown Embeddings model.markdown_search = text_search.setup( - markdown_to_jsonl, + MarkdownToJsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, @@ -74,7 +74,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, if (t == SearchType.Ledger or t == None) and config.content_type.ledger: # Extract Entries, Generate Ledger Embeddings model.ledger_search = text_search.setup( - beancount_to_jsonl, + BeancountToJsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate, diff --git a/src/processor/ledger/beancount_to_jsonl.py b/src/processor/ledger/beancount_to_jsonl.py index 7b8b9bba..d54b7e1b 100644 --- a/src/processor/ledger/beancount_to_jsonl.py +++ b/src/processor/ledger/beancount_to_jsonl.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Standard Packages import json import glob @@ -8,121 +6,122 @@ import logging import time # Internal Packages +from src.processor.text_to_jsonl import TextToJsonl from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data -from src.utils.rawconfig import TextContentConfig logger = logging.getLogger(__name__) -# Define Functions -def beancount_to_jsonl(config: TextContentConfig, previous_entries=None): - # Extract required fields from config - beancount_files, beancount_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl +class BeancountToJsonl(TextToJsonl): + # Define Functions + def process(self, previous_entries=None): + # Extract required fields from config + beancount_files, beancount_file_filter, output_file = self.config.input_files, 
self.config.input_filter,self.config.compressed_jsonl - # Input Validation - if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter): - print("At least one of beancount-files or beancount-file-filter is required to be specified") - exit(1) + # Input Validation + if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter): + print("At least one of beancount-files or beancount-file-filter is required to be specified") + exit(1) - # Get Beancount Files to Process - beancount_files = get_beancount_files(beancount_files, beancount_file_filter) + # Get Beancount Files to Process + beancount_files = BeancountToJsonl.get_beancount_files(beancount_files, beancount_file_filter) - # Extract Entries from specified Beancount files - start = time.time() - current_entries = convert_transactions_to_maps(*extract_beancount_transactions(beancount_files)) - end = time.time() - logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") + # Extract Entries from specified Beancount files + start = time.time() + current_entries = BeancountToJsonl.convert_transactions_to_maps(*BeancountToJsonl.extract_beancount_transactions(beancount_files)) + end = time.time() + logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds") - # Identify, mark and merge any new entries with previous entries - start = time.time() - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) - end = time.time() - logger.debug(f"Identify new or updated transaction: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries + start = time.time() + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) + end = time.time() + logger.debug(f"Identify new or updated transaction: {end - start} seconds") - # Process Each Entry from All Notes Files - start = time.time() - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = convert_transaction_maps_to_jsonl(entries) + # Process Each Entry from All Notes Files + start = time.time() + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = BeancountToJsonl.convert_transaction_maps_to_jsonl(entries) - # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) - end = time.time() - logger.debug(f"Write transactions to JSONL file: {end - start} seconds") + # Compress JSONL formatted Data + if output_file.suffix == ".gz": + compress_jsonl_data(jsonl_data, output_file) + elif output_file.suffix == ".jsonl": + dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write transactions to JSONL file: {end - start} seconds") - return entries_with_ids + return entries_with_ids + @staticmethod + def get_beancount_files(beancount_files=None, beancount_file_filters=None): + "Get Beancount files to process" + absolute_beancount_files, filtered_beancount_files = set(), set() + if beancount_files: + absolute_beancount_files = {get_absolute_path(beancount_file) + for beancount_file + in beancount_files} + if beancount_file_filters: + filtered_beancount_files = { + filtered_file + for 
beancount_file_filter in beancount_file_filters + for filtered_file in glob.glob(get_absolute_path(beancount_file_filter)) + } -def get_beancount_files(beancount_files=None, beancount_file_filters=None): - "Get Beancount files to process" - absolute_beancount_files, filtered_beancount_files = set(), set() - if beancount_files: - absolute_beancount_files = {get_absolute_path(beancount_file) - for beancount_file - in beancount_files} - if beancount_file_filters: - filtered_beancount_files = { - filtered_file - for beancount_file_filter in beancount_file_filters - for filtered_file in glob.glob(get_absolute_path(beancount_file_filter)) + all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files) + + files_with_non_beancount_extensions = { + beancount_file + for beancount_file + in all_beancount_files + if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount") } + if any(files_with_non_beancount_extensions): + print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}") - all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files) + logger.info(f'Processing files: {all_beancount_files}') - files_with_non_beancount_extensions = { - beancount_file - for beancount_file - in all_beancount_files - if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount") - } - if any(files_with_non_beancount_extensions): - print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}") + return all_beancount_files - logger.info(f'Processing files: {all_beancount_files}') + @staticmethod + def extract_beancount_transactions(beancount_files): + "Extract entries from specified Beancount files" - return all_beancount_files + # Initialize Regex for extracting Beancount Entries + transaction_regex = r'^\n?\d{4}-\d{2}-\d{2} [\*|\!] ' + empty_newline = f'^[\n\r\t\ ]*$' + entries = [] + transaction_to_file_map = [] + for beancount_file in beancount_files: + with open(beancount_file) as f: + ledger_content = f.read() + transactions_per_file = [entry.strip(empty_escape_sequences) + for entry + in re.split(empty_newline, ledger_content, flags=re.MULTILINE) + if re.match(transaction_regex, entry)] + transaction_to_file_map += zip(transactions_per_file, [beancount_file]*len(transactions_per_file)) + entries.extend(transactions_per_file) + return entries, dict(transaction_to_file_map) -def extract_beancount_transactions(beancount_files): - "Extract entries from specified Beancount files" + @staticmethod + def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: + "Convert each Beancount transaction into a dictionary" + entry_maps = [] + for entry in entries: + entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry]}'}) - # Initialize Regex for extracting Beancount Entries - transaction_regex = r'^\n?\d{4}-\d{2}-\d{2} [\*|\!] 
' - empty_newline = f'^[\n\r\t\ ]*$' + logger.info(f"Converted {len(entries)} transactions to dictionaries") - entries = [] - transaction_to_file_map = [] - for beancount_file in beancount_files: - with open(beancount_file) as f: - ledger_content = f.read() - transactions_per_file = [entry.strip(empty_escape_sequences) - for entry - in re.split(empty_newline, ledger_content, flags=re.MULTILINE) - if re.match(transaction_regex, entry)] - transaction_to_file_map += zip(transactions_per_file, [beancount_file]*len(transactions_per_file)) - entries.extend(transactions_per_file) - return entries, dict(transaction_to_file_map) + return entry_maps - -def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]: - "Convert each Beancount transaction into a dictionary" - entry_maps = [] - for entry in entries: - entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry]}'}) - - logger.info(f"Converted {len(entries)} transactions to dictionaries") - - return entry_maps - - -def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str: - "Convert each Beancount transaction dictionary to JSON and collate as JSONL" - return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) + @staticmethod + def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str: + "Convert each Beancount transaction dictionary to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) diff --git a/src/processor/markdown/markdown_to_jsonl.py b/src/processor/markdown/markdown_to_jsonl.py index 22f5ea17..48fbbdf9 100644 --- a/src/processor/markdown/markdown_to_jsonl.py +++ b/src/processor/markdown/markdown_to_jsonl.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Standard Packages import json import glob @@ -8,120 +6,121 @@ import logging import time # Internal Packages +from src.processor.text_to_jsonl import TextToJsonl from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.constants import empty_escape_sequences from src.utils.jsonl import dump_jsonl, compress_jsonl_data -from src.utils.rawconfig import TextContentConfig logger = logging.getLogger(__name__) -# Define Functions -def markdown_to_jsonl(config: TextContentConfig, previous_entries=None): - # Extract required fields from config - markdown_files, markdown_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl +class MarkdownToJsonl(TextToJsonl): + # Define Functions + def process(self, previous_entries=None): + # Extract required fields from config + markdown_files, markdown_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl - # Input Validation - if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): - print("At least one of markdown-files or markdown-file-filter is required to be specified") - exit(1) + # Input Validation + if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter): + print("At least one of markdown-files or markdown-file-filter is required to be specified") + exit(1) - # Get Markdown Files to Process - markdown_files = get_markdown_files(markdown_files, markdown_file_filter) + # Get Markdown Files to Process + markdown_files = MarkdownToJsonl.get_markdown_files(markdown_files, markdown_file_filter) - # Extract Entries from specified Markdown files - start = time.time() - current_entries = 
convert_markdown_entries_to_maps(*extract_markdown_entries(markdown_files)) - end = time.time() - logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") + # Extract Entries from specified Markdown files + start = time.time() + current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(*MarkdownToJsonl.extract_markdown_entries(markdown_files)) + end = time.time() + logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds") - # Identify, mark and merge any new entries with previous entries - start = time.time() - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) - end = time.time() - logger.debug(f"Identify new or updated entries: {end - start} seconds") + # Identify, mark and merge any new entries with previous entries + start = time.time() + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) + end = time.time() + logger.debug(f"Identify new or updated entries: {end - start} seconds") - # Process Each Entry from All Notes Files - start = time.time() - entries = list(map(lambda entry: entry[1], entries_with_ids)) - jsonl_data = convert_markdown_maps_to_jsonl(entries) + # Process Each Entry from All Notes Files + start = time.time() + entries = list(map(lambda entry: entry[1], entries_with_ids)) + jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) - # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) - end = time.time() - logger.debug(f"Write markdown entries to JSONL file: {end - start} seconds") + # Compress JSONL formatted Data + if output_file.suffix == ".gz": + compress_jsonl_data(jsonl_data, output_file) + elif output_file.suffix == ".jsonl": + dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write markdown entries to JSONL file: {end - start} seconds") - return entries_with_ids + return entries_with_ids + @staticmethod + def get_markdown_files(markdown_files=None, markdown_file_filters=None): + "Get Markdown files to process" + absolute_markdown_files, filtered_markdown_files = set(), set() + if markdown_files: + absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} + if markdown_file_filters: + filtered_markdown_files = { + filtered_file + for markdown_file_filter in markdown_file_filters + for filtered_file in glob.glob(get_absolute_path(markdown_file_filter)) + } -def get_markdown_files(markdown_files=None, markdown_file_filters=None): - "Get Markdown files to process" - absolute_markdown_files, filtered_markdown_files = set(), set() - if markdown_files: - absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files} - if markdown_file_filters: - filtered_markdown_files = { - filtered_file - for markdown_file_filter in markdown_file_filters - for filtered_file in glob.glob(get_absolute_path(markdown_file_filter)) + all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) + + files_with_non_markdown_extensions = { + md_file + for md_file + in all_markdown_files + if not md_file.endswith(".md") and not md_file.endswith('.markdown') } - 
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files) + if any(files_with_non_markdown_extensions): + logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}") - files_with_non_markdown_extensions = { - md_file - for md_file - in all_markdown_files - if not md_file.endswith(".md") and not md_file.endswith('.markdown') - } + logger.info(f'Processing files: {all_markdown_files}') - if any(files_with_non_markdown_extensions): - logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}") + return all_markdown_files - logger.info(f'Processing files: {all_markdown_files}') + @staticmethod + def extract_markdown_entries(markdown_files): + "Extract entries by heading from specified Markdown files" - return all_markdown_files + # Regex to extract Markdown Entries by Heading + markdown_heading_regex = r'^#' + entries = [] + entry_to_file_map = [] + for markdown_file in markdown_files: + with open(markdown_file) as f: + markdown_content = f.read() + markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}' + for entry + in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE) + if entry.strip(empty_escape_sequences) != ''] + entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file)) + entries.extend(markdown_entries_per_file) -def extract_markdown_entries(markdown_files): - "Extract entries by heading from specified Markdown files" + return entries, dict(entry_to_file_map) - # Regex to extract Markdown Entries by Heading - markdown_heading_regex = r'^#' + @staticmethod + def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]: + "Convert each Markdown entries into a dictionary" + entry_maps = [] + for entry in entries: + entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'}) - entries = [] - entry_to_file_map = [] - for markdown_file in markdown_files: - with open(markdown_file) as f: - markdown_content = f.read() - markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}' - for entry - in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE) - if entry.strip(empty_escape_sequences) != ''] - entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file)) - entries.extend(markdown_entries_per_file) + logger.info(f"Converted {len(entries)} markdown entries to dictionaries") - return entries, dict(entry_to_file_map) + return entry_maps - -def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]: - "Convert each Markdown entries into a dictionary" - entry_maps = [] - for entry in entries: - entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'}) - - logger.info(f"Converted {len(entries)} markdown entries to dictionaries") - - return entry_maps - - -def convert_markdown_maps_to_jsonl(entries): - "Convert each Markdown entries to JSON and collate as JSONL" - return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) + @staticmethod + def convert_markdown_maps_to_jsonl(entries): + "Convert each Markdown entries to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) diff --git a/src/processor/org_mode/org_to_jsonl.py b/src/processor/org_mode/org_to_jsonl.py index 43f4acef..c4c18ce9 100644 --- 
a/src/processor/org_mode/org_to_jsonl.py +++ b/src/processor/org_mode/org_to_jsonl.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - # Standard Packages import json import glob @@ -9,147 +7,148 @@ from typing import Iterable # Internal Packages from src.processor.org_mode import orgnode +from src.processor.text_to_jsonl import TextToJsonl from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update from src.utils.jsonl import dump_jsonl, compress_jsonl_data from src.utils import state -from src.utils.rawconfig import TextContentConfig logger = logging.getLogger(__name__) -# Define Functions -def org_to_jsonl(config: TextContentConfig, previous_entries=None): - # Extract required fields from config - org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl - index_heading_entries = config.index_heading_entries +class OrgToJsonl(TextToJsonl): + # Define Functions + def process(self, previous_entries=None): + # Extract required fields from config + org_files, org_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl + index_heading_entries = self.config.index_heading_entries - # Input Validation - if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): - print("At least one of org-files or org-file-filter is required to be specified") - exit(1) + # Input Validation + if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter): + print("At least one of org-files or org-file-filter is required to be specified") + exit(1) - # Get Org Files to Process - start = time.time() - org_files = get_org_files(org_files, org_file_filter) + # Get Org Files to Process + start = time.time() + org_files = OrgToJsonl.get_org_files(org_files, org_file_filter) - # Extract Entries from specified Org files - start = time.time() - entry_nodes, file_to_entries = extract_org_entries(org_files) - end = time.time() - logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") + # Extract Entries from specified Org files + start = time.time() + entry_nodes, file_to_entries = self.extract_org_entries(org_files) + end = time.time() + logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds") - start = time.time() - current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) - end = time.time() - logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") + start = time.time() + current_entries = self.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries) + end = time.time() + logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds") - # Identify, mark and merge any new entries with previous entries - if not previous_entries: - entries_with_ids = list(enumerate(current_entries)) - else: - entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) + # Identify, mark and merge any new entries with previous entries + if not previous_entries: + entries_with_ids = list(enumerate(current_entries)) + else: + entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger) - # Process Each Entry from All Notes Files - start = time.time() - entries = map(lambda entry: entry[1], entries_with_ids) - jsonl_data = convert_org_entries_to_jsonl(entries) + # Process Each Entry from All Notes Files + start = time.time() + entries = 
map(lambda entry: entry[1], entries_with_ids) + jsonl_data = self.convert_org_entries_to_jsonl(entries) - # Compress JSONL formatted Data - if output_file.suffix == ".gz": - compress_jsonl_data(jsonl_data, output_file) - elif output_file.suffix == ".jsonl": - dump_jsonl(jsonl_data, output_file) - end = time.time() - logger.debug(f"Write org entries to JSONL file: {end - start} seconds") + # Compress JSONL formatted Data + if output_file.suffix == ".gz": + compress_jsonl_data(jsonl_data, output_file) + elif output_file.suffix == ".jsonl": + dump_jsonl(jsonl_data, output_file) + end = time.time() + logger.debug(f"Write org entries to JSONL file: {end - start} seconds") - return entries_with_ids + return entries_with_ids + @staticmethod + def get_org_files(org_files=None, org_file_filters=None): + "Get Org files to process" + absolute_org_files, filtered_org_files = set(), set() + if org_files: + absolute_org_files = { + get_absolute_path(org_file) + for org_file + in org_files + } + if org_file_filters: + filtered_org_files = { + filtered_file + for org_file_filter in org_file_filters + for filtered_file in glob.glob(get_absolute_path(org_file_filter)) + } -def get_org_files(org_files=None, org_file_filters=None): - "Get Org files to process" - absolute_org_files, filtered_org_files = set(), set() - if org_files: - absolute_org_files = { - get_absolute_path(org_file) - for org_file - in org_files - } - if org_file_filters: - filtered_org_files = { - filtered_file - for org_file_filter in org_file_filters - for filtered_file in glob.glob(get_absolute_path(org_file_filter)) - } + all_org_files = sorted(absolute_org_files | filtered_org_files) - all_org_files = sorted(absolute_org_files | filtered_org_files) + files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")} + if any(files_with_non_org_extensions): + logger.warn(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}") - files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")} - if any(files_with_non_org_extensions): - logger.warn(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}") + logger.info(f'Processing files: {all_org_files}') - logger.info(f'Processing files: {all_org_files}') + return all_org_files - return all_org_files + @staticmethod + def extract_org_entries(org_files): + "Extract entries from specified Org files" + entries = [] + entry_to_file_map = [] + for org_file in org_files: + org_file_entries = orgnode.makelist(str(org_file)) + entry_to_file_map += zip(org_file_entries, [org_file]*len(org_file_entries)) + entries.extend(org_file_entries) + return entries, dict(entry_to_file_map) -def extract_org_entries(org_files): - "Extract entries from specified Org files" - entries = [] - entry_to_file_map = [] - for org_file in org_files: - org_file_entries = orgnode.makelist(str(org_file)) - entry_to_file_map += zip(org_file_entries, [org_file]*len(org_file_entries)) - entries.extend(org_file_entries) + @staticmethod + def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]: + "Convert Org-Mode entries into list of dictionary" + entry_maps = [] + for entry in entries: + entry_dict = dict() - return entries, dict(entry_to_file_map) + if not entry.hasBody and not index_heading_entries: + # Ignore title notes i.e notes with just headings and empty body + continue - -def 
convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]: - "Convert Org-Mode entries into list of dictionary" - entry_maps = [] - for entry in entries: - entry_dict = dict() - - if not entry.hasBody and not index_heading_entries: - # Ignore title notes i.e notes with just headings and empty body - continue - - entry_dict["compiled"] = f'{entry.heading}.' - if state.verbose > 2: - logger.debug(f"Title: {entry.heading}") - - if entry.tags: - tags_str = " ".join(entry.tags) - entry_dict["compiled"] += f'\t {tags_str}.' + entry_dict["compiled"] = f'{entry.heading}.' if state.verbose > 2: - logger.debug(f"Tags: {tags_str}") + logger.debug(f"Title: {entry.heading}") - if entry.closed: - entry_dict["compiled"] += f'\n Closed on {entry.closed.strftime("%Y-%m-%d")}.' - if state.verbose > 2: - logger.debug(f'Closed: {entry.closed.strftime("%Y-%m-%d")}') + if entry.tags: + tags_str = " ".join(entry.tags) + entry_dict["compiled"] += f'\t {tags_str}.' + if state.verbose > 2: + logger.debug(f"Tags: {tags_str}") - if entry.scheduled: - entry_dict["compiled"] += f'\n Scheduled for {entry.scheduled.strftime("%Y-%m-%d")}.' - if state.verbose > 2: - logger.debug(f'Scheduled: {entry.scheduled.strftime("%Y-%m-%d")}') + if entry.closed: + entry_dict["compiled"] += f'\n Closed on {entry.closed.strftime("%Y-%m-%d")}.' + if state.verbose > 2: + logger.debug(f'Closed: {entry.closed.strftime("%Y-%m-%d")}') - if entry.hasBody: - entry_dict["compiled"] += f'\n {entry.body}' - if state.verbose > 2: - logger.debug(f"Body: {entry.body}") + if entry.scheduled: + entry_dict["compiled"] += f'\n Scheduled for {entry.scheduled.strftime("%Y-%m-%d")}.' + if state.verbose > 2: + logger.debug(f'Scheduled: {entry.scheduled.strftime("%Y-%m-%d")}') - if entry_dict: - entry_dict["raw"] = f'{entry}' - entry_dict["file"] = f'{entry_to_file_map[entry]}' + if entry.hasBody: + entry_dict["compiled"] += f'\n {entry.body}' + if state.verbose > 2: + logger.debug(f"Body: {entry.body}") - # Convert Dictionary to JSON and Append to JSONL string - entry_maps.append(entry_dict) + if entry_dict: + entry_dict["raw"] = f'{entry}' + entry_dict["file"] = f'{entry_to_file_map[entry]}' - return entry_maps + # Convert Dictionary to JSON and Append to JSONL string + entry_maps.append(entry_dict) + return entry_maps -def convert_org_entries_to_jsonl(entries: Iterable[dict]) -> str: - "Convert each Org-Mode entry to JSON and collate as JSONL" - return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) + @staticmethod + def convert_org_entries_to_jsonl(entries: Iterable[dict]) -> str: + "Convert each Org-Mode entry to JSON and collate as JSONL" + return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries]) diff --git a/src/processor/text_to_jsonl.py b/src/processor/text_to_jsonl.py new file mode 100644 index 00000000..e59c5fb1 --- /dev/null +++ b/src/processor/text_to_jsonl.py @@ -0,0 +1,14 @@ +# Standard Packages +from abc import ABC, abstractmethod +from typing import Iterable + +# Internal Packages +from src.utils.rawconfig import TextContentConfig + + +class TextToJsonl(ABC): + def __init__(self, config: TextContentConfig): + self.config = config + + @abstractmethod + def process(self, previous_entries: Iterable[tuple[int, dict]]=None) -> list[tuple[int, dict]]: ... 
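
Note: the new TextToJsonl base class above standardizes the processor interface. Each processor is constructed with its TextContentConfig and exposes a single process() entry point, which is what lets text_search.setup treat all content types uniformly. A minimal sketch of how a hypothetical new processor would plug in (illustrative names only, not part of this change):

    # Hypothetical example, not part of this diff: a new processor only needs to
    # subclass TextToJsonl and implement process(), returning (id, entry) tuples
    # like the Org, Markdown and Beancount processors do.
    from src.processor.text_to_jsonl import TextToJsonl

    class PlaintextToJsonl(TextToJsonl):
        def process(self, previous_entries=None):
            # Read files from self.config.input_files, convert them to entry
            # dictionaries, write them to self.config.compressed_jsonl, then
            # return the entries paired with their indices.
            current_entries = [{'compiled': 'example note', 'raw': 'example note', 'file': 'notes.txt'}]
            return list(enumerate(current_entries))
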
diff --git a/src/search_type/text_search.py b/src/search_type/text_search.py index d4d8a9d4..ff7d9c43 100644 --- a/src/search_type/text_search.py +++ b/src/search_type/text_search.py @@ -1,10 +1,12 @@ # Standard Packages import logging import time +from typing import Type # External Packages import torch from sentence_transformers import SentenceTransformer, CrossEncoder, util +from src.processor.text_to_jsonl import TextToJsonl from src.search_filter.base_filter import BaseFilter # Internal Packages @@ -179,14 +181,14 @@ def collate_results(hits, entries, count=5): in hits[0:count]] -def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel: +def setup(text_to_jsonl: Type[TextToJsonl], config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel: # Initialize Model bi_encoder, cross_encoder, top_k = initialize_model(search_config) # Map notes in text files to (compressed) JSONL formatted file config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl) previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None - entries_with_indices = text_to_jsonl(config, previous_entries) + entries_with_indices = text_to_jsonl(config).process(previous_entries) # Extract Updated Entries entries = extract_entries(config.compressed_jsonl) diff --git a/tests/conftest.py b/tests/conftest.py index f6c0a7ea..103a28e8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ from src.search_type import image_search, text_search from src.utils.config import SearchType from src.utils.helpers import resolve_absolute_path from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig -from src.processor.org_mode.org_to_jsonl import org_to_jsonl +from src.processor.org_mode.org_to_jsonl import OrgToJsonl from src.search_filter.date_filter import DateFilter from src.search_filter.word_filter import WordFilter from src.search_filter.file_filter import FileFilter @@ -60,6 +60,6 @@ def content_config(tmp_path_factory, search_config: SearchConfig): embeddings_file = content_dir.joinpath('note_embeddings.pt')) filters = [DateFilter(), WordFilter(), FileFilter()] - text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) + text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) return content_config diff --git a/tests/test_beancount_to_jsonl.py b/tests/test_beancount_to_jsonl.py index 51a4dffd..2c1cb9e6 100644 --- a/tests/test_beancount_to_jsonl.py +++ b/tests/test_beancount_to_jsonl.py @@ -2,7 +2,7 @@ import json # Internal Packages -from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl, get_beancount_files +from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl def test_no_transactions_in_file(tmp_path): @@ -16,10 +16,11 @@ def test_no_transactions_in_file(tmp_path): # Act # Extract Entries from specified Beancount files - entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file]) + entry_nodes, file_to_entries = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file]) # Process Each Entry from All Beancount Files - 
jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries)) + jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl( + BeancountToJsonl.convert_transactions_to_maps(entry_nodes, file_to_entries)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -38,10 +39,11 @@ Assets:Test:Test -1.00 KES # Act # Extract Entries from specified Beancount files - entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file]) + entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file]) # Process Each Entry from All Beancount Files - jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map)) + jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl( + BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -65,10 +67,11 @@ Assets:Test:Test -1.00 KES # Act # Extract Entries from specified Beancount files - entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file]) + entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file]) # Process Each Entry from All Beancount Files - jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map)) + jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl( + BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -96,7 +99,7 @@ def test_get_beancount_files(tmp_path): input_filter = [tmp_path / 'group1*.bean', tmp_path / 'group2*.beancount'] # Act - extracted_org_files = get_beancount_files(input_files, input_filter) + extracted_org_files = BeancountToJsonl.get_beancount_files(input_files, input_filter) # Assert assert len(extracted_org_files) == 5 diff --git a/tests/test_client.py b/tests/test_client.py index d405a044..96fa2c01 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -12,7 +12,7 @@ from src.main import app from src.utils.state import model, config from src.search_type import text_search, image_search from src.utils.rawconfig import ContentConfig, SearchConfig -from src.processor.org_mode.org_to_jsonl import org_to_jsonl +from src.processor.org_mode.org_to_jsonl import OrgToJsonl from src.search_filter.word_filter import WordFilter from src.search_filter.file_filter import FileFilter @@ -118,7 +118,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig # ---------------------------------------------------------------------------------------------------- def test_notes_search(content_config: ContentConfig, search_config: SearchConfig): # Arrange - model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) user_query = quote("How to git install application?") # Act @@ -135,7 +135,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig def test_notes_search_with_only_filters(content_config: ContentConfig, search_config: SearchConfig): # Arrange filters = [WordFilter(), FileFilter()] - model.orgmode_search = 
text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) + model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = quote('+"Emacs" file:"*.org"') # Act @@ -152,7 +152,7 @@ def test_notes_search_with_only_filters(content_config: ContentConfig, search_co def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange filters = [WordFilter()] - model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) + model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = quote('How to git install application? +"Emacs"') # Act @@ -169,7 +169,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_ def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig): # Arrange filters = [WordFilter()] - model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) + model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters) user_query = quote('How to git install application? -"clone"') # Act diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index 89c471d8..c4c72688 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -2,7 +2,7 @@ import json # Internal Packages -from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps, get_markdown_files +from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl def test_markdown_file_with_no_headings_to_jsonl(tmp_path): @@ -16,10 +16,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile]) + entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries)) + jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( + MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -37,10 +38,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile]) + entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map)) + jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( + MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -62,10 +64,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - 
entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile]) + entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map)) + jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( + MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -93,7 +96,7 @@ def test_get_markdown_files(tmp_path): input_filter = [tmp_path / 'group1*.md', tmp_path / 'group2*.markdown'] # Act - extracted_org_files = get_markdown_files(input_files, input_filter) + extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter) # Assert assert len(extracted_org_files) == 5 diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py index 8a2f58ba..2dbedcd0 100644 --- a/tests/test_org_to_jsonl.py +++ b/tests/test_org_to_jsonl.py @@ -2,7 +2,7 @@ import json # Internal Packages -from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries, get_org_files +from src.processor.org_mode.org_to_jsonl import OrgToJsonl from src.utils.helpers import is_none_or_empty @@ -21,8 +21,8 @@ def test_configure_heading_entry_to_jsonl(tmp_path): for index_heading_entries in [True, False]: # Act # Extract entries into jsonl from specified Org files - jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries( - *extract_org_entries(org_files=[orgfile]), + jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries( + *OrgToJsonl.extract_org_entries(org_files=[orgfile]), index_heading_entries=index_heading_entries)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -49,10 +49,10 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entries, entry_to_file_map = extract_org_entries(org_files=[orgfile]) + entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map)) + jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -70,11 +70,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile]) + entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile]) # Process Each Entry from All Notes Files - entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries) - jsonl_string = convert_org_entries_to_jsonl(entries) + entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries) + jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] # Assert @@ -102,7 +102,7 @@ def test_get_org_files(tmp_path): input_filter = [tmp_path / 'group1*.org', tmp_path / 'group2*.org'] # Act - extracted_org_files = get_org_files(input_files, input_filter) + extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter) # 
Assert assert len(extracted_org_files) == 5 diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 6744566d..584c07b9 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -9,7 +9,7 @@ import pytest from src.utils.state import model from src.search_type import text_search from src.utils.rawconfig import ContentConfig, SearchConfig -from src.processor.org_mode.org_to_jsonl import org_to_jsonl +from src.processor.org_mode.org_to_jsonl import OrgToJsonl # Test @@ -24,7 +24,7 @@ def test_asymmetric_setup_with_missing_file_raises_error(content_config: Content # Act # Generate notes embeddings during asymmetric setup with pytest.raises(FileNotFoundError): - text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True) + text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True) # ---------------------------------------------------------------------------------------------------- @@ -39,7 +39,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo # Act # Generate notes embeddings during asymmetric setup with pytest.raises(ValueError, match=r'^No valid entries found*'): - text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True) + text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True) # Cleanup # delete created test file @@ -50,7 +50,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig): # Act # Regenerate notes embeddings during asymmetric setup - notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) + notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) # Assert assert len(notes_model.entries) == 10 @@ -60,7 +60,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo # ---------------------------------------------------------------------------------------------------- def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig): # Arrange - model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) + model.notes_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) query = "How to git install application?" 
# Act @@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC # ---------------------------------------------------------------------------------------------------- def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig): # Arrange - initial_notes_model= text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) assert len(initial_notes_model.entries) == 10 assert len(initial_notes_model.corpus_embeddings) == 10 @@ -96,11 +96,11 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n") # regenerate notes jsonl, model embeddings and model to include entry from new file - regenerated_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) + regenerated_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) # Act # reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files - initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) # Assert assert len(regenerated_notes_model.entries) == 11 @@ -119,7 +119,7 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC # ---------------------------------------------------------------------------------------------------- def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig): # Arrange - initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True) + initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True) assert len(initial_notes_model.entries) == 10 assert len(initial_notes_model.corpus_embeddings) == 10 @@ -133,7 +133,7 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search # Act # update embeddings, entries with the newly added note - initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False) + initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False) # verify new entry added in updated embeddings, entries assert len(initial_notes_model.entries) == 11
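
Taken together, callers now pass the processor class rather than a function to text_search.setup, which instantiates it with the content config and calls process(). A sketch of the resulting call path, assuming the loaded config objects shown in configure.py above:

    # Sketch of the new call path (names from this diff; `config` is assumed to be a loaded FullConfig).
    from src.processor.org_mode.org_to_jsonl import OrgToJsonl
    from src.search_type import text_search

    model.orgmode_search = text_search.setup(
        OrgToJsonl,                                   # the processor class, not a function
        config.content_type.org,                      # TextContentConfig for org notes
        search_config=config.search_type.asymmetric,
        regenerate=False,
    )
    # Inside setup(): entries_with_indices = text_to_jsonl(config).process(previous_entries)
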