Mirror of https://github.com/khoj-ai/khoj.git (synced 2025-02-17 16:14:21 +00:00)
Use Base TextToJsonl class to standardize <text>_to_jsonl processors
- Start standardizing implementation of the `text_to_jsonl` processors
  - The `text_to_jsonl` scripts already had a shared structure
  - This change starts to codify that implicit structure
- Benefits
  - Ease adding more `text_to_jsonl` processors
  - Allow merging shared functionality
  - Help with type hinting
- Drawbacks
  - Lower agility to change. But this was already an implicit issue as the `text_to_jsonl` processors got more deeply wired into the app
parent c16ae9e344
commit 02d944030f

12 changed files with 364 additions and 345 deletions
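The shape of the refactor, in miniature. The sketch below is illustrative only, not code from this commit: it assumes just what the diff shows, a base class that stores the content config and exposes an abstract process method, so callers such as text_search.setup can accept any processor class, instantiate it with its config, and run it. The DemoToJsonl subclass and its body are hypothetical.

# Illustrative sketch of the processor pattern this commit codifies.
# DemoToJsonl is hypothetical; TextToJsonl mirrors the base class added
# in src/processor/text_to_jsonl.py further down in this diff.
from abc import ABC, abstractmethod
from typing import Type


class TextToJsonl(ABC):
    def __init__(self, config):
        self.config = config

    @abstractmethod
    def process(self, previous_entries=None) -> list[tuple[int, dict]]: ...


class DemoToJsonl(TextToJsonl):
    def process(self, previous_entries=None) -> list[tuple[int, dict]]:
        # A real processor would parse self.config.input_files here.
        entries = [{'compiled': 'demo', 'raw': 'demo', 'file': 'demo.txt'}]
        return list(enumerate(entries))


def setup(text_to_jsonl: Type[TextToJsonl], config):
    # Callers pass the class itself; setup instantiates and runs it.
    return text_to_jsonl(config).process(previous_entries=None)


print(setup(DemoToJsonl, config=None))  # [(0, {'compiled': 'demo', ...})]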
@@ -6,9 +6,9 @@ import logging
 import json

 # Internal Packages
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
-from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl
+from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.search_type import image_search, text_search
 from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
 from src.utils import state
@@ -44,7 +44,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Org or t == None) and config.content_type.org:
         # Extract Entries, Generate Notes Embeddings
         model.orgmode_search = text_search.setup(
-            org_to_jsonl,
+            OrgToJsonl,
             config.content_type.org,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
@@ -54,7 +54,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Music or t == None) and config.content_type.music:
         # Extract Entries, Generate Music Embeddings
         model.music_search = text_search.setup(
-            org_to_jsonl,
+            OrgToJsonl,
             config.content_type.music,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
@@ -64,7 +64,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
         # Extract Entries, Generate Markdown Embeddings
         model.markdown_search = text_search.setup(
-            markdown_to_jsonl,
+            MarkdownToJsonl,
             config.content_type.markdown,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
@@ -74,7 +74,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
         # Extract Entries, Generate Ledger Embeddings
         model.ledger_search = text_search.setup(
-            beancount_to_jsonl,
+            BeancountToJsonl,
             config.content_type.ledger,
             search_config=config.search_type.symmetric,
             regenerate=regenerate,
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # Standard Packages
 import json
 import glob
@@ -8,121 +6,122 @@ import logging
 import time

 # Internal Packages
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
-from src.utils.rawconfig import TextContentConfig


 logger = logging.getLogger(__name__)


-# Define Functions
-def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
-    # Extract required fields from config
-    beancount_files, beancount_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
+class BeancountToJsonl(TextToJsonl):
+    # Define Functions
+    def process(self, previous_entries=None):
+        # Extract required fields from config
+        beancount_files, beancount_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl

         # Input Validation
         if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
             print("At least one of beancount-files or beancount-file-filter is required to be specified")
             exit(1)

         # Get Beancount Files to Process
-    beancount_files = get_beancount_files(beancount_files, beancount_file_filter)
+        beancount_files = BeancountToJsonl.get_beancount_files(beancount_files, beancount_file_filter)

         # Extract Entries from specified Beancount files
         start = time.time()
-    current_entries = convert_transactions_to_maps(*extract_beancount_transactions(beancount_files))
+        current_entries = BeancountToJsonl.convert_transactions_to_maps(*BeancountToJsonl.extract_beancount_transactions(beancount_files))
         end = time.time()
         logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds")

         # Identify, mark and merge any new entries with previous entries
         start = time.time()
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
             entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
         end = time.time()
         logger.debug(f"Identify new or updated transaction: {end - start} seconds")

         # Process Each Entry from All Notes Files
         start = time.time()
         entries = list(map(lambda entry: entry[1], entries_with_ids))
-    jsonl_data = convert_transaction_maps_to_jsonl(entries)
+        jsonl_data = BeancountToJsonl.convert_transaction_maps_to_jsonl(entries)

         # Compress JSONL formatted Data
         if output_file.suffix == ".gz":
             compress_jsonl_data(jsonl_data, output_file)
         elif output_file.suffix == ".jsonl":
             dump_jsonl(jsonl_data, output_file)
         end = time.time()
         logger.debug(f"Write transactions to JSONL file: {end - start} seconds")

         return entries_with_ids

-def get_beancount_files(beancount_files=None, beancount_file_filters=None):
+    @staticmethod
+    def get_beancount_files(beancount_files=None, beancount_file_filters=None):
         "Get Beancount files to process"
         absolute_beancount_files, filtered_beancount_files = set(), set()
         if beancount_files:
             absolute_beancount_files = {get_absolute_path(beancount_file)
                                         for beancount_file
                                         in beancount_files}
         if beancount_file_filters:
             filtered_beancount_files = {
                 filtered_file
                 for beancount_file_filter in beancount_file_filters
                 for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
             }

         all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files)

         files_with_non_beancount_extensions = {
             beancount_file
             for beancount_file
             in all_beancount_files
             if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")
         }
         if any(files_with_non_beancount_extensions):
             print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")

         logger.info(f'Processing files: {all_beancount_files}')

         return all_beancount_files

-def extract_beancount_transactions(beancount_files):
+    @staticmethod
+    def extract_beancount_transactions(beancount_files):
         "Extract entries from specified Beancount files"

         # Initialize Regex for extracting Beancount Entries
         transaction_regex = r'^\n?\d{4}-\d{2}-\d{2} [\*|\!] '
         empty_newline = f'^[\n\r\t\ ]*$'

         entries = []
         transaction_to_file_map = []
         for beancount_file in beancount_files:
             with open(beancount_file) as f:
                 ledger_content = f.read()
                 transactions_per_file = [entry.strip(empty_escape_sequences)
                                          for entry
                                          in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
                                          if re.match(transaction_regex, entry)]
                 transaction_to_file_map += zip(transactions_per_file, [beancount_file]*len(transactions_per_file))
             entries.extend(transactions_per_file)
         return entries, dict(transaction_to_file_map)

-def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
+    @staticmethod
+    def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
         "Convert each Beancount transaction into a dictionary"
         entry_maps = []
         for entry in entries:
             entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry]}'})

         logger.info(f"Converted {len(entries)} transactions to dictionaries")

         return entry_maps

-def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
+    @staticmethod
+    def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
         "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
         return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # Standard Packages
 import json
 import glob
@@ -8,120 +6,121 @@ import logging
 import time

 # Internal Packages
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
-from src.utils.rawconfig import TextContentConfig


 logger = logging.getLogger(__name__)


-# Define Functions
-def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
-    # Extract required fields from config
-    markdown_files, markdown_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
+class MarkdownToJsonl(TextToJsonl):
+    # Define Functions
+    def process(self, previous_entries=None):
+        # Extract required fields from config
+        markdown_files, markdown_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl

         # Input Validation
         if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
             print("At least one of markdown-files or markdown-file-filter is required to be specified")
             exit(1)

         # Get Markdown Files to Process
-    markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
+        markdown_files = MarkdownToJsonl.get_markdown_files(markdown_files, markdown_file_filter)

         # Extract Entries from specified Markdown files
         start = time.time()
-    current_entries = convert_markdown_entries_to_maps(*extract_markdown_entries(markdown_files))
+        current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(*MarkdownToJsonl.extract_markdown_entries(markdown_files))
         end = time.time()
         logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds")

         # Identify, mark and merge any new entries with previous entries
         start = time.time()
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
             entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
         end = time.time()
         logger.debug(f"Identify new or updated entries: {end - start} seconds")

         # Process Each Entry from All Notes Files
         start = time.time()
         entries = list(map(lambda entry: entry[1], entries_with_ids))
-    jsonl_data = convert_markdown_maps_to_jsonl(entries)
+        jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)

         # Compress JSONL formatted Data
         if output_file.suffix == ".gz":
             compress_jsonl_data(jsonl_data, output_file)
         elif output_file.suffix == ".jsonl":
             dump_jsonl(jsonl_data, output_file)
         end = time.time()
         logger.debug(f"Write markdown entries to JSONL file: {end - start} seconds")

         return entries_with_ids

-def get_markdown_files(markdown_files=None, markdown_file_filters=None):
+    @staticmethod
+    def get_markdown_files(markdown_files=None, markdown_file_filters=None):
         "Get Markdown files to process"
         absolute_markdown_files, filtered_markdown_files = set(), set()
         if markdown_files:
             absolute_markdown_files = {get_absolute_path(markdown_file) for markdown_file in markdown_files}
         if markdown_file_filters:
             filtered_markdown_files = {
                 filtered_file
                 for markdown_file_filter in markdown_file_filters
                 for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
             }

         all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)

         files_with_non_markdown_extensions = {
             md_file
             for md_file
             in all_markdown_files
             if not md_file.endswith(".md") and not md_file.endswith('.markdown')
         }

         if any(files_with_non_markdown_extensions):
             logger.warn(f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}")

         logger.info(f'Processing files: {all_markdown_files}')

         return all_markdown_files

-def extract_markdown_entries(markdown_files):
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
         "Extract entries by heading from specified Markdown files"

         # Regex to extract Markdown Entries by Heading
         markdown_heading_regex = r'^#'

         entries = []
         entry_to_file_map = []
         for markdown_file in markdown_files:
             with open(markdown_file) as f:
                 markdown_content = f.read()
                 markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
                                              for entry
                                              in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
                                              if entry.strip(empty_escape_sequences) != '']
                 entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
             entries.extend(markdown_entries_per_file)

         return entries, dict(entry_to_file_map)

-def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+    @staticmethod
+    def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
         "Convert each Markdown entries into a dictionary"
         entry_maps = []
         for entry in entries:
             entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'})

         logger.info(f"Converted {len(entries)} markdown entries to dictionaries")

         return entry_maps

-def convert_markdown_maps_to_jsonl(entries):
+    @staticmethod
+    def convert_markdown_maps_to_jsonl(entries):
         "Convert each Markdown entries to JSON and collate as JSONL"
         return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # Standard Packages
 import json
 import glob
@@ -9,147 +7,148 @@ from typing import Iterable

 # Internal Packages
 from src.processor.org_mode import orgnode
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils import state
-from src.utils.rawconfig import TextContentConfig


 logger = logging.getLogger(__name__)


-# Define Functions
-def org_to_jsonl(config: TextContentConfig, previous_entries=None):
-    # Extract required fields from config
-    org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
-    index_heading_entries = config.index_heading_entries
+class OrgToJsonl(TextToJsonl):
+    # Define Functions
+    def process(self, previous_entries=None):
+        # Extract required fields from config
+        org_files, org_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl
+        index_heading_entries = self.config.index_heading_entries

         # Input Validation
         if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
             print("At least one of org-files or org-file-filter is required to be specified")
             exit(1)

         # Get Org Files to Process
         start = time.time()
-    org_files = get_org_files(org_files, org_file_filter)
+        org_files = OrgToJsonl.get_org_files(org_files, org_file_filter)

         # Extract Entries from specified Org files
         start = time.time()
-    entry_nodes, file_to_entries = extract_org_entries(org_files)
+        entry_nodes, file_to_entries = self.extract_org_entries(org_files)
         end = time.time()
         logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")

         start = time.time()
-    current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
+        current_entries = self.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
         end = time.time()
         logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")

         # Identify, mark and merge any new entries with previous entries
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
             entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)

         # Process Each Entry from All Notes Files
         start = time.time()
         entries = map(lambda entry: entry[1], entries_with_ids)
-    jsonl_data = convert_org_entries_to_jsonl(entries)
+        jsonl_data = self.convert_org_entries_to_jsonl(entries)

         # Compress JSONL formatted Data
         if output_file.suffix == ".gz":
             compress_jsonl_data(jsonl_data, output_file)
         elif output_file.suffix == ".jsonl":
             dump_jsonl(jsonl_data, output_file)
         end = time.time()
         logger.debug(f"Write org entries to JSONL file: {end - start} seconds")

         return entries_with_ids

-def get_org_files(org_files=None, org_file_filters=None):
+    @staticmethod
+    def get_org_files(org_files=None, org_file_filters=None):
         "Get Org files to process"
         absolute_org_files, filtered_org_files = set(), set()
         if org_files:
             absolute_org_files = {
                 get_absolute_path(org_file)
                 for org_file
                 in org_files
             }
         if org_file_filters:
             filtered_org_files = {
                 filtered_file
                 for org_file_filter in org_file_filters
                 for filtered_file in glob.glob(get_absolute_path(org_file_filter))
             }

         all_org_files = sorted(absolute_org_files | filtered_org_files)

         files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
         if any(files_with_non_org_extensions):
             logger.warn(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")

         logger.info(f'Processing files: {all_org_files}')

         return all_org_files

-def extract_org_entries(org_files):
+    @staticmethod
+    def extract_org_entries(org_files):
         "Extract entries from specified Org files"
         entries = []
         entry_to_file_map = []
         for org_file in org_files:
             org_file_entries = orgnode.makelist(str(org_file))
             entry_to_file_map += zip(org_file_entries, [org_file]*len(org_file_entries))
             entries.extend(org_file_entries)

         return entries, dict(entry_to_file_map)

-def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
+    @staticmethod
+    def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
         "Convert Org-Mode entries into list of dictionary"
         entry_maps = []
         for entry in entries:
             entry_dict = dict()

             if not entry.hasBody and not index_heading_entries:
                 # Ignore title notes i.e notes with just headings and empty body
                 continue

             entry_dict["compiled"] = f'{entry.heading}.'
             if state.verbose > 2:
                 logger.debug(f"Title: {entry.heading}")

             if entry.tags:
                 tags_str = " ".join(entry.tags)
                 entry_dict["compiled"] += f'\t {tags_str}.'
                 if state.verbose > 2:
                     logger.debug(f"Tags: {tags_str}")

             if entry.closed:
                 entry_dict["compiled"] += f'\n Closed on {entry.closed.strftime("%Y-%m-%d")}.'
                 if state.verbose > 2:
                     logger.debug(f'Closed: {entry.closed.strftime("%Y-%m-%d")}')

             if entry.scheduled:
                 entry_dict["compiled"] += f'\n Scheduled for {entry.scheduled.strftime("%Y-%m-%d")}.'
                 if state.verbose > 2:
                     logger.debug(f'Scheduled: {entry.scheduled.strftime("%Y-%m-%d")}')

             if entry.hasBody:
                 entry_dict["compiled"] += f'\n {entry.body}'
                 if state.verbose > 2:
                     logger.debug(f"Body: {entry.body}")

             if entry_dict:
                 entry_dict["raw"] = f'{entry}'
                 entry_dict["file"] = f'{entry_to_file_map[entry]}'

                 # Convert Dictionary to JSON and Append to JSONL string
                 entry_maps.append(entry_dict)

         return entry_maps

-def convert_org_entries_to_jsonl(entries: Iterable[dict]) -> str:
+    @staticmethod
+    def convert_org_entries_to_jsonl(entries: Iterable[dict]) -> str:
         "Convert each Org-Mode entry to JSON and collate as JSONL"
         return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])
src/processor/text_to_jsonl.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+# Standard Packages
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+# Internal Packages
+from src.utils.rawconfig import TextContentConfig
+
+
+class TextToJsonl(ABC):
+    def __init__(self, config: TextContentConfig):
+        self.config = config
+
+    @abstractmethod
+    def process(self, previous_entries: Iterable[tuple[int, dict]]=None) -> list[tuple[int, dict]]: ...
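One of the claimed benefits is that new processors become easier to add. As a rough, hypothetical illustration (not part of this commit), a plain-text processor would only need to subclass the base class above and implement process; config handling comes from TextToJsonl.__init__, and the class could then be passed to text_search.setup the same way OrgToJsonl is in the changes below.

# Hypothetical subclass, for illustration only; PlaintextToJsonl does not
# exist in this commit. It relies on the TextToJsonl base class above.
from src.processor.text_to_jsonl import TextToJsonl


class PlaintextToJsonl(TextToJsonl):
    def process(self, previous_entries=None):
        entries = []
        for path in self.config.input_files or []:
            with open(path) as f:
                text = f.read()
            # Same entry shape the existing processors emit
            entries.append({'compiled': text, 'raw': text, 'file': f'{path}'})
        return list(enumerate(entries))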
@@ -1,10 +1,12 @@
 # Standard Packages
 import logging
 import time
+from typing import Type

 # External Packages
 import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
+from src.processor.text_to_jsonl import TextToJsonl
 from src.search_filter.base_filter import BaseFilter

 # Internal Packages
@@ -179,14 +181,14 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]


-def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
+def setup(text_to_jsonl: Type[TextToJsonl], config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
     # Initialize Model
     bi_encoder, cross_encoder, top_k = initialize_model(search_config)

     # Map notes in text files to (compressed) JSONL formatted file
     config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
     previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None
-    entries_with_indices = text_to_jsonl(config, previous_entries)
+    entries_with_indices = text_to_jsonl(config).process(previous_entries)

     # Extract Updated Entries
     entries = extract_entries(config.compressed_jsonl)
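The Type[TextToJsonl] annotation is what the commit message means by help with type hinting: a type checker can now verify that setup is handed a processor class rather than a bare function, and that instantiating it with a config and calling process is valid. A rough sketch of that call shape, with illustrative names:

# Illustrative only; run_processor is not a function in this repository.
from typing import Type

from src.processor.text_to_jsonl import TextToJsonl


def run_processor(text_to_jsonl: Type[TextToJsonl], config, previous_entries=None):
    # Same instantiate-then-process shape used by text_search.setup above,
    # e.g. with OrgToJsonl and a content config, as the tests below do.
    return text_to_jsonl(config).process(previous_entries)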
@@ -6,7 +6,7 @@ from src.search_type import image_search, text_search
 from src.utils.config import SearchType
 from src.utils.helpers import resolve_absolute_path
 from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.search_filter.date_filter import DateFilter
 from src.search_filter.word_filter import WordFilter
 from src.search_filter.file_filter import FileFilter
@@ -60,6 +60,6 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
         embeddings_file = content_dir.joinpath('note_embeddings.pt'))

     filters = [DateFilter(), WordFilter(), FileFilter()]
-    text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)

     return content_config
@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl, get_beancount_files
+from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl


 def test_no_transactions_in_file(tmp_path):
@@ -16,10 +16,11 @@ def test_no_transactions_in_file(tmp_path):

     # Act
     # Extract Entries from specified Beancount files
-    entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file])
+    entry_nodes, file_to_entries = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])

     # Process Each Entry from All Beancount Files
-    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries))
+    jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
+        BeancountToJsonl.convert_transactions_to_maps(entry_nodes, file_to_entries))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -38,10 +39,11 @@ Assets:Test:Test -1.00 KES

     # Act
     # Extract Entries from specified Beancount files
-    entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
+    entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])

     # Process Each Entry from All Beancount Files
-    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
+    jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
+        BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -65,10 +67,11 @@ Assets:Test:Test -1.00 KES

     # Act
     # Extract Entries from specified Beancount files
-    entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
+    entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])

     # Process Each Entry from All Beancount Files
-    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
+    jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
+        BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -96,7 +99,7 @@ def test_get_beancount_files(tmp_path):
     input_filter = [tmp_path / 'group1*.bean', tmp_path / 'group2*.beancount']

     # Act
-    extracted_org_files = get_beancount_files(input_files, input_filter)
+    extracted_org_files = BeancountToJsonl.get_beancount_files(input_files, input_filter)

     # Assert
     assert len(extracted_org_files) == 5
@@ -12,7 +12,7 @@ from src.main import app
 from src.utils.state import model, config
 from src.search_type import text_search, image_search
 from src.utils.rawconfig import ContentConfig, SearchConfig
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.search_filter.word_filter import WordFilter
 from src.search_filter.file_filter import FileFilter

@@ -118,7 +118,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
 # ----------------------------------------------------------------------------------------------------
 def test_notes_search(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
     user_query = quote("How to git install application?")

     # Act
@@ -135,7 +135,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
 def test_notes_search_with_only_filters(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter(), FileFilter()]
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = quote('+"Emacs" file:"*.org"')

     # Act
@@ -152,7 +152,7 @@ def test_notes_search_with_only_filters(content_config: ContentConfig, search_co
 def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter()]
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = quote('How to git install application? +"Emacs"')

     # Act
@@ -169,7 +169,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
 def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter()]
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = quote('How to git install application? -"clone"')

     # Act
@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps, get_markdown_files
+from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl


 def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
@@ -16,10 +16,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile])
+    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
+    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
+        MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -37,10 +38,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
+    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
+    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
+        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -62,10 +64,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
+    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
+    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
+        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -93,7 +96,7 @@ def test_get_markdown_files(tmp_path):
     input_filter = [tmp_path / 'group1*.md', tmp_path / 'group2*.markdown']

     # Act
-    extracted_org_files = get_markdown_files(input_files, input_filter)
+    extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter)

     # Assert
     assert len(extracted_org_files) == 5
@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries, get_org_files
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.utils.helpers import is_none_or_empty


@@ -21,8 +21,8 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
     for index_heading_entries in [True, False]:
         # Act
         # Extract entries into jsonl from specified Org files
-        jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(
-            *extract_org_entries(org_files=[orgfile]),
+        jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(
+            *OrgToJsonl.extract_org_entries(org_files=[orgfile]),
             index_heading_entries=index_heading_entries))
         jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -49,10 +49,10 @@ def test_entry_with_body_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map))
+    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -70,11 +70,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
+    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = convert_org_entries_to_jsonl(entries)
+    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -102,7 +102,7 @@ def test_get_org_files(tmp_path):
     input_filter = [tmp_path / 'group1*.org', tmp_path / 'group2*.org']

     # Act
-    extracted_org_files = get_org_files(input_files, input_filter)
+    extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)

     # Assert
     assert len(extracted_org_files) == 5
|
|
|
@@ -9,7 +9,7 @@ import pytest
 from src.utils.state import model
 from src.search_type import text_search
 from src.utils.rawconfig import ContentConfig, SearchConfig
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl


 # Test
@@ -24,7 +24,7 @@ def test_asymmetric_setup_with_missing_file_raises_error(content_config: Content
     # Act
     # Generate notes embeddings during asymmetric setup
     with pytest.raises(FileNotFoundError):
-        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
+        text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True)


 # ----------------------------------------------------------------------------------------------------
@@ -39,7 +39,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo
     # Act
     # Generate notes embeddings during asymmetric setup
     with pytest.raises(ValueError, match=r'^No valid entries found*'):
-        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
+        text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True)

     # Cleanup
     # delete created test file
@@ -50,7 +50,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo
 def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
     # Act
     # Regenerate notes embeddings during asymmetric setup
-    notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)

     # Assert
     assert len(notes_model.entries) == 10
@@ -60,7 +60,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    model.notes_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
     query = "How to git install application?"

     # Act
@@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    initial_notes_model= text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model= text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

     assert len(initial_notes_model.entries) == 10
     assert len(initial_notes_model.corpus_embeddings) == 10
@@ -96,11 +96,11 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
         f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")

     # regenerate notes jsonl, model embeddings and model to include entry from new file
-    regenerated_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    regenerated_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)

     # Act
     # reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

     # Assert
     assert len(regenerated_notes_model.entries) == 11
@@ -119,7 +119,7 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
 # ----------------------------------------------------------------------------------------------------
 def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)

     assert len(initial_notes_model.entries) == 10
     assert len(initial_notes_model.corpus_embeddings) == 10
@@ -133,7 +133,7 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search

     # Act
     # update embeddings, entries with the newly added note
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

     # verify new entry added in updated embeddings, entries
     assert len(initial_notes_model.entries) == 11
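Taken together, the updated tests show the shape of the new class-based processor API: entry extraction and jsonl conversion now hang off the processor class, and text_search.setup receives the class itself rather than a module-level function. The sketch below restates that flow outside the test suite as a minimal illustration only; the index_org_notes helper name, the orgfile argument, and the content_config/search_config parameters are stand-ins for whatever configuration the caller already holds, not names introduced by this commit.

import json

from src.processor.org_mode.org_to_jsonl import OrgToJsonl
from src.search_type import text_search
from src.utils.rawconfig import ContentConfig, SearchConfig


def index_org_notes(orgfile: str, content_config: ContentConfig, search_config: SearchConfig):
    # Extract entries from the given org file and serialize them to jsonl,
    # mirroring the steps exercised by the updated org_to_jsonl tests
    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])
    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
    jsonl_data = [json.loads(line) for line in jsonl_string.splitlines()]

    # Pass the processor class, not a function, into text_search.setup,
    # as the migrated text_search tests now do
    notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
    return jsonl_data, notes_model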