Support Incremental Update of Entries, Embeddings for OrgMode, Markdown, Beancount Content
### Major Changes
- 030fab9 Support incremental update of **Markdown** entries, embeddings
- 91aac83 Support incremental update of **Beancount** transactions, embeddings
- 2f7a6af Support incremental update of **Org-Mode** entries, embeddings
  - Encode embeddings only for updated or new entries
  - Reuse embeddings encoded earlier for existing entries
  - Merge the existing and new entries, embeddings to get the updated entries, embeddings
- 91d11cc Only hash the compiled entry to identify new or updated entries to update
- b9a6e80 Make OrgNode tags stable-sorted to find new entries for incremental updates

### Minor Changes
- c17a0fd Do not store the word filter index to file; not necessary for now
- 4eb84c7 Log performance metrics for JSONL conversion
- 2e1bbe0 Fix stripping empty escape sequences from strings

### Why
- Encoding embeddings is the slowest step when indexing content
- Previously, embeddings were regenerated for all entries, even those already indexed in earlier runs
- Reusing previously generated embeddings should significantly speed up index updates, since most user-generated content can be expected to remain unchanged over time

Resolves #36
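In short, each text-to-JSONL processor now hashes every entry's compiled text, keeps entries whose hash already existed (tagged with their previous index so their encoded embeddings can be reused), and tags genuinely new entries with `None` so only those get encoded. A minimal, self-contained sketch of that idea follows; it is illustrative only (simplified names, no timing or logging), not the exact khoj API, and assumes entries are plain dictionaries with a `compiled` key:

```python
# Sketch of the hash-based incremental update idea (illustrative, not the khoj API).
import hashlib


def mark_entries_for_update(current_entries, previous_entries, key='compiled'):
    "Pair each entry with its previous index (embedding reusable) or None (needs encoding)."
    hash_entry = lambda entry: hashlib.md5(entry[key].encode('utf-8')).hexdigest()
    current_hashes = [hash_entry(entry) for entry in current_entries]
    previous_hashes = [hash_entry(entry) for entry in previous_entries]

    hash_to_current = dict(zip(current_hashes, current_entries))
    hash_to_previous = dict(zip(previous_hashes, previous_entries))

    # Entries absent from the previous run are new; entries present in both runs are kept
    new_hashes = set(current_hashes) - set(previous_hashes)
    existing_hashes = set(current_hashes) & set(previous_hashes)

    # Existing entries keep their previous index so their embeddings can be reused later;
    # new entries carry no index and are encoded from scratch on the next update
    existing_entries = sorted(
        [(previous_hashes.index(entry_hash), hash_to_previous[entry_hash]) for entry_hash in existing_hashes],
        key=lambda e: e[0])
    new_entries = [(None, hash_to_current[entry_hash]) for entry_hash in new_hashes]
    return existing_entries + new_entries


# Example: only "a new note" would need a fresh embedding on the next index update.
previous = [{'compiled': 'an old note'}]
current = [{'compiled': 'an old note'}, {'compiled': 'a new note'}]
print(mark_entries_for_update(current, previous))
# -> [(0, {'compiled': 'an old note'}), (None, {'compiled': 'a new note'})]
```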
Commit c415af32d5 · 19 changed files with 426 additions and 144 deletions
@@ -131,7 +131,7 @@ pip install --upgrade khoj-assistant
 - Indexing is more strongly impacted by the size of the source data
 - Indexing 100K+ line corpus of notes takes about 10 minutes
 - Indexing 4000+ images takes about 15 minutes and more than 8Gb of RAM
-- Once <https://github.com/debanjum/khoj/issues/36> is implemented, it should only take this long on first run
+- Note: *It should only take this long on the first run* as the index is incrementally updated

 ### Miscellaneous

@@ -48,11 +48,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.org,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
-            filters=[
-                DateFilter(),
-                WordFilter(config.content_type.org.compressed_jsonl.parent, SearchType.Org),
-                FileFilter(),
-            ])
+            filters=[DateFilter(), WordFilter(), FileFilter()])

     # Initialize Org Music Search
     if (t == SearchType.Music or t == None) and config.content_type.music:

@@ -71,11 +67,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.markdown,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
-            filters=[
-                DateFilter(),
-                WordFilter(config.content_type.markdown.compressed_jsonl.parent, SearchType.Markdown),
-                FileFilter(),
-            ])
+            filters=[DateFilter(), WordFilter(), FileFilter()])

     # Initialize Ledger Search
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:

@@ -85,11 +77,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.ledger,
             search_config=config.search_type.symmetric,
             regenerate=regenerate,
-            filters=[
-                DateFilter(),
-                WordFilter(config.content_type.ledger.compressed_jsonl.parent, SearchType.Ledger),
-                FileFilter(),
-            ])
+            filters=[DateFilter(), WordFilter(), FileFilter()])

     # Initialize Image Search
     if (t == SearchType.Image or t == None) and config.content_type.image:

@@ -7,9 +7,10 @@ import pathlib
 import glob
 import re
 import logging
+import time

 # Internal Packages
-from src.utils.helpers import get_absolute_path, is_none_or_empty
+from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data

@@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)


 # Define Functions
-def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
+def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file, previous_entries=None):
     # Input Validation
     if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
         print("At least one of beancount-files or beancount-file-filter is required to be specified")

@@ -28,18 +29,34 @@ def beancount_to_jsonl(beancount_files, beancount_file_filter, output_file):
     beancount_files = get_beancount_files(beancount_files, beancount_file_filter)

     # Extract Entries from specified Beancount files
-    entries, transaction_to_file_map = extract_beancount_entries(beancount_files)
+    start = time.time()
+    current_entries = convert_transactions_to_maps(*extract_beancount_transactions(beancount_files))
+    end = time.time()
+    logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds")
+
+    # Identify, mark and merge any new entries with previous entries
+    start = time.time()
+    if not previous_entries:
+        entries_with_ids = list(enumerate(current_entries))
+    else:
+        entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
+    end = time.time()
+    logger.debug(f"Identify new or updated transaction: {end - start} seconds")

     # Process Each Entry from All Notes Files
-    jsonl_data = convert_beancount_entries_to_jsonl(entries, transaction_to_file_map)
+    start = time.time()
+    entries = list(map(lambda entry: entry[1], entries_with_ids))
+    jsonl_data = convert_transaction_maps_to_jsonl(entries)

     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
         compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
+    end = time.time()
+    logger.debug(f"Write transactions to JSONL file: {end - start} seconds")

-    return entries
+    return entries_with_ids


 def get_beancount_files(beancount_files=None, beancount_file_filter=None):

@@ -66,12 +83,12 @@ def get_beancount_files(beancount_files=None, beancount_file_filter=None):
     return all_beancount_files


-def extract_beancount_entries(beancount_files):
+def extract_beancount_transactions(beancount_files):
     "Extract entries from specified Beancount files"

     # Initialize Regex for extracting Beancount Entries
     transaction_regex = r'^\n?\d{4}-\d{2}-\d{2} [\*|\!] '
-    empty_newline = f'^[{empty_escape_sequences}]*$'
+    empty_newline = f'^[\n\r\t\ ]*$'

     entries = []
     transaction_to_file_map = []

@@ -82,22 +99,25 @@ def extract_beancount_entries(beancount_files):
                                  for entry
                                  in re.split(empty_newline, ledger_content, flags=re.MULTILINE)
                                  if re.match(transaction_regex, entry)]
-        transaction_to_file_map += [beancount_file]*len(transactions_per_file)
+        transaction_to_file_map += zip(transactions_per_file, [beancount_file]*len(transactions_per_file))
         entries.extend(transactions_per_file)
-    return entries, transaction_to_file_map
+    return entries, dict(transaction_to_file_map)


-def convert_beancount_entries_to_jsonl(entries, transaction_to_file_map):
-    "Convert each Beancount transaction to JSON and collate as JSONL"
-    jsonl = ''
-    for entry_id, entry in enumerate(entries):
-        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry_id]}'}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
+    "Convert each Beancount transaction into a dictionary"
+    entry_maps = []
+    for entry in entries:
+        entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{transaction_to_file_map[entry]}'})

-    logger.info(f"Converted {len(entries)} to jsonl format")
+    logger.info(f"Converted {len(entries)} transactions to dictionaries")

-    return jsonl
+    return entry_maps
+
+
+def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
+    "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])


 if __name__ == '__main__':

@@ -7,9 +7,10 @@ import pathlib
 import glob
 import re
 import logging
+import time

 # Internal Packages
-from src.utils.helpers import get_absolute_path, is_none_or_empty
+from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data

@@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)


 # Define Functions
-def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
+def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file, previous_entries=None):
     # Input Validation
     if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
         print("At least one of markdown-files or markdown-file-filter is required to be specified")

@@ -28,18 +29,34 @@ def markdown_to_jsonl(markdown_files, markdown_file_filter, output_file):
     markdown_files = get_markdown_files(markdown_files, markdown_file_filter)

     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files)
+    start = time.time()
+    current_entries = convert_markdown_entries_to_maps(*extract_markdown_entries(markdown_files))
+    end = time.time()
+    logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds")
+
+    # Identify, mark and merge any new entries with previous entries
+    start = time.time()
+    if not previous_entries:
+        entries_with_ids = list(enumerate(current_entries))
+    else:
+        entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)
+    end = time.time()
+    logger.debug(f"Identify new or updated entries: {end - start} seconds")

     # Process Each Entry from All Notes Files
-    jsonl_data = convert_markdown_entries_to_jsonl(entries, entry_to_file_map)
+    start = time.time()
+    entries = list(map(lambda entry: entry[1], entries_with_ids))
+    jsonl_data = convert_markdown_maps_to_jsonl(entries)

     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
         compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
+    end = time.time()
+    logger.debug(f"Write markdown entries to JSONL file: {end - start} seconds")

-    return entries
+    return entries_with_ids


 def get_markdown_files(markdown_files=None, markdown_file_filter=None):

@@ -80,24 +97,28 @@ def extract_markdown_entries(markdown_files):
             markdown_content = f.read()
             markdown_entries_per_file = [f'#{entry.strip(empty_escape_sequences)}'
                                          for entry
-                                         in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)]
-        entry_to_file_map += [markdown_file]*len(markdown_entries_per_file)
+                                         in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
+                                         if entry.strip(empty_escape_sequences) != '']
+        entry_to_file_map += zip(markdown_entries_per_file, [markdown_file]*len(markdown_entries_per_file))
         entries.extend(markdown_entries_per_file)

-    return entries, entry_to_file_map
+    return entries, dict(entry_to_file_map)


-def convert_markdown_entries_to_jsonl(entries, entry_to_file_map):
-    "Convert each Markdown entries to JSON and collate as JSONL"
-    jsonl = ''
-    for entry_id, entry in enumerate(entries):
-        entry_dict = {'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry_id]}'}
-        # Convert Dictionary to JSON and Append to JSONL string
-        jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
-
-    logger.info(f"Converted {len(entries)} to jsonl format")
-
-    return jsonl
+def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+    "Convert each Markdown entries into a dictionary"
+    entry_maps = []
+    for entry in entries:
+        entry_maps.append({'compiled': entry, 'raw': entry, 'file': f'{entry_to_file_map[entry]}'})
+
+    logger.info(f"Converted {len(entries)} markdown entries to dictionaries")
+
+    return entry_maps
+
+
+def convert_markdown_maps_to_jsonl(entries):
+    "Convert each Markdown entries to JSON and collate as JSONL"
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])


 if __name__ == '__main__':

@@ -7,10 +7,11 @@ import argparse
 import pathlib
 import glob
 import logging
+import time

 # Internal Packages
 from src.processor.org_mode import orgnode
-from src.utils.helpers import get_absolute_path, is_none_or_empty
+from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils import state

@@ -19,28 +20,47 @@ logger = logging.getLogger(__name__)


 # Define Functions
-def org_to_jsonl(org_files, org_file_filter, output_file):
+def org_to_jsonl(org_files, org_file_filter, output_file, previous_entries=None):
     # Input Validation
     if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
         print("At least one of org-files or org-file-filter is required to be specified")
         exit(1)

     # Get Org Files to Process
+    start = time.time()
     org_files = get_org_files(org_files, org_file_filter)

     # Extract Entries from specified Org files
-    entries, file_to_entries = extract_org_entries(org_files)
+    start = time.time()
+    entry_nodes, file_to_entries = extract_org_entries(org_files)
+    end = time.time()
+    logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")
+
+    start = time.time()
+    current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    end = time.time()
+    logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
+
+    # Identify, mark and merge any new entries with previous entries
+    if not previous_entries:
+        entries_with_ids = list(enumerate(current_entries))
+    else:
+        entries_with_ids = mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=logger)

     # Process Each Entry from All Notes Files
-    jsonl_data = convert_org_entries_to_jsonl(entries, file_to_entries)
+    start = time.time()
+    entries = map(lambda entry: entry[1], entries_with_ids)
+    jsonl_data = convert_org_entries_to_jsonl(entries)

     # Compress JSONL formatted Data
     if output_file.suffix == ".gz":
         compress_jsonl_data(jsonl_data, output_file)
     elif output_file.suffix == ".jsonl":
         dump_jsonl(jsonl_data, output_file)
+    end = time.time()
+    logger.debug(f"Write org entries to JSONL file: {end - start} seconds")

-    return entries
+    return entries_with_ids


 def get_org_files(org_files=None, org_file_filter=None):

@@ -70,16 +90,16 @@ def extract_org_entries(org_files):
     entry_to_file_map = []
     for org_file in org_files:
         org_file_entries = orgnode.makelist(str(org_file))
-        entry_to_file_map += [org_file]*len(org_file_entries)
+        entry_to_file_map += zip(org_file_entries, [org_file]*len(org_file_entries))
         entries.extend(org_file_entries)

-    return entries, entry_to_file_map
+    return entries, dict(entry_to_file_map)


-def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str:
-    "Convert each Org-Mode entries to JSON and collate as JSONL"
-    jsonl = ''
-    for entry_id, entry in enumerate(entries):
+def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map) -> list[dict]:
+    "Convert Org-Mode entries into list of dictionary"
+    entry_maps = []
+    for entry in entries:
         entry_dict = dict()

         # Ignore title notes i.e notes with just headings and empty body

@@ -113,14 +133,17 @@ def convert_org_entries_to_jsonl(entries, entry_to_file_map) -> str:

         if entry_dict:
             entry_dict["raw"] = f'{entry}'
-            entry_dict["file"] = f'{entry_to_file_map[entry_id]}'
+            entry_dict["file"] = f'{entry_to_file_map[entry]}'

             # Convert Dictionary to JSON and Append to JSONL string
-            jsonl += f'{json.dumps(entry_dict, ensure_ascii=False)}\n'
+            entry_maps.append(entry_dict)

-    logger.info(f"Converted {len(entries)} to jsonl format")
+    return entry_maps

-    return jsonl
+
+def convert_org_entries_to_jsonl(entries: list[dict]) -> str:
+    "Convert each Org-Mode entry to JSON and collate as JSONL"
+    return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])


 if __name__ == '__main__':

@@ -69,7 +69,7 @@ def makelist(filename):
     level = ""
     heading = ""
     bodytext = ""
-    tags = set()        # set of all tags in headline
+    tags = list()       # set of all tags in headline
     closed_date = ''
     sched_date = ''
     deadline_date = ''

@@ -104,14 +104,14 @@ def makelist(filename):
             level = hdng.group(1)
             heading = hdng.group(2)
             bodytext = ""
-            tags = set()        # set of all tags in headline
+            tags = list()       # set of all tags in headline
             tagsrch = re.search(r'(.*?)\s*:([a-zA-Z0-9].*?):$',heading)
             if tagsrch:
                 heading = tagsrch.group(1)
                 parsedtags = tagsrch.group(2)
                 if parsedtags:
                     for parsedtag in parsedtags.split(':'):
-                        if parsedtag != '': tags.add(parsedtag)
+                        if parsedtag != '': tags.append(parsedtag)
         else:      # we are processing a non-heading line
             if line[:10] == '#+SEQ_TODO':
                 kwlist = re.findall(r'([A-Z]+)\(', line)

@@ -237,7 +237,7 @@ class Orgnode(object):
         self.level = len(level)
         self.headline = headline
         self.body = body
-        self.tags = set(tags)     # All tags in the headline
+        self.tags = tags          # All tags in the headline
         self.todo = ""
         self.prty = ""            # empty of A, B or C
         self.scheduled = ""       # Scheduled date

@@ -290,8 +290,8 @@ class Orgnode(object):

     def Tags(self):
         """
-        Returns the set of all tags
-        For example, :HOME:COMPUTER: would return {'HOME', 'COMPUTER'}
+        Returns the list of all tags
+        For example, :HOME:COMPUTER: would return ['HOME', 'COMPUTER']
         """
         return self.tags

@@ -307,7 +307,7 @@ class Orgnode(object):
         """
         Store all the tags found in the headline.
         """
-        self.tags = set(newtags)
+        self.tags = newtags

     def Todo(self):
         """

@@ -19,37 +19,24 @@ class WordFilter(BaseFilter):
     required_regex = r'\+"(\w+)" ?'
     blocked_regex = r'\-"(\w+)" ?'

-    def __init__(self, filter_directory, search_type: SearchType, entry_key='raw'):
-        self.filter_file = resolve_absolute_path(filter_directory / f"word_filter_{search_type.name.lower()}_index.pkl")
+    def __init__(self, entry_key='raw'):
         self.entry_key = entry_key
-        self.search_type = search_type
-        self.word_to_entry_index = dict()
+        self.word_to_entry_index = defaultdict(set)
         self.cache = LRU()


     def load(self, entries, regenerate=False):
-        if self.filter_file.exists() and not regenerate:
-            start = time.time()
-            with self.filter_file.open('rb') as f:
-                self.word_to_entry_index = pickle.load(f)
-            end = time.time()
-            logger.debug(f"Load word filter index for {self.search_type} from {self.filter_file}: {end - start} seconds")
-        else:
-            start = time.time()
-            self.cache = {}  # Clear cache on (re-)generating entries_by_word_set
-            entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\t|\n|\:'
-            self.word_to_entry_index = defaultdict(set)
-            # Create map of words to entries they exist in
-            for entry_index, entry in enumerate(entries):
-                for word in re.split(entry_splitter, entry[self.entry_key].lower()):
-                    if word == '':
-                        continue
-                    self.word_to_entry_index[word].add(entry_index)
-
-            with self.filter_file.open('wb') as f:
-                pickle.dump(self.word_to_entry_index, f)
-            end = time.time()
-            logger.debug(f"Indexed {len(self.word_to_entry_index)} words of {self.search_type} type for word filter to {self.filter_file}: {end - start} seconds")
+        start = time.time()
+        self.cache = {}  # Clear cache on filter (re-)load
+        entry_splitter = r',|\.| |\]|\[\(|\)|\{|\}|\<|\>|\t|\n|\:|\;|\?|\!|\(|\)|\&|\^|\$|\@|\%|\+|\=|\/|\\|\||\~|\`|\"|\''
+
+        # Create map of words to entries they exist in
+        for entry_index, entry in enumerate(entries):
+            for word in re.split(entry_splitter, entry[self.entry_key].lower()):
+                if word == '':
+                    continue
+                self.word_to_entry_index[word].add(entry_index)
+
+        end = time.time()
+        logger.debug(f"Created word filter index: {end - start} seconds")

         return self.word_to_entry_index

@@ -55,15 +55,28 @@ def extract_entries(jsonl_file):
     return load_jsonl(jsonl_file)


-def compute_embeddings(entries, bi_encoder, embeddings_file, regenerate=False):
+def compute_embeddings(entries_with_ids, bi_encoder, embeddings_file, regenerate=False):
     "Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
-    # Load pre-computed embeddings from file if exists
+    new_entries = []
+    # Load pre-computed embeddings from file if exists and update them if required
     if embeddings_file.exists() and not regenerate:
         corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
         logger.info(f"Loaded embeddings from {embeddings_file}")

-    else: # Else compute the corpus_embeddings from scratch, which can take a while
-        corpus_embeddings = bi_encoder.encode([entry['compiled'] for entry in entries], convert_to_tensor=True, device=state.device, show_progress_bar=True)
+        # Encode any new entries in the corpus and update corpus embeddings
+        new_entries = [entry['compiled'] for id, entry in entries_with_ids if id is None]
+        if new_entries:
+            new_embeddings = bi_encoder.encode(new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True)
+            existing_entry_ids = [id for id, _ in entries_with_ids if id is not None]
+            existing_embeddings = torch.index_select(corpus_embeddings, 0, torch.tensor(existing_entry_ids)) if existing_entry_ids else torch.Tensor()
+            corpus_embeddings = torch.cat([existing_embeddings, new_embeddings], dim=0)
+    # Else compute the corpus embeddings from scratch
+    else:
+        new_entries = [entry['compiled'] for _, entry in entries_with_ids]
+        corpus_embeddings = bi_encoder.encode(new_entries, convert_to_tensor=True, device=state.device, show_progress_bar=True)
+
+    # Save regenerated or updated embeddings to file
+    if new_entries:
         corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
         torch.save(corpus_embeddings, embeddings_file)
         logger.info(f"Computed embeddings and saved them to {embeddings_file}")

@@ -169,10 +182,10 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon

     # Map notes in text files to (compressed) JSONL formatted file
     config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
-    if not config.compressed_jsonl.exists() or regenerate:
-        text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl)
+    previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() else None
+    entries_with_indices = text_to_jsonl(config.input_files, config.input_filter, config.compressed_jsonl, previous_entries)

-    # Extract Entries
+    # Extract Updated Entries
     entries = extract_entries(config.compressed_jsonl)
     if is_none_or_empty(entries):
         raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}")

@@ -180,7 +193,7 @@ def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchCon

     # Compute or Load Embeddings
     config.embeddings_file = resolve_absolute_path(config.embeddings_file)
-    corpus_embeddings = compute_embeddings(entries, bi_encoder, config.embeddings_file, regenerate=regenerate)
+    corpus_embeddings = compute_embeddings(entries_with_indices, bi_encoder, config.embeddings_file, regenerate=regenerate)

     for filter in filters:
         filter.load(entries, regenerate=regenerate)

@@ -2,7 +2,7 @@ from pathlib import Path

 app_root_directory = Path(__file__).parent.parent.parent
 web_directory = app_root_directory / 'src/interface/web/'
-empty_escape_sequences = r'\n|\r\t '
+empty_escape_sequences = '\n|\r|\t| '

 # default app config to use
 default_config = {

@@ -1,6 +1,8 @@
 # Standard Packages
 import pathlib
 import sys
+import time
+import hashlib
 from os.path import join
 from collections import OrderedDict

@@ -79,3 +81,39 @@ class LRU(OrderedDict):
         if len(self) > self.capacity:
             oldest = next(iter(self))
             del self[oldest]
+
+
+def mark_entries_for_update(current_entries, previous_entries, key='compiled', logger=None):
+    # Hash all current and previous entries to identify new entries
+    start = time.time()
+    current_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), current_entries))
+    previous_entry_hashes = list(map(lambda e: hashlib.md5(bytes(e[key], encoding='utf-8')).hexdigest(), previous_entries))
+    end = time.time()
+    logger.debug(f"Hash previous, current entries: {end - start} seconds")
+
+    start = time.time()
+    hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
+    hash_to_previous_entries = dict(zip(previous_entry_hashes, previous_entries))
+
+    # All entries that did not exist in the previous set are to be added
+    new_entry_hashes = set(current_entry_hashes) - set(previous_entry_hashes)
+    # All entries that exist in both current and previous sets are kept
+    existing_entry_hashes = set(current_entry_hashes) & set(previous_entry_hashes)
+
+    # Mark new entries with no ids for later embeddings generation
+    new_entries = [
+        (None, hash_to_current_entries[entry_hash])
+        for entry_hash in new_entry_hashes
+    ]
+    # Set id of existing entries to their previous ids to reuse their existing encoded embeddings
+    existing_entries = [
+        (previous_entry_hashes.index(entry_hash), hash_to_previous_entries[entry_hash])
+        for entry_hash in existing_entry_hashes
+    ]
+
+    existing_entries_sorted = sorted(existing_entries, key=lambda e: e[0])
+    entries_with_ids = existing_entries_sorted + new_entries
+    end = time.time()
+    logger.debug(f"Identify, Mark, Combine new, existing entries: {end - start} seconds")
+
+    return entries_with_ids

@@ -54,4 +54,4 @@ def compress_jsonl_data(jsonl_data, output_path):
     with gzip.open(output_path, 'wt') as gzip_file:
         gzip_file.write(jsonl_data)

-    logger.info(f'Wrote {len(jsonl_data)} lines to gzip compressed jsonl at {output_path}')
+    logger.info(f'Wrote jsonl data to gzip compressed jsonl at {output_path}')

@@ -59,7 +59,7 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
                                        compressed_jsonl = content_dir.joinpath('notes.jsonl.gz'),
                                        embeddings_file = content_dir.joinpath('note_embeddings.pt'))

-    filters = [DateFilter(), WordFilter(content_dir, search_type=SearchType.Org), FileFilter()]
+    filters = [DateFilter(), WordFilter(), FileFilter()]
     text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)

     return content_config

tests/test_beancount_to_jsonl.py (new file, 84 lines)
@@ -0,0 +1,84 @@
+# Standard Packages
+import json
+
+# Internal Packages
+from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl
+
+
+def test_no_transactions_in_file(tmp_path):
+    "Handle file with no transactions."
+    # Arrange
+    entry = f'''
+- Bullet point 1
+- Bullet point 2
+'''
+    beancount_file = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Beancount files
+    entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file])
+
+    # Process Each Entry from All Beancount Files
+    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 0
+
+
+def test_single_beancount_transaction_to_jsonl(tmp_path):
+    "Convert transaction from single file to jsonl."
+    # Arrange
+    entry = f'''
+1984-04-01 * "Payee" "Narration"
+Expenses:Test:Test 1.00 KES
+Assets:Test:Test -1.00 KES
+'''
+    beancount_file = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Beancount files
+    entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
+
+    # Process Each Entry from All Beancount Files
+    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+
+
+def test_multiple_transactions_to_jsonl(tmp_path):
+    "Convert multiple transactions from single file to jsonl."
+    # Arrange
+    entry = f'''
+1984-04-01 * "Payee" "Narration"
+Expenses:Test:Test 1.00 KES
+Assets:Test:Test -1.00 KES
+\t\r
+1984-04-01 * "Payee" "Narration"
+Expenses:Test:Test 1.00 KES
+Assets:Test:Test -1.00 KES
+'''
+
+    beancount_file = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Beancount files
+    entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
+
+    # Process Each Entry from All Beancount Files
+    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 2
+
+
+# Helper Functions
+def create_file(tmp_path, entry, filename="ledger.beancount"):
+    beancount_file = tmp_path / f"notes/{filename}"
+    beancount_file.parent.mkdir()
+    beancount_file.touch()
+    beancount_file.write_text(entry)
+    return beancount_file

@@ -132,7 +132,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
 # ----------------------------------------------------------------------------------------------------
 def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)]
+    filters = [WordFilter()]
     model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = 'How to git install application? +"Emacs"'

@@ -149,7 +149,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
 # ----------------------------------------------------------------------------------------------------
 def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    filters = [WordFilter(content_config.org.compressed_jsonl.parent, search_type=SearchType.Org)]
+    filters = [WordFilter()]
     model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = 'How to git install application? -"clone"'

tests/test_markdown_to_jsonl.py (new file, 81 lines)
@@ -0,0 +1,81 @@
+# Standard Packages
+import json
+
+# Internal Packages
+from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps
+
+
+def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
+    "Convert files with no heading to jsonl."
+    # Arrange
+    entry = f'''
+- Bullet point 1
+- Bullet point 2
+'''
+    markdownfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Markdown files
+    entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile])
+
+    # Process Each Entry from All Notes Files
+    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+
+
+def test_single_markdown_entry_to_jsonl(tmp_path):
+    "Convert markdown entry from single file to jsonl."
+    # Arrange
+    entry = f'''### Heading
+\t\r
+Body Line 1
+'''
+    markdownfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Markdown files
+    entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
+
+    # Process Each Entry from All Notes Files
+    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+
+
+def test_multiple_markdown_entries_to_jsonl(tmp_path):
+    "Convert multiple markdown entries from single file to jsonl."
+    # Arrange
+    entry = f'''
+### Heading 1
+\t\r
+Heading 1 Body Line 1
+### Heading 2
+\t\r
+Heading 2 Body Line 2
+'''
+    markdownfile = create_file(tmp_path, entry)
+
+    # Act
+    # Extract Entries from specified Markdown files
+    entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
+
+    # Process Each Entry from All Notes Files
+    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 2
+
+
+# Helper Functions
+def create_file(tmp_path, entry, filename="test.md"):
+    markdown_file = tmp_path / f"notes/{filename}"
+    markdown_file.parent.mkdir()
+    markdown_file.touch()
+    markdown_file.write_text(entry)
+    return markdown_file

@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, extract_org_entries
+from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries
 from src.utils.helpers import is_none_or_empty

@@ -20,10 +20,11 @@ def test_entry_with_empty_body_line_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
+    entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    jsonl_data = convert_org_entries_to_jsonl(entries, entry_to_file_map)
+    entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_data = convert_org_entries_to_jsonl(entries)

     # Assert
     assert is_none_or_empty(jsonl_data)

@@ -46,7 +47,7 @@ def test_entry_with_body_to_jsonl(tmp_path):
     entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map)
+    jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert

@@ -64,10 +65,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
+    entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_org_entries_to_jsonl(entries, entry_to_file_map)
+    entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert

@@ -21,7 +21,7 @@ def test_parse_entry_with_no_headings(tmp_path):
     # Assert
     assert len(entries) == 1
     assert entries[0].Heading() == f'{orgfile}'
-    assert entries[0].Tags() == set()
+    assert entries[0].Tags() == list()
     assert entries[0].Body() == "Body Line 1"
     assert entries[0].Priority() == ""
     assert entries[0].Property("ID") == ""

@@ -45,7 +45,7 @@ Body Line 1'''
     # Assert
     assert len(entries) == 1
     assert entries[0].Heading() == "Heading"
-    assert entries[0].Tags() == set()
+    assert entries[0].Tags() == list()
     assert entries[0].Body() == "Body Line 1"
     assert entries[0].Priority() == ""
     assert entries[0].Property("ID") == ""

@@ -79,7 +79,7 @@ Body Line 2'''
     assert len(entries) == 1
     assert entries[0].Heading() == "Heading"
     assert entries[0].Todo() == "DONE"
-    assert entries[0].Tags() == {"Tag1", "TAG2", "tag3"}
+    assert entries[0].Tags() == ["Tag1", "TAG2", "tag3"]
     assert entries[0].Body() == "- Clocked Log 1\nBody Line 1\nBody Line 2"
     assert entries[0].Priority() == "A"
     assert entries[0].Property("ID") == "id:123-456-789-4234-1231"

@@ -178,7 +178,7 @@ Body 2
     for index, entry in enumerate(entries):
         assert entry.Heading() == f"Heading{index+1}"
         assert entry.Todo() == "FAILED" if index == 0 else "CANCELLED"
-        assert entry.Tags() == {f"tag{index+1}"}
+        assert entry.Tags() == [f"tag{index+1}"]
         assert entry.Body() == f"- Clocked Log {index+1}\nBody {index+1}\n\n"
         assert entry.Priority() == "A"
         assert entry.Property("ID") == f"id:123-456-789-4234-000{index+1}"

@@ -202,7 +202,7 @@ Body Line 1'''
     # Assert
     assert len(entries) == 1
     assert entries[0].Heading() == f'{orgfile}'
-    assert entries[0].Tags() == set()
+    assert entries[0].Tags() == list()
     assert entries[0].Body() == "Body Line 1"
     assert entries[0].Priority() == ""
     assert entries[0].Property("ID") == ""

@@ -225,7 +225,7 @@ Body Line 1'''
     # Assert
     assert len(entries) == 1
     assert entries[0].Heading() == 'test'
-    assert entries[0].Tags() == set()
+    assert entries[0].Tags() == list()
     assert entries[0].Body() == "Body Line 1"
     assert entries[0].Priority() == ""
     assert entries[0].Property("ID") == ""

@@ -249,7 +249,7 @@ Body Line 1
     # Assert
     assert len(entries) == 1
     assert entries[0].Heading() == 'title1 title2'
-    assert entries[0].Tags() == set()
+    assert entries[0].Tags() == list()
     assert entries[0].Body() == "Body Line 1\n"
     assert entries[0].Priority() == ""
     assert entries[0].Property("ID") == ""

@@ -100,3 +100,32 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
     # delete reload test file added
     content_config.org.input_files = []
     file_to_add_on_reload.unlink()
+
+
+# ----------------------------------------------------------------------------------------------------
+def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig):
+    # Arrange
+    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+
+    assert len(initial_notes_model.entries) == 10
+    assert len(initial_notes_model.corpus_embeddings) == 10
+
+    file_to_add_on_update = Path(content_config.org.input_filter).parent / "update.org"
+    content_config.org.input_files = [f'{file_to_add_on_update}']
+
+    # append Org-Mode Entry to first Org Input File in Config
+    with open(file_to_add_on_update, "w") as f:
+        f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")
+
+    # Act
+    # update embeddings, entries with the newly added note
+    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+
+    # verify new entry added in updated embeddings, entries
+    assert len(initial_notes_model.entries) == 11
+    assert len(initial_notes_model.corpus_embeddings) == 11
+
+    # Cleanup
+    # delete file added for update testing
+    content_config.org.input_files = []
+    file_to_add_on_update.unlink()

@@ -1,15 +1,12 @@
-# External Packages
-import torch
-
 # Application Packages
 from src.search_filter.word_filter import WordFilter
 from src.utils.config import SearchType


-def test_no_word_filter(tmp_path):
+def test_no_word_filter():
     # Arrange
-    word_filter = WordFilter(tmp_path, SearchType.Org)
-    embeddings, entries = arrange_content()
+    word_filter = WordFilter()
+    entries = arrange_content()
     q_with_no_filter = 'head tail'

     # Act

@@ -22,10 +19,10 @@ def test_no_word_filter(tmp_path):
     assert entry_indices == {0, 1, 2, 3}


-def test_word_exclude_filter(tmp_path):
+def test_word_exclude_filter():
     # Arrange
-    word_filter = WordFilter(tmp_path, SearchType.Org)
-    embeddings, entries = arrange_content()
+    word_filter = WordFilter()
+    entries = arrange_content()
     q_with_exclude_filter = 'head -"exclude_word" tail'

     # Act

@@ -38,10 +35,10 @@ def test_word_exclude_filter(tmp_path):
     assert entry_indices == {0, 2}


-def test_word_include_filter(tmp_path):
+def test_word_include_filter():
     # Arrange
-    word_filter = WordFilter(tmp_path, SearchType.Org)
-    embeddings, entries = arrange_content()
+    word_filter = WordFilter()
+    entries = arrange_content()
     query_with_include_filter = 'head +"include_word" tail'

     # Act

@@ -54,10 +51,10 @@ def test_word_include_filter(tmp_path):
     assert entry_indices == {2, 3}


-def test_word_include_and_exclude_filter(tmp_path):
+def test_word_include_and_exclude_filter():
     # Arrange
-    word_filter = WordFilter(tmp_path, SearchType.Org)
-    embeddings, entries = arrange_content()
+    word_filter = WordFilter()
+    entries = arrange_content()
     query_with_include_and_exclude_filter = 'head +"include_word" -"exclude_word" tail'

     # Act

@@ -71,11 +68,10 @@ def test_word_include_and_exclude_filter(tmp_path):


 def arrange_content():
-    embeddings = torch.randn(4, 10)
     entries = [
         {'compiled': '', 'raw': 'Minimal Entry'},
         {'compiled': '', 'raw': 'Entry with exclude_word'},
         {'compiled': '', 'raw': 'Entry with include_word'},
         {'compiled': '', 'raw': 'Entry with include_word and exclude_word'}]

-    return embeddings, entries
+    return entries