Use Base TextToJsonl class to standardize <text>_to_jsonl processors

- Start standardizing implementation of the `text_to_jsonl` processors
  - `text_to_jsonl` scripts already had a shared structure
  - This change starts to codify that implicit structure

- Benefits
  - Ease adding more `text_to_jsonl` processors (see the sketch below)
  - Allow merging shared functionality
  - Help with type hinting

- Drawbacks
  - Lower agility to change. But this was already an implicit issue, as
    the `text_to_jsonl` processors got more deeply wired into the app
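
For illustration, a minimal sketch (not part of this commit) of the pattern the base class codifies: a processor subclasses TextToJsonl, receives its config in the constructor, and implements process(). The PlaintextToJsonl name and its toy body are hypothetical.

# Minimal stand-in for the new TextToJsonl base class added in src/processor/text_to_jsonl.py
from abc import ABC, abstractmethod

class TextToJsonl(ABC):
    def __init__(self, config):
        self.config = config

    @abstractmethod
    def process(self, previous_entries=None) -> list: ...

# Hypothetical new processor: subclass the base and implement process()
class PlaintextToJsonl(TextToJsonl):
    def process(self, previous_entries=None):
        # Toy body: map each configured input line to an (id, entry) tuple
        return [(i, {'raw': line}) for i, line in enumerate(self.config)]

print(PlaintextToJsonl(["first note", "second note"]).process())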
Debanjum Singh Solanky 2022-09-14 10:53:43 +03:00
parent c16ae9e344
commit 02d944030f
12 changed files with 364 additions and 345 deletions

src/configure.py

@@ -6,9 +6,9 @@ import logging
 import json

 # Internal Packages
-from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
-from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl
+from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.search_type import image_search, text_search
 from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
 from src.utils import state
@@ -44,7 +44,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Org or t == None) and config.content_type.org:
         # Extract Entries, Generate Notes Embeddings
         model.orgmode_search = text_search.setup(
-            org_to_jsonl,
+            OrgToJsonl,
             config.content_type.org,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
@@ -54,7 +54,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Music or t == None) and config.content_type.music:
         # Extract Entries, Generate Music Embeddings
         model.music_search = text_search.setup(
-            org_to_jsonl,
+            OrgToJsonl,
             config.content_type.music,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
@@ -64,7 +64,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Markdown or t == None) and config.content_type.markdown:
         # Extract Entries, Generate Markdown Embeddings
         model.markdown_search = text_search.setup(
-            markdown_to_jsonl,
+            MarkdownToJsonl,
             config.content_type.markdown,
             search_config=config.search_type.asymmetric,
             regenerate=regenerate,
@@ -74,7 +74,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
     if (t == SearchType.Ledger or t == None) and config.content_type.ledger:
         # Extract Entries, Generate Ledger Embeddings
         model.ledger_search = text_search.setup(
-            beancount_to_jsonl,
+            BeancountToJsonl,
            config.content_type.ledger,
            search_config=config.search_type.symmetric,
            regenerate=regenerate,

src/processor/ledger/beancount_to_jsonl.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # Standard Packages
 import json
 import glob
@@ -8,19 +6,20 @@ import logging
 import time

 # Internal Packages
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig

 logger = logging.getLogger(__name__)

-# Define Functions
-def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
+class BeancountToJsonl(TextToJsonl):
+    # Define Functions
+    def process(self, previous_entries=None):
         # Extract required fields from config
-    beancount_files, beancount_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
+        beancount_files, beancount_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl

         # Input Validation
         if is_none_or_empty(beancount_files) and is_none_or_empty(beancount_file_filter):
@@ -28,11 +27,11 @@ def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
             exit(1)

         # Get Beancount Files to Process
-    beancount_files = get_beancount_files(beancount_files, beancount_file_filter)
+        beancount_files = BeancountToJsonl.get_beancount_files(beancount_files, beancount_file_filter)

         # Extract Entries from specified Beancount files
         start = time.time()
-    current_entries = convert_transactions_to_maps(*extract_beancount_transactions(beancount_files))
+        current_entries = BeancountToJsonl.convert_transactions_to_maps(*BeancountToJsonl.extract_beancount_transactions(beancount_files))
         end = time.time()
         logger.debug(f"Parse transactions from Beancount files into dictionaries: {end - start} seconds")
@@ -48,7 +47,7 @@ def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
         # Process Each Entry from All Notes Files
         start = time.time()
         entries = list(map(lambda entry: entry[1], entries_with_ids))
-    jsonl_data = convert_transaction_maps_to_jsonl(entries)
+        jsonl_data = BeancountToJsonl.convert_transaction_maps_to_jsonl(entries)

         # Compress JSONL formatted Data
         if output_file.suffix == ".gz":
@@ -60,8 +59,8 @@ def beancount_to_jsonl(config: TextContentConfig, previous_entries=None):
         return entries_with_ids

-def get_beancount_files(beancount_files=None, beancount_file_filters=None):
+    @staticmethod
+    def get_beancount_files(beancount_files=None, beancount_file_filters=None):
         "Get Beancount files to process"
         absolute_beancount_files, filtered_beancount_files = set(), set()
         if beancount_files:
@@ -90,8 +89,8 @@ def get_beancount_files(beancount_files=None, beancount_file_filters=None):
         return all_beancount_files

-def extract_beancount_transactions(beancount_files):
+    @staticmethod
+    def extract_beancount_transactions(beancount_files):
         "Extract entries from specified Beancount files"

         # Initialize Regex for extracting Beancount Entries
@@ -111,8 +110,8 @@ def extract_beancount_transactions(beancount_files):
             entries.extend(transactions_per_file)
         return entries, dict(transaction_to_file_map)

-def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
+    @staticmethod
+    def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) -> list[dict]:
         "Convert each Beancount transaction into a dictionary"
         entry_maps = []
         for entry in entries:
@@ -122,7 +121,7 @@ def convert_transactions_to_maps(entries: list[str], transaction_to_file_map) ->
         return entry_maps

-def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
+    @staticmethod
+    def convert_transaction_maps_to_jsonl(entries: list[dict]) -> str:
         "Convert each Beancount transaction dictionary to JSON and collate as JSONL"
         return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

src/processor/markdown/markdown_to_jsonl.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # Standard Packages
 import json
 import glob
@@ -8,19 +6,20 @@ import logging
 import time

 # Internal Packages
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.constants import empty_escape_sequences
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils.rawconfig import TextContentConfig

 logger = logging.getLogger(__name__)

-# Define Functions
-def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
+class MarkdownToJsonl(TextToJsonl):
+    # Define Functions
+    def process(self, previous_entries=None):
         # Extract required fields from config
-    markdown_files, markdown_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
+        markdown_files, markdown_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl

         # Input Validation
         if is_none_or_empty(markdown_files) and is_none_or_empty(markdown_file_filter):
@@ -28,11 +27,11 @@ def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
             exit(1)

         # Get Markdown Files to Process
-    markdown_files = get_markdown_files(markdown_files, markdown_file_filter)
+        markdown_files = MarkdownToJsonl.get_markdown_files(markdown_files, markdown_file_filter)

         # Extract Entries from specified Markdown files
         start = time.time()
-    current_entries = convert_markdown_entries_to_maps(*extract_markdown_entries(markdown_files))
+        current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(*MarkdownToJsonl.extract_markdown_entries(markdown_files))
         end = time.time()
         logger.debug(f"Parse entries from Markdown files into dictionaries: {end - start} seconds")
@@ -48,7 +47,7 @@ def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
         # Process Each Entry from All Notes Files
         start = time.time()
         entries = list(map(lambda entry: entry[1], entries_with_ids))
-    jsonl_data = convert_markdown_maps_to_jsonl(entries)
+        jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)

         # Compress JSONL formatted Data
         if output_file.suffix == ".gz":
@@ -60,8 +59,8 @@ def markdown_to_jsonl(config: TextContentConfig, previous_entries=None):
         return entries_with_ids

-def get_markdown_files(markdown_files=None, markdown_file_filters=None):
+    @staticmethod
+    def get_markdown_files(markdown_files=None, markdown_file_filters=None):
         "Get Markdown files to process"
         absolute_markdown_files, filtered_markdown_files = set(), set()
         if markdown_files:
@@ -89,8 +88,8 @@ def get_markdown_files(markdown_files=None, markdown_file_filters=None):
         return all_markdown_files

-def extract_markdown_entries(markdown_files):
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
         "Extract entries by heading from specified Markdown files"

         # Regex to extract Markdown Entries by Heading
@@ -110,8 +109,8 @@ def extract_markdown_entries(markdown_files):
         return entries, dict(entry_to_file_map)

-def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
+    @staticmethod
+    def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> list[dict]:
         "Convert each Markdown entries into a dictionary"
         entry_maps = []
         for entry in entries:
@@ -121,7 +120,7 @@ def convert_markdown_entries_to_maps(entries: list[str], entry_to_file_map) -> l
         return entry_maps

-def convert_markdown_maps_to_jsonl(entries):
+    @staticmethod
+    def convert_markdown_maps_to_jsonl(entries):
         "Convert each Markdown entries to JSON and collate as JSONL"
         return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

src/processor/org_mode/org_to_jsonl.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 # Standard Packages
 import json
 import glob
@@ -9,20 +7,21 @@ from typing import Iterable

 # Internal Packages
 from src.processor.org_mode import orgnode
+from src.processor.text_to_jsonl import TextToJsonl
 from src.utils.helpers import get_absolute_path, is_none_or_empty, mark_entries_for_update
 from src.utils.jsonl import dump_jsonl, compress_jsonl_data
 from src.utils import state
 from src.utils.rawconfig import TextContentConfig

 logger = logging.getLogger(__name__)

-# Define Functions
-def org_to_jsonl(config: TextContentConfig, previous_entries=None):
+class OrgToJsonl(TextToJsonl):
+    # Define Functions
+    def process(self, previous_entries=None):
         # Extract required fields from config
-    org_files, org_file_filter, output_file = config.input_files, config.input_filter, config.compressed_jsonl
-    index_heading_entries = config.index_heading_entries
+        org_files, org_file_filter, output_file = self.config.input_files, self.config.input_filter, self.config.compressed_jsonl
+        index_heading_entries = self.config.index_heading_entries

         # Input Validation
         if is_none_or_empty(org_files) and is_none_or_empty(org_file_filter):
@@ -31,16 +30,16 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
         # Get Org Files to Process
         start = time.time()
-    org_files = get_org_files(org_files, org_file_filter)
+        org_files = OrgToJsonl.get_org_files(org_files, org_file_filter)

         # Extract Entries from specified Org files
         start = time.time()
-    entry_nodes, file_to_entries = extract_org_entries(org_files)
+        entry_nodes, file_to_entries = self.extract_org_entries(org_files)
         end = time.time()
         logger.debug(f"Parse entries from org files into OrgNode objects: {end - start} seconds")

         start = time.time()
-    current_entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
+        current_entries = self.convert_org_nodes_to_entries(entry_nodes, file_to_entries, index_heading_entries)
         end = time.time()
         logger.debug(f"Convert OrgNodes into entry dictionaries: {end - start} seconds")
@@ -53,7 +52,7 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
         # Process Each Entry from All Notes Files
         start = time.time()
         entries = map(lambda entry: entry[1], entries_with_ids)
-    jsonl_data = convert_org_entries_to_jsonl(entries)
+        jsonl_data = self.convert_org_entries_to_jsonl(entries)

         # Compress JSONL formatted Data
         if output_file.suffix == ".gz":
@@ -65,8 +64,8 @@ def org_to_jsonl(config: TextContentConfig, previous_entries=None):
         return entries_with_ids

-def get_org_files(org_files=None, org_file_filters=None):
+    @staticmethod
+    def get_org_files(org_files=None, org_file_filters=None):
         "Get Org files to process"
         absolute_org_files, filtered_org_files = set(), set()
         if org_files:
@@ -92,8 +91,8 @@ def get_org_files(org_files=None, org_file_filters=None):
         return all_org_files

-def extract_org_entries(org_files):
+    @staticmethod
+    def extract_org_entries(org_files):
         "Extract entries from specified Org files"
         entries = []
         entry_to_file_map = []
@@ -104,8 +103,8 @@ def extract_org_entries(org_files):
         return entries, dict(entry_to_file_map)

-def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
+    @staticmethod
+    def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False) -> list[dict]:
         "Convert Org-Mode entries into list of dictionary"
         entry_maps = []
         for entry in entries:
@@ -149,7 +148,7 @@ def convert_org_nodes_to_entries(entries: list[orgnode.Orgnode], entry_to_file_m
         return entry_maps

-def convert_org_entries_to_jsonl(entries: Iterable[dict]) -> str:
+    @staticmethod
+    def convert_org_entries_to_jsonl(entries: Iterable[dict]) -> str:
         "Convert each Org-Mode entry to JSON and collate as JSONL"
         return ''.join([f'{json.dumps(entry_dict, ensure_ascii=False)}\n' for entry_dict in entries])

src/processor/text_to_jsonl.py (new file)

@@ -0,0 +1,14 @@
+# Standard Packages
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+# Internal Packages
+from src.utils.rawconfig import TextContentConfig
+
+
+class TextToJsonl(ABC):
+    def __init__(self, config: TextContentConfig):
+        self.config = config
+
+    @abstractmethod
+    def process(self, previous_entries: Iterable[tuple[int, dict]]=None) -> list[tuple[int, dict]]: ...
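
A side note, not from the commit itself: the builtin-generic annotations in this new file (Iterable[tuple[int, dict]], list[tuple[int, dict]]) are evaluated when the method is defined, so they only parse on Python 3.9+ (PEP 585). A sketch of the same signature spelled with typing aliases for older interpreters:

# Equivalent signature using pre-3.9 typing aliases (an assumption, not in the commit)
from abc import ABC, abstractmethod
from typing import Iterable, List, Tuple

class TextToJsonl(ABC):
    def __init__(self, config):
        self.config = config

    @abstractmethod
    def process(self, previous_entries: Iterable[Tuple[int, dict]] = None) -> List[Tuple[int, dict]]: ...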

src/search_type/text_search.py

@@ -1,10 +1,12 @@
 # Standard Packages
 import logging
 import time
+from typing import Type

 # External Packages
 import torch
 from sentence_transformers import SentenceTransformer, CrossEncoder, util

+from src.processor.text_to_jsonl import TextToJsonl
 from src.search_filter.base_filter import BaseFilter

 # Internal Packages
@@ -179,14 +181,14 @@ def collate_results(hits, entries, count=5):
         in hits[0:count]]

-def setup(text_to_jsonl, config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
+def setup(text_to_jsonl: Type[TextToJsonl], config: TextContentConfig, search_config: TextSearchConfig, regenerate: bool, filters: list[BaseFilter] = []) -> TextSearchModel:
     # Initialize Model
     bi_encoder, cross_encoder, top_k = initialize_model(search_config)

     # Map notes in text files to (compressed) JSONL formatted file
     config.compressed_jsonl = resolve_absolute_path(config.compressed_jsonl)
     previous_entries = extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None
-    entries_with_indices = text_to_jsonl(config, previous_entries)
+    entries_with_indices = text_to_jsonl(config).process(previous_entries)

     # Extract Updated Entries
     entries = extract_entries(config.compressed_jsonl)
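
The behavioral change in setup() above: callers now pass the processor class instead of a function, and setup() instantiates it with the content config before invoking the standardized process() method. A self-contained sketch of that dispatch, with toy stand-ins for the real config and processor types:

# Toy illustration of the new dispatch in text_search.setup(): class in, entries out
from abc import ABC, abstractmethod
from typing import Type

class TextToJsonl(ABC):  # stand-in for src.processor.text_to_jsonl.TextToJsonl
    def __init__(self, config):
        self.config = config

    @abstractmethod
    def process(self, previous_entries=None): ...

class OrgToJsonl(TextToJsonl):  # stand-in; the real class lives in org_to_jsonl.py
    def process(self, previous_entries=None):
        return [(0, {'raw': str(self.config)})]

def setup(text_to_jsonl: Type[TextToJsonl], config):
    # Before: text_to_jsonl(config, previous_entries) called a plain function
    # After: instantiate the class with its config, then call process()
    return text_to_jsonl(config).process(previous_entries=None)

print(setup(OrgToJsonl, "org content config"))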

tests/conftest.py

@@ -6,7 +6,7 @@ from src.search_type import image_search, text_search
 from src.utils.config import SearchType
 from src.utils.helpers import resolve_absolute_path
 from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, ImageSearchConfig
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.search_filter.date_filter import DateFilter
 from src.search_filter.word_filter import WordFilter
 from src.search_filter.file_filter import FileFilter
@@ -60,6 +60,6 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
         embeddings_file = content_dir.joinpath('note_embeddings.pt'))

     filters = [DateFilter(), WordFilter(), FileFilter()]
-    text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)

     return content_config

tests/test_beancount_to_jsonl.py

@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.ledger.beancount_to_jsonl import extract_beancount_transactions, convert_transactions_to_maps, convert_transaction_maps_to_jsonl, get_beancount_files
+from src.processor.ledger.beancount_to_jsonl import BeancountToJsonl


 def test_no_transactions_in_file(tmp_path):
@@ -16,10 +16,11 @@ def test_no_transactions_in_file(tmp_path):
     # Act
     # Extract Entries from specified Beancount files
-    entry_nodes, file_to_entries = extract_beancount_transactions(beancount_files=[beancount_file])
+    entry_nodes, file_to_entries = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])

     # Process Each Entry from All Beancount Files
-    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entry_nodes, file_to_entries))
+    jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
+        BeancountToJsonl.convert_transactions_to_maps(entry_nodes, file_to_entries))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -38,10 +39,11 @@ Assets:Test:Test -1.00 KES
     # Act
     # Extract Entries from specified Beancount files
-    entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
+    entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])

     # Process Each Entry from All Beancount Files
-    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
+    jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
+        BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -65,10 +67,11 @@ Assets:Test:Test -1.00 KES
     # Act
     # Extract Entries from specified Beancount files
-    entries, entry_to_file_map = extract_beancount_transactions(beancount_files=[beancount_file])
+    entries, entry_to_file_map = BeancountToJsonl.extract_beancount_transactions(beancount_files=[beancount_file])

     # Process Each Entry from All Beancount Files
-    jsonl_string = convert_transaction_maps_to_jsonl(convert_transactions_to_maps(entries, entry_to_file_map))
+    jsonl_string = BeancountToJsonl.convert_transaction_maps_to_jsonl(
+        BeancountToJsonl.convert_transactions_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -96,7 +99,7 @@ def test_get_beancount_files(tmp_path):
     input_filter = [tmp_path / 'group1*.bean', tmp_path / 'group2*.beancount']

     # Act
-    extracted_org_files = get_beancount_files(input_files, input_filter)
+    extracted_org_files = BeancountToJsonl.get_beancount_files(input_files, input_filter)

     # Assert
     assert len(extracted_org_files) == 5

tests/test_client.py

@@ -12,7 +12,7 @@ from src.main import app
 from src.utils.state import model, config
 from src.search_type import text_search, image_search
 from src.utils.rawconfig import ContentConfig, SearchConfig
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.search_filter.word_filter import WordFilter
 from src.search_filter.file_filter import FileFilter
@@ -118,7 +118,7 @@ def test_image_search(content_config: ContentConfig, search_config: SearchConfig
 # ----------------------------------------------------------------------------------------------------
 def test_notes_search(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)
     user_query = quote("How to git install application?")

     # Act
@@ -135,7 +135,7 @@ def test_notes_search(content_config: ContentConfig, search_config: SearchConfig
 def test_notes_search_with_only_filters(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter(), FileFilter()]
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = quote('+"Emacs" file:"*.org"')

     # Act
@@ -152,7 +152,7 @@ def test_notes_search_with_only_filters(content_config: ContentConfig, search_co
 def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter()]
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = quote('How to git install application? +"Emacs"')

     # Act
@@ -169,7 +169,7 @@ def test_notes_search_with_include_filter(content_config: ContentConfig, search_
 def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
     filters = [WordFilter()]
-    model.orgmode_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
+    model.orgmode_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False, filters=filters)
     user_query = quote('How to git install application? -"clone"')

     # Act

tests/test_markdown_to_jsonl.py

@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.markdown.markdown_to_jsonl import extract_markdown_entries, convert_markdown_maps_to_jsonl, convert_markdown_entries_to_maps, get_markdown_files
+from src.processor.markdown.markdown_to_jsonl import MarkdownToJsonl


 def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
@@ -16,10 +16,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = extract_markdown_entries(markdown_files=[markdownfile])
+    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
+    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
+        MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -37,10 +38,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
+    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
+    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
+        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -62,10 +64,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = extract_markdown_entries(markdown_files=[markdownfile])
+    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=[markdownfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_markdown_maps_to_jsonl(convert_markdown_entries_to_maps(entries, entry_to_file_map))
+    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
+        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -93,7 +96,7 @@ def test_get_markdown_files(tmp_path):
     input_filter = [tmp_path / 'group1*.md', tmp_path / 'group2*.markdown']

     # Act
-    extracted_org_files = get_markdown_files(input_files, input_filter)
+    extracted_org_files = MarkdownToJsonl.get_markdown_files(input_files, input_filter)

     # Assert
     assert len(extracted_org_files) == 5

tests/test_org_to_jsonl.py

@@ -2,7 +2,7 @@
 import json

 # Internal Packages
-from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, convert_org_nodes_to_entries, extract_org_entries, get_org_files
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl
 from src.utils.helpers import is_none_or_empty
@@ -21,8 +21,8 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
     for index_heading_entries in [True, False]:
         # Act
         # Extract entries into jsonl from specified Org files
-        jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(
-            *extract_org_entries(org_files=[orgfile]),
+        jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(
+            *OrgToJsonl.extract_org_entries(org_files=[orgfile]),
             index_heading_entries=index_heading_entries))
         jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -49,10 +49,10 @@ def test_entry_with_body_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = extract_org_entries(org_files=[orgfile])
+    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    jsonl_string = convert_org_entries_to_jsonl(convert_org_nodes_to_entries(entries, entry_to_file_map))
+    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map))
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -70,11 +70,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = extract_org_entries(org_files=[orgfile])
+    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])

     # Process Each Entry from All Notes Files
-    entries = convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = convert_org_entries_to_jsonl(entries)
+    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -102,7 +102,7 @@ def test_get_org_files(tmp_path):
     input_filter = [tmp_path / 'group1*.org', tmp_path / 'group2*.org']

     # Act
-    extracted_org_files = get_org_files(input_files, input_filter)
+    extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)

     # Assert
     assert len(extracted_org_files) == 5

tests/test_text_search.py

@@ -9,7 +9,7 @@ import pytest
 from src.utils.state import model
 from src.search_type import text_search
 from src.utils.rawconfig import ContentConfig, SearchConfig
-from src.processor.org_mode.org_to_jsonl import org_to_jsonl
+from src.processor.org_mode.org_to_jsonl import OrgToJsonl


 # Test
@@ -24,7 +24,7 @@ def test_asymmetric_setup_with_missing_file_raises_error(content_config: Content
     # Act
     # Generate notes embeddings during asymmetric setup
     with pytest.raises(FileNotFoundError):
-        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
+        text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True)


 # ----------------------------------------------------------------------------------------------------
@@ -39,7 +39,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo
     # Act
     # Generate notes embeddings during asymmetric setup
     with pytest.raises(ValueError, match=r'^No valid entries found*'):
-        text_search.setup(org_to_jsonl, new_org_content_config, search_config.asymmetric, regenerate=True)
+        text_search.setup(OrgToJsonl, new_org_content_config, search_config.asymmetric, regenerate=True)

     # Cleanup
     # delete created test file
@@ -50,7 +50,7 @@ def test_asymmetric_setup_with_empty_file_raises_error(content_config: ContentCo
 def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
     # Act
     # Regenerate notes embeddings during asymmetric setup
-    notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)

     # Assert
     assert len(notes_model.entries) == 10
@@ -60,7 +60,7 @@ def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchCo
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    model.notes_search = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    model.notes_search = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)
     query = "How to git install application?"

     # Act
@@ -83,7 +83,7 @@ def test_asymmetric_search(content_config: ContentConfig, search_config: SearchC
 # ----------------------------------------------------------------------------------------------------
 def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

     assert len(initial_notes_model.entries) == 10
     assert len(initial_notes_model.corpus_embeddings) == 10
@@ -96,11 +96,11 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
         f.write("\n* A Chihuahua doing Tango\n- Saw a super cute video of a chihuahua doing the Tango on Youtube\n")

     # regenerate notes jsonl, model embeddings and model to include entry from new file
-    regenerated_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    regenerated_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)

     # Act
     # reload embeddings, entries, notes model from previously generated notes jsonl and model embeddings files
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

     # Assert
     assert len(regenerated_notes_model.entries) == 11
@@ -119,7 +119,7 @@ def test_asymmetric_reload(content_config: ContentConfig, search_config: SearchC
 # ----------------------------------------------------------------------------------------------------
 def test_incremental_update(content_config: ContentConfig, search_config: SearchConfig):
     # Arrange
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=True)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=True)

     assert len(initial_notes_model.entries) == 10
     assert len(initial_notes_model.corpus_embeddings) == 10
@@ -133,7 +133,7 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
     # Act
     # update embeddings, entries with the newly added note
-    initial_notes_model = text_search.setup(org_to_jsonl, content_config.org, search_config.asymmetric, regenerate=False)
+    initial_notes_model = text_search.setup(OrgToJsonl, content_config.org, search_config.asymmetric, regenerate=False)

     # verify new entry added in updated embeddings, entries
     assert len(initial_notes_model.entries) == 11