Rename RawConfig Types for Consistency

- Naming convention: [ContentType][ConfigType]Config
  - where [ConfigType] is one of Content, Search, Processor
  - where [ContentType] is one of Text, Image, Asymmetric, Symmetric, Conversation
  - e.g. TextContentConfig, AsymmetricSearchConfig, ConversationProcessorConfig

- Current Configs:
  - Content:
    - Org Notes
    - Org Music
    - Image
    - Ledger/Beancount

  - Search:
    - Asymmetric
    - Symmetric
    - Image

  - Processor:
    - Conversation
This commit is contained in:
Debanjum Singh Solanky 2022-01-14 20:54:38 -05:00
parent ed7c2901f5
commit 179153dc5a
8 changed files with 62 additions and 69 deletions

View file

@ -15,10 +15,10 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
from src.processor.org_mode.org_to_jsonl import org_to_jsonl from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.utils.config import TextSearchModel from src.utils.config import TextSearchModel
from src.utils.rawconfig import AsymmetricConfig, TextSearchConfig from src.utils.rawconfig import AsymmetricSearchConfig, TextContentConfig
def initialize_model(search_config: AsymmetricConfig): def initialize_model(search_config: AsymmetricSearchConfig):
"Initialize model for assymetric semantic search. That is, where query smaller than results" "Initialize model for assymetric semantic search. That is, where query smaller than results"
torch.set_num_threads(4) torch.set_num_threads(4)
@ -162,7 +162,7 @@ def collate_results(hits, entries, count=5):
in hits[0:count]] in hits[0:count]]
def setup(config: TextSearchConfig, search_config: AsymmetricConfig, regenerate: bool, verbose: bool=False) -> TextSearchModel: def setup(config: TextContentConfig, search_config: AsymmetricSearchConfig, regenerate: bool, verbose: bool=False) -> TextSearchModel:
# Initialize Model # Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config) bi_encoder, cross_encoder, top_k = initialize_model(search_config)

View file

@ -13,10 +13,10 @@ import torch
from src.utils.helpers import resolve_absolute_path, load_model from src.utils.helpers import resolve_absolute_path, load_model
import src.utils.exiftool as exiftool import src.utils.exiftool as exiftool
from src.utils.config import ImageSearchModel from src.utils.config import ImageSearchModel
from src.utils.rawconfig import ImageSearchConfig, ImageSearchTypeConfig from src.utils.rawconfig import ImageContentConfig, ImageSearchConfig
def initialize_model(search_config: ImageSearchTypeConfig): def initialize_model(search_config: ImageSearchConfig):
# Initialize Model # Initialize Model
torch.set_num_threads(4) torch.set_num_threads(4)
@ -160,7 +160,7 @@ def collate_results(hits, image_names, image_directory, count=5):
in hits[0:count]] in hits[0:count]]
def setup(config: ImageSearchConfig, search_config: ImageSearchTypeConfig, regenerate: bool, verbose: bool=False) -> ImageSearchModel: def setup(config: ImageContentConfig, search_config: ImageSearchConfig, regenerate: bool, verbose: bool=False) -> ImageSearchModel:
# Initialize Model # Initialize Model
encoder = initialize_model(search_config) encoder = initialize_model(search_config)

View file

@ -13,10 +13,10 @@ from sentence_transformers import SentenceTransformer, CrossEncoder, util
from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model from src.utils.helpers import get_absolute_path, resolve_absolute_path, load_model
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.utils.config import TextSearchModel from src.utils.config import TextSearchModel
from src.utils.rawconfig import SymmetricConfig, TextSearchConfig from src.utils.rawconfig import SymmetricSearchConfig, TextContentConfig
def initialize_model(search_config: SymmetricConfig): def initialize_model(search_config: SymmetricSearchConfig):
"Initialize model for symmetric semantic search. That is, where query of similar size to results" "Initialize model for symmetric semantic search. That is, where query of similar size to results"
torch.set_num_threads(4) torch.set_num_threads(4)
@ -154,7 +154,7 @@ def collate_results(hits, entries, count=5):
in hits[0:count]] in hits[0:count]]
def setup(config: TextSearchConfig, search_config: SymmetricConfig, regenerate: bool, verbose: bool) -> TextSearchModel: def setup(config: TextContentConfig, search_config: SymmetricSearchConfig, regenerate: bool, verbose: bool) -> TextSearchModel:
# Initialize Model # Initialize Model
bi_encoder, cross_encoder, top_k = initialize_model(search_config) bi_encoder, cross_encoder, top_k = initialize_model(search_config)

View file

@ -13,57 +13,52 @@ class ConfigBase(BaseModel):
alias_generator = to_snake_case_from_dash alias_generator = to_snake_case_from_dash
allow_population_by_field_name = True allow_population_by_field_name = True
class SearchConfig(ConfigBase): class TextContentConfig(ConfigBase):
input_files: Optional[List[str]]
input_filter: Optional[str]
embeddings_file: Optional[Path]
class TextSearchConfig(ConfigBase):
compressed_jsonl: Optional[Path] compressed_jsonl: Optional[Path]
input_files: Optional[List[str]] input_files: Optional[List[str]]
input_filter: Optional[str] input_filter: Optional[str]
embeddings_file: Optional[Path] embeddings_file: Optional[Path]
class ImageSearchConfig(ConfigBase): class ImageContentConfig(ConfigBase):
use_xmp_metadata: Optional[str] use_xmp_metadata: Optional[str]
batch_size: Optional[int] batch_size: Optional[int]
input_directory: Optional[Path] input_directory: Optional[Path]
input_filter: Optional[str] input_filter: Optional[str]
embeddings_file: Optional[Path] embeddings_file: Optional[Path]
class ContentTypeConfig(ConfigBase): class ContentConfig(ConfigBase):
org: Optional[TextSearchConfig] org: Optional[TextContentConfig]
ledger: Optional[TextSearchConfig] ledger: Optional[TextContentConfig]
image: Optional[ImageContentConfig]
music: Optional[TextContentConfig]
class SymmetricSearchConfig(ConfigBase):
encoder: Optional[str]
cross_encoder: Optional[str]
model_directory: Optional[Path]
class AsymmetricSearchConfig(ConfigBase):
encoder: Optional[str]
cross_encoder: Optional[str]
model_directory: Optional[Path]
class ImageSearchConfig(ConfigBase):
encoder: Optional[str]
model_directory: Optional[Path]
class SearchConfig(ConfigBase):
asymmetric: Optional[AsymmetricSearchConfig]
symmetric: Optional[SymmetricSearchConfig]
image: Optional[ImageSearchConfig] image: Optional[ImageSearchConfig]
music: Optional[TextSearchConfig]
class SymmetricConfig(ConfigBase):
encoder: Optional[str]
cross_encoder: Optional[str]
model_directory: Optional[Path]
class AsymmetricConfig(ConfigBase):
encoder: Optional[str]
cross_encoder: Optional[str]
model_directory: Optional[Path]
class ImageSearchTypeConfig(ConfigBase):
encoder: Optional[str]
model_directory: Optional[Path]
class SearchTypeConfig(ConfigBase):
asymmetric: Optional[AsymmetricConfig]
symmetric: Optional[SymmetricConfig]
image: Optional[ImageSearchTypeConfig]
class ConversationProcessorConfig(ConfigBase): class ConversationProcessorConfig(ConfigBase):
openai_api_key: Optional[str] openai_api_key: Optional[str]
conversation_logfile: Optional[str] conversation_logfile: Optional[str]
class ProcessorConfigModel(ConfigBase): class ProcessorConfig(ConfigBase):
conversation: Optional[ConversationProcessorConfig] conversation: Optional[ConversationProcessorConfig]
class FullConfig(ConfigBase): class FullConfig(ConfigBase):
content_type: Optional[ContentTypeConfig] content_type: Optional[ContentConfig]
search_type: Optional[SearchTypeConfig] search_type: Optional[SearchConfig]
processor: Optional[ProcessorConfigModel] processor: Optional[ProcessorConfig]

View file

@ -1,32 +1,30 @@
# Standard Packages # Standard Packages
import pytest import pytest
from pathlib import Path
from src import search_type
# Internal Packages # Internal Packages
from src.search_type import asymmetric, image_search from src.search_type import asymmetric, image_search
from src.utils.rawconfig import AsymmetricConfig, ContentTypeConfig, ImageSearchConfig, ImageSearchTypeConfig, SearchTypeConfig, SymmetricConfig, TextSearchConfig from src.utils.rawconfig import ContentConfig, TextContentConfig, ImageContentConfig, SearchConfig, SymmetricSearchConfig, AsymmetricSearchConfig, ImageSearchConfig
@pytest.fixture(scope='session') @pytest.fixture(scope='session')
def search_config(tmp_path_factory): def search_config(tmp_path_factory):
model_dir = tmp_path_factory.mktemp('data') model_dir = tmp_path_factory.mktemp('data')
search_config = SearchTypeConfig() search_config = SearchConfig()
search_config.asymmetric = SymmetricConfig( search_config.asymmetric = SymmetricSearchConfig(
encoder = "sentence-transformers/paraphrase-MiniLM-L6-v2", encoder = "sentence-transformers/paraphrase-MiniLM-L6-v2",
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
model_directory = model_dir model_directory = model_dir
) )
search_config.asymmetric = AsymmetricConfig( search_config.asymmetric = AsymmetricSearchConfig(
encoder = "sentence-transformers/msmarco-MiniLM-L-6-v3", encoder = "sentence-transformers/msmarco-MiniLM-L-6-v3",
cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2", cross_encoder = "cross-encoder/ms-marco-MiniLM-L-6-v2",
model_directory = model_dir model_directory = model_dir
) )
search_config.image = ImageSearchTypeConfig( search_config.image = ImageSearchConfig(
encoder = "clip-ViT-B-32", encoder = "clip-ViT-B-32",
model_directory = model_dir model_directory = model_dir
) )
@ -39,8 +37,8 @@ def model_dir(search_config):
model_dir = search_config.asymmetric.model_directory model_dir = search_config.asymmetric.model_directory
# Generate Image Embeddings from Test Images # Generate Image Embeddings from Test Images
content_config = ContentTypeConfig() content_config = ContentConfig()
content_config.image = ImageSearchConfig( content_config.image = ImageContentConfig(
input_directory = 'tests/data', input_directory = 'tests/data',
embeddings_file = model_dir.joinpath('.image_embeddings.pt'), embeddings_file = model_dir.joinpath('.image_embeddings.pt'),
batch_size = 10, batch_size = 10,
@ -49,7 +47,7 @@ def model_dir(search_config):
image_search.setup(content_config.image, search_config.image, regenerate=False, verbose=True) image_search.setup(content_config.image, search_config.image, regenerate=False, verbose=True)
# Generate Notes Embeddings from Test Notes # Generate Notes Embeddings from Test Notes
content_config.org = TextSearchConfig( content_config.org = TextContentConfig(
input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'], input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
input_filter = None, input_filter = None,
compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'), compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'),
@ -62,14 +60,14 @@ def model_dir(search_config):
@pytest.fixture(scope='session') @pytest.fixture(scope='session')
def content_config(model_dir): def content_config(model_dir):
content_config = ContentTypeConfig() content_config = ContentConfig()
content_config.org = TextSearchConfig( content_config.org = TextContentConfig(
input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'], input_files = ['tests/data/main_readme.org', 'tests/data/interface_emacs_readme.org'],
input_filter = None, input_filter = None,
compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'), compressed_jsonl = model_dir.joinpath('.notes.jsonl.gz'),
embeddings_file = model_dir.joinpath('.note_embeddings.pt')) embeddings_file = model_dir.joinpath('.note_embeddings.pt'))
content_config.image = ImageSearchConfig( content_config.image = ImageContentConfig(
input_directory = 'tests/data', input_directory = 'tests/data',
embeddings_file = model_dir.joinpath('.image_embeddings.pt'), embeddings_file = model_dir.joinpath('.image_embeddings.pt'),
batch_size = 10, batch_size = 10,

View file

@ -1,12 +1,12 @@
# Internal Packages # Internal Packages
from src.main import model from src.main import model
from src.search_type import asymmetric from src.search_type import asymmetric
from src.utils.rawconfig import ContentTypeConfig, SearchTypeConfig from src.utils.rawconfig import ContentConfig, SearchConfig
# Test # Test
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_asymmetric_setup(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_asymmetric_setup(content_config: ContentConfig, search_config: SearchConfig):
# Act # Act
# Regenerate notes embeddings during asymmetric setup # Regenerate notes embeddings during asymmetric setup
notes_model = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=True) notes_model = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=True)
@ -17,7 +17,7 @@ def test_asymmetric_setup(content_config: ContentTypeConfig, search_config: Sear
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_asymmetric_search(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_asymmetric_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False) model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
query = "How to git install application?" query = "How to git install application?"

View file

@ -9,7 +9,7 @@ import pytest
from src.main import app, model, config from src.main import app, model, config
from src.search_type import asymmetric, image_search from src.search_type import asymmetric, image_search
from src.utils.helpers import resolve_absolute_path from src.utils.helpers import resolve_absolute_path
from src.utils.rawconfig import ContentTypeConfig, SearchTypeConfig from src.utils.rawconfig import ContentConfig, SearchConfig
# Arrange # Arrange
@ -30,7 +30,7 @@ def test_search_with_invalid_content_type():
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_search_with_valid_content_type(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_search_with_valid_content_type(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
config.content_type = content_config config.content_type = content_config
config.search_type = search_config config.search_type = search_config
@ -53,7 +53,7 @@ def test_regenerate_with_invalid_content_type():
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_regenerate_with_valid_content_type(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_regenerate_with_valid_content_type(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
config.content_type = content_config config.content_type = content_config
config.search_type = search_config config.search_type = search_config
@ -67,7 +67,7 @@ def test_regenerate_with_valid_content_type(content_config: ContentTypeConfig, s
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="Flaky test. Search doesn't always return expected image path.") @pytest.mark.skip(reason="Flaky test. Search doesn't always return expected image path.")
def test_image_search(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
config.content_type = content_config config.content_type = content_config
config.search_type = search_config config.search_type = search_config
@ -90,7 +90,7 @@ def test_image_search(content_config: ContentTypeConfig, search_config: SearchTy
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_notes_search(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_notes_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False) model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
user_query = "How to git install application?" user_query = "How to git install application?"
@ -106,7 +106,7 @@ def test_notes_search(content_config: ContentTypeConfig, search_config: SearchTy
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_notes_search_with_include_filter(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_notes_search_with_include_filter(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False) model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
user_query = "How to git install application? +Emacs" user_query = "How to git install application? +Emacs"
@ -122,7 +122,7 @@ def test_notes_search_with_include_filter(content_config: ContentTypeConfig, sea
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_notes_search_with_exclude_filter(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_notes_search_with_exclude_filter(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False) model.notes_search = asymmetric.setup(content_config.org, search_config.asymmetric, regenerate=False)
user_query = "How to git install application? -clone" user_query = "How to git install application? -clone"

View file

@ -5,12 +5,12 @@ import pytest
from src.main import model from src.main import model
from src.search_type import image_search from src.search_type import image_search
from src.utils.helpers import resolve_absolute_path from src.utils.helpers import resolve_absolute_path
from src.utils.rawconfig import ContentTypeConfig, SearchTypeConfig from src.utils.rawconfig import ContentConfig, SearchConfig
# Test # Test
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
def test_image_search_setup(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_image_search_setup(content_config: ContentConfig, search_config: SearchConfig):
# Act # Act
# Regenerate image search embeddings during image setup # Regenerate image search embeddings during image setup
image_search_model = image_search.setup(content_config.image, search_config.image, regenerate=True) image_search_model = image_search.setup(content_config.image, search_config.image, regenerate=True)
@ -22,7 +22,7 @@ def test_image_search_setup(content_config: ContentTypeConfig, search_config: Se
# ---------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------
@pytest.mark.skip(reason="results inconsistent currently") @pytest.mark.skip(reason="results inconsistent currently")
def test_image_search(content_config: ContentTypeConfig, search_config: SearchTypeConfig): def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
# Arrange # Arrange
model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False) model.image_search = image_search.setup(content_config.image, search_config.image, regenerate=False)
query_expected_image_pairs = [("brown kitten next to plant", "kitten_park.jpg"), query_expected_image_pairs = [("brown kitten next to plant", "kitten_park.jpg"),