# khoj/src/utils/config.py

# System Packages
from enum import Enum
from dataclasses import dataclass
from pathlib import Path

# Internal Packages
from src.utils.helpers import get_from_dict


class SearchType(str, Enum):
    """Closed set of search type identifiers.

    Because ``str`` is mixed in, each member is also a string and compares
    equal to its lowercase value (e.g. ``SearchType.Notes == 'notes'``).
    """
    Notes = 'notes'
    Ledger = 'ledger'
    Music = 'music'
    Image = 'image'


class TextSearchModel:
    """Plain container for everything a text search needs at query time.

    Each constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.
    """
    def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose):
        # Searchable data
        self.entries, self.corpus_embeddings = entries, corpus_embeddings
        # Encoder objects
        self.bi_encoder, self.cross_encoder = bi_encoder, cross_encoder
        # Query settings
        self.top_k, self.verbose = top_k, verbose


class ImageSearchModel:
    """Plain container for everything an image search needs at query time.

    Each constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.
    """
    def __init__(self, image_names, image_embeddings, image_metadata_embeddings, image_encoder, verbose):
        # image_encoder is assigned first (and only once) so the attribute
        # order of the instance stays the same as before the duplicate
        # assignment was removed.
        self.image_encoder = image_encoder
        self.image_names = image_names
        self.image_embeddings = image_embeddings
        self.image_metadata_embeddings = image_metadata_embeddings
        self.verbose = verbose


@dataclass
class SearchModels:
    """Holds one optional model per search type.

    Every field defaults to ``None``.
    """
    # Text-based search types share the TextSearchModel container
    notes_search: TextSearchModel = None
    ledger_search: TextSearchModel = None
    music_search: TextSearchModel = None
    # Image search uses its own container
    image_search: ImageSearchModel = None


class TextSearchConfig:
    """Configuration for a single text-based search type.

    ``compressed_jsonl`` and ``embeddings_file`` are normalised to
    ``pathlib.Path``; the remaining arguments are stored as given.
    """
    def __init__(self, input_files, input_filter, compressed_jsonl, embeddings_file, verbose):
        self.input_files = input_files
        self.input_filter = input_filter
        # Path() accepts both str and Path, so callers may pass either
        self.compressed_jsonl = Path(compressed_jsonl)
        self.embeddings_file = Path(embeddings_file)
        self.verbose = verbose

    @staticmethod
    def create_from_dictionary(config, key_tree, verbose):
        """Build a TextSearchConfig from a nested configuration mapping.

        ``key_tree`` is unpacked into ``get_from_dict`` to locate the section
        for this search type (presumably a sequence of nested keys; confirm
        against ``get_from_dict``).

        Returns ``None`` when the section is empty/missing or contains
        neither 'input-files' nor 'input-filter'. Raises ``KeyError`` when an
        enabled section lacks 'compressed-jsonl' or 'embeddings-file'.
        """
        text_config = get_from_dict(config, *key_tree)
        search_enabled = text_config and ('input-files' in text_config or 'input-filter' in text_config)
        if not search_enabled:
            return None

        # Either input key alone enables the search, so the other one may be
        # absent; fall back to None instead of raising KeyError.
        input_files = text_config['input-files'] if 'input-files' in text_config else None
        input_filter = text_config['input-filter'] if 'input-filter' in text_config else None

        # Path conversion of the two file settings happens in __init__
        return TextSearchConfig(
            input_files = input_files,
            input_filter = input_filter,
            compressed_jsonl = text_config['compressed-jsonl'],
            embeddings_file = text_config['embeddings-file'],
            verbose = verbose)


class ImageSearchConfig:
    """Configuration for image search.

    ``embeddings_file`` is normalised to ``pathlib.Path``; the remaining
    arguments are stored as given.
    """
    def __init__(self, input_directory, embeddings_file, batch_size, use_xmp_metadata, verbose):
        self.input_directory = input_directory
        # Path() accepts both str and Path, so callers may pass either
        self.embeddings_file = Path(embeddings_file)
        self.batch_size = batch_size
        self.use_xmp_metadata = use_xmp_metadata
        self.verbose = verbose

    @staticmethod
    def create_from_dictionary(config, key_tree, verbose):
        """Build an ImageSearchConfig from a nested configuration mapping.

        ``key_tree`` is unpacked into ``get_from_dict`` to locate the image
        section (presumably a sequence of nested keys; confirm against
        ``get_from_dict``).

        Returns ``None`` when the section is empty/missing or has no
        'input-directory'. Raises ``KeyError`` when an enabled section lacks
        'embeddings-file', 'batch-size' or 'use-xmp-metadata', or when
        'use-xmp-metadata' is neither a boolean nor 'yes'/'no'.
        """
        image_config = get_from_dict(config, *key_tree)
        search_enabled = image_config and 'input-directory' in image_config
        if not search_enabled:
            return None

        # Accept a value that is already a boolean as-is (YAML 1.1 loaders
        # turn an unquoted yes/no into True/False); the strings 'yes'/'no'
        # are still translated through the lookup table.
        xmp_setting = image_config['use-xmp-metadata']
        if not isinstance(xmp_setting, bool):
            xmp_setting = {'yes': True, 'no': False}[xmp_setting]

        return ImageSearchConfig(
            input_directory = Path(image_config['input-directory']),
            embeddings_file = image_config['embeddings-file'],
            batch_size = image_config['batch-size'],
            use_xmp_metadata = xmp_setting,
            verbose = verbose)


@dataclass
class SearchConfig:
    """Holds one optional configuration per search type.

    Every field defaults to ``None``.
    """
    # Text-based search types share the TextSearchConfig class
    notes: TextSearchConfig = None
    ledger: TextSearchConfig = None
    music: TextSearchConfig = None
    # Image search has its own configuration class
    image: ImageSearchConfig = None


class ConversationProcessorConfig:
    """Configuration and log state for the conversation processor.

    Every constructor argument is stored unchanged on the instance under the
    same name.
    """
    def __init__(self, conversation_logfile, chat_log, meta_log, openai_api_key, verbose):
        self.openai_api_key = openai_api_key
        self.conversation_logfile = conversation_logfile
        self.chat_log = chat_log
        self.meta_log = meta_log
        self.verbose = verbose

    @staticmethod
    def create_from_dictionary(config, key_tree, verbose):
        """Build a ConversationProcessorConfig from a nested configuration mapping.

        ``key_tree`` is unpacked into ``get_from_dict`` to locate the
        conversation section (presumably a sequence of nested keys; confirm
        against ``get_from_dict``).

        Returns ``None`` when the section is empty or missing. Raises
        ``KeyError`` when the section lacks 'openai-api-key' or
        'conversation-logfile'.
        """
        conversation_config = get_from_dict(config, *key_tree)
        if not conversation_config:
            return None

        # chat_log and meta_log always start empty; a fresh list is created
        # on every call so instances never share the same meta_log object.
        return ConversationProcessorConfig(
            openai_api_key = conversation_config['openai-api-key'],
            chat_log = '',
            meta_log = [],
            conversation_logfile = Path(conversation_config['conversation-logfile']),
            verbose = verbose)


@dataclass
class ProcessorConfig:
    """Holds the optional configuration of each processor.

    Every field defaults to ``None``.
    """
    # Configuration of the conversation processor
    conversation: ConversationProcessorConfig = None
Wrap search type enablement status in a search settings class - Cleaner, more idiomatic usage of a global variable - Simplifies mocking when testing client in pytest as setting wrapped in object rather than a simple type. So passed around by reference 2021-09-30 04:18:33 +02:00			`# System Packages`
Only allow supported search types to /search, /regenerate APIs - Use a SearchType to limit types that can be passed by user - FastAPI automatically validates type passed in query param - Available type options show up in Swagger UI, FastAPI docs - controller code looks neater instead of doing string comparisons for type - Test invalid, valid search types via pytest 2021-09-30 04:02:55 +02:00			`from enum import Enum`
Wrap search type enablement status in a search settings class - Cleaner, more idiomatic usage of a global variable - Simplifies mocking when testing client in pytest as setting wrapped in object rather than a simple type. So passed around by reference 2021-09-30 04:18:33 +02:00			`from dataclasses import dataclass`
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00			`from pathlib import Path`

			`# Internal Packages`
Move tests out to project root. Use absolute import in project tests/ directory in project root is more standard. Just had to use absolute path for internal module imports to get it to work 2021-09-30 13:12:14 +02:00			`from src.utils.helpers import get_from_dict`
Only allow supported search types to /search, /regenerate APIs - Use a SearchType to limit types that can be passed by user - FastAPI automatically validates type passed in query param - Available type options show up in Swagger UI, FastAPI docs - controller code looks neater instead of doing string comparisons for type - Test invalid, valid search types via pytest 2021-09-30 04:02:55 +02:00

			`class SearchType(str, Enum):`
			`Notes = "notes"`
			`Ledger = "ledger"`
			`Music = "music"`
			`Image = "image"`

Wrap search type enablement status in a search settings class - Cleaner, more idiomatic usage of a global variable - Simplifies mocking when testing client in pytest as setting wrapped in object rather than a simple type. So passed around by reference 2021-09-30 04:18:33 +02:00
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00			`class TextSearchModel():`
			`def __init__(self, entries, corpus_embeddings, bi_encoder, cross_encoder, top_k, verbose):`
Wrap asymmetric search model into SearchModels. Test notes search end-to-end - Wrap asymmetric search model parameters into AsymmetricSearchModel class - Create wrapper for all search type models. Put notes search model into it - Test notes search end-to-end from client API layer to results. Use model built on test data 2021-09-30 05:24:27 +02:00			`self.entries = entries`
			`self.corpus_embeddings = corpus_embeddings`
			`self.bi_encoder = bi_encoder`
			`self.cross_encoder = cross_encoder`
			`self.top_k = top_k`
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00			`self.verbose = verbose`
Use type specific model for other search types too. Expose them via SearchModels - Wrap Image, Music, Ledger search into the type of SearchModel they use Similar to what was done for notes model by wrapping its config into an AsymmetricSearchModel. - Use the uber wrapper class to expose all type specific search models 2021-09-30 06:09:42 +02:00

			`class ImageSearchModel():`
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00			`def __init__(self, image_names, image_embeddings, image_metadata_embeddings, image_encoder, verbose):`
			`self.image_encoder = image_encoder`
Use type specific model for other search types too. Expose them via SearchModels - Wrap Image, Music, Ledger search into the type of SearchModel they use Similar to what was done for notes model by wrapping its config into an AsymmetricSearchModel. - Use the uber wrapper class to expose all type specific search models 2021-09-30 06:09:42 +02:00			`self.image_names = image_names`
			`self.image_embeddings = image_embeddings`
			`self.image_metadata_embeddings = image_metadata_embeddings`
			`self.image_encoder = image_encoder`
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00			`self.verbose = verbose`
Use type specific model for other search types too. Expose them via SearchModels - Wrap Image, Music, Ledger search into the type of SearchModel they use Similar to what was done for notes model by wrapping its config into an AsymmetricSearchModel. - Use the uber wrapper class to expose all type specific search models 2021-09-30 06:09:42 +02:00

Wrap asymmetric search model into SearchModels. Test notes search end-to-end - Wrap asymmetric search model parameters into AsymmetricSearchModel class - Create wrapper for all search type models. Put notes search model into it - Test notes search end-to-end from client API layer to results. Use model built on test data 2021-09-30 05:24:27 +02:00			`@dataclass`
			`class SearchModels():`
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00			`notes_search: TextSearchModel = None`
			`ledger_search: TextSearchModel = None`
			`music_search: TextSearchModel = None`
Use type specific model for other search types too. Expose them via SearchModels - Wrap Image, Music, Ledger search into the type of SearchModel they use Similar to what was done for notes model by wrapping its config into an AsymmetricSearchModel. - Use the uber wrapper class to expose all type specific search models 2021-09-30 06:09:42 +02:00			`image_search: ImageSearchModel = None`
Modularize Code. Wrap Search, Model Config in Classes. Add Tests Details - Rename method query_* to query in search_types for standardization - Wrapping Config code in classes simplified mocking test config - Reduce args being passed to a function by passing it as single argument wrapped in a class - Minimize setup in main.py:__main__. Put most of it into functions. These functions can be mocked if required in tests later too. Setup Flow: CLI_Args|Config_YAML -> (Text|Image)SearchConfig -> (Text|Image)SearchModel 2021-09-30 11:04:04 +02:00

			`class TextSearchConfig():`
			`def __init__(self, input_files, input_filter, compressed_jsonl, embeddings_file, verbose):`
			`self.input_files = input_files`
			`self.input_filter = input_filter`
			`self.compressed_jsonl = Path(compressed_jsonl)`
			`self.embeddings_file = Path(embeddings_file)`
			`self.verbose = verbose`


			`def create_from_dictionary(config, key_tree, verbose):`
			`text_config = get_from_dict(config, *key_tree)`
			`search_enabled = text_config and ('input-files' in text_config or 'input-filter' in text_config)`
			`if not search_enabled:`
			`return None`

			`return TextSearchConfig(`
			`input_files = text_config['input-files'],`
			`input_filter = text_config['input-filter'],`
			`compressed_jsonl = Path(text_config['compressed-jsonl']),`
			`embeddings_file = Path(text_config['embeddings-file']),`
			`verbose = verbose)`


			`class ImageSearchConfig():`
			`def __init__(self, input_directory, embeddings_file, batch_size, use_xmp_metadata, verbose):`
			`self.input_directory = input_directory`
			`self.embeddings_file = Path(embeddings_file)`
			`self.batch_size = batch_size`
			`self.use_xmp_metadata = use_xmp_metadata`
			`self.verbose = verbose`

			`def create_from_dictionary(config, key_tree, verbose):`
			`image_config = get_from_dict(config, *key_tree)`
			`search_enabled = image_config and 'input-directory' in image_config`
			`if not search_enabled:`
			`return None`

			`return ImageSearchConfig(`
			`input_directory = Path(image_config['input-directory']),`
			`embeddings_file = Path(image_config['embeddings-file']),`
			`batch_size = image_config['batch-size'],`
			`use_xmp_metadata = {'yes': True, 'no': False}[image_config['use-xmp-metadata']],`
			`verbose = verbose)`


			`@dataclass`
			`class SearchConfig():`
			`notes: TextSearchConfig = None`
			`ledger: TextSearchConfig = None`
			`music: TextSearchConfig = None`
			`image: ImageSearchConfig = None`
Make conversation processor configurable 2021-11-26 20:56:26 +01:00

			`class ConversationProcessorConfig():`
Wire up GPT understand method to /chat API. Log conversation metadata too 2021-11-27 19:34:39 +01:00			`def __init__(self, conversation_logfile, chat_log, meta_log, openai_api_key, verbose):`
Make conversation processor configurable 2021-11-26 20:56:26 +01:00			`self.openai_api_key = openai_api_key`
			`self.conversation_logfile = conversation_logfile`
Wire up GPT understand method to /chat API. Log conversation metadata too 2021-11-27 19:34:39 +01:00			`self.chat_log = chat_log`
			`self.meta_log = meta_log`
Make conversation processor configurable 2021-11-26 20:56:26 +01:00			`self.verbose = verbose`

			`def create_from_dictionary(config, key_tree, verbose):`
			`conversation_config = get_from_dict(config, *key_tree)`
			`if not conversation_config:`
			`return None`

			`return ConversationProcessorConfig(`
			`openai_api_key = conversation_config['openai-api-key'],`
Wire up GPT understand method to /chat API. Log conversation metadata too 2021-11-27 19:34:39 +01:00			`chat_log = '',`
			`meta_log = [],`
Make conversation processor configurable 2021-11-26 20:56:26 +01:00			`conversation_logfile = Path(conversation_config['conversation-logfile']),`
			`verbose = verbose)`


			`@dataclass`
			`class ProcessorConfig():`
			`conversation: ConversationProcessorConfig = None`