khoj/src/configure.py

# System Packages
import sys

# External Packages
import torch
import json

# Internal Packages
from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl
from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl
from src.processor.org_mode.org_to_jsonl import org_to_jsonl
from src.search_type import image_search, text_search
from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
from src.utils import state
from src.utils.helpers import get_absolute_path
from src.utils.rawconfig import FullConfig, ProcessorConfig


def configure_server(args, required=False):
    """Configure search models and processors from the parsed app arguments.

    When ``args.config`` is None nothing is configured: the function returns
    early, or prints a message and exits the process with status 1 if
    ``required`` is truthy. Otherwise ``args.config`` is stored on the shared
    ``state`` module and used to set ``state.model`` and
    ``state.processor_config``.
    """
    if args.config is not None:
        # Publish the config for the rest of the app
        state.config = args.config
    elif required:
        print('Exiting as Khoj is not configured. Configure the application to use it.')
        sys.exit(1)
    else:
        # Nothing to configure yet
        return

    # Set up the search models described by the config
    state.model = configure_search(
        state.model,
        state.config,
        args.regenerate,
        device=state.device,
        verbose=state.verbose,
    )

    # Set up the processors described by the config
    state.processor_config = configure_processor(
        args.config.processor,
        verbose=state.verbose,
    )


def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None, device=torch.device("cpu"), verbose: int = 0):
    """Set up the search models for the content types enabled in config.

    Args:
        model: Container whose ``*_search`` attributes are assigned in place.
        config: Application config. A search type is only set up when its
            ``config.content_type.<type>`` entry is truthy; the matching
            ``config.search_type`` entry is passed along as ``search_config``.
        regenerate: Forwarded unchanged to every ``setup`` call.
        t: Restrict setup to this search type. ``None`` (the default) sets up
            every search type that has content configured.
        device: Torch device forwarded to the text search setups.
        verbose: Verbosity level forwarded to every ``setup`` call.

    Returns:
        The same ``model`` object that was passed in.
    """
    def selected(search_type):
        # No explicit type requested means "configure everything"
        return t is None or t == search_type

    # Initialize Org Notes Search
    if selected(SearchType.Org) and config.content_type.org:
        # Extract Entries, Generate Notes Embeddings
        model.orgmode_search = text_search.setup(org_to_jsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)

    # Initialize Org Music Search
    if selected(SearchType.Music) and config.content_type.music:
        # Extract Entries, Generate Music Embeddings
        model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)

    # Initialize Markdown Search
    if selected(SearchType.Markdown) and config.content_type.markdown:
        # Extract Entries, Generate Markdown Embeddings
        model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)

    # Initialize Ledger Search
    if selected(SearchType.Ledger) and config.content_type.ledger:
        # Extract Entries, Generate Ledger Embeddings
        # Pass the device like the other text searches do, so the ledger setup
        # no longer silently falls back to text_search.setup's default device
        model.ledger_search = text_search.setup(beancount_to_jsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate, device=device, verbose=verbose)

    # Initialize Image Search
    if selected(SearchType.Image) and config.content_type.image:
        # Extract Entries, Generate Image Embeddings
        # NOTE(review): device is not forwarded here; confirm whether image_search.setup accepts one
        model.image_search = image_search.setup(config.content_type.image, search_config=config.search_type.image, regenerate=regenerate, verbose=verbose)

    return model


def configure_processor(processor_config: ProcessorConfig, verbose: int):
    """Build the processor model from the processor section of the config.

    Returns None when ``processor_config`` is falsy. Otherwise returns a new
    ``ProcessorConfigModel`` whose ``conversation`` attribute is set only if
    the config has a truthy ``conversation`` entry.
    """
    if not processor_config:
        return None

    processor_model = ProcessorConfigModel()

    # Conversation processor is optional; skip it when not configured
    conversation_config = processor_config.conversation
    if conversation_config:
        processor_model.conversation = configure_conversation_processor(conversation_config, verbose)

    return processor_model


def configure_conversation_processor(conversation_processor_config, verbose: int):
    """Build the conversation processor and load its saved conversation logs.

    Args:
        conversation_processor_config: Conversation section of the processor
            config, passed through to ``ConversationProcessorConfigModel``.
        verbose: Verbosity level passed through to
            ``ConversationProcessorConfigModel``.

    Returns:
        The conversation processor. When its conversation logfile exists,
        ``meta_log`` holds the parsed JSON content of that file. Otherwise
        ``meta_log`` is an empty dict and ``chat_session`` an empty string.

    Raises:
        json.JSONDecodeError: If the logfile exists but is not valid JSON.
    """
    conversation_processor = ConversationProcessorConfigModel(conversation_processor_config, verbose)

    conversation_logfile = conversation_processor.conversation_logfile
    if conversation_processor.verbose:
        print('INFO:\tLoading conversation logs from disk...')

    if conversation_logfile.expanduser().absolute().is_file():
        # Load Metadata Logs from Conversation Logfile
        # Read as UTF-8 explicitly rather than relying on the platform's default encoding
        with open(get_absolute_path(conversation_logfile), 'r', encoding='utf-8') as f:
            conversation_processor.meta_log = json.load(f)

        # Progress messages are only shown in verbose mode, like the one above
        if conversation_processor.verbose:
            print('INFO:\tConversation logs loaded from disk.')
    else:
        # Initialize Conversation Logs
        conversation_processor.meta_log = {}
        conversation_processor.chat_session = ""

    return conversation_processor
Refactor app start to start server even if backend not configured - Decouple configuring backend from starting server. Backend search and processors can be configured after the backend server has started - Set global state in main instead of in configure_server method. This allows the app to start even if configure_server exits early in the first run scenario, where no config available to configure server - Now start server, even if no config, before GUI started in main - This refactor of app startup flow will allow users to configure backend using the configure screen after server start 2022-08-10 23:13:14 +02:00			`# System Packages`
			`import sys`

Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00			`# External Packages`
			`import torch`
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`import json`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
			`# Internal Packages`
			`from src.processor.ledger.beancount_to_jsonl import beancount_to_jsonl`
			`from src.processor.markdown.markdown_to_jsonl import markdown_to_jsonl`
			`from src.processor.org_mode.org_to_jsonl import org_to_jsonl`
			`from src.search_type import image_search, text_search`
			`from src.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel`
Put global state variables into separate state module - Variables storing app, device state aren't constants. Do not mix with actual constants like empty_escape_sequence, web_directory 2022-08-06 02:05:35 +02:00			`from src.utils import state`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00			`from src.utils.helpers import get_absolute_path`
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`from src.utils.rawconfig import FullConfig, ProcessorConfig`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00

Refactor app start to start server even if backend not configured - Decouple configuring backend from starting server. Backend search and processors can be configured after the backend server has started - Set global state in main instead of in configure_server method. This allows the app to start even if configure_server exits early in the first run scenario, where no config available to configure server - Now start server, even if no config, before GUI started in main - This refactor of app startup flow will allow users to configure backend using the configure screen after server start 2022-08-10 23:13:14 +02:00			`def configure_server(args, required=False):`
			`if args.config is None:`
			`if required:`
			`print('Exiting as Khoj is not configured. Configure the application to use it.')`
			`sys.exit(1)`
			`else:`
			`return`
			`else:`
			`state.config = args.config`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
			`# Initialize the search model from Config`
Refactor app start to start server even if backend not configured - Decouple configuring backend from starting server. Backend search and processors can be configured after the backend server has started - Set global state in main instead of in configure_server method. This allows the app to start even if configure_server exits early in the first run scenario, where no config available to configure server - Now start server, even if no config, before GUI started in main - This refactor of app startup flow will allow users to configure backend using the configure screen after server start 2022-08-10 23:13:14 +02:00			`state.model = configure_search(state.model, state.config, args.regenerate, device=state.device, verbose=state.verbose)`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
			`# Initialize Processor from Config`
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`state.processor_config = configure_processor(args.config.processor, verbose=state.verbose)`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00

Rename initialize_{search,processor,server} to configure_{search,processor,server} - Search is being reconfigured multiple times in /regenerate and n/reload. More appropriate name is configure_ rather than initialize_ for it - Standardize name of methods under configure.py 2022-08-06 02:20:04 +02:00			`def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: SearchType = None, device=torch.device("cpu"), verbose: int = 0):`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00			`# Initialize Org Notes Search`
			`if (t == SearchType.Org or t == None) and config.content_type.org:`
			`# Extract Entries, Generate Notes Embeddings`
			`model.orgmode_search = text_search.setup(org_to_jsonl, config.content_type.org, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)`

			`# Initialize Org Music Search`
			`if (t == SearchType.Music or t == None) and config.content_type.music:`
			`# Extract Entries, Generate Music Embeddings`
			`model.music_search = text_search.setup(org_to_jsonl, config.content_type.music, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)`

			`# Initialize Markdown Search`
			`if (t == SearchType.Markdown or t == None) and config.content_type.markdown:`
			`# Extract Entries, Generate Markdown Embeddings`
			`model.markdown_search = text_search.setup(markdown_to_jsonl, config.content_type.markdown, search_config=config.search_type.asymmetric, regenerate=regenerate, device=device, verbose=verbose)`

			`# Initialize Ledger Search`
			`if (t == SearchType.Ledger or t == None) and config.content_type.ledger:`
			`# Extract Entries, Generate Ledger Embeddings`
			`model.ledger_search = text_search.setup(beancount_to_jsonl, config.content_type.ledger, search_config=config.search_type.symmetric, regenerate=regenerate, verbose=verbose)`

			`# Initialize Image Search`
			`if (t == SearchType.Image or t == None) and config.content_type.image:`
			`# Extract Entries, Generate Image Embeddings`
			`model.image_search = image_search.setup(config.content_type.image, search_config=config.search_type.image, regenerate=regenerate, verbose=verbose)`

			`return model`


Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`def configure_processor(processor_config: ProcessorConfig, verbose: int):`
			`if not processor_config:`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00			`return`

Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`processor = ProcessorConfigModel()`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
			`# Initialize Conversation Processor`
Only setup conversation processor if it has configuration set 2022-08-10 21:34:03 +02:00			`if processor_config.conversation:`
			`processor.conversation = configure_conversation_processor(processor_config.conversation, verbose)`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`return processor`


			`def configure_conversation_processor(conversation_processor_config, verbose: int):`
			`conversation_processor = ConversationProcessorConfigModel(conversation_processor_config, verbose)`

			`conversation_logfile = conversation_processor.conversation_logfile`
			`if conversation_processor.verbose:`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00			`print('INFO:\tLoading conversation logs from disk...')`

			`if conversation_logfile.expanduser().absolute().is_file():`
			`# Load Metadata Logs from Conversation Logfile`
			`with open(get_absolute_path(conversation_logfile), 'r') as f:`
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`conversation_processor.meta_log = json.load(f)`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
			`print('INFO:\tConversation logs loaded from disk.')`
			`else:`
			`# Initialize Conversation Logs`
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`conversation_processor.meta_log = {}`
			`conversation_processor.chat_session = ""`
Extract configure and routers from main.py into separate modules - Main.py was becoming too big to manage. It had both controllers/routers and component configurations (search, processors) in it - Now that the native app GUI code is also getting added to the main path, good time to split/modularize/clean main.py - Put global state into a separate file to share across modules 2022-08-06 01:37:52 +02:00
Extract conversation processor from config into separate function - Only pass processor config arg required by configure_processor. Not the unused full config object - Type arguments passed to methods configure processors - Import json for use by conversation processor to load logs 2022-08-10 21:30:13 +02:00			`return conversation_processor`