Address Notion, Image tech debt in indexing code path (#687)

* Add support for using OAuth2.0 in the Notion integration * Add notion to the admin page * Remove unnecessary content_index and image search/setup references * Trigger background job to start indexing Notion after user configures it * Add a log line when a new Notion integration is setup * Fix references to the configure_content methods
2025-02-18 22:54:20 +00:00 · 2024-04-04 23:40:03 -07:00 · 2024-04-04 23:40:03 -07:00 · f57f9f672d
commit f57f9f672d
parent 69dee75c34
16 changed files with 145 additions and 599 deletions
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@ -34,7 +34,7 @@ from khoj.database.adapters import (
 )
 from khoj.database.models import ClientApplication, KhojUser, Subscription
 from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
-from khoj.routers.indexer import configure_content, configure_search, load_content
+from khoj.routers.indexer import configure_content, configure_search
 from khoj.routers.twilio import is_twilio_enabled
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType
@ -245,16 +245,12 @@ def initialize_content(regenerate: bool, search_type: Optional[SearchType] = Non
    if state.search_models:
        try:
            if init:
-                logger.info("📬 Initializing content index...")
+                logger.info("📬 No-op...")
                state.content_index = load_content(state.config.content_type, state.content_index, state.search_models)
            else:
                logger.info("📬 Updating content index...")
                all_files = collect_files(user=user)
-                state.content_index, status = configure_content(
+                status = configure_content(
                    state.content_index,
                    state.config.content_type,
                    all_files,
                    state.search_models,
                    regenerate,
                    search_type,
                    user=user,
@ -272,6 +268,7 @@ def configure_routes(app):
    from khoj.routers.api_chat import api_chat
    from khoj.routers.api_config import api_config
    from khoj.routers.indexer import indexer
    from khoj.routers.notion import notion_router
    from khoj.routers.web_client import web_client
    app.include_router(api, prefix="/api")
@ -279,6 +276,7 @@ def configure_routes(app):
    app.include_router(api_agents, prefix="/api/agents")
    app.include_router(api_config, prefix="/api/config")
    app.include_router(indexer, prefix="/api/v1/index")
    app.include_router(notion_router, prefix="/api/notion")
    app.include_router(web_client)
    if not state.anonymous_mode:
@ -311,13 +309,9 @@ def update_search_index():
        logger.info("📬 Updating content index via Scheduler")
        for user in get_all_users():
            all_files = collect_files(user=user)
-            state.content_index, success = configure_content(
+            success = configure_content(all_files, user=user)
                state.content_index, state.config.content_type, all_files, state.search_models, user=user
            )
        all_files = collect_files(user=None)
-        state.content_index, success = configure_content(
+        success = configure_content(all_files, user=None)
            state.content_index, state.config.content_type, all_files, state.search_models, user=None
        )
        if not success:
            raise RuntimeError("Failed to update content index")
        logger.info("📪 Content index updated via Scheduler")
--- a/src/khoj/database/adapters/init.py
+++ b/src/khoj/database/adapters/init.py
@ -259,6 +259,10 @@ async def get_user_by_email(email: str) -> KhojUser:
    return await KhojUser.objects.filter(email=email).afirst()
 async def aget_user_by_uuid(uuid: str) -> KhojUser:
    return await KhojUser.objects.filter(uuid=uuid).afirst()
 async def get_user_by_token(token: dict) -> KhojUser:
    google_user = await GoogleUser.objects.filter(sub=token.get("sub")).select_related("user").afirst()
    if not google_user:
--- a/src/khoj/database/admin.py
+++ b/src/khoj/database/admin.py
@ -11,7 +11,9 @@ from khoj.database.models import (
    ClientApplication,
    Conversation,
    Entry,
    GithubConfig,
    KhojUser,
    NotionConfig,
    OfflineChatProcessorConversationConfig,
    OpenAIProcessorConversationConfig,
    ReflectiveQuestion,
@ -52,6 +54,8 @@ admin.site.register(UserSearchModelConfig)
 admin.site.register(TextToImageModelConfig)
 admin.site.register(ClientApplication)
 admin.site.register(Agent)
 admin.site.register(GithubConfig)
 admin.site.register(NotionConfig)
@admin.register(Entry)
--- a/src/khoj/interface/web/config.html
+++ b/src/khoj/interface/web/config.html
@ -109,14 +109,23 @@
                    <p class="card-description">Sync your Notion pages</p>
                </div>
                <div class="card-action-row">
                    {% if current_model_state.notion %}
                    <a class="card-button" href="/config/content-source/notion">
                        {% if current_model_state.notion %}
                            Update
                        {% else %}
                            Setup
                        {% endif %}
                        <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
                    </a>
                    {% elif notion_oauth_url %}
                    <a class="card-button" href="{{ notion_oauth_url }}">
                            Connect
                        <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
                    </a>
                    {% else %}
                    <a class="card-button" href="/config/content-source/notion">
                            Setup
                        <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
                    </a>
                    {% endif %}
                    <div id="clear-notion"
                        class="card-action-row"
                        style="display: {% if not current_model_state.notion %}none{% endif %}">
--- a/src/khoj/interface/web/content_source_notion_input.html
+++ b/src/khoj/interface/web/content_source_notion_input.html
@ -5,11 +5,6 @@
        <h2 class="section-title">
            <img class="card-icon" src="/static/assets/icons/notion.svg?v={{ khoj_version }}" alt="Notion">
            <span class="card-title-text">Notion</span>
            <div class="instructions">
                <a href="https://docs.khoj.dev/#/notion_integration">ⓘ Help</a>
            </div>
        </h2>
        <form>
            <table>
                <tr>
                    <td>
@ -22,7 +17,7 @@
            </table>
            <div class="section">
                <div id="success" style="display: none;"></div>
-                <button id="submit" type="submit">Save</button>
+                <button id="submit" type="submit">Sync to Update</button>
            </div>
        </form>
    </div>
@ -43,7 +38,7 @@
        const submitButton = document.getElementById("submit");
        submitButton.disabled = true;
-        submitButton.innerHTML = "Saving...";
+        submitButton.innerHTML = "Syncing...";
        // Save Notion config on server
        const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@ -33,7 +33,7 @@ from khoj.routers.helpers import (
 from khoj.search_filter.date_filter import DateFilter
 from khoj.search_filter.file_filter import FileFilter
 from khoj.search_filter.word_filter import WordFilter
-from khoj.search_type import image_search, text_search
+from khoj.search_type import text_search
 from khoj.utils import constants, state
 from khoj.utils.config import OfflineChatProcessorModel
 from khoj.utils.helpers import ConversationCommand, timer
@ -145,41 +145,17 @@ async def execute_search(
                )
            ]
        elif (t == SearchType.Image) and state.content_index.image and state.search_models.image_search:
            # query images
            search_futures += [
                executor.submit(
                    image_search.query,
                    user_query,
                    results_count,
                    state.search_models.image_search,
                    state.content_index.image,
                )
            ]
        # Query across each requested content types in parallel
        with timer("Query took", logger):
            for search_future in concurrent.futures.as_completed(search_futures):
-                if t == SearchType.Image and state.content_index.image:
+                hits = await search_future.result()
-                    hits = await search_future.result()
+                # Collate results
-                    output_directory = constants.web_directory / "images"
+                results += text_search.collate_results(hits, dedupe=dedupe)
                    # Collate results
                    results += image_search.collate_results(
                        hits,
                        image_names=state.content_index.image.image_names,
                        output_directory=output_directory,
                        image_files_url="/static/images",
                        count=results_count,
                    )
                else:
                    hits = await search_future.result()
                    # Collate results
                    results += text_search.collate_results(hits, dedupe=dedupe)
-                    # Sort results across all content types and take top results
+                # Sort results across all content types and take top results
-                    results = text_search.rerank_and_sort_results(
+                results = text_search.rerank_and_sort_results(
-                        results, query=defiltered_query, rank_results=r, search_model_name=search_model.name
+                    results, query=defiltered_query, rank_results=r, search_model_name=search_model.name
-                    )[:results_count]
+                )[:results_count]
    # Cache results
    if user:
@ -214,8 +190,6 @@ def update(
        components = []
        if state.search_models:
            components.append("Search models")
        if state.content_index:
            components.append("Content index")
        components_msg = ", ".join(components)
        logger.info(f"📪 {components_msg} updated via API")
--- a/src/khoj/routers/api_config.py
+++ b/src/khoj/routers/api_config.py
@ -38,7 +38,7 @@ logger = logging.getLogger(__name__)
 def map_config_to_object(content_source: str):
    if content_source == DbEntry.EntrySource.GITHUB:
        return GithubConfig
-    if content_source == DbEntry.EntrySource.GITHUB:
+    if content_source == DbEntry.EntrySource.NOTION:
        return NotionConfig
    if content_source == DbEntry.EntrySource.COMPUTER:
        return "Computer"
--- a/src/khoj/routers/indexer.py
+++ b/src/khoj/routers/indexer.py
@ -14,9 +14,9 @@ from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.routers.helpers import ApiIndexedDataLimiter, update_telemetry_state
-from khoj.search_type import image_search, text_search
+from khoj.search_type import text_search
 from khoj.utils import constants, state
-from khoj.utils.config import ContentIndex, SearchModels
+from khoj.utils.config import SearchModels
 from khoj.utils.helpers import LRU, get_file_type
 from khoj.utils.rawconfig import ContentConfig, FullConfig, SearchConfig
 from khoj.utils.yaml import save_config_to_file_updated_state
@ -105,13 +105,10 @@ async def update(
        # Extract required fields from config
        loop = asyncio.get_event_loop()
-        state.content_index, success = await loop.run_in_executor(
+        success = await loop.run_in_executor(
            None,
            configure_content,
            state.content_index,
            state.config.content_type,
            indexer_input.model_dump(),
            state.search_models,
            force,
            t,
            False,
@ -159,23 +156,17 @@ def configure_search(search_models: SearchModels, search_config: Optional[Search
    if search_config and search_config.image:
        logger.info("🔍 🌄 Setting up image search model")
        search_models.image_search = image_search.initialize_model(search_config.image)
    return search_models
 def configure_content(
    content_index: Optional[ContentIndex],
    content_config: Optional[ContentConfig],
    files: Optional[dict[str, dict[str, str]]],
    search_models: SearchModels,
    regenerate: bool = False,
    t: Optional[state.SearchType] = state.SearchType.All,
    full_corpus: bool = True,
    user: KhojUser = None,
-) -> tuple[Optional[ContentIndex], bool]:
+) -> bool:
    content_index = ContentIndex()
    success = True
    if t == None:
        t = state.SearchType.All
@ -185,7 +176,7 @@ def configure_content(
    if t is not None and not t.value in [type.value for type in state.SearchType]:
        logger.warning(f"🚨 Invalid search type: {t}")
-        return None, False
+        return False
    search_type = t.value if t else None
@ -193,7 +184,7 @@ def configure_content(
    if files is None:
        logger.warning(f"🚨 No files to process for {search_type} search.")
-        return None, True
+        return True
    try:
        # Initialize Org Notes Search
@ -266,24 +257,6 @@ def configure_content(
        logger.error(f"🚨 Failed to setup plaintext: {e}", exc_info=True)
        success = False
    try:
        # Initialize Image Search
        if (
            (search_type == state.SearchType.All.value or search_type == state.SearchType.Image.value)
            and content_config
            and content_config.image
            and search_models.image_search
        ):
            logger.info("🌄 Setting up search for images")
            # Extract Entries, Generate Image Embeddings
            content_index.image = image_search.setup(
                content_config.image, search_models.image_search.image_encoder, regenerate=regenerate
            )
    except Exception as e:
        logger.error(f"🚨 Failed to setup images: {e}", exc_info=True)
        success = False
    try:
        if no_documents:
            github_config = GithubConfig.objects.filter(user=user).prefetch_related("githubrepoconfig").first()
@ -330,23 +303,4 @@ def configure_content(
    if user:
        state.query_cache[user.uuid] = LRU()
-    return content_index, success
+    return success
 def load_content(
    content_config: Optional[ContentConfig],
    content_index: Optional[ContentIndex],
    search_models: SearchModels,
 ):
    if content_config is None:
        logger.debug("🚨 No Content configuration available.")
        return None
    if content_index is None:
        content_index = ContentIndex()
    if content_config.image:
        logger.info("🌄 Loading images")
        content_index.image = image_search.setup(
            content_config.image, search_models.image_search.image_encoder, regenerate=False
        )
    return content_index
--- a/src/khoj/routers/notion.py
+++ b/src/khoj/routers/notion.py
@ -0,0 +1,89 @@
 import asyncio
 import base64
 import json
 import logging
 import os
 from concurrent.futures import ThreadPoolExecutor
 import requests
 from fastapi import APIRouter, BackgroundTasks, Request, Response
 from starlette.responses import RedirectResponse
 from khoj.database.adapters import aget_user_by_uuid
 from khoj.database.models import KhojUser, NotionConfig
 from khoj.routers.indexer import configure_content
 from khoj.utils.state import SearchType
 NOTION_OAUTH_CLIENT_ID = os.getenv("NOTION_OAUTH_CLIENT_ID")
 NOTION_OAUTH_CLIENT_SECRET = os.getenv("NOTION_OAUTH_CLIENT_SECRET")
 NOTION_REDIRECT_URI = os.getenv("NOTION_REDIRECT_URI")
 notion_router = APIRouter()
 executor = ThreadPoolExecutor()
 logger = logging.getLogger(__name__)
 def get_notion_auth_url(user: KhojUser):
    if not NOTION_OAUTH_CLIENT_ID or not NOTION_OAUTH_CLIENT_SECRET or not NOTION_REDIRECT_URI:
        return None
    return f"https://api.notion.com/v1/oauth/authorize?client_id={NOTION_OAUTH_CLIENT_ID}&redirect_uri={NOTION_REDIRECT_URI}&response_type=code&state={user.uuid}"
 async def run_in_executor(func, *args):
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(executor, func, *args)
@notion_router.get("/auth/callback")
 async def notion_auth_callback(request: Request, background_tasks: BackgroundTasks):
    code = request.query_params.get("code")
    state = request.query_params.get("state")
    if not code or not state:
        return Response("Missing code or state", status_code=400)
    user: KhojUser = await aget_user_by_uuid(state)
    NotionConfig.objects.filter(user=user).adelete()
    if not user:
        raise Exception("User not found")
    bearer_token = f"{NOTION_OAUTH_CLIENT_ID}:{NOTION_OAUTH_CLIENT_SECRET}"
    base64_encoded_token = base64.b64encode(bearer_token.encode()).decode()
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Basic {base64_encoded_token}",
    }
    data = {
        "grant_type": "authorization_code",
        "code": code,
        "redirect_uri": NOTION_REDIRECT_URI,
    }
    response = requests.post("https://api.notion.com/v1/oauth/token", data=json.dumps(data), headers=headers)
    final_response = response.json()
    access_token = final_response.get("access_token")
    NotionConfig.objects.acreate(token=access_token, user=user)
    owner = final_response.get("owner")
    workspace_id = final_response.get("workspace_id")
    workspace_name = final_response.get("workspace_name")
    bot_id = final_response.get("bot_id")
    logger.info(
        f"Notion integration. Owner: {owner}, Workspace ID: {workspace_id}, Workspace Name: {workspace_name}, Bot ID: {bot_id}"
    )
    notion_redirect = str(request.app.url_path_for("notion_config_page"))
    # Trigger an async job to configure_content. Let it run without blocking the response.
    background_tasks.add_task(run_in_executor, configure_content, {}, False, SearchType.Notion, True, user)
    return RedirectResponse(notion_redirect)
--- a/src/khoj/routers/web_client.py
+++ b/src/khoj/routers/web_client.py
@ -19,6 +19,7 @@ from khoj.database.adapters import (
    get_user_subscription_state,
 )
 from khoj.database.models import KhojUser
 from khoj.routers.notion import get_notion_auth_url
 from khoj.routers.twilio import is_twilio_enabled
 from khoj.utils import constants, state
 from khoj.utils.rawconfig import (
@ -244,6 +245,8 @@ def config_page(request: Request):
    current_search_model_option = adapters.get_user_search_model_or_default(user)
    notion_oauth_url = get_notion_auth_url(user)
    return templates.TemplateResponse(
        "config.html",
        context={
@ -267,6 +270,7 @@ def config_page(request: Request):
            "phone_number": user.phone_number,
            "is_phone_number_verified": user.verified_phone_number,
            "khoj_version": state.khoj_version,
            "notion_oauth_url": notion_oauth_url,
        },
    )
@ -324,7 +328,7 @@ def notion_config_page(request: Request):
        token=current_notion_config.token if current_notion_config else "",
    )
-    current_config = json.loads(current_config.json())
+    current_config = json.loads(current_config.model_dump_json())
    return templates.TemplateResponse(
        "content_source_notion_input.html",
--- a/src/khoj/search_type/image_search.py
+++ b/src/khoj/search_type/image_search.py
@ -1,272 +0,0 @@
 import copy
 import glob
 import logging
 import math
 import pathlib
 import shutil
 from typing import List
 import torch
 from PIL import Image
 from sentence_transformers import SentenceTransformer, util
 from tqdm import trange
 from khoj.utils.config import ImageContent, ImageSearchModel
 from khoj.utils.helpers import (
    get_absolute_path,
    get_from_dict,
    load_model,
    resolve_absolute_path,
    timer,
 )
 from khoj.utils.models import BaseEncoder
 from khoj.utils.rawconfig import ImageContentConfig, ImageSearchConfig, SearchResponse
 # Create Logger
 logger = logging.getLogger(__name__)
 def initialize_model(search_config: ImageSearchConfig):
    # Convert model directory to absolute path
    search_config.model_directory = resolve_absolute_path(search_config.model_directory)
    # Create model directory if it doesn't exist
    search_config.model_directory.parent.mkdir(parents=True, exist_ok=True)
    # Load the CLIP model
    encoder = load_model(
        model_dir=search_config.model_directory,
        model_name=search_config.encoder,
        model_type=search_config.encoder_type or SentenceTransformer,
    )
    return ImageSearchModel(encoder)
 def extract_entries(image_directories):
    image_names = []
    for image_directory in image_directories:
        image_directory = resolve_absolute_path(image_directory, strict=True)
        image_names.extend(list(image_directory.glob("*.jpg")))
        image_names.extend(list(image_directory.glob("*.jpeg")))
    if logger.level >= logging.DEBUG:
        image_directory_names = ", ".join([str(image_directory) for image_directory in image_directories])
        logger.debug(f"Found {len(image_names)} images in {image_directory_names}")
    return sorted(image_names)
 def compute_embeddings(image_names, encoder, embeddings_file, batch_size=50, use_xmp_metadata=False, regenerate=False):
    "Compute (and Save) Embeddings or Load Pre-Computed Embeddings"
    image_embeddings = compute_image_embeddings(image_names, encoder, embeddings_file, batch_size, regenerate)
    image_metadata_embeddings = compute_metadata_embeddings(
        image_names, encoder, embeddings_file, batch_size, use_xmp_metadata, regenerate
    )
    return image_embeddings, image_metadata_embeddings
 def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=50, regenerate=False):
    # Load pre-computed image embeddings from file if exists
    if resolve_absolute_path(embeddings_file).exists() and not regenerate:
        image_embeddings = torch.load(embeddings_file)
        logger.debug(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}")
    # Else compute the image embeddings from scratch, which can take a while
    else:
        image_embeddings = []
        for index in trange(0, len(image_names), batch_size):
            images = []
            for image_name in image_names[index : index + batch_size]:
                image = Image.open(image_name)
                # Resize images to max width of 640px for faster processing
                image.thumbnail((640, image.height))
                images += [image]
            image_embeddings += encoder.encode(images, convert_to_tensor=True, batch_size=min(len(images), batch_size))
        # Create directory for embeddings file, if it doesn't exist
        embeddings_file.parent.mkdir(parents=True, exist_ok=True)
        # Save computed image embeddings to file
        torch.save(image_embeddings, embeddings_file)
        logger.info(f"📩 Saved computed image embeddings to {embeddings_file}")
    return image_embeddings
 def compute_metadata_embeddings(
    image_names, encoder, embeddings_file, batch_size=50, use_xmp_metadata=False, regenerate=False, verbose=0
 ):
    image_metadata_embeddings = None
    # Load pre-computed image metadata embedding file if exists
    if use_xmp_metadata and resolve_absolute_path(f"{embeddings_file}_metadata").exists() and not regenerate:
        image_metadata_embeddings = torch.load(f"{embeddings_file}_metadata")
        logger.debug(f"Loaded image metadata embeddings from {embeddings_file}_metadata")
    # Else compute the image metadata embeddings from scratch, which can take a while
    if use_xmp_metadata and image_metadata_embeddings is None:
        image_metadata_embeddings = []
        for index in trange(0, len(image_names), batch_size):
            image_metadata = [
                extract_metadata(image_name, verbose) for image_name in image_names[index : index + batch_size]
            ]
            try:
                image_metadata_embeddings += encoder.encode(
                    image_metadata, convert_to_tensor=True, batch_size=min(len(image_metadata), batch_size)
                )
            except RuntimeError as e:
                logger.error(
                    f"Error encoding metadata for images starting from\n\tindex: {index},\n\timages: {image_names[index:index+batch_size]}\nException: {e}"
                )
                continue
        torch.save(image_metadata_embeddings, f"{embeddings_file}_metadata")
        logger.info(f"📩 Saved computed image metadata embeddings to {embeddings_file}_metadata")
    return image_metadata_embeddings
 def extract_metadata(image_name):
    image_xmp_metadata = Image.open(image_name).getxmp()
    image_description = get_from_dict(
        image_xmp_metadata, "xmpmeta", "RDF", "Description", "description", "Alt", "li", "text"
    )
    image_subjects = get_from_dict(image_xmp_metadata, "xmpmeta", "RDF", "Description", "subject", "Bag", "li")
    image_metadata_subjects = set([subject.split(":")[1] for subject in image_subjects if ":" in subject])
    image_processed_metadata = image_description
    if len(image_metadata_subjects) > 0:
        image_processed_metadata += ". " + ", ".join(image_metadata_subjects)
    logger.debug(f"{image_name}:\t{image_processed_metadata}")
    return image_processed_metadata
 async def query(
    raw_query, count, search_model: ImageSearchModel, content: ImageContent, score_threshold: float = math.inf
 ):
    # Set query to image content if query is of form file:/path/to/file.png
    if raw_query.startswith("file:") and pathlib.Path(raw_query[5:]).is_file():
        query_imagepath = resolve_absolute_path(pathlib.Path(raw_query[5:]), strict=True)
        query = copy.deepcopy(Image.open(query_imagepath))
        query.thumbnail((640, query.height))  # scale down image for faster processing
        logger.info(f"🔎 Find Images by Image: {query_imagepath}")
    else:
        # Truncate words in query to stay below max_tokens supported by ML model
        max_words = 20
        query = " ".join(raw_query.split()[:max_words])
        logger.info(f"🔎 Find Images by Text: {query}")
    # Now we encode the query (which can either be an image or a text string)
    with timer("Query Encode Time", logger):
        query_embedding = search_model.image_encoder.encode([query], convert_to_tensor=True, show_progress_bar=False)
    # Compute top_k ranked images based on cosine-similarity b/w query and all image embeddings.
    with timer("Search Time", logger):
        image_hits = {
            # Map scores to distance metric by multiplying by -1
            result["corpus_id"]: {"image_score": -1 * result["score"], "score": -1 * result["score"]}
            for result in util.semantic_search(query_embedding, content.image_embeddings, top_k=count)[0]
        }
    # Compute top_k ranked images based on cosine-similarity b/w query and all image metadata embeddings.
    if content.image_metadata_embeddings:
        with timer("Metadata Search Time", logger):
            metadata_hits = {
                result["corpus_id"]: result["score"]
                for result in util.semantic_search(query_embedding, content.image_metadata_embeddings, top_k=count)[0]
            }
        # Sum metadata, image scores of the highest ranked images
        for corpus_id, score in metadata_hits.items():
            scaling_factor = 0.33
            if "corpus_id" in image_hits:
                image_hits[corpus_id].update(
                    {
                        "metadata_score": score,
                        "score": image_hits[corpus_id].get("score", 0) + scaling_factor * score,
                    }
                )
            else:
                image_hits[corpus_id] = {"metadata_score": score, "score": scaling_factor * score}
    # Reformat results in original form from sentence transformer semantic_search()
    hits = [
        {
            "corpus_id": corpus_id,
            "score": scores["score"],
            "image_score": scores.get("image_score", 0),
            "metadata_score": scores.get("metadata_score", 0),
        }
        for corpus_id, scores in image_hits.items()
    ]
    # Filter results by score threshold
    hits = [hit for hit in hits if hit["image_score"] <= score_threshold]
    # Sort the images based on their combined metadata, image scores
    return sorted(hits, key=lambda hit: hit["score"], reverse=True)
 def collate_results(hits, image_names, output_directory, image_files_url, count=5) -> List[SearchResponse]:
    results: List[SearchResponse] = []
    for index, hit in enumerate(hits[:count]):
        source_path = image_names[hit["corpus_id"]]
        target_image_name = f"{index}{source_path.suffix}"
        target_path = resolve_absolute_path(f"{output_directory}/{target_image_name}")
        # Create output directory, if it doesn't exist
        if not target_path.parent.exists():
            target_path.parent.mkdir(exist_ok=True)
        # Copy the image to the output directory
        shutil.copy(source_path, target_path)
        # Add the image metadata to the results
        results += [
            SearchResponse.model_validate(
                {
                    "entry": f"{image_files_url}/{target_image_name}",
                    "score": f"{hit['score']:.9f}",
                    "additional": {
                        "image_score": f"{hit['image_score']:.9f}",
                        "metadata_score": f"{hit['metadata_score']:.9f}",
                    },
                    "corpus_id": str(hit["corpus_id"]),
                }
            )
        ]
    return results
 def setup(config: ImageContentConfig, encoder: BaseEncoder, regenerate: bool) -> ImageContent:
    # Extract Entries
    absolute_image_files, filtered_image_files = set(), set()
    if config.input_directories:
        image_directories = [resolve_absolute_path(directory, strict=True) for directory in config.input_directories]
        absolute_image_files = set(extract_entries(image_directories))
    if config.input_filter:
        filtered_image_files = {
            filtered_file
            for input_filter in config.input_filter
            for filtered_file in glob.glob(get_absolute_path(input_filter))
        }
    all_image_files = sorted(list(absolute_image_files | filtered_image_files))
    # Compute or Load Embeddings
    embeddings_file = resolve_absolute_path(config.embeddings_file)
    image_embeddings, image_metadata_embeddings = compute_embeddings(
        all_image_files,
        encoder,
        embeddings_file,
        batch_size=config.batch_size,
        regenerate=regenerate,
        use_xmp_metadata=config.use_xmp_metadata,
    )
    return ImageContent(all_image_files, image_embeddings, image_metadata_embeddings)
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@ -58,15 +58,9 @@ class ImageSearchModel:
    image_encoder: BaseEncoder
@dataclass
 class ContentIndex:
    image: Optional[ImageContent] = None
@dataclass
 class SearchModels:
    text_search: Optional[TextSearchModel] = None
    image_search: Optional[ImageSearchModel] = None
@dataclass
--- a/src/khoj/utils/state.py
+++ b/src/khoj/utils/state.py
@ -9,7 +9,7 @@ from whisper import Whisper
 from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
 from khoj.utils import config as utils_config
-from khoj.utils.config import ContentIndex, OfflineChatProcessorModel, SearchModels
+from khoj.utils.config import OfflineChatProcessorModel, SearchModels
 from khoj.utils.helpers import LRU, get_device
 from khoj.utils.rawconfig import FullConfig
@ -18,7 +18,6 @@ config = FullConfig()
 search_models = SearchModels()
 embeddings_model: Dict[str, EmbeddingsModel] = None
 cross_encoder_model: Dict[str, CrossEncoderModel] = None
 content_index = ContentIndex()
 openai_client: OpenAI = None
 offline_chat_processor_config: OfflineChatProcessorModel = None
 whisper_model: Whisper = None
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -25,7 +25,7 @@ from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.processor.embeddings import CrossEncoderModel, EmbeddingsModel
 from khoj.routers.indexer import configure_content
-from khoj.search_type import image_search, text_search
+from khoj.search_type import text_search
 from khoj.utils import fs_syncer, state
 from khoj.utils.config import SearchModels
 from khoj.utils.constants import web_directory
@ -207,7 +207,6 @@ def openai_agent():
@pytest.fixture(scope="session")
 def search_models(search_config: SearchConfig):
    search_models = SearchModels()
    search_models.image_search = image_search.initialize_model(search_config.image)
    return search_models
@ -232,8 +231,6 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
        use_xmp_metadata=False,
    )
    image_search.setup(content_config.image, search_models.image_search.image_encoder, regenerate=False)
    LocalOrgConfig.objects.create(
        input_files=None,
        input_filter=["tests/data/org/*.org"],
@ -305,9 +302,7 @@ def chat_client_builder(search_config, user, index_content=True, require_auth=Fa
        # Index Markdown Content for Search
        all_files = fs_syncer.collect_files(user=user)
-        state.content_index, _ = configure_content(
+        success = configure_content(all_files, user=user)
            state.content_index, state.config.content_type, all_files, state.search_models, user=user
        )
    # Initialize Processor from Config
    if os.getenv("OPENAI_API_KEY"):
@ -349,16 +344,12 @@ def client(
    state.cross_encoder_model["default"] = CrossEncoderModel()
    # These lines help us Mock the Search models for these search types
    state.search_models.image_search = image_search.initialize_model(search_config.image)
    text_search.setup(
        OrgToEntries,
        get_sample_data("org"),
        regenerate=False,
        user=api_user.user,
    )
    state.content_index.image = image_search.setup(
        content_config.image, state.search_models.image_search, regenerate=False
    )
    text_search.setup(
        PlaintextToEntries,
        get_sample_data("plaintext"),
@ -388,9 +379,7 @@ def client_offline_chat(search_config: SearchConfig, default_user2: KhojUser):
    )
    all_files = fs_syncer.collect_files(user=default_user2)
-    configure_content(
+    configure_content(all_files, user=default_user2)
        state.content_index, state.config.content_type, all_files, state.search_models, user=default_user2
    )
    # Initialize Processor from Config
    OfflineChatProcessorConversationConfigFactory(enabled=True)
--- a/tests/test_client.py
+++ b/tests/test_client.py
@ -12,10 +12,9 @@ from khoj.configure import configure_routes, configure_search_types
 from khoj.database.adapters import EntryAdapters
 from khoj.database.models import KhojApiUser, KhojUser
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
-from khoj.search_type import image_search, text_search
+from khoj.search_type import text_search
 from khoj.utils import state
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
 from khoj.utils.state import config, content_index, search_models
 # Test
@ -298,34 +297,6 @@ def test_get_configured_types_with_no_content_config(fastapi_app: FastAPI):
    assert response.json() == ["all"]
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
 def test_image_search(client, content_config: ContentConfig, search_config: SearchConfig):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    query_expected_image_pairs = [
        ("kitten", "kitten_park.jpg"),
        ("a horse and dog on a leash", "horse_dog.jpg"),
        ("A guinea pig eating grass", "guineapig_grass.jpg"),
    ]
    for query, expected_image_name in query_expected_image_pairs:
        # Act
        response = client.get(f"/api/search?q={query}&n=1&t=image", headers=headers)
        # Assert
        assert response.status_code == 200
        actual_image = Image.open(BytesIO(client.get(response.json()[0]["entry"]).content))
        expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))
        # Assert
        assert expected_image == actual_image
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.django_db(transaction=True)
 def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
--- a/tests/test_image_search.py
+++ b/tests/test_image_search.py
@ -1,162 +0,0 @@
 # Standard Modules
 import logging
 from pathlib import Path
 import pytest
 from PIL import Image
 from khoj.search_type import image_search
 from khoj.utils.config import SearchModels
 from khoj.utils.constants import web_directory
 from khoj.utils.helpers import resolve_absolute_path
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
 from khoj.utils.state import content_index, search_models
 # Test
 # ----------------------------------------------------------------------------------------------------
 def test_image_search_setup(content_config: ContentConfig, search_models: SearchModels):
    # Act
    # Regenerate image search embeddings during image setup
    image_search_model = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=True
    )
    # Assert
    assert len(image_search_model.image_names) == 3
    assert len(image_search_model.image_embeddings) == 3
 # ----------------------------------------------------------------------------------------------------
 def test_image_metadata(content_config: ContentConfig):
    "Verify XMP Description and Subjects Extracted from Image"
    # Arrange
    expected_metadata_image_name_pairs = [
        (["Billi Ka Bacha.", "Cat", "Grass"], "kitten_park.jpg"),
        (["Pasture.", "Horse", "Dog"], "horse_dog.jpg"),
        (["Guinea Pig Eating Celery.", "Rodent", "Whiskers"], "guineapig_grass.jpg"),
    ]
    test_image_paths = [
        Path(content_config.image.input_directories[0] / image_name[1])
        for image_name in expected_metadata_image_name_pairs
    ]
    for expected_metadata, test_image_path in zip(expected_metadata_image_name_pairs, test_image_paths):
        # Act
        actual_metadata = image_search.extract_metadata(test_image_path)
        # Assert
        for expected_snippet in expected_metadata[0]:
            assert expected_snippet in actual_metadata
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
 async def test_image_search(content_config: ContentConfig, search_config: SearchConfig):
    # Arrange
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    output_directory = resolve_absolute_path(web_directory)
    query_expected_image_pairs = [
        ("kitten", "kitten_park.jpg"),
        ("horse and dog in a farm", "horse_dog.jpg"),
        ("A guinea pig eating grass", "guineapig_grass.jpg"),
    ]
    # Act
    for query, expected_image_name in query_expected_image_pairs:
        hits = await image_search.query(
            query, count=1, search_model=search_models.image_search, content=content_index.image
        )
        results = image_search.collate_results(
            hits,
            content_index.image.image_names,
            output_directory=output_directory,
            image_files_url="/static/images",
            count=1,
        )
        actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
        actual_image = Image.open(actual_image_path)
        expected_image = Image.open(content_config.image.input_directories[0].joinpath(expected_image_name))
        # Assert
        assert expected_image == actual_image
    # Cleanup
    # Delete the image files copied to results directory
    actual_image_path.unlink()
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
 async def test_image_search_query_truncated(content_config: ContentConfig, search_config: SearchConfig, caplog):
    # Arrange
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    max_words_supported = 10
    query = " ".join(["hello"] * 100)
    truncated_query = " ".join(["hello"] * max_words_supported)
    # Act
    try:
        with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
            await image_search.query(
                query, count=1, search_model=search_models.image_search, content=content_index.image
            )
    # Assert
    except RuntimeError as e:
        if "The size of tensor a (102) must match the size of tensor b (77)" in str(e):
            assert False, f"Query length exceeds max tokens supported by model\n"
    assert f"Find Images by Text: {truncated_query}" in caplog.text, "Query not truncated"
 # ----------------------------------------------------------------------------------------------------
@pytest.mark.anyio
 async def test_image_search_by_filepath(content_config: ContentConfig, search_config: SearchConfig, caplog):
    # Arrange
    search_models.image_search = image_search.initialize_model(search_config.image)
    content_index.image = image_search.setup(
        content_config.image, search_models.image_search.image_encoder, regenerate=False
    )
    output_directory = resolve_absolute_path(web_directory)
    image_directory = content_config.image.input_directories[0]
    query = f"file:{image_directory.joinpath('kitten_park.jpg')}"
    expected_image_path = f"{image_directory.joinpath('kitten_park.jpg')}"
    # Act
    with caplog.at_level(logging.INFO, logger="khoj.search_type.image_search"):
        hits = await image_search.query(
            query, count=1, search_model=search_models.image_search, content=content_index.image
        )
        results = image_search.collate_results(
            hits,
            content_index.image.image_names,
            output_directory=output_directory,
            image_files_url="/static/images",
            count=1,
        )
    actual_image_path = output_directory.joinpath(Path(results[0].entry).name)
    actual_image = Image.open(actual_image_path)
    expected_image = Image.open(expected_image_path)
    # Assert
    # Ensure file search triggered instead of query with file path as string
    assert (
        f"Find Images by Image: {resolve_absolute_path(expected_image_path)}" in caplog.text
    ), "File search not triggered"
    # Ensure the correct image is returned
    assert expected_image == actual_image, "Incorrect image returned by file search"
    # Cleanup
    # Delete the image files copied to results directory
    actual_image_path.unlink()