From 285d17af2add96208e2aa39c6053bd2e301736f1 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 6 Jun 2023 19:28:54 +0530 Subject: [PATCH 01/21] Search in parallel across all enabled content types requested via API - Update API to return content from all enabled content types when type is not set to specific type in HTTP request param - To do this efficiently run the search queries in parallel threads --- src/khoj/routers/api.py | 211 ++++++++++++++++++++++------------------ 1 file changed, 114 insertions(+), 97 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index f7658caa..93fa0fda 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -1,4 +1,6 @@ # Standard Packages +from collections import defaultdict +import concurrent.futures import math import yaml import logging @@ -121,6 +123,7 @@ def search( user_query = q.strip() results_count = n score_threshold = score_threshold if score_threshold is not None else -math.inf + search_futures = defaultdict(list) # return cached results, if available query_cache_key = f"{user_query}-{n}-{t}-{r}-{score_threshold}-{dedupe}" @@ -128,105 +131,119 @@ def search( logger.debug(f"Return response from query cache") return state.query_cache[query_cache_key] - if (t == SearchType.Org or t == None) and state.model.org_search: - # query org-mode notes + with concurrent.futures.ThreadPoolExecutor() as executor: + if (t == SearchType.Org or t == None) and state.model.org_search: + # query org-mode notes + search_futures[t] += [ + executor.submit( + text_search.query, + user_query, + state.model.org_search, + rank_results=r, + score_threshold=score_threshold, + dedupe=dedupe, + ) + ] + + if (t == SearchType.Markdown or t == None) and state.model.markdown_search: + # query markdown notes + search_futures[t] += [ + executor.submit( + text_search.query, + user_query, + state.model.markdown_search, + rank_results=r, + score_threshold=score_threshold, + dedupe=dedupe, + ) + ] + + if 
(t == SearchType.Pdf or t == None) and state.model.pdf_search: + # query pdf files + search_futures[t] += [ + executor.submit( + text_search.query, + user_query, + state.model.pdf_search, + rank_results=r, + score_threshold=score_threshold, + dedupe=dedupe, + ) + ] + + if (t == SearchType.Ledger or t == None) and state.model.ledger_search: + # query transactions + search_futures[t] += [ + executor.submit( + text_search.query, + user_query, + state.model.ledger_search, + rank_results=r, + score_threshold=score_threshold, + dedupe=dedupe, + ) + ] + + if (t == SearchType.Music or t == None) and state.model.music_search: + # query music library + search_futures[t] += [ + executor.submit( + text_search.query, + user_query, + state.model.music_search, + rank_results=r, + score_threshold=score_threshold, + dedupe=dedupe, + ) + ] + + if (t == SearchType.Image) and state.model.image_search: + # query images + search_futures[t] += [ + executor.submit( + image_search.query, + user_query, + results_count, + state.model.image_search, + score_threshold=score_threshold, + ) + ] + + if (t is None or t in SearchType) and state.model.plugin_search: + # query specified plugin type + search_future[t] += [ + executor.submit( + text_search.query, + user_query, + # Get plugin search model for specified search type, or the first one if none specified + state.model.plugin_search.get(t.value) or next(iter(state.model.plugin_search.values())), + rank_results=r, + score_threshold=score_threshold, + dedupe=dedupe, + ) + ] + + # Query across each requested content types in parallel with timer("Query took", logger): - hits, entries = text_search.query( - user_query, state.model.org_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe - ) + for search_future in search_futures[t]: + if t == SearchType.Image: + hits = search_futures.result() + output_directory = constants.web_directory / "images" + # Collate results + results += image_search.collate_results( + hits, + 
image_names=state.model.image_search.image_names, + output_directory=output_directory, + image_files_url="/static/images", + count=results_count, + ) + else: + hits, entries = search_future.result() + # Collate results + results += text_search.collate_results(hits, entries, results_count) - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) - - elif (t == SearchType.Markdown or t == None) and state.model.markdown_search: - # query markdown files - with timer("Query took", logger): - hits, entries = text_search.query( - user_query, state.model.markdown_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe - ) - - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) - - elif (t == SearchType.Pdf or t == None) and state.model.pdf_search: - # query pdf files - with timer("Query took", logger): - hits, entries = text_search.query( - user_query, state.model.pdf_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe - ) - - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) - - elif (t == SearchType.Github or t == None) and state.model.github_search: - # query github embeddings - with timer("Query took", logger): - hits, entries = text_search.query( - user_query, state.model.github_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe - ) - - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) - - elif (t == SearchType.Ledger or t == None) and state.model.ledger_search: - # query transactions - with timer("Query took", logger): - hits, entries = text_search.query( - user_query, state.model.ledger_search, rank_results=r, 
score_threshold=score_threshold, dedupe=dedupe - ) - - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) - - elif (t == SearchType.Music or t == None) and state.model.music_search: - # query music library - with timer("Query took", logger): - hits, entries = text_search.query( - user_query, state.model.music_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe - ) - - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) - - elif (t == SearchType.Image or t == None) and state.model.image_search: - # query images - with timer("Query took", logger): - hits = image_search.query( - user_query, results_count, state.model.image_search, score_threshold=score_threshold - ) - output_directory = constants.web_directory / "images" - - # collate and return results - with timer("Collating results took", logger): - results = image_search.collate_results( - hits, - image_names=state.model.image_search.image_names, - output_directory=output_directory, - image_files_url="/static/images", - count=results_count, - ) - - elif (t in SearchType or t == None) and state.model.plugin_search: - # query specified plugin type - with timer("Query took", logger): - hits, entries = text_search.query( - user_query, - # Get plugin search model for specified search type, or the first one if none specified - state.model.plugin_search.get(t.value) or next(iter(state.model.plugin_search.values())), - rank_results=r, - score_threshold=score_threshold, - dedupe=dedupe, - ) - - # collate and return results - with timer("Collating results took", logger): - results = text_search.collate_results(hits, entries, results_count) + # Sort results across all content types + results.sort(key=lambda x: float(x.score), reverse=True) # Cache results state.query_cache[query_cache_key] = results From 
db07362ca31be3d3dbf72a7b9ac3196c65b42d06 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Thu, 8 Jun 2023 13:37:19 +0530 Subject: [PATCH 02/21] Encode user query as same across search types to speed up query time - Add new filter abstract method to remove filter terms from query - Use the filter method to remove filter terms, encode this defiltered query and pass it to the query methods of each search types TODO: Encoding query is still taking 100-200 ms unlike before. Need to investigate why --- src/khoj/routers/api.py | 24 ++++++++++++++++++++++++ src/khoj/search_filter/base_filter.py | 4 ++++ src/khoj/search_filter/date_filter.py | 10 +++++++--- src/khoj/search_filter/file_filter.py | 7 ++++++- src/khoj/search_filter/word_filter.py | 5 ++++- src/khoj/search_type/text_search.py | 8 +++++--- 6 files changed, 50 insertions(+), 8 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 93fa0fda..35216343 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -10,12 +10,16 @@ from typing import List, Optional, Union # External Packages from fastapi import APIRouter from fastapi import HTTPException +from sentence_transformers import util # Internal Packages from khoj.configure import configure_processor, configure_search from khoj.processor.conversation.gpt import converse, extract_questions from khoj.processor.conversation.utils import message_to_log, message_to_prompt from khoj.search_type import image_search, text_search +from khoj.search_filter.date_filter import DateFilter +from khoj.search_filter.file_filter import FileFilter +from khoj.search_filter.word_filter import WordFilter from khoj.utils.helpers import log_telemetry, timer from khoj.utils.rawconfig import ( FullConfig, @@ -131,6 +135,20 @@ def search( logger.debug(f"Return response from query cache") return state.query_cache[query_cache_key] + # Encode query with filter terms removed + for filter in [DateFilter(), WordFilter(), FileFilter()]: + 
defiltered_query = filter.defilter(user_query) + + encoded_asymmetric_query = state.model.org_search.bi_encoder.encode( + [defiltered_query], convert_to_tensor=True, device=state.device + ) + encoded_asymmetric_query = util.normalize_embeddings(encoded_asymmetric_query) + + encoded_symmetric_query = state.model.org_search.bi_encoder.encode( + [defiltered_query], convert_to_tensor=True, device=state.device + ) + encoded_symmetric_query = util.normalize_embeddings(encoded_symmetric_query) + with concurrent.futures.ThreadPoolExecutor() as executor: if (t == SearchType.Org or t == None) and state.model.org_search: # query org-mode notes @@ -139,6 +157,7 @@ def search( text_search.query, user_query, state.model.org_search, + question_embedding=encoded_asymmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, @@ -152,6 +171,7 @@ def search( text_search.query, user_query, state.model.markdown_search, + question_embedding=encoded_asymmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, @@ -165,6 +185,7 @@ def search( text_search.query, user_query, state.model.pdf_search, + question_embedding=encoded_asymmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, @@ -178,6 +199,7 @@ def search( text_search.query, user_query, state.model.ledger_search, + question_embedding=encoded_symmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, @@ -191,6 +213,7 @@ def search( text_search.query, user_query, state.model.music_search, + question_embedding=encoded_asymmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, @@ -217,6 +240,7 @@ def search( user_query, # Get plugin search model for specified search type, or the first one if none specified state.model.plugin_search.get(t.value) or next(iter(state.model.plugin_search.values())), + question_embedding=encoded_asymmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, diff --git 
a/src/khoj/search_filter/base_filter.py b/src/khoj/search_filter/base_filter.py index c273f9b8..aa4fa2e4 100644 --- a/src/khoj/search_filter/base_filter.py +++ b/src/khoj/search_filter/base_filter.py @@ -18,3 +18,7 @@ class BaseFilter(ABC): @abstractmethod def apply(self, query: str, entries: List[Entry]) -> Tuple[str, Set[int]]: ... + + @abstractmethod + def defilter(self, query: str) -> str: + ... diff --git a/src/khoj/search_filter/date_filter.py b/src/khoj/search_filter/date_filter.py index 36dc7974..be07eefd 100644 --- a/src/khoj/search_filter/date_filter.py +++ b/src/khoj/search_filter/date_filter.py @@ -49,6 +49,12 @@ class DateFilter(BaseFilter): "Check if query contains date filters" return self.extract_date_range(raw_query) is not None + def defilter(self, query): + # remove date range filter from query + query = re.sub(rf"\s+{self.date_regex}", " ", query) + query = re.sub(r"\s{2,}", " ", query).strip() # remove multiple spaces + return query + def apply(self, query, entries): "Find entries containing any dates that fall within date range specified in query" # extract date range specified in date filter of query @@ -59,9 +65,7 @@ class DateFilter(BaseFilter): if query_daterange is None: return query, set(range(len(entries))) - # remove date range filter from query - query = re.sub(rf"\s+{self.date_regex}", " ", query) - query = re.sub(r"\s{2,}", " ", query).strip() # remove multiple spaces + query = self.defilter(query) # return results from cache if exists cache_key = tuple(query_daterange) diff --git a/src/khoj/search_filter/file_filter.py b/src/khoj/search_filter/file_filter.py index 28610796..26f416fe 100644 --- a/src/khoj/search_filter/file_filter.py +++ b/src/khoj/search_filter/file_filter.py @@ -28,6 +28,9 @@ class FileFilter(BaseFilter): def can_filter(self, raw_query): return re.search(self.file_filter_regex, raw_query) is not None + def defilter(self, query: str) -> str: + return re.sub(self.file_filter_regex, "", query).strip() + def 
apply(self, query, entries): # Extract file filters from raw query with timer("Extract files_to_search from query", logger): @@ -44,8 +47,10 @@ class FileFilter(BaseFilter): else: files_to_search += [file] + # Remove filter terms from original query + query = self.defilter(query) + # Return item from cache if exists - query = re.sub(self.file_filter_regex, "", query).strip() cache_key = tuple(files_to_search) if cache_key in self.cache: logger.debug(f"Return file filter results from cache") diff --git a/src/khoj/search_filter/word_filter.py b/src/khoj/search_filter/word_filter.py index 9ee81b21..9c98e848 100644 --- a/src/khoj/search_filter/word_filter.py +++ b/src/khoj/search_filter/word_filter.py @@ -43,13 +43,16 @@ class WordFilter(BaseFilter): return len(required_words) != 0 or len(blocked_words) != 0 + def defilter(self, query: str) -> str: + return re.sub(self.blocked_regex, "", re.sub(self.required_regex, "", query)).strip() + def apply(self, query, entries): "Find entries containing required and not blocked words specified in query" # Separate natural query from required, blocked words filters with timer("Extract required, blocked filters from query", logger): required_words = set([word.lower() for word in re.findall(self.required_regex, query)]) blocked_words = set([word.lower() for word in re.findall(self.blocked_regex, query)]) - query = re.sub(self.blocked_regex, "", re.sub(self.required_regex, "", query)).strip() + query = self.defilter(query) if len(required_words) == 0 and len(blocked_words) == 0: return query, set(range(len(entries))) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 9d8d5c3a..96ffac7a 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -105,6 +105,7 @@ def compute_embeddings( def query( raw_query: str, model: TextSearchModel, + question_embedding: torch.Tensor = None, rank_results: bool = False, score_threshold: float = -math.inf, dedupe: bool = 
True, @@ -124,9 +125,10 @@ def query( return hits, entries # Encode the query using the bi-encoder - with timer("Query Encode Time", logger, state.device): - question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True, device=state.device) - question_embedding = util.normalize_embeddings(question_embedding) + if question_embedding is None: + with timer("Query Encode Time", logger, state.device): + question_embedding = model.bi_encoder.encode([query], convert_to_tensor=True, device=state.device) + question_embedding = util.normalize_embeddings(question_embedding) # Find relevant entries for the query with timer("Search Time", logger, state.device): From 6d94d6e75a28a7becf2c947f81a8255db0cf67dc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 20 Jun 2023 01:17:21 -0700 Subject: [PATCH 03/21] Encode the asymmetric, symmetric search queries in parallel for speed Use timer to measure time to encode queries and total search time --- src/khoj/routers/api.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 35216343..a1aef1a1 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -2,6 +2,7 @@ from collections import defaultdict import concurrent.futures import math +import time import yaml import logging from datetime import datetime @@ -118,6 +119,8 @@ def search( dedupe: Optional[bool] = True, client: Optional[str] = None, ): + start_time = time.time() + results: List[SearchResponse] = [] if q is None or q == "": logger.warn(f"No query param (q) passed in API call to initiate search") @@ -139,15 +142,26 @@ def search( for filter in [DateFilter(), WordFilter(), FileFilter()]: defiltered_query = filter.defilter(user_query) - encoded_asymmetric_query = state.model.org_search.bi_encoder.encode( - [defiltered_query], convert_to_tensor=True, device=state.device - ) - encoded_asymmetric_query = 
util.normalize_embeddings(encoded_asymmetric_query) + with concurrent.futures.ThreadPoolExecutor() as executor: + with timer("Encoding query for asymmetric search took", logger=logger): + encode_asymmetric_futures = executor.submit( + state.model.org_search.bi_encoder.encode, + [defiltered_query], + convert_to_tensor=True, + device=state.device, + ) - encoded_symmetric_query = state.model.org_search.bi_encoder.encode( - [defiltered_query], convert_to_tensor=True, device=state.device - ) - encoded_symmetric_query = util.normalize_embeddings(encoded_symmetric_query) + with timer("Encoding query for symmetric search took", logger=logger): + encode_symmetric_futures = executor.submit( + state.model.org_search.bi_encoder.encode, + [defiltered_query], + convert_to_tensor=True, + device=state.device, + ) + + with timer("Normalizing query embeddings took", logger=logger): + encoded_asymmetric_query = util.normalize_embeddings(encode_asymmetric_futures.result()) + encoded_symmetric_query = util.normalize_embeddings(encode_symmetric_futures.result()) with concurrent.futures.ThreadPoolExecutor() as executor: if (t == SearchType.Org or t == None) and state.model.org_search: @@ -279,6 +293,9 @@ def search( ] state.previous_query = user_query + end_time = time.time() + logger.debug(f"🔍 Search took {end_time - start_time:.2f} seconds") + return results From 0144e610d619292fd3e1f101f1927cd7cde73e22 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 20 Jun 2023 02:28:51 -0700 Subject: [PATCH 04/21] Only search across content types that work with asymmetric search --- src/khoj/routers/api.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index a1aef1a1..7217df9c 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -142,27 +142,17 @@ def search( for filter in [DateFilter(), WordFilter(), FileFilter()]: defiltered_query = filter.defilter(user_query) - 
with concurrent.futures.ThreadPoolExecutor() as executor: - with timer("Encoding query for asymmetric search took", logger=logger): - encode_asymmetric_futures = executor.submit( - state.model.org_search.bi_encoder.encode, - [defiltered_query], - convert_to_tensor=True, - device=state.device, + encoded_asymmetric_query = None + if t == None or (t != SearchType.Ledger and t != SearchType.Image): + with timer("Encoding query took", logger=logger): + encoded_asymmetric_query = util.normalize_embeddings( + state.model.org_search.bi_encoder.encode( + [defiltered_query], + convert_to_tensor=True, + device=state.device, + ) ) - with timer("Encoding query for symmetric search took", logger=logger): - encode_symmetric_futures = executor.submit( - state.model.org_search.bi_encoder.encode, - [defiltered_query], - convert_to_tensor=True, - device=state.device, - ) - - with timer("Normalizing query embeddings took", logger=logger): - encoded_asymmetric_query = util.normalize_embeddings(encode_asymmetric_futures.result()) - encoded_symmetric_query = util.normalize_embeddings(encode_symmetric_futures.result()) - with concurrent.futures.ThreadPoolExecutor() as executor: if (t == SearchType.Org or t == None) and state.model.org_search: # query org-mode notes @@ -206,14 +196,13 @@ def search( ) ] - if (t == SearchType.Ledger or t == None) and state.model.ledger_search: + if (t == SearchType.Ledger) and state.model.ledger_search: # query transactions search_futures[t] += [ executor.submit( text_search.query, user_query, state.model.ledger_search, - question_embedding=encoded_symmetric_query, rank_results=r, score_threshold=score_threshold, dedupe=dedupe, @@ -294,7 +283,7 @@ def search( state.previous_query = user_query end_time = time.time() - logger.debug(f"🔍 Search took {end_time - start_time:.2f} seconds") + logger.debug(f"🔍 Search took: {end_time - start_time:.2f} seconds") return results From 1192e49307e0b2f1172f88a57f0466812a08d762 Mon Sep 17 00:00:00 2001 From: Debanjum Singh 
Solanky Date: Tue, 20 Jun 2023 19:51:33 -0700 Subject: [PATCH 05/21] Pass default value matching argument types expected by text_search methods --- src/khoj/routers/api.py | 37 ++++++++++++++++------------- src/khoj/search_type/text_search.py | 2 +- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 7217df9c..18487a59 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -24,6 +24,7 @@ from khoj.search_filter.word_filter import WordFilter from khoj.utils.helpers import log_telemetry, timer from khoj.utils.rawconfig import ( FullConfig, + ProcessorConfig, SearchResponse, TextContentConfig, ConversationProcessorConfig, @@ -101,7 +102,10 @@ async def set_content_config_data(content_type: str, updated_config: TextContent @api.post("/config/data/processor/conversation", status_code=200) async def set_processor_conversation_config_data(updated_config: ConversationProcessorConfig): - state.config.processor.conversation = updated_config + if state.config.processor is None: + state.config.processor = ProcessorConfig(conversation=updated_config) + else: + state.config.processor.conversation = updated_config try: save_config_to_file_updated_state() return {"status": "ok"} @@ -139,6 +143,7 @@ def search( return state.query_cache[query_cache_key] # Encode query with filter terms removed + defiltered_query = user_query for filter in [DateFilter(), WordFilter(), FileFilter()]: defiltered_query = filter.defilter(user_query) @@ -162,9 +167,9 @@ def search( user_query, state.model.org_search, question_embedding=encoded_asymmetric_query, - rank_results=r, + rank_results=r or False, score_threshold=score_threshold, - dedupe=dedupe, + dedupe=dedupe or True, ) ] @@ -176,9 +181,9 @@ def search( user_query, state.model.markdown_search, question_embedding=encoded_asymmetric_query, - rank_results=r, + rank_results=r or False, score_threshold=score_threshold, - dedupe=dedupe, + dedupe=dedupe or True, ) ] @@ 
-190,9 +195,9 @@ def search( user_query, state.model.pdf_search, question_embedding=encoded_asymmetric_query, - rank_results=r, + rank_results=r or False, score_threshold=score_threshold, - dedupe=dedupe, + dedupe=dedupe or True, ) ] @@ -203,9 +208,9 @@ def search( text_search.query, user_query, state.model.ledger_search, - rank_results=r, + rank_results=r or False, score_threshold=score_threshold, - dedupe=dedupe, + dedupe=dedupe or True, ) ] @@ -217,9 +222,9 @@ def search( user_query, state.model.music_search, question_embedding=encoded_asymmetric_query, - rank_results=r, + rank_results=r or False, score_threshold=score_threshold, - dedupe=dedupe, + dedupe=dedupe or True, ) ] @@ -237,16 +242,16 @@ def search( if (t is None or t in SearchType) and state.model.plugin_search: # query specified plugin type - search_future[t] += [ + search_futures[t] += [ executor.submit( text_search.query, user_query, # Get plugin search model for specified search type, or the first one if none specified state.model.plugin_search.get(t.value) or next(iter(state.model.plugin_search.values())), question_embedding=encoded_asymmetric_query, - rank_results=r, + rank_results=r or False, score_threshold=score_threshold, - dedupe=dedupe, + dedupe=dedupe or True, ) ] @@ -262,12 +267,12 @@ def search( image_names=state.model.image_search.image_names, output_directory=output_directory, image_files_url="/static/images", - count=results_count, + count=results_count or 5, ) else: hits, entries = search_future.result() # Collate results - results += text_search.collate_results(hits, entries, results_count) + results += text_search.collate_results(hits, entries, results_count or 5) # Sort results across all content types results.sort(key=lambda x: float(x.score), reverse=True) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 96ffac7a..c85857bb 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -105,7 +105,7 @@ def 
compute_embeddings( def query( raw_query: str, model: TextSearchModel, - question_embedding: torch.Tensor = None, + question_embedding: torch.Tensor | None = None, rank_results: bool = False, score_threshold: float = -math.inf, dedupe: bool = True, From 5c7c8d1f465d62c330a7e2174f60c1cb609ecc55 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 20 Jun 2023 19:52:57 -0700 Subject: [PATCH 06/21] Use async/await to fix parallelization of search across content types --- src/khoj/routers/api.py | 8 ++++---- src/khoj/search_type/text_search.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 18487a59..785b08c0 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -114,7 +114,7 @@ async def set_processor_conversation_config_data(updated_config: ConversationPro @api.get("/search", response_model=List[SearchResponse]) -def search( +async def search( q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, @@ -257,9 +257,9 @@ def search( # Query across each requested content types in parallel with timer("Query took", logger): - for search_future in search_futures[t]: + for search_future in concurrent.futures.as_completed(search_futures[t]): if t == SearchType.Image: - hits = search_futures.result() + hits = await search_future.result() output_directory = constants.web_directory / "images" # Collate results results += image_search.collate_results( @@ -270,7 +270,7 @@ def search( count=results_count or 5, ) else: - hits, entries = search_future.result() + hits, entries = await search_future.result() # Collate results results += text_search.collate_results(hits, entries, results_count or 5) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index c85857bb..14e2015f 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -102,7 +102,7 @@ def compute_embeddings( return corpus_embeddings -def query( 
+async def query( raw_query: str, model: TextSearchModel, question_embedding: torch.Tensor | None = None, From d5fb4196de3afefd04c72c68d2d7b121d5e88005 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 20 Jun 2023 20:19:15 -0700 Subject: [PATCH 07/21] Update web interface to allow querying all content types at once --- src/khoj/interface/web/index.html | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 51412d75..78b24d56 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -94,9 +94,12 @@ setQueryFieldInUrl(query); // Generate Backend API URL to execute Search - url = type === "image" - ? `/api/search?q=${encodeURIComponent(query)}&t=${type}&n=${results_count}&client=web` - : `/api/search?q=${encodeURIComponent(query)}&t=${type}&n=${results_count}&r=${rerank}&client=web`; + if (type == 'all') + url = `/api/search?q=${encodeURIComponent(query)}&n=${results_count}&client=web`; + else if (type === "image") + url = `/api/search?q=${encodeURIComponent(query)}&t=${type}&n=${results_count}&client=web`; + else + url = `/api/search?q=${encodeURIComponent(query)}&t=${type}&n=${results_count}&r=${rerank}&client=web`; // Execute Search and Render Results fetch(url) @@ -138,6 +141,7 @@ fetch("/api/config/types") .then(response => response.json()) .then(enabled_types => { + enabled_types.push("all"); document.getElementById("type").innerHTML = enabled_types .map(type => ``) From 2cd3e799d3692dac9184bf682c72283afa0514ef Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 20 Jun 2023 22:22:43 -0700 Subject: [PATCH 08/21] Improve null and type checks --- src/khoj/configure.py | 23 ++++++++++++++--------- src/khoj/routers/api.py | 22 +++++++++++----------- src/khoj/search_type/text_search.py | 2 +- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 
3aa39f10..df031dfa 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -3,6 +3,7 @@ import sys import logging import json from enum import Enum +from typing import Optional import requests # External Packages @@ -78,16 +79,20 @@ def configure_search_types(config: FullConfig): core_search_types = {e.name: e.value for e in SearchType} # Extract configured plugin search types plugin_search_types = {} - if config.content_type.plugins: + if config.content_type and config.content_type.plugins: plugin_search_types = {plugin_type: plugin_type for plugin_type in config.content_type.plugins.keys()} # Dynamically generate search type enum by merging core search types with configured plugin search types return Enum("SearchType", merge_dicts(core_search_types, plugin_search_types)) -def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: state.SearchType = None): +def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: Optional[state.SearchType] = None): + if config.content_type is None or config.search_type is None: + logger.error("🚨 Content Type or Search Type not configured.") + return + # Initialize Org Notes Search - if (t == state.SearchType.Org or t == None) and config.content_type.org: + if (t == state.SearchType.Org or t == None) and config.content_type.org and config.search_type.asymmetric: logger.info("🦄 Setting up search for orgmode notes") # Extract Entries, Generate Notes Embeddings model.org_search = text_search.setup( @@ -99,7 +104,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, ) # Initialize Org Music Search - if (t == state.SearchType.Music or t == None) and config.content_type.music: + if (t == state.SearchType.Music or t == None) and config.content_type.music and config.search_type.asymmetric: logger.info("🎺 Setting up search for org-music") # Extract Entries, Generate Music Embeddings model.music_search = text_search.setup( @@ -111,7 +116,7 @@ def 
configure_search(model: SearchModels, config: FullConfig, regenerate: bool, ) # Initialize Markdown Search - if (t == state.SearchType.Markdown or t == None) and config.content_type.markdown: + if (t == state.SearchType.Markdown or t == None) and config.content_type.markdown and config.search_type.asymmetric: logger.info("💎 Setting up search for markdown notes") # Extract Entries, Generate Markdown Embeddings model.markdown_search = text_search.setup( @@ -123,7 +128,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, ) # Initialize Ledger Search - if (t == state.SearchType.Ledger or t == None) and config.content_type.ledger: + if (t == state.SearchType.Ledger or t == None) and config.content_type.ledger and config.search_type.symmetric: logger.info("💸 Setting up search for ledger") # Extract Entries, Generate Ledger Embeddings model.ledger_search = text_search.setup( @@ -135,7 +140,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, ) # Initialize PDF Search - if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf: + if (t == state.SearchType.Pdf or t == None) and config.content_type.pdf and config.search_type.asymmetric: logger.info("🖨️ Setting up search for pdf") # Extract Entries, Generate PDF Embeddings model.pdf_search = text_search.setup( @@ -147,14 +152,14 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, ) # Initialize Image Search - if (t == state.SearchType.Image or t == None) and config.content_type.image: + if (t == state.SearchType.Image or t == None) and config.content_type.image and config.search_type.image: logger.info("🌄 Setting up search for images") # Extract Entries, Generate Image Embeddings model.image_search = image_search.setup( config.content_type.image, search_config=config.search_type.image, regenerate=regenerate ) - if (t == state.SearchType.Github or t == None) and config.content_type.github: + if (t == 
state.SearchType.Github or t == None) and config.content_type.github and config.search_type.asymmetric: logger.info("🐙 Setting up search for github") # Extract Entries, Generate Github Embeddings model.github_search = text_search.setup( diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 785b08c0..fc8ff7ce 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -134,7 +134,7 @@ async def search( user_query = q.strip() results_count = n score_threshold = score_threshold if score_threshold is not None else -math.inf - search_futures = defaultdict(list) + search_futures: list[concurrent.futures.Future] = [] # return cached results, if available query_cache_key = f"{user_query}-{n}-{t}-{r}-{score_threshold}-{dedupe}" @@ -161,7 +161,7 @@ async def search( with concurrent.futures.ThreadPoolExecutor() as executor: if (t == SearchType.Org or t == None) and state.model.org_search: # query org-mode notes - search_futures[t] += [ + search_futures += [ executor.submit( text_search.query, user_query, @@ -175,7 +175,7 @@ async def search( if (t == SearchType.Markdown or t == None) and state.model.markdown_search: # query markdown notes - search_futures[t] += [ + search_futures += [ executor.submit( text_search.query, user_query, @@ -189,7 +189,7 @@ async def search( if (t == SearchType.Pdf or t == None) and state.model.pdf_search: # query pdf files - search_futures[t] += [ + search_futures += [ executor.submit( text_search.query, user_query, @@ -203,7 +203,7 @@ async def search( if (t == SearchType.Ledger) and state.model.ledger_search: # query transactions - search_futures[t] += [ + search_futures += [ executor.submit( text_search.query, user_query, @@ -216,7 +216,7 @@ async def search( if (t == SearchType.Music or t == None) and state.model.music_search: # query music library - search_futures[t] += [ + search_futures += [ executor.submit( text_search.query, user_query, @@ -230,7 +230,7 @@ async def search( if (t == SearchType.Image) and 
state.model.image_search: # query images - search_futures[t] += [ + search_futures += [ executor.submit( image_search.query, user_query, @@ -242,7 +242,7 @@ async def search( if (t is None or t in SearchType) and state.model.plugin_search: # query specified plugin type - search_futures[t] += [ + search_futures += [ executor.submit( text_search.query, user_query, @@ -257,7 +257,7 @@ async def search( # Query across each requested content types in parallel with timer("Query took", logger): - for search_future in concurrent.futures.as_completed(search_futures[t]): + for search_future in concurrent.futures.as_completed(search_futures): if t == SearchType.Image: hits = await search_future.result() output_directory = constants.web_directory / "images" @@ -288,7 +288,7 @@ async def search( state.previous_query = user_query end_time = time.time() - logger.debug(f"🔍 Search took: {end_time - start_time:.2f} seconds") + logger.debug(f"🔍 Search took: {end_time - start_time:.3f} seconds") return results @@ -297,7 +297,7 @@ async def search( def update(t: Optional[SearchType] = None, force: Optional[bool] = False, client: Optional[str] = None): try: state.search_index_lock.acquire() - state.model = configure_search(state.model, state.config, regenerate=force, t=t) + state.model = configure_search(state.model, state.config, regenerate=force or False, t=t) state.search_index_lock.release() except ValueError as e: logger.error(e) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 14e2015f..83f15918 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -181,7 +181,7 @@ def setup( previous_entries = ( extract_entries(config.compressed_jsonl) if config.compressed_jsonl.exists() and not regenerate else None ) - entries_with_indices = text_to_jsonl(config).process(previous_entries) + entries_with_indices = text_to_jsonl(config).process(previous_entries or []) # Extract Updated Entries entries = 
extract_entries(config.compressed_jsonl) From 5c4eb950d53a723f124cac9ebf1a6f352a360645 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 20 Jun 2023 23:39:19 -0700 Subject: [PATCH 09/21] Search across all content types via khoj.el on Emacs If no content-type selected in transient menu option, khoj.el queries khoj server without content-type parameter (t) set. This results in search across all enabled asymmetric search text content types --- src/interface/emacs/khoj.el | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/interface/emacs/khoj.el b/src/interface/emacs/khoj.el index a397d460..91cdff66 100644 --- a/src/interface/emacs/khoj.el +++ b/src/interface/emacs/khoj.el @@ -651,7 +651,9 @@ CONFIG is json obtained from Khoj config API." Use QUERY, CONTENT-TYPE and (optional) RERANK as query params" (let ((rerank (or rerank "false")) (encoded-query (url-hexify-string query))) - (format "%s/api/search?q=%s&t=%s&r=%s&n=%s&client=emacs" khoj-server-url encoded-query content-type rerank khoj-results-count))) + (if content-type + (format "%s/api/search?q=%s&t=%s&r=%s&n=%s&client=emacs" khoj-server-url encoded-query content-type rerank khoj-results-count) + (format "%s/api/search?q=%s&r=%s&n=%s&client=emacs" khoj-server-url encoded-query rerank khoj-results-count)))) (defun khoj--query-search-api-and-render-results (query-url content-type query buffer-name) "Query Khoj Search with QUERY-URL. 
From 09f739b8cc2e319b6baf06e936c4e5eead426c31 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 27 Jun 2023 15:48:48 -0700 Subject: [PATCH 10/21] Null check config, log warning instead of error when configuring search --- src/khoj/configure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index df031dfa..482e6b28 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -87,8 +87,8 @@ def configure_search_types(config: FullConfig): def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: Optional[state.SearchType] = None): - if config.content_type is None or config.search_type is None: - logger.error("🚨 Content Type or Search Type not configured.") + if config is None or config.content_type is None or config.search_type is None: + logger.warning("🚨 No Content or Search type is configured.") return # Initialize Org Notes Search From 1b11d5723d474fb86cb05a1876a97dab6b4ea063 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 27 Jun 2023 15:50:41 -0700 Subject: [PATCH 11/21] Extract search request URL builder into js function in web interface --- src/khoj/interface/web/index.html | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 78b24d56..906f3912 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -93,15 +93,8 @@ if (rerank) setQueryFieldInUrl(query); - // Generate Backend API URL to execute Search - if (type == 'all') - url = `/api/search?q=${encodeURIComponent(query)}&n=${results_count}&client=web`; - else if (type === "image") - url = `/api/search?q=${encodeURIComponent(query)}&t=${type}&n=${results_count}&client=web`; - else - url = `/api/search?q=${encodeURIComponent(query)}&t=${type}&n=${results_count}&r=${rerank}&client=web`; - // Execute Search and Render Results + url = 
createRequestUrl(query, type, results_count, rerank); fetch(url) .then(response => response.json()) .then(data => { @@ -157,6 +150,18 @@ }); } + function createRequestUrl(query, results_count, type, rerank) { + // Generate Backend API URL to execute Search + let url = `/api/search?q=${encodeURIComponent(query)}&n=${results_count}&client=web`; + // If type is not 'all', append type to URL + if (type !== 'all') + url += `&t=${type}`; + // Rerank is only supported by text types + if (type !== "image") + url += `&r=${rerank}`; + return url; + } + function setTypeFieldInUrl(type) { var url = new URL(window.location.href); url.searchParams.set("t", type.value); From 510bb7e684a14cadcd8990932e15bc7a7f80690d Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 27 Jun 2023 15:59:50 -0700 Subject: [PATCH 12/21] Use typing union in text_search for python 3.8 compatible type hinting --- src/khoj/search_type/text_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 83f15918..0af5b0fc 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -2,7 +2,7 @@ import logging import math from pathlib import Path -from typing import List, Tuple, Type +from typing import List, Tuple, Type, Union # External Packages import torch @@ -105,7 +105,7 @@ def compute_embeddings( async def query( raw_query: str, model: TextSearchModel, - question_embedding: torch.Tensor | None = None, + question_embedding: Union[torch.Tensor, None] = None, rank_results: bool = False, score_threshold: float = -math.inf, dedupe: bool = True, From 212b1a96c8dfef47af5d2727d0604cf406efc232 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 28 Jun 2023 11:34:26 -0700 Subject: [PATCH 13/21] Create "all" search type for search across all content types on khoj server Allows moving logic to handle search across all content types to server from clients --- 
src/khoj/routers/api.py | 15 ++++++++------- src/khoj/utils/config.py | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 4e305704..266eaed0 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -65,6 +65,7 @@ def get_config_types(): and getattr(state.model, f"{search_type.value}_search") is not None ) or ("plugins" in configured_content_types and search_type.name in configured_content_types["plugins"]) + or search_type == SearchType.All ] @@ -135,7 +136,7 @@ async def set_processor_conversation_config_data(updated_config: ConversationPro async def search( q: str, n: Optional[int] = 5, - t: Optional[SearchType] = None, + t: Optional[SearchType] = SearchType.All, r: Optional[bool] = False, score_threshold: Optional[Union[float, None]] = None, dedupe: Optional[bool] = True, @@ -166,7 +167,7 @@ async def search( defiltered_query = filter.defilter(user_query) encoded_asymmetric_query = None - if t == None or (t != SearchType.Ledger and t != SearchType.Image): + if t == SearchType.All or (t != SearchType.Ledger and t != SearchType.Image): with timer("Encoding query took", logger=logger): encoded_asymmetric_query = util.normalize_embeddings( state.model.org_search.bi_encoder.encode( @@ -177,7 +178,7 @@ async def search( ) with concurrent.futures.ThreadPoolExecutor() as executor: - if (t == SearchType.Org or t == None) and state.model.org_search: + if (t == SearchType.Org or t == SearchType.All) and state.model.org_search: # query org-mode notes search_futures += [ executor.submit( @@ -191,7 +192,7 @@ async def search( ) ] - if (t == SearchType.Markdown or t == None) and state.model.markdown_search: + if (t == SearchType.Markdown or t == SearchType.All) and state.model.markdown_search: # query markdown notes search_futures += [ executor.submit( @@ -205,7 +206,7 @@ async def search( ) ] - if (t == SearchType.Pdf or t == None) and state.model.pdf_search: + if (t == SearchType.Pdf or t == 
SearchType.All) and state.model.pdf_search: # query pdf files search_futures += [ executor.submit( @@ -232,7 +233,7 @@ async def search( ) ] - if (t == SearchType.Music or t == None) and state.model.music_search: + if (t == SearchType.Music or t == SearchType.All) and state.model.music_search: # query music library search_futures += [ executor.submit( @@ -258,7 +259,7 @@ async def search( ) ] - if (t is None or t in SearchType) and state.model.plugin_search: + if (t == SearchType.All or t in SearchType) and state.model.plugin_search: # query specified plugin type search_futures += [ executor.submit( diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index a83f7814..e3bea7b9 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: class SearchType(str, Enum): + All = "all" Org = "org" Ledger = "ledger" Music = "music" From 1773a783398ea24ba10cda47de722da183171b98 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 28 Jun 2023 12:10:45 -0700 Subject: [PATCH 14/21] Fix createRequestUrl method signature to fetch results from khoj web --- src/khoj/interface/web/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index e24ad33c..2adfeda2 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -172,7 +172,7 @@ }); } - function createRequestUrl(query, results_count, type, rerank) { + function createRequestUrl(query, type, results_count, rerank) { // Generate Backend API URL to execute Search let url = `/api/search?q=${encodeURIComponent(query)}&n=${results_count}&client=web`; // If type is not 'all', append type to URL From 630bf995f1316b717eac4787fe3b97c56ce6907e Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 28 Jun 2023 12:12:38 -0700 Subject: [PATCH 15/21] Style each result based on its content type in same view on Khoj web - So when searching across 
content types (with content-type = "all") org-mode results get rendered differently than markdown, PDF etc. results - Set div class for each result separately instead of a single uber div for styling. This allows styling div of each result based on the content-type of that result - No need to create placeholder "all" content type on web interface as server is passing an all content type by itself --- src/khoj/interface/web/index.html | 110 ++++++++++++++---------------- 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 2adfeda2..5dc6b0b1 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -14,11 +14,13 @@