From 1ae40163a9a5b394413239ad09da09c81b72766b Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Wed, 1 Mar 2023 15:22:14 -0600 Subject: [PATCH] Show user friendly information logs by default for context - Use emojis to make info logs easier to read - Inform when khoj is ready to use - Provide information on what khoj is doing while starting up - Inform when content/search types and processors are setup - Inform when models are being loaded from the web as this step can take time - Convert all other info logs to be only shown in verbose mode --- src/khoj/configure.py | 11 +++++++++-- src/khoj/main.py | 9 ++++----- src/khoj/processor/jsonl/jsonl_to_jsonl.py | 2 +- src/khoj/processor/ledger/beancount_to_jsonl.py | 4 ++-- .../processor/markdown/markdown_to_jsonl.py | 4 ++-- src/khoj/processor/org_mode/org_to_jsonl.py | 2 +- src/khoj/routers/api.py | 8 ++++---- src/khoj/routers/api_beta.py | 3 +-- src/khoj/search_filter/file_filter.py | 2 +- src/khoj/search_filter/word_filter.py | 2 +- src/khoj/search_type/image_search.py | 17 ++++++++--------- src/khoj/search_type/text_search.py | 5 ++--- src/khoj/utils/helpers.py | 4 ++++ src/khoj/utils/jsonl.py | 6 +++--- 14 files changed, 43 insertions(+), 36 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 27067a4b..09c63781 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -66,7 +66,7 @@ def update_search_index(): state.search_index_lock.acquire() state.model = configure_search(state.model, state.config, regenerate=False) state.search_index_lock.release() - logger.info("Search Index updated via Scheduler") + logger.info("📬 Search index updated via Scheduler") def configure_search_types(config: FullConfig): @@ -84,6 +84,7 @@ def configure_search_types(config: FullConfig): def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: state.SearchType = None): # Initialize Org Notes Search if (t == state.SearchType.Org or t == None) and config.content_type.org: + logger.info("🦄 Setting up search for orgmode notes") # Extract Entries, Generate Notes Embeddings model.orgmode_search = text_search.setup( OrgToJsonl, @@ -95,6 +96,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, # Initialize Org Music Search if (t == state.SearchType.Music or t == None) and config.content_type.music: + logger.info("🎺 Setting up search for org-music") # Extract Entries, Generate Music Embeddings model.music_search = text_search.setup( OrgToJsonl, @@ -106,6 +108,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, # Initialize Markdown Search if (t == state.SearchType.Markdown or t == None) and config.content_type.markdown: + logger.info("💎 Setting up search for markdown notes") # Extract Entries, Generate Markdown Embeddings model.markdown_search = text_search.setup( MarkdownToJsonl, @@ -117,6 +120,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, # Initialize Ledger Search if (t == state.SearchType.Ledger or t == None) and config.content_type.ledger: + logger.info("💸 Setting up search for ledger") # Extract Entries, Generate Ledger Embeddings model.ledger_search = text_search.setup( BeancountToJsonl, @@ -128,6 +132,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, # Initialize Image Search if (t == state.SearchType.Image or t == None) and config.content_type.image: + logger.info("🌄 Setting up search for images") # Extract Entries, Generate Image Embeddings model.image_search = image_search.setup( config.content_type.image, search_config=config.search_type.image, regenerate=regenerate @@ -135,6 +140,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, # Initialize External Plugin Search if (t == None or t in state.SearchType) and config.content_type.plugins: + logger.info("🔌 Setting up search for plugins") model.plugin_search = {} for plugin_type, plugin_config in config.content_type.plugins.items(): model.plugin_search[plugin_type] = text_search.setup( @@ -159,6 +165,7 @@ def configure_processor(processor_config: ProcessorConfig): # Initialize Conversation Processor if processor_config.conversation: + logger.info("💬 Setting up conversation processor") processor.conversation = configure_conversation_processor(processor_config.conversation) return processor @@ -172,7 +179,7 @@ def configure_conversation_processor(conversation_processor_config): # Load Metadata Logs from Conversation Logfile with conversation_logfile.open("r") as f: conversation_processor.meta_log = json.load(f) - logger.info("Conversation logs loaded from disk.") + logger.debug(f"Loaded conversation logs from {conversation_logfile}") else: # Initialize Conversation Logs conversation_processor.meta_log = {} diff --git a/src/khoj/main.py b/src/khoj/main.py index e9cfec78..a54b2b4d 100644 --- a/src/khoj/main.py +++ b/src/khoj/main.py @@ -52,10 +52,8 @@ def run(): # Set Logging Level if args.verbose == 0: - logger.setLevel(logging.WARN) - elif args.verbose == 1: logger.setLevel(logging.INFO) - elif args.verbose >= 2: + elif args.verbose >= 1: logger.setLevel(logging.DEBUG) # Set Log File @@ -63,7 +61,7 @@ def run(): fh.setLevel(logging.DEBUG) logger.addHandler(fh) - logger.info("Starting Khoj...") + logger.info("🌘 Starting Khoj") if args.no_gui: # Setup task scheduler @@ -122,7 +120,6 @@ def run(): def sigint_handler(*args): - print("\nShutting down Khoj...") QtWidgets.QApplication.quit() @@ -135,10 +132,12 @@ def set_state(args): def start_server(app, host=None, port=None, socket=None): + logger.info("🌖 Khoj is ready to use") if socket: uvicorn.run(app, proxy_headers=True, uds=socket, log_level="debug", use_colors=True, log_config=None) else: uvicorn.run(app, host=host, port=port, log_level="debug", use_colors=True, log_config=None) + logger.info("🌒 Stopping Khoj") def poll_task_scheduler(): diff --git a/src/khoj/processor/jsonl/jsonl_to_jsonl.py b/src/khoj/processor/jsonl/jsonl_to_jsonl.py index 0268a659..83c82374 100644 --- a/src/khoj/processor/jsonl/jsonl_to_jsonl.py +++ b/src/khoj/processor/jsonl/jsonl_to_jsonl.py @@ -82,7 +82,7 @@ class JsonlToJsonl(TextToJsonl): if any(files_with_non_jsonl_extensions): print(f"[Warning] There maybe non jsonl files in the input set: {files_with_non_jsonl_extensions}") - logger.info(f"Processing files: {all_jsonl_files}") + logger.debug(f"Processing files: {all_jsonl_files}") return all_jsonl_files diff --git a/src/khoj/processor/ledger/beancount_to_jsonl.py b/src/khoj/processor/ledger/beancount_to_jsonl.py index 006d0d70..49c43301 100644 --- a/src/khoj/processor/ledger/beancount_to_jsonl.py +++ b/src/khoj/processor/ledger/beancount_to_jsonl.py @@ -88,7 +88,7 @@ class BeancountToJsonl(TextToJsonl): if any(files_with_non_beancount_extensions): print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}") - logger.info(f"Processing files: {all_beancount_files}") + logger.debug(f"Processing files: {all_beancount_files}") return all_beancount_files @@ -123,7 +123,7 @@ class BeancountToJsonl(TextToJsonl): Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{transaction_to_file_map[parsed_entry]}") ) - logger.info(f"Converted {len(parsed_entries)} transactions to dictionaries") + logger.debug(f"Converted {len(parsed_entries)} transactions to dictionaries") return entries diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index c661c1aa..045eea65 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -92,7 +92,7 @@ class MarkdownToJsonl(TextToJsonl): f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}" ) - logger.info(f"Processing files: {all_markdown_files}") + logger.debug(f"Processing files: {all_markdown_files}") return all_markdown_files @@ -126,7 +126,7 @@ class MarkdownToJsonl(TextToJsonl): for parsed_entry in parsed_entries: entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{entry_to_file_map[parsed_entry]}")) - logger.info(f"Converted {len(parsed_entries)} markdown entries to dictionaries") + logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") return entries diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 05eff771..0950a089 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -86,7 +86,7 @@ class OrgToJsonl(TextToJsonl): if any(files_with_non_org_extensions): logger.warn(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}") - logger.info(f"Processing files: {all_org_files}") + logger.debug(f"Processing files: {all_org_files}") return all_org_files diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 033ea220..2d5329ea 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -56,7 +56,7 @@ async def set_config_data(updated_config: FullConfig): def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Optional[bool] = False): results: List[SearchResponse] = [] if q is None or q == "": - logger.info(f"No query param (q) passed in API call to initiate search") + logger.warn(f"No query param (q) passed in API call to initiate search") return results # initialize variables @@ -66,7 +66,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti # return cached results, if available query_cache_key = f"{user_query}-{n}-{t}-{r}" if query_cache_key in state.query_cache: - logger.info(f"Return response from query cache") + logger.debug(f"Return response from query cache") return state.query_cache[query_cache_key] if (t == SearchType.Org or t == None) and state.model.orgmode_search: @@ -151,7 +151,7 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False): logger.error(e) raise HTTPException(status_code=500, detail=str(e)) else: - logger.info("Search Index updated via API call") + logger.info("📬 Search index updated via API") try: state.processor_config = configure_processor(state.config.processor) @@ -159,6 +159,6 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False): logger.error(e) raise HTTPException(status_code=500, detail=str(e)) else: - logger.info("Processor reconfigured via API call") + logger.info("📬 Processor reconfigured via API") return {"status": "ok", "message": "khoj reloaded"} diff --git a/src/khoj/routers/api_beta.py b/src/khoj/routers/api_beta.py index eefd09fa..2f523917 100644 --- a/src/khoj/routers/api_beta.py +++ b/src/khoj/routers/api_beta.py @@ -152,7 +152,6 @@ def save_chat_session(): conversation_log["session"].append(session) else: conversation_log["session"] = [session] - logger.info("Added new chat session to conversation logs") # Save Conversation Metadata Logs to Disk conversation_logfile = resolve_absolute_path(state.processor_config.conversation.conversation_logfile) @@ -161,4 +160,4 @@ def save_chat_session(): json.dump(conversation_log, logfile) state.processor_config.conversation.chat_session = None - logger.info("Saved updated conversation logs to disk.") + logger.info("📩 Saved current chat session to conversation logs") diff --git a/src/khoj/search_filter/file_filter.py b/src/khoj/search_filter/file_filter.py index 39bc10ba..a0e9d790 100644 --- a/src/khoj/search_filter/file_filter.py +++ b/src/khoj/search_filter/file_filter.py @@ -49,7 +49,7 @@ class FileFilter(BaseFilter): query = re.sub(self.file_filter_regex, "", query).strip() cache_key = tuple(files_to_search) if cache_key in self.cache: - logger.info(f"Return file filter results from cache") + logger.debug(f"Return file filter results from cache") included_entry_indices = self.cache[cache_key] return query, included_entry_indices diff --git a/src/khoj/search_filter/word_filter.py b/src/khoj/search_filter/word_filter.py index 9aab259a..0c868b0c 100644 --- a/src/khoj/search_filter/word_filter.py +++ b/src/khoj/search_filter/word_filter.py @@ -58,7 +58,7 @@ class WordFilter(BaseFilter): # Return item from cache if exists cache_key = tuple(sorted(required_words)), tuple(sorted(blocked_words)) if cache_key in self.cache: - logger.info(f"Return word filter results from cache") + logger.debug(f"Return word filter results from cache") included_entry_indices = self.cache[cache_key] return query, included_entry_indices diff --git a/src/khoj/search_type/image_search.py b/src/khoj/search_type/image_search.py index 34264189..024dc79d 100644 --- a/src/khoj/search_type/image_search.py +++ b/src/khoj/search_type/image_search.py @@ -3,7 +3,6 @@ import glob import pathlib import copy import shutil -import time import logging from typing import List @@ -50,9 +49,9 @@ def extract_entries(image_directories): image_names.extend(list(image_directory.glob("*.jpg"))) image_names.extend(list(image_directory.glob("*.jpeg"))) - if logger.level >= logging.INFO: + if logger.level >= logging.DEBUG: image_directory_names = ", ".join([str(image_directory) for image_directory in image_directories]) - logger.info(f"Found {len(image_names)} images in {image_directory_names}") + logger.debug(f"Found {len(image_names)} images in {image_directory_names}") return sorted(image_names) @@ -71,7 +70,7 @@ def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=5 # Load pre-computed image embeddings from file if exists if resolve_absolute_path(embeddings_file).exists() and not regenerate: image_embeddings = torch.load(embeddings_file) - logger.info(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}") + logger.debug(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}") # Else compute the image embeddings from scratch, which can take a while else: image_embeddings = [] @@ -89,7 +88,7 @@ def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=5 # Save computed image embeddings to file torch.save(image_embeddings, embeddings_file) - logger.info(f"Saved computed embeddings to {embeddings_file}") + logger.info(f"📩 Saved computed image embeddings to {embeddings_file}") return image_embeddings @@ -102,7 +101,7 @@ def compute_metadata_embeddings( # Load pre-computed image metadata embedding file if exists if use_xmp_metadata and resolve_absolute_path(f"{embeddings_file}_metadata").exists() and not regenerate: image_metadata_embeddings = torch.load(f"{embeddings_file}_metadata") - logger.info(f"Loaded pre-computed embeddings from {embeddings_file}_metadata") + logger.debug(f"Loaded image metadata embeddings from {embeddings_file}_metadata") # Else compute the image metadata embeddings from scratch, which can take a while if use_xmp_metadata and image_metadata_embeddings is None: @@ -121,7 +120,7 @@ def compute_metadata_embeddings( ) continue torch.save(image_metadata_embeddings, f"{embeddings_file}_metadata") - logger.info(f"Saved computed metadata embeddings to {embeddings_file}_metadata") + logger.info(f"📩 Saved computed image metadata embeddings to {embeddings_file}_metadata") return image_metadata_embeddings @@ -149,12 +148,12 @@ def query(raw_query, count, model: ImageSearchModel): query_imagepath = resolve_absolute_path(pathlib.Path(raw_query[5:]), strict=True) query = copy.deepcopy(Image.open(query_imagepath)) query.thumbnail((640, query.height)) # scale down image for faster processing - logger.info(f"Find Images by Image: {query_imagepath}") + logger.info(f"🔎 Find Images by Image: {query_imagepath}") else: # Truncate words in query to stay below max_tokens supported by ML model max_words = 20 query = " ".join(raw_query.split()[:max_words]) - logger.info(f"Find Images by Text: {query}") + logger.info(f"🔎 Find Images by Text: {query}") # Now we encode the query (which can either be an image or a text string) with timer("Query Encode Time", logger): diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index 882bcb64..af805c42 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -1,7 +1,6 @@ # Standard Packages import logging from pathlib import Path -import time from typing import List, Tuple, Type # External Packages @@ -68,7 +67,7 @@ def compute_embeddings( # Load pre-computed embeddings from file if exists and update them if required if embeddings_file.exists() and not regenerate: corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device) - logger.info(f"Loaded embeddings from {embeddings_file}") + logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}") # Encode any new entries in the corpus and update corpus embeddings new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1] @@ -95,7 +94,7 @@ def compute_embeddings( if new_entries: corpus_embeddings = util.normalize_embeddings(corpus_embeddings) torch.save(corpus_embeddings, embeddings_file) - logger.info(f"Computed embeddings and saved them to {embeddings_file}") + logger.info(f"📩 Saved computed text embeddings to {embeddings_file}") return corpus_embeddings diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index d2f87126..521b7b65 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -58,16 +58,20 @@ def merge_dicts(priority_dict: dict, default_dict: dict): def load_model(model_name: str, model_type, model_dir=None, device: str = None) -> Union[BaseEncoder, CrossEncoder]: "Load model from disk or huggingface" # Construct model path + logger = logging.getLogger(__name__) model_path = join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None # Load model from model_path if it exists there model_type_class = get_class_by_name(model_type) if isinstance(model_type, str) else model_type if model_path is not None and resolve_absolute_path(model_path).exists(): + logger.debug(f"Loading {model_name} model from disk") model = model_type_class(get_absolute_path(model_path), device=device) # Else load the model from the model_name else: + logger.info(f"🤖 Downloading {model_name} model from web") model = model_type_class(model_name, device=device) if model_path is not None: + logger.info(f"📩 Saved {model_name} model to disk") model.save(model_path) return model diff --git a/src/khoj/utils/jsonl.py b/src/khoj/utils/jsonl.py index a98910fb..c9576810 100644 --- a/src/khoj/utils/jsonl.py +++ b/src/khoj/utils/jsonl.py @@ -31,7 +31,7 @@ def load_jsonl(input_path): jsonl_file.close() # Log JSONL entries loaded - logger.info(f"Loaded {len(data)} records from {input_path}") + logger.debug(f"Loaded {len(data)} records from {input_path}") return data @@ -44,7 +44,7 @@ def dump_jsonl(jsonl_data, output_path): with open(output_path, "w", encoding="utf-8") as f: f.write(jsonl_data) - logger.info(f"Wrote jsonl data to {output_path}") + logger.debug(f"Wrote jsonl data to {output_path}") def compress_jsonl_data(jsonl_data, output_path): @@ -54,4 +54,4 @@ def compress_jsonl_data(jsonl_data, output_path): with gzip.open(output_path, "wt", encoding="utf-8") as gzip_file: gzip_file.write(jsonl_data) - logger.info(f"Wrote jsonl data to gzip compressed jsonl at {output_path}") + logger.debug(f"Wrote jsonl data to gzip compressed jsonl at {output_path}")