Show user-friendly info logs by default for context

- Use emojis to make info logs easier to read
- Inform when Khoj is ready to use
- Report what Khoj is doing during startup
- Inform when content/search types and processors are set up
- Inform when models are being downloaded from the web, as this step can
  take time
- Convert all other info logs to debug level, so they only show in
  verbose mode (see the sketch after this list)
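The upshot for logging: running khoj with no flags now surfaces the emoji-prefixed info logs, while any `-v`/`--verbose` flag enables debug logs. A minimal sketch of that behavior follows; the `set_log_level` helper and the handler setup are illustrative stand-ins, not the actual khoj source.

```python
import logging

# Root handler prints whatever the khoj logger lets through
logging.basicConfig(format="%(message)s", level=logging.DEBUG)
logger = logging.getLogger("khoj")

def set_log_level(verbosity: int):
    """Map the counted --verbose flag to a log level (illustrative helper)."""
    if verbosity == 0:
        # Default: user-friendly, emoji-prefixed info logs are shown
        logger.setLevel(logging.INFO)
    else:
        # Any -v now enables debug logs; the separate INFO tier was dropped
        logger.setLevel(logging.DEBUG)

set_log_level(0)
logger.info("🌘 Starting Khoj")           # shown by default
logger.debug("Processing files: [...]")   # hidden unless -v is passed
logger.info("🌖 Khoj is ready to use")    # shown by default
```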
Debanjum Singh Solanky 2023-03-01 15:22:14 -06:00
parent fe03ba3dce
commit 1ae40163a9
14 changed files with 43 additions and 36 deletions

View file

@@ -66,7 +66,7 @@ def update_search_index():
state.search_index_lock.acquire()
state.model = configure_search(state.model, state.config, regenerate=False)
state.search_index_lock.release()
-logger.info("Search Index updated via Scheduler")
+logger.info("📬 Search index updated via Scheduler")
def configure_search_types(config: FullConfig):
@@ -84,6 +84,7 @@ def configure_search_types(config: FullConfig):
def configure_search(model: SearchModels, config: FullConfig, regenerate: bool, t: state.SearchType = None):
# Initialize Org Notes Search
if (t == state.SearchType.Org or t == None) and config.content_type.org:
+logger.info("🦄 Setting up search for orgmode notes")
# Extract Entries, Generate Notes Embeddings
model.orgmode_search = text_search.setup(
OrgToJsonl,
@@ -95,6 +96,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
# Initialize Org Music Search
if (t == state.SearchType.Music or t == None) and config.content_type.music:
+logger.info("🎺 Setting up search for org-music")
# Extract Entries, Generate Music Embeddings
model.music_search = text_search.setup(
OrgToJsonl,
@@ -106,6 +108,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
# Initialize Markdown Search
if (t == state.SearchType.Markdown or t == None) and config.content_type.markdown:
+logger.info("💎 Setting up search for markdown notes")
# Extract Entries, Generate Markdown Embeddings
model.markdown_search = text_search.setup(
MarkdownToJsonl,
@@ -117,6 +120,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
# Initialize Ledger Search
if (t == state.SearchType.Ledger or t == None) and config.content_type.ledger:
+logger.info("💸 Setting up search for ledger")
# Extract Entries, Generate Ledger Embeddings
model.ledger_search = text_search.setup(
BeancountToJsonl,
@@ -128,6 +132,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
# Initialize Image Search
if (t == state.SearchType.Image or t == None) and config.content_type.image:
+logger.info("🌄 Setting up search for images")
# Extract Entries, Generate Image Embeddings
model.image_search = image_search.setup(
config.content_type.image, search_config=config.search_type.image, regenerate=regenerate
@@ -135,6 +140,7 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
# Initialize External Plugin Search
if (t == None or t in state.SearchType) and config.content_type.plugins:
+logger.info("🔌 Setting up search for plugins")
model.plugin_search = {}
for plugin_type, plugin_config in config.content_type.plugins.items():
model.plugin_search[plugin_type] = text_search.setup(
@@ -159,6 +165,7 @@ def configure_processor(processor_config: ProcessorConfig):
# Initialize Conversation Processor
if processor_config.conversation:
+logger.info("💬 Setting up conversation processor")
processor.conversation = configure_conversation_processor(processor_config.conversation)
return processor
@@ -172,7 +179,7 @@ def configure_conversation_processor(conversation_processor_config):
# Load Metadata Logs from Conversation Logfile
with conversation_logfile.open("r") as f:
conversation_processor.meta_log = json.load(f)
-logger.info("Conversation logs loaded from disk.")
+logger.debug(f"Loaded conversation logs from {conversation_logfile}")
else:
# Initialize Conversation Logs
conversation_processor.meta_log = {}

View file

@@ -52,10 +52,8 @@ def run():
# Set Logging Level
if args.verbose == 0:
-logger.setLevel(logging.WARN)
-elif args.verbose == 1:
logger.setLevel(logging.INFO)
-elif args.verbose >= 2:
+elif args.verbose >= 1:
logger.setLevel(logging.DEBUG)
# Set Log File
@@ -63,7 +61,7 @@ def run():
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
-logger.info("Starting Khoj...")
+logger.info("🌘 Starting Khoj")
if args.no_gui:
# Setup task scheduler
@@ -122,7 +120,6 @@ def run():
def sigint_handler(*args):
print("\nShutting down Khoj...")
QtWidgets.QApplication.quit()
@@ -135,10 +132,12 @@ def set_state(args):
def start_server(app, host=None, port=None, socket=None):
+logger.info("🌖 Khoj is ready to use")
if socket:
uvicorn.run(app, proxy_headers=True, uds=socket, log_level="debug", use_colors=True, log_config=None)
else:
uvicorn.run(app, host=host, port=port, log_level="debug", use_colors=True, log_config=None)
+logger.info("🌒 Stopping Khoj")
def poll_task_scheduler():

View file

@@ -82,7 +82,7 @@ class JsonlToJsonl(TextToJsonl):
if any(files_with_non_jsonl_extensions):
print(f"[Warning] There maybe non jsonl files in the input set: {files_with_non_jsonl_extensions}")
-logger.info(f"Processing files: {all_jsonl_files}")
+logger.debug(f"Processing files: {all_jsonl_files}")
return all_jsonl_files

View file

@@ -88,7 +88,7 @@ class BeancountToJsonl(TextToJsonl):
if any(files_with_non_beancount_extensions):
print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
-logger.info(f"Processing files: {all_beancount_files}")
+logger.debug(f"Processing files: {all_beancount_files}")
return all_beancount_files
@@ -123,7 +123,7 @@ class BeancountToJsonl(TextToJsonl):
Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{transaction_to_file_map[parsed_entry]}")
)
-logger.info(f"Converted {len(parsed_entries)} transactions to dictionaries")
+logger.debug(f"Converted {len(parsed_entries)} transactions to dictionaries")
return entries

View file

@@ -92,7 +92,7 @@ class MarkdownToJsonl(TextToJsonl):
f"[Warning] There maybe non markdown-mode files in the input set: {files_with_non_markdown_extensions}"
)
-logger.info(f"Processing files: {all_markdown_files}")
+logger.debug(f"Processing files: {all_markdown_files}")
return all_markdown_files
@@ -126,7 +126,7 @@ class MarkdownToJsonl(TextToJsonl):
for parsed_entry in parsed_entries:
entries.append(Entry(compiled=parsed_entry, raw=parsed_entry, file=f"{entry_to_file_map[parsed_entry]}"))
-logger.info(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
+logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries")
return entries

View file

@@ -86,7 +86,7 @@ class OrgToJsonl(TextToJsonl):
if any(files_with_non_org_extensions):
logger.warn(f"There maybe non org-mode files in the input set: {files_with_non_org_extensions}")
-logger.info(f"Processing files: {all_org_files}")
+logger.debug(f"Processing files: {all_org_files}")
return all_org_files

View file

@@ -56,7 +56,7 @@ async def set_config_data(updated_config: FullConfig):
def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Optional[bool] = False):
results: List[SearchResponse] = []
if q is None or q == "":
-logger.info(f"No query param (q) passed in API call to initiate search")
+logger.warn(f"No query param (q) passed in API call to initiate search")
return results
# initialize variables
@@ -66,7 +66,7 @@ def search(q: str, n: Optional[int] = 5, t: Optional[SearchType] = None, r: Opti
# return cached results, if available
query_cache_key = f"{user_query}-{n}-{t}-{r}"
if query_cache_key in state.query_cache:
-logger.info(f"Return response from query cache")
+logger.debug(f"Return response from query cache")
return state.query_cache[query_cache_key]
if (t == SearchType.Org or t == None) and state.model.orgmode_search:
@@ -151,7 +151,7 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False):
logger.error(e)
raise HTTPException(status_code=500, detail=str(e))
else:
-logger.info("Search Index updated via API call")
+logger.info("📬 Search index updated via API")
try:
state.processor_config = configure_processor(state.config.processor)
@@ -159,6 +159,6 @@ def update(t: Optional[SearchType] = None, force: Optional[bool] = False):
logger.error(e)
raise HTTPException(status_code=500, detail=str(e))
else:
-logger.info("Processor reconfigured via API call")
+logger.info("📬 Processor reconfigured via API")
return {"status": "ok", "message": "khoj reloaded"}

View file

@@ -152,7 +152,6 @@ def save_chat_session():
conversation_log["session"].append(session)
else:
conversation_log["session"] = [session]
-logger.info("Added new chat session to conversation logs")
# Save Conversation Metadata Logs to Disk
conversation_logfile = resolve_absolute_path(state.processor_config.conversation.conversation_logfile)
@@ -161,4 +160,4 @@ def save_chat_session():
json.dump(conversation_log, logfile)
state.processor_config.conversation.chat_session = None
-logger.info("Saved updated conversation logs to disk.")
+logger.info("📩 Saved current chat session to conversation logs")

View file

@@ -49,7 +49,7 @@ class FileFilter(BaseFilter):
query = re.sub(self.file_filter_regex, "", query).strip()
cache_key = tuple(files_to_search)
if cache_key in self.cache:
-logger.info(f"Return file filter results from cache")
+logger.debug(f"Return file filter results from cache")
included_entry_indices = self.cache[cache_key]
return query, included_entry_indices

View file

@@ -58,7 +58,7 @@ class WordFilter(BaseFilter):
# Return item from cache if exists
cache_key = tuple(sorted(required_words)), tuple(sorted(blocked_words))
if cache_key in self.cache:
-logger.info(f"Return word filter results from cache")
+logger.debug(f"Return word filter results from cache")
included_entry_indices = self.cache[cache_key]
return query, included_entry_indices

View file

@@ -3,7 +3,6 @@ import glob
import pathlib
import copy
import shutil
-import time
import logging
from typing import List
@@ -50,9 +49,9 @@ def extract_entries(image_directories):
image_names.extend(list(image_directory.glob("*.jpg")))
image_names.extend(list(image_directory.glob("*.jpeg")))
-if logger.level >= logging.INFO:
+if logger.level >= logging.DEBUG:
image_directory_names = ", ".join([str(image_directory) for image_directory in image_directories])
-logger.info(f"Found {len(image_names)} images in {image_directory_names}")
+logger.debug(f"Found {len(image_names)} images in {image_directory_names}")
return sorted(image_names)
@@ -71,7 +70,7 @@ def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=5
# Load pre-computed image embeddings from file if exists
if resolve_absolute_path(embeddings_file).exists() and not regenerate:
image_embeddings = torch.load(embeddings_file)
-logger.info(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}")
+logger.debug(f"Loaded {len(image_embeddings)} image embeddings from {embeddings_file}")
# Else compute the image embeddings from scratch, which can take a while
else:
image_embeddings = []
@@ -89,7 +88,7 @@ def compute_image_embeddings(image_names, encoder, embeddings_file, batch_size=5
# Save computed image embeddings to file
torch.save(image_embeddings, embeddings_file)
-logger.info(f"Saved computed embeddings to {embeddings_file}")
+logger.info(f"📩 Saved computed image embeddings to {embeddings_file}")
return image_embeddings
@@ -102,7 +101,7 @@ def compute_metadata_embeddings(
# Load pre-computed image metadata embedding file if exists
if use_xmp_metadata and resolve_absolute_path(f"{embeddings_file}_metadata").exists() and not regenerate:
image_metadata_embeddings = torch.load(f"{embeddings_file}_metadata")
-logger.info(f"Loaded pre-computed embeddings from {embeddings_file}_metadata")
+logger.debug(f"Loaded image metadata embeddings from {embeddings_file}_metadata")
# Else compute the image metadata embeddings from scratch, which can take a while
if use_xmp_metadata and image_metadata_embeddings is None:
@@ -121,7 +120,7 @@ def compute_metadata_embeddings(
)
continue
torch.save(image_metadata_embeddings, f"{embeddings_file}_metadata")
-logger.info(f"Saved computed metadata embeddings to {embeddings_file}_metadata")
+logger.info(f"📩 Saved computed image metadata embeddings to {embeddings_file}_metadata")
return image_metadata_embeddings
@@ -149,12 +148,12 @@ def query(raw_query, count, model: ImageSearchModel):
query_imagepath = resolve_absolute_path(pathlib.Path(raw_query[5:]), strict=True)
query = copy.deepcopy(Image.open(query_imagepath))
query.thumbnail((640, query.height)) # scale down image for faster processing
-logger.info(f"Find Images by Image: {query_imagepath}")
+logger.info(f"🔎 Find Images by Image: {query_imagepath}")
else:
# Truncate words in query to stay below max_tokens supported by ML model
max_words = 20
query = " ".join(raw_query.split()[:max_words])
-logger.info(f"Find Images by Text: {query}")
+logger.info(f"🔎 Find Images by Text: {query}")
# Now we encode the query (which can either be an image or a text string)
with timer("Query Encode Time", logger):

View file

@@ -1,7 +1,6 @@
# Standard Packages
import logging
from pathlib import Path
-import time
from typing import List, Tuple, Type
# External Packages
@@ -68,7 +67,7 @@ def compute_embeddings(
# Load pre-computed embeddings from file if exists and update them if required
if embeddings_file.exists() and not regenerate:
corpus_embeddings = torch.load(get_absolute_path(embeddings_file), map_location=state.device)
-logger.info(f"Loaded embeddings from {embeddings_file}")
+logger.debug(f"Loaded {len(corpus_embeddings)} text embeddings from {embeddings_file}")
# Encode any new entries in the corpus and update corpus embeddings
new_entries = [entry.compiled for id, entry in entries_with_ids if id == -1]
@@ -95,7 +94,7 @@ def compute_embeddings(
if new_entries:
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
torch.save(corpus_embeddings, embeddings_file)
-logger.info(f"Computed embeddings and saved them to {embeddings_file}")
+logger.info(f"📩 Saved computed text embeddings to {embeddings_file}")
return corpus_embeddings

View file

@@ -58,16 +58,20 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
def load_model(model_name: str, model_type, model_dir=None, device: str = None) -> Union[BaseEncoder, CrossEncoder]:
"Load model from disk or huggingface"
# Construct model path
+logger = logging.getLogger(__name__)
model_path = join(model_dir, model_name.replace("/", "_")) if model_dir is not None else None
# Load model from model_path if it exists there
model_type_class = get_class_by_name(model_type) if isinstance(model_type, str) else model_type
if model_path is not None and resolve_absolute_path(model_path).exists():
+logger.debug(f"Loading {model_name} model from disk")
model = model_type_class(get_absolute_path(model_path), device=device)
# Else load the model from the model_name
else:
+logger.info(f"🤖 Downloading {model_name} model from web")
model = model_type_class(model_name, device=device)
if model_path is not None:
+logger.info(f"📩 Saved {model_name} model to disk")
model.save(model_path)
return model

View file

@@ -31,7 +31,7 @@ def load_jsonl(input_path):
jsonl_file.close()
# Log JSONL entries loaded
-logger.info(f"Loaded {len(data)} records from {input_path}")
+logger.debug(f"Loaded {len(data)} records from {input_path}")
return data
@@ -44,7 +44,7 @@ def dump_jsonl(jsonl_data, output_path):
with open(output_path, "w", encoding="utf-8") as f:
f.write(jsonl_data)
-logger.info(f"Wrote jsonl data to {output_path}")
+logger.debug(f"Wrote jsonl data to {output_path}")
def compress_jsonl_data(jsonl_data, output_path):
@@ -54,4 +54,4 @@ def compress_jsonl_data(jsonl_data, output_path):
with gzip.open(output_path, "wt", encoding="utf-8") as gzip_file:
gzip_file.write(jsonl_data)
-logger.info(f"Wrote jsonl data to gzip compressed jsonl at {output_path}")
+logger.debug(f"Wrote jsonl data to gzip compressed jsonl at {output_path}")