From d27dc71dfecf3f395a7200e7622ed6b7054543fc Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Tue, 17 Oct 2023 02:37:20 -0700 Subject: [PATCH] Use encoding of each file set in indexer request to read file. Get the encoding type from the multipart/form-data request body for each file. Read text files as UTF-8; read PDFs and images as binary. --- src/interface/desktop/main.js | 2 +- src/khoj/routers/indexer.py | 6 ++++-- src/khoj/utils/helpers.py | 17 +++++++++-------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js index 62493f54..17ab2fb4 100644 --- a/src/interface/desktop/main.js +++ b/src/interface/desktop/main.js @@ -93,9 +93,9 @@ function filenameToMimeType (filename) { case 'png': return 'image/png'; case 'jpg': - return 'image/jpeg'; case 'jpeg': return 'image/jpeg'; + case 'md': case 'markdown': return 'text/markdown'; case 'org': diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py index 86cd847f..d94b8330 100644 --- a/src/khoj/routers/indexer.py +++ b/src/khoj/routers/indexer.py @@ -73,7 +73,7 @@ async def index_batch( plaintext_files: Dict[str, str] = {} for file in files: - file_type = get_file_type(file.content_type) + file_type, encoding = get_file_type(file.content_type) dict_to_update = None if file_type == "org": dict_to_update = org_files @@ -85,7 +85,9 @@ async def index_batch( dict_to_update = plaintext_files if dict_to_update is not None: - dict_to_update[file.filename] = file.file.read().decode("utf-8") + dict_to_update[file.filename] = ( + file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() + ) else: logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}") diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py index 3391a55d..9209ff67 100644 --- a/src/khoj/utils/helpers.py +++ b/src/khoj/utils/helpers.py @@ -66,24 +66,25 @@ def merge_dicts(priority_dict: dict, default_dict: dict): return
merged_dict -def get_file_type(file_type: str) -> str: +def get_file_type(file_type: str) -> tuple[str, str]: "Get file type from file mime type" + encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type if file_type in ["text/markdown"]: - return "markdown" + return "markdown", encoding elif file_type in ["text/org"]: - return "org" + return "org", encoding elif file_type in ["application/pdf"]: - return "pdf" + return "pdf", encoding elif file_type in ["image/jpeg"]: - return "jpeg" + return "jpeg", encoding elif file_type in ["image/png"]: - return "png" + return "png", encoding elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]: - return "plaintext" + return "plaintext", encoding else: - return "other" + return "other", encoding def load_model(