From d27dc71dfecf3f395a7200e7622ed6b7054543fc Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Tue, 17 Oct 2023 02:37:20 -0700
Subject: [PATCH] Use encoding of each file set in indexer request to read file

Get the encoding of each file from the content type of its part in the
multipart/form-data request body.
Read text files as UTF-8. Read PDFs and images as binary.
---
 src/interface/desktop/main.js |  2 +-
 src/khoj/routers/indexer.py   |  6 ++++--
 src/khoj/utils/helpers.py     | 17 +++++++++--------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/interface/desktop/main.js b/src/interface/desktop/main.js
index 62493f54..17ab2fb4 100644
--- a/src/interface/desktop/main.js
+++ b/src/interface/desktop/main.js
@@ -93,9 +93,9 @@ function filenameToMimeType (filename) {
         case 'png':
             return 'image/png';
         case 'jpg':
-            return 'image/jpeg';
         case 'jpeg':
             return 'image/jpeg';
+        case 'md':
         case 'markdown':
             return 'text/markdown';
         case 'org':
diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py
index 86cd847f..d94b8330 100644
--- a/src/khoj/routers/indexer.py
+++ b/src/khoj/routers/indexer.py
@@ -73,7 +73,7 @@ async def index_batch(
         plaintext_files: Dict[str, str] = {}
 
         for file in files:
-            file_type = get_file_type(file.content_type)
+            file_type, encoding = get_file_type(file.content_type)
             dict_to_update = None
             if file_type == "org":
                 dict_to_update = org_files
@@ -85,7 +85,9 @@ async def index_batch(
                 dict_to_update = plaintext_files
 
             if dict_to_update is not None:
-                dict_to_update[file.filename] = file.file.read().decode("utf-8")
+                dict_to_update[file.filename] = (
+                    file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()
+                )
             else:
                 logger.warning(f"Skipped indexing unsupported file type sent by client: {file.filename}")
 
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index 3391a55d..9209ff67 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -66,24 +66,46 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
     return merged_dict
 
 
-def get_file_type(file_type: str) -> str:
-    "Get file type from file mime type"
+def get_file_type(file_type: str) -> "tuple[str, str | None]":
+    """Get file type and text encoding from the MIME content type of a file.
+
+    Args:
+        file_type: Content type of the file, e.g. "text/markdown; charset=utf-8".
+
+    Returns:
+        A (file type, encoding) tuple. The file type is one of "markdown", "org",
+        "pdf", "jpeg", "png", "plaintext" or "other". The encoding is the
+        lower-cased charset parameter of the content type, or None if it is unset.
+    """
+    if not file_type:
+        return "other", None
 
-    file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
+    # MIME type and parameter names are case-insensitive. Only the charset
+    # parameter names an encoding; ignore others (e.g. boundary, format).
+    file_type, *parameters = file_type.split(";")
+    file_type = file_type.strip().lower()
+    encoding = None
+    for parameter in parameters:
+        name, _, value = parameter.partition("=")
+        if name.strip().lower() == "charset":
+            # The charset value may be quoted, e.g. charset="utf-8"
+            encoding = value.strip().strip("\"'").lower() or None
+            break
+
     if file_type in ["text/markdown"]:
-        return "markdown"
+        return "markdown", encoding
     elif file_type in ["text/org"]:
-        return "org"
+        return "org", encoding
     elif file_type in ["application/pdf"]:
-        return "pdf"
+        return "pdf", encoding
     elif file_type in ["image/jpeg"]:
-        return "jpeg"
+        return "jpeg", encoding
     elif file_type in ["image/png"]:
-        return "png"
+        return "png", encoding
     elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
-        return "plaintext"
+        return "plaintext", encoding
     else:
-        return "other"
+        return "other", encoding
 
 
 def load_model(