From 89915dcb4ced821f64f14add259bd7e3f09560fc Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky <debanjum@gmail.com>
Date: Wed, 3 Apr 2024 10:24:16 +0530
Subject: [PATCH] Identify file type by content & allow server to index all
 text files

- Use Magika's AI model as a tiny, portable and more accurate file type
  identification system
- Existing file type identification tools like `file' and `magic'
  require system-level packages that may not be installed by default
  on all operating systems (e.g. the `file' command on Windows)
---
 pyproject.toml              |  1 +
 src/khoj/routers/indexer.py |  7 +++----
 src/khoj/utils/fs_syncer.py | 12 ++++++++++--
 src/khoj/utils/helpers.py   | 28 +++++++++++++++++++++-------
 4 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e446a2a1..a0667981 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
     "openai >= 1.0.0",
     "tiktoken >= 0.3.2",
     "tenacity >= 8.2.2",
+    "magika ~= 0.5.1",
     "pillow ~= 9.5.0",
     "pydantic >= 2.0.0",
     "pyyaml ~= 6.0",
diff --git a/src/khoj/routers/indexer.py b/src/khoj/routers/indexer.py
index 1bca6a25..d25d166a 100644
--- a/src/khoj/routers/indexer.py
+++ b/src/khoj/routers/indexer.py
@@ -67,11 +67,10 @@ async def update(
     try:
         logger.info(f"📬 Updating content index via API call by {client} client")
         for file in files:
-            file_type, encoding = get_file_type(file.content_type)
+            file_content = file.file.read()
+            file_type, encoding = get_file_type(file.content_type, file_content)
             if file_type in index_files:
-                index_files[file_type][file.filename] = (
-                    file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read()  # type: ignore
-                )
+                index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
             else:
                 logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
 
diff --git a/src/khoj/utils/fs_syncer.py b/src/khoj/utils/fs_syncer.py
index 31bc13b9..f9b7fc62 100644
--- a/src/khoj/utils/fs_syncer.py
+++ b/src/khoj/utils/fs_syncer.py
@@ -1,9 +1,11 @@
 import glob
 import logging
 import os
+from pathlib import Path
 from typing import Optional
 
 from bs4 import BeautifulSoup
+from magika import Magika
 
 from khoj.database.models import (
     LocalMarkdownConfig,
@@ -16,6 +18,7 @@ from khoj.utils.helpers import get_absolute_path, is_none_or_empty
 from khoj.utils.rawconfig import TextContentConfig
 
 logger = logging.getLogger(__name__)
+magika = Magika()
 
 
 def collect_files(search_type: Optional[SearchType] = SearchType.All, user=None) -> dict:
@@ -47,6 +50,11 @@ def construct_config_from_db(db_config) -> TextContentConfig:
 def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
     def is_plaintextfile(file: str):
         "Check if file is plaintext file"
+        # Identify file type from file content
+        mime_type = magika.identify_path(Path(file)).output.mime_type
+        if mime_type != "inode/x-empty" and mime_type != "application/unknown":
+            return mime_type.startswith("text/")
+        # Use file extension to decide plaintext if file content is not identifiable
         return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
 
     def extract_html_content(html_content: str):
@@ -65,7 +73,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
         logger.debug("At least one of input-files or input-file-filter is required to be specified")
         return {}
 
-    "Get all files to process"
+    # Get all plain text files to process
     absolute_plaintext_files, filtered_plaintext_files = set(), set()
     if input_files:
         absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
@@ -209,7 +217,7 @@ def get_pdf_files(config: TextContentConfig):
         logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
         return {}
 
-    "Get PDF files to process"
+    # Get PDF files to process
     absolute_pdf_files, filtered_pdf_files = set(), set()
     if pdf_files:
         absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
diff --git a/src/khoj/utils/helpers.py b/src/khoj/utils/helpers.py
index d713c335..d5bf9b4b 100644
--- a/src/khoj/utils/helpers.py
+++ b/src/khoj/utils/helpers.py
@@ -19,6 +19,7 @@ from urllib.parse import urlparse
 
 import torch
 from asgiref.sync import sync_to_async
+from magika import Magika
 
 from khoj.utils import constants
 
@@ -29,6 +30,10 @@ if TYPE_CHECKING:
     from khoj.utils.rawconfig import AppConfig
 
 
+# Initialize Magika for file type identification
+magika = Magika()
+
+
 class AsyncIteratorWrapper:
     def __init__(self, obj):
         self._it = iter(obj)
@@ -88,22 +93,31 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
     return merged_dict
 
 
-def get_file_type(file_type: str) -> tuple[str, str]:
+def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
     "Get file type from file mime type"
 
+    # Extract encoding from file_type
     encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None
     file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
-    if file_type in ["text/markdown"]:
+
+    # Infer content type from reading file content
+    try:
+        content_type = magika.identify_bytes(file_content).output.mime_type
+    except Exception:
+        # Fall back to using just file type if content type cannot be inferred
+        content_type = file_type
+
+    if file_type in ["text/markdown"] and content_type.startswith("text/"):
         return "markdown", encoding
-    elif file_type in ["text/org"]:
+    elif file_type in ["text/org"] and content_type.startswith("text/"):
         return "org", encoding
-    elif file_type in ["application/pdf"]:
+    elif file_type in ["application/pdf"] and content_type == "application/pdf":
         return "pdf", encoding
-    elif file_type in ["image/jpeg"]:
+    elif file_type in ["image/jpeg"] and content_type == "image/jpeg":
         return "jpeg", encoding
-    elif file_type in ["image/png"]:
+    elif file_type in ["image/png"] and content_type == "image/png":
         return "png", encoding
-    elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
+    elif content_type.startswith("text/"):
         return "plaintext", encoding
     else:
         return "other", encoding