mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 09:25:06 +01:00
Identify file type by content & allow server to index all text files
- Use Magika's AI for a tiny, portable and better file type identification system
- Existing file type identification tools like `file` and `magic` require system-level packages that may not be installed by default on all operating systems (e.g. the `file` command on Windows)
This commit is contained in:
parent
1c229dad91
commit
89915dcb4c
4 changed files with 35 additions and 13 deletions
|
@@ -46,6 +46,7 @@ dependencies = [
|
||||||
"openai >= 1.0.0",
|
"openai >= 1.0.0",
|
||||||
"tiktoken >= 0.3.2",
|
"tiktoken >= 0.3.2",
|
||||||
"tenacity >= 8.2.2",
|
"tenacity >= 8.2.2",
|
||||||
|
"magika ~= 0.5.1",
|
||||||
"pillow ~= 9.5.0",
|
"pillow ~= 9.5.0",
|
||||||
"pydantic >= 2.0.0",
|
"pydantic >= 2.0.0",
|
||||||
"pyyaml ~= 6.0",
|
"pyyaml ~= 6.0",
|
||||||
|
|
|
@@ -67,11 +67,10 @@ async def update(
|
||||||
try:
|
try:
|
||||||
logger.info(f"📬 Updating content index via API call by {client} client")
|
logger.info(f"📬 Updating content index via API call by {client} client")
|
||||||
for file in files:
|
for file in files:
|
||||||
file_type, encoding = get_file_type(file.content_type)
|
file_content = file.file.read()
|
||||||
|
file_type, encoding = get_file_type(file.content_type, file_content)
|
||||||
if file_type in index_files:
|
if file_type in index_files:
|
||||||
index_files[file_type][file.filename] = (
|
index_files[file_type][file.filename] = file_content.decode(encoding) if encoding else file_content
|
||||||
file.file.read().decode("utf-8") if encoding == "utf-8" else file.file.read() # type: ignore
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
logger.warning(f"Skipped indexing unsupported file type sent by {client} client: {file.filename}")
|
||||||
|
|
||||||
|
|
|
@@ -1,9 +1,11 @@
|
||||||
import glob
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from magika import Magika
|
||||||
|
|
||||||
from khoj.database.models import (
|
from khoj.database.models import (
|
||||||
LocalMarkdownConfig,
|
LocalMarkdownConfig,
|
||||||
|
@@ -16,6 +18,7 @@ from khoj.utils.helpers import get_absolute_path, is_none_or_empty
|
||||||
from khoj.utils.rawconfig import TextContentConfig
|
from khoj.utils.rawconfig import TextContentConfig
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
magika = Magika()
|
||||||
|
|
||||||
|
|
||||||
def collect_files(search_type: Optional[SearchType] = SearchType.All, user=None) -> dict:
|
def collect_files(search_type: Optional[SearchType] = SearchType.All, user=None) -> dict:
|
||||||
|
@@ -47,6 +50,11 @@ def construct_config_from_db(db_config) -> TextContentConfig:
|
||||||
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
||||||
def is_plaintextfile(file: str):
|
def is_plaintextfile(file: str):
|
||||||
"Check if file is plaintext file"
|
"Check if file is plaintext file"
|
||||||
|
# Check if file path exists
|
||||||
|
mime_type = magika.identify_path(Path(file)).output.mime_type
|
||||||
|
if mime_type != "inode/x-empty" and mime_type != "application/unknown":
|
||||||
|
return mime_type.startswith("text/")
|
||||||
|
# Use file extension to decide plaintext if file content is not identifiable
|
||||||
return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
|
return file.endswith(("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml"))
|
||||||
|
|
||||||
def extract_html_content(html_content: str):
|
def extract_html_content(html_content: str):
|
||||||
|
@@ -65,7 +73,7 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
||||||
logger.debug("At least one of input-files or input-file-filter is required to be specified")
|
logger.debug("At least one of input-files or input-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
"Get all files to process"
|
# Get all plain text files to process
|
||||||
absolute_plaintext_files, filtered_plaintext_files = set(), set()
|
absolute_plaintext_files, filtered_plaintext_files = set(), set()
|
||||||
if input_files:
|
if input_files:
|
||||||
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
|
absolute_plaintext_files = {get_absolute_path(jsonl_file) for jsonl_file in input_files}
|
||||||
|
@@ -209,7 +217,7 @@ def get_pdf_files(config: TextContentConfig):
|
||||||
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
|
logger.debug("At least one of pdf-files or pdf-file-filter is required to be specified")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
"Get PDF files to process"
|
# Get PDF files to process
|
||||||
absolute_pdf_files, filtered_pdf_files = set(), set()
|
absolute_pdf_files, filtered_pdf_files = set(), set()
|
||||||
if pdf_files:
|
if pdf_files:
|
||||||
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
|
absolute_pdf_files = {get_absolute_path(pdf_file) for pdf_file in pdf_files}
|
||||||
|
|
|
@@ -19,6 +19,7 @@ from urllib.parse import urlparse
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from asgiref.sync import sync_to_async
|
from asgiref.sync import sync_to_async
|
||||||
|
from magika import Magika
|
||||||
|
|
||||||
from khoj.utils import constants
|
from khoj.utils import constants
|
||||||
|
|
||||||
|
@@ -29,6 +30,10 @@ if TYPE_CHECKING:
|
||||||
from khoj.utils.rawconfig import AppConfig
|
from khoj.utils.rawconfig import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
# Initialize Magika for file type identification
|
||||||
|
magika = Magika()
|
||||||
|
|
||||||
|
|
||||||
class AsyncIteratorWrapper:
|
class AsyncIteratorWrapper:
|
||||||
def __init__(self, obj):
|
def __init__(self, obj):
|
||||||
self._it = iter(obj)
|
self._it = iter(obj)
|
||||||
|
@@ -88,22 +93,31 @@ def merge_dicts(priority_dict: dict, default_dict: dict):
|
||||||
return merged_dict
|
return merged_dict
|
||||||
|
|
||||||
|
|
||||||
def get_file_type(file_type: str) -> tuple[str, str]:
|
def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
||||||
"Get file type from file mime type"
|
"Get file type from file mime type"
|
||||||
|
|
||||||
|
# Extract encoding from file_type
|
||||||
encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None
|
encoding = file_type.split("=")[1].strip().lower() if ";" in file_type else None
|
||||||
file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
|
file_type = file_type.split(";")[0].strip() if ";" in file_type else file_type
|
||||||
if file_type in ["text/markdown"]:
|
|
||||||
|
# Infer content type from reading file content
|
||||||
|
try:
|
||||||
|
content_type = magika.identify_bytes(file_content).output.mime_type
|
||||||
|
except Exception:
|
||||||
|
# Fallback to using just file type if content type cannot be inferred
|
||||||
|
content_type = file_type
|
||||||
|
|
||||||
|
if file_type in ["text/markdown"] and content_type.startswith("text/"):
|
||||||
return "markdown", encoding
|
return "markdown", encoding
|
||||||
elif file_type in ["text/org"]:
|
elif file_type in ["text/org"] and content_type.startswith("text/"):
|
||||||
return "org", encoding
|
return "org", encoding
|
||||||
elif file_type in ["application/pdf"]:
|
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
||||||
return "pdf", encoding
|
return "pdf", encoding
|
||||||
elif file_type in ["image/jpeg"]:
|
elif file_type in ["image/jpeg"] and content_type == "image/jpeg":
|
||||||
return "jpeg", encoding
|
return "jpeg", encoding
|
||||||
elif file_type in ["image/png"]:
|
elif file_type in ["image/png"] and content_type == "image/png":
|
||||||
return "png", encoding
|
return "png", encoding
|
||||||
elif file_type in ["text/plain", "text/html", "application/xml", "text/x-rst"]:
|
elif content_type.startswith("text/"):
|
||||||
return "plaintext", encoding
|
return "plaintext", encoding
|
||||||
else:
|
else:
|
||||||
return "other", encoding
|
return "other", encoding
|
||||||
|
|
Loading…
Reference in a new issue