mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Only check content type if file extension cannot identify text file
This commit is contained in:
parent
7d2ef728e6
commit
5c7797dbca
2 changed files with 10 additions and 13 deletions
|
@@ -51,11 +51,10 @@ def get_plaintext_files(config: TextContentConfig) -> dict[str, str]:
|
|||
def is_plaintextfile(file: str) -> bool:
    """Check if file is a plaintext file.

    The file counts as plaintext when its path ends with one of the known
    text file extensions, or when magika places its content in the "text"
    or "code" group.

    Args:
        file: Path to the file, as a string.

    Returns:
        True if the file is considered plaintext, False otherwise.
    """
    # Identify the content group by inspecting the file on disk
    content_group = magika.identify_path(Path(file)).output.group
    # Use file extension to decide plaintext if file content is not identifiable.
    # Every extension needs its own comma-separated entry: adjacent string
    # literals are implicitly concatenated ("org" "mbox" == "orgmbox"), which
    # would silently drop both org and mbox files from the extension check.
    valid_text_file_extensions = ("txt", "md", "markdown", "org", "mbox", "rst", "html", "htm", "xml")
    return file.endswith(valid_text_file_extensions) or content_group in ["text", "code"]
|
||||
|
||||
def extract_html_content(html_content: str):
|
||||
"Extract content from HTML"
|
||||
|
|
|
@@ -102,22 +102,20 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
|||
|
||||
# Infer content type from reading file content
|
||||
try:
|
||||
content_identity = magika.identify_bytes(file_content).output
|
||||
content_type = content_identity.mime_type
|
||||
content_group = content_identity.group
|
||||
content_group = magika.identify_bytes(file_content).output.group
|
||||
except Exception:
|
||||
# Fallback to using just file type if content type cannot be inferred
|
||||
content_type = file_type
|
||||
content_group = "unknown"
|
||||
|
||||
if file_type in ["text/markdown"] and content_group in ["code", "text"]:
|
||||
if file_type in ["text/markdown"]:
|
||||
return "markdown", encoding
|
||||
elif file_type in ["text/org"] and content_group in ["code", "text"]:
|
||||
elif file_type in ["text/org"]:
|
||||
return "org", encoding
|
||||
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
||||
elif file_type in ["application/pdf"]:
|
||||
return "pdf", encoding
|
||||
elif file_type in ["image/jpeg"] and content_type == "image/jpeg":
|
||||
elif file_type in ["image/jpeg"]:
|
||||
return "jpeg", encoding
|
||||
elif file_type in ["image/png"] and content_type == "image/png":
|
||||
elif file_type in ["image/png"]:
|
||||
return "png", encoding
|
||||
elif content_group in ["code", "text"]:
|
||||
return "plaintext", encoding
|
||||
|
|
Loading…
Add table
Reference in a new issue