Fix identification of PDF files on server

The previous commit introduced a bug that stopped PDF files from being indexed:
it checked whether content_group, instead of mime_type, is application/pdf.
This commit is contained in:
Debanjum Singh Solanky 2024-04-12 02:49:39 +05:30
parent 07f8fb5c5b
commit 7d2ef728e6

View file

@ -102,14 +102,16 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
# Infer content type from reading file content
try:
content_type = magika.identify_bytes(file_content).output.group
content_identity = magika.identify_bytes(file_content).output
content_type = content_identity.mime_type
content_group = content_identity.group
except Exception:
# Fallback to using just file type if content type cannot be inferred
content_type = file_type
if file_type in ["text/markdown"] and content_type in ["code", "text"]:
if file_type in ["text/markdown"] and content_group in ["code", "text"]:
return "markdown", encoding
elif file_type in ["text/org"] and content_type in ["code", "text"]:
elif file_type in ["text/org"] and content_group in ["code", "text"]:
return "org", encoding
elif file_type in ["application/pdf"] and content_type == "application/pdf":
return "pdf", encoding
@ -117,7 +119,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
return "jpeg", encoding
elif file_type in ["image/png"] and content_type == "image/png":
return "png", encoding
elif content_type in ["code", "text"]:
elif content_group in ["code", "text"]:
return "plaintext", encoding
else:
return "other", encoding