mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Fix identifying pdf files on server
The previous commit introduced a bug that stopped PDF files from being indexed: it checked whether content_group, instead of mime_type, is application/pdf.
This commit is contained in:
parent
07f8fb5c5b
commit
7d2ef728e6
1 changed file with 6 additions and 4 deletions
|
@ -102,14 +102,16 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
||||||
|
|
||||||
# Infer content type from reading file content
|
# Infer content type from reading file content
|
||||||
try:
|
try:
|
||||||
content_type = magika.identify_bytes(file_content).output.group
|
content_identity = magika.identify_bytes(file_content).output
|
||||||
|
content_type = content_identity.mime_type
|
||||||
|
content_group = content_identity.group
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fallback to using just file type if content type cannot be inferred
|
# Fallback to using just file type if content type cannot be inferred
|
||||||
content_type = file_type
|
content_type = file_type
|
||||||
|
|
||||||
if file_type in ["text/markdown"] and content_type in ["code", "text"]:
|
if file_type in ["text/markdown"] and content_group in ["code", "text"]:
|
||||||
return "markdown", encoding
|
return "markdown", encoding
|
||||||
elif file_type in ["text/org"] and content_type in ["code", "text"]:
|
elif file_type in ["text/org"] and content_group in ["code", "text"]:
|
||||||
return "org", encoding
|
return "org", encoding
|
||||||
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
elif file_type in ["application/pdf"] and content_type == "application/pdf":
|
||||||
return "pdf", encoding
|
return "pdf", encoding
|
||||||
|
@ -117,7 +119,7 @@ def get_file_type(file_type: str, file_content: bytes) -> tuple[str, str]:
|
||||||
return "jpeg", encoding
|
return "jpeg", encoding
|
||||||
elif file_type in ["image/png"] and content_type == "image/png":
|
elif file_type in ["image/png"] and content_type == "image/png":
|
||||||
return "png", encoding
|
return "png", encoding
|
||||||
elif content_type in ["code", "text"]:
|
elif content_group in ["code", "text"]:
|
||||||
return "plaintext", encoding
|
return "plaintext", encoding
|
||||||
else:
|
else:
|
||||||
return "other", encoding
|
return "other", encoding
|
||||||
|
|
Loading…
Reference in a new issue