Handle size calculation more gracefully for converted documents, depending on type

2024-11-23 23:48:56 +01:00 · 2024-11-12 02:00:29 -08:00 · 2024-11-12 02:00:29 -08:00 · 8ec1764e42
commit 8ec1764e42
parent b6714c202f
1 changed files with 11 additions and 1 deletions
--- a/src/khoj/routers/api_content.py
+++ b/src/khoj/routers/api_content.py
@@ -422,8 +422,18 @@ async def convert_documents(
                    f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
                ]
                extracted_content = "\n".join(annotated_pages)
            else:
                # Convert content to string
                extracted_content = extracted_content.decode("utf-8")
-            size_in_bytes = len(extracted_content.encode("utf-8"))
+            # Calculate size in bytes. Some of the content might be in bytes, some in str.
+            if isinstance(extracted_content, str):
+                size_in_bytes = len(extracted_content.encode("utf-8"))
+            elif isinstance(extracted_content, bytes):
+                size_in_bytes = len(extracted_content)
+            else:
+                size_in_bytes = 0
+                logger.warning(f"Unexpected content type: {type(extracted_content)}")
            converted_files.append(
                {