From dd36303bb71f61f71b2bbfe4b42812504b46fb48 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Mon, 11 Nov 2024 12:53:06 -0800 Subject: [PATCH] Fix sending file attachments in save_to_conversation method - When files attached but upload fails, don't update the state variables - Make removing null characters in pdf extraction more space efficient --- .../chatInputArea/chatInputArea.tsx | 36 ++++++++++--------- .../processor/content/pdf/pdf_to_entries.py | 14 ++++---- src/khoj/routers/api_chat.py | 2 -- 3 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx index 88a4fcae..1c86a1b6 100644 --- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx +++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx @@ -237,26 +237,28 @@ export const ChatInputArea = forwardRef((pr ? Array.from(nonImageFiles).concat(Array.from(attachedFiles || [])) : Array.from(attachedFiles || []); - // Ensure files are below size limit (10 MB) - for (let i = 0; i < newFiles.length; i++) { - if (newFiles[i].size > 10 * 1024 * 1024) { - setWarning( - `File ${newFiles[i].name} is too large. Please upload files smaller than 10 MB.`, - ); - return; + if (newFiles.length > 0) { + // Ensure files are below size limit (10 MB) + for (let i = 0; i < newFiles.length; i++) { + if (newFiles[i].size > 10 * 1024 * 1024) { + setWarning( + `File ${newFiles[i].name} is too large. 
Please upload files smaller than 10 MB.`, + ); + return; + } } + + const dataTransfer = new DataTransfer(); + newFiles.forEach((file) => dataTransfer.items.add(file)); + + // Extract text from files + extractTextFromFiles(dataTransfer.files).then((data) => { + props.setUploadedFiles(data); + setAttachedFiles(dataTransfer.files); + setConvertedAttachedFiles(data); + }); } - const dataTransfer = new DataTransfer(); - newFiles.forEach((file) => dataTransfer.items.add(file)); - setAttachedFiles(dataTransfer.files); - - // Extract text from files - extractTextFromFiles(dataTransfer.files).then((data) => { - props.setUploadedFiles(data); - setConvertedAttachedFiles(data); - }); - // Set focus to the input for user message after uploading files chatInputRef?.current?.focus(); } diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 7d2bd384..2b7122ea 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -1,7 +1,6 @@ import logging import tempfile -from io import BytesIO -from typing import Dict, List, Tuple +from typing import Dict, Final, List, Tuple from langchain_community.document_loaders import PyMuPDFLoader @@ -15,6 +14,9 @@ logger = logging.getLogger(__name__) class PdfToEntries(TextToEntries): + # Class-level constant translation table + NULL_TRANSLATOR: Final = str.maketrans("", "", "\x00") + def __init__(self): super().__init__() @@ -112,8 +114,6 @@ class PdfToEntries(TextToEntries): @staticmethod def clean_text(text: str) -> str: - # Remove null bytes - text = text.replace("\x00", "") - # Replace invalid Unicode - text = text.encode("utf-8", errors="ignore").decode("utf-8") - return text + """Clean PDF text by removing null bytes.""" + # Use faster translation table instead of replace + return text.translate(PdfToEntries.NULL_TRANSLATOR) diff --git a/src/khoj/routers/api_chat.py 
b/src/khoj/routers/api_chat.py index 33476a97..6752de78 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -1133,7 +1133,6 @@ async def chat( online_results=online_results, query_images=uploaded_images, train_of_thought=train_of_thought, - attached_file_context=attached_file_context, raw_query_files=raw_query_files, tracer=tracer, ) @@ -1194,7 +1193,6 @@ async def chat( online_results=online_results, query_images=uploaded_images, train_of_thought=train_of_thought, - attached_file_context=attached_file_context, raw_query_files=raw_query_files, tracer=tracer, )