From ad46b0e7184859d6474044a9bcfb28b0e66c5086 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Fri, 8 Nov 2024 14:53:20 -0800 Subject: [PATCH] Label pages when extracting text from pdf, docs content. Fix scroll area in doc preview. --- .../web/app/components/chatInputArea/chatInputArea.tsx | 5 +++++ .../web/app/components/chatMessage/chatMessage.tsx | 4 +++- src/khoj/routers/api_content.py | 10 ++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx index 2f58fcb5..a5b2482b 100644 --- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx +++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx @@ -262,6 +262,11 @@ export const ChatInputArea = forwardRef((pr return await response.json(); } catch (error) { + setError( + "Error converting files. " + + error + + ". Please try again, or contact team@khoj.dev if the issue persists.", + ); console.error("Error converting files:", error); return []; } diff --git a/src/interface/web/app/components/chatMessage/chatMessage.tsx b/src/interface/web/app/components/chatMessage/chatMessage.tsx index 9f9cff12..9deaf955 100644 --- a/src/interface/web/app/components/chatMessage/chatMessage.tsx +++ b/src/interface/web/app/components/chatMessage/chatMessage.tsx @@ -728,7 +728,9 @@ const ChatMessage = forwardRef((props, ref) => {file.name} - {file.content} + + {file.content} + diff --git a/src/khoj/routers/api_content.py b/src/khoj/routers/api_content.py index a83e0538..eb808c22 100644 --- a/src/khoj/routers/api_content.py +++ b/src/khoj/routers/api_content.py @@ -396,11 +396,17 @@ async def convert_documents( if file_data.file_type == "docx": entries_per_page = DocxToEntries.extract_text(file_data.content) - extracted_content = "\n".join(entries_per_page) + annotated_pages = [ + f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in 
enumerate(entries_per_page) + ] + extracted_content = "\n".join(annotated_pages) elif file_data.file_type == "pdf": entries_per_page = PdfToEntries.extract_text(file_data.content) - extracted_content = "\n".join(entries_per_page) + annotated_pages = [ + f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page) + ] + extracted_content = "\n".join(annotated_pages) size_in_bytes = len(extracted_content.encode("utf-8"))