Label pages when extracting text from PDF and DOCX content. Fix scroll area in doc preview.

2024-11-23 15:38:55 +01:00 · 2024-11-08 14:53:20 -08:00 · 2024-11-08 14:53:20 -08:00 · ad46b0e718
commit ad46b0e718
parent ee062d1c48
3 changed files with 16 additions and 3 deletions
--- a/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
+++ b/src/interface/web/app/components/chatInputArea/chatInputArea.tsx
@ -262,6 +262,11 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr

            return await response.json();
        } catch (error) {
+            setError(
+                "Error converting files. " +
+                    error +
+                    ". Please try again, or contact team@khoj.dev if the issue persists.",
+            );
            console.error("Error converting files:", error);
            return [];
        }
--- a/src/interface/web/app/components/chatMessage/chatMessage.tsx
+++ b/src/interface/web/app/components/chatMessage/chatMessage.tsx
@ -728,7 +728,9 @@ const ChatMessage = forwardRef<HTMLDivElement, ChatMessageProps>((props, ref) =>
                                    <DialogTitle>{file.name}</DialogTitle>
                                </DialogHeader>
                                <DialogDescription>
-                                    <ScrollArea className="max-h-96">{file.content}</ScrollArea>
+                                    <ScrollArea className="h-72 w-full rounded-md">
+                                        {file.content}
+                                    </ScrollArea>
                                </DialogDescription>
                            </DialogContent>
                        </Dialog>
--- a/src/khoj/routers/api_content.py
+++ b/src/khoj/routers/api_content.py
@ -396,11 +396,17 @@ async def convert_documents(

            if file_data.file_type == "docx":
                entries_per_page = DocxToEntries.extract_text(file_data.content)
-                extracted_content = "\n".join(entries_per_page)
+                annotated_pages = [
+                    f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
+                ]
+                extracted_content = "\n".join(annotated_pages)

            elif file_data.file_type == "pdf":
                entries_per_page = PdfToEntries.extract_text(file_data.content)
-                extracted_content = "\n".join(entries_per_page)
+                annotated_pages = [
+                    f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
+                ]
+                extracted_content = "\n".join(annotated_pages)

            size_in_bytes = len(extracted_content.encode("utf-8"))