Label pages when extracting text from PDF and DOCX content. Fix scroll area in doc preview.

This commit is contained in:
sabaimran 2024-11-08 14:53:20 -08:00
parent ee062d1c48
commit ad46b0e718
3 changed files with 16 additions and 3 deletions

View file

@ -262,6 +262,11 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
return await response.json(); return await response.json();
} catch (error) { } catch (error) {
setError(
"Error converting files. " +
error +
". Please try again, or contact team@khoj.dev if the issue persists.",
);
console.error("Error converting files:", error); console.error("Error converting files:", error);
return []; return [];
} }

View file

@ -728,7 +728,9 @@ const ChatMessage = forwardRef<HTMLDivElement, ChatMessageProps>((props, ref) =>
<DialogTitle>{file.name}</DialogTitle> <DialogTitle>{file.name}</DialogTitle>
</DialogHeader> </DialogHeader>
<DialogDescription> <DialogDescription>
<ScrollArea className="max-h-96">{file.content}</ScrollArea> <ScrollArea className="h-72 w-full rounded-md">
{file.content}
</ScrollArea>
</DialogDescription> </DialogDescription>
</DialogContent> </DialogContent>
</Dialog> </Dialog>

View file

@ -396,11 +396,17 @@ async def convert_documents(
if file_data.file_type == "docx": if file_data.file_type == "docx":
entries_per_page = DocxToEntries.extract_text(file_data.content) entries_per_page = DocxToEntries.extract_text(file_data.content)
extracted_content = "\n".join(entries_per_page) annotated_pages = [
f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
]
extracted_content = "\n".join(annotated_pages)
elif file_data.file_type == "pdf": elif file_data.file_type == "pdf":
entries_per_page = PdfToEntries.extract_text(file_data.content) entries_per_page = PdfToEntries.extract_text(file_data.content)
extracted_content = "\n".join(entries_per_page) annotated_pages = [
f"Page {index} of {file_data.name}:\n\n{entry}" for index, entry in enumerate(entries_per_page)
]
extracted_content = "\n".join(annotated_pages)
size_in_bytes = len(extracted_content.encode("utf-8")) size_in_bytes = len(extracted_content.encode("utf-8"))