From e6ffb6b52c0d374f1ca3e3db250c8818928ebbc3 Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sat, 6 Jul 2024 14:41:47 +0530 Subject: [PATCH] Improve scaling user flow to delete all entries - Delete entries by batch to improve efficiency of query at scale - Share code to delete all user entries between its async, sync methods - Add indicator to show when files are being deleted on web config page --- src/khoj/database/adapters/__init__.py | 40 +++++++++++-------- .../web/content_source_computer_input.html | 18 +++++---- src/khoj/processor/content/text_to_entries.py | 2 +- src/khoj/routers/api_config.py | 2 +- 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py index 2ea9e9af..3a21a919 100644 --- a/src/khoj/database/adapters/__init__.py +++ b/src/khoj/database/adapters/__init__.py @@ -1012,27 +1012,35 @@ class EntryAdapters: return deleted_count @staticmethod - def delete_all_entries_by_type(user: KhojUser, file_type: str = None): - if file_type is None: - deleted_count, _ = Entry.objects.filter(user=user).delete() - else: - deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete() + def get_entries_by_batch(user: KhojUser, batch_size: int, file_type: str = None, file_source: str = None): + queryset = Entry.objects.filter(user=user) + + if file_type is not None: + queryset = queryset.filter(file_type=file_type) + + if file_source is not None: + queryset = queryset.filter(file_source=file_source) + + while queryset.exists(): + batch_ids = list(queryset.values_list("id", flat=True)[:batch_size]) + yield Entry.objects.filter(id__in=batch_ids) + + @staticmethod + def delete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000): + deleted_count = 0 + for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source): + count, _ = batch.delete() + deleted_count += count return 
deleted_count @staticmethod - def delete_all_entries(user: KhojUser, file_source: str = None): - if file_source is None: - deleted_count, _ = Entry.objects.filter(user=user).delete() - else: - deleted_count, _ = Entry.objects.filter(user=user, file_source=file_source).delete() + async def adelete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000): + deleted_count = 0 + async for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source): + count, _ = await batch.adelete() + deleted_count += count return deleted_count - @staticmethod - async def adelete_all_entries(user: KhojUser, file_source: str = None): - if file_source is None: - return await Entry.objects.filter(user=user).adelete() - return await Entry.objects.filter(user=user, file_source=file_source).adelete() - @staticmethod def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str): return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True) diff --git a/src/khoj/interface/web/content_source_computer_input.html b/src/khoj/interface/web/content_source_computer_input.html index 77816f35..77ce2287 100644 --- a/src/khoj/interface/web/content_source_computer_input.html +++ b/src/khoj/interface/web/content_source_computer_input.html @@ -12,7 +12,7 @@
- +
@@ -112,9 +112,13 @@ // Get all currently indexed files on page load getAllComputerFilenames(); - let deleteAllComputerFilesButton = document.getElementById("delete-all-files"); + let deleteAllComputerFilesButton = document.getElementById("delete-all-files-button"); deleteAllComputerFilesButton.addEventListener("click", function(event) { event.preventDefault(); + originalDeleteAllComputerFilesButtonText = deleteAllComputerFilesButton.textContent; + deleteAllComputerFilesButton.textContent = "🗑️ Deleting..."; + deleteAllComputerFilesButton.disabled = true; + fetch('/api/config/data/content-source/computer', { method: 'DELETE', headers: { @@ -122,11 +126,11 @@ } }) .then(response => response.json()) - .then(data => { - if (data.status == "ok") { - getAllComputerFilenames(); - } - }) + .finally(() => { + getAllComputerFilenames(); + deleteAllComputerFilesButton.textContent = originalDeleteAllComputerFilesButtonText; + deleteAllComputerFilesButton.disabled = false; + }); }); {% endblock %} diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index 49331d6b..af0f95d9 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -137,7 +137,7 @@ class TextToEntries(ABC): if regenerate: with timer("Cleared existing dataset for regeneration in", logger): logger.debug(f"Deleting all entries for file type {file_type}") - num_deleted_entries = EntryAdapters.delete_all_entries_by_type(user, file_type) + num_deleted_entries = EntryAdapters.delete_all_entries(user, file_type=file_type) hashes_to_process = set() with timer("Identified entries to add to database in", logger): diff --git a/src/khoj/routers/api_config.py b/src/khoj/routers/api_config.py index 10b1044c..58a8abae 100644 --- a/src/khoj/routers/api_config.py +++ b/src/khoj/routers/api_config.py @@ -183,7 +183,7 @@ async def remove_content_source_data( raise ValueError(f"Invalid content source: {content_source}") 
elif content_object != "Computer": await content_object.objects.filter(user=user).adelete() - await sync_to_async(EntryAdapters.delete_all_entries)(user, content_source) + await sync_to_async(EntryAdapters.delete_all_entries)(user, file_source=content_source) enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user) return {"status": "ok"}