mirror of
https://github.com/khoj-ai/khoj.git
synced 2025-02-17 08:04:21 +00:00
Improve scaling user flow to delete all entries
- Delete entries in batches to improve query efficiency at scale
- Share the code that deletes all user entries between its async and sync methods
- Add an indicator to show when files are being deleted on the web config page
This commit is contained in:
parent
1ab59865b5
commit
e6ffb6b52c
4 changed files with 37 additions and 25 deletions
|
@ -1012,27 +1012,35 @@ class EntryAdapters:
|
||||||
return deleted_count
|
return deleted_count
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_all_entries_by_type(user: KhojUser, file_type: str = None):
|
def get_entries_by_batch(user: KhojUser, batch_size: int, file_type: str = None, file_source: str = None):
|
||||||
if file_type is None:
|
queryset = Entry.objects.filter(user=user)
|
||||||
deleted_count, _ = Entry.objects.filter(user=user).delete()
|
|
||||||
else:
|
if file_type is not None:
|
||||||
deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete()
|
queryset = queryset.filter(file_type=file_type)
|
||||||
|
|
||||||
|
if file_source is not None:
|
||||||
|
queryset = queryset.filter(file_source=file_source)
|
||||||
|
|
||||||
|
while queryset.exists():
|
||||||
|
batch_ids = list(queryset.values_list("id", flat=True)[:batch_size])
|
||||||
|
yield Entry.objects.filter(id__in=batch_ids)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def delete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000):
|
||||||
|
deleted_count = 0
|
||||||
|
for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source):
|
||||||
|
count, _ = batch.delete()
|
||||||
|
deleted_count += count
|
||||||
return deleted_count
|
return deleted_count
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_all_entries(user: KhojUser, file_source: str = None):
|
async def adelete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000):
|
||||||
if file_source is None:
|
deleted_count = 0
|
||||||
deleted_count, _ = Entry.objects.filter(user=user).delete()
|
async for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source):
|
||||||
else:
|
count, _ = await batch.adelete()
|
||||||
deleted_count, _ = Entry.objects.filter(user=user, file_source=file_source).delete()
|
deleted_count += count
|
||||||
return deleted_count
|
return deleted_count
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
async def adelete_all_entries(user: KhojUser, file_source: str = None):
|
|
||||||
if file_source is None:
|
|
||||||
return await Entry.objects.filter(user=user).adelete()
|
|
||||||
return await Entry.objects.filter(user=user, file_source=file_source).adelete()
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
|
def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
|
||||||
return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
|
return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
</h2>
|
</h2>
|
||||||
<div class="section-manage-files">
|
<div class="section-manage-files">
|
||||||
<div id="delete-all-files" class="delete-all-files">
|
<div id="delete-all-files" class="delete-all-files">
|
||||||
<button id="delete-all-files" type="submit" title="Remove all computer files from Khoj">🗑️ Delete all</button>
|
<button id="delete-all-files-button" type="submit" title="Remove all computer files from Khoj">🗑️ Delete all</button>
|
||||||
</div>
|
</div>
|
||||||
<div class="indexed-files">
|
<div class="indexed-files">
|
||||||
</div>
|
</div>
|
||||||
|
@ -112,9 +112,13 @@
|
||||||
// Get all currently indexed files on page load
|
// Get all currently indexed files on page load
|
||||||
getAllComputerFilenames();
|
getAllComputerFilenames();
|
||||||
|
|
||||||
let deleteAllComputerFilesButton = document.getElementById("delete-all-files");
|
let deleteAllComputerFilesButton = document.getElementById("delete-all-files-button");
|
||||||
deleteAllComputerFilesButton.addEventListener("click", function(event) {
|
deleteAllComputerFilesButton.addEventListener("click", function(event) {
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
|
originalDeleteAllComputerFilesButtonText = deleteAllComputerFilesButton.textContent;
|
||||||
|
deleteAllComputerFilesButton.textContent = "🗑️ Deleting...";
|
||||||
|
deleteAllComputerFilesButton.disabled = true;
|
||||||
|
|
||||||
fetch('/api/config/data/content-source/computer', {
|
fetch('/api/config/data/content-source/computer', {
|
||||||
method: 'DELETE',
|
method: 'DELETE',
|
||||||
headers: {
|
headers: {
|
||||||
|
@ -122,11 +126,11 @@
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.then(response => response.json())
|
.then(response => response.json())
|
||||||
.then(data => {
|
.finally(() => {
|
||||||
if (data.status == "ok") {
|
getAllComputerFilenames();
|
||||||
getAllComputerFilenames();
|
deleteAllComputerFilesButton.textContent = originalDeleteAllComputerFilesButtonText;
|
||||||
}
|
deleteAllComputerFilesButton.disabled = false;
|
||||||
})
|
});
|
||||||
});
|
});
|
||||||
</script>
|
</script>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
|
@ -137,7 +137,7 @@ class TextToEntries(ABC):
|
||||||
if regenerate:
|
if regenerate:
|
||||||
with timer("Cleared existing dataset for regeneration in", logger):
|
with timer("Cleared existing dataset for regeneration in", logger):
|
||||||
logger.debug(f"Deleting all entries for file type {file_type}")
|
logger.debug(f"Deleting all entries for file type {file_type}")
|
||||||
num_deleted_entries = EntryAdapters.delete_all_entries_by_type(user, file_type)
|
num_deleted_entries = EntryAdapters.delete_all_entries(user, file_type=file_type)
|
||||||
|
|
||||||
hashes_to_process = set()
|
hashes_to_process = set()
|
||||||
with timer("Identified entries to add to database in", logger):
|
with timer("Identified entries to add to database in", logger):
|
||||||
|
|
|
@ -183,7 +183,7 @@ async def remove_content_source_data(
|
||||||
raise ValueError(f"Invalid content source: {content_source}")
|
raise ValueError(f"Invalid content source: {content_source}")
|
||||||
elif content_object != "Computer":
|
elif content_object != "Computer":
|
||||||
await content_object.objects.filter(user=user).adelete()
|
await content_object.objects.filter(user=user).adelete()
|
||||||
await sync_to_async(EntryAdapters.delete_all_entries)(user, content_source)
|
await sync_to_async(EntryAdapters.delete_all_entries)(user, file_source=content_source)
|
||||||
|
|
||||||
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
|
enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
|
||||||
return {"status": "ok"}
|
return {"status": "ok"}
|
||||||
|
|
Loading…
Add table
Reference in a new issue