diff --git a/src/khoj/app/settings.py b/src/khoj/app/settings.py
index 2cb4440b..93f449ab 100644
--- a/src/khoj/app/settings.py
+++ b/src/khoj/app/settings.py
@@ -112,7 +112,7 @@ ASGI_APPLICATION = "app.asgi.application"
 
 # Database
 # https://docs.djangoproject.com/en/4.2/ref/settings/#databases
-
+DATA_UPLOAD_MAX_NUMBER_FIELDS = 20000
 DATABASES = {
     "default": {
         "ENGINE": "django.db.backends.postgresql",
diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py
index 2ea9e9af..cfbe7ca6 100644
--- a/src/khoj/database/adapters/__init__.py
+++ b/src/khoj/database/adapters/__init__.py
@@ -956,7 +956,7 @@ class FileObjectAdapters:
         return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text)
 
     @staticmethod
-    def get_file_objects_by_name(user: KhojUser, file_name: str):
+    def get_file_object_by_name(user: KhojUser, file_name: str):
         return FileObject.objects.filter(user=user, file_name=file_name).first()
 
     @staticmethod
@@ -1012,27 +1012,35 @@ class EntryAdapters:
         return deleted_count
 
     @staticmethod
-    def delete_all_entries_by_type(user: KhojUser, file_type: str = None):
-        if file_type is None:
-            deleted_count, _ = Entry.objects.filter(user=user).delete()
-        else:
-            deleted_count, _ = Entry.objects.filter(user=user, file_type=file_type).delete()
+    def get_entries_by_batch(user: KhojUser, batch_size: int, file_type: str = None, file_source: str = None):
+        queryset = Entry.objects.filter(user=user)
+
+        if file_type is not None:
+            queryset = queryset.filter(file_type=file_type)
+
+        if file_source is not None:
+            queryset = queryset.filter(file_source=file_source)
+
+        while queryset.exists():
+            batch_ids = list(queryset.values_list("id", flat=True)[:batch_size])
+            yield Entry.objects.filter(id__in=batch_ids)
+
+    @staticmethod
+    def delete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000):
+        deleted_count = 0
+        for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source):
+            count, _ = batch.delete()
+            deleted_count += count
         return deleted_count
 
     @staticmethod
-    def delete_all_entries(user: KhojUser, file_source: str = None):
-        if file_source is None:
-            deleted_count, _ = Entry.objects.filter(user=user).delete()
-        else:
-            deleted_count, _ = Entry.objects.filter(user=user, file_source=file_source).delete()
+    async def adelete_all_entries(user: KhojUser, file_type: str = None, file_source: str = None, batch_size=1000):
+        deleted_count = 0
+        async for batch in EntryAdapters.get_entries_by_batch(user, batch_size, file_type, file_source):
+            count, _ = await batch.adelete()
+            deleted_count += count
         return deleted_count
 
-    @staticmethod
-    async def adelete_all_entries(user: KhojUser, file_source: str = None):
-        if file_source is None:
-            return await Entry.objects.filter(user=user).adelete()
-        return await Entry.objects.filter(user=user, file_source=file_source).adelete()
-
     @staticmethod
     def get_existing_entry_hashes_by_file(user: KhojUser, file_path: str):
         return Entry.objects.filter(user=user, file_path=file_path).values_list("hashed_value", flat=True)
diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py
index 95e3508c..d3e630c3 100644
--- a/src/khoj/database/admin.py
+++ b/src/khoj/database/admin.py
@@ -125,7 +125,10 @@ class EntryAdmin(admin.ModelAdmin):
         "file_path",
     )
     search_fields = ("id", "user__email", "user__username", "file_path")
-    list_filter = ("file_type",)
+    list_filter = (
+        "file_type",
+        "user__email",
+    )
     ordering = ("-created_at",)
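The batched delete above trades one unbounded DELETE for a series of bounded ones, which keeps each transaction small for users with very large indexes. A minimal usage sketch, not part of the patch; it assumes an existing KhojUser instance named `user` and that "github" is a valid file_source value:

    # Delete a user's GitHub-sourced entries in batches of 500 (illustrative sketch)
    deleted_count = EntryAdapters.delete_all_entries(user, file_source="github", batch_size=500)
    print(f"Deleted {deleted_count} entries")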
diff --git a/src/khoj/interface/web/content_source_computer_input.html b/src/khoj/interface/web/content_source_computer_input.html
index 49f8bdc5..ac3209c3 100644
--- a/src/khoj/interface/web/content_source_computer_input.html
+++ b/src/khoj/interface/web/content_source_computer_input.html
@@ -12,7 +12,7 @@
-
+
@@ -115,9 +115,13 @@
         // Get all currently indexed files on page load
        getAllComputerFilenames();
 
-        let deleteAllComputerFilesButton = document.getElementById("delete-all-files");
+        let deleteAllComputerFilesButton = document.getElementById("delete-all-files-button");
         deleteAllComputerFilesButton.addEventListener("click", function(event) {
             event.preventDefault();
+            originalDeleteAllComputerFilesButtonText = deleteAllComputerFilesButton.textContent;
+            deleteAllComputerFilesButton.textContent = "🗑️ Deleting...";
+            deleteAllComputerFilesButton.disabled = true;
+
             fetch('/api/config/data/content-source/computer', {
                 method: 'DELETE',
                 headers: {
@@ -125,11 +129,11 @@
                 }
             })
             .then(response => response.json())
-            .then(data => {
-                if (data.status == "ok") {
-                    getAllComputerFilenames();
-                }
-            })
+            .finally(() => {
+                getAllComputerFilenames();
+                deleteAllComputerFilesButton.textContent = originalDeleteAllComputerFilesButtonText;
+                deleteAllComputerFilesButton.disabled = false;
+            });
         });
     </script>
 {% endblock %}
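With the id rename and the `.finally()` handler above, the delete button now disables itself while the request is in flight and re-enables afterwards, even if the request fails. The endpoint the button calls can also be exercised directly; a sketch assuming a local Khoj server on its default port 42110 and a hypothetical API token:

    # Sketch: trigger the same deletion the button performs, from Python
    import requests

    response = requests.delete(
        "http://localhost:42110/api/config/data/content-source/computer",
        headers={"Authorization": "Bearer <your-khoj-api-token>"},  # token is an assumption
    )
    print(response.json())  # expect {"status": "ok"}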
diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py
index ae0bd822..f18e1e21 100644
--- a/src/khoj/processor/content/markdown/markdown_to_entries.py
+++ b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -146,7 +146,7 @@ class MarkdownToEntries(TextToEntries):
         else:
             entry_filename = str(Path(raw_filename))
 
-        heading = parsed_entry.splitlines()[0] if re.search("^#+\s", parsed_entry) else ""
+        heading = parsed_entry.splitlines()[0] if re.search(r"^#+\s", parsed_entry) else ""
         # Append base filename to compiled entry for context to model
         # Increment heading level for heading entries and make filename as its top level heading
         prefix = f"# {entry_filename}\n#" if heading else f"# {entry_filename}\n"
diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py
index af85a6bd..c528244d 100644
--- a/src/khoj/processor/content/org_mode/org_to_entries.py
+++ b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -115,14 +115,20 @@ class OrgToEntries(TextToEntries):
             return entries, entry_to_file_map
 
         # Split this entry tree into sections by the next heading level in it
-        # Increment heading level until able to split entry into sections
+        # Increment heading level until able to split entry into sections or reach max heading level
         # A successful split will result in at least 2 sections
+        max_heading_level = 100
         next_heading_level = len(ancestry)
         sections: List[str] = []
-        while len(sections) < 2:
+        while len(sections) < 2 and next_heading_level < max_heading_level:
             next_heading_level += 1
             sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
 
+        # If unable to split entry into sections, log error and skip indexing it
+        if next_heading_level == max_heading_level:
+            logger.error(f"Unable to split current entry chunk: {org_content_with_ancestry[:20]}. Skip indexing it.")
+            return entries, entry_to_file_map
+
         # Recurse down each non-empty section after parsing its body, heading and ancestry
         for section in sections:
             # Skip empty sections
@@ -135,7 +141,7 @@ class OrgToEntries(TextToEntries):
             # If first non-empty line is a heading with expected heading level
             if re.search(rf"^\*{{{next_heading_level}}}\s", first_non_empty_line):
                 # Extract the section body without the heading
-                current_section_body = "\n".join(section.split(first_non_empty_line)[1:])
+                current_section_body = "\n".join(section.split(first_non_empty_line, 1)[1:])
                 # Parse the section heading into current section ancestry
                 current_section_title = first_non_empty_line[next_heading_level:].strip()
                 current_ancestry[next_heading_level] = current_section_title
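The `maxsplit=1` fix in the second org hunk matters because an org heading can be a substring of its own children's headings: "** 1" occurs inside "*** 1.1", so an unbounded `str.split` on the heading also cuts the section body apart. A standalone before/after illustration:

    # Why section.split(heading) mangled section bodies, while split(heading, 1) does not
    section = "** 1\n*** 1.1\n**** 1.1.2"
    heading = "** 1"
    print(section.split(heading))     # ['', '\n*', '.1\n**', '.1.2'] -- body torn apart
    print(section.split(heading, 1))  # ['', '\n*** 1.1\n**** 1.1.2'] -- body intact

The markdown change is the same hygiene in miniature: the raw string `r"^#+\s"` stops Python from treating `\s` as a string escape before the pattern ever reaches `re`.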
diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py
index 49331d6b..cdb2e207 100644
--- a/src/khoj/processor/content/text_to_entries.py
+++ b/src/khoj/processor/content/text_to_entries.py
@@ -124,7 +124,7 @@ class TextToEntries(ABC):
         deletion_filenames: Set[str] = None,
         user: KhojUser = None,
         regenerate: bool = False,
-        file_to_text_map: dict[str, List[str]] = None,
+        file_to_text_map: dict[str, str] = None,
     ):
         with timer("Constructed current entry hashes in", logger):
             hashes_by_file = dict[str, set[str]]()
@@ -137,7 +137,7 @@ class TextToEntries(ABC):
         if regenerate:
             with timer("Cleared existing dataset for regeneration in", logger):
                 logger.debug(f"Deleting all entries for file type {file_type}")
-                num_deleted_entries = EntryAdapters.delete_all_entries_by_type(user, file_type)
+                num_deleted_entries = EntryAdapters.delete_all_entries(user, file_type=file_type)
 
         hashes_to_process = set()
         with timer("Identified entries to add to database in", logger):
@@ -192,16 +192,17 @@ class TextToEntries(ABC):
             logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
 
         if file_to_text_map:
-            # get the list of file_names using added_entries
-            filenames_to_update = [entry.file_path for entry in added_entries]
-            # for each file_name in filenames_to_update, try getting the file object and updating raw_text and if it fails create a new file object
-            for file_name in filenames_to_update:
-                raw_text = " ".join(file_to_text_map[file_name])
-                file_object = FileObjectAdapters.get_file_objects_by_name(user, file_name)
-                if file_object:
-                    FileObjectAdapters.update_raw_text(file_object, raw_text)
-                else:
-                    FileObjectAdapters.create_file_object(user, file_name, raw_text)
+            with timer("Indexed text of modified file in", logger):
+                # get the set of modified files from added_entries
+                modified_files = {entry.file_path for entry in added_entries}
+                # create or update text of each updated file indexed on DB
+                for modified_file in modified_files:
+                    raw_text = file_to_text_map[modified_file]
+                    file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
+                    if file_object:
+                        FileObjectAdapters.update_raw_text(file_object, raw_text)
+                    else:
+                        FileObjectAdapters.create_file_object(user, modified_file, raw_text)
 
         new_dates = []
         with timer("Indexed dates from added entries in", logger):
diff --git a/src/khoj/routers/api_config.py b/src/khoj/routers/api_config.py
index 10b1044c..58a8abae 100644
--- a/src/khoj/routers/api_config.py
+++ b/src/khoj/routers/api_config.py
@@ -183,7 +183,7 @@ async def remove_content_source_data(
         raise ValueError(f"Invalid content source: {content_source}")
     elif content_object != "Computer":
         await content_object.objects.filter(user=user).adelete()
-    await sync_to_async(EntryAdapters.delete_all_entries)(user, content_source)
+    await sync_to_async(EntryAdapters.delete_all_entries)(user, file_source=content_source)
 
     enabled_content = await sync_to_async(EntryAdapters.get_unique_file_types)(user)
     return {"status": "ok"}
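The rewritten block in text_to_entries.py performs a manual upsert: fetch the FileObject by name, update its raw text if present, create it otherwise. Django's built-in `update_or_create` would express the same thing in one call; a sketch of that alternative, not what the patch does, and assuming `user`, `modified_file`, and `raw_text` as bound in the loop above:

    # Equivalent one-call upsert (sketch)
    file_object, created = FileObject.objects.update_or_create(
        user=user, file_name=modified_file, defaults={"raw_text": raw_text}
    )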
diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py
index e8940269..a84fe6e8 100644
--- a/tests/test_org_to_entries.py
+++ b/tests/test_org_to_entries.py
@@ -1,5 +1,6 @@
 import os
 import re
+import time
 
 from khoj.processor.content.org_mode.org_to_entries import OrgToEntries
 from khoj.processor.content.text_to_entries import TextToEntries
@@ -41,6 +42,35 @@ def test_configure_indexing_heading_only_entries(tmp_path):
         assert is_none_or_empty(entries[1])
 
 
+def test_extract_entries_when_child_headings_have_same_prefix():
+    """Extract org entries from entries having child headings with same prefix.
+    Prevents regressions like the one fixed in PR #840.
+    """
+    # Arrange
+    tmp_path = "tests/data/org/same_prefix_headings.org"
+    entry: str = """
+** 1
+*** 1.1
+**** 1.1.2
+""".strip()
+    data = {
+        f"{tmp_path}": entry,
+    }
+
+    # Act
+    # Extract Entries from specified Org files
+    start = time.time()
+    entries = OrgToEntries.extract_org_entries(org_files=data, max_tokens=2)
+    end = time.time()
+    indexing_time = end - start
+
+    # Assert
+    explanation_msg = (
+        "It should not take more than 6 seconds to index. Entry extraction may have gone into an infinite loop."
+    )
+    assert indexing_time < 6 * len(entries), explanation_msg
+
+
 def test_entry_split_when_exceeds_max_tokens():
     "Ensure entries with compiled words exceeding max_tokens are split."
     # Arrange
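The new regression test guards the `max_heading_level` cap added to org_to_entries.py: entries whose child headings share a prefix could previously keep the split loop spinning. The failure mode is easy to reproduce in isolation; a sketch of the capped loop on content that never yields two sections, where the constants mirror the patch and the sample text is an assumption:

    import re

    org_content = "* A\nsome body text\n"  # one heading, nothing deeper to split on
    next_heading_level, max_heading_level = 1, 100  # pretend ancestry depth is 1
    sections: list[str] = []
    while len(sections) < 2 and next_heading_level < max_heading_level:
        next_heading_level += 1
        sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
    print(next_heading_level)  # 100 -- the loop bails out at the cap instead of looping forever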