Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-23 15:38:55 +01:00
Fix degradation in speed of indexing large files. Improve summarization

Adding files to the DB for summarization was slow and buggy in two ways:
- The same text of a modified file was updated in the DB once per chunk of that file, instead of once per file.
- The `" ".join(file_content)` call separated every character of the file content with a space, corrupting the original file content before it was stored in the DB.

Because this code ran in the main file indexing path, it slowed down file indexing. Knowledge bases with larger files were impacted more strongly.
Parent: e6ffb6b52c
Commit: 6a135b1ed7
2 changed files with 13 additions and 12 deletions
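A minimal illustration of the " ".join(file_content) bug described in the commit message (illustrative only, not repo code): when file_content is a plain string rather than a list of chunks, str.join iterates over its characters and interleaves a space between every one of them.

    file_content = "abc"
    print(" ".join(file_content))    # 'a b c' -> every character separated by a space
    print(" ".join(["abc", "def"]))  # 'abc def' -> the behaviour the old code presumably expected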
@@ -956,7 +956,7 @@ class FileObjectAdapters:
         return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text)
 
     @staticmethod
-    def get_file_objects_by_name(user: KhojUser, file_name: str):
+    def get_file_object_by_name(user: KhojUser, file_name: str):
         return FileObject.objects.filter(user=user, file_name=file_name).first()
 
     @staticmethod
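The rename from get_file_objects_by_name to get_file_object_by_name matches what the method actually returns: `.filter(...).first()` yields a single FileObject or None, not a queryset. A minimal sketch of that distinction using standard Django query semantics (illustrative only, not repo code):

    # Illustrative only: plural vs singular return shapes in Django queries.
    objects = FileObject.objects.filter(user=user, file_name="notes.org")             # QuerySet with 0..n rows
    one_object = FileObject.objects.filter(user=user, file_name="notes.org").first()  # single FileObject, or None if absent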
@@ -124,7 +124,7 @@ class TextToEntries(ABC):
         deletion_filenames: Set[str] = None,
         user: KhojUser = None,
         regenerate: bool = False,
-        file_to_text_map: dict[str, List[str]] = None,
+        file_to_text_map: dict[str, str] = None,
     ):
         with timer("Constructed current entry hashes in", logger):
             hashes_by_file = dict[str, set[str]]()
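The signature change narrows file_to_text_map from a list of text chunks per file to the file's full raw text, so it can be stored in FileObject.raw_text without re-joining. A hypothetical example of the shape before and after (values made up for illustration):

    # Hypothetical example values, not from the repo.
    file_to_text_map_old = {"notes.org": ["* Heading", "Some body text"]}  # before: list of entry chunks per file
    file_to_text_map_new = {"notes.org": "* Heading\nSome body text"}      # after: full raw text per file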
@@ -192,16 +192,17 @@ class TextToEntries(ABC):
         logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
 
         if file_to_text_map:
-            # get the list of file_names using added_entries
-            filenames_to_update = [entry.file_path for entry in added_entries]
-            # for each file_name in filenames_to_update, try getting the file object and updating raw_text and if it fails create a new file object
-            for file_name in filenames_to_update:
-                raw_text = " ".join(file_to_text_map[file_name])
-                file_object = FileObjectAdapters.get_file_objects_by_name(user, file_name)
-                if file_object:
-                    FileObjectAdapters.update_raw_text(file_object, raw_text)
-                else:
-                    FileObjectAdapters.create_file_object(user, file_name, raw_text)
+            with timer("Indexed text of modified file in", logger):
+                # get the set of modified files from added_entries
+                modified_files = {entry.file_path for entry in added_entries}
+                # create or update text of each updated file indexed on DB
+                for modified_file in modified_files:
+                    raw_text = file_to_text_map[modified_file]
+                    file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
+                    if file_object:
+                        FileObjectAdapters.update_raw_text(file_object, raw_text)
+                    else:
+                        FileObjectAdapters.create_file_object(user, modified_file, raw_text)
 
         new_dates = []
         with timer("Indexed dates from added entries in", logger):
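To make the duplicate-write bug concrete: added_entries holds one entry per indexed chunk, so the old list of file paths repeated each modified file once per chunk, while the new set comprehension collapses that to one database write per file. A small, self-contained illustration with made-up entries:

    # Made-up entries: one entry per indexed chunk of a file.
    class Entry:
        def __init__(self, file_path: str):
            self.file_path = file_path

    added_entries = [Entry("notes.org"), Entry("notes.org"), Entry("todo.org")]

    filenames_to_update = [e.file_path for e in added_entries]  # old: ['notes.org', 'notes.org', 'todo.org'] -> 3 writes for 2 files
    modified_files = {e.file_path for e in added_entries}       # new: {'notes.org', 'todo.org'} -> 1 write per file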