From 6a135b1ed795e0c788ecb7535b7f73946901e782 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sat, 6 Jul 2024 15:38:10 +0530
Subject: [PATCH] Fix degraded speed of indexing large files. Improve
 summarization

Adding files to the DB for summarization was slow and buggy in two ways:

- The text of each modified file was re-written to the DB once per
  chunk in that file, instead of just once per file
- The `" ".join(file_content)` call split the file content into
  individual characters separated by spaces, corrupting the original
  file content before it was stored in the DB

Because this code ran in the main file indexing path, it slowed down
file indexing. Knowledge bases with larger files were impacted more
severely.
---
 src/khoj/database/adapters/__init__.py        |  2 +-
 src/khoj/processor/content/text_to_entries.py | 23 ++++++++++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/khoj/database/adapters/__init__.py b/src/khoj/database/adapters/__init__.py
index 3a21a919..cfbe7ca6 100644
--- a/src/khoj/database/adapters/__init__.py
+++ b/src/khoj/database/adapters/__init__.py
@@ -956,7 +956,7 @@ class FileObjectAdapters:
         return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text)
 
     @staticmethod
-    def get_file_objects_by_name(user: KhojUser, file_name: str):
+    def get_file_object_by_name(user: KhojUser, file_name: str):
         return FileObject.objects.filter(user=user, file_name=file_name).first()
 
     @staticmethod
diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py
index af0f95d9..cdb2e207 100644
--- a/src/khoj/processor/content/text_to_entries.py
+++ b/src/khoj/processor/content/text_to_entries.py
@@ -124,7 +124,7 @@ class TextToEntries(ABC):
         deletion_filenames: Set[str] = None,
         user: KhojUser = None,
         regenerate: bool = False,
-        file_to_text_map: dict[str, List[str]] = None,
+        file_to_text_map: dict[str, str] = None,
     ):
         with timer("Constructed current entry hashes in", logger):
             hashes_by_file = dict[str, set[str]]()
@@ -192,16 +192,17 @@ class TextToEntries(ABC):
             logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
 
         if file_to_text_map:
-            # get the list of file_names using added_entries
-            filenames_to_update = [entry.file_path for entry in added_entries]
-            # for each file_name in filenames_to_update, try getting the file object and updating raw_text and if it fails create a new file object
-            for file_name in filenames_to_update:
-                raw_text = " ".join(file_to_text_map[file_name])
-                file_object = FileObjectAdapters.get_file_objects_by_name(user, file_name)
-                if file_object:
-                    FileObjectAdapters.update_raw_text(file_object, raw_text)
-                else:
-                    FileObjectAdapters.create_file_object(user, file_name, raw_text)
+            with timer("Indexed text of modified file in", logger):
+                # get the set of modified files from added_entries
+                modified_files = {entry.file_path for entry in added_entries}
+                # create or update text of each modified file indexed in the DB
+                for modified_file in modified_files:
+                    raw_text = file_to_text_map[modified_file]
+                    file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
+                    if file_object:
+                        FileObjectAdapters.update_raw_text(file_object, raw_text)
+                    else:
+                        FileObjectAdapters.create_file_object(user, modified_file, raw_text)
 
         new_dates = []
         with timer("Indexed dates from added entries in", logger):
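
A minimal sketch of the first bug, the per-chunk duplicate update. The
`Entry` class and file name below are hypothetical stand-ins, not
khoj's real models; the point is only that a list comprehension over
`added_entries` yields one file path per chunk, so the same raw text
was written to the DB once per chunk, while a set yields each modified
file once:

    from dataclasses import dataclass

    @dataclass
    class Entry:
        # hypothetical stand-in for an indexed entry (one per chunk)
        file_path: str

    # three chunks of the same file produce three entries
    added_entries = [Entry("notes.org"), Entry("notes.org"), Entry("notes.org")]

    # before: one DB write per chunk
    filenames_to_update = [entry.file_path for entry in added_entries]
    print(len(filenames_to_update))  # 3 -> same raw_text written 3 times

    # after: a set deduplicates, one DB write per modified file
    modified_files = {entry.file_path for entry in added_entries}
    print(len(modified_files))  # 1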
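
The second bug follows from `str.join` iterating over its argument:
when `file_to_text_map` holds the raw file text as a `str` rather than
a `List[str]`, joining walks the string character by character. A
sketch with a hypothetical file name and content:

    # file_to_text_map maps file name -> raw file text (a str, not List[str])
    file_to_text_map = {"notes.org": "Hello"}

    # before: str.join iterates the string character by character
    raw_text = " ".join(file_to_text_map["notes.org"])
    print(raw_text)  # "H e l l o" -- content corrupted before DB storage

    # after: store the raw string as-is
    raw_text = file_to_text_map["notes.org"]
    print(raw_text)  # "Hello"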