Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-23 15:38:55 +01:00
Fix degradation in speed of indexing large files. Improve summarization

Adding files to the DB for summarization was slow and buggy in two ways:
- The same text of a modified file was updated in the DB once per chunk of that file, instead of once per file.
- The `" ".join(file_content)` call separated every character of the file content with a space, corrupting the original file content before it was stored in the DB.

Because this code ran in the main file indexing path, it slowed down file indexing. Knowledge bases with larger files were impacted more strongly.
Parent: e6ffb6b52c
Commit: 6a135b1ed7
2 changed files with 13 additions and 12 deletions
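A minimal illustration of the " ".join(file_content) bug described in the commit message (illustrative only, not repo code): when file_content is a plain string rather than a list of chunks, str.join iterates over its characters and interleaves a space between every one of them.

    file_content = "abc"
    print(" ".join(file_content))    # 'a b c' -> every character separated by a space
    print(" ".join(["abc", "def"]))  # 'abc def' -> the behaviour the old code presumably expected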
@@ -956,7 +956,7 @@ class FileObjectAdapters:
         return FileObject.objects.create(user=user, file_name=file_name, raw_text=raw_text)
 
     @staticmethod
-    def get_file_objects_by_name(user: KhojUser, file_name: str):
+    def get_file_object_by_name(user: KhojUser, file_name: str):
         return FileObject.objects.filter(user=user, file_name=file_name).first()
 
     @staticmethod
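The rename from get_file_objects_by_name to get_file_object_by_name matches what the method actually returns: `.filter(...).first()` yields a single FileObject or None, not a queryset. A minimal sketch of that distinction using standard Django query semantics (illustrative only, not repo code):

    # Illustrative only: plural vs singular return shapes in Django queries.
    objects = FileObject.objects.filter(user=user, file_name="notes.org")             # QuerySet with 0..n rows
    one_object = FileObject.objects.filter(user=user, file_name="notes.org").first()  # single FileObject, or None if absent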
@@ -124,7 +124,7 @@ class TextToEntries(ABC):
         deletion_filenames: Set[str] = None,
         user: KhojUser = None,
         regenerate: bool = False,
-        file_to_text_map: dict[str, List[str]] = None,
+        file_to_text_map: dict[str, str] = None,
     ):
         with timer("Constructed current entry hashes in", logger):
             hashes_by_file = dict[str, set[str]]()
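The signature change narrows file_to_text_map from a list of text chunks per file to the file's full raw text, so it can be stored in FileObject.raw_text without re-joining. A hypothetical example of the shape before and after (values made up for illustration):

    # Hypothetical example values, not from the repo.
    file_to_text_map_old = {"notes.org": ["* Heading", "Some body text"]}  # before: list of entry chunks per file
    file_to_text_map_new = {"notes.org": "* Heading\nSome body text"}      # after: full raw text per file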
@@ -192,16 +192,17 @@ class TextToEntries(ABC):
         logger.debug(f"Added {len(added_entries)} {file_type} entries to database")
 
         if file_to_text_map:
-            # get the list of file_names using added_entries
-            filenames_to_update = [entry.file_path for entry in added_entries]
-            # for each file_name in filenames_to_update, try getting the file object and updating raw_text and if it fails create a new file object
-            for file_name in filenames_to_update:
-                raw_text = " ".join(file_to_text_map[file_name])
-                file_object = FileObjectAdapters.get_file_objects_by_name(user, file_name)
-                if file_object:
-                    FileObjectAdapters.update_raw_text(file_object, raw_text)
-                else:
-                    FileObjectAdapters.create_file_object(user, file_name, raw_text)
+            with timer("Indexed text of modified file in", logger):
+                # get the set of modified files from added_entries
+                modified_files = {entry.file_path for entry in added_entries}
+                # create or update text of each updated file indexed on DB
+                for modified_file in modified_files:
+                    raw_text = file_to_text_map[modified_file]
+                    file_object = FileObjectAdapters.get_file_object_by_name(user, modified_file)
+                    if file_object:
+                        FileObjectAdapters.update_raw_text(file_object, raw_text)
+                    else:
+                        FileObjectAdapters.create_file_object(user, modified_file, raw_text)
 
         new_dates = []
         with timer("Indexed dates from added entries in", logger):
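To make the duplicate-write bug concrete: added_entries holds one entry per indexed chunk, so the old list of file paths repeated each modified file once per chunk, while the new set comprehension collapses that to one database write per file. A small, self-contained illustration with made-up entries:

    # Made-up entries: one entry per indexed chunk of a file.
    class Entry:
        def __init__(self, file_path: str):
            self.file_path = file_path

    added_entries = [Entry("notes.org"), Entry("notes.org"), Entry("todo.org")]

    filenames_to_update = [e.file_path for e in added_entries]  # old: ['notes.org', 'notes.org', 'todo.org'] -> 3 writes for 2 files
    modified_files = {e.file_path for e in added_entries}       # new: {'notes.org', 'todo.org'} -> 1 write per file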