mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Process text content files in sorted order for stable indexing
- Image search already uses a sorted list of images to process - Prevents the index of entries from desyncing when entries and embeddings are generated by a separate server/app instance
This commit is contained in:
parent
a701ad08b9
commit
536f03af8f
6 changed files with 12 additions and 10 deletions
|
@ -77,12 +77,14 @@ def get_beancount_files(beancount_files=None, beancount_file_filters=None):
|
|||
for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
|
||||
}
|
||||
|
||||
all_beancount_files = absolute_beancount_files | filtered_beancount_files
|
||||
all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files)
|
||||
|
||||
files_with_non_beancount_extensions = {beancount_file
|
||||
files_with_non_beancount_extensions = {
|
||||
beancount_file
|
||||
for beancount_file
|
||||
in all_beancount_files
|
||||
if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")}
|
||||
if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")
|
||||
}
|
||||
if any(files_with_non_beancount_extensions):
|
||||
print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
|
||||
|
||||
|
|
|
@ -75,7 +75,7 @@ def get_markdown_files(markdown_files=None, markdown_file_filters=None):
|
|||
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
|
||||
}
|
||||
|
||||
all_markdown_files = absolute_markdown_files | filtered_markdown_files
|
||||
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
||||
|
||||
files_with_non_markdown_extensions = {
|
||||
md_file
|
||||
|
|
|
@ -82,7 +82,7 @@ def get_org_files(org_files=None, org_file_filters=None):
|
|||
for filtered_file in glob.glob(get_absolute_path(org_file_filter))
|
||||
}
|
||||
|
||||
all_org_files = absolute_org_files | filtered_org_files
|
||||
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
||||
|
||||
files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
|
||||
if any(files_with_non_org_extensions):
|
||||
|
|
|
@ -89,7 +89,7 @@ def test_get_beancount_files(tmp_path):
|
|||
create_file(tmp_path, filename="not-included-ledger.bean")
|
||||
create_file(tmp_path, filename="not-included-text.txt")
|
||||
|
||||
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
||||
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
||||
|
||||
# Setup input-files, input-filters
|
||||
input_files = [tmp_path / 'ledger.bean']
|
||||
|
|
|
@ -86,7 +86,7 @@ def test_get_markdown_files(tmp_path):
|
|||
create_file(tmp_path, filename="not-included-markdown.md")
|
||||
create_file(tmp_path, filename="not-included-text.txt")
|
||||
|
||||
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
||||
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
||||
|
||||
# Setup input-files, input-filters
|
||||
input_files = [tmp_path / 'notes.md']
|
||||
|
|
|
@ -95,7 +95,7 @@ def test_get_org_files(tmp_path):
|
|||
create_file(tmp_path, filename="orgfile2.org")
|
||||
create_file(tmp_path, filename="text1.txt")
|
||||
|
||||
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
|
||||
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
|
||||
|
||||
# Setup input-files, input-filters
|
||||
input_files = [tmp_path / 'orgfile1.org']
|
||||
|
|
Loading…
Reference in a new issue