mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-24 07:55:07 +01:00
Process text content files in sorted order for stable indexing
- Image search already uses a sorted list of images to process
- Prevents the index of entries from desyncing when the entries and embeddings are generated by a separate server/app instance
This commit is contained in:
parent
a701ad08b9
commit
536f03af8f
6 changed files with 12 additions and 10 deletions
|
@ -77,12 +77,14 @@ def get_beancount_files(beancount_files=None, beancount_file_filters=None):
|
||||||
for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
|
for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
|
||||||
}
|
}
|
||||||
|
|
||||||
all_beancount_files = absolute_beancount_files | filtered_beancount_files
|
all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files)
|
||||||
|
|
||||||
files_with_non_beancount_extensions = {beancount_file
|
files_with_non_beancount_extensions = {
|
||||||
|
beancount_file
|
||||||
for beancount_file
|
for beancount_file
|
||||||
in all_beancount_files
|
in all_beancount_files
|
||||||
if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")}
|
if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")
|
||||||
|
}
|
||||||
if any(files_with_non_beancount_extensions):
|
if any(files_with_non_beancount_extensions):
|
||||||
print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
|
print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")
|
||||||
|
|
||||||
|
|
|
@ -75,7 +75,7 @@ def get_markdown_files(markdown_files=None, markdown_file_filters=None):
|
||||||
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
|
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
|
||||||
}
|
}
|
||||||
|
|
||||||
all_markdown_files = absolute_markdown_files | filtered_markdown_files
|
all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
|
||||||
|
|
||||||
files_with_non_markdown_extensions = {
|
files_with_non_markdown_extensions = {
|
||||||
md_file
|
md_file
|
||||||
|
|
|
@ -82,7 +82,7 @@ def get_org_files(org_files=None, org_file_filters=None):
|
||||||
for filtered_file in glob.glob(get_absolute_path(org_file_filter))
|
for filtered_file in glob.glob(get_absolute_path(org_file_filter))
|
||||||
}
|
}
|
||||||
|
|
||||||
all_org_files = absolute_org_files | filtered_org_files
|
all_org_files = sorted(absolute_org_files | filtered_org_files)
|
||||||
|
|
||||||
files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
|
files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
|
||||||
if any(files_with_non_org_extensions):
|
if any(files_with_non_org_extensions):
|
||||||
|
|
|
@ -89,7 +89,7 @@ def test_get_beancount_files(tmp_path):
|
||||||
create_file(tmp_path, filename="not-included-ledger.bean")
|
create_file(tmp_path, filename="not-included-ledger.bean")
|
||||||
create_file(tmp_path, filename="not-included-text.txt")
|
create_file(tmp_path, filename="not-included-text.txt")
|
||||||
|
|
||||||
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
# Setup input-files, input-filters
|
||||||
input_files = [tmp_path / 'ledger.bean']
|
input_files = [tmp_path / 'ledger.bean']
|
||||||
|
|
|
@ -86,7 +86,7 @@ def test_get_markdown_files(tmp_path):
|
||||||
create_file(tmp_path, filename="not-included-markdown.md")
|
create_file(tmp_path, filename="not-included-markdown.md")
|
||||||
create_file(tmp_path, filename="not-included-text.txt")
|
create_file(tmp_path, filename="not-included-text.txt")
|
||||||
|
|
||||||
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
# Setup input-files, input-filters
|
||||||
input_files = [tmp_path / 'notes.md']
|
input_files = [tmp_path / 'notes.md']
|
||||||
|
|
|
@ -95,7 +95,7 @@ def test_get_org_files(tmp_path):
|
||||||
create_file(tmp_path, filename="orgfile2.org")
|
create_file(tmp_path, filename="orgfile2.org")
|
||||||
create_file(tmp_path, filename="text1.txt")
|
create_file(tmp_path, filename="text1.txt")
|
||||||
|
|
||||||
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
|
expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
|
||||||
|
|
||||||
# Setup input-files, input-filters
|
# Setup input-files, input-filters
|
||||||
input_files = [tmp_path / 'orgfile1.org']
|
input_files = [tmp_path / 'orgfile1.org']
|
||||||
|
|
Loading…
Reference in a new issue