Process text content files in sorted order for stable indexing

- Image search already uses a sorted list of images to process
- Prevents the index of entries from desyncing when the entries and
  embeddings are generated by a separate server/app instance
This commit is contained in:
Debanjum Singh Solanky 2022-09-12 11:02:05 +03:00
parent a701ad08b9
commit 536f03af8f
6 changed files with 12 additions and 10 deletions

View file

@ -77,12 +77,14 @@ def get_beancount_files(beancount_files=None, beancount_file_filters=None):
for filtered_file in glob.glob(get_absolute_path(beancount_file_filter)) for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
} }
all_beancount_files = absolute_beancount_files | filtered_beancount_files all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files)
files_with_non_beancount_extensions = {beancount_file files_with_non_beancount_extensions = {
beancount_file
for beancount_file for beancount_file
in all_beancount_files in all_beancount_files
if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")} if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")
}
if any(files_with_non_beancount_extensions): if any(files_with_non_beancount_extensions):
print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}") print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")

View file

@ -75,7 +75,7 @@ def get_markdown_files(markdown_files=None, markdown_file_filters=None):
for filtered_file in glob.glob(get_absolute_path(markdown_file_filter)) for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
} }
all_markdown_files = absolute_markdown_files | filtered_markdown_files all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)
files_with_non_markdown_extensions = { files_with_non_markdown_extensions = {
md_file md_file

View file

@ -82,7 +82,7 @@ def get_org_files(org_files=None, org_file_filters=None):
for filtered_file in glob.glob(get_absolute_path(org_file_filter)) for filtered_file in glob.glob(get_absolute_path(org_file_filter))
} }
all_org_files = absolute_org_files | filtered_org_files all_org_files = sorted(absolute_org_files | filtered_org_files)
files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")} files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
if any(files_with_non_org_extensions): if any(files_with_non_org_extensions):

View file

@ -89,7 +89,7 @@ def test_get_beancount_files(tmp_path):
create_file(tmp_path, filename="not-included-ledger.bean") create_file(tmp_path, filename="not-included-ledger.bean")
create_file(tmp_path, filename="not-included-text.txt") create_file(tmp_path, filename="not-included-text.txt")
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1])) expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
# Setup input-files, input-filters # Setup input-files, input-filters
input_files = [tmp_path / 'ledger.bean'] input_files = [tmp_path / 'ledger.bean']

View file

@ -86,7 +86,7 @@ def test_get_markdown_files(tmp_path):
create_file(tmp_path, filename="not-included-markdown.md") create_file(tmp_path, filename="not-included-markdown.md")
create_file(tmp_path, filename="not-included-text.txt") create_file(tmp_path, filename="not-included-text.txt")
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1])) expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
# Setup input-files, input-filters # Setup input-files, input-filters
input_files = [tmp_path / 'notes.md'] input_files = [tmp_path / 'notes.md']

View file

@ -95,7 +95,7 @@ def test_get_org_files(tmp_path):
create_file(tmp_path, filename="orgfile2.org") create_file(tmp_path, filename="orgfile2.org")
create_file(tmp_path, filename="text1.txt") create_file(tmp_path, filename="text1.txt")
expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1])) expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
# Setup input-files, input-filters # Setup input-files, input-filters
input_files = [tmp_path / 'orgfile1.org'] input_files = [tmp_path / 'orgfile1.org']