Process text content files in sorted order for stable indexing

- Image search already processes its images in sorted order
- Prevents the index of entries from desyncing when the entries and embeddings are generated by a separate server/app instance
2024-11-27 17:35:07 +01:00 · 2022-09-12 11:02:05 +03:00 · 2022-09-12 11:02:05 +03:00 · 536f03af8f
commit 536f03af8f
parent a701ad08b9
6 changed files with 12 additions and 10 deletions
--- a/src/processor/ledger/beancount_to_jsonl.py
+++ b/src/processor/ledger/beancount_to_jsonl.py
@ -77,12 +77,14 @@ def get_beancount_files(beancount_files=None, beancount_file_filters=None):
            for filtered_file in glob.glob(get_absolute_path(beancount_file_filter))
        }

-    all_beancount_files = absolute_beancount_files | filtered_beancount_files
+    all_beancount_files = sorted(absolute_beancount_files | filtered_beancount_files)

-    files_with_non_beancount_extensions = {beancount_file
-                                    for beancount_file
-                                    in all_beancount_files
-                                    if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")}
+    files_with_non_beancount_extensions = {
+        beancount_file
+        for beancount_file
+        in all_beancount_files
+        if not beancount_file.endswith(".bean") and not beancount_file.endswith(".beancount")
+    }
    if any(files_with_non_beancount_extensions):
        print(f"[Warning] There maybe non beancount files in the input set: {files_with_non_beancount_extensions}")

--- a/src/processor/markdown/markdown_to_jsonl.py
+++ b/src/processor/markdown/markdown_to_jsonl.py
@ -75,7 +75,7 @@ def get_markdown_files(markdown_files=None, markdown_file_filters=None):
            for filtered_file in glob.glob(get_absolute_path(markdown_file_filter))
        }

-    all_markdown_files = absolute_markdown_files | filtered_markdown_files
+    all_markdown_files = sorted(absolute_markdown_files | filtered_markdown_files)

    files_with_non_markdown_extensions = {
        md_file
--- a/src/processor/org_mode/org_to_jsonl.py
+++ b/src/processor/org_mode/org_to_jsonl.py
@ -82,7 +82,7 @@ def get_org_files(org_files=None, org_file_filters=None):
            for filtered_file in glob.glob(get_absolute_path(org_file_filter))
        }

-    all_org_files = absolute_org_files | filtered_org_files
+    all_org_files = sorted(absolute_org_files | filtered_org_files)

    files_with_non_org_extensions = {org_file for org_file in all_org_files if not org_file.endswith(".org")}
    if any(files_with_non_org_extensions):
--- a/tests/test_beancount_to_jsonl.py
+++ b/tests/test_beancount_to_jsonl.py
@ -89,7 +89,7 @@ def test_get_beancount_files(tmp_path):
    create_file(tmp_path, filename="not-included-ledger.bean")
    create_file(tmp_path, filename="not-included-text.txt")

-    expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
+    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / 'ledger.bean']
--- a/tests/test_markdown_to_jsonl.py
+++ b/tests/test_markdown_to_jsonl.py
@ -86,7 +86,7 @@ def test_get_markdown_files(tmp_path):
    create_file(tmp_path, filename="not-included-markdown.md")
    create_file(tmp_path, filename="not-included-text.txt")

-    expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))
+    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / 'notes.md']
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@ -95,7 +95,7 @@ def test_get_org_files(tmp_path):
    create_file(tmp_path, filename="orgfile2.org")
    create_file(tmp_path, filename="text1.txt")

-    expected_files = set(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))
+    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / 'orgfile1.org']