Allow indexing to continue even if there's an issue parsing a particular org file (#430)

* Allow indexing to continue even if there's an issue parsing a particular org file
* Use approximation in pytorch comparison in text_search UT, skip additional file parser errors for org files
* Change error of expected failure
This commit is contained in:
sabaimran 2023-08-14 14:56:33 +00:00 committed by GitHub
parent 7b907add77
commit 0ea901c7c1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 10 deletions

View file

@@ -95,19 +95,26 @@ class OrgToJsonl(TextToJsonl):
entries = []
entry_to_file_map = []
for org_file in org_files:
org_file_entries = orgnode.makelist_with_filepath(str(org_file))
entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries))
entries.extend(org_file_entries)
try:
org_file_entries = orgnode.makelist_with_filepath(str(org_file))
entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries))
entries.extend(org_file_entries)
except Exception as e:
logger.error(f"Error processing file: {org_file} with error: {e}", exc_info=True)
return entries, dict(entry_to_file_map)
@staticmethod
def process_single_org_file(org_content: str, org_file: str, entries: List, entry_to_file_map: List):
# Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. We'll split the raw conetnt of this file by new line to mimic the same behavior.
org_file_entries = orgnode.makelist(org_content.split("\n"), org_file)
entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries))
entries.extend(org_file_entries)
return entries, entry_to_file_map
try:
org_file_entries = orgnode.makelist(org_content.split("\n"), org_file)
entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries))
entries.extend(org_file_entries)
return entries, entry_to_file_map
except Exception as e:
logger.error(f"Error processing file: {org_file} with error: {e}", exc_info=True)
return entries, entry_to_file_map
@staticmethod
def convert_org_nodes_to_entries(

View file

@@ -5,7 +5,6 @@ import os
# External Packages
import pytest
import torch
from khoj.utils.config import SearchModels
# Internal Packages
@@ -28,7 +27,7 @@ def test_text_search_setup_with_missing_file_raises_error(
# Act
# Generate notes embeddings during asymmetric setup
with pytest.raises(FileNotFoundError):
with pytest.raises(ValueError, match=r"^No valid entries found in specified files:*"):
text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True)
@@ -281,7 +280,7 @@ def compare_index(initial_notes_model, final_notes_model):
# verify new entry embedding appended to embeddings tensor, without disrupting order or content of existing embeddings
for index in range(len(initial_notes_model.corpus_embeddings)):
if not torch.equal(final_notes_model.corpus_embeddings[index], initial_notes_model.corpus_embeddings[index]):
if not initial_notes_model.corpus_embeddings[index].allclose(final_notes_model.corpus_embeddings[index]):
mismatched_embeddings.append(index)
error_details = ""