From 0ea901c7c1c812798bbf83f72841e6c3c2ca99a4 Mon Sep 17 00:00:00 2001 From: sabaimran <65192171+sabaimran@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:56:33 +0000 Subject: [PATCH] Allow indexing to continue even if there's an issue parsing a particular org file (#430) * Allow indexing to continue even if there's an issue parsing a particular org file * Use approximation in pytorch comparison in text_search UT, skip additional file parser errors for org files * Change error of expected failure --- src/khoj/processor/org_mode/org_to_jsonl.py | 21 ++++++++++++++------- tests/test_text_search.py | 5 ++--- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index b3bc06fd..d8190a49 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -95,19 +95,26 @@ class OrgToJsonl(TextToJsonl): entries = [] entry_to_file_map = [] for org_file in org_files: - org_file_entries = orgnode.makelist_with_filepath(str(org_file)) - entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) - entries.extend(org_file_entries) + try: + org_file_entries = orgnode.makelist_with_filepath(str(org_file)) + entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) + entries.extend(org_file_entries) + except Exception as e: + logger.error(f"Error processing file: {org_file} with error: {e}", exc_info=True) return entries, dict(entry_to_file_map) @staticmethod def process_single_org_file(org_content: str, org_file: str, entries: List, entry_to_file_map: List): # Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. We'll split the raw conetnt of this file by new line to mimic the same behavior. 
- org_file_entries = orgnode.makelist(org_content.split("\n"), org_file) - entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) - entries.extend(org_file_entries) - return entries, entry_to_file_map + try: + org_file_entries = orgnode.makelist(org_content.split("\n"), org_file) + entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) + entries.extend(org_file_entries) + return entries, entry_to_file_map + except Exception as e: + logger.error(f"Error processing file: {org_file} with error: {e}", exc_info=True) + return entries, entry_to_file_map @staticmethod def convert_org_nodes_to_entries( diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 4d775e92..6b051d59 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -5,7 +5,6 @@ import os # External Packages import pytest -import torch from khoj.utils.config import SearchModels # Internal Packages @@ -28,7 +27,7 @@ def test_text_search_setup_with_missing_file_raises_error( # Act # Generate notes embeddings during asymmetric setup - with pytest.raises(FileNotFoundError): + with pytest.raises(ValueError, match=r"^No valid entries found in specified files:"): text_search.setup(OrgToJsonl, org_config_with_only_new_file, search_config.asymmetric, regenerate=True) @@ -281,7 +280,7 @@ def compare_index(initial_notes_model, final_notes_model): # verify new entry embedding appended to embeddings tensor, without disrupting order or content of existing embeddings for index in range(len(initial_notes_model.corpus_embeddings)): - if not torch.equal(final_notes_model.corpus_embeddings[index], initial_notes_model.corpus_embeddings[index]): + if not initial_notes_model.corpus_embeddings[index].allclose(final_notes_model.corpus_embeddings[index]): mismatched_embeddings.append(index) error_details = ""