From e64357698d2be29f95d5290ab8038e6fcc7c1e4d Mon Sep 17 00:00:00 2001 From: sabaimran <65192171+sabaimran@users.noreply.github.com> Date: Wed, 23 Aug 2023 15:34:56 -0700 Subject: [PATCH] Skip indexing single bad markdown, plaintext file (#460) --- src/khoj/processor/markdown/markdown_to_jsonl.py | 12 ++++++++---- src/khoj/processor/org_mode/org_to_jsonl.py | 3 ++- src/khoj/processor/plaintext/plaintext_to_jsonl.py | 8 ++++++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index b6acbfbb..0e029d12 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -105,10 +105,14 @@ class MarkdownToJsonl(TextToJsonl): entry_to_file_map = [] for markdown_file in markdown_files: with open(markdown_file, "r", encoding="utf8") as f: - markdown_content = f.read() - entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( - markdown_content, markdown_file, entries, entry_to_file_map - ) + try: + markdown_content = f.read() + entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file( + markdown_content, markdown_file, entries, entry_to_file_map + ) + except Exception as e: + logger.warning(f"Unable to process file: {markdown_file}. This file will not be indexed.") + logger.warning(e, exc_info=True) return entries, dict(entry_to_file_map) diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index d8190a49..b857cd05 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -100,7 +100,8 @@ class OrgToJsonl(TextToJsonl): entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) entries.extend(org_file_entries) except Exception as e: - logger.error(f"Error processing file: {org_file} with error: {e}", exc_info=True) + logger.warning(f"Unable to process file: {org_file}. This file will not be indexed.") + logger.warning(e, exc_info=True) return entries, dict(entry_to_file_map) diff --git a/src/khoj/processor/plaintext/plaintext_to_jsonl.py b/src/khoj/processor/plaintext/plaintext_to_jsonl.py index 8a740f6d..643d3cea 100644 --- a/src/khoj/processor/plaintext/plaintext_to_jsonl.py +++ b/src/khoj/processor/plaintext/plaintext_to_jsonl.py @@ -91,8 +91,12 @@ class PlaintextToJsonl(TextToJsonl): for plaintext_file in plaintext_files: with open(plaintext_file, "r") as f: - plaintext_content = f.read() - entry_to_file_map.append((plaintext_content, plaintext_file)) + try: + plaintext_content = f.read() + entry_to_file_map.append((plaintext_content, plaintext_file)) + except Exception as e: + logger.warning(f"Unable to process file: {plaintext_file}. This file will not be indexed.") + logger.warning(e, exc_info=True) return dict(entry_to_file_map)