Skip plaintext file indexing if there's a parsing issue and log the file

This commit is contained in:
sabaimran 2023-08-29 14:34:08 -07:00
parent 74409c2c64
commit 92cbfef7ab

View file

@@ -91,10 +91,13 @@ class PlaintextToJsonl(TextToJsonl):
for plaintext_file in plaintext_files: for plaintext_file in plaintext_files:
with open(plaintext_file, "r") as f: with open(plaintext_file, "r") as f:
plaintext_content = f.read() try:
if plaintext_file.endswith(("html", "htm", "xml")): plaintext_content = f.read()
plaintext_content = PlaintextToJsonl.extract_html_content(plaintext_content) if plaintext_file.endswith(("html", "htm", "xml")):
entry_to_file_map.append((plaintext_content, plaintext_file)) plaintext_content = PlaintextToJsonl.extract_html_content(plaintext_content)
entry_to_file_map.append((plaintext_content, plaintext_file))
except Exception as e:
logger.error(f"Error processing file: {plaintext_file} - {e}", exc_info=True)
return dict(entry_to_file_map) return dict(entry_to_file_map)