mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-27 17:35:07 +01:00
Fix logic to ignore notes with no body. Add tests to prevent regression
- Notes with empty newlines in body were not being ignored - Add regression tests to avoid above regression in org_to_jsonl conversion
This commit is contained in:
parent
5e107eedc0
commit
ea4fdd9134
2 changed files with 65 additions and 2 deletions
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Standard Packages
|
||||
import re
|
||||
import json
|
||||
import argparse
|
||||
import pathlib
|
||||
|
@ -71,14 +72,14 @@ def extract_org_entries(org_files):
|
|||
return entries
|
||||
|
||||
|
||||
def convert_org_entries_to_jsonl(entries, verbose=0):
|
||||
def convert_org_entries_to_jsonl(entries, verbose=0) -> str:
|
||||
"Convert each Org-Mode entries to JSON and collate as JSONL"
|
||||
jsonl = ''
|
||||
for entry in entries:
|
||||
entry_dict = dict()
|
||||
|
||||
# Ignore title notes i.e notes with just headings and empty body
|
||||
if not entry.Body() or entry.Body().strip(empty_escape_sequences) == "":
|
||||
if not entry.Body() or re.sub(r'\n|\t|\r| ', '', entry.Body()) == "":
|
||||
continue
|
||||
|
||||
entry_dict["compiled"] = f'{entry.Heading()}.'
|
||||
|
|
62
tests/test_org_to_jsonl.py
Normal file
62
tests/test_org_to_jsonl.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
# Standard Packages
|
||||
import json
|
||||
from posixpath import split
|
||||
|
||||
# Internal Packages
|
||||
from src.processor.org_mode.org_to_jsonl import convert_org_entries_to_jsonl, extract_org_entries
|
||||
from src.utils.helpers import is_none_or_empty
|
||||
|
||||
|
||||
def test_entry_with_empty_body_line_to_jsonl(tmp_path):
|
||||
'''Ensure entries with empty body are ignored.
|
||||
Property drawers not considered Body. Ignore control characters for evaluating if Body empty.'''
|
||||
# Arrange
|
||||
entry = f'''*** Heading
|
||||
:PROPERTIES:
|
||||
:ID: 42-42-42
|
||||
:END:
|
||||
\t\r\n
|
||||
'''
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries = extract_org_entries(org_files=[orgfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_data = convert_org_entries_to_jsonl(entries)
|
||||
|
||||
# Assert
|
||||
assert is_none_or_empty(jsonl_data)
|
||||
|
||||
|
||||
def test_entry_with_body_to_jsonl(tmp_path):
|
||||
"Ensure entries with valid body text are loaded."
|
||||
# Arrange
|
||||
entry = f'''*** Heading
|
||||
:PROPERTIES:
|
||||
:ID: 42-42-42
|
||||
:END:
|
||||
\t\r\nBody Line 1\n
|
||||
'''
|
||||
orgfile = create_file(tmp_path, entry)
|
||||
|
||||
# Act
|
||||
# Extract Entries from specified Org files
|
||||
entries = extract_org_entries(org_files=[orgfile])
|
||||
|
||||
# Process Each Entry from All Notes Files
|
||||
jsonl_string = convert_org_entries_to_jsonl(entries)
|
||||
jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
|
||||
|
||||
# Assert
|
||||
assert len(jsonl_data) == 1
|
||||
|
||||
|
||||
# Helper Functions
|
||||
def create_file(tmp_path, entry, filename="test.org"):
|
||||
org_file = tmp_path / f"notes/{filename}"
|
||||
org_file.parent.mkdir()
|
||||
org_file.touch()
|
||||
org_file.write_text(entry)
|
||||
return org_file
|
Loading…
Reference in a new issue