khoj/tests/test_org_to_jsonl.py

# Standard Packages
import json

# Internal Packages
from src.processor.org_mode.org_to_jsonl import OrgToJsonl
from src.processor.text_to_jsonl import TextToJsonl
from src.utils.helpers import is_none_or_empty
from src.utils.rawconfig import Entry

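
# These tests exercise the Org-mode indexing pipeline: extracting entries from Org files,
# converting the parsed nodes to Entry objects and serializing them to JSONL.
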
def test_configure_heading_entry_to_jsonl(tmp_path):
    '''Ensure entries with empty bodies are ignored, unless explicitly configured to index heading entries.
    Property drawers are not considered body text. Control characters are ignored when evaluating whether the body is empty.'''
    # Arrange
    entry = f'''*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t \r
'''
    orgfile = create_file(tmp_path, entry)

    for index_heading_entries in [True, False]:
        # Act
        # Extract entries into jsonl from specified Org files
        jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
            OrgToJsonl.convert_org_nodes_to_entries(
                *OrgToJsonl.extract_org_entries(org_files=[orgfile]),
                index_heading_entries=index_heading_entries))
        jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

        # Assert
        if index_heading_entries:
            # Entry with empty body is indexed when index_heading_entries is set to True
            assert len(jsonl_data) == 1
        else:
            # Entry with empty body is ignored when index_heading_entries is set to False
            assert is_none_or_empty(jsonl_data)


def test_entry_split_when_exceeds_max_words(tmp_path):
    "Ensure entries with compiled words exceeding max_words are split."
    # Arrange
    entry = f'''*** Heading
\t\r
Body Line 1
'''
    orgfile = create_file(tmp_path, entry)

    # Act
    # Extract Entries from specified Org files
    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])

    # Split each entry from specified Org files by max words
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
        TextToJsonl.split_entries_by_max_tokens(
            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map),
            max_tokens=2)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    # The compiled entry exceeds the limit of 2 tokens, so it is split into 2 entries
    assert len(jsonl_data) == 2


def test_entry_split_drops_large_words(tmp_path):
    "Ensure words longer than the specified max word length are dropped from an entry's compiled version."
    # Arrange
    entry_text = f'''*** Heading
\t\r
Body Line 1
'''
    entry = Entry(raw=entry_text, compiled=entry_text)

    # Act
    # Split entry by max words and drop words larger than max word length
    processed_entry = TextToJsonl.split_entries_by_max_tokens([entry], max_word_length=5)[0]

    # Assert
    # "Heading" is dropped from the compiled version because it exceeds the max word length
    assert len(processed_entry.compiled.split()) == len(entry_text.split()) - 1


def test_entry_with_body_to_jsonl(tmp_path):
    "Ensure entries with valid body text are loaded."
    # Arrange
    entry = f'''*** Heading
:PROPERTIES:
:ID: 42-42-42
:END:
\t\r
Body Line 1
'''
    orgfile = create_file(tmp_path, entry)

    # Act
    # Extract Entries from specified Org files
    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])

    # Process Each Entry from All Notes Files
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
        OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map))
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1


def test_file_with_no_headings_to_jsonl(tmp_path):
    "Ensure files with no headings and only body text are loaded."
    # Arrange
    entry = f'''
- Bullet point 1
- Bullet point 2
'''
    orgfile = create_file(tmp_path, entry)

    # Act
    # Extract Entries from specified Org files
    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=[orgfile])

    # Process Each Entry from All Notes Files
    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1


def test_get_org_files(tmp_path):
    "Ensure Org files specified via input-filter globs and input-files are extracted."
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.org")
    group1_file2 = create_file(tmp_path, filename="group1-file2.org")
    group2_file1 = create_file(tmp_path, filename="group2-file1.org")
    group2_file2 = create_file(tmp_path, filename="group2-file2.org")

    # Include via input-file field
    orgfile1 = create_file(tmp_path, filename="orgfile1.org")

    # Not included by any filter
    create_file(tmp_path, filename="orgfile2.org")
    create_file(tmp_path, filename="text1.txt")

    expected_files = sorted(map(str, [group1_file1, group1_file2, group2_file1, group2_file2, orgfile1]))

    # Setup input-files, input-filters
    input_files = [tmp_path / 'orgfile1.org']
    input_filter = [tmp_path / 'group1*.org', tmp_path / 'group2*.org']

    # Act
    extracted_org_files = OrgToJsonl.get_org_files(input_files, input_filter)

    # Assert
    assert len(extracted_org_files) == 5
    assert extracted_org_files == expected_files


# Helper Functions
def create_file(tmp_path, entry=None, filename="test.org"):
    "Create an Org file with the given content under the temporary test directory."
    org_file = tmp_path / filename
    org_file.touch()
    if entry:
        org_file.write_text(entry)
    return org_file
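

# Note: a minimal sketch (not part of the test suite) of the extract -> convert -> serialize
# pipeline the tests above exercise, using only the OrgToJsonl calls imported at the top:
#
#   entry_nodes, file_map = OrgToJsonl.extract_org_entries(org_files=[orgfile])
#   entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_map)
#   jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
#
# Run this module with pytest from the repository root, e.g.:
#   pytest khoj/tests/test_org_to_jsonl.py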