import json
import os
from pathlib import Path

from khoj.database.models import KhojUser, LocalPlaintextConfig
from khoj.processor.content.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.utils.fs_syncer import get_plaintext_files
from khoj.utils.rawconfig import TextContentConfig

def test_plaintext_file(tmp_path):
    "Convert files with no heading to jsonl."
    # Arrange
    entry = f"""
    Hi, I am a plaintext file and I have some plaintext words.
    """
    plaintextfile = create_file(tmp_path, entry)

    filename = plaintextfile.stem

    # Act
    # Extract Entries from specified plaintext files
    data = {
        f"{plaintextfile}": entry,
    }
    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)

    # Convert each entry.file to absolute path to make them JSON serializable
    for map in maps:
        map.file = str(Path(map.file).absolute())

    # Process Each Entry from All Notes Files
    jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1
    # Ensure raw entry with no headings do not get heading prefix prepended
    assert not jsonl_data[0]["raw"].startswith("#")
    # Ensure compiled entry has filename prepended as top level heading
    assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}"


def test_get_plaintext_files(tmp_path):
    "Ensure Plaintext files specified via input-filter, input-files extracted"
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
    group1_file2 = create_file(tmp_path, filename="group1-file2.md")

    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
    group2_file3 = create_file(tmp_path, filename="group2-file3.mbox")
    group2_file4 = create_file(tmp_path, filename="group2-file4.html")

    # Include via input-file field
    file1 = create_file(tmp_path, filename="notes.txt")

    # Unsupported file types: matched by the group2 glob but should be excluded
    create_file(tmp_path, filename="group2-unincluded.py")
    create_file(tmp_path, filename="group2-unincluded.csv")

    # Not included by any filter
    create_file(tmp_path, filename="not-included-markdown.md")
    create_file(tmp_path, filename="not-included-text.txt")

    # Absolute paths of the 7 files expected to be extracted
    expected_files = set(
        [
            os.path.join(tmp_path, file.name)
            for file in [group1_file1, group1_file2, group2_file1, group2_file2, group2_file3, group2_file4, file1]
        ]
    )

    # Setup input-files, input-filters
    input_files = [tmp_path / "notes.txt"]
    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]

    plaintext_config = TextContentConfig(
        input_files=input_files,
        input_filter=[str(filter) for filter in input_filter],
        compressed_jsonl=tmp_path / "test.jsonl",
        embeddings_file=tmp_path / "test_embeddings.jsonl",
    )

    # Act
    extracted_plaintext_files = get_plaintext_files(plaintext_config)

    # Assert
    assert len(extracted_plaintext_files) == 7
    assert set(extracted_plaintext_files.keys()) == set(expected_files)


def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
    "Ensure HTML files are parsed correctly"
    # Arrange
    # Setup input-files, input-filters: read this user's plaintext config from the DB
    plaintext_config = LocalPlaintextConfig.objects.filter(user=default_user).first()
    file_to_text_map = get_plaintext_files(config=plaintext_config)

    # Act
    entries = PlaintextToEntries.convert_plaintext_entries_to_maps(file_to_text_map)

    # Assert
    # Exactly one entry is extracted and its HTML markup has been stripped
    assert len(entries) == 1
    assert "<div>" not in entries[0].raw


# Helper Functions
def create_file(tmp_path: Path, entry=None, filename="test.md"):
    "Create filename under tmp_path, optionally filled with entry; return its Path."
    new_file = tmp_path / filename
    new_file.touch()  # ensure the file exists even when no content is supplied
    if entry:
        new_file.write_text(entry)
    return new_file