khoj/tests/test_plaintext_to_jsonl.py
sabaimran b45e1d8c0d
Fix plaintext HTML parsing and rendering (#464)
* Store conversation command options in an Enum
* Move to slash commands instead of using @ to specify general commands
* Calculate conversation command once & pass it as arg to child funcs
* Add /notes command to respond using only knowledge base as context
This prevents the chat model from trying to respond using only its general world
knowledge, without any references pulled from the indexed
knowledge base
* Test general and notes slash commands in openai chat director tests
---------

Co-authored-by: Debanjum Singh Solanky <debanjum@gmail.com>
2023-08-27 11:24:30 -07:00

101 lines
3.7 KiB
Python

# Standard Packages
import json
from pathlib import Path
# Internal Packages
from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
def test_plaintext_file(tmp_path):
    """Convert a plaintext file with no headings to JSONL.

    Writes a single heading-less entry to a temporary file, runs it through
    the extract -> maps -> JSONL pipeline of ``PlaintextToJsonl`` and checks
    that exactly one entry is produced, that its raw text gains no heading
    prefix, and that its compiled text is the file stem followed by the entry.
    """
    # Arrange
    entry = "\nHi, I am a plaintext file and I have some plaintext words.\n"
    plaintextfile = create_file(tmp_path, entry)
    filename = plaintextfile.stem

    # Act
    # Extract entries from the specified plaintext files
    file_to_entries = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[str(plaintextfile)])
    entry_maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(file_to_entries)

    # Convert each entry's file to an absolute path string to make it JSON serializable.
    # The loop variable is deliberately not called `map`, which would shadow the builtin.
    for entry_map in entry_maps:
        entry_map.file = str(Path(entry_map.file).absolute())

    # Process each entry from all notes files; one JSON document per output line
    jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(entry_maps)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

    # Assert
    assert len(jsonl_data) == 1
    # Ensure a raw entry with no headings does not get a heading prefix prepended
    assert not jsonl_data[0]["raw"].startswith("#")
    # Ensure the compiled entry has the filename prepended as top level heading
    assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}"
def test_get_plaintext_files(tmp_path):
    """Ensure plaintext files specified via input-filter and input-files are extracted.

    Creates files that should be picked up (through glob filters or the
    explicit file list), files that match a glob but are expected to be
    left out, and files no filter references, then checks that
    ``PlaintextToJsonl.get_plaintext_files`` returns exactly the expected set.
    """
    # Arrange
    # Include via input-filter globs
    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
    group2_file3 = create_file(tmp_path, filename="group2-file3.mbox")
    group2_file4 = create_file(tmp_path, filename="group2-file4.html")

    # Include via input-file field
    file1 = create_file(tmp_path, filename="notes.txt")

    # Include unsupported file types. Their names match the "group2*.*" filter
    # below, yet they are absent from the expected result.
    create_file(tmp_path, filename="group2-unincluded.py")
    create_file(tmp_path, filename="group2-unincluded.csv")

    # Not included by any filter
    create_file(tmp_path, filename="not-included-markdown.md")
    create_file(tmp_path, filename="not-included-text.txt")

    expected_files = {
        str(path)
        for path in (group1_file1, group1_file2, group2_file1, group2_file2, file1, group2_file3, group2_file4)
    }

    # Setup input-files, input-filters
    input_files = [tmp_path / "notes.txt"]
    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]

    # Act
    extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)

    # Assert
    # The length check guards against duplicates, which the set comparison alone would hide
    assert len(extracted_plaintext_files) == len(expected_files)
    assert set(extracted_plaintext_files) == expected_files
def test_parse_html_plaintext_file(content_config):
    """Ensure HTML files are parsed correctly.

    Runs the files selected by the plaintext section of the ``content_config``
    fixture through extraction and map conversion, then checks that a single
    entry results and that its raw text contains no ``<div>`` tag.
    """
    # Arrange
    # Input files and filters come from the plaintext section of the fixture
    plaintext_config = content_config.plaintext

    # Act
    source_files = PlaintextToJsonl.get_plaintext_files(
        plaintext_config.input_files,
        plaintext_config.input_filter,
    )
    entries_by_file = PlaintextToJsonl.extract_plaintext_entries(source_files)
    entry_maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entries_by_file)

    # Assert
    assert len(entry_maps) == 1
    assert "<div>" not in entry_maps[0].raw
# Helper Functions
def create_file(tmp_path: Path, entry=None, filename: str = "test.md") -> Path:
    """Create a file under ``tmp_path`` and return its path.

    Args:
        tmp_path: Directory in which the file is created.
        entry: Text to write to the file. When ``None`` (the default) the file
            is only touched, so a new file is empty and an existing file keeps
            its content. Any string, including ``""``, replaces the content.
        filename: Name of the file inside ``tmp_path``.

    Returns:
        The path of the created file.
    """
    file_ = tmp_path / filename
    if entry is None:
        # No content requested: just make sure the file exists
        file_.touch()
    else:
        # Explicit None check so an empty string is honoured as real (empty) content
        file_.write_text(entry)
    return file_