diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py index a3cb75f9..fb0fe4e5 100644 --- a/src/khoj/processor/content/markdown/markdown_to_entries.py +++ b/src/khoj/processor/content/markdown/markdown_to_entries.py @@ -123,8 +123,3 @@ class MarkdownToEntries(TextToEntries): logger.debug(f"Converted {len(parsed_entries)} markdown entries to dictionaries") return entries - - @staticmethod - def convert_markdown_maps_to_jsonl(entries: List[Entry]): - "Convert each Markdown entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py index 989d3501..60226946 100644 --- a/src/khoj/processor/content/org_mode/org_to_entries.py +++ b/src/khoj/processor/content/org_mode/org_to_entries.py @@ -146,8 +146,3 @@ class OrgToEntries(TextToEntries): ) return entries - - @staticmethod - def convert_org_entries_to_jsonl(entries: Iterable[Entry]) -> str: - "Convert each Org-Mode entry to JSON and collate as JSONL" - return "".join([f"{entry_dict.to_json()}\n" for entry_dict in entries]) diff --git a/src/khoj/processor/content/pdf/pdf_to_entries.py b/src/khoj/processor/content/pdf/pdf_to_entries.py index 008b4cce..c59b305c 100644 --- a/src/khoj/processor/content/pdf/pdf_to_entries.py +++ b/src/khoj/processor/content/pdf/pdf_to_entries.py @@ -106,8 +106,3 @@ class PdfToEntries(TextToEntries): logger.debug(f"Converted {len(parsed_entries)} PDF entries to dictionaries") return entries - - @staticmethod - def convert_pdf_maps_to_jsonl(entries: List[Entry]): - "Convert each PDF entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/content/plaintext/plaintext_to_entries.py b/src/khoj/processor/content/plaintext/plaintext_to_entries.py index 67604ad7..4fb0dd2e 100644 --- a/src/khoj/processor/content/plaintext/plaintext_to_entries.py +++ b/src/khoj/processor/content/plaintext/plaintext_to_entries.py @@ -87,8 +87,3 @@ class PlaintextToEntries(TextToEntries): ) ) return entries - - @staticmethod - def convert_entries_to_jsonl(entries: List[Entry]): - "Convert each entry to JSON and collate as JSONL" - return "".join([f"{entry.to_json()}\n" for entry in entries]) diff --git a/src/khoj/processor/content/text_to_entries.py b/src/khoj/processor/content/text_to_entries.py index f8bf30dc..8ebc6604 100644 --- a/src/khoj/processor/content/text_to_entries.py +++ b/src/khoj/processor/content/text_to_entries.py @@ -244,11 +244,6 @@ class TextToEntries(ABC): return entries_with_ids - @staticmethod - def convert_text_maps_to_jsonl(entries: List[Entry]) -> str: - # Convert each entry to JSON and write to JSONL file - return "".join([f"{entry.to_json()}\n" for entry in entries]) - @staticmethod def clean_field(field: str) -> str: return field.replace("\0", "") if not is_none_or_empty(field) else "" diff --git a/tests/test_markdown_to_entries.py b/tests/test_markdown_to_entries.py index 6851a7ed..8a086dbc 100644 --- a/tests/test_markdown_to_entries.py +++ b/tests/test_markdown_to_entries.py @@ -23,18 +23,14 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Extract Entries from specified Markdown files entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 # Ensure raw entry with no headings do not get heading prefix prepended - assert not jsonl_data[0]["raw"].startswith("#") + assert not entries[0].raw.startswith("#") # Ensure compiled entry has filename prepended as top level heading - assert expected_heading in jsonl_data[0]["compiled"] + assert entries[0].compiled.startswith(expected_heading) # Ensure compiled entry also includes the file name - assert str(tmp_path) in jsonl_data[0]["compiled"] + assert str(tmp_path) in entries[0].compiled def test_single_markdown_entry_to_jsonl(tmp_path): @@ -52,12 +48,8 @@ def test_single_markdown_entry_to_jsonl(tmp_path): # Extract Entries from specified Markdown files entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_multiple_markdown_entries_to_jsonl(tmp_path): @@ -79,12 +71,8 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path): # Extract Entries from specified Markdown files entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) - # Process Each Entry from All Notes Files - jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 2 + assert len(entries) == 2 # Ensure entry compiled strings include the markdown files they originate from assert all([tmp_path.stem in entry.compiled for entry in entries]) diff --git a/tests/test_org_to_entries.py b/tests/test_org_to_entries.py index e1051d25..66371e5c 100644 --- a/tests/test_org_to_entries.py +++ b/tests/test_org_to_entries.py @@ -26,18 +26,15 @@ def test_configure_heading_entry_to_jsonl(tmp_path): for index_heading_entries in [True, False]: # Act # Extract entries into jsonl from specified Org files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries) - ) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + entries = OrgToEntries.extract_org_entries(org_files=data, index_heading_entries=index_heading_entries) # Assert if index_heading_entries: # Entry with empty body indexed when index_heading_entries set to True - assert len(jsonl_data) == 1 + assert len(entries) == 1 else: # Entry with empty body ignored when index_heading_entries set to False - assert is_none_or_empty(jsonl_data) + assert is_none_or_empty(entries) def test_entry_split_when_exceeds_max_words(): @@ -58,15 +55,12 @@ def test_entry_split_when_exceeds_max_words(): entries = OrgToEntries.extract_org_entries(org_files=data) # Split each entry from specified Org files by max words - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl( - TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4) - ) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=4) # Assert - assert len(jsonl_data) == 2 + assert len(entries) == 2 # Ensure compiled entries split by max_words start with entry heading (for search context) - assert all([entry["compiled"].startswith(expected_heading) for entry in jsonl_data]) + assert all([entry.compiled.startswith(expected_heading) for entry in entries]) def test_entry_split_drops_large_words(): @@ -103,11 +97,10 @@ def test_entry_with_body_to_jsonl(tmp_path): # Act # Extract Entries from specified Org files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(OrgToEntries.extract_org_entries(org_files=data)) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] + entries = OrgToEntries.extract_org_entries(org_files=data) # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_file_with_entry_after_intro_text_to_jsonl(tmp_path): @@ -127,12 +120,8 @@ Intro text # Extract Entries from specified Org files entries = OrgToEntries.extract_org_entries(org_files=data) - # Process Each Entry from All Notes Files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 2 + assert len(entries) == 2 def test_file_with_no_headings_to_jsonl(tmp_path): @@ -150,12 +139,8 @@ def test_file_with_no_headings_to_jsonl(tmp_path): # Extract Entries from specified Org files entries = OrgToEntries.extract_org_entries(org_files=data) - # Process Each Entry from All Notes Files - jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_get_org_files(tmp_path): diff --git a/tests/test_pdf_to_entries.py b/tests/test_pdf_to_entries.py index 1e3e2783..a8c6aa43 100644 --- a/tests/test_pdf_to_entries.py +++ b/tests/test_pdf_to_entries.py @@ -1,4 +1,3 @@ -import json import os from khoj.processor.content.pdf.pdf_to_entries import PdfToEntries @@ -17,12 +16,8 @@ def test_single_page_pdf_to_jsonl(): data = {"tests/data/pdf/singlepage.pdf": pdf_bytes} entries = PdfToEntries.extract_pdf_entries(pdf_files=data) - # Process Each Entry from All Pdf Files - jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 def test_multi_page_pdf_to_jsonl(): @@ -35,12 +30,8 @@ def test_multi_page_pdf_to_jsonl(): data = {"tests/data/pdf/multipage.pdf": pdf_bytes} entries = PdfToEntries.extract_pdf_entries(pdf_files=data) - # Process Each Entry from All Pdf Files - jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 6 + assert len(entries) == 6 def test_ocr_page_pdf_to_jsonl(): diff --git a/tests/test_plaintext_to_entries.py b/tests/test_plaintext_to_entries.py index 679892dc..41d841fc 100644 --- a/tests/test_plaintext_to_entries.py +++ b/tests/test_plaintext_to_entries.py @@ -1,4 +1,3 @@ -import json import os from pathlib import Path @@ -31,16 +30,12 @@ def test_plaintext_file(tmp_path): for entry in entries: entry.file = str(Path(entry.file).absolute()) - # Process Each Entry from All Notes Files - jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(entries) - jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] - # Assert - assert len(jsonl_data) == 1 + assert len(entries) == 1 # Ensure raw entry with no headings do not get heading prefix prepended - assert not jsonl_data[0]["raw"].startswith("#") + assert not entries[0].raw.startswith("#") # Ensure compiled entry has filename prepended as top level heading - assert jsonl_data[0]["compiled"] == f"{filename}\n{raw_entry}" + assert entries[0].compiled == f"{filename}\n{raw_entry}" def test_get_plaintext_files(tmp_path):