diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_entries.py
similarity index 92%
rename from src/khoj/processor/github/github_to_jsonl.py
rename to src/khoj/processor/github/github_to_entries.py
index 98e771dc..14e9b696 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_entries.py
@@ -10,16 +10,16 @@ import requests

 # Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.text_to_entries import TextToEntries
 from database.models import Entry as DbEntry, GithubConfig, KhojUser

 logger = logging.getLogger(__name__)


-class GithubToJsonl(TextEntries):
+class GithubToEntries(TextToEntries):
     def __init__(self, config: GithubConfig):
         super().__init__(config)
         raw_repos = config.githubrepoconfig.all()
@@ -77,24 +77,26 @@ class GithubToJsonl(TextEntries):
         current_entries = []

         with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
-            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *GithubToJsonl.extract_markdown_entries(markdown_files)
-            )
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *GithubToEntries.extract_markdown_entries(markdown_files)
+            )

         with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
-            current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
+            current_entries += OrgToEntries.convert_org_nodes_to_entries(
+                *GithubToEntries.extract_org_entries(org_files)
+            )

         with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
             current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)

         with timer(f"Extract issues from github repo {repo_shorthand}", logger):
-            issue_entries = GithubToJsonl.convert_issues_to_entries(
-                *GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
-            )
+            issue_entries = GithubToEntries.convert_issues_to_entries(
+                *GithubToEntries.extract_github_issues(self.get_issues(repo_url))
+            )
             current_entries += issue_entries

         with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
-            current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)

         return current_entries
@@ -280,7 +282,7 @@ class GithubToJsonl(TextEntries):
         entries = []
         entry_to_file_map = []
         for doc in markdown_files:
-            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+            entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                 doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)
@@ -291,7 +293,7 @@ class GithubToJsonl(TextEntries):
         entries = []
         entry_to_file_map = []

         for doc in org_files:
-            entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
+            entries, entry_to_file_map = OrgToEntries.process_single_org_file(
                 doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)
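Note: the renamed `GithubToEntries` keeps the old `GithubToJsonl` pipeline intact, only the module and class names change. A minimal sketch of that per-repo flow, assuming only the signatures visible in this diff; `processor`, `markdown_files`, `org_files`, `repo_url` and `repo` are hypothetical stand-ins for values the class derives from its `GithubConfig`:

```python
# Minimal sketch of the per-repo flow shown in the hunks above; not the class itself.
from khoj.processor.github.github_to_entries import GithubToEntries
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
from khoj.processor.org_mode.org_to_entries import OrgToEntries
from khoj.processor.text_to_entries import TextToEntries


def extract_repo_entries(processor: GithubToEntries, markdown_files, org_files, repo_url, repo):
    # Markdown and org files fetched from the repo reuse the per-format extractors
    entries = MarkdownToEntries.convert_markdown_entries_to_maps(
        *GithubToEntries.extract_markdown_entries(markdown_files)
    )
    entries += OrgToEntries.convert_org_nodes_to_entries(*GithubToEntries.extract_org_entries(org_files))
    # Commit messages and issues come from the GitHub API helpers on the instance
    entries += processor.convert_commits_to_entries(processor.get_commits(repo_url), repo)
    entries += GithubToEntries.convert_issues_to_entries(
        *GithubToEntries.extract_github_issues(processor.get_issues(repo_url))
    )
    # Entries are finally chunked to fit the embedding model's context window
    return TextToEntries.split_entries_by_max_tokens(entries, max_tokens=256)
```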
diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_entries.py
similarity index 93%
rename from src/khoj/processor/markdown/markdown_to_jsonl.py
rename to src/khoj/processor/markdown/markdown_to_entries.py
index 86acc4b3..e0b76368 100644
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_entries.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import Tuple, List

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.constants import empty_escape_sequences
 from khoj.utils.rawconfig import Entry
@@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser

 logger = logging.getLogger(__name__)


-class MarkdownToJsonl(TextEntries):
+class MarkdownToEntries(TextToEntries):
     def __init__(self):
         super().__init__()
@@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEntries):

         # Extract Entries from specified Markdown files
         with timer("Parse entries from Markdown files into dictionaries", logger):
-            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *MarkdownToJsonl.extract_markdown_entries(files)
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *MarkdownToEntries.extract_markdown_entries(files)
             )

         # Split entries by max tokens supported by model
@@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEntries):
         for markdown_file in markdown_files:
             try:
                 markdown_content = markdown_files[markdown_file]
-                entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                     markdown_content, markdown_file, entries, entry_to_file_map
                 )
             except Exception as e:
diff --git a/src/khoj/processor/notion/notion_to_jsonl.py b/src/khoj/processor/notion/notion_to_entries.py
similarity index 99%
rename from src/khoj/processor/notion/notion_to_jsonl.py
rename to src/khoj/processor/notion/notion_to_entries.py
index 048642ef..a4b15d4e 100644
--- a/src/khoj/processor/notion/notion_to_jsonl.py
+++ b/src/khoj/processor/notion/notion_to_entries.py
@@ -8,7 +8,7 @@ import requests

 # Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, NotionContentConfig
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser, NotionConfig
@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
     CALLOUT = "callout"


-class NotionToJsonl(TextEntries):
+class NotionToEntries(TextToEntries):
     def __init__(self, config: NotionConfig):
         super().__init__(config)
         self.config = NotionContentConfig(
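A sketch of the renamed `MarkdownToEntries` pipeline, mirroring the calls exercised by `tests/test_markdown_to_jsonl.py` later in this diff; `data` is a hypothetical file path to markdown content dict:

```python
import json

from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries

data = {"notes/todo.md": "# Tasks\n- [ ] Rename the jsonl processors"}

# Parse markdown into per-heading entries plus a map from entries to source files
entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

# Convert parsed entries into Entry maps, then serialize them to JSONL
jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
    MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
)
entries = [json.loads(line) for line in jsonl_string.splitlines()]
```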
diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_entries.py
similarity index 98%
rename from src/khoj/processor/org_mode/org_to_jsonl.py
rename to src/khoj/processor/org_mode/org_to_entries.py
index fbb43f55..387f8572 100644
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_entries.py
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple

 # Internal Packages
 from khoj.processor.org_mode import orgnode
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from khoj.utils import state
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser

 logger = logging.getLogger(__name__)


-class OrgToJsonl(TextEntries):
+class OrgToEntries(TextToEntries):
     def __init__(self):
         super().__init__()
diff --git a/src/khoj/processor/pdf/pdf_to_jsonl.py b/src/khoj/processor/pdf/pdf_to_entries.py
similarity index 95%
rename from src/khoj/processor/pdf/pdf_to_jsonl.py
rename to src/khoj/processor/pdf/pdf_to_entries.py
index 034e51f4..24dcdc5a 100644
--- a/src/khoj/processor/pdf/pdf_to_jsonl.py
+++ b/src/khoj/processor/pdf/pdf_to_entries.py
@@ -8,7 +8,7 @@ import base64
 from langchain.document_loaders import PyMuPDFLoader

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser
@@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser

 logger = logging.getLogger(__name__)


-class PdfToJsonl(TextEntries):
+class PdfToEntries(TextToEntries):
     def __init__(self):
         super().__init__()
@@ -35,7 +35,7 @@ class PdfToJsonl(TextEntries):

         # Extract Entries from specified Pdf files
         with timer("Parse entries from PDF files into dictionaries", logger):
-            current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
+            current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))

         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
diff --git a/src/khoj/processor/plaintext/plaintext_to_jsonl.py b/src/khoj/processor/plaintext/plaintext_to_entries.py
similarity index 92%
rename from src/khoj/processor/plaintext/plaintext_to_jsonl.py
rename to src/khoj/processor/plaintext/plaintext_to_entries.py
index 1094baa2..fd5e1de2 100644
--- a/src/khoj/processor/plaintext/plaintext_to_jsonl.py
+++ b/src/khoj/processor/plaintext/plaintext_to_entries.py
@@ -6,7 +6,7 @@
 from bs4 import BeautifulSoup

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser

 logger = logging.getLogger(__name__)


-class PlaintextToJsonl(TextEntries):
+class PlaintextToEntries(TextToEntries):
     def __init__(self):
         super().__init__()
@@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEntries):
             try:
                 plaintext_content = files[file]
                 if file.endswith(("html", "htm", "xml")):
-                    plaintext_content = PlaintextToJsonl.extract_html_content(
+                    plaintext_content = PlaintextToEntries.extract_html_content(
                         plaintext_content, file.split(".")[-1]
                     )
                 files[file] = plaintext_content
@@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEntries):

         # Extract Entries from specified plaintext files
         with timer("Parse entries from plaintext files", logger):
-            current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
+            current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)

         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
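A sketch of the HTML handling in the renamed `PlaintextToEntries`, based on the hunks above; `files` is a hypothetical file path to raw content dict:

```python
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries

files = {"journal/today.html": "<html><body><p>A note wrapped in HTML</p></body></html>"}

for file in files:
    # HTML, HTM and XML files are reduced to their text content before indexing
    if file.endswith(("html", "htm", "xml")):
        files[file] = PlaintextToEntries.extract_html_content(files[file], file.split(".")[-1])

# Remaining plain text is converted into Entry maps keyed by source file
entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)
```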
diff --git a/src/khoj/processor/text_to_jsonl.py b/src/khoj/processor/text_to_entries.py
similarity index 96%
rename from src/khoj/processor/text_to_jsonl.py
rename to src/khoj/processor/text_to_entries.py
index 763db9df..0477caa2 100644
--- a/src/khoj/processor/text_to_jsonl.py
+++ b/src/khoj/processor/text_to_entries.py
@@ -19,7 +19,7 @@ from database.adapters import EntryAdapters

 logger = logging.getLogger(__name__)


-class TextEntries(ABC):
+class TextToEntries(ABC):
     def __init__(self, config: Any = None):
         self.embeddings_model = EmbeddingsModel()
         self.config = config
@@ -85,10 +85,10 @@ class TextEntries(ABC):
     ):
         with timer("Construct current entry hashes", logger):
             hashes_by_file = dict[str, set[str]]()
-            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
             hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
             for entry in tqdm(current_entries, desc="Hashing Entries"):
-                hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry))
+                hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))

         num_deleted_embeddings = 0
         with timer("Preparing dataset for regeneration", logger):
@@ -180,11 +180,11 @@ class TextEntries(ABC):
     ):
         # Hash all current and previous entries to identify new entries
         with timer("Hash previous, current entries", logger):
-            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
-            previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries))
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
             if deletion_filenames is not None:
                 deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
-                deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries))
+                deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
             else:
                 deletion_entry_hashes = []
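For context, the renamed base class drives incremental indexing by hashing entries. A sketch of that change detection, assuming only the calls in the hunks above; `current_entries`/`previous_entries` are hypothetical `Entry` lists and `key` names the `Entry` field to hash (e.g. `"compiled"`):

```python
from khoj.processor.text_to_entries import TextToEntries


def find_new_entries(current_entries, previous_entries, key="compiled"):
    # hash_func(key) returns a callable that hashes the chosen field of an Entry
    current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
    previous_entry_hashes = set(map(TextToEntries.hash_func(key), previous_entries))
    hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
    # Entries whose hash was not seen before are the ones that need fresh embeddings
    return [hash_to_current_entries[h] for h in current_entry_hashes if h not in previous_entry_hashes]
```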
files.get("markdown"), regenerate=regenerate, full_corpus=full_corpus, @@ -232,7 +232,7 @@ def configure_content( logger.info("🖨️ Setting up search for pdf") # Extract Entries, Generate PDF Embeddings text_search.setup( - PdfToJsonl, + PdfToEntries, files.get("pdf"), regenerate=regenerate, full_corpus=full_corpus, @@ -248,7 +248,7 @@ def configure_content( logger.info("📄 Setting up search for plaintext") # Extract Entries, Generate Plaintext Embeddings text_search.setup( - PlaintextToJsonl, + PlaintextToEntries, files.get("plaintext"), regenerate=regenerate, full_corpus=full_corpus, @@ -281,7 +281,7 @@ def configure_content( logger.info("🐙 Setting up search for github") # Extract Entries, Generate Github Embeddings text_search.setup( - GithubToJsonl, + GithubToEntries, None, regenerate=regenerate, full_corpus=full_corpus, @@ -298,7 +298,7 @@ def configure_content( if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config: logger.info("🔌 Setting up search for notion") text_search.setup( - NotionToJsonl, + NotionToEntries, None, regenerate=regenerate, full_corpus=full_corpus, diff --git a/src/khoj/search_type/text_search.py b/src/khoj/search_type/text_search.py index e1da9043..cacf5c77 100644 --- a/src/khoj/search_type/text_search.py +++ b/src/khoj/search_type/text_search.py @@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder from khoj.utils.state import SearchType from khoj.utils.rawconfig import SearchResponse, Entry from khoj.utils.jsonl import load_jsonl -from khoj.processor.text_to_jsonl import TextEntries +from khoj.processor.text_to_entries import TextToEntries from database.adapters import EntryAdapters from database.models import KhojUser, Entry as DbEntry @@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query): def setup( - text_to_jsonl: Type[TextEntries], + text_to_entries: Type[TextToEntries], files: dict[str, str], regenerate: bool, full_corpus: bool = True, @@ -196,11 +196,11 @@ def setup( config=None, ) -> None: if config: - num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process( + num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process( files=files, full_corpus=full_corpus, user=user, regenerate=regenerate ) else: - num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process( + num_new_embeddings, num_deleted_embeddings = text_to_entries().process( files=files, full_corpus=full_corpus, user=user, regenerate=regenerate ) diff --git a/tests/conftest.py b/tests/conftest.py index aad20274..46838594 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,7 @@ app = FastAPI() # Internal Packages from khoj.configure import configure_routes, configure_search_types, configure_middleware -from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl +from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries from khoj.search_type import image_search, text_search from khoj.utils.config import SearchModels from khoj.utils.constants import web_directory @@ -26,7 +26,7 @@ from khoj.utils.rawconfig import ( ) from khoj.utils import state, fs_syncer from khoj.routers.indexer import configure_content -from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl +from khoj.processor.org_mode.org_to_entries import OrgToEntries from database.models import ( KhojApiUser, LocalOrgConfig, @@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user: user=default_user, ) - text_search.setup(OrgToJsonl, get_sample_data("org"), 
diff --git a/tests/conftest.py b/tests/conftest.py
index aad20274..46838594 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,7 +13,7 @@ app = FastAPI()

 # Internal Packages
 from khoj.configure import configure_routes, configure_search_types, configure_middleware
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.search_type import image_search, text_search
 from khoj.utils.config import SearchModels
 from khoj.utils.constants import web_directory
@@ -26,7 +26,7 @@ from khoj.utils.rawconfig import (
 )
 from khoj.utils import state, fs_syncer
 from khoj.routers.indexer import configure_content
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
 from database.models import (
     KhojApiUser,
     LocalOrgConfig,
@@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
         user=default_user,
     )

-    text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)

     if os.getenv("GITHUB_PAT_TOKEN"):
         GithubConfig.objects.create(
@@ -242,7 +242,7 @@ def client(
     # These lines help us Mock the Search models for these search types
     state.search_models.image_search = image_search.initialize_model(search_config.image)
     text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
         get_sample_data("org"),
         regenerate=False,
         user=api_user.user,
@@ -251,7 +251,7 @@ def client(
         content_config.image, state.search_models.image_search, regenerate=False
     )
     text_search.setup(
-        PlaintextToJsonl,
+        PlaintextToEntries,
         get_sample_data("plaintext"),
         regenerate=False,
         user=api_user.user,
+"Emacs"') # Act @@ -317,7 +317,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user: # Arrange headers = {"Authorization": "Bearer kk-secret"} text_search.setup( - OrgToJsonl, + OrgToEntries, sample_org_data, regenerate=False, user=default_user, @@ -339,7 +339,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user: def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser): # Arrange headers = {"Authorization": "Bearer kk-token"} # Token for default_user2 - text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user) + text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user) user_query = quote("How to git install application?") # Act diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py index a1a458ef..4593b23a 100644 --- a/tests/test_markdown_to_jsonl.py +++ b/tests/test_markdown_to_jsonl.py @@ -4,7 +4,7 @@ from pathlib import Path import os # Internal Packages -from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl +from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries from khoj.utils.fs_syncer import get_markdown_files from khoj.utils.rawconfig import TextContentConfig @@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) + entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( - MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl( + MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) + entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl( - MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl( + MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map) ) jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()] @@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path): # Act # Extract Entries from specified Markdown files - entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data) - entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map) + entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data) + entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map) # Process Each Entry from All Notes Files - jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries) + jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries) jsonl_data = [json.loads(json_string) for json_string in 
diff --git a/tests/test_markdown_to_jsonl.py b/tests/test_markdown_to_jsonl.py
index a1a458ef..4593b23a 100644
--- a/tests/test_markdown_to_jsonl.py
+++ b/tests/test_markdown_to_jsonl.py
@@ -4,7 +4,7 @@ from pathlib import Path
 import os

 # Internal Packages
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
 from khoj.utils.fs_syncer import get_markdown_files
 from khoj.utils.rawconfig import TextContentConfig
@@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

     # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
-        MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
+        MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

     # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
-        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
+        MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
-    entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
+    entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)

     # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
     # Act
     # Extract Entries from specified Markdown files
-    entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

     # Assert
     assert len(entries) == 2
diff --git a/tests/test_org_to_jsonl.py b/tests/test_org_to_jsonl.py
index c9ccf0d6..1eddcf95 100644
--- a/tests/test_org_to_jsonl.py
+++ b/tests/test_org_to_jsonl.py
@@ -3,8 +3,8 @@ import json
 import os

 # Internal Packages
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import is_none_or_empty
 from khoj.utils.rawconfig import Entry
 from khoj.utils.fs_syncer import get_org_files
@@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
     for index_heading_entries in [True, False]:
         # Act
         # Extract entries into jsonl from specified Org files
-        jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-            OrgToJsonl.convert_org_nodes_to_entries(
-                *OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
+        jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+            OrgToEntries.convert_org_nodes_to_entries(
+                *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
             )
         )
         jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)

     # Split each entry from specified Org files by max words
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        TextEntries.split_entries_by_max_tokens(
-            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+        TextToEntries.split_entries_by_max_tokens(
+            OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
         )
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():
     # Act
     # Split entry by max words and drop words larger than max word length
-    processed_entry = TextEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
+    processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]

     # Assert
     # "Heading" dropped from compiled version because its over the set max word limit
@@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+        OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -136,11 +136,11 @@ Intro text
     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
+    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
-    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
+    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
-    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
     # Act
     # Extract Entries from specified Org files
-    entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, _ = OrgToEntries.extract_org_entries(org_files=data)

     # Assert
     assert len(entries) == 2
diff --git a/tests/test_pdf_to_jsonl.py b/tests/test_pdf_to_jsonl.py
index b9b26986..81ea18c8 100644
--- a/tests/test_pdf_to_jsonl.py
+++ b/tests/test_pdf_to_jsonl.py
@@ -3,7 +3,7 @@ import json
 import os

 # Internal Packages
-from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
+from khoj.processor.pdf.pdf_to_entries import PdfToEntries
 from khoj.utils.fs_syncer import get_pdf_files
 from khoj.utils.rawconfig import TextContentConfig
@@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl():
         pdf_bytes = f.read()

     data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)

     # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
-        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
+        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl():
         pdf_bytes = f.read()

     data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)

     # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
-        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
+        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
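A sketch of the org-mode path these tests exercise after the rename; `data` is a hypothetical file path to org content dict, and `index_heading_entries` is the flag the heading-entry test toggles above:

```python
import json

from khoj.processor.org_mode.org_to_entries import OrgToEntries

data = {"notes/tasks.org": "* Rename the jsonl processors\nTrack the refactor here."}

# Parse org nodes, convert them to Entry maps (optionally indexing bare headings),
# then serialize to JSONL exactly as the old OrgToJsonl helpers did
jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
    OrgToEntries.convert_org_nodes_to_entries(
        *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=True
    )
)
entries = [json.loads(line) for line in jsonl_string.splitlines()]
```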
diff --git a/tests/test_plaintext_to_jsonl.py b/tests/test_plaintext_to_jsonl.py
index 56c68e38..23b0d652 100644
--- a/tests/test_plaintext_to_jsonl.py
+++ b/tests/test_plaintext_to_jsonl.py
@@ -6,7 +6,7 @@ from pathlib import Path

 # Internal Packages
 from khoj.utils.fs_syncer import get_plaintext_files
 from khoj.utils.rawconfig import TextContentConfig
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
 from database.models import LocalPlaintextConfig, KhojUser
@@ -27,14 +27,14 @@ def test_plaintext_file(tmp_path):
         f"{plaintextfile}": entry,
     }

-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
+    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)

     # Convert each entry.file to absolute path to make them JSON serializable
     for map in maps:
         map.file = str(Path(map.file).absolute())

     # Process Each Entry from All Notes Files
-    jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
+    jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -100,7 +100,7 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
     extracted_plaintext_files = get_plaintext_files(config=config)

     # Act
-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
+    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)

     # Assert
     assert len(maps) == 1
diff --git a/tests/test_text_search.py b/tests/test_text_search.py
index db26ea7b..b5b78646 100644
--- a/tests/test_text_search.py
+++ b/tests/test_text_search.py
@@ -10,8 +10,8 @@ import pytest

 # Internal Packages
 from khoj.search_type import text_search
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.github.github_to_jsonl import GithubToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.github.github_to_entries import GithubToEntries
 from khoj.utils.fs_syncer import collect_files, get_org_files
 from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig
@@ -65,7 +65,7 @@ def test_text_search_setup_with_empty_file_raises_error(
     # Act
     # Generate notes embeddings during asymmetric setup
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message

     verify_embeddings(0, default_user)
@@ -80,7 +80,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
     org_config = LocalOrgConfig.objects.filter(user=default_user).first()
     data = get_org_files(org_config)
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     # Assert
     assert "Deleting all embeddings for file type org" in caplog.text
@@ -94,7 +94,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
     org_config = LocalOrgConfig.objects.filter(user=default_user).first()
     data = get_org_files(org_config)
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     # Assert
     assert "Created 4 new embeddings" in caplog.text
@@ -112,13 +112,13 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
     # Act
     # Generate initial notes embeddings during asymmetric setup
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs

     # Run asymmetric setup again with no changes to data source. Ensure index is not updated
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -148,7 +148,7 @@ async def test_text_search(search_config: SearchConfig):
     await loop.run_in_executor(
         None,
         text_search.setup,
-        OrgToJsonl,
+        OrgToEntries,
         data,
         True,
         True,
@@ -185,7 +185,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
     # Act
     # reload embeddings, entries, notes model after adding new org-mode file
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)

     # Assert
     # verify newly added org-mode entry is split by max tokens
@@ -218,7 +218,7 @@ conda activate khoj
 #+end_src"""
     }
     text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
         data,
         regenerate=False,
         user=default_user,
@@ -237,7 +237,7 @@ conda activate khoj
     # reload embeddings, entries, notes model after adding new org-mode file
     with caplog.at_level(logging.INFO):
         text_search.setup(
-            OrgToJsonl,
+            OrgToEntries,
             data,
             regenerate=False,
             full_corpus=False,
@@ -259,7 +259,7 @@ def test_regenerate_index_with_new_entry(
     data = get_org_files(org_config)

     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
@@ -273,7 +273,7 @@ def test_regenerate_index_with_new_entry(
     # Act
     # regenerate notes jsonl, model embeddings and model to include entry from new file
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     # Assert
     assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message
@@ -298,7 +298,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
     # Act
     # generate embeddings, entries, notes model from scratch after adding new org-mode file
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs
@@ -306,7 +306,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(

     # update embeddings, entries, notes model with no new changes
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -331,7 +331,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
     # load embeddings, entries, notes model after adding new org file with 2 entries
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs
@@ -343,7 +343,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg

     # Act
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -361,7 +361,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
     org_config = LocalOrgConfig.objects.filter(user=default_user).first()
     data = get_org_files(org_config)
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs
@@ -375,7 +375,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
     # Act
     # update embeddings, entries with the newly added note
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -393,7 +393,7 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
     # Act
     # Regenerate github embeddings to test asymmetric setup without caching
     text_search.setup(
-        GithubToJsonl,
+        GithubToEntries,
         {},
         regenerate=True,
         user=default_user,
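A sketch of the regenerate semantics these tests assert on; `data` and `default_user` are assumed fixtures, as in tests/test_text_search.py:

```python
from khoj.search_type import text_search
from khoj.processor.org_mode.org_to_entries import OrgToEntries


def reindex(data: dict[str, str], default_user) -> None:
    # regenerate=True rebuilds the index: stale embeddings are deleted and all entries re-embedded
    text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    # regenerate=False is incremental: unchanged entries are skipped via their hashes,
    # so a second run over identical data should create no new embeddings
    text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
```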