Rename Files, Classes from X_To_JSONL to more appropriate X_To_Entries

These content processors now convert content into entries in the database
instead of entries in a JSONL file.
Debanjum Singh Solanky 2023-11-01 14:51:33 -07:00
parent 2ad2055bcb
commit d92a2d03a7
16 changed files with 127 additions and 125 deletions
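
For reference, the module and class renames applied across these 16 files, expressed as the new Python imports with the pre-rename names noted in comments (compiled from the diffs below):

# New imports after this commit; trailing comments note the old module and class names.
from khoj.processor.text_to_entries import TextToEntries  # was khoj.processor.text_to_jsonl.TextEntries
from khoj.processor.org_mode.org_to_entries import OrgToEntries  # was org_to_jsonl.OrgToJsonl
from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries  # was markdown_to_jsonl.MarkdownToJsonl
from khoj.processor.pdf.pdf_to_entries import PdfToEntries  # was pdf_to_jsonl.PdfToJsonl
from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries  # was plaintext_to_jsonl.PlaintextToJsonl
from khoj.processor.github.github_to_entries import GithubToEntries  # was github_to_jsonl.GithubToJsonl
from khoj.processor.notion.notion_to_entries import NotionToEntries  # was notion_to_jsonl.NotionToJsonl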


@@ -10,16 +10,16 @@ import requests
# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.text_to_entries import TextToEntries
from database.models import Entry as DbEntry, GithubConfig, KhojUser
logger = logging.getLogger(__name__)
-class GithubToJsonl(TextEntries):
+class GithubToEntries(TextToEntries):
    def __init__(self, config: GithubConfig):
        super().__init__(config)
        raw_repos = config.githubrepoconfig.all()

@@ -77,24 +77,26 @@ class GithubToJsonl(TextEntries):
        current_entries = []
        with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
-            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *GithubToJsonl.extract_markdown_entries(markdown_files)
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *GithubToEntries.extract_markdown_entries(markdown_files)
            )
        with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
-            current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
+            current_entries += OrgToEntries.convert_org_nodes_to_entries(
+                *GithubToEntries.extract_org_entries(org_files)
+            )
        with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
            current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
        with timer(f"Extract issues from github repo {repo_shorthand}", logger):
-            issue_entries = GithubToJsonl.convert_issues_to_entries(
-                *GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
+            issue_entries = GithubToEntries.convert_issues_to_entries(
+                *GithubToEntries.extract_github_issues(self.get_issues(repo_url))
            )
            current_entries += issue_entries
        with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
-            current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
        return current_entries

@@ -280,7 +282,7 @@ class GithubToJsonl(TextEntries):
        entries = []
        entry_to_file_map = []
        for doc in markdown_files:
-            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+            entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                doc["content"], doc["path"], entries, entry_to_file_map
            )
        return entries, dict(entry_to_file_map)

@@ -291,7 +293,7 @@ class GithubToJsonl(TextEntries):
        entry_to_file_map = []
        for doc in org_files:
-            entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
+            entries, entry_to_file_map = OrgToEntries.process_single_org_file(
                doc["content"], doc["path"], entries, entry_to_file_map
            )
        return entries, dict(entry_to_file_map)


@@ -6,7 +6,7 @@ from pathlib import Path
from typing import Tuple, List
# Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.constants import empty_escape_sequences
from khoj.utils.rawconfig import Entry

@@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
-class MarkdownToJsonl(TextEntries):
+class MarkdownToEntries(TextToEntries):
    def __init__(self):
        super().__init__()

@@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEntries):
        # Extract Entries from specified Markdown files
        with timer("Parse entries from Markdown files into dictionaries", logger):
-            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *MarkdownToJsonl.extract_markdown_entries(files)
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *MarkdownToEntries.extract_markdown_entries(files)
            )
        # Split entries by max tokens supported by model

@@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEntries):
        for markdown_file in markdown_files:
            try:
                markdown_content = markdown_files[markdown_file]
-                entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                    markdown_content, markdown_file, entries, entry_to_file_map
                )
            except Exception as e:


@@ -8,7 +8,7 @@ import requests
# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, NotionContentConfig
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.rawconfig import Entry
from database.models import Entry as DbEntry, KhojUser, NotionConfig

@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
    CALLOUT = "callout"
-class NotionToJsonl(TextEntries):
+class NotionToEntries(TextToEntries):
    def __init__(self, config: NotionConfig):
        super().__init__(config)
        self.config = NotionContentConfig(


@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple
# Internal Packages
from khoj.processor.org_mode import orgnode
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry
from khoj.utils import state

@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
-class OrgToJsonl(TextEntries):
+class OrgToEntries(TextToEntries):
    def __init__(self):
        super().__init__()


@@ -8,7 +8,7 @@ import base64
from langchain.document_loaders import PyMuPDFLoader
# Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry
from database.models import Entry as DbEntry, KhojUser

@@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
-class PdfToJsonl(TextEntries):
+class PdfToEntries(TextToEntries):
    def __init__(self):
        super().__init__()

@@ -35,7 +35,7 @@ class PdfToJsonl(TextEntries):
        # Extract Entries from specified Pdf files
        with timer("Parse entries from PDF files into dictionaries", logger):
-            current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
+            current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))
        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):


@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup
# Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry
from database.models import Entry as DbEntry, KhojUser

@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
logger = logging.getLogger(__name__)
-class PlaintextToJsonl(TextEntries):
+class PlaintextToEntries(TextToEntries):
    def __init__(self):
        super().__init__()

@@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEntries):
            try:
                plaintext_content = files[file]
                if file.endswith(("html", "htm", "xml")):
-                    plaintext_content = PlaintextToJsonl.extract_html_content(
+                    plaintext_content = PlaintextToEntries.extract_html_content(
                        plaintext_content, file.split(".")[-1]
                    )
                files[file] = plaintext_content

@@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEntries):
        # Extract Entries from specified plaintext files
        with timer("Parse entries from plaintext files", logger):
-            current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
+            current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)
        # Split entries by max tokens supported by model
        with timer("Split entries by max token size supported by model", logger):


@@ -19,7 +19,7 @@ from database.adapters import EntryAdapters
logger = logging.getLogger(__name__)
-class TextEntries(ABC):
+class TextToEntries(ABC):
    def __init__(self, config: Any = None):
        self.embeddings_model = EmbeddingsModel()
        self.config = config

@@ -85,10 +85,10 @@ class TextEntries(ABC):
    ):
        with timer("Construct current entry hashes", logger):
            hashes_by_file = dict[str, set[str]]()
-            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
            hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
            for entry in tqdm(current_entries, desc="Hashing Entries"):
-                hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry))
+                hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))
        num_deleted_embeddings = 0
        with timer("Preparing dataset for regeneration", logger):

@@ -180,11 +180,11 @@ class TextEntries(ABC):
    ):
        # Hash all current and previous entries to identify new entries
        with timer("Hash previous, current entries", logger):
-            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
-            previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries))
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
            if deletion_filenames is not None:
                deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
-                deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries))
+                deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
            else:
                deletion_entry_hashes = []


@@ -10,12 +10,12 @@ from starlette.authentication import requires
# Internal Packages
from khoj.utils import state, constants
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
-from khoj.processor.github.github_to_jsonl import GithubToJsonl
-from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.pdf.pdf_to_entries import PdfToEntries
+from khoj.processor.github.github_to_entries import GithubToEntries
+from khoj.processor.notion.notion_to_entries import NotionToEntries
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.search_type import text_search, image_search
from khoj.routers.helpers import update_telemetry_state
from khoj.utils.yaml import save_config_to_file_updated_state

@@ -201,7 +201,7 @@ def configure_content(
        logger.info("🦄 Setting up search for orgmode notes")
        # Extract Entries, Generate Notes Embeddings
        text_search.setup(
-            OrgToJsonl,
+            OrgToEntries,
            files.get("org"),
            regenerate=regenerate,
            full_corpus=full_corpus,

@@ -216,7 +216,7 @@ def configure_content(
        logger.info("💎 Setting up search for markdown notes")
        # Extract Entries, Generate Markdown Embeddings
        text_search.setup(
-            MarkdownToJsonl,
+            MarkdownToEntries,
            files.get("markdown"),
            regenerate=regenerate,
            full_corpus=full_corpus,

@@ -232,7 +232,7 @@ def configure_content(
        logger.info("🖨️ Setting up search for pdf")
        # Extract Entries, Generate PDF Embeddings
        text_search.setup(
-            PdfToJsonl,
+            PdfToEntries,
            files.get("pdf"),
            regenerate=regenerate,
            full_corpus=full_corpus,

@@ -248,7 +248,7 @@ def configure_content(
        logger.info("📄 Setting up search for plaintext")
        # Extract Entries, Generate Plaintext Embeddings
        text_search.setup(
-            PlaintextToJsonl,
+            PlaintextToEntries,
            files.get("plaintext"),
            regenerate=regenerate,
            full_corpus=full_corpus,

@@ -281,7 +281,7 @@ def configure_content(
        logger.info("🐙 Setting up search for github")
        # Extract Entries, Generate Github Embeddings
        text_search.setup(
-            GithubToJsonl,
+            GithubToEntries,
            None,
            regenerate=regenerate,
            full_corpus=full_corpus,

@@ -298,7 +298,7 @@ def configure_content(
    if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
        logger.info("🔌 Setting up search for notion")
        text_search.setup(
-            NotionToJsonl,
+            NotionToEntries,
            None,
            regenerate=regenerate,
            full_corpus=full_corpus,


@@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder
from khoj.utils.state import SearchType
from khoj.utils.rawconfig import SearchResponse, Entry
from khoj.utils.jsonl import load_jsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
from database.adapters import EntryAdapters
from database.models import KhojUser, Entry as DbEntry

@@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):
def setup(
-    text_to_jsonl: Type[TextEntries],
+    text_to_entries: Type[TextToEntries],
    files: dict[str, str],
    regenerate: bool,
    full_corpus: bool = True,

@@ -196,11 +196,11 @@ def setup(
    config=None,
) -> None:
    if config:
-        num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process(
+        num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
            files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
        )
    else:
-        num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process(
+        num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
            files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
        )
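
With this change, callers of text_search.setup pass one of the renamed processor classes as the first argument. A minimal sketch of the updated call, mirroring the test fixtures further below (the files dict and user object are assumed to exist in the caller's scope):

from khoj.search_type import text_search
from khoj.processor.org_mode.org_to_entries import OrgToEntries

# files: dict mapping file path to file content; user: a KhojUser record (both assumed available).
text_search.setup(OrgToEntries, files, regenerate=False, user=user)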


@@ -13,7 +13,7 @@ app = FastAPI()
# Internal Packages
from khoj.configure import configure_routes, configure_search_types, configure_middleware
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
from khoj.search_type import image_search, text_search
from khoj.utils.config import SearchModels
from khoj.utils.constants import web_directory

@@ -26,7 +26,7 @@ from khoj.utils.rawconfig import (
)
from khoj.utils import state, fs_syncer
from khoj.routers.indexer import configure_content
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
from database.models import (
    KhojApiUser,
    LocalOrgConfig,

@@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
        user=default_user,
    )
-    text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)
    if os.getenv("GITHUB_PAT_TOKEN"):
        GithubConfig.objects.create(

@@ -242,7 +242,7 @@ def client(
    # These lines help us Mock the Search models for these search types
    state.search_models.image_search = image_search.initialize_model(search_config.image)
    text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
        get_sample_data("org"),
        regenerate=False,
        user=api_user.user,

@@ -251,7 +251,7 @@ def client(
        content_config.image, state.search_models.image_search, regenerate=False
    )
    text_search.setup(
-        PlaintextToJsonl,
+        PlaintextToEntries,
        get_sample_data("plaintext"),
        regenerate=False,
        user=api_user.user,


@@ -15,7 +15,7 @@ from khoj.utils import state
from khoj.utils.state import search_models, content_index, config
from khoj.search_type import text_search, image_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
from database.models import KhojUser
from database.adapters import EntryAdapters

@@ -176,7 +176,7 @@ def test_regenerate_with_github_fails_without_pat(client):
@pytest.mark.skip(reason="Flaky test on parallel test runs")
def test_get_configured_types_via_api(client, sample_org_data):
    # Act
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False)
    enabled_types = EntryAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)

@@ -189,7 +189,7 @@ def test_get_configured_types_via_api(client, sample_org_data):
def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    # Act
    response = client.get(f"/api/config/types", headers=headers)

@@ -255,7 +255,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    user_query = quote("How to git install application?")
    # Act

@@ -276,7 +276,7 @@ def test_notes_search_with_only_filters(
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
        sample_org_data,
        regenerate=False,
        user=default_user,

@@ -298,7 +298,7 @@ def test_notes_search_with_only_filters(
def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser):
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    user_query = quote('How to git install application? +"Emacs"')
    # Act

@@ -317,7 +317,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
    # Arrange
    headers = {"Authorization": "Bearer kk-secret"}
    text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
        sample_org_data,
        regenerate=False,
        user=default_user,

@@ -339,7 +339,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
    # Arrange
    headers = {"Authorization": "Bearer kk-token"}  # Token for default_user2
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
    user_query = quote("How to git install application?")
    # Act


@@ -4,7 +4,7 @@ from pathlib import Path
import os
# Internal Packages
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
from khoj.utils.fs_syncer import get_markdown_files
from khoj.utils.rawconfig import TextContentConfig

@@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):
    # Act
    # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
    # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
-        MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
+        MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):
    # Act
    # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
    # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
-        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
+        MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):
    # Act
    # Extract Entries from specified Markdown files
-    entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
-    entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
+    entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
    # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
    # Assert

@@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
    # Act
    # Extract Entries from specified Markdown files
-    entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
    # Assert
    assert len(entries) == 2


@@ -3,8 +3,8 @@ import json
import os
# Internal Packages
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.text_to_entries import TextToEntries
from khoj.utils.helpers import is_none_or_empty
from khoj.utils.rawconfig import Entry
from khoj.utils.fs_syncer import get_org_files

@@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
    for index_heading_entries in [True, False]:
        # Act
        # Extract entries into jsonl from specified Org files
-        jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-            OrgToJsonl.convert_org_nodes_to_entries(
-                *OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
+        jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+            OrgToEntries.convert_org_nodes_to_entries(
+                *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
            )
        )
        jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path):
    # Act
    # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
    # Split each entry from specified Org files by max words
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        TextEntries.split_entries_by_max_tokens(
-            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+        TextToEntries.split_entries_by_max_tokens(
+            OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
        )
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():
    # Act
    # Split entry by max words and drop words larger than max word length
-    processed_entry = TextEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
+    processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
    # Assert
    # "Heading" dropped from compiled version because its over the set max word limit

@@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path):
    # Act
    # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)
    # Process Each Entry from All Notes Files
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+        OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -136,11 +136,11 @@ Intro text
    # Act
    # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
+    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
    # Process Each Entry from All Notes Files
-    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
    # Assert

@@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):
    # Act
    # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
+    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)
    # Process Each Entry from All Notes Files
-    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
    # Assert

@@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):
    # Act
    # Extract Entries from specified Org files
-    entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, _ = OrgToEntries.extract_org_entries(org_files=data)
    # Assert
    assert len(entries) == 2


@@ -3,7 +3,7 @@ import json
import os
# Internal Packages
-from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
+from khoj.processor.pdf.pdf_to_entries import PdfToEntries
from khoj.utils.fs_syncer import get_pdf_files
from khoj.utils.rawconfig import TextContentConfig

@@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl():
        pdf_bytes = f.read()
    data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
    # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
-        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
+        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl():
        pdf_bytes = f.read()
    data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)
    # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
-        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
+        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
    )
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]


@@ -6,7 +6,7 @@ from pathlib import Path
# Internal Packages
from khoj.utils.fs_syncer import get_plaintext_files
from khoj.utils.rawconfig import TextContentConfig
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
from database.models import LocalPlaintextConfig, KhojUser

@@ -27,14 +27,14 @@ def test_plaintext_file(tmp_path):
        f"{plaintextfile}": entry,
    }
-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
+    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)
    # Convert each entry.file to absolute path to make them JSON serializable
    for map in maps:
        map.file = str(Path(map.file).absolute())
    # Process Each Entry from All Notes Files
-    jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
+    jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
    # Assert

@@ -100,7 +100,7 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
    extracted_plaintext_files = get_plaintext_files(config=config)
    # Act
-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
+    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)
    # Assert
    assert len(maps) == 1


@@ -10,8 +10,8 @@ import pytest
# Internal Packages
from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.github.github_to_jsonl import GithubToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.github.github_to_entries import GithubToEntries
from khoj.utils.fs_syncer import collect_files, get_org_files
from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig

@@ -65,7 +65,7 @@ def test_text_search_setup_with_empty_file_raises_error(
    # Act
    # Generate notes embeddings during asymmetric setup
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
    verify_embeddings(0, default_user)

@@ -80,7 +80,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    data = get_org_files(org_config)
    with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    # Assert
    assert "Deleting all embeddings for file type org" in caplog.text

@@ -94,7 +94,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    data = get_org_files(org_config)
    with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    # Assert
    assert "Created 4 new embeddings" in caplog.text

@@ -112,13 +112,13 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
    # Act
    # Generate initial notes embeddings during asymmetric setup
    with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    initial_logs = caplog.text
    caplog.clear()  # Clear logs
    # Run asymmetric setup again with no changes to data source. Ensure index is not updated
    with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
    final_logs = caplog.text
    # Assert

@@ -148,7 +148,7 @@ async def test_text_search(search_config: SearchConfig):
    await loop.run_in_executor(
        None,
        text_search.setup,
-        OrgToJsonl,
+        OrgToEntries,
        data,
        True,
        True,

@@ -185,7 +185,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
    # Act
    # reload embeddings, entries, notes model after adding new org-mode file
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
    # Assert
    # verify newly added org-mode entry is split by max tokens

@@ -218,7 +218,7 @@ conda activate khoj
#+end_src"""
    }
    text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
        data,
        regenerate=False,
        user=default_user,

@@ -237,7 +237,7 @@ conda activate khoj
    # reload embeddings, entries, notes model after adding new org-mode file
    with caplog.at_level(logging.INFO):
        text_search.setup(
-            OrgToJsonl,
+            OrgToEntries,
            data,
            regenerate=False,
            full_corpus=False,

@@ -259,7 +259,7 @@ def test_regenerate_index_with_new_entry(
    data = get_org_files(org_config)
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message

@@ -273,7 +273,7 @@ def test_regenerate_index_with_new_entry(
    # Act
    # regenerate notes jsonl, model embeddings and model to include entry from new file
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    # Assert
    assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message

@@ -298,7 +298,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
    # Act
    # generate embeddings, entries, notes model from scratch after adding new org-mode file
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    initial_logs = caplog.text
    caplog.clear()  # Clear logs

@@ -306,7 +306,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
    # update embeddings, entries, notes model with no new changes
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
    final_logs = caplog.text
    # Assert

@@ -331,7 +331,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
    # load embeddings, entries, notes model after adding new org file with 2 entries
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    initial_logs = caplog.text
    caplog.clear()  # Clear logs

@@ -343,7 +343,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg
    # Act
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
    final_logs = caplog.text
    # Assert

@@ -361,7 +361,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
    org_config = LocalOrgConfig.objects.filter(user=default_user).first()
    data = get_org_files(org_config)
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
    initial_logs = caplog.text
    caplog.clear()  # Clear logs

@@ -375,7 +375,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
    # Act
    # update embeddings, entries with the newly added note
    with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
    final_logs = caplog.text
    # Assert

@@ -393,7 +393,7 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
    # Act
    # Regenerate github embeddings to test asymmetric setup without caching
    text_search.setup(
-        GithubToJsonl,
+        GithubToEntries,
        {},
        regenerate=True,
        user=default_user,