Mirror of https://github.com/khoj-ai/khoj.git (synced 2024-12-11 16:23:02 +01:00)
Rename Files, Classes from X_To_JSONL to more appropriate X_To_Entries
These content processors now convert content into entries in the database instead of entries in a JSONL file.
parent 2ad2055bcb
commit d92a2d03a7
16 changed files with 127 additions and 125 deletions
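The rename itself is mechanical: the abstract base class TextEntries becomes TextToEntries, each concrete processor drops its ToJsonl suffix in favour of ToEntries (OrgToJsonl becomes OrgToEntries, MarkdownToJsonl becomes MarkdownToEntries, and so on), and callers import the matching *_to_entries module paths. A minimal sketch of the new call pattern, pieced together from the calls visible in the diff below; the sample org file content is a hypothetical placeholder, not part of this commit:

# Sketch of the renamed processor classes in use (assumed usage, not code from this commit).
from khoj.processor.org_mode.org_to_entries import OrgToEntries  # was org_to_jsonl.OrgToJsonl
from khoj.processor.text_to_entries import TextToEntries  # was text_to_jsonl.TextEntries

org_files = {"notes.org": "* Heading\nSome note body"}  # hypothetical sample input

# Extract entries from org files, then split them to fit the embedding model,
# mirroring GithubToEntries.process() and the org-mode tests in this diff.
entries = OrgToEntries.convert_org_nodes_to_entries(*OrgToEntries.extract_org_entries(org_files=org_files))
entries = TextToEntries.split_entries_by_max_tokens(entries, max_tokens=256)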
@@ -10,16 +10,16 @@ import requests
 # Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.text_to_entries import TextToEntries
 from database.models import Entry as DbEntry, GithubConfig, KhojUser


 logger = logging.getLogger(__name__)


-class GithubToJsonl(TextEntries):
+class GithubToEntries(TextToEntries):
     def __init__(self, config: GithubConfig):
         super().__init__(config)
         raw_repos = config.githubrepoconfig.all()
@@ -77,24 +77,26 @@ class GithubToJsonl(TextEntries):
         current_entries = []

         with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
-            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *GithubToJsonl.extract_markdown_entries(markdown_files)
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *GithubToEntries.extract_markdown_entries(markdown_files)
             )

         with timer(f"Extract org entries from github repo {repo_shorthand}", logger):
-            current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))
+            current_entries += OrgToEntries.convert_org_nodes_to_entries(
+                *GithubToEntries.extract_org_entries(org_files)
+            )

         with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
             current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)

         with timer(f"Extract issues from github repo {repo_shorthand}", logger):
-            issue_entries = GithubToJsonl.convert_issues_to_entries(
-                *GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
+            issue_entries = GithubToEntries.convert_issues_to_entries(
+                *GithubToEntries.extract_github_issues(self.get_issues(repo_url))
             )
             current_entries += issue_entries

         with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
-            current_entries = TextEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)
+            current_entries = TextToEntries.split_entries_by_max_tokens(current_entries, max_tokens=256)

         return current_entries

@@ -280,7 +282,7 @@ class GithubToJsonl(TextEntries):
         entries = []
         entry_to_file_map = []
         for doc in markdown_files:
-            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+            entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                 doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)
@@ -291,7 +293,7 @@ class GithubToJsonl(TextEntries):
         entry_to_file_map = []

         for doc in org_files:
-            entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
+            entries, entry_to_file_map = OrgToEntries.process_single_org_file(
                 doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)
@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import Tuple, List

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.constants import empty_escape_sequences
 from khoj.utils.rawconfig import Entry
@@ -16,7 +16,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class MarkdownToJsonl(TextEntries):
+class MarkdownToEntries(TextToEntries):
     def __init__(self):
         super().__init__()

@@ -34,8 +34,8 @@ class MarkdownToJsonl(TextEntries):

         # Extract Entries from specified Markdown files
         with timer("Parse entries from Markdown files into dictionaries", logger):
-            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *MarkdownToJsonl.extract_markdown_entries(files)
+            current_entries = MarkdownToEntries.convert_markdown_entries_to_maps(
+                *MarkdownToEntries.extract_markdown_entries(files)
             )

         # Split entries by max tokens supported by model
@@ -67,7 +67,7 @@ class MarkdownToJsonl(TextEntries):
         for markdown_file in markdown_files:
             try:
                 markdown_content = markdown_files[markdown_file]
-                entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                entries, entry_to_file_map = MarkdownToEntries.process_single_markdown_file(
                     markdown_content, markdown_file, entries, entry_to_file_map
                 )
             except Exception as e:
@@ -8,7 +8,7 @@ import requests
 # Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry, NotionContentConfig
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser, NotionConfig

@@ -50,7 +50,7 @@ class NotionBlockType(Enum):
     CALLOUT = "callout"


-class NotionToJsonl(TextEntries):
+class NotionToEntries(TextToEntries):
     def __init__(self, config: NotionConfig):
         super().__init__(config)
         self.config = NotionContentConfig(
@@ -5,7 +5,7 @@ from typing import Iterable, List, Tuple

 # Internal Packages
 from khoj.processor.org_mode import orgnode
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from khoj.utils import state
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class OrgToJsonl(TextEntries):
+class OrgToEntries(TextToEntries):
     def __init__(self):
         super().__init__()

@@ -8,7 +8,7 @@ import base64
 from langchain.document_loaders import PyMuPDFLoader

 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser
@@ -17,7 +17,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class PdfToJsonl(TextEntries):
+class PdfToEntries(TextToEntries):
     def __init__(self):
         super().__init__()

@@ -35,7 +35,7 @@ class PdfToJsonl(TextEntries):

         # Extract Entries from specified Pdf files
         with timer("Parse entries from PDF files into dictionaries", logger):
-            current_entries = PdfToJsonl.convert_pdf_entries_to_maps(*PdfToJsonl.extract_pdf_entries(files))
+            current_entries = PdfToEntries.convert_pdf_entries_to_maps(*PdfToEntries.extract_pdf_entries(files))

         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
@@ -6,7 +6,7 @@ from bs4 import BeautifulSoup


 # Internal Packages
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import Entry
 from database.models import Entry as DbEntry, KhojUser
@@ -15,7 +15,7 @@ from database.models import Entry as DbEntry, KhojUser
 logger = logging.getLogger(__name__)


-class PlaintextToJsonl(TextEntries):
+class PlaintextToEntries(TextToEntries):
     def __init__(self):
         super().__init__()

@@ -35,7 +35,7 @@ class PlaintextToJsonl(TextEntries):
             try:
                 plaintext_content = files[file]
                 if file.endswith(("html", "htm", "xml")):
-                    plaintext_content = PlaintextToJsonl.extract_html_content(
+                    plaintext_content = PlaintextToEntries.extract_html_content(
                         plaintext_content, file.split(".")[-1]
                     )
                 files[file] = plaintext_content
@@ -45,7 +45,7 @@ class PlaintextToJsonl(TextEntries):

         # Extract Entries from specified plaintext files
         with timer("Parse entries from plaintext files", logger):
-            current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(files)
+            current_entries = PlaintextToEntries.convert_plaintext_entries_to_maps(files)

         # Split entries by max tokens supported by model
         with timer("Split entries by max token size supported by model", logger):
@@ -19,7 +19,7 @@ from database.adapters import EntryAdapters
 logger = logging.getLogger(__name__)


-class TextEntries(ABC):
+class TextToEntries(ABC):
     def __init__(self, config: Any = None):
         self.embeddings_model = EmbeddingsModel()
         self.config = config
@@ -85,10 +85,10 @@ class TextEntries(ABC):
     ):
         with timer("Construct current entry hashes", logger):
             hashes_by_file = dict[str, set[str]]()
-            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
             hash_to_current_entries = dict(zip(current_entry_hashes, current_entries))
             for entry in tqdm(current_entries, desc="Hashing Entries"):
-                hashes_by_file.setdefault(entry.file, set()).add(TextEntries.hash_func(key)(entry))
+                hashes_by_file.setdefault(entry.file, set()).add(TextToEntries.hash_func(key)(entry))

         num_deleted_embeddings = 0
         with timer("Preparing dataset for regeneration", logger):
@@ -180,11 +180,11 @@ class TextEntries(ABC):
     ):
         # Hash all current and previous entries to identify new entries
         with timer("Hash previous, current entries", logger):
-            current_entry_hashes = list(map(TextEntries.hash_func(key), current_entries))
-            previous_entry_hashes = list(map(TextEntries.hash_func(key), previous_entries))
+            current_entry_hashes = list(map(TextToEntries.hash_func(key), current_entries))
+            previous_entry_hashes = list(map(TextToEntries.hash_func(key), previous_entries))
             if deletion_filenames is not None:
                 deletion_entries = [entry for entry in previous_entries if entry.file in deletion_filenames]
-                deletion_entry_hashes = list(map(TextEntries.hash_func(key), deletion_entries))
+                deletion_entry_hashes = list(map(TextToEntries.hash_func(key), deletion_entries))
             else:
                 deletion_entry_hashes = []

@@ -10,12 +10,12 @@ from starlette.authentication import requires

 # Internal Packages
 from khoj.utils import state, constants
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
-from khoj.processor.github.github_to_jsonl import GithubToJsonl
-from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.pdf.pdf_to_entries import PdfToEntries
+from khoj.processor.github.github_to_entries import GithubToEntries
+from khoj.processor.notion.notion_to_entries import NotionToEntries
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.search_type import text_search, image_search
 from khoj.routers.helpers import update_telemetry_state
 from khoj.utils.yaml import save_config_to_file_updated_state
@@ -201,7 +201,7 @@ def configure_content(
             logger.info("🦄 Setting up search for orgmode notes")
             # Extract Entries, Generate Notes Embeddings
             text_search.setup(
-                OrgToJsonl,
+                OrgToEntries,
                 files.get("org"),
                 regenerate=regenerate,
                 full_corpus=full_corpus,
@@ -216,7 +216,7 @@ def configure_content(
             logger.info("💎 Setting up search for markdown notes")
             # Extract Entries, Generate Markdown Embeddings
             text_search.setup(
-                MarkdownToJsonl,
+                MarkdownToEntries,
                 files.get("markdown"),
                 regenerate=regenerate,
                 full_corpus=full_corpus,
@@ -232,7 +232,7 @@ def configure_content(
             logger.info("🖨️ Setting up search for pdf")
             # Extract Entries, Generate PDF Embeddings
             text_search.setup(
-                PdfToJsonl,
+                PdfToEntries,
                 files.get("pdf"),
                 regenerate=regenerate,
                 full_corpus=full_corpus,
@@ -248,7 +248,7 @@ def configure_content(
             logger.info("📄 Setting up search for plaintext")
             # Extract Entries, Generate Plaintext Embeddings
             text_search.setup(
-                PlaintextToJsonl,
+                PlaintextToEntries,
                 files.get("plaintext"),
                 regenerate=regenerate,
                 full_corpus=full_corpus,
@@ -281,7 +281,7 @@ def configure_content(
             logger.info("🐙 Setting up search for github")
             # Extract Entries, Generate Github Embeddings
             text_search.setup(
-                GithubToJsonl,
+                GithubToEntries,
                 None,
                 regenerate=regenerate,
                 full_corpus=full_corpus,
@@ -298,7 +298,7 @@ def configure_content(
         if (search_type == None or search_type in state.SearchType.Notion.value) and notion_config:
             logger.info("🔌 Setting up search for notion")
             text_search.setup(
-                NotionToJsonl,
+                NotionToEntries,
                 None,
                 regenerate=regenerate,
                 full_corpus=full_corpus,
@@ -18,7 +18,7 @@ from khoj.utils.models import BaseEncoder
 from khoj.utils.state import SearchType
 from khoj.utils.rawconfig import SearchResponse, Entry
 from khoj.utils.jsonl import load_jsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.text_to_entries import TextToEntries
 from database.adapters import EntryAdapters
 from database.models import KhojUser, Entry as DbEntry

@@ -188,7 +188,7 @@ def rerank_and_sort_results(hits, query):


 def setup(
-    text_to_jsonl: Type[TextEntries],
+    text_to_entries: Type[TextToEntries],
     files: dict[str, str],
     regenerate: bool,
     full_corpus: bool = True,
@@ -196,11 +196,11 @@ def setup(
     config=None,
 ) -> None:
     if config:
-        num_new_embeddings, num_deleted_embeddings = text_to_jsonl(config).process(
+        num_new_embeddings, num_deleted_embeddings = text_to_entries(config).process(
             files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
         )
     else:
-        num_new_embeddings, num_deleted_embeddings = text_to_jsonl().process(
+        num_new_embeddings, num_deleted_embeddings = text_to_entries().process(
             files=files, full_corpus=full_corpus, user=user, regenerate=regenerate
         )

@@ -13,7 +13,7 @@ app = FastAPI()

 # Internal Packages
 from khoj.configure import configure_routes, configure_search_types, configure_middleware
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
 from khoj.search_type import image_search, text_search
 from khoj.utils.config import SearchModels
 from khoj.utils.constants import web_directory
@@ -26,7 +26,7 @@ from khoj.utils.rawconfig import (
 )
 from khoj.utils import state, fs_syncer
 from khoj.routers.indexer import configure_content
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
 from database.models import (
     KhojApiUser,
     LocalOrgConfig,
@@ -134,7 +134,7 @@ def content_config(tmp_path_factory, search_models: SearchModels, default_user:
         user=default_user,
     )

-    text_search.setup(OrgToJsonl, get_sample_data("org"), regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, get_sample_data("org"), regenerate=False, user=default_user)

     if os.getenv("GITHUB_PAT_TOKEN"):
         GithubConfig.objects.create(
@@ -242,7 +242,7 @@ def client(
     # These lines help us Mock the Search models for these search types
     state.search_models.image_search = image_search.initialize_model(search_config.image)
     text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
         get_sample_data("org"),
         regenerate=False,
         user=api_user.user,
@@ -251,7 +251,7 @@ def client(
         content_config.image, state.search_models.image_search, regenerate=False
     )
     text_search.setup(
-        PlaintextToJsonl,
+        PlaintextToEntries,
         get_sample_data("plaintext"),
         regenerate=False,
         user=api_user.user,
@@ -15,7 +15,7 @@ from khoj.utils import state
 from khoj.utils.state import search_models, content_index, config
 from khoj.search_type import text_search, image_search
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
 from database.models import KhojUser
 from database.adapters import EntryAdapters

@@ -176,7 +176,7 @@ def test_regenerate_with_github_fails_without_pat(client):
 @pytest.mark.skip(reason="Flaky test on parallel test runs")
 def test_get_configured_types_via_api(client, sample_org_data):
     # Act
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False)

     enabled_types = EntryAdapters.get_unique_file_types(user=None).all().values_list("file_type", flat=True)

@@ -189,7 +189,7 @@ def test_get_configured_types_via_api(client, sample_org_data):
 def test_get_api_config_types(client, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)

     # Act
     response = client.get(f"/api/config/types", headers=headers)
@@ -255,7 +255,7 @@ def test_image_search(client, content_config: ContentConfig, search_config: Sear
 def test_notes_search(client, search_config: SearchConfig, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
     user_query = quote("How to git install application?")

     # Act
@@ -276,7 +276,7 @@ def test_notes_search_with_only_filters(
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
     text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
         sample_org_data,
         regenerate=False,
         user=default_user,
@@ -298,7 +298,7 @@ def test_notes_search_with_only_filters(
 def test_notes_search_with_include_filter(client, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
     user_query = quote('How to git install application? +"Emacs"')

     # Act
@@ -317,7 +317,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
     # Arrange
     headers = {"Authorization": "Bearer kk-secret"}
     text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
         sample_org_data,
         regenerate=False,
         user=default_user,
@@ -339,7 +339,7 @@ def test_notes_search_with_exclude_filter(client, sample_org_data, default_user:
 def test_different_user_data_not_accessed(client, sample_org_data, default_user: KhojUser):
     # Arrange
     headers = {"Authorization": "Bearer kk-token"}  # Token for default_user2
-    text_search.setup(OrgToJsonl, sample_org_data, regenerate=False, user=default_user)
+    text_search.setup(OrgToEntries, sample_org_data, regenerate=False, user=default_user)
     user_query = quote("How to git install application?")

     # Act
@@ -4,7 +4,7 @@ from pathlib import Path
 import os

 # Internal Packages
-from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.markdown.markdown_to_entries import MarkdownToEntries
 from khoj.utils.fs_syncer import get_markdown_files
 from khoj.utils.rawconfig import TextContentConfig

@@ -23,11 +23,11 @@ def test_markdown_file_with_no_headings_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entry_nodes, file_to_entries = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entry_nodes, file_to_entries = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

     # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
-        MarkdownToJsonl.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
+        MarkdownToEntries.convert_markdown_entries_to_maps(entry_nodes, file_to_entries)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -52,11 +52,11 @@ def test_single_markdown_entry_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entries, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entries, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

     # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(
-        MarkdownToJsonl.convert_markdown_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(
+        MarkdownToEntries.convert_markdown_entries_to_maps(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -81,11 +81,11 @@ def test_multiple_markdown_entries_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entry_strings, entry_to_file_map = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
-    entries = MarkdownToJsonl.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)
+    entry_strings, entry_to_file_map = MarkdownToEntries.extract_markdown_entries(markdown_files=data)
+    entries = MarkdownToEntries.convert_markdown_entries_to_maps(entry_strings, entry_to_file_map)

     # Process Each Entry from All Notes Files
-    jsonl_string = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
+    jsonl_string = MarkdownToEntries.convert_markdown_maps_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -144,7 +144,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):

     # Act
     # Extract Entries from specified Markdown files
-    entries, _ = MarkdownToJsonl.extract_markdown_entries(markdown_files=data)
+    entries, _ = MarkdownToEntries.extract_markdown_entries(markdown_files=data)

     # Assert
     assert len(entries) == 2
@@ -3,8 +3,8 @@ import json
 import os

 # Internal Packages
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.text_to_jsonl import TextEntries
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.text_to_entries import TextToEntries
 from khoj.utils.helpers import is_none_or_empty
 from khoj.utils.rawconfig import Entry
 from khoj.utils.fs_syncer import get_org_files
@@ -29,9 +29,9 @@ def test_configure_heading_entry_to_jsonl(tmp_path):
     for index_heading_entries in [True, False]:
         # Act
         # Extract entries into jsonl from specified Org files
-        jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-            OrgToJsonl.convert_org_nodes_to_entries(
-                *OrgToJsonl.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
+        jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+            OrgToEntries.convert_org_nodes_to_entries(
+                *OrgToEntries.extract_org_entries(org_files=data), index_heading_entries=index_heading_entries
             )
         )
         jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -59,12 +59,12 @@ def test_entry_split_when_exceeds_max_words(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)

     # Split each entry from specified Org files by max words
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        TextEntries.split_entries_by_max_tokens(
-            OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+        TextToEntries.split_entries_by_max_tokens(
+            OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map), max_tokens=4
         )
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
@@ -86,7 +86,7 @@ def test_entry_split_drops_large_words():

     # Act
     # Split entry by max words and drop words larger than max word length
-    processed_entry = TextEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]
+    processed_entry = TextToEntries.split_entries_by_max_tokens([entry], max_word_length=5)[0]

     # Assert
     # "Heading" dropped from compiled version because its over the set max word limit
@@ -109,11 +109,11 @@ def test_entry_with_body_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entries, entry_to_file_map = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, entry_to_file_map = OrgToEntries.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(
-        OrgToJsonl.convert_org_nodes_to_entries(entries, entry_to_file_map)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(
+        OrgToEntries.convert_org_nodes_to_entries(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -136,11 +136,11 @@ Intro text

     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
+    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
-    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -160,11 +160,11 @@ def test_file_with_no_headings_to_jsonl(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entry_nodes, file_to_entries = OrgToJsonl.extract_org_entries(org_files=data)
+    entry_nodes, file_to_entries = OrgToEntries.extract_org_entries(org_files=data)

     # Process Each Entry from All Notes Files
-    entries = OrgToJsonl.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
-    jsonl_string = OrgToJsonl.convert_org_entries_to_jsonl(entries)
+    entries = OrgToEntries.convert_org_nodes_to_entries(entry_nodes, file_to_entries)
+    jsonl_string = OrgToEntries.convert_org_entries_to_jsonl(entries)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -224,7 +224,7 @@ def test_extract_entries_with_different_level_headings(tmp_path):

     # Act
     # Extract Entries from specified Org files
-    entries, _ = OrgToJsonl.extract_org_entries(org_files=data)
+    entries, _ = OrgToEntries.extract_org_entries(org_files=data)

     # Assert
     assert len(entries) == 2
@@ -3,7 +3,7 @@ import json
 import os

 # Internal Packages
-from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
+from khoj.processor.pdf.pdf_to_entries import PdfToEntries

 from khoj.utils.fs_syncer import get_pdf_files
 from khoj.utils.rawconfig import TextContentConfig
@@ -18,11 +18,11 @@ def test_single_page_pdf_to_jsonl():
         pdf_bytes = f.read()

     data = {"tests/data/pdf/singlepage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)

     # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
-        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
+        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -38,11 +38,11 @@ def test_multi_page_pdf_to_jsonl():
         pdf_bytes = f.read()

     data = {"tests/data/pdf/multipage.pdf": pdf_bytes}
-    entries, entry_to_file_map = PdfToJsonl.extract_pdf_entries(pdf_files=data)
+    entries, entry_to_file_map = PdfToEntries.extract_pdf_entries(pdf_files=data)

     # Process Each Entry from All Pdf Files
-    jsonl_string = PdfToJsonl.convert_pdf_maps_to_jsonl(
-        PdfToJsonl.convert_pdf_entries_to_maps(entries, entry_to_file_map)
+    jsonl_string = PdfToEntries.convert_pdf_maps_to_jsonl(
+        PdfToEntries.convert_pdf_entries_to_maps(entries, entry_to_file_map)
     )
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

@@ -6,7 +6,7 @@ from pathlib import Path
 # Internal Packages
 from khoj.utils.fs_syncer import get_plaintext_files
 from khoj.utils.rawconfig import TextContentConfig
-from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+from khoj.processor.plaintext.plaintext_to_entries import PlaintextToEntries
 from database.models import LocalPlaintextConfig, KhojUser


@@ -27,14 +27,14 @@ def test_plaintext_file(tmp_path):
         f"{plaintextfile}": entry,
     }

-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map=data)
+    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(entry_to_file_map=data)

     # Convert each entry.file to absolute path to make them JSON serializable
     for map in maps:
         map.file = str(Path(map.file).absolute())

     # Process Each Entry from All Notes Files
-    jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
+    jsonl_string = PlaintextToEntries.convert_entries_to_jsonl(maps)
     jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]

     # Assert
@@ -100,7 +100,7 @@ def test_parse_html_plaintext_file(content_config, default_user: KhojUser):
     extracted_plaintext_files = get_plaintext_files(config=config)

     # Act
-    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(extracted_plaintext_files)
+    maps = PlaintextToEntries.convert_plaintext_entries_to_maps(extracted_plaintext_files)

     # Assert
     assert len(maps) == 1
@@ -10,8 +10,8 @@ import pytest
 # Internal Packages
 from khoj.search_type import text_search
 from khoj.utils.rawconfig import ContentConfig, SearchConfig
-from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
-from khoj.processor.github.github_to_jsonl import GithubToJsonl
+from khoj.processor.org_mode.org_to_entries import OrgToEntries
+from khoj.processor.github.github_to_entries import GithubToEntries
 from khoj.utils.fs_syncer import collect_files, get_org_files
 from database.models import LocalOrgConfig, KhojUser, Entry, GithubConfig

@@ -65,7 +65,7 @@ def test_text_search_setup_with_empty_file_raises_error(
     # Act
     # Generate notes embeddings during asymmetric setup
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     assert "Created 0 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message
     verify_embeddings(0, default_user)
@@ -80,7 +80,7 @@ def test_text_indexer_deletes_embedding_before_regenerate(
     org_config = LocalOrgConfig.objects.filter(user=default_user).first()
     data = get_org_files(org_config)
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     # Assert
     assert "Deleting all embeddings for file type org" in caplog.text
@@ -94,7 +94,7 @@ def test_text_search_setup_batch_processes(content_config: ContentConfig, defaul
     org_config = LocalOrgConfig.objects.filter(user=default_user).first()
     data = get_org_files(org_config)
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     # Assert
     assert "Created 4 new embeddings" in caplog.text
@@ -112,13 +112,13 @@ def test_text_index_same_if_content_unchanged(content_config: ContentConfig, def
     # Act
     # Generate initial notes embeddings during asymmetric setup
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs

     # Run asymmetric setup again with no changes to data source. Ensure index is not updated
     with caplog.at_level(logging.DEBUG):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -148,7 +148,7 @@ async def test_text_search(search_config: SearchConfig):
     await loop.run_in_executor(
         None,
         text_search.setup,
-        OrgToJsonl,
+        OrgToEntries,
         data,
         True,
         True,
@@ -185,7 +185,7 @@ def test_entry_chunking_by_max_tokens(org_config_with_only_new_file: LocalOrgCon
     # Act
     # reload embeddings, entries, notes model after adding new org-mode file
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)

     # Assert
     # verify newly added org-mode entry is split by max tokens
@@ -218,7 +218,7 @@ conda activate khoj
 #+end_src"""
     }
     text_search.setup(
-        OrgToJsonl,
+        OrgToEntries,
         data,
         regenerate=False,
         user=default_user,
@@ -237,7 +237,7 @@ conda activate khoj
     # reload embeddings, entries, notes model after adding new org-mode file
     with caplog.at_level(logging.INFO):
         text_search.setup(
-            OrgToJsonl,
+            OrgToEntries,
             data,
             regenerate=False,
             full_corpus=False,
@@ -259,7 +259,7 @@ def test_regenerate_index_with_new_entry(
     data = get_org_files(org_config)

     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     assert "Created 10 new embeddings. Deleted 3 embeddings for user " in caplog.records[-1].message

@@ -273,7 +273,7 @@ def test_regenerate_index_with_new_entry(
     # Act
     # regenerate notes jsonl, model embeddings and model to include entry from new file
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)

     # Assert
     assert "Created 11 new embeddings. Deleted 10 embeddings for user " in caplog.records[-1].message
@@ -298,7 +298,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(
     # Act
     # generate embeddings, entries, notes model from scratch after adding new org-mode file
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs

@@ -306,7 +306,7 @@ def test_update_index_with_duplicate_entries_in_stable_order(

     # update embeddings, entries, notes model with no new changes
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -331,7 +331,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg

     # load embeddings, entries, notes model after adding new org file with 2 entries
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs

@@ -343,7 +343,7 @@ def test_update_index_with_deleted_entry(org_config_with_only_new_file: LocalOrg

     # Act
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -361,7 +361,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
     org_config = LocalOrgConfig.objects.filter(user=default_user).first()
     data = get_org_files(org_config)
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=True, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=True, user=default_user)
     initial_logs = caplog.text
     caplog.clear()  # Clear logs

@@ -375,7 +375,7 @@ def test_update_index_with_new_entry(content_config: ContentConfig, new_org_file
     # Act
     # update embeddings, entries with the newly added note
     with caplog.at_level(logging.INFO):
-        text_search.setup(OrgToJsonl, data, regenerate=False, user=default_user)
+        text_search.setup(OrgToEntries, data, regenerate=False, user=default_user)
     final_logs = caplog.text

     # Assert
@@ -393,7 +393,7 @@ def test_text_search_setup_github(content_config: ContentConfig, default_user: K
     # Act
     # Regenerate github embeddings to test asymmetric setup without caching
     text_search.setup(
-        GithubToJsonl,
+        GithubToEntries,
         {},
         regenerate=True,
         user=default_user,