diff --git a/src/khoj/configure.py b/src/khoj/configure.py
index 5e569f2a..9cde97c0 100644
--- a/src/khoj/configure.py
+++ b/src/khoj/configure.py
@@ -17,6 +17,7 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
+from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils import constants, state
from khoj.utils.config import (
@@ -208,6 +209,22 @@ def configure_content(
filters=[DateFilter(), WordFilter(), FileFilter()],
)
+ # Initialize Plaintext Search
+ if (
+ (t == None or t.value == state.SearchType.Plaintext.value)
+ and content_config.plaintext
+ and search_models.text_search
+ ):
+ logger.info("📄 Setting up search for plaintext")
+ # Extract Entries, Generate Plaintext Embeddings
+ content_index.plaintext = text_search.setup(
+ PlaintextToJsonl,
+ content_config.plaintext,
+ search_models.text_search.bi_encoder,
+ regenerate=regenerate,
+ filters=[DateFilter(), WordFilter(), FileFilter()],
+ )
+
# Initialize Image Search
if (
(t == None or t.value == state.SearchType.Image.value)
diff --git a/src/khoj/interface/web/assets/icons/plaintext.svg b/src/khoj/interface/web/assets/icons/plaintext.svg
new file mode 100644
index 00000000..92233c24
--- /dev/null
+++ b/src/khoj/interface/web/assets/icons/plaintext.svg
@@ -0,0 +1 @@
+
diff --git a/src/khoj/interface/web/config.html b/src/khoj/interface/web/config.html
index 9763f0da..18e0cbd4 100644
--- a/src/khoj/interface/web/config.html
+++ b/src/khoj/interface/web/config.html
@@ -180,6 +180,41 @@
{% endif %}
+
diff --git a/src/khoj/interface/web/content_type_input.html b/src/khoj/interface/web/content_type_input.html
index 8c0e5b2e..3ef512f8 100644
--- a/src/khoj/interface/web/content_type_input.html
+++ b/src/khoj/interface/web/content_type_input.html
@@ -106,31 +106,43 @@
submit.addEventListener("click", function(event) {
event.preventDefault();
- let suffix = ""
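+            // Expand each input filter directory into recursive globs of the form <dir>/**/*.<suffix>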
+ let globFormat = "**/*."
+ let suffixes = [];
if ('{{content_type}}' == "markdown")
- suffix = "**/*.md"
+ suffixes = ["md", "markdown"]
else if ('{{content_type}}' == "org")
- suffix = "**/*.org"
+ suffixes = ["org"]
else if ('{{content_type}}' === "pdf")
- suffix = "**/*.pdf"
+ suffixes = ["pdf"]
+ else if ('{{content_type}}' === "plaintext")
+                suffixes = ["*"]
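+                // "*" matches every extension; unsupported files are skipped during indexing by PlaintextToJsonl.get_plaintext_files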
var inputFileNodes = document.getElementsByName("input-files");
- var input_files = getValidInputNodes(inputFileNodes).map(node => node.value);
+ var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
var inputFilterNodes = document.getElementsByName("input-filter");
- var input_filter = getValidInputNodes(inputFilterNodes).map(node => `${node.value}/${suffix}`);
- if (input_files.length === 0 && input_filter.length === 0) {
+ var inputFilter = [];
+ var nodes = getValidInputNodes(inputFilterNodes);
+ if (nodes.length > 0) {
+ for (var i = 0; i < nodes.length; i++) {
+ for (var j = 0; j < suffixes.length; j++) {
+ inputFilter.push(nodes[i].value + globFormat + suffixes[j]);
+ }
+ }
+ }
+
+ if (inputFiles.length === 0 && inputFilter.length === 0) {
alert("You must specify at least one input file or input filter.");
return;
}
- if (input_files.length == 0) {
- input_files = null;
+ if (inputFiles.length == 0) {
+ inputFiles = null;
}
- if (input_filter.length == 0) {
- input_filter = null;
+ if (inputFilter.length == 0) {
+ inputFilter = null;
}
var compressed_jsonl = document.getElementById("compressed-jsonl").value;
@@ -145,8 +157,8 @@
'X-CSRFToken': csrfToken
},
body: JSON.stringify({
- "input_files": input_files,
- "input_filter": input_filter,
+ "input_files": inputFiles,
+ "input_filter": inputFilter,
"compressed_jsonl": compressed_jsonl,
"embeddings_file": embeddings_file,
"index_heading_entries": index_heading_entries
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index cfb8e655..e8b63a4d 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -73,6 +73,8 @@
html += render_pdf(query, [item]);
} else if (item.additional.file.includes("notion.so")) {
html += `
`;
+ } else {
+ html += `
`;
}
});
return html;
@@ -412,6 +414,7 @@
div.results-markdown,
div.results-notion,
div.results-org,
+ div.results-plugin,
div.results-pdf {
text-align: left;
box-shadow: 2px 2px 2px var(--primary-hover);
diff --git a/src/khoj/processor/plaintext/__init__.py b/src/khoj/processor/plaintext/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/khoj/processor/plaintext/plaintext_to_jsonl.py b/src/khoj/processor/plaintext/plaintext_to_jsonl.py
new file mode 100644
index 00000000..8a740f6d
--- /dev/null
+++ b/src/khoj/processor/plaintext/plaintext_to_jsonl.py
@@ -0,0 +1,117 @@
+# Standard Packages
+import glob
+import logging
+from pathlib import Path
+from typing import List
+
+# Internal Packages
+from khoj.processor.text_to_jsonl import TextToJsonl
+from khoj.utils.helpers import get_absolute_path, timer
+from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
+from khoj.utils.rawconfig import Entry
+
+
+logger = logging.getLogger(__name__)
+
+
+class PlaintextToJsonl(TextToJsonl):
+ # Define Functions
+    def process(self, previous_entries: List[Entry] = []):
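+        "Extract entries from plaintext files, identify new or updated entries and compress them to JSONL"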
+ # Extract required fields from config
+ input_files, input_filter, output_file = (
+ self.config.input_files,
+ self.config.input_filter,
+ self.config.compressed_jsonl,
+ )
+
+ # Get Plaintext Input Files to Process
+ all_input_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
+
+ # Extract Entries from specified plaintext files
+ with timer("Parse entries from plaintext files", logger):
+ current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(
+ PlaintextToJsonl.extract_plaintext_entries(all_input_plaintext_files)
+ )
+
+ # Split entries by max tokens supported by model
+ with timer("Split entries by max token size supported by model", logger):
+ current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+ # Identify, mark and merge any new entries with previous entries
+ with timer("Identify new or updated entries", logger):
+ entries_with_ids = TextToJsonl.mark_entries_for_update(
+ current_entries, previous_entries, key="compiled", logger=logger
+ )
+
+ with timer("Write entries to JSONL file", logger):
+            # Process Each Entry from All Plaintext Files
+ entries = list(map(lambda entry: entry[1], entries_with_ids))
+ plaintext_data = PlaintextToJsonl.convert_entries_to_jsonl(entries)
+
+ # Compress JSONL formatted Data
+ compress_jsonl_data(plaintext_data, output_file)
+
+ return entries_with_ids
+
+ @staticmethod
+ def get_plaintext_files(plaintext_files=None, plaintext_file_filters=None):
+ "Get all files to process"
+ absolute_plaintext_files, filtered_plaintext_files = set(), set()
+ if plaintext_files:
+            absolute_plaintext_files = {get_absolute_path(plaintext_file) for plaintext_file in plaintext_files}
+ if plaintext_file_filters:
+ filtered_plaintext_files = {
+ filtered_file
+                for plaintext_file_filter in plaintext_file_filters
+                for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
+ }
+
+ all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
+
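+        # Drop any files that do not have a supported plaintext extension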
+        files_with_no_plaintext_extensions = {
+            target_file for target_file in all_target_files if not PlaintextToJsonl.is_plaintextfile(target_file)
+        }
+        if files_with_no_plaintext_extensions:
+            logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
+            all_target_files = sorted(set(all_target_files) - files_with_no_plaintext_extensions)
+
+ logger.debug(f"Processing files: {all_target_files}")
+
+ return all_target_files
+
+ @staticmethod
+ def is_plaintextfile(file: str):
+        "Check if the file is a supported plaintext file"
+        return file.endswith((".txt", ".md", ".markdown", ".org", ".mbox", ".rst", ".html", ".htm", ".xml"))
+
+ @staticmethod
+ def extract_plaintext_entries(plaintext_files: List[str]):
+ "Extract entries from specified plaintext files"
+ entry_to_file_map = []
+
+ for plaintext_file in plaintext_files:
+ with open(plaintext_file, "r") as f:
+ plaintext_content = f.read()
+ entry_to_file_map.append((plaintext_content, plaintext_file))
+
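+        # Note: the map is keyed by file content, so files with identical content collapse into a single entry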
+ return dict(entry_to_file_map)
+
+ @staticmethod
+ def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
+        "Convert plaintext file content into a list of Entry objects"
+ entries = []
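+        # Use the file name (sans extension) as the entry heading and prepend it to the compiled entry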
+ for entry, file in entry_to_file_map.items():
+ entries.append(
+ Entry(
+ raw=entry,
+ file=file,
+ compiled=f"{Path(file).stem}\n{entry}",
+ heading=Path(file).stem,
+ )
+ )
+ return entries
+
+ @staticmethod
+ def convert_entries_to_jsonl(entries: List[Entry]):
+ "Convert each entry to JSON and collate as JSONL"
+ return "".join([f"{entry.to_json()}\n" for entry in entries])
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 1ff926e4..69613177 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -525,6 +525,25 @@ async def search(
)
]
+ if (
+ (t == SearchType.Plaintext or t == SearchType.All)
+ and state.content_index.plaintext
+ and state.search_models.text_search
+ ):
+ # query plaintext files
+ search_futures += [
+ executor.submit(
+ text_search.query,
+ user_query,
+ state.search_models.text_search,
+ state.content_index.plaintext,
+ question_embedding=encoded_asymmetric_query,
+ rank_results=r or False,
+ score_threshold=score_threshold,
+ dedupe=dedupe or True,
+ )
+ ]
+
# Query across each requested content types in parallel
with timer("Query took", logger):
for search_future in concurrent.futures.as_completed(search_futures):
diff --git a/src/khoj/routers/web_client.py b/src/khoj/routers/web_client.py
index f856aeb5..492a263c 100644
--- a/src/khoj/routers/web_client.py
+++ b/src/khoj/routers/web_client.py
@@ -15,7 +15,7 @@ import json
web_client = APIRouter()
templates = Jinja2Templates(directory=constants.web_directory)
-VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf"]
+VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf", "plaintext"]
# Create Routes
@@ -47,6 +47,7 @@ if not state.demo:
"image": False,
"github": False,
"notion": False,
+ "plaintext": False,
"enable_offline_model": False,
"conversation_openai": False,
"conversation_gpt4all": False,
@@ -61,6 +62,7 @@ if not state.demo:
"image": state.content_index.image is not None,
"github": state.content_index.github is not None,
"notion": state.content_index.notion is not None,
+ "plaintext": state.content_index.plaintext is not None,
}
)
diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py
index 4e254bee..a6532346 100644
--- a/src/khoj/utils/config.py
+++ b/src/khoj/utils/config.py
@@ -30,6 +30,7 @@ class SearchType(str, Enum):
Pdf = "pdf"
Github = "github"
Notion = "notion"
+ Plaintext = "plaintext"
class ProcessorType(str, Enum):
@@ -70,6 +71,7 @@ class ContentIndex:
github: Optional[TextContent] = None
notion: Optional[TextContent] = None
image: Optional[ImageContent] = None
+ plaintext: Optional[TextContent] = None
plugins: Optional[Dict[str, TextContent]] = None
diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py
index 1b0efc00..c5a67714 100644
--- a/src/khoj/utils/constants.py
+++ b/src/khoj/utils/constants.py
@@ -46,6 +46,12 @@ default_config = {
"compressed-jsonl": "~/.khoj/content/notion/notion.jsonl.gz",
"embeddings-file": "~/.khoj/content/notion/notion_embeddings.pt",
},
+ "plaintext": {
+ "input-files": None,
+ "input-filter": None,
+ "compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz",
+ "embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt",
+ },
},
"search-type": {
"symmetric": {
diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py
index af7dda67..13c9b1cf 100644
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@@ -79,6 +79,7 @@ class ContentConfig(ConfigBase):
image: Optional[ImageContentConfig]
markdown: Optional[TextContentConfig]
pdf: Optional[TextContentConfig]
+ plaintext: Optional[TextContentConfig]
github: Optional[GithubContentConfig]
plugins: Optional[Dict[str, TextContentConfig]]
notion: Optional[NotionContentConfig]
diff --git a/tests/data/config.yml b/tests/data/config.yml
index 96009a42..06978cf1 100644
--- a/tests/data/config.yml
+++ b/tests/data/config.yml
@@ -25,4 +25,4 @@ search-type:
asymmetric:
cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
encoder: sentence-transformers/msmarco-MiniLM-L-6-v3
-version: 0.9.1.dev0
+version: 0.10.1
diff --git a/tests/test_plaintext_to_jsonl.py b/tests/test_plaintext_to_jsonl.py
new file mode 100644
index 00000000..f9968ae4
--- /dev/null
+++ b/tests/test_plaintext_to_jsonl.py
@@ -0,0 +1,84 @@
+# Standard Packages
+import json
+from pathlib import Path
+
+# Internal Packages
+from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+
+
+def test_plaintext_file(tmp_path):
+ "Convert files with no heading to jsonl."
+ # Arrange
+ entry = f"""
+ Hi, I am a plaintext file and I have some plaintext words.
+ """
+ plaintextfile = create_file(tmp_path, entry)
+
+ filename = plaintextfile.stem
+
+ # Act
+ # Extract Entries from specified plaintext files
+    entry_to_file_map = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile])
+
+    maps = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map)
+
+    # Convert each entry's file path into an absolute path to make it JSON serializable
+    for entry in maps:
+        entry.file = str(Path(entry.file).absolute())
+
+    # Process Each Entry from All Plaintext Files
+ jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(maps)
+ jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+ # Assert
+ assert len(jsonl_data) == 1
+    # Ensure raw entries with no headings do not get a heading prefix prepended
+ assert not jsonl_data[0]["raw"].startswith("#")
+ # Ensure compiled entry has filename prepended as top level heading
+ assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}"
+
+
+def test_get_plaintext_files(tmp_path):
+    "Ensure plaintext files specified via input-files and input-filter are extracted"
+ # Arrange
+ # Include via input-filter globs
+ group1_file1 = create_file(tmp_path, filename="group1-file1.md")
+ group1_file2 = create_file(tmp_path, filename="group1-file2.md")
+
+ group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
+ group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
+ group2_file3 = create_file(tmp_path, filename="group2-file3.mbox")
+ group2_file4 = create_file(tmp_path, filename="group2-file4.html")
+ # Include via input-file field
+ file1 = create_file(tmp_path, filename="notes.txt")
+    # Unsupported file types that should be excluded
+    create_file(tmp_path, filename="group2-unincluded.py")
+    create_file(tmp_path, filename="group2-unincluded.csv")
+ # Not included by any filter
+ create_file(tmp_path, filename="not-included-markdown.md")
+ create_file(tmp_path, filename="not-included-text.txt")
+
+ expected_files = sorted(
+ map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1, group2_file3, group2_file4])
+ )
+
+ # Setup input-files, input-filters
+ input_files = [tmp_path / "notes.txt"]
+ input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
+
+ # Act
+ extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
+
+ # Assert
+ assert len(extracted_plaintext_files) == 7
+ assert set(extracted_plaintext_files) == set(expected_files)
+
+
+# Helper Functions
+def create_file(tmp_path: Path, entry=None, filename="test.md"):
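+    "Create a file with the given content and filename under the temp directory"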
+ file_ = tmp_path / filename
+ file_.touch()
+ if entry:
+ file_.write_text(entry)
+ return file_