diff --git a/src/khoj/configure.py b/src/khoj/configure.py index 5e569f2a..9cde97c0 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -17,6 +17,7 @@ from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl from khoj.processor.github.github_to_jsonl import GithubToJsonl from khoj.processor.notion.notion_to_jsonl import NotionToJsonl +from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl from khoj.search_type import image_search, text_search from khoj.utils import constants, state from khoj.utils.config import ( @@ -208,6 +209,22 @@ def configure_content( filters=[DateFilter(), WordFilter(), FileFilter()], ) + # Initialize Plaintext Search + if ( + (t == None or t.value == state.SearchType.Plaintext.value) + and content_config.plaintext + and search_models.text_search + ): + logger.info("📄 Setting up search for plaintext") + # Extract Entries, Generate Plaintext Embeddings + content_index.plaintext = text_search.setup( + PlaintextToJsonl, + content_config.plaintext, + search_models.text_search.bi_encoder, + regenerate=regenerate, + filters=[DateFilter(), WordFilter(), FileFilter()], + ) + # Initialize Image Search if ( (t == None or t.value == state.SearchType.Image.value) diff --git a/src/khoj/interface/web/assets/icons/plaintext.svg b/src/khoj/interface/web/assets/icons/plaintext.svg new file mode 100644 index 00000000..92233c24 --- /dev/null +++ b/src/khoj/interface/web/assets/icons/plaintext.svg @@ -0,0 +1 @@ + diff --git a/src/khoj/interface/web/config.html b/src/khoj/interface/web/config.html index 9763f0da..18e0cbd4 100644 --- a/src/khoj/interface/web/config.html +++ b/src/khoj/interface/web/config.html @@ -180,6 +180,41 @@ {% endif %} +
+
+ Plaintext +

+ Plaintext + {% if current_config.content_type.plaintext %} + {% if current_model_state.plaintext == False %} + Not Configured + {% else %} + Configured + {% endif %} + {% endif %} +

+
+
+

Set Plaintext files to index

+
+
+ + {% if current_config.content_type.plaintext %} + Update + {% else %} + Setup + {% endif %} + + +
+ {% if current_config.content_type.plaintext %} +
+ +
+ {% endif %} +
diff --git a/src/khoj/interface/web/content_type_input.html b/src/khoj/interface/web/content_type_input.html
index 8c0e5b2e..3ef512f8 100644
--- a/src/khoj/interface/web/content_type_input.html
+++ b/src/khoj/interface/web/content_type_input.html
@@ -106,31 +106,44 @@
         submit.addEventListener("click", function(event) {
             event.preventDefault();
 
-            let suffix = ""
+            // Recursive glob appended to each directory filter. The leading "/" keeps the directory
+            // and the "**" wildcard in separate path components, e.g. "~/notes" -> "~/notes/**/*.md"
+            let globFormat = "/**/*."
+            let suffixes = [];
             if ('{{content_type}}' == "markdown")
-                suffix = "**/*.md"
+                suffixes = ["md", "markdown"]
             else if ('{{content_type}}' == "org")
-                suffix = "**/*.org"
+                suffixes = ["org"]
             else if ('{{content_type}}' === "pdf")
-                suffix = "**/*.pdf"
+                suffixes = ["pdf"]
+            else if ('{{content_type}}' === "plaintext")
+                suffixes = ['*']
 
             var inputFileNodes = document.getElementsByName("input-files");
-            var input_files = getValidInputNodes(inputFileNodes).map(node => node.value);
+            var inputFiles = getValidInputNodes(inputFileNodes).map(node => node.value);
 
             var inputFilterNodes = document.getElementsByName("input-filter");
-            var input_filter = getValidInputNodes(inputFilterNodes).map(node => `${node.value}/${suffix}`);
 
-            if (input_files.length === 0 && input_filter.length === 0) {
+            // Build one glob per (directory filter, file suffix) pair
+            var inputFilter = [];
+            var nodes = getValidInputNodes(inputFilterNodes);
+            for (var i = 0; i < nodes.length; i++) {
+                for (var j = 0; j < suffixes.length; j++) {
+                    inputFilter.push(nodes[i].value + globFormat + suffixes[j]);
+                }
+            }
+
+            if (inputFiles.length === 0 && inputFilter.length === 0) {
                 alert("You must specify at least one input file or input filter.");
                 return;
             }
 
-            if (input_files.length == 0) {
-                input_files = null;
+            if (inputFiles.length == 0) {
+                inputFiles = null;
             }
 
-            if (input_filter.length == 0) {
-                input_filter = null;
+            if (inputFilter.length == 0) {
+                inputFilter = null;
             }
 
             var compressed_jsonl = document.getElementById("compressed-jsonl").value;
@@ -145,8 +158,8 @@
                     'X-CSRFToken': csrfToken
                 },
                 body: JSON.stringify({
-                    "input_files": input_files,
-                    "input_filter": input_filter,
+                    "input_files": inputFiles,
+                    "input_filter": inputFilter,
                     "compressed_jsonl": compressed_jsonl,
                     "embeddings_file": embeddings_file,
                     "index_heading_entries": index_heading_entries
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index cfb8e655..e8b63a4d 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -73,6 +73,8 @@
                     html += render_pdf(query, [item]);
                 } else if (item.additional.file.includes("notion.so")) {
                     html += `
` + `${item.additional.heading}` + `

${item.entry}

` + `
`; + } else { + html += `
` + `${item.additional.heading}` + `

${item.entry}

` + `
`;
                 }
             });
             return html;
@@ -412,6 +414,7 @@
     div.results-markdown,
     div.results-notion,
     div.results-org,
+    div.results-plugin,
     div.results-pdf {
         text-align: left;
         box-shadow: 2px 2px 2px var(--primary-hover);
diff --git a/src/khoj/processor/plaintext/__init__.py b/src/khoj/processor/plaintext/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/khoj/processor/plaintext/plaintext_to_jsonl.py b/src/khoj/processor/plaintext/plaintext_to_jsonl.py
new file mode 100644
index 00000000..8a740f6d
--- /dev/null
+++ b/src/khoj/processor/plaintext/plaintext_to_jsonl.py
@@ -0,0 +1,140 @@
+# Standard Packages
+import glob
+import logging
+from pathlib import Path
+from typing import List
+
+# Internal Packages
+from khoj.processor.text_to_jsonl import TextToJsonl
+from khoj.utils.helpers import get_absolute_path, timer
+from khoj.utils.jsonl import load_jsonl, compress_jsonl_data
+from khoj.utils.rawconfig import Entry
+
+
+logger = logging.getLogger(__name__)
+
+
+class PlaintextToJsonl(TextToJsonl):
+    """Convert plaintext files into entries and store them as compressed JSONL."""
+
+    # File extensions (lowercase, with leading dot) that are indexed as plaintext
+    PLAINTEXT_EXTENSIONS = frozenset({".txt", ".md", ".markdown", ".org", ".mbox", ".rst", ".html", ".htm", ".xml"})
+
+    # Define Functions
+    def process(self, previous_entries=None):
+        """Index the configured plaintext files and write the entries to the compressed JSONL file.
+
+        previous_entries holds entries from an earlier run; it is passed to
+        TextToJsonl.mark_entries_for_update, whose result is returned.
+        """
+        # Use a fresh list per call instead of a mutable default shared across calls
+        if previous_entries is None:
+            previous_entries = []
+
+        # Extract required fields from config
+        input_files, input_filter, output_file = (
+            self.config.input_files,
+            self.config.input_filter,
+            self.config.compressed_jsonl,
+        )
+
+        # Get Plaintext Input Files to Process
+        all_input_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
+
+        # Extract Entries from specified plaintext files
+        with timer("Parse entries from plaintext files", logger):
+            current_entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(
+                PlaintextToJsonl.extract_plaintext_entries(all_input_plaintext_files)
+            )
+
+        # Split entries by max tokens supported by model
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = self.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
+                current_entries, previous_entries, key="compiled", logger=logger
+            )
+
+        with timer("Write entries to JSONL file", logger):
+            # Process Each Entry from All Plaintext Files
+            entries = [entry_with_id[1] for entry_with_id in entries_with_ids]
+            plaintext_data = PlaintextToJsonl.convert_entries_to_jsonl(entries)
+
+            # Compress JSONL formatted Data
+            compress_jsonl_data(plaintext_data, output_file)
+
+        return entries_with_ids
+
+    @staticmethod
+    def get_plaintext_files(plaintext_files=None, plaintext_file_filters=None):
+        """Get the sorted list of plaintext files to process.
+
+        plaintext_files are explicit paths; plaintext_file_filters are glob patterns expanded recursively.
+        Files without a supported plaintext extension are skipped with a warning.
+        """
+        absolute_plaintext_files, filtered_plaintext_files = set(), set()
+        if plaintext_files:
+            absolute_plaintext_files = {get_absolute_path(plaintext_file) for plaintext_file in plaintext_files}
+        if plaintext_file_filters:
+            filtered_plaintext_files = {
+                filtered_file
+                for plaintext_file_filter in plaintext_file_filters
+                for filtered_file in glob.glob(get_absolute_path(plaintext_file_filter), recursive=True)
+            }
+
+        all_target_files = sorted(absolute_plaintext_files | filtered_plaintext_files)
+
+        files_with_no_plaintext_extensions = {
+            target_file for target_file in all_target_files if not PlaintextToJsonl.is_plaintextfile(target_file)
+        }
+        if files_with_no_plaintext_extensions:
+            logger.warning(f"Skipping unsupported files from plaintext indexing: {files_with_no_plaintext_extensions}")
+            # Filter the sorted list (instead of a set difference) so the file order stays deterministic
+            all_target_files = [f for f in all_target_files if f not in files_with_no_plaintext_extensions]
+
+        logger.debug(f"Processing files: {all_target_files}")
+
+        return all_target_files
+
+    @staticmethod
+    def is_plaintextfile(file: str):
+        """Check if file has one of the supported plaintext extensions (case-insensitive)"""
+        # Compare the actual suffix so names merely ending in the letters (e.g. "contxt") are rejected
+        return Path(file).suffix.lower() in PlaintextToJsonl.PLAINTEXT_EXTENSIONS
+
+    @staticmethod
+    def extract_plaintext_entries(plaintext_files: List[str]):
+        """Read each plaintext file and map its content to the file it was read from.
+
+        NOTE: The returned dict is keyed by file content, so files with identical content
+        collapse into one item that points at the last such file read.
+        """
+        entry_to_file_map = []
+
+        for plaintext_file in plaintext_files:
+            # Decode as UTF-8 explicitly so indexing does not depend on the platform's default encoding
+            with open(plaintext_file, "r", encoding="utf-8") as f:
+                plaintext_content = f.read()
+                entry_to_file_map.append((plaintext_content, plaintext_file))
+
+        return dict(entry_to_file_map)
+
+    @staticmethod
+    def convert_plaintext_entries_to_maps(entry_to_file_map: dict) -> List[Entry]:
+        """Convert each (content, file) item into an Entry headed by the file name stem"""
+        return [
+            Entry(
+                raw=entry,
+                file=file,
+                compiled=f"{Path(file).stem}\n{entry}",
+                heading=Path(file).stem,
+            )
+            for entry, file in entry_to_file_map.items()
+        ]
+
+    @staticmethod
+    def convert_entries_to_jsonl(entries: List[Entry]):
+        """Convert each entry to JSON and collate as JSONL"""
+        return "".join(f"{entry.to_json()}\n" for entry in entries)
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 1ff926e4..69613177 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -525,6 +525,25 @@ async def search(
             )
         ]
 
+        if (
+            (t == SearchType.Plaintext or t == SearchType.All)
+            and state.content_index.plaintext
+            and state.search_models.text_search
+        ):
+            # query plaintext files
+            search_futures += [
+                executor.submit(
+                    text_search.query,
+                    user_query,
+                    state.search_models.text_search,
+                    state.content_index.plaintext,
+                    question_embedding=encoded_asymmetric_query,
+                    rank_results=r or False,
+                    score_threshold=score_threshold,
+                    dedupe=dedupe or True,
+                )
+            ]
+
         # Query across each requested content types in parallel
         with timer("Query took", logger):
             for search_future in concurrent.futures.as_completed(search_futures):
diff --git a/src/khoj/routers/web_client.py b/src/khoj/routers/web_client.py
index f856aeb5..492a263c 100644
--- a/src/khoj/routers/web_client.py
+++ b/src/khoj/routers/web_client.py
@@ -15,7 +15,7 @@ import json
 
 web_client = APIRouter()
 templates = Jinja2Templates(directory=constants.web_directory)
-VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf"]
+VALID_TEXT_CONTENT_TYPES = ["org", "markdown", "pdf", "plaintext"]
 
 
 # Create Routes
@@ -47,6 +47,7 @@ if not state.demo:
                 "image": False,
                 "github": False,
"notion": False, + "plaintext": False, "enable_offline_model": False, "conversation_openai": False, "conversation_gpt4all": False, @@ -61,6 +62,7 @@ if not state.demo: "image": state.content_index.image is not None, "github": state.content_index.github is not None, "notion": state.content_index.notion is not None, + "plaintext": state.content_index.plaintext is not None, } ) diff --git a/src/khoj/utils/config.py b/src/khoj/utils/config.py index 4e254bee..a6532346 100644 --- a/src/khoj/utils/config.py +++ b/src/khoj/utils/config.py @@ -30,6 +30,7 @@ class SearchType(str, Enum): Pdf = "pdf" Github = "github" Notion = "notion" + Plaintext = "plaintext" class ProcessorType(str, Enum): @@ -70,6 +71,7 @@ class ContentIndex: github: Optional[TextContent] = None notion: Optional[TextContent] = None image: Optional[ImageContent] = None + plaintext: Optional[TextContent] = None plugins: Optional[Dict[str, TextContent]] = None diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 1b0efc00..c5a67714 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -46,6 +46,12 @@ default_config = { "compressed-jsonl": "~/.khoj/content/notion/notion.jsonl.gz", "embeddings-file": "~/.khoj/content/notion/notion_embeddings.pt", }, + "plaintext": { + "input-files": None, + "input-filter": None, + "compressed-jsonl": "~/.khoj/content/plaintext/plaintext.jsonl.gz", + "embeddings-file": "~/.khoj/content/plaintext/plaintext_embeddings.pt", + }, }, "search-type": { "symmetric": { diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index af7dda67..13c9b1cf 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -79,6 +79,7 @@ class ContentConfig(ConfigBase): image: Optional[ImageContentConfig] markdown: Optional[TextContentConfig] pdf: Optional[TextContentConfig] + plaintext: Optional[TextContentConfig] github: Optional[GithubContentConfig] plugins: Optional[Dict[str, TextContentConfig]] notion: 
Optional[NotionContentConfig]
diff --git a/tests/data/config.yml b/tests/data/config.yml
index 96009a42..06978cf1 100644
--- a/tests/data/config.yml
+++ b/tests/data/config.yml
@@ -25,4 +25,4 @@ search-type:
   asymmetric:
     cross-encoder: cross-encoder/ms-marco-MiniLM-L-6-v2
     encoder: sentence-transformers/msmarco-MiniLM-L-6-v3
-version: 0.9.1.dev0
+version: 0.10.1
diff --git a/tests/test_plaintext_to_jsonl.py b/tests/test_plaintext_to_jsonl.py
new file mode 100644
index 00000000..f9968ae4
--- /dev/null
+++ b/tests/test_plaintext_to_jsonl.py
@@ -0,0 +1,84 @@
+# Standard Packages
+import json
+from pathlib import Path
+
+# Internal Packages
+from khoj.processor.plaintext.plaintext_to_jsonl import PlaintextToJsonl
+
+
+def test_plaintext_file(tmp_path):
+    """Convert a plaintext file to jsonl."""
+    # Arrange
+    entry = """
+    Hi, I am a plaintext file and I have some plaintext words.
+    """
+    plaintextfile = create_file(tmp_path, entry)
+
+    filename = plaintextfile.stem
+
+    # Act
+    # Extract Entries from specified plaintext files (mapping is file content -> file)
+    entry_to_file_map = PlaintextToJsonl.extract_plaintext_entries(plaintext_files=[plaintextfile])
+
+    entries = PlaintextToJsonl.convert_plaintext_entries_to_maps(entry_to_file_map)
+
+    # Convert each entry.file to an absolute path string to make them JSON serializable
+    for extracted_entry in entries:
+        extracted_entry.file = str(Path(extracted_entry.file).absolute())
+
+    # Process Each Entry from All Plaintext Files
+    jsonl_string = PlaintextToJsonl.convert_entries_to_jsonl(entries)
+    jsonl_data = [json.loads(json_string) for json_string in jsonl_string.splitlines()]
+
+    # Assert
+    assert len(jsonl_data) == 1
+    # Ensure raw entry does not get a heading prefix prepended
+    assert not jsonl_data[0]["raw"].startswith("#")
+    # Ensure compiled entry has filename prepended as top level heading
+    assert jsonl_data[0]["compiled"] == f"{filename}\n{entry}"
+
+
+def test_get_plaintext_files(tmp_path):
+    """Ensure Plaintext files specified via input-filter, input-files extracted"""
+    # Arrange
+    # Include via input-filter globs
+    group1_file1 = create_file(tmp_path, filename="group1-file1.md")
+    group1_file2 = create_file(tmp_path, filename="group1-file2.md")
+
+    group2_file1 = create_file(tmp_path, filename="group2-file1.markdown")
+    group2_file2 = create_file(tmp_path, filename="group2-file2.markdown")
+    group2_file3 = create_file(tmp_path, filename="group2-file3.mbox")
+    group2_file4 = create_file(tmp_path, filename="group2-file4.html")
+    # Include via input-file field
+    file1 = create_file(tmp_path, filename="notes.txt")
+    # Include unsupported file types
+    create_file(tmp_path, filename="group2-unincluded.py")
+    create_file(tmp_path, filename="group2-unincluded.csv")
+    # Not included by any filter
+    create_file(tmp_path, filename="not-included-markdown.md")
+    create_file(tmp_path, filename="not-included-text.txt")
+
+    expected_files = sorted(
+        map(str, [group1_file1, group1_file2, group2_file1, group2_file2, file1, group2_file3, group2_file4])
+    )
+
+    # Setup input-files, input-filters
+    input_files = [tmp_path / "notes.txt"]
+    input_filter = [tmp_path / "group1*.md", tmp_path / "group2*.*"]
+
+    # Act
+    extracted_plaintext_files = PlaintextToJsonl.get_plaintext_files(input_files, input_filter)
+
+    # Assert
+    assert len(extracted_plaintext_files) == 7
+    assert set(extracted_plaintext_files) == set(expected_files)
+
+
+# Helper Functions
+def create_file(tmp_path: Path, entry=None, filename="test.md"):
+    """Create filename under tmp_path, write entry to it when given, and return its path"""
+    file_ = tmp_path / filename
+    file_.touch()
+    if entry:
+        file_.write_text(entry)
+    return file_