diff --git a/README.md b/README.md index 5e8feb45..96aebc6c 100644 --- a/README.md +++ b/README.md @@ -328,6 +328,11 @@ Add your OpenAI API to Khoj by using either of the two options below: 1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj) 2. Interact with them from the [Khoj Swagger docs](http://locahost:8000/docs)[^2] +### Use a GitHub Repository as a source +Note that this plugin currently indexes *only* Markdown files. It will ignore all other files in the repository. This is because Khoj, as it stands, is a semantic search engine. Eventually, we hope to get to a state where you can search across any file in your repository and even have Khoj explain code. + +1. Get a [personal access token (PAT)](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) with `repo` and `read:org` scopes, using the classic token flow. +2. Configure your settings to include the `repo_owner` and `repo_name`. The `repo_owner` will be the organization name if the repo is in an organization. The `repo_name` will be the name of the repository. Optionally, you can also supply a branch name. If no branch name is supplied, the `master` branch will be used. ## Performance @@ -458,7 +463,7 @@ conda activate khoj #### Before Creating PR -1. Run Tests +1. Run Tests. If you get an error complaining about a missing `fast_tokenizer_file`, follow the solution [in this GitHub issue](https://github.com/UKPLab/sentence-transformers/issues/1659). 
```shell pytest ``` diff --git a/tests/conftest.py b/tests/conftest.py index 6ef2394f..e061f279 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,7 @@ from khoj.utils.rawconfig import ( ConversationProcessorConfig, ProcessorConfig, TextContentConfig, + GithubContentConfig, ImageContentConfig, SearchConfig, TextSearchConfig, @@ -89,6 +90,15 @@ def content_config(tmp_path_factory, search_config: SearchConfig): ) } + content_config.github = GithubContentConfig( + pat_token=os.getenv("GITHUB_PAT_TOKEN"), + repo_name="lantern", + repo_owner="khoj-ai", + repo_branch="master", + compressed_jsonl=content_dir.joinpath("github.jsonl.gz"), + embeddings_file=content_dir.joinpath("github_embeddings.pt"), + ) + filters = [DateFilter(), WordFilter(), FileFilter()] text_search.setup( JsonlToJsonl, content_config.plugins["plugin1"], search_config.asymmetric, regenerate=False, filters=filters diff --git a/tests/test_text_search.py b/tests/test_text_search.py index 830feb9b..6eecac07 100644 --- a/tests/test_text_search.py +++ b/tests/test_text_search.py @@ -1,6 +1,7 @@ # System Packages import logging from pathlib import Path +import os # External Packages import pytest @@ -10,6 +11,7 @@ from khoj.utils.state import model from khoj.search_type import text_search from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl +from khoj.processor.github.github_to_jsonl import GithubToJsonl # Test @@ -170,3 +172,14 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search # Cleanup # reset input_files in config to empty list content_config.org.input_files = [] + + +# ---------------------------------------------------------------------------------------------------- +@pytest.mark.skipif(os.getenv("GITHUB_PAT_TOKEN") is None, reason="GITHUB_PAT_TOKEN not set") +def test_asymmetric_setup_github(content_config: ContentConfig, search_config: SearchConfig): + # Act + # 
Regenerate GitHub repository embeddings during asymmetric setup + github_model = text_search.setup(GithubToJsonl, content_config.github, search_config.asymmetric, regenerate=True) + + # Assert + assert len(github_model.entries) > 1