Add separate unit test for Github. Will only run if a PAT token is set

This commit is contained in:
Saba 2023-06-13 16:55:58 -07:00
parent 3a61919344
commit 751edfefe5
3 changed files with 29 additions and 1 deletions

View file

@ -328,6 +328,11 @@ Add your OpenAI API to Khoj by using either of the two options below:
1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj) 1. [Setup your OpenAI API key in Khoj](#set-your-openai-api-key-in-khoj)
2. Interact with them from the [Khoj Swagger docs](http://localhost:8000/docs)[^2] 2. Interact with them from the [Khoj Swagger docs](http://localhost:8000/docs)[^2]
### Use a Github Repository as a source
Note that this plugin currently indexes *only* Markdown files. It will ignore all other files in the repository. This is because Khoj, as it stands, is a semantic search engine. Eventually, we hope to get to a state where you can search any file in your repository and even have code explained.
1. Get a [personal access token (PAT)](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token) with the `repo` and `read:org` scopes, using the classic token flow.
2. Configure your settings to include the `repo_owner` and `repo_name`. The `repo_owner` will be the organization name if the repo is in an organization. The `repo_name` will be the name of the repository. Optionally, you can also supply a branch name (`repo_branch`). If no branch name is supplied, the `master` branch will be used.
## Performance ## Performance
@ -458,7 +463,7 @@ conda activate khoj
#### Before Creating PR #### Before Creating PR
1. Run Tests 1. Run Tests. If you get an error complaining about a missing `fast_tokenizer_file`, follow the solution [in this Github issue](https://github.com/UKPLab/sentence-transformers/issues/1659).
```shell ```shell
pytest pytest
``` ```

View file

@ -16,6 +16,7 @@ from khoj.utils.rawconfig import (
ConversationProcessorConfig, ConversationProcessorConfig,
ProcessorConfig, ProcessorConfig,
TextContentConfig, TextContentConfig,
GithubContentConfig,
ImageContentConfig, ImageContentConfig,
SearchConfig, SearchConfig,
TextSearchConfig, TextSearchConfig,
@ -89,6 +90,15 @@ def content_config(tmp_path_factory, search_config: SearchConfig):
) )
} }
content_config.github = GithubContentConfig(
pat_token=os.getenv("GITHUB_PAT_TOKEN"),
repo_name="lantern",
repo_owner="khoj-ai",
repo_branch="master",
compressed_jsonl=content_dir.joinpath("github.jsonl.gz"),
embeddings_file=content_dir.joinpath("github_embeddings.pt"),
)
filters = [DateFilter(), WordFilter(), FileFilter()] filters = [DateFilter(), WordFilter(), FileFilter()]
text_search.setup( text_search.setup(
JsonlToJsonl, content_config.plugins["plugin1"], search_config.asymmetric, regenerate=False, filters=filters JsonlToJsonl, content_config.plugins["plugin1"], search_config.asymmetric, regenerate=False, filters=filters

View file

@ -1,6 +1,7 @@
# System Packages # System Packages
import logging import logging
from pathlib import Path from pathlib import Path
import os
# External Packages # External Packages
import pytest import pytest
@ -10,6 +11,7 @@ from khoj.utils.state import model
from khoj.search_type import text_search from khoj.search_type import text_search
from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig from khoj.utils.rawconfig import ContentConfig, SearchConfig, TextContentConfig
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
# Test # Test
@ -170,3 +172,14 @@ def test_incremental_update(content_config: ContentConfig, search_config: Search
# Cleanup # Cleanup
# reset input_files in config to empty list # reset input_files in config to empty list
content_config.org.input_files = [] content_config.org.input_files = []
# ----------------------------------------------------------------------------------------------------
# Skip when the token is unset OR empty: an empty string still counts as "not set"
# (CI systems commonly expand an unavailable secret to ""), and running with a blank
# token would only fail on authentication rather than test anything useful.
@pytest.mark.skipif(not os.getenv("GITHUB_PAT_TOKEN"), reason="GITHUB_PAT_TOKEN not set")
def test_asymmetric_setup_github(content_config: ContentConfig, search_config: SearchConfig):
    """Check that asymmetric text search setup over the Github content source yields entries.

    Uses the `github` section of the `content_config` fixture and forces regeneration
    of the embeddings instead of reusing any previously generated ones.

    NOTE(review): presumably this reaches out to the Github API with the configured
    PAT token, so it needs network access -- confirm against GithubToJsonl.
    """
    # Act
    # Regenerate github embeddings during asymmetric setup
    github_model = text_search.setup(GithubToJsonl, content_config.github, search_config.asymmetric, regenerate=True)

    # Assert
    # More than one entry is expected to be indexed from the configured repository
    assert len(github_model.entries) > 1