mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Use the GitHub REST API to index Markdown files in a GitHub repository
The Llama_Hub GitHub plugin is fairly limited. The GitHub REST API is well supported and can easily be extended to index commit messages, issues, discussions, PRs, etc.
This commit is contained in:
parent
9f00a366ab
commit
c29c141a7e
2 changed files with 32 additions and 27 deletions
|
@ -56,7 +56,7 @@ dependencies = [
|
|||
"aiohttp == 3.8.4",
|
||||
"langchain >= 0.0.187",
|
||||
"pypdf >= 3.9.0",
|
||||
"llama-hub==0.0.3",
|
||||
"requests >= 2.26.0",
|
||||
]
|
||||
dynamic = ["version"]
|
||||
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
# Standard Packages
|
||||
import logging
|
||||
from llama_index import download_loader
|
||||
|
||||
# External Packages
|
||||
import requests
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import GithubContentConfig
|
||||
from llama_hub.github_repo import GithubRepositoryReader, GithubClient
|
||||
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils import state
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
@ -14,18 +18,11 @@ logger = logging.getLogger(__name__)
|
|||
class GithubToJsonl(TextToJsonl):
|
||||
def __init__(self, config: GithubContentConfig):
|
||||
super().__init__(config)
|
||||
download_loader("GithubRepositoryReader")
|
||||
self.config = config
|
||||
self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
|
||||
|
||||
def process(self, previous_entries=None):
|
||||
try:
|
||||
self.initialize()
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
|
||||
)
|
||||
raise e
|
||||
|
||||
with timer("Download github repo", logger):
|
||||
with timer("Download markdown files from github repo", logger):
|
||||
try:
|
||||
docs = self.get_markdown_files()
|
||||
except Exception as e:
|
||||
|
@ -64,19 +61,27 @@ class GithubToJsonl(TextToJsonl):
|
|||
|
||||
return entries_with_ids
|
||||
|
||||
def initialize(self):
|
||||
logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
|
||||
github_client = GithubClient(self.config.pat_token)
|
||||
self.loader = GithubRepositoryReader(
|
||||
github_client,
|
||||
owner=self.config.repo_owner,
|
||||
repo=self.config.repo_name,
|
||||
filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
|
||||
verbose=state.verbose > 1,
|
||||
)
|
||||
|
||||
def get_markdown_files(self):
    """Download every Markdown file on the configured branch via the GitHub REST API.

    Lists the git tree of ``self.config.repo_branch`` recursively, then fetches
    the raw text of each ``.md`` blob from that same branch.

    Returns:
        A list of dicts, one per Markdown file, each with the keys
        ``"content"`` (the file text decoded as UTF-8) and ``"path"``
        (the file path relative to the repository root).

    Raises:
        requests.HTTPError: If GitHub answers any request with an error status
            (bad token, unknown repository or branch, rate limiting, ...).
    """
    # Local import keeps this method self-contained; quote() leaves "/" intact by default.
    from urllib.parse import quote

    # GitHub expects an auth scheme in front of the personal access token.
    auth_headers = {"Authorization": f"token {self.config.pat_token}"}
    # Separate header set for file downloads so the listing request is not affected.
    raw_headers = {**auth_headers, "Accept": "application/vnd.github.v3.raw"}

    # List the whole tree, not just the repository root, so nested files are found.
    repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
    response = requests.get(repo_content_url, headers=auth_headers, params={"recursive": "true"})
    # Fail loudly instead of hitting a KeyError on an error payload further down.
    response.raise_for_status()
    contents = response.json()

    if contents.get("truncated"):
        logger.warning(
            f"Github tree listing for {self.config.repo_owner}/{self.config.repo_name} was truncated. "
            "Some markdown files may not be indexed"
        )

    markdown_files = []
    for item in contents.get("tree", []):
        # Only markdown blobs are of interest; skip sub-trees and other file types.
        if item["type"] != "blob" or not item["path"].endswith(".md"):
            continue

        # Percent-encode the path so spaces, "#", "?" etc. do not corrupt the URL.
        encoded_path = quote(item["path"])
        file_content_url = f"{self.repo_url}/contents/{encoded_path}"
        # Pin the download to the branch the tree was listed from; without "ref"
        # the contents endpoint serves the repository's default branch.
        file_response = requests.get(
            file_content_url, headers=raw_headers, params={"ref": self.config.repo_branch}
        )
        # Do not index an error payload as if it were markdown text.
        file_response.raise_for_status()

        markdown_files.append({"content": file_response.content.decode("utf-8"), "path": item["path"]})

    return markdown_files
|
||||
|
||||
@staticmethod
|
||||
def extract_markdown_entries(markdown_files):
|
||||
|
@ -84,6 +89,6 @@ class GithubToJsonl(TextToJsonl):
|
|||
entry_to_file_map = []
|
||||
for doc in markdown_files:
|
||||
entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
|
||||
doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
|
||||
doc["content"], doc["path"], entries, entry_to_file_map
|
||||
)
|
||||
return entries, dict(entry_to_file_map)
|
||||
|
|
Loading…
Reference in a new issue