Use GitHub REST API to index Markdown files in a GitHub repository

The Llama_Hub GitHub plugin is fairly limited. The GitHub REST API is well supported and can easily be extended to index commit messages, issues, discussions, PRs, etc.
2025-02-18 23:14:19 +00:00 · 2023-06-17 01:39:57 -07:00 · 2023-06-17 01:39:57 -07:00 · c29c141a7e
commit c29c141a7e
parent 9f00a366ab
2 changed files with 32 additions and 27 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -56,7 +56,7 @@ dependencies = [
    "aiohttp == 3.8.4",
    "langchain >= 0.0.187",
    "pypdf >= 3.9.0",
-    "llama-hub==0.0.3",
+    "requests >= 2.26.0",
 ]
 dynamic = ["version"]

--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@ -1,12 +1,16 @@
+# Standard Packages
 import logging
-from llama_index import download_loader
+
+# External Packages
+import requests
+
+# Internal Packages
 from khoj.utils.helpers import timer
 from khoj.utils.rawconfig import GithubContentConfig
-from llama_hub.github_repo import GithubRepositoryReader, GithubClient
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
-from khoj.utils import state
+

 logger = logging.getLogger(__name__)

@ -14,18 +18,11 @@ logger = logging.getLogger(__name__)
 class GithubToJsonl(TextToJsonl):
    def __init__(self, config: GithubContentConfig):
        super().__init__(config)
-        download_loader("GithubRepositoryReader")
+        self.config = config
+        self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"

    def process(self, previous_entries=None):
-        try:
-            self.initialize()
-        except Exception as e:
-            logger.error(
-                f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
-            )
-            raise e
-
-        with timer("Download github repo", logger):
+        with timer("Download markdown files from github repo", logger):
            try:
                docs = self.get_markdown_files()
            except Exception as e:
@ -64,19 +61,27 @@ class GithubToJsonl(TextToJsonl):

        return entries_with_ids

-    def initialize(self):
-        logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
-        github_client = GithubClient(self.config.pat_token)
-        self.loader = GithubRepositoryReader(
-            github_client,
-            owner=self.config.repo_owner,
-            repo=self.config.repo_name,
-            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
-            verbose=state.verbose > 1,
-        )
-
    def get_markdown_files(self):
-        return self.loader.load_data(branch=self.config.repo_branch)
+        # set the url to get the contents of the repository
+        repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
+        # set the headers to include the authentication token
+        headers = {"Authorization": f"{self.config.pat_token}"}
+
+        # get the contents of the repository
+        response = requests.get(repo_content_url, headers=headers)
+        contents = response.json()
+
+        markdown_files = []
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Get text from each markdown file
+                file_content_url = f'{self.repo_url}/contents/{item["path"]}'
+                headers["Accept"] = "application/vnd.github.v3.raw"
+                markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
+                markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
+
+        return markdown_files

    @staticmethod
    def extract_markdown_entries(markdown_files):
@ -84,6 +89,6 @@ class GithubToJsonl(TextToJsonl):
        entry_to_file_map = []
        for doc in markdown_files:
            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
-                doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
+                doc["content"], doc["path"], entries, entry_to_file_map
            )
        return entries, dict(entry_to_file_map)