Use Github REST API and Index Commit Messages from Github Repository

- Migrate to Github REST API instead of Llama Hub to index Markdown Docs in Github Repository
- Index Commit Messages from Github Repository as well
Debanjum 2023-06-18 14:51:32 +05:30 committed by GitHub
commit e06be395f9
3 changed files with 111 additions and 31 deletions


@@ -56,7 +56,7 @@ dependencies = [
    "aiohttp == 3.8.4",
    "langchain >= 0.0.187",
    "pypdf >= 3.9.0",
-    "llama-hub==0.0.3",
+    "requests >= 2.26.0",
]

dynamic = ["version"]
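With llama-hub dropped, all Github access in this commit goes through plain requests calls against two REST endpoints: the Git Trees API for the file listing and the Commits API for commit history. A minimal sketch of both, assuming a hypothetical repository and personal access token:

import requests

# Hypothetical values for illustration only
owner, repo, branch, pat_token = "owner", "repo", "master", "ghp_..."
headers = {"Authorization": f"token {pat_token}"}

# List every file in the repository via the Git Trees API
tree = requests.get(
    f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}",
    headers=headers,
    params={"recursive": "true"},
).json()

# Page through the branch's commit history via the Commits API
commits = requests.get(
    f"https://api.github.com/repos/{owner}/{repo}/commits",
    headers=headers,
    params={"per_page": 100},
).json()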


@@ -34,6 +34,9 @@
function render_markdown(query, data) {
    var md = window.markdownit();
    return md.render(data.map(function (item) {
+        lines = item.entry.split("\n")
+        if (item.additional.file.startsWith("http"))
+            return `${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`
        return `${item.entry}`
    }).join("\n"));
}
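The new branch in render_markdown prepends a [*] link back to the source file for web-hosted entries. The same transformation, sketched in Python for illustration (the entry text and file URL are made up):

entry = "# Setup\nInstall the app..."  # made-up entry text
file = "https://github.com/owner/repo/blob/master/docs/setup.md"  # made-up URL

lines = entry.split("\n")
if file.startswith("http"):
    # First line gets a trailing [*] link to the source file on Github
    rendered = f"{lines[0]}\t[*]({file})\n" + "\n".join(lines[1:])
else:
    rendered = entry
# rendered == '# Setup\t[*](https://github.com/owner/repo/blob/master/docs/setup.md)\nInstall the app...'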


@@ -1,12 +1,18 @@
# Standard Packages
import logging
-from llama_index import download_loader
+import time
+from typing import Dict, List

+# External Packages
+import requests
+
# Internal Packages
from khoj.utils.helpers import timer
-from khoj.utils.rawconfig import GithubContentConfig
-from llama_hub.github_repo import GithubRepositoryReader, GithubClient
+from khoj.utils.rawconfig import Entry, GithubContentConfig
from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils import state

logger = logging.getLogger(__name__)
@@ -14,31 +20,37 @@ logger = logging.getLogger(__name__)
class GithubToJsonl(TextToJsonl):
    def __init__(self, config: GithubContentConfig):
        super().__init__(config)
-        download_loader("GithubRepositoryReader")
        self.config = config
+        self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"
+
+    @staticmethod
+    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
+        if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
+            wait_time = int(response.headers.get("X-RateLimit-Reset")) - int(time.time())
+            logger.info(f"Github Rate limit reached. Waiting for {wait_time} seconds")
+            time.sleep(wait_time)
+            return func(*args, **kwargs)
+        else:
+            return
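Github signals quota exhaustion through the X-RateLimit-Remaining and X-RateLimit-Reset response headers, where the reset value is a Unix timestamp; the helper above sleeps for the difference and then retries by re-invoking the original function. A minimal self-contained sketch of that arithmetic, using a stubbed response object (all names and values here are illustrative):

import time

class StubResponse:
    # Stand-in for requests.Response; headers say the quota resets in ~2 seconds
    status_code = 403
    headers = {"X-RateLimit-Remaining": "0", "X-RateLimit-Reset": str(int(time.time()) + 2)}

def fetch():
    return "retried result"  # placeholder for the original API call

response = StubResponse()
if response.status_code != 200 and response.headers.get("X-RateLimit-Remaining") == "0":
    # Seconds until the rate limit window resets
    wait_time = int(response.headers["X-RateLimit-Reset"]) - int(time.time())
    time.sleep(max(wait_time, 0))
    result = fetch()  # retry the original call once the window reopens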

    def process(self, previous_entries=None):
        try:
            self.initialize()
        except Exception as e:
            logger.error(
                f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
            )
            raise e

-        with timer("Download github repo", logger):
+        with timer("Download markdown files from github repo", logger):
            try:
                docs = self.get_markdown_files()
            except Exception as e:
-                logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}")
+                logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}")
                raise e

-        logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}")
+        logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}")

        with timer("Extract markdown entries from github repo", logger):
            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
                *GithubToJsonl.extract_markdown_entries(docs)
            )

+        with timer("Extract commit messages from github repo", logger):
+            current_entries += self.convert_commits_to_entries(self.get_commits())
+
        with timer("Split entries by max token size supported by model", logger):
            current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
@@ -51,7 +63,7 @@ class GithubToJsonl(TextToJsonl):
            current_entries, previous_entries, key="compiled", logger=logger
        )

-        with timer("Write markdown entries to JSONL file", logger):
+        with timer("Write github entries to JSONL file", logger):
            # Process Each Entry from All Notes Files
            entries = list(map(lambda entry: entry[1], entries_with_ids))
            jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
@@ -64,19 +76,84 @@ class GithubToJsonl(TextToJsonl):
        return entries_with_ids

    def initialize(self):
        logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
-        github_client = GithubClient(self.config.pat_token)
-        self.loader = GithubRepositoryReader(
-            github_client,
-            owner=self.config.repo_owner,
-            repo=self.config.repo_name,
-            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
-            verbose=state.verbose > 1,
-        )

    def get_markdown_files(self):
-        return self.loader.load_data(branch=self.config.repo_branch)
+        # Get the contents of the repository
+        repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
+        headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"recursive": "true"}
+        response = requests.get(repo_content_url, headers=headers, params=params)
+        contents = response.json()
+
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_markdown_files)
+        if result is not None:
+            return result
+
+        # Extract markdown files from the repository
+        markdown_files = []
+        for item in contents["tree"]:
+            # Find all markdown files in the repository
+            if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+
+        return markdown_files
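get_markdown_files consumes the Git Trees API response, whose tree array holds one object per path in the repository; only blob entries whose path ends in .md are kept. A trimmed, hand-written example of the response shape (shas and URLs made up), with the same filter applied:

contents = {
    "sha": "abc123",     # made-up values for illustration
    "truncated": False,  # Github sets this to True when the recursive listing is cut off
    "tree": [
        {"path": "docs/setup.md", "type": "blob",
         "url": "https://api.github.com/repos/owner/repo/git/blobs/abc123"},
        {"path": "src", "type": "tree",
         "url": "https://api.github.com/repos/owner/repo/git/trees/def456"},
    ],
}

# Same filter as get_markdown_files: markdown blobs only
markdown_blobs = [item for item in contents["tree"] if item["type"] == "blob" and item["path"].endswith(".md")]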
+
+    def get_file_contents(self, file_url):
+        # Get text from each markdown file
+        headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
+        response = requests.get(file_url, headers=headers)
+
+        # Wait for rate limit reset if needed
+        result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
+        if result is not None:
+            return result
+
+        return response.content.decode("utf-8")
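The Accept: application/vnd.github.v3.raw header makes the blob endpoint return the file bytes directly. Without it, Github returns JSON with base64-encoded content, which would need an explicit decode; a sketch of that fallback path, with placeholder URL and token:

import base64
import requests

file_url = "https://api.github.com/repos/owner/repo/git/blobs/abc123"  # placeholder
pat_token = "ghp_..."  # placeholder

# Without the raw Accept header, the blob endpoint returns JSON metadata
blob = requests.get(file_url, headers={"Authorization": f"token {pat_token}"}).json()
text = base64.b64decode(blob["content"]).decode("utf-8")  # blob["encoding"] is "base64"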
+
+    def get_commits(self) -> List[Dict]:
+        # Get commit messages from the repository using the Github API
+        commits_url = f"{self.repo_url}/commits"
+        headers = {"Authorization": f"token {self.config.pat_token}"}
+        params = {"per_page": 100}
+        commits = []
+
+        while commits_url is not None:
+            # Get the next page of commits
+            response = requests.get(commits_url, headers=headers, params=params)
+            raw_commits = response.json()
+
+            # Wait for rate limit reset if needed
+            result = self.wait_for_rate_limit_reset(response, self.get_commits)
+            if result is not None:
+                return result
+
+            # Extract commit messages from the response
+            for commit in raw_commits:
+                commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
+
+            # Get the URL for the next page of commits, if any
+            commits_url = response.links.get("next", {}).get("url")
+
+        return commits
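requests parses the HTTP Link header into response.links, so pagination reduces to following the next URL until Github stops sending one. An illustrative shape of that dict (URLs made up):

# response.links as parsed by requests from the Link response header
links = {
    "next": {"url": "https://api.github.com/repositories/123/commits?page=2", "rel": "next"},
    "last": {"url": "https://api.github.com/repositories/123/commits?page=9", "rel": "last"},
}

next_url = links.get("next", {}).get("url")  # None on the last page, which ends the while loop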
+
+    def convert_commits_to_entries(self, commits) -> List[Entry]:
+        entries: List[Entry] = []
+        for commit in commits:
+            compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}'
+            entries.append(
+                Entry(
+                    compiled=compiled,
+                    raw=f'### {commit["content"]}',
+                    heading=commit["content"].split("\n")[0],
+                    file=commit["path"],
+                )
+            )
+
+        return entries
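Each commit becomes one Entry whose heading is the commit's subject line and whose file field points at the commit's html_url, so search results can link back to the commit on Github. A hand-worked example (message and URL made up):

commit = {
    "content": "Fix rate limit handling\n\nRetry after the X-RateLimit-Reset window",  # made up
    "path": "https://github.com/owner/repo/commit/abc123",
}

# Fields of the resulting Entry
compiled = f'Commit message from owner/repo:\n{commit["content"]}'
raw = f'### {commit["content"]}'
heading = commit["content"].split("\n")[0]  # "Fix rate limit handling"
file = commit["path"]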

    @staticmethod
    def extract_markdown_entries(markdown_files):
@@ -84,6 +161,6 @@ class GithubToJsonl(TextToJsonl):
        entry_to_file_map = []
        for doc in markdown_files:
            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
-                doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
+                doc["content"], doc["path"], entries, entry_to_file_map
            )
        return entries, dict(entry_to_file_map)
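End to end, the processor is driven through its process method. A usage sketch, assuming GithubToJsonl lives in khoj.processor.github.github_to_jsonl and that GithubContentConfig accepts these fields (any other required fields are elided):

from khoj.utils.rawconfig import GithubContentConfig
from khoj.processor.github.github_to_jsonl import GithubToJsonl  # assumed module path

config = GithubContentConfig(
    pat_token="ghp_...",  # placeholder personal access token
    repo_owner="owner",
    repo_name="repo",
    repo_branch="master",
)

# previous_entries=None forces a full reindex; pass prior entries for incremental updates
entries_with_ids = GithubToJsonl(config).process()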