From 6fdac2441652d1fb6bcb0cfda413a32ced7ba24f Mon Sep 17 00:00:00 2001 From: Debanjum Singh Solanky Date: Sun, 18 Jun 2023 01:20:05 -0700 Subject: [PATCH] Set page size to 100 to reduce requests required to GitHub API to 1/3 - Default is 30, so the number of paginated requests required to get all items (commits, files) will reduce by 67%. - No need to increase the page size for the get tree GitHub API request from `get_markdown_files`. The get tree GitHub API doesn't support pagination and returns up to 100K items in a single response. This should be way more than enough for our current use-cases. --- src/khoj/processor/github/github_to_jsonl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index d76f4979..789d8259 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -117,11 +117,12 @@ class GithubToJsonl(TextToJsonl): # Get commit messages from the repository using the Github API commits_url = f"{self.repo_url}/commits" headers = {"Authorization": f"token {self.config.pat_token}"} + params = {"per_page": 100} commits = [] while commits_url is not None: # Get the next page of commits - response = requests.get(commits_url, headers=headers) + response = requests.get(commits_url, headers=headers, params=params) raw_commits = response.json() # Wait for rate limit reset if needed