Set page size to 100 to cut the number of GitHub API requests to about 1/3

- The default page size is 30, so the number of paginated requests
  required to get all items (commits, files) drops by roughly 70%

- There is no need to increase the page size for the Get Tree GitHub API
  request made from `get_markdown_files`

  The Get Tree GitHub API doesn't support pagination and returns up to
  100K items in a single response. This should be way more than enough
  for our current use-cases
This commit is contained in:
Debanjum Singh Solanky 2023-06-18 01:20:05 -07:00
parent 87975e589a
commit 6fdac24416

View file

@ -117,11 +117,12 @@ class GithubToJsonl(TextToJsonl):
# Get commit messages from the repository using the Github API # Get commit messages from the repository using the Github API
commits_url = f"{self.repo_url}/commits" commits_url = f"{self.repo_url}/commits"
headers = {"Authorization": f"token {self.config.pat_token}"} headers = {"Authorization": f"token {self.config.pat_token}"}
params = {"per_page": 100}
commits = [] commits = []
while commits_url is not None: while commits_url is not None:
# Get the next page of commits # Get the next page of commits
response = requests.get(commits_url, headers=headers) response = requests.get(commits_url, headers=headers, params=params)
raw_commits = response.json() raw_commits = response.json()
# Wait for rate limit reset if needed # Wait for rate limit reset if needed