From 63ec84ad782cad2287f641895bcc16622444d8a5 Mon Sep 17 00:00:00 2001
From: Debanjum Singh Solanky
Date: Sat, 17 Jun 2023 04:23:01 -0700
Subject: [PATCH] Store Github URL of Markdown files on Github in file jsonl param

---
 src/khoj/processor/github/github_to_jsonl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index f862c951..d21f688b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -87,11 +87,16 @@ class GithubToJsonl(TextToJsonl):
         for item in contents["tree"]:
             # Find all markdown files in the repository
             if item["type"] == "blob" and item["path"].endswith(".md"):
+                # Create URL for each markdown file on Github
+                url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
+
                 # Get text from each markdown file
                 file_content_url = f'{self.repo_url}/contents/{item["path"]}'
                 headers["Accept"] = "application/vnd.github.v3.raw"
                 markdown_file_contents = requests.get(file_content_url, headers=headers).content.decode("utf-8")
-                markdown_files += [{"content": markdown_file_contents, "path": item["path"]}]
+
+                # Add markdown file contents and URL to list
+                markdown_files += [{"content": markdown_file_contents, "path": url_path}]

         return markdown_files