From 70e550250a45e36a8160d729a45e624e3de2e72e Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 10:59:54 -0700
Subject: [PATCH 1/9] Add an additional data source for issues from Github
 repositories + quality of life updates

- Use a requests session to reduce the overhead of setting up a new connection with the GitHub URL on each request
- Use the streaming feature of the REST API to reduce some of the memory footprint
---
 .../interface/web/assets/markdown-it.min.js   |   3 +
 src/khoj/interface/web/index.html             |  16 +++
 src/khoj/processor/github/github_to_jsonl.py  | 135 ++++++++++++++++--
 3 files changed, 144 insertions(+), 10 deletions(-)
diff --git a/src/khoj/interface/web/assets/markdown-it.min.js b/src/khoj/interface/web/assets/markdown-it.min.js
index 5b37e28e..8cac3add 100644
--- a/src/khoj/interface/web/assets/markdown-it.min.js
+++ b/src/khoj/interface/web/assets/markdown-it.min.js
@@ -3227,6 +3227,9 @@
         result += (token.nesting === -1 ? "</" : "<") + token.tag;
     // Encode attributes, e.g. `<img src="foo"`
         result += this.renderAttrs(token);
+
+    // Add a class name for all tokens, e.g. `<img class="foo"`
+        result += ' class="md-level"';
     // Add a slash for self-closing tags, e.g. `<img src="foo" /`
         if (token.nesting === 0 && options.xhtmlOut) {
       result += " /";
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index d5ccb8f9..a40cda37 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -61,6 +61,7 @@
             let org_files = data.filter((item) => item.additional.file.endsWith(".org"));
             let md_files = data.filter((item) => item.additional.file.endsWith(".md"));
             let pdf_files = data.filter((item) => item.additional.file.endsWith(".pdf"));
+            let issue_files = data.filter((item) => item.additional.file.includes("issues") && item.additional.file.includes("github.com"));
 
             let html = "";
             if (org_files.length > 0) {
@@ -71,6 +72,10 @@
                 html += render_markdown(query, md_files);
             }
 
+            if (issue_files.length > 0) {
+                html += render_markdown(query, issue_files);
+            }
+
             if (pdf_files.length > 0) {
                 html += render_pdf(query, pdf_files);
             }
@@ -370,6 +375,17 @@
             max-width: 100;
         }
 
+        a {
+            color: #3b82f6;
+            text-decoration: none;
+        }
+
+        img.md-level {
+            width: 20px;
+            height: 20px;
+            border-radius: 50%;
+        }
+
     </style>
 
 </html>
diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 36584fab..6e9dc29b 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -13,6 +13,7 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils.rawconfig import Entry
 
 
 logger = logging.getLogger(__name__)
@@ -22,6 +23,8 @@ class GithubToJsonl(TextToJsonl):
     def __init__(self, config: GithubContentConfig):
         super().__init__(config)
         self.config = config
+        self.session = requests.Session()
+        self.session.headers.update({"Authorization": f"token {self.config.pat_token}"})
 
     @staticmethod
     def wait_for_rate_limit_reset(response, func, *args, **kwargs):
@@ -53,6 +56,7 @@ class GithubToJsonl(TextToJsonl):
 
         logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo_shorthand}")
         logger.info(f"Found {len(org_files)} org files in github repo {repo_shorthand}")
+        current_entries = []
 
         with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger):
             current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
@@ -65,6 +69,12 @@ class GithubToJsonl(TextToJsonl):
         with timer(f"Extract commit messages from github repo {repo_shorthand}", logger):
             current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
 
+        with timer(f"Extract issues from github repo {repo_shorthand}", logger):
+            issue_entries = GithubToJsonl.convert_issue_entries_to_maps(
+                *GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
+            )
+            current_entries += issue_entries
+
         with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger):
             current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
 
@@ -102,7 +112,7 @@ class GithubToJsonl(TextToJsonl):
         contents = response.json()
 
         # Wait for rate limit reset if needed
-        result = self.wait_for_rate_limit_reset(response, self.get_files)
+        result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo)
         if result is not None:
             return result
 
@@ -130,35 +140,43 @@ class GithubToJsonl(TextToJsonl):
 
     def get_file_contents(self, file_url):
         # Get text from each markdown file
-        headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"}
-        response = requests.get(file_url, headers=headers)
+        headers = {"Accept": "application/vnd.github.v3.raw"}
+        response = self.session.get(file_url, headers=headers, stream=True)
 
         # Wait for rate limit reset if needed
         result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url)
         if result is not None:
             return result
 
-        return response.content.decode("utf-8")
+        content = ""
+        for chunk in response.iter_content(chunk_size=2048):
+            if chunk:
+                content += chunk.decode("utf-8")
+
+        return content
 
     def get_commits(self, repo_url: str) -> List[Dict]:
+        return self._get_commits(f"{repo_url}/commits")
+
+    def _get_commits(self, commits_url: str | None) -> List[Dict]:
         # Get commit messages from the repository using the Github API
-        commits_url = f"{repo_url}/commits"
-        headers = {"Authorization": f"token {self.config.pat_token}"}
         params = {"per_page": 100}
         commits = []
 
         while commits_url is not None:
             # Get the next page of commits
-            response = requests.get(commits_url, headers=headers, params=params)
-            raw_commits = response.json()
+            response = self.session.get(commits_url, params=params, stream=True)
+
+            # Read the streamed response into a JSON object
+            content = response.json()
 
             # Wait for rate limit reset if needed
-            result = self.wait_for_rate_limit_reset(response, self.get_commits)
+            result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url)
             if result is not None:
                 return result
 
             # Extract commit messages from the response
-            for commit in raw_commits:
+            for commit in content:
                 commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}]
 
             # Get the URL for the next page of commits, if any
@@ -166,6 +184,74 @@ class GithubToJsonl(TextToJsonl):
 
         return commits
 
+    def get_issues(self, repo_url: str) -> List[Dict]:
+        return self._get_issues(f"{repo_url}/issues")
+
+    def _get_issues(self, issues_url: str | None) -> List[Dict]:
+        issues = []
+        per_page = 30
+        params = {"per_page": per_page, "state": "all"}
+
+        while issues_url is not None:
+            # Get the next page of issues
+            response = self.session.get(issues_url, params=params, stream=True)
+            raw_issues = response.json()
+
+            # Wait for rate limit reset if needed
+            result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url)
+            if result is not None:
+                return result
+
+            for issue in raw_issues:
+                username = issue["user"]["login"]
+                user_url = f"[{username}]({issue['user']['html_url']})"
+                issue_content = {
+                    "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}",
+                    "path": issue["html_url"],
+                }
+                issue_content["created_at"] = {issue["created_at"]}
+                if issue["comments"] > 0:
+                    issue_content["comments"] = self.get_comments(issue["comments_url"])
+                issues += [issue_content]
+
+            issues_url = response.links.get("next", {}).get("url")
+
+        return issues
+
+    def get_comments(self, comments_url: str | None) -> List[Dict]:
+        # By default, the number of results per page is 30. We'll keep it as-is for now.
+        comments = []
+        per_page = 30
+        params = {"per_page": per_page}
+
+        while comments_url is not None:
+            # Get the next page of comments
+            response = self.session.get(comments_url, params=params, stream=True)
+            raw_comments = response.json()
+
+            # Wait for rate limit reset if needed
+            result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url)
+            if result is not None:
+                return result
+
+            for comment in raw_comments:
+                created_at = comment["created_at"].split("T")[0]
+                commenter = comment["user"]["login"]
+                commenter_url = comment["user"]["html_url"]
+                comment_url = comment["html_url"]
+                comment_url_link = f"[{created_at}]({comment_url})"
+                avatar_url = comment["user"]["avatar_url"]
+                avatar = f"![{commenter}]({avatar_url})"
+                comments += [
+                    {
+                        "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}"
+                    }
+                ]
+
+            comments_url = response.links.get("next", {}).get("url")
+
+        return comments
+
     def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
         entries: List[Entry] = []
         for commit in commits:
@@ -201,3 +287,32 @@ class GithubToJsonl(TextToJsonl):
                 doc["content"], doc["path"], entries, entry_to_file_map
             )
         return entries, dict(entry_to_file_map)
+
+    @staticmethod
+    def extract_github_issues(issues):
+        entries = []
+        entry_to_file_map = {}
+        for issue in issues:
+            content = issue["content"]
+            if "comments" in issue:
+                for comment in issue["comments"]:
+                    content += "\n\n" + comment["content"]
+            entries.append(content)
+            entry_to_file_map[content] = {"path": issue["path"]}
+        return entries, entry_to_file_map
+
+    @staticmethod
+    def convert_issue_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, Dict]) -> List[Entry]:
+        entries = []
+        for entry in parsed_entries:
+            entry_file_name = entry_to_file_map[entry]["path"]
+            entries.append(
+                Entry(
+                    compiled=entry,
+                    raw=entry,
+                    heading=entry.split("\n")[0],
+                    file=entry_file_name,
+                )
+            )
+
+        return entries

From fecf6700d2007f33afd5db6477b99ed6b9fdbf5e Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 11:27:18 -0700
Subject: [PATCH 2/9] Limit small image rendering to just the avatar images

---
 src/khoj/interface/web/assets/markdown-it.min.js | 11 +++++++++--
 src/khoj/interface/web/index.html                |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/khoj/interface/web/assets/markdown-it.min.js b/src/khoj/interface/web/assets/markdown-it.min.js
index 8cac3add..7339517f 100644
--- a/src/khoj/interface/web/assets/markdown-it.min.js
+++ b/src/khoj/interface/web/assets/markdown-it.min.js
@@ -3228,8 +3228,15 @@
     // Encode attributes, e.g. `<img src="foo"`
         result += this.renderAttrs(token);
 
-    // Add a class name for all tokens, e.g. `<img class="foo"`
-        result += ' class="md-level"';
+      if (token.tag === "img" && token.attrs) {
+          for (var i = 0; i < token.attrs.length; i++) {
+            if (token.attrs[i][0] === "src") {
+              if (token.attrs[i][1].includes("avatar")) {
+                result += ' class="md-avatar"';
+              }
+            }
+          }
+      }
     // Add a slash for self-closing tags, e.g. `<img src="foo" /`
         if (token.nesting === 0 && options.xhtmlOut) {
       result += " /";
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index a40cda37..3bdb7b12 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -380,7 +380,7 @@
             text-decoration: none;
         }
 
-        img.md-level {
+        img.md-avatar {
             width: 20px;
             height: 20px;
             border-radius: 50%;

From 601b7381351a3f6c955cb1e1fc84470a0878788c Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 11:27:47 -0700
Subject: [PATCH 3/9] Bonus: Rename all md files to markdown for cleanliness

---
 ... Xiu turning 4.md => Birthday Gift for Xiu turning 4.markdown} | 0
 .../{Hike Mt. Kilimanjaro.md => Hike Mt. Kilimanjaro.markdown}    | 0
 ... Pablo for Lunch.md => Meet Arun and Pablo for Lunch.markdown} | 0
 ...aneous Transactions.md => Miscellaneous Transactions.markdown} | 0
 tests/data/markdown/{Namita.md => Namita.markdown}                | 0
 .../data/markdown/{Patent 6631372.md => Patent 6631372.markdown}  | 0
 ...axes for 2022.md => Preparing to File Taxes for 2022.markdown} | 0
 ...fer Letter.md => Sign Wayne Enterprises Offer Letter.markdown} | 0
 ...Acme.Inc.md => Submit Resignation Letter to Acme.Inc.markdown} | 0
 .../markdown/{Visit Seregenti.md => Visit Seregenti.markdown}     | 0
 tests/data/markdown/{Xi Li.md => Xi Li.markdown}                  | 0
 .../{copy_what_you_like.md => copy_what_you_like.markdown}        | 0
 tests/data/markdown/{having_kids.md => having_kids.markdown}      | 0
 ..._y_combinator_started.md => how_y_combinator_started.markdown} | 0
 .../{jessica_livingston.md => jessica_livingston.markdown}        | 0
 .../markdown/{undergraduation.md => undergraduation.markdown}     | 0
 ...{what_i_did_this_summer.md => what_i_did_this_summer.markdown} | 0
 .../markdown/{what_i_worked_on.md => what_i_worked_on.markdown}   | 0
 tests/data/markdown/{why_yc.md => why_yc.markdown}                | 0
 19 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/data/markdown/{Birthday Gift for Xiu turning 4.md => Birthday Gift for Xiu turning 4.markdown} (100%)
 rename tests/data/markdown/{Hike Mt. Kilimanjaro.md => Hike Mt. Kilimanjaro.markdown} (100%)
 rename tests/data/markdown/{Meet Arun and Pablo for Lunch.md => Meet Arun and Pablo for Lunch.markdown} (100%)
 rename tests/data/markdown/{Miscellaneous Transactions.md => Miscellaneous Transactions.markdown} (100%)
 rename tests/data/markdown/{Namita.md => Namita.markdown} (100%)
 rename tests/data/markdown/{Patent 6631372.md => Patent 6631372.markdown} (100%)
 rename tests/data/markdown/{Preparing to File Taxes for 2022.md => Preparing to File Taxes for 2022.markdown} (100%)
 rename tests/data/markdown/{Sign Wayne Enterprises Offer Letter.md => Sign Wayne Enterprises Offer Letter.markdown} (100%)
 rename tests/data/markdown/{Submit Resignation Letter to Acme.Inc.md => Submit Resignation Letter to Acme.Inc.markdown} (100%)
 rename tests/data/markdown/{Visit Seregenti.md => Visit Seregenti.markdown} (100%)
 rename tests/data/markdown/{Xi Li.md => Xi Li.markdown} (100%)
 rename tests/data/markdown/{copy_what_you_like.md => copy_what_you_like.markdown} (100%)
 rename tests/data/markdown/{having_kids.md => having_kids.markdown} (100%)
 rename tests/data/markdown/{how_y_combinator_started.md => how_y_combinator_started.markdown} (100%)
 rename tests/data/markdown/{jessica_livingston.md => jessica_livingston.markdown} (100%)
 rename tests/data/markdown/{undergraduation.md => undergraduation.markdown} (100%)
 rename tests/data/markdown/{what_i_did_this_summer.md => what_i_did_this_summer.markdown} (100%)
 rename tests/data/markdown/{what_i_worked_on.md => what_i_worked_on.markdown} (100%)
 rename tests/data/markdown/{why_yc.md => why_yc.markdown} (100%)

diff --git a/tests/data/markdown/Birthday Gift for Xiu turning 4.md b/tests/data/markdown/Birthday Gift for Xiu turning 4.markdown
similarity index 100%
rename from tests/data/markdown/Birthday Gift for Xiu turning 4.md
rename to tests/data/markdown/Birthday Gift for Xiu turning 4.markdown
diff --git a/tests/data/markdown/Hike Mt. Kilimanjaro.md b/tests/data/markdown/Hike Mt. Kilimanjaro.markdown
similarity index 100%
rename from tests/data/markdown/Hike Mt. Kilimanjaro.md
rename to tests/data/markdown/Hike Mt. Kilimanjaro.markdown
diff --git a/tests/data/markdown/Meet Arun and Pablo for Lunch.md b/tests/data/markdown/Meet Arun and Pablo for Lunch.markdown
similarity index 100%
rename from tests/data/markdown/Meet Arun and Pablo for Lunch.md
rename to tests/data/markdown/Meet Arun and Pablo for Lunch.markdown
diff --git a/tests/data/markdown/Miscellaneous Transactions.md b/tests/data/markdown/Miscellaneous Transactions.markdown
similarity index 100%
rename from tests/data/markdown/Miscellaneous Transactions.md
rename to tests/data/markdown/Miscellaneous Transactions.markdown
diff --git a/tests/data/markdown/Namita.md b/tests/data/markdown/Namita.markdown
similarity index 100%
rename from tests/data/markdown/Namita.md
rename to tests/data/markdown/Namita.markdown
diff --git a/tests/data/markdown/Patent 6631372.md b/tests/data/markdown/Patent 6631372.markdown
similarity index 100%
rename from tests/data/markdown/Patent 6631372.md
rename to tests/data/markdown/Patent 6631372.markdown
diff --git a/tests/data/markdown/Preparing to File Taxes for 2022.md b/tests/data/markdown/Preparing to File Taxes for 2022.markdown
similarity index 100%
rename from tests/data/markdown/Preparing to File Taxes for 2022.md
rename to tests/data/markdown/Preparing to File Taxes for 2022.markdown
diff --git a/tests/data/markdown/Sign Wayne Enterprises Offer Letter.md b/tests/data/markdown/Sign Wayne Enterprises Offer Letter.markdown
similarity index 100%
rename from tests/data/markdown/Sign Wayne Enterprises Offer Letter.md
rename to tests/data/markdown/Sign Wayne Enterprises Offer Letter.markdown
diff --git a/tests/data/markdown/Submit Resignation Letter to Acme.Inc.md b/tests/data/markdown/Submit Resignation Letter to Acme.Inc.markdown
similarity index 100%
rename from tests/data/markdown/Submit Resignation Letter to Acme.Inc.md
rename to tests/data/markdown/Submit Resignation Letter to Acme.Inc.markdown
diff --git a/tests/data/markdown/Visit Seregenti.md b/tests/data/markdown/Visit Seregenti.markdown
similarity index 100%
rename from tests/data/markdown/Visit Seregenti.md
rename to tests/data/markdown/Visit Seregenti.markdown
diff --git a/tests/data/markdown/Xi Li.md b/tests/data/markdown/Xi Li.markdown
similarity index 100%
rename from tests/data/markdown/Xi Li.md
rename to tests/data/markdown/Xi Li.markdown
diff --git a/tests/data/markdown/copy_what_you_like.md b/tests/data/markdown/copy_what_you_like.markdown
similarity index 100%
rename from tests/data/markdown/copy_what_you_like.md
rename to tests/data/markdown/copy_what_you_like.markdown
diff --git a/tests/data/markdown/having_kids.md b/tests/data/markdown/having_kids.markdown
similarity index 100%
rename from tests/data/markdown/having_kids.md
rename to tests/data/markdown/having_kids.markdown
diff --git a/tests/data/markdown/how_y_combinator_started.md b/tests/data/markdown/how_y_combinator_started.markdown
similarity index 100%
rename from tests/data/markdown/how_y_combinator_started.md
rename to tests/data/markdown/how_y_combinator_started.markdown
diff --git a/tests/data/markdown/jessica_livingston.md b/tests/data/markdown/jessica_livingston.markdown
similarity index 100%
rename from tests/data/markdown/jessica_livingston.md
rename to tests/data/markdown/jessica_livingston.markdown
diff --git a/tests/data/markdown/undergraduation.md b/tests/data/markdown/undergraduation.markdown
similarity index 100%
rename from tests/data/markdown/undergraduation.md
rename to tests/data/markdown/undergraduation.markdown
diff --git a/tests/data/markdown/what_i_did_this_summer.md b/tests/data/markdown/what_i_did_this_summer.markdown
similarity index 100%
rename from tests/data/markdown/what_i_did_this_summer.md
rename to tests/data/markdown/what_i_did_this_summer.markdown
diff --git a/tests/data/markdown/what_i_worked_on.md b/tests/data/markdown/what_i_worked_on.markdown
similarity index 100%
rename from tests/data/markdown/what_i_worked_on.md
rename to tests/data/markdown/what_i_worked_on.markdown
diff --git a/tests/data/markdown/why_yc.md b/tests/data/markdown/why_yc.markdown
similarity index 100%
rename from tests/data/markdown/why_yc.md
rename to tests/data/markdown/why_yc.markdown

From ab7dabe74f34d8eb54fa1dcc87a0d298cbf80172 Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 11:44:30 -0700
Subject: [PATCH 4/9] Explicitly use Union type for function parameters for
 lint checks

---
 src/khoj/processor/github/github_to_jsonl.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 6e9dc29b..69f7033c 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,7 +1,7 @@
 # Standard Packages
 import logging
 import time
-from typing import Dict, List
+from typing import Dict, List, Union
 
 # External Packages
 import requests
@@ -158,7 +158,7 @@ class GithubToJsonl(TextToJsonl):
     def get_commits(self, repo_url: str) -> List[Dict]:
         return self._get_commits(f"{repo_url}/commits")
 
-    def _get_commits(self, commits_url: str | None) -> List[Dict]:
+    def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]:
         # Get commit messages from the repository using the Github API
         params = {"per_page": 100}
         commits = []
@@ -187,14 +187,14 @@ class GithubToJsonl(TextToJsonl):
     def get_issues(self, repo_url: str) -> List[Dict]:
         return self._get_issues(f"{repo_url}/issues")
 
-    def _get_issues(self, issues_url: str | None) -> List[Dict]:
+    def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]:
         issues = []
         per_page = 30
         params = {"per_page": per_page, "state": "all"}
 
         while issues_url is not None:
             # Get the next page of issues
-            response = self.session.get(issues_url, params=params, stream=True)
+            response = self.session.get(issues_url, params=params, stream=True)  # type: ignore
             raw_issues = response.json()
 
             # Wait for rate limit reset if needed
@@ -218,7 +218,7 @@ class GithubToJsonl(TextToJsonl):
 
         return issues
 
-    def get_comments(self, comments_url: str | None) -> List[Dict]:
+    def get_comments(self, comments_url: Union[str, None]) -> List[Dict]:
         # By default, the number of results per page is 30. We'll keep it as-is for now.
         comments = []
         per_page = 30

From e6053951f0ec98ff410343b661c8e3ef2a6a6a20 Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 11:53:47 -0700
Subject: [PATCH 5/9] In chat conftest fixtures, use *.markdown rather than
 *.md

---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index c5a000e3..dfb27b8b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -120,7 +120,7 @@ def md_content_config(tmp_path_factory):
     content_config = ContentConfig()
     content_config.markdown = TextContentConfig(
         input_files=None,
-        input_filter=["tests/data/markdown/*.md"],
+        input_filter=["tests/data/markdown/*.markdown"],
         compressed_jsonl=content_dir.joinpath("markdown.jsonl"),
         embeddings_file=content_dir.joinpath("markdown_embeddings.pt"),
     )

From b41c14b258a0a9d0c3eea766a5eeb21b45c054d3 Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 11:55:18 -0700
Subject: [PATCH 6/9] Use *.markdown in the khoj_docker.yml

---
 config/khoj_docker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/khoj_docker.yml b/config/khoj_docker.yml
index 6ffe59f5..d74a5e82 100644
--- a/config/khoj_docker.yml
+++ b/config/khoj_docker.yml
@@ -11,7 +11,7 @@ content-type:
 
   markdown:
     input-files: null
-    input-filter: ["/data/markdown/**/*.md"]
+    input-filter: ["/data/markdown/**/*.markdown"]
     compressed-jsonl: "/data/embeddings/markdown.jsonl.gz"
     embeddings-file: "/data/embeddings/markdown_embeddings.pt"
 

From 77672ac0aede45165fce6b0dada60b1bf48f8790 Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 14:14:25 -0700
Subject: [PATCH 7/9] Demarcate different results with a border box

- Add back support for searching by type Github
- Remove custom class name in markdown js file
---
 .../interface/web/assets/markdown-it.min.js   | 10 ---
 src/khoj/interface/web/index.html             | 72 +++++++++++--------
 src/khoj/processor/github/github_to_jsonl.py  |  3 +-
 src/khoj/routers/api.py                       | 14 ++++
 4 files changed, 60 insertions(+), 39 deletions(-)

diff --git a/src/khoj/interface/web/assets/markdown-it.min.js b/src/khoj/interface/web/assets/markdown-it.min.js
index 7339517f..5b37e28e 100644
--- a/src/khoj/interface/web/assets/markdown-it.min.js
+++ b/src/khoj/interface/web/assets/markdown-it.min.js
@@ -3227,16 +3227,6 @@
         result += (token.nesting === -1 ? "</" : "<") + token.tag;
     // Encode attributes, e.g. `<img src="foo"`
         result += this.renderAttrs(token);
-
-      if (token.tag === "img" && token.attrs) {
-          for (var i = 0; i < token.attrs.length; i++) {
-            if (token.attrs[i][0] === "src") {
-              if (token.attrs[i][1].includes("avatar")) {
-                result += ' class="md-avatar"';
-              }
-            }
-          }
-      }
     // Add a slash for self-closing tags, e.g. `<img src="foo" /`
         if (token.nesting === 0 && options.xhtmlOut) {
       result += " /";
diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index c9d69035..e6195336 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -36,11 +36,15 @@
         function render_markdown(query, data) {
             var md = window.markdownit();
             return data.map(function (item) {
+                let rendered = "";
                 if (item.additional.file.startsWith("http")) {
                     lines = item.entry.split("\n");
-                    return md.render(`${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`);
+                    rendered = md.render(`${lines[0]}\t[*](${item.additional.file})\n${lines.slice(1).join("\n")}`);
                 }
-                return `<div class="results-markdown">` + md.render(`${item.entry}`) + `</div>`;
+                else {
+                    rendered = md.render(`${item.entry}`);
+                }
+                return `<div class="results-markdown">` + rendered + `</div>`;
             }).join("\n");
         }
 
@@ -59,29 +63,22 @@
             }).join("\n");
         }
 
-        function render_mutliple(query, data, type) {
-            let org_files = data.filter((item) => item.additional.file.endsWith(".org"));
-            let md_files = data.filter((item) => item.additional.file.endsWith(".md"));
-            let pdf_files = data.filter((item) => item.additional.file.endsWith(".pdf"));
-            let issue_files = data.filter((item) => item.additional.file.includes("issues") && item.additional.file.includes("github.com"));
-
+        function render_multiple(query, data, type) {
             let html = "";
-            if (org_files.length > 0) {
-                html += render_org(query, org_files, type);
-            }
-
-            if (md_files.length > 0) {
-                html += render_markdown(query, md_files);
-            }
-
-            if (issue_files.length > 0) {
-                html += render_markdown(query, issue_files);
-            }
-
-            if (pdf_files.length > 0) {
-                html += render_pdf(query, pdf_files);
-            }
-
+            data.forEach(item => {
+                if (item.additional.file.endsWith(".org")) {
+                    html += render_org(query, [item], "org-");
+                } else if (
+                    item.additional.file.endsWith(".md") ||
+                    item.additional.file.endsWith(".markdown") ||
+                        (item.additional.file.includes("issues") && item.additional.file.includes("github.com"))
+                    )
+                {
+                    html += render_markdown(query, [item]);
+                } else if (item.additional.file.endsWith(".pdf")) {
+                    html += render_pdf(query, [item]);
+                }
+            });
             return html;
         }
 
@@ -100,11 +97,25 @@
             } else if (type === "pdf") {
                 results = render_pdf(query, data);
             } else if (type === "github" || type === "all") {
-                results = render_mutliple(query, data, type);
+                results = render_multiple(query, data, type);
             } else {
                 results = data.map((item) => `<div class="results-plugin">` + `<p>${item.entry}</p>` + `</div>`).join("\n")
             }
-            return `<div id="results-${type}">${results}</div>`;
+
+            // Any POST rendering goes here.
+
+            let renderedResults = document.createElement("div");
+            renderedResults.id = `results-${type}`;
+            renderedResults.innerHTML = results;
+
+            // For all elements that are of type img in the results html and have a src with 'avatar' in the URL, add the class 'avatar'
+            // This is used to make the avatar images round
+            let images = renderedResults.querySelectorAll("img[src*='avatar']");
+            for (let i = 0; i < images.length; i++) {
+                images[i].classList.add("avatar");
+            }
+
+            return renderedResults.outerHTML;
         }
 
         function search(rerank=false) {
@@ -277,7 +288,6 @@
             margin: 0px;
             background: #f8fafc;
             color: #475569;
-            text-align: center;
             font-family: roboto, karma, segoe ui, sans-serif;
             font-size: 20px;
             font-weight: 300;
@@ -388,12 +398,18 @@
             text-decoration: none;
         }
 
-        img.md-avatar {
+        img.avatar {
             width: 20px;
             height: 20px;
             border-radius: 50%;
         }
 
+        div.results-markdown,
+        div.results-org,
+        div.results-pdf {
+            border: black 1px solid;
+        }
+
     </style>
 
 </html>
diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index 69f7033c..e94749f1 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -1,6 +1,7 @@
 # Standard Packages
 import logging
 import time
+from datetime import datetime
 from typing import Dict, List, Union
 
 # External Packages
@@ -235,7 +236,7 @@ class GithubToJsonl(TextToJsonl):
                 return result
 
             for comment in raw_comments:
-                created_at = comment["created_at"].split("T")[0]
+                created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M")
                 commenter = comment["user"]["login"]
                 commenter_url = comment["user"]["html_url"]
                 comment_url = comment["html_url"]
diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py
index 9069418b..980ab1fb 100644
--- a/src/khoj/routers/api.py
+++ b/src/khoj/routers/api.py
@@ -216,6 +216,20 @@ async def search(
                 )
             ]
 
+        if (t == SearchType.Github or t == SearchType.All) and state.model.github_search:
+            # query github issues
+            search_futures += [
+                executor.submit(
+                    text_search.query,
+                    user_query,
+                    state.model.github_search,
+                    question_embedding=encoded_asymmetric_query,
+                    rank_results=r or False,
+                    score_threshold=score_threshold,
+                    dedupe=dedupe or True,
+                )
+            ]
+
         if (t == SearchType.Pdf or t == SearchType.All) and state.model.pdf_search:
             # query pdf files
             search_futures += [

From 65bf894302288819fa30b57e24349260080cdb8d Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 15:12:48 -0700
Subject: [PATCH 8/9] Interpret org files as a list and put them in separate
 divs. Update styling of search results to separate into cards

---
 src/khoj/interface/web/index.html | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html
index e6195336..7a0f896c 100644
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@@ -24,13 +24,12 @@
         }
 
         function render_org(query, data, classPrefix="") {
-            var orgCode = data.map(function (item) {
-                return `${item.entry}`
-            }).join("\n")
-            var orgParser = new Org.Parser();
-            var orgDocument = orgParser.parse(orgCode);
-            var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix });
-            return `<div class="results-org">` + orgHTMLDocument.toString() + `</div>`;
+            return data.map(function (item) {
+                var orgParser = new Org.Parser();
+                var orgDocument = orgParser.parse(item.entry);
+                var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix });
+                return `<div class="results-org">` + orgHTMLDocument.toString() + `</div>`;
+            }).join("\n");
         }
 
         function render_markdown(query, data) {
@@ -407,7 +406,12 @@
         div.results-markdown,
         div.results-org,
         div.results-pdf {
-            border: black 1px solid;
+            text-align: left;
+            box-shadow: 2px 2px 2px var(--primary-hover);
+            border-radius: 5px;
+            padding: 10px;
+            margin: 10px 0;
+            border: 1px solid rgb(229, 229, 229);
         }
 
     </style>

From b2dd946c6daec64f0cfae6867b5b192dcb1e79ee Mon Sep 17 00:00:00 2001
From: sabaimran <narmiabas@gmail.com>
Date: Thu, 29 Jun 2023 15:23:50 -0700
Subject: [PATCH 9/9] Rename issue-to-entry conversion method for accuracy

---
 src/khoj/processor/github/github_to_jsonl.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py
index e94749f1..70ea7bf2 100644
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@@ -71,7 +71,7 @@ class GithubToJsonl(TextToJsonl):
             current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
 
         with timer(f"Extract issues from github repo {repo_shorthand}", logger):
-            issue_entries = GithubToJsonl.convert_issue_entries_to_maps(
+            issue_entries = GithubToJsonl.convert_issues_to_entries(
                 *GithubToJsonl.extract_github_issues(self.get_issues(repo_url))
             )
             current_entries += issue_entries
@@ -303,10 +303,10 @@ class GithubToJsonl(TextToJsonl):
         return entries, entry_to_file_map
 
     @staticmethod
-    def convert_issue_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, Dict]) -> List[Entry]:
+    def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]:
         entries = []
         for entry in parsed_entries:
-            entry_file_name = entry_to_file_map[entry]["path"]
+            entry_file_name = entry_to_metadata_map[entry]["path"]
             entries.append(
                 Entry(
                     compiled=entry,