From 70e550250a45e36a8160d729a45e624e3de2e72e Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 10:59:54 -0700 Subject: [PATCH 1/9] Add an additional data source for issues from Github repositories + quality of life updates - Use a request session to reduce the overhead of setting up a new connection with the Github URL each request - Use the streaming feature for the REST api to reduce some of the memory footprint --- .../interface/web/assets/markdown-it.min.js | 3 + src/khoj/interface/web/index.html | 16 +++ src/khoj/processor/github/github_to_jsonl.py | 135 ++++++++++++++++-- 3 files changed, 144 insertions(+), 10 deletions(-) diff --git a/src/khoj/interface/web/assets/markdown-it.min.js b/src/khoj/interface/web/assets/markdown-it.min.js index 5b37e28e..8cac3add 100644 --- a/src/khoj/interface/web/assets/markdown-it.min.js +++ b/src/khoj/interface/web/assets/markdown-it.min.js @@ -3227,6 +3227,9 @@ result += (token.nesting === -1 ? " item.additional.file.endsWith(".org")); let md_files = data.filter((item) => item.additional.file.endsWith(".md")); let pdf_files = data.filter((item) => item.additional.file.endsWith(".pdf")); + let issue_files = data.filter((item) => item.additional.file.includes("issues") && item.additional.file.includes("github.com")); let html = ""; if (org_files.length > 0) { @@ -71,6 +72,10 @@ html += render_markdown(query, md_files); } + if (issue_files.length > 0) { + html += render_markdown(query, issue_files); + } + if (pdf_files.length > 0) { html += render_pdf(query, pdf_files); } @@ -370,6 +375,17 @@ max-width: 100; } + a { + color: #3b82f6; + text-decoration: none; + } + + img.md-level { + width: 20px; + height: 20px; + border-radius: 50%; + } + diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 36584fab..6e9dc29b 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -13,6 +13,7 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data +from khoj.utils.rawconfig import Entry logger = logging.getLogger(__name__) @@ -22,6 +23,8 @@ class GithubToJsonl(TextToJsonl): def __init__(self, config: GithubContentConfig): super().__init__(config) self.config = config + self.session = requests.Session() + self.session.headers.update({"Authorization": f"token {self.config.pat_token}"}) @staticmethod def wait_for_rate_limit_reset(response, func, *args, **kwargs): @@ -53,6 +56,7 @@ class GithubToJsonl(TextToJsonl): logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo_shorthand}") logger.info(f"Found {len(org_files)} org files in github repo {repo_shorthand}") + current_entries = [] with timer(f"Extract markdown entries from github repo {repo_shorthand}", logger): current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( @@ -65,6 +69,12 @@ class GithubToJsonl(TextToJsonl): with timer(f"Extract commit messages from github repo {repo_shorthand}", logger): current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo) + with timer(f"Extract issues from github repo {repo_shorthand}", logger): + issue_entries = GithubToJsonl.convert_issue_entries_to_maps( + *GithubToJsonl.extract_github_issues(self.get_issues(repo_url)) + ) + current_entries += issue_entries + with timer(f"Split entries by max token size supported by model {repo_shorthand}", logger): current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256) @@ -102,7 +112,7 @@ class GithubToJsonl(TextToJsonl): contents = response.json() # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_files) + result = self.wait_for_rate_limit_reset(response, self.get_files, repo_url, repo) if result is not None: return result @@ -130,35 +140,43 @@ class GithubToJsonl(TextToJsonl): def get_file_contents(self, file_url): # Get text from each markdown file - headers = {"Authorization": f"token {self.config.pat_token}", "Accept": "application/vnd.github.v3.raw"} - response = requests.get(file_url, headers=headers) + headers = {"Accept": "application/vnd.github.v3.raw"} + response = self.session.get(file_url, headers=headers, stream=True) # Wait for rate limit reset if needed result = self.wait_for_rate_limit_reset(response, self.get_file_contents, file_url) if result is not None: return result - return response.content.decode("utf-8") + content = "" + for chunk in response.iter_content(chunk_size=2048): + if chunk: + content += chunk.decode("utf-8") + + return content def get_commits(self, repo_url: str) -> List[Dict]: + return self._get_commits(f"{repo_url}/commits") + + def _get_commits(self, commits_url: str | None) -> List[Dict]: # Get commit messages from the repository using the Github API - commits_url = f"{repo_url}/commits" - headers = {"Authorization": f"token {self.config.pat_token}"} params = {"per_page": 100} commits = [] while commits_url is not None: # Get the next page of commits - response = requests.get(commits_url, headers=headers, params=params) - raw_commits = response.json() + response = self.session.get(commits_url, params=params, stream=True) + + # Read the streamed response into a JSON object + content = response.json() # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_commits) + result = self.wait_for_rate_limit_reset(response, self._get_commits, commits_url) if result is not None: return result # Extract commit messages from the response - for commit in raw_commits: + for commit in content: commits += [{"content": commit["commit"]["message"], "path": commit["html_url"]}] # Get the URL for the next page of commits, if any @@ -166,6 +184,74 @@ class GithubToJsonl(TextToJsonl): return commits + def get_issues(self, repo_url: str) -> List[Dict]: + return self._get_issues(f"{repo_url}/issues") + + def _get_issues(self, issues_url: str | None) -> List[Dict]: + issues = [] + per_page = 30 + params = {"per_page": per_page, "state": "all"} + + while issues_url is not None: + # Get the next page of issues + response = self.session.get(issues_url, params=params, stream=True) + raw_issues = response.json() + + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self._get_issues, issues_url) + if result is not None: + return result + + for issue in raw_issues: + username = issue["user"]["login"] + user_url = f"[{username}]({issue['user']['html_url']})" + issue_content = { + "content": f"## [Issue {issue['number']}]({issue['html_url']}) {issue['title']}\nby {user_url}\n\n{issue['body']}", + "path": issue["html_url"], + } + issue_content["created_at"] = {issue["created_at"]} + if issue["comments"] > 0: + issue_content["comments"] = self.get_comments(issue["comments_url"]) + issues += [issue_content] + + issues_url = response.links.get("next", {}).get("url") + + return issues + + def get_comments(self, comments_url: str | None) -> List[Dict]: + # By default, the number of results per page is 30. We'll keep it as-is for now. + comments = [] + per_page = 30 + params = {"per_page": per_page} + + while comments_url is not None: + # Get the next page of comments + response = self.session.get(comments_url, params=params, stream=True) + raw_comments = response.json() + + # Wait for rate limit reset if needed + result = self.wait_for_rate_limit_reset(response, self.get_comments, comments_url) + if result is not None: + return result + + for comment in raw_comments: + created_at = comment["created_at"].split("T")[0] + commenter = comment["user"]["login"] + commenter_url = comment["user"]["html_url"] + comment_url = comment["html_url"] + comment_url_link = f"[{created_at}]({comment_url})" + avatar_url = comment["user"]["avatar_url"] + avatar = f"![{commenter}]({avatar_url})" + comments += [ + { + "content": f"### {avatar} [{commenter}]({commenter_url}) - ({comment_url_link})\n\n{comment['body']}" + } + ] + + comments_url = response.links.get("next", {}).get("url") + + return comments + def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]: entries: List[Entry] = [] for commit in commits: @@ -201,3 +287,32 @@ class GithubToJsonl(TextToJsonl): doc["content"], doc["path"], entries, entry_to_file_map ) return entries, dict(entry_to_file_map) + + @staticmethod + def extract_github_issues(issues): + entries = [] + entry_to_file_map = {} + for issue in issues: + content = issue["content"] + if "comments" in issue: + for comment in issue["comments"]: + content += "\n\n" + comment["content"] + entries.append(content) + entry_to_file_map[content] = {"path": issue["path"]} + return entries, entry_to_file_map + + @staticmethod + def convert_issue_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, Dict]) -> List[Entry]: + entries = [] + for entry in parsed_entries: + entry_file_name = entry_to_file_map[entry]["path"] + entries.append( + Entry( + compiled=entry, + raw=entry, + heading=entry.split("\n")[0], + file=entry_file_name, + ) + ) + + return entries From fecf6700d2007f33afd5db6477b99ed6b9fdbf5e Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 11:27:18 -0700 Subject: [PATCH 2/9] Limit small image rendering to just the avatar images --- src/khoj/interface/web/assets/markdown-it.min.js | 11 +++++++++-- src/khoj/interface/web/index.html | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/khoj/interface/web/assets/markdown-it.min.js b/src/khoj/interface/web/assets/markdown-it.min.js index 8cac3add..7339517f 100644 --- a/src/khoj/interface/web/assets/markdown-it.min.js +++ b/src/khoj/interface/web/assets/markdown-it.min.js @@ -3228,8 +3228,15 @@ // Encode attributes, e.g. ` Date: Thu, 29 Jun 2023 11:27:47 -0700 Subject: [PATCH 3/9] Bonus: Rename all md files to markdown for cleanliness --- ... Xiu turning 4.md => Birthday Gift for Xiu turning 4.markdown} | 0 .../{Hike Mt. Kilimanjaro.md => Hike Mt. Kilimanjaro.markdown} | 0 ... Pablo for Lunch.md => Meet Arun and Pablo for Lunch.markdown} | 0 ...aneous Transactions.md => Miscellaneous Transactions.markdown} | 0 tests/data/markdown/{Namita.md => Namita.markdown} | 0 .../data/markdown/{Patent 6631372.md => Patent 6631372.markdown} | 0 ...axes for 2022.md => Preparing to File Taxes for 2022.markdown} | 0 ...fer Letter.md => Sign Wayne Enterprises Offer Letter.markdown} | 0 ...Acme.Inc.md => Submit Resignation Letter to Acme.Inc.markdown} | 0 .../markdown/{Visit Seregenti.md => Visit Seregenti.markdown} | 0 tests/data/markdown/{Xi Li.md => Xi Li.markdown} | 0 .../{copy_what_you_like.md => copy_what_you_like.markdown} | 0 tests/data/markdown/{having_kids.md => having_kids.markdown} | 0 ..._y_combinator_started.md => how_y_combinator_started.markdown} | 0 .../{jessica_livingston.md => jessica_livingston.markdown} | 0 .../markdown/{undergraduation.md => undergraduation.markdown} | 0 ...{what_i_did_this_summer.md => what_i_did_this_summer.markdown} | 0 .../markdown/{what_i_worked_on.md => what_i_worked_on.markdown} | 0 tests/data/markdown/{why_yc.md => why_yc.markdown} | 0 19 files changed, 0 insertions(+), 0 deletions(-) rename tests/data/markdown/{Birthday Gift for Xiu turning 4.md => Birthday Gift for Xiu turning 4.markdown} (100%) rename tests/data/markdown/{Hike Mt. Kilimanjaro.md => Hike Mt. Kilimanjaro.markdown} (100%) rename tests/data/markdown/{Meet Arun and Pablo for Lunch.md => Meet Arun and Pablo for Lunch.markdown} (100%) rename tests/data/markdown/{Miscellaneous Transactions.md => Miscellaneous Transactions.markdown} (100%) rename tests/data/markdown/{Namita.md => Namita.markdown} (100%) rename tests/data/markdown/{Patent 6631372.md => Patent 6631372.markdown} (100%) rename tests/data/markdown/{Preparing to File Taxes for 2022.md => Preparing to File Taxes for 2022.markdown} (100%) rename tests/data/markdown/{Sign Wayne Enterprises Offer Letter.md => Sign Wayne Enterprises Offer Letter.markdown} (100%) rename tests/data/markdown/{Submit Resignation Letter to Acme.Inc.md => Submit Resignation Letter to Acme.Inc.markdown} (100%) rename tests/data/markdown/{Visit Seregenti.md => Visit Seregenti.markdown} (100%) rename tests/data/markdown/{Xi Li.md => Xi Li.markdown} (100%) rename tests/data/markdown/{copy_what_you_like.md => copy_what_you_like.markdown} (100%) rename tests/data/markdown/{having_kids.md => having_kids.markdown} (100%) rename tests/data/markdown/{how_y_combinator_started.md => how_y_combinator_started.markdown} (100%) rename tests/data/markdown/{jessica_livingston.md => jessica_livingston.markdown} (100%) rename tests/data/markdown/{undergraduation.md => undergraduation.markdown} (100%) rename tests/data/markdown/{what_i_did_this_summer.md => what_i_did_this_summer.markdown} (100%) rename tests/data/markdown/{what_i_worked_on.md => what_i_worked_on.markdown} (100%) rename tests/data/markdown/{why_yc.md => why_yc.markdown} (100%) diff --git a/tests/data/markdown/Birthday Gift for Xiu turning 4.md b/tests/data/markdown/Birthday Gift for Xiu turning 4.markdown similarity index 100% rename from tests/data/markdown/Birthday Gift for Xiu turning 4.md rename to tests/data/markdown/Birthday Gift for Xiu turning 4.markdown diff --git a/tests/data/markdown/Hike Mt. Kilimanjaro.md b/tests/data/markdown/Hike Mt. Kilimanjaro.markdown similarity index 100% rename from tests/data/markdown/Hike Mt. Kilimanjaro.md rename to tests/data/markdown/Hike Mt. Kilimanjaro.markdown diff --git a/tests/data/markdown/Meet Arun and Pablo for Lunch.md b/tests/data/markdown/Meet Arun and Pablo for Lunch.markdown similarity index 100% rename from tests/data/markdown/Meet Arun and Pablo for Lunch.md rename to tests/data/markdown/Meet Arun and Pablo for Lunch.markdown diff --git a/tests/data/markdown/Miscellaneous Transactions.md b/tests/data/markdown/Miscellaneous Transactions.markdown similarity index 100% rename from tests/data/markdown/Miscellaneous Transactions.md rename to tests/data/markdown/Miscellaneous Transactions.markdown diff --git a/tests/data/markdown/Namita.md b/tests/data/markdown/Namita.markdown similarity index 100% rename from tests/data/markdown/Namita.md rename to tests/data/markdown/Namita.markdown diff --git a/tests/data/markdown/Patent 6631372.md b/tests/data/markdown/Patent 6631372.markdown similarity index 100% rename from tests/data/markdown/Patent 6631372.md rename to tests/data/markdown/Patent 6631372.markdown diff --git a/tests/data/markdown/Preparing to File Taxes for 2022.md b/tests/data/markdown/Preparing to File Taxes for 2022.markdown similarity index 100% rename from tests/data/markdown/Preparing to File Taxes for 2022.md rename to tests/data/markdown/Preparing to File Taxes for 2022.markdown diff --git a/tests/data/markdown/Sign Wayne Enterprises Offer Letter.md b/tests/data/markdown/Sign Wayne Enterprises Offer Letter.markdown similarity index 100% rename from tests/data/markdown/Sign Wayne Enterprises Offer Letter.md rename to tests/data/markdown/Sign Wayne Enterprises Offer Letter.markdown diff --git a/tests/data/markdown/Submit Resignation Letter to Acme.Inc.md b/tests/data/markdown/Submit Resignation Letter to Acme.Inc.markdown similarity index 100% rename from tests/data/markdown/Submit Resignation Letter to Acme.Inc.md rename to tests/data/markdown/Submit Resignation Letter to Acme.Inc.markdown diff --git a/tests/data/markdown/Visit Seregenti.md b/tests/data/markdown/Visit Seregenti.markdown similarity index 100% rename from tests/data/markdown/Visit Seregenti.md rename to tests/data/markdown/Visit Seregenti.markdown diff --git a/tests/data/markdown/Xi Li.md b/tests/data/markdown/Xi Li.markdown similarity index 100% rename from tests/data/markdown/Xi Li.md rename to tests/data/markdown/Xi Li.markdown diff --git a/tests/data/markdown/copy_what_you_like.md b/tests/data/markdown/copy_what_you_like.markdown similarity index 100% rename from tests/data/markdown/copy_what_you_like.md rename to tests/data/markdown/copy_what_you_like.markdown diff --git a/tests/data/markdown/having_kids.md b/tests/data/markdown/having_kids.markdown similarity index 100% rename from tests/data/markdown/having_kids.md rename to tests/data/markdown/having_kids.markdown diff --git a/tests/data/markdown/how_y_combinator_started.md b/tests/data/markdown/how_y_combinator_started.markdown similarity index 100% rename from tests/data/markdown/how_y_combinator_started.md rename to tests/data/markdown/how_y_combinator_started.markdown diff --git a/tests/data/markdown/jessica_livingston.md b/tests/data/markdown/jessica_livingston.markdown similarity index 100% rename from tests/data/markdown/jessica_livingston.md rename to tests/data/markdown/jessica_livingston.markdown diff --git a/tests/data/markdown/undergraduation.md b/tests/data/markdown/undergraduation.markdown similarity index 100% rename from tests/data/markdown/undergraduation.md rename to tests/data/markdown/undergraduation.markdown diff --git a/tests/data/markdown/what_i_did_this_summer.md b/tests/data/markdown/what_i_did_this_summer.markdown similarity index 100% rename from tests/data/markdown/what_i_did_this_summer.md rename to tests/data/markdown/what_i_did_this_summer.markdown diff --git a/tests/data/markdown/what_i_worked_on.md b/tests/data/markdown/what_i_worked_on.markdown similarity index 100% rename from tests/data/markdown/what_i_worked_on.md rename to tests/data/markdown/what_i_worked_on.markdown diff --git a/tests/data/markdown/why_yc.md b/tests/data/markdown/why_yc.markdown similarity index 100% rename from tests/data/markdown/why_yc.md rename to tests/data/markdown/why_yc.markdown From ab7dabe74f34d8eb54fa1dcc87a0d298cbf80172 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 11:44:30 -0700 Subject: [PATCH 4/9] Explicitly use Union type for function parameters for lint checks --- src/khoj/processor/github/github_to_jsonl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 6e9dc29b..69f7033c 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -1,7 +1,7 @@ # Standard Packages import logging import time -from typing import Dict, List +from typing import Dict, List, Union # External Packages import requests @@ -158,7 +158,7 @@ class GithubToJsonl(TextToJsonl): def get_commits(self, repo_url: str) -> List[Dict]: return self._get_commits(f"{repo_url}/commits") - def _get_commits(self, commits_url: str | None) -> List[Dict]: + def _get_commits(self, commits_url: Union[str, None]) -> List[Dict]: # Get commit messages from the repository using the Github API params = {"per_page": 100} commits = [] @@ -187,14 +187,14 @@ class GithubToJsonl(TextToJsonl): def get_issues(self, repo_url: str) -> List[Dict]: return self._get_issues(f"{repo_url}/issues") - def _get_issues(self, issues_url: str | None) -> List[Dict]: + def _get_issues(self, issues_url: Union[str, None]) -> List[Dict]: issues = [] per_page = 30 params = {"per_page": per_page, "state": "all"} while issues_url is not None: # Get the next page of issues - response = self.session.get(issues_url, params=params, stream=True) + response = self.session.get(issues_url, params=params, stream=True) # type: ignore raw_issues = response.json() # Wait for rate limit reset if needed @@ -218,7 +218,7 @@ class GithubToJsonl(TextToJsonl): return issues - def get_comments(self, comments_url: str | None) -> List[Dict]: + def get_comments(self, comments_url: Union[str, None]) -> List[Dict]: # By default, the number of results per page is 30. We'll keep it as-is for now. comments = [] per_page = 30 From e6053951f0ec98ff410343b661c8e3ef2a6a6a20 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 11:53:47 -0700 Subject: [PATCH 5/9] In chat conftest fixtures, use *.markdown rather than *.md --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index c5a000e3..dfb27b8b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,7 +120,7 @@ def md_content_config(tmp_path_factory): content_config = ContentConfig() content_config.markdown = TextContentConfig( input_files=None, - input_filter=["tests/data/markdown/*.md"], + input_filter=["tests/data/markdown/*.markdown"], compressed_jsonl=content_dir.joinpath("markdown.jsonl"), embeddings_file=content_dir.joinpath("markdown_embeddings.pt"), ) From b41c14b258a0a9d0c3eea766a5eeb21b45c054d3 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 11:55:18 -0700 Subject: [PATCH 6/9] Use *.markdown in the khoj_docker.yml --- config/khoj_docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/khoj_docker.yml b/config/khoj_docker.yml index 6ffe59f5..d74a5e82 100644 --- a/config/khoj_docker.yml +++ b/config/khoj_docker.yml @@ -11,7 +11,7 @@ content-type: markdown: input-files: null - input-filter: ["/data/markdown/**/*.md"] + input-filter: ["/data/markdown/**/*.markdown"] compressed-jsonl: "/data/embeddings/markdown.jsonl.gz" embeddings-file: "/data/embeddings/markdown_embeddings.pt" From 77672ac0aede45165fce6b0dada60b1bf48f8790 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 14:14:25 -0700 Subject: [PATCH 7/9] Demarcate different results with a border box - Add back support for searching by type Github - Remove custom class name in markdown js file --- .../interface/web/assets/markdown-it.min.js | 10 --- src/khoj/interface/web/index.html | 72 +++++++++++-------- src/khoj/processor/github/github_to_jsonl.py | 3 +- src/khoj/routers/api.py | 14 ++++ 4 files changed, 60 insertions(+), 39 deletions(-) diff --git a/src/khoj/interface/web/assets/markdown-it.min.js b/src/khoj/interface/web/assets/markdown-it.min.js index 7339517f..5b37e28e 100644 --- a/src/khoj/interface/web/assets/markdown-it.min.js +++ b/src/khoj/interface/web/assets/markdown-it.min.js @@ -3227,16 +3227,6 @@ result += (token.nesting === -1 ? "` + md.render(`${item.entry}`) + ``; + else { + rendered = md.render(`${item.entry}`); + } + return `
` + rendered + `
`; }).join("\n"); } @@ -59,29 +63,22 @@ }).join("\n"); } - function render_mutliple(query, data, type) { - let org_files = data.filter((item) => item.additional.file.endsWith(".org")); - let md_files = data.filter((item) => item.additional.file.endsWith(".md")); - let pdf_files = data.filter((item) => item.additional.file.endsWith(".pdf")); - let issue_files = data.filter((item) => item.additional.file.includes("issues") && item.additional.file.includes("github.com")); - + function render_multiple(query, data, type) { let html = ""; - if (org_files.length > 0) { - html += render_org(query, org_files, type); - } - - if (md_files.length > 0) { - html += render_markdown(query, md_files); - } - - if (issue_files.length > 0) { - html += render_markdown(query, issue_files); - } - - if (pdf_files.length > 0) { - html += render_pdf(query, pdf_files); - } - + data.forEach(item => { + if (item.additional.file.endsWith(".org")) { + html += render_org(query, [item], "org-"); + } else if ( + item.additional.file.endsWith(".md") || + item.additional.file.endsWith(".markdown") || + (item.additional.file.includes("issues") && item.additional.file.includes("github.com")) + ) + { + html += render_markdown(query, [item]); + } else if (item.additional.file.endsWith(".pdf")) { + html += render_pdf(query, [item]); + } + }); return html; } @@ -100,11 +97,25 @@ } else if (type === "pdf") { results = render_pdf(query, data); } else if (type === "github" || type === "all") { - results = render_mutliple(query, data, type); + results = render_multiple(query, data, type); } else { results = data.map((item) => `
` + `

${item.entry}

` + `
`).join("\n") } - return `
${results}
`; + + // Any POST rendering goes here. + + let renderedResults = document.createElement("div"); + renderedResults.id = `results-${type}`; + renderedResults.innerHTML = results; + + // For all elements that are of type img in the results html and have a src with 'avatar' in the URL, add the class 'avatar' + // This is used to make the avatar images round + let images = renderedResults.querySelectorAll("img[src*='avatar']"); + for (let i = 0; i < images.length; i++) { + images[i].classList.add("avatar"); + } + + return renderedResults.outerHTML; } function search(rerank=false) { @@ -277,7 +288,6 @@ margin: 0px; background: #f8fafc; color: #475569; - text-align: center; font-family: roboto, karma, segoe ui, sans-serif; font-size: 20px; font-weight: 300; @@ -388,12 +398,18 @@ text-decoration: none; } - img.md-avatar { + img.avatar { width: 20px; height: 20px; border-radius: 50%; } + div.results-markdown, + div.results-org, + div.results-pdf { + border: black 1px solid; + } + diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 69f7033c..e94749f1 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -1,6 +1,7 @@ # Standard Packages import logging import time +from datetime import datetime from typing import Dict, List, Union # External Packages @@ -235,7 +236,7 @@ class GithubToJsonl(TextToJsonl): return result for comment in raw_comments: - created_at = comment["created_at"].split("T")[0] + created_at = datetime.strptime(comment["created_at"], "%Y-%m-%dT%H:%M:%SZ").strftime("%Y-%m-%d %H:%M") commenter = comment["user"]["login"] commenter_url = comment["user"]["html_url"] comment_url = comment["html_url"] diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index 9069418b..980ab1fb 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -216,6 +216,20 @@ async def search( ) ] + if (t == SearchType.Github or t == SearchType.All) and state.model.github_search: + # query github issues + search_futures += [ + executor.submit( + text_search.query, + user_query, + state.model.github_search, + question_embedding=encoded_asymmetric_query, + rank_results=r or False, + score_threshold=score_threshold, + dedupe=dedupe or True, + ) + ] + if (t == SearchType.Pdf or t == SearchType.All) and state.model.pdf_search: # query pdf files search_futures += [ From 65bf894302288819fa30b57e24349260080cdb8d Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 15:12:48 -0700 Subject: [PATCH 8/9] Interpret org files as a list and put them in separate divs. Update styling of search results to separate into cards --- src/khoj/interface/web/index.html | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index e6195336..7a0f896c 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -24,13 +24,12 @@ } function render_org(query, data, classPrefix="") { - var orgCode = data.map(function (item) { - return `${item.entry}` - }).join("\n") - var orgParser = new Org.Parser(); - var orgDocument = orgParser.parse(orgCode); - var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix }); - return `
` + orgHTMLDocument.toString() + `
`; + return data.map(function (item) { + var orgParser = new Org.Parser(); + var orgDocument = orgParser.parse(item.entry); + var orgHTMLDocument = orgDocument.convert(Org.ConverterHTML, { htmlClassPrefix: classPrefix }); + return `
` + orgHTMLDocument.toString() + `
`; + }).join("\n"); } function render_markdown(query, data) { @@ -407,7 +406,12 @@ div.results-markdown, div.results-org, div.results-pdf { - border: black 1px solid; + text-align: left; + box-shadow: 2px 2px 2px var(--primary-hover); + border-radius: 5px; + padding: 10px; + margin: 10px 0; + border: 1px solid rgb(229, 229, 229); } From b2dd946c6daec64f0cfae6867b5b192dcb1e79ee Mon Sep 17 00:00:00 2001 From: sabaimran Date: Thu, 29 Jun 2023 15:23:50 -0700 Subject: [PATCH 9/9] Rename issue to entry method for accuracy --- src/khoj/processor/github/github_to_jsonl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index e94749f1..70ea7bf2 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -71,7 +71,7 @@ class GithubToJsonl(TextToJsonl): current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo) with timer(f"Extract issues from github repo {repo_shorthand}", logger): - issue_entries = GithubToJsonl.convert_issue_entries_to_maps( + issue_entries = GithubToJsonl.convert_issues_to_entries( *GithubToJsonl.extract_github_issues(self.get_issues(repo_url)) ) current_entries += issue_entries @@ -303,10 +303,10 @@ class GithubToJsonl(TextToJsonl): return entries, entry_to_file_map @staticmethod - def convert_issue_entries_to_maps(parsed_entries: List[str], entry_to_file_map: Dict[str, Dict]) -> List[Entry]: + def convert_issues_to_entries(parsed_entries: List[str], entry_to_metadata_map: Dict[str, Dict]) -> List[Entry]: entries = [] for entry in parsed_entries: - entry_file_name = entry_to_file_map[entry]["path"] + entry_file_name = entry_to_metadata_map[entry]["path"] entries.append( Entry( compiled=entry,