From 37a1f15c385ee0010582bf95b63d7f1ec5d81c5d Mon Sep 17 00:00:00 2001 From: sabaimran Date: Tue, 27 Jun 2023 12:06:15 -0700 Subject: [PATCH 1/4] Add backend support for indexing multiple repositories - Add support for indexing org files as well as markdown files from the Github repository and update corresponding search view - Support indexing a list of repositories --- src/khoj/interface/web/index.html | 23 +++++- src/khoj/processor/github/github_to_jsonl.py | 74 ++++++++++++++----- .../processor/markdown/markdown_to_jsonl.py | 6 +- src/khoj/processor/org_mode/org_to_jsonl.py | 16 +++- src/khoj/processor/org_mode/orgnode.py | 9 ++- src/khoj/utils/constants.py | 6 +- src/khoj/utils/rawconfig.py | 10 ++- 7 files changed, 113 insertions(+), 31 deletions(-) diff --git a/src/khoj/interface/web/index.html b/src/khoj/interface/web/index.html index 409a4504..d5ccb8f9 100644 --- a/src/khoj/interface/web/index.html +++ b/src/khoj/interface/web/index.html @@ -57,6 +57,27 @@ }).join("\n") + ``; } + function render_mutliple(query, data, type) { + let org_files = data.filter((item) => item.additional.file.endsWith(".org")); + let md_files = data.filter((item) => item.additional.file.endsWith(".md")); + let pdf_files = data.filter((item) => item.additional.file.endsWith(".pdf")); + + let html = ""; + if (org_files.length > 0) { + html += render_org(query, org_files, type); + } + + if (md_files.length > 0) { + html += render_markdown(query, md_files); + } + + if (pdf_files.length > 0) { + html += render_pdf(query, pdf_files); + } + + return html; + } + function render_json(data, query, type) { if (type === "markdown") { return render_markdown(query, data); @@ -71,7 +92,7 @@ } else if (type === "pdf") { return render_pdf(query, data); } else if (type == "github") { - return render_markdown(query, data); + return render_mutliple(query, data, type); } else { return `
` + data.map((item) => `

${item.entry}

`).join("\n") diff --git a/src/khoj/processor/github/github_to_jsonl.py b/src/khoj/processor/github/github_to_jsonl.py index 80d55f38..4562d91d 100644 --- a/src/khoj/processor/github/github_to_jsonl.py +++ b/src/khoj/processor/github/github_to_jsonl.py @@ -8,8 +8,9 @@ import requests # Internal Packages from khoj.utils.helpers import timer -from khoj.utils.rawconfig import Entry, GithubContentConfig +from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl +from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data @@ -21,7 +22,6 @@ class GithubToJsonl(TextToJsonl): def __init__(self, config: GithubContentConfig): super().__init__(config) self.config = config - self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}" @staticmethod def wait_for_rate_limit_reset(response, func, *args, **kwargs): @@ -34,26 +34,42 @@ class GithubToJsonl(TextToJsonl): return def process(self, previous_entries=None): + current_entries = [] + for repo in self.config.repos: + current_entries += self.process_repo(repo, previous_entries) + + return self.update_entries_with_ids(current_entries, previous_entries) + + def process_repo(self, repo: GithubRepoConfig, previous_entries=None): + repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}" + logger.info(f"Processing github repo {repo.owner}/{repo.name}") with timer("Download markdown files from github repo", logger): try: - docs = self.get_markdown_files() + markdown_files, org_files = self.get_files(repo_url, repo) except Exception as e: - logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}") + logger.error(f"Unable to download github repo {repo.owner}/{repo.name}") raise e - logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}") + logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo.owner}/{repo.name}") + logger.info(f"Found {len(org_files)} org files in github repo {repo.owner}/{repo.name}") - with timer("Extract markdown entries from github repo", logger): + with timer(f"Extract markdown entries from github repo {repo.owner}/{repo.name}", logger): current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps( - *GithubToJsonl.extract_markdown_entries(docs) + *GithubToJsonl.extract_markdown_entries(markdown_files) ) - with timer("Extract commit messages from github repo", logger): - current_entries += self.convert_commits_to_entries(self.get_commits()) + with timer(f"Extract org entries from github repo {repo.owner}/{repo.name}", logger): + current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files)) - with timer("Split entries by max token size supported by model", logger): + with timer(f"Extract commit messages from github repo {repo.owner}/{repo.name}", logger): + current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo) + + with timer(f"Split entries by max token size supported by model {repo.owner}/{repo.name}", logger): current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256) + return current_entries + + def update_entries_with_ids(self, current_entries, previous_entries): # Identify, mark and merge any new entries with previous entries with timer("Identify new or updated entries", logger): if not previous_entries: @@ -76,31 +92,40 @@ class GithubToJsonl(TextToJsonl): return entries_with_ids - def get_markdown_files(self): + def get_files(self, repo_url: str, repo: GithubRepoConfig): # Get the contents of the repository - repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}" + repo_content_url = f"{repo_url}/git/trees/{repo.branch}" headers = {"Authorization": f"token {self.config.pat_token}"} params = {"recursive": "true"} response = requests.get(repo_content_url, headers=headers, params=params) contents = response.json() # Wait for rate limit reset if needed - result = self.wait_for_rate_limit_reset(response, self.get_markdown_files) + result = self.wait_for_rate_limit_reset(response, self.get_files) if result is not None: return result # Extract markdown files from the repository markdown_files = [] + org_files = [] for item in contents["tree"]: # Find all markdown files in the repository if item["type"] == "blob" and item["path"].endswith(".md"): # Create URL for each markdown file on Github - url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}' + url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}' # Add markdown file contents and URL to list markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}] - return markdown_files + # Find all org files in the repository + elif item["type"] == "blob" and item["path"].endswith(".org"): + # Create URL for each org file on Github + url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}' + + # Add org file contents and URL to list + org_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}] + + return markdown_files, org_files def get_file_contents(self, file_url): # Get text from each markdown file @@ -114,9 +139,9 @@ class GithubToJsonl(TextToJsonl): return response.content.decode("utf-8") - def get_commits(self) -> List[Dict]: + def get_commits(self, repo_url: str) -> List[Dict]: # Get commit messages from the repository using the Github API - commits_url = f"{self.repo_url}/commits" + commits_url = f"{repo_url}/commits" headers = {"Authorization": f"token {self.config.pat_token}"} params = {"per_page": 100} commits = [] @@ -140,10 +165,10 @@ class GithubToJsonl(TextToJsonl): return commits - def convert_commits_to_entries(self, commits) -> List[Entry]: + def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]: entries: List[Entry] = [] for commit in commits: - compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}' + compiled = f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}' entries.append( Entry( compiled=compiled, @@ -164,3 +189,14 @@ class GithubToJsonl(TextToJsonl): doc["content"], doc["path"], entries, entry_to_file_map ) return entries, dict(entry_to_file_map) + + @staticmethod + def extract_org_entries(org_files): + entries = [] + entry_to_file_map = [] + + for doc in org_files: + entries, entry_to_file_map = OrgToJsonl.process_single_org_file( + doc["content"], doc["path"], entries, entry_to_file_map + ) + return entries, dict(entry_to_file_map) diff --git a/src/khoj/processor/markdown/markdown_to_jsonl.py b/src/khoj/processor/markdown/markdown_to_jsonl.py index efb508ad..6c2beb45 100644 --- a/src/khoj/processor/markdown/markdown_to_jsonl.py +++ b/src/khoj/processor/markdown/markdown_to_jsonl.py @@ -10,13 +10,17 @@ from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer from khoj.utils.constants import empty_escape_sequences from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data -from khoj.utils.rawconfig import Entry +from khoj.utils.rawconfig import Entry, TextContentConfig logger = logging.getLogger(__name__) class MarkdownToJsonl(TextToJsonl): + def __init__(self, config: TextContentConfig): + super().__init__(config) + self.config = config + # Define Functions def process(self, previous_entries=None): # Extract required fields from config diff --git a/src/khoj/processor/org_mode/org_to_jsonl.py b/src/khoj/processor/org_mode/org_to_jsonl.py index 96f2238e..5f29ddc9 100644 --- a/src/khoj/processor/org_mode/org_to_jsonl.py +++ b/src/khoj/processor/org_mode/org_to_jsonl.py @@ -9,7 +9,7 @@ from khoj.processor.org_mode import orgnode from khoj.processor.text_to_jsonl import TextToJsonl from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data -from khoj.utils.rawconfig import Entry +from khoj.utils.rawconfig import Entry, TextContentConfig from khoj.utils import state @@ -17,6 +17,10 @@ logger = logging.getLogger(__name__) class OrgToJsonl(TextToJsonl): + def __init__(self, config: TextContentConfig): + super().__init__(config) + self.config = config + # Define Functions def process(self, previous_entries: List[Entry] = None): # Extract required fields from config @@ -96,12 +100,20 @@ class OrgToJsonl(TextToJsonl): entries = [] entry_to_file_map = [] for org_file in org_files: - org_file_entries = orgnode.makelist(str(org_file)) + org_file_entries = orgnode.makelist_with_filepath(str(org_file)) entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) entries.extend(org_file_entries) return entries, dict(entry_to_file_map) + @staticmethod + def process_single_org_file(org_content: str, org_file: str, entries: List, entry_to_file_map: List): + # Process single org file. The org parser assumes that the file is a single org file and reads it from a buffer. We'll split the raw conetnt of this file by new line to mimic the same behavior. + org_file_entries = orgnode.makelist(org_content.split("\n"), org_file) + entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries)) + entries.extend(org_file_entries) + return entries, entry_to_file_map + @staticmethod def convert_org_nodes_to_entries( parsed_entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False diff --git a/src/khoj/processor/org_mode/orgnode.py b/src/khoj/processor/org_mode/orgnode.py index c4b0afa6..e6c92ea9 100644 --- a/src/khoj/processor/org_mode/orgnode.py +++ b/src/khoj/processor/org_mode/orgnode.py @@ -53,14 +53,19 @@ def normalize_filename(filename): return escaped_filename -def makelist(filename): +def makelist_with_filepath(filename): + f = open(filename, "r") + return makelist(f, filename) + + +def makelist(file, filename): """ Read an org-mode file and return a list of Orgnode objects created from this file. """ ctr = 0 - f = open(filename, "r") + f = file todos = { "TODO": "", diff --git a/src/khoj/utils/constants.py b/src/khoj/utils/constants.py index 9bc0c418..ce4596f0 100644 --- a/src/khoj/utils/constants.py +++ b/src/khoj/utils/constants.py @@ -49,9 +49,9 @@ default_config = { }, "github": { "pat-token": None, - "repo-name": None, - "repo-owner": None, - "repo-branch": "master", + "repos": [ + {"name": "khoj", "owner": "khoj-ai", "branch": "master"}, + ], "compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz", "embeddings-file": "~/.khoj/content/github/github_embeddings.pt", }, diff --git a/src/khoj/utils/rawconfig.py b/src/khoj/utils/rawconfig.py index e4f5074a..5d9dcff4 100644 --- a/src/khoj/utils/rawconfig.py +++ b/src/khoj/utils/rawconfig.py @@ -41,11 +41,15 @@ class TextContentConfig(TextConfigBase): return input_filter +class GithubRepoConfig(ConfigBase): + name: str + owner: str + branch: Optional[str] = "master" + + class GithubContentConfig(TextConfigBase): pat_token: str - repo_name: str - repo_owner: str - repo_branch: Optional[str] = "master" + repos: List[GithubRepoConfig] class ImageContentConfig(ConfigBase): From 227169ebdef899a96c32baf05e0e95ead599a61c Mon Sep 17 00:00:00 2001 From: sabaimran Date: Tue, 27 Jun 2023 14:10:09 -0700 Subject: [PATCH 2/4] Support configuration of multiple Github repositories in the settings interface - Add cards to configure each of the Github repositories - Fix a bug in the API which caused all other settings to be wiped when updating one of the content types - Provide an error message to the user if they have a misconfiguration in their chat settings --- src/khoj/interface/web/chat.html | 21 ++- .../web/content_type_github_input.html | 128 +++++++++++++----- src/khoj/routers/api.py | 14 +- 3 files changed, 125 insertions(+), 38 deletions(-) diff --git a/src/khoj/interface/web/chat.html b/src/khoj/interface/web/chat.html index e998836e..56cc99d8 100644 --- a/src/khoj/interface/web/chat.html +++ b/src/khoj/interface/web/chat.html @@ -84,12 +84,21 @@ window.onload = function () { fetch('/api/chat?client=web') .then(response => response.json()) - .then(data => data.response) - .then(chat_logs => { + .then(data => { + if (data.detail) { + // If the server returns a 500 error with detail, render it as a message. + renderMessage(data.detail + " You can configure Khoj chat in your settings.", "khoj"); + } + return data.response; + }) + .then(response => { // Render conversation history, if any - chat_logs.forEach(chat_log => { + response.forEach(chat_log => { renderMessageWithReference(chat_log.message, chat_log.by, chat_log.context, new Date(chat_log.created)); }); + }) + .catch(err => { + return; }); // Set welcome message on load @@ -235,6 +244,12 @@ font-size: medium; } + a.inline-chat-link { + color: #475569; + text-decoration: none; + border-bottom: 1px dotted #475569; + } + @media (pointer: coarse), (hover: none) { abbr[title] { position: relative; diff --git a/src/khoj/interface/web/content_type_github_input.html b/src/khoj/interface/web/content_type_github_input.html index fc1c4fce..f22f00cf 100644 --- a/src/khoj/interface/web/content_type_github_input.html +++ b/src/khoj/interface/web/content_type_github_input.html @@ -16,31 +16,25 @@ - - - - - - - - - - - - - - - - - - - - - - - - +

Repositories

+
+ {% for repo in current_config['repos'] %} +
+ + + + + + + +
+ {% endfor %} +
+

You probably don't need to edit these.

@@ -68,16 +62,86 @@ +