Add backend support for indexing multiple repositories

- Add support for indexing org files as well as markdown files from the GitHub repository, and update the corresponding search view
- Support indexing a list of repositories
2024-11-27 17:35:07 +01:00 · 2023-06-27 12:06:15 -07:00 · 2023-06-27 12:06:15 -07:00 · 37a1f15c38
commit 37a1f15c38
parent ddd550e6f4
7 changed files with 113 additions and 31 deletions
--- a/src/khoj/interface/web/index.html
+++ b/src/khoj/interface/web/index.html
@ -57,6 +57,27 @@
            }).join("\n") + `</div>`;
        }

+        function render_mutliple(query, data, type) {
+            let org_files = data.filter((item) => item.additional.file.endsWith(".org"));
+            let md_files = data.filter((item) => item.additional.file.endsWith(".md"));
+            let pdf_files = data.filter((item) => item.additional.file.endsWith(".pdf"));
+
+            let html = "";
+            if (org_files.length > 0) {
+                html += render_org(query, org_files, type);
+            }
+
+            if (md_files.length > 0) {
+                html += render_markdown(query, md_files);
+            }
+
+            if (pdf_files.length > 0) {
+                html += render_pdf(query, pdf_files);
+            }
+
+            return html;
+        }
+
        function render_json(data, query, type) {
            if (type === "markdown") {
                return render_markdown(query, data);
@ -71,7 +92,7 @@
            } else if (type === "pdf") {
                return render_pdf(query, data);
            } else if (type == "github") {
-                return render_markdown(query, data);
+                return render_mutliple(query, data, type);
            } else {
                return `<div id="results-plugin">`
                    + data.map((item) => `<p>${item.entry}</p>`).join("\n")
--- a/src/khoj/processor/github/github_to_jsonl.py
+++ b/src/khoj/processor/github/github_to_jsonl.py
@ -8,8 +8,9 @@ import requests

 # Internal Packages
 from khoj.utils.helpers import timer
-from khoj.utils.rawconfig import Entry, GithubContentConfig
+from khoj.utils.rawconfig import Entry, GithubContentConfig, GithubRepoConfig
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data

@ -21,7 +22,6 @@ class GithubToJsonl(TextToJsonl):
    def __init__(self, config: GithubContentConfig):
        super().__init__(config)
        self.config = config
-        self.repo_url = f"https://api.github.com/repos/{self.config.repo_owner}/{self.config.repo_name}"

    @staticmethod
    def wait_for_rate_limit_reset(response, func, *args, **kwargs):
@ -34,26 +34,42 @@ class GithubToJsonl(TextToJsonl):
            return

    def process(self, previous_entries=None):
+        current_entries = []
+        for repo in self.config.repos:
+            current_entries += self.process_repo(repo, previous_entries)
+
+        return self.update_entries_with_ids(current_entries, previous_entries)
+
+    def process_repo(self, repo: GithubRepoConfig, previous_entries=None):
+        repo_url = f"https://api.github.com/repos/{repo.owner}/{repo.name}"
+        logger.info(f"Processing github repo {repo.owner}/{repo.name}")
        with timer("Download markdown files from github repo", logger):
            try:
-                docs = self.get_markdown_files()
+                markdown_files, org_files = self.get_files(repo_url, repo)
            except Exception as e:
-                logger.error(f"Unable to download github repo {self.config.repo_owner}/{self.config.repo_name}")
+                logger.error(f"Unable to download github repo {repo.owner}/{repo.name}")
                raise e

-        logger.info(f"Found {len(docs)} documents in github repo {self.config.repo_owner}/{self.config.repo_name}")
+        logger.info(f"Found {len(markdown_files)} markdown files in github repo {repo.owner}/{repo.name}")
+        logger.info(f"Found {len(org_files)} org files in github repo {repo.owner}/{repo.name}")

-        with timer("Extract markdown entries from github repo", logger):
+        with timer(f"Extract markdown entries from github repo {repo.owner}/{repo.name}", logger):
            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
-                *GithubToJsonl.extract_markdown_entries(docs)
+                *GithubToJsonl.extract_markdown_entries(markdown_files)
            )

-        with timer("Extract commit messages from github repo", logger):
-            current_entries += self.convert_commits_to_entries(self.get_commits())
+        with timer(f"Extract org entries from github repo {repo.owner}/{repo.name}", logger):
+            current_entries += OrgToJsonl.convert_org_nodes_to_entries(*GithubToJsonl.extract_org_entries(org_files))

-        with timer("Split entries by max token size supported by model", logger):
+        with timer(f"Extract commit messages from github repo {repo.owner}/{repo.name}", logger):
+            current_entries += self.convert_commits_to_entries(self.get_commits(repo_url), repo)
+
+        with timer(f"Split entries by max token size supported by model {repo.owner}/{repo.name}", logger):
            current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)

+        return current_entries
+
+    def update_entries_with_ids(self, current_entries, previous_entries):
        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
@ -76,31 +92,40 @@ class GithubToJsonl(TextToJsonl):

        return entries_with_ids

-    def get_markdown_files(self):
+    def get_files(self, repo_url: str, repo: GithubRepoConfig):
        # Get the contents of the repository
-        repo_content_url = f"{self.repo_url}/git/trees/{self.config.repo_branch}"
+        repo_content_url = f"{repo_url}/git/trees/{repo.branch}"
        headers = {"Authorization": f"token {self.config.pat_token}"}
        params = {"recursive": "true"}
        response = requests.get(repo_content_url, headers=headers, params=params)
        contents = response.json()

        # Wait for rate limit reset if needed
-        result = self.wait_for_rate_limit_reset(response, self.get_markdown_files)
+        result = self.wait_for_rate_limit_reset(response, self.get_files)
        if result is not None:
            return result

        # Extract markdown files from the repository
        markdown_files = []
+        org_files = []
        for item in contents["tree"]:
            # Find all markdown files in the repository
            if item["type"] == "blob" and item["path"].endswith(".md"):
                # Create URL for each markdown file on Github
-                url_path = f'https://github.com/{self.config.repo_owner}/{self.config.repo_name}/blob/{self.config.repo_branch}/{item["path"]}'
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'

                # Add markdown file contents and URL to list
                markdown_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]

-        return markdown_files
+            # Find all org files in the repository
+            elif item["type"] == "blob" and item["path"].endswith(".org"):
+                # Create URL for each org file on Github
+                url_path = f'https://github.com/{repo.owner}/{repo.name}/blob/{repo.branch}/{item["path"]}'
+
+                # Add org file contents and URL to list
+                org_files += [{"content": self.get_file_contents(item["url"]), "path": url_path}]
+
+        return markdown_files, org_files

    def get_file_contents(self, file_url):
        # Get text from each markdown file
@ -114,9 +139,9 @@ class GithubToJsonl(TextToJsonl):

        return response.content.decode("utf-8")

-    def get_commits(self) -> List[Dict]:
+    def get_commits(self, repo_url: str) -> List[Dict]:
        # Get commit messages from the repository using the Github API
-        commits_url = f"{self.repo_url}/commits"
+        commits_url = f"{repo_url}/commits"
        headers = {"Authorization": f"token {self.config.pat_token}"}
        params = {"per_page": 100}
        commits = []
@ -140,10 +165,10 @@ class GithubToJsonl(TextToJsonl):

        return commits

-    def convert_commits_to_entries(self, commits) -> List[Entry]:
+    def convert_commits_to_entries(self, commits, repo: GithubRepoConfig) -> List[Entry]:
        entries: List[Entry] = []
        for commit in commits:
-            compiled = f'Commit message from {self.config.repo_owner}/{self.config.repo_name}:\n{commit["content"]}'
+            compiled = f'Commit message from {repo.owner}/{repo.name}:\n{commit["content"]}'
            entries.append(
                Entry(
                    compiled=compiled,
@ -164,3 +189,14 @@ class GithubToJsonl(TextToJsonl):
                doc["content"], doc["path"], entries, entry_to_file_map
            )
        return entries, dict(entry_to_file_map)
+
+    @staticmethod
+    def extract_org_entries(org_files):
+        entries = []
+        entry_to_file_map = []
+
+        for doc in org_files:
+            entries, entry_to_file_map = OrgToJsonl.process_single_org_file(
+                doc["content"], doc["path"], entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)
--- a/src/khoj/processor/markdown/markdown_to_jsonl.py
+++ b/src/khoj/processor/markdown/markdown_to_jsonl.py
@ -10,13 +10,17 @@ from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
 from khoj.utils.constants import empty_escape_sequences
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
-from khoj.utils.rawconfig import Entry
+from khoj.utils.rawconfig import Entry, TextContentConfig


 logger = logging.getLogger(__name__)


 class MarkdownToJsonl(TextToJsonl):
+    def __init__(self, config: TextContentConfig):
+        super().__init__(config)
+        self.config = config
+
    # Define Functions
    def process(self, previous_entries=None):
        # Extract required fields from config
--- a/src/khoj/processor/org_mode/org_to_jsonl.py
+++ b/src/khoj/processor/org_mode/org_to_jsonl.py
@ -9,7 +9,7 @@ from khoj.processor.org_mode import orgnode
 from khoj.processor.text_to_jsonl import TextToJsonl
 from khoj.utils.helpers import get_absolute_path, is_none_or_empty, timer
 from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
-from khoj.utils.rawconfig import Entry
+from khoj.utils.rawconfig import Entry, TextContentConfig
 from khoj.utils import state


@ -17,6 +17,10 @@ logger = logging.getLogger(__name__)


 class OrgToJsonl(TextToJsonl):
+    def __init__(self, config: TextContentConfig):
+        super().__init__(config)
+        self.config = config
+
    # Define Functions
    def process(self, previous_entries: List[Entry] = None):
        # Extract required fields from config
@ -96,12 +100,20 @@ class OrgToJsonl(TextToJsonl):
        entries = []
        entry_to_file_map = []
        for org_file in org_files:
-            org_file_entries = orgnode.makelist(str(org_file))
+            org_file_entries = orgnode.makelist_with_filepath(str(org_file))
            entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries))
            entries.extend(org_file_entries)

        return entries, dict(entry_to_file_map)

+    @staticmethod
+    def process_single_org_file(org_content: str, org_file: str, entries: List, entry_to_file_map: List):
+        # Process a single org file. The org parser assumes that its input is a single org file and reads it from a buffer. We split the raw content of this file by newline to mimic the same behavior.
+        org_file_entries = orgnode.makelist(org_content.split("\n"), org_file)
+        entry_to_file_map += zip(org_file_entries, [org_file] * len(org_file_entries))
+        entries.extend(org_file_entries)
+        return entries, entry_to_file_map
+
    @staticmethod
    def convert_org_nodes_to_entries(
        parsed_entries: List[orgnode.Orgnode], entry_to_file_map, index_heading_entries=False
--- a/src/khoj/processor/org_mode/orgnode.py
+++ b/src/khoj/processor/org_mode/orgnode.py
@ -53,14 +53,19 @@ def normalize_filename(filename):
    return escaped_filename


-def makelist(filename):
+def makelist_with_filepath(filename):
+    f = open(filename, "r")
+    return makelist(f, filename)
+
+
+def makelist(file, filename):
    """
    Read an org-mode file and return a list of Orgnode objects
    created from this file.
    """
    ctr = 0

-    f = open(filename, "r")
+    f = file

    todos = {
        "TODO": "",
--- a/src/khoj/utils/constants.py
+++ b/src/khoj/utils/constants.py
@ -49,9 +49,9 @@ default_config = {
        },
        "github": {
            "pat-token": None,
-            "repo-name": None,
-            "repo-owner": None,
-            "repo-branch": "master",
+            "repos": [
+                {"name": "khoj", "owner": "khoj-ai", "branch": "master"},
+            ],
            "compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz",
            "embeddings-file": "~/.khoj/content/github/github_embeddings.pt",
        },
--- a/src/khoj/utils/rawconfig.py
+++ b/src/khoj/utils/rawconfig.py
@ -41,11 +41,15 @@ class TextContentConfig(TextConfigBase):
        return input_filter


+class GithubRepoConfig(ConfigBase):
+    name: str
+    owner: str
+    branch: Optional[str] = "master"
+
+
 class GithubContentConfig(TextConfigBase):
    pat_token: str
-    repo_name: str
-    repo_owner: str
-    repo_branch: Optional[str] = "master"
+    repos: List[GithubRepoConfig]


 class ImageContentConfig(ConfigBase):