Mirror of https://github.com/khoj-ai/khoj.git, synced 2024-11-23 23:48:56 +01:00
Add a Github plugin which can be used to read from a Github repository
parent c68cde4803
commit a6cd96a6a9
18 changed files with 224 additions and 25 deletions

@@ -63,7 +63,7 @@
 - **General**
   - **Natural**: Advanced natural language understanding using Transformer based ML Models
   - **Pluggable**: Modular architecture makes it easy to plug in new data sources, frontends and ML models
-  - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files and Photos
+  - **Multiple Sources**: Index your Org-mode and Markdown notes, Beancount transactions, PDF files, Github repositories, and Photos
   - **Multiple Interfaces**: Interact from your [Web Browser](./src/khoj/interface/web/index.html), [Emacs](./src/interface/emacs/khoj.el) or [Obsidian](./src/interface/obsidian/)

 ## Demos

@@ -75,7 +75,7 @@ https://github.com/debanjum/khoj/assets/6413477/3e33d8ea-25bb-46c8-a3bf-c92f78d0
 - Install Khoj via `pip` and start Khoj backend in non-gui mode
 - Install Khoj plugin via Community Plugins settings pane on Obsidian app
 - Check the new Khoj plugin settings
-- Let Khoj backend index the markdown, pdf files in the current Vault
+- Let Khoj backend index the markdown, pdf, Github markdown files in the current Vault
 - Open Khoj plugin on Obsidian via Search button on Left Pane
 - Search "*Announce plugin to folks*" in the [Obsidian Plugin docs](https://marcus.se.net/obsidian-plugin-docs/)
 - Jump to the [search result](https://marcus.se.net/obsidian-plugin-docs/publishing/submit-your-plugin)

@@ -396,7 +396,7 @@ git clone https://github.com/debanjum/khoj && cd khoj

 ##### 2. Configure

-- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf and beancount directories
+- **Required**: Update [docker-compose.yml](./docker-compose.yml) to mount your images, (org-mode or markdown) notes, pdf, Github repositories, and beancount directories
 - **Optional**: Edit application configuration in [khoj_docker.yml](./config/khoj_docker.yml)

 ##### 3. Run

@@ -56,6 +56,7 @@ dependencies = [
     "aiohttp == 3.8.4",
     "langchain >= 0.0.187",
     "pypdf >= 3.9.0",
+    "llama-hub==0.0.3",
 ]
 dynamic = ["version"]

@@ -16,6 +16,7 @@ from khoj.processor.jsonl.jsonl_to_jsonl import JsonlToJsonl
 from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
 from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
 from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
+from khoj.processor.github.github_to_jsonl import GithubToJsonl
 from khoj.search_type import image_search, text_search
 from khoj.utils import constants, state
 from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel

@@ -153,6 +154,20 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
             config.content_type.image, search_config=config.search_type.image, regenerate=regenerate
         )

+    if (t == state.SearchType.Github or t == None) and config.content_type.github:
+        logger.info("🐙 Setting up search for github")
+        # Extract Entries, Generate Github Embeddings
+        try:
+            model.github_search = text_search.setup(
+                GithubToJsonl,
+                config.content_type.github,
+                search_config=config.search_type.asymmetric,
+                regenerate=regenerate,
+                filters=[DateFilter(), WordFilter(), FileFilter()],
+            )
+        except Exception as e:
+            logger.error(f"Failed to setup github search: {e}")
+
     # Initialize External Plugin Search
     if (t == None or t in state.SearchType) and config.content_type.plugins:
         logger.info("🔌 Setting up search for plugins")

@@ -3,14 +3,18 @@ from PyQt6 import QtWidgets

 # Internal Packages
 from khoj.utils.config import ProcessorType
+from khoj.utils.config import SearchType


 class LabelledTextField(QtWidgets.QWidget):
-    def __init__(self, title, processor_type: ProcessorType = None, default_value: str = None):
+    def __init__(
+        self, title, search_type: SearchType = None, processor_type: ProcessorType = None, default_value: str = None
+    ):
         QtWidgets.QWidget.__init__(self)
         layout = QtWidgets.QHBoxLayout()
         self.setLayout(layout)
         self.processor_type = processor_type
+        self.search_type = search_type

         self.label = QtWidgets.QLabel()
         self.label.setText(title)

@@ -62,7 +62,6 @@ class MainWindow(QtWidgets.QMainWindow):
                 search_type, None
             ) or self.get_default_config(search_type=search_type)
             self.search_settings_panels += [self.add_settings_panel(current_content_config, search_type)]
-
         # Add Conversation Processor Panel to Configure Screen
         self.processor_settings_panels = []
         conversation_type = ProcessorType.Conversation

@@ -88,6 +87,8 @@ class MainWindow(QtWidgets.QMainWindow):
         if search_type == SearchType.Image:
             current_content_files = current_content_config.get("input-directories", [])
             file_input_text = f"{search_type.name} Folders"
+        elif search_type == SearchType.Github:
+            return self.add_github_settings_panel(current_content_config, SearchType.Github)
         else:
             current_content_files = current_content_config.get("input-files", [])
             file_input_text = f"{search_type.name} Files"

@@ -111,6 +112,47 @@ class MainWindow(QtWidgets.QMainWindow):

         return search_type_settings

+    def add_github_settings_panel(self, current_content_config: dict, search_type: SearchType):
+        search_type_settings = QtWidgets.QWidget()
+        search_type_layout = QtWidgets.QVBoxLayout(search_type_settings)
+        enable_search_type = SearchCheckBox(f"Search {search_type.name}", search_type)
+        # Add labelled text input field
+        input_fields = []
+
+        pat_token = current_content_config.get("pat-token", None)
+        input_field = LabelledTextField("pat-token", search_type=search_type, default_value=pat_token)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        repo_name = current_content_config.get("repo-name", None)
+        input_field = LabelledTextField("repo-name", search_type=search_type, default_value=repo_name)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        repo_owner = current_content_config.get("repo-owner", None)
+        input_field = LabelledTextField("repo-owner", search_type=search_type, default_value=repo_owner)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        repo_branch = current_content_config.get("repo-branch", None)
+        input_field = LabelledTextField("repo-branch", search_type=search_type, default_value=repo_branch)
+        search_type_layout.addWidget(input_field)
+        input_fields += [input_field]
+
+        # Set enabled/disabled based on checkbox state
+        enable_search_type.setChecked(bool(repo_name or repo_owner or repo_branch or pat_token))
+        for input_field in input_fields:
+            input_field.setEnabled(enable_search_type.isChecked())
+        enable_search_type.stateChanged.connect(lambda _: [input_field.setEnabled(enable_search_type.isChecked()) for input_field in input_fields])  # type: ignore[attr-defined]
+
+        # Add setting widgets for given search type to panel
+        search_type_layout.addWidget(enable_search_type)
+        for input_field in input_fields:
+            search_type_layout.addWidget(input_field)
+        self.wlayout.addWidget(search_type_settings)
+
+        return search_type_settings
+
     def add_processor_panel(self, current_conversation_config: dict, processor_type: ProcessorType):
         "Add Conversation Processor Panel"
         # Get current settings from config for given processor type

@@ -185,7 +227,7 @@ class MainWindow(QtWidgets.QMainWindow):
         "Update config with search settings from UI"
         for settings_panel in self.search_settings_panels:
             for child in settings_panel.children():
-                if not isinstance(child, (SearchCheckBox, FileBrowser)):
+                if not isinstance(child, (SearchCheckBox, FileBrowser, LabelledTextField)):
                     continue
                 if isinstance(child, SearchCheckBox):
                     # Search Type Disabled

@@ -207,6 +249,10 @@ class MainWindow(QtWidgets.QMainWindow):
                     self.new_config["content-type"][child.search_type.value]["input-files"] = (
                         child.getPaths() if child.getPaths() != [] else None
                     )
+                elif isinstance(child, LabelledTextField):
+                    self.new_config["content-type"][child.search_type.value][
+                        child.label.text()
+                    ] = child.input_field.toPlainText()

     def update_processor_settings(self):
         "Update config with conversation settings from UI"

@@ -66,6 +66,8 @@
                 return render_ledger(query, data);
             } else if (type === "pdf") {
                 return render_pdf(query, data);
+            } else if (type == "github") {
+                return render_markdown(query, data);
             } else {
                 return `<div id="results-plugin">`
                     + data.map((item) => `<p>${item.entry}</p>`).join("\n")

@@ -296,7 +298,7 @@
                 text-align: left;
                 white-space: pre-line;
             }
-            #results-markdown {
+            #results-markdown, #results-github {
                 text-align: left;
             }
             #results-music,

src/khoj/processor/github/__init__.py (new file, 0 lines)
src/khoj/processor/github/github_to_jsonl.py (new file, 89 lines)

@@ -0,0 +1,89 @@
+import logging
+from llama_index import download_loader
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import GithubContentConfig
+from llama_hub.github_repo import GithubRepositoryReader, GithubClient
+from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
+from khoj.processor.text_to_jsonl import TextToJsonl
+from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
+from khoj.utils import state
+
+logger = logging.getLogger(__name__)
+
+
+class GithubToJsonl:
+    def __init__(self, config: GithubContentConfig):
+        self.config = config
+        download_loader("GithubRepositoryReader")
+
+    def process(self, previous_entries=None):
+        try:
+            self.initialize()
+        except Exception as e:
+            logger.error(
+                f"Unable to initialize Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}"
+            )
+            raise e
+
+        with timer("Download github repo", logger):
+            try:
+                docs = self.get_markdown_files()
+            except Exception as e:
+                logger.error(f"Unable to download github repo for {self.config.repo_owner}/{self.config.repo_name}")
+                raise e
+
+        logger.info(f"Found {len(docs)} documents in {self.config.repo_owner}/{self.config.repo_name}")
+
+        with timer("Extract markdown entries from github repo", logger):
+            current_entries = MarkdownToJsonl.convert_markdown_entries_to_maps(
+                *GithubToJsonl.extract_markdown_entries(docs)
+            )
+
+        with timer("Split entries by max token size supported by model", logger):
+            current_entries = TextToJsonl.split_entries_by_max_tokens(current_entries, max_tokens=256)
+
+        # Identify, mark and merge any new entries with previous entries
+        with timer("Identify new or updated entries", logger):
+            if not previous_entries:
+                entries_with_ids = list(enumerate(current_entries))
+            else:
+                entries_with_ids = TextToJsonl.mark_entries_for_update(
+                    current_entries, previous_entries, key="compiled", logger=logger
+                )
+
+        with timer("Write markdown entries to JSONL file", logger):
+            # Process Each Entry from All Notes Files
+            entries = list(map(lambda entry: entry[1], entries_with_ids))
+            jsonl_data = MarkdownToJsonl.convert_markdown_maps_to_jsonl(entries)
+
+            # Compress JSONL formatted Data
+            if self.config.compressed_jsonl.suffix == ".gz":
+                compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
+            elif self.config.compressed_jsonl.suffix == ".jsonl":
+                dump_jsonl(jsonl_data, self.config.compressed_jsonl)
+
+        return entries_with_ids
+
+    def initialize(self):
+        logger.info(f"Initializing Github Repository Reader for {self.config.repo_owner}/{self.config.repo_name}")
+        github_client = GithubClient(self.config.pat_token)
+        self.loader = GithubRepositoryReader(
+            github_client,
+            owner=self.config.repo_owner,
+            repo=self.config.repo_name,
+            filter_file_extensions=([".md"], GithubRepositoryReader.FilterType.INCLUDE),
+            verbose=state.verbose > 1,
+        )
+
+    def get_markdown_files(self):
+        return self.loader.load_data(branch=self.config.repo_branch)
+
+    @staticmethod
+    def extract_markdown_entries(markdown_files):
+        entries = []
+        entry_to_file_map = []
+        for doc in markdown_files:
+            entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                doc.get_text(), doc.extra_info.get("file_path"), entries, entry_to_file_map
+            )
+        return entries, dict(entry_to_file_map)

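For orientation, here is a minimal sketch of driving the new processor directly, outside the Khoj server. The repository, token, and paths below are placeholders, not part of this commit; the `GithubContentConfig` fields mirror the schema added in src/khoj/utils/rawconfig.py further down.

```python
# Sketch: index the markdown files of a Github repo with the new plugin.
# All config values below are placeholders.
from pathlib import Path

from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.utils.rawconfig import GithubContentConfig

config = GithubContentConfig(
    pat_token="ghp_...",  # hypothetical Github personal access token
    repo_name="khoj",
    repo_owner="khoj-ai",
    repo_branch="master",
    compressed_jsonl=Path("~/.khoj/content/github/github.jsonl.gz").expanduser(),
    embeddings_file=Path("~/.khoj/content/github/github_embeddings.pt").expanduser(),
)

# Downloads the repo's .md files, splits them into heading-level entries
# and writes them to the compressed JSONL file configured above.
entries_with_ids = GithubToJsonl(config).process(previous_entries=None)
print(f"Indexed {len(entries_with_ids)} entries")
```
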
@@ -41,7 +41,7 @@ class JsonlToJsonl(TextToJsonl):
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
-            entries_with_ids = self.mark_entries_for_update(
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries,
                 previous_entries,
                 key="compiled",

@@ -48,7 +48,7 @@ class BeancountToJsonl(TextToJsonl):
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
-            entries_with_ids = self.mark_entries_for_update(
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries, previous_entries, key="compiled", logger=logger
             )

@@ -49,7 +49,7 @@ class MarkdownToJsonl(TextToJsonl):
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
-            entries_with_ids = self.mark_entries_for_update(
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries, previous_entries, key="compiled", logger=logger
             )

@@ -101,27 +101,37 @@ class MarkdownToJsonl(TextToJsonl):
         "Extract entries by heading from specified Markdown files"

         # Regex to extract Markdown Entries by Heading
         markdown_heading_regex = r"^#"

         entries = []
         entry_to_file_map = []
         for markdown_file in markdown_files:
             with open(markdown_file, "r", encoding="utf8") as f:
                 markdown_content = f.read()
-                markdown_entries_per_file = []
-                any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
-                for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
-                    # Add heading level as the regex split removed it from entries with headings
-                    prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
-                    stripped_entry = entry.strip(empty_escape_sequences)
-                    if stripped_entry != "":
-                        markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
-
-                entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
-                entries.extend(markdown_entries_per_file)
+                entries, entry_to_file_map = MarkdownToJsonl.process_single_markdown_file(
+                    markdown_content, markdown_file, entries, entry_to_file_map
+                )

         return entries, dict(entry_to_file_map)

+    @staticmethod
+    def process_single_markdown_file(
+        markdown_content: str, markdown_file: Path, entries: List, entry_to_file_map: List
+    ):
+        markdown_heading_regex = r"^#"
+
+        markdown_entries_per_file = []
+        any_headings = re.search(markdown_heading_regex, markdown_content, flags=re.MULTILINE)
+        for entry in re.split(markdown_heading_regex, markdown_content, flags=re.MULTILINE):
+            # Add heading level as the regex split removed it from entries with headings
+            prefix = "#" if entry.startswith("#") else "# " if any_headings else ""
+            stripped_entry = entry.strip(empty_escape_sequences)
+            if stripped_entry != "":
+                markdown_entries_per_file.append(f"{prefix}{stripped_entry}")
+
+        entry_to_file_map += zip(markdown_entries_per_file, [markdown_file] * len(markdown_entries_per_file))
+        entries.extend(markdown_entries_per_file)
+        return entries, entry_to_file_map
+
     @staticmethod
     def convert_markdown_entries_to_maps(parsed_entries: List[str], entry_to_file_map) -> List[Entry]:
         "Convert each Markdown entries into a dictionary"

@@ -50,7 +50,7 @@ class OrgToJsonl(TextToJsonl):
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
-            entries_with_ids = self.mark_entries_for_update(
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries, previous_entries, key="compiled", logger=logger
             )

@@ -48,7 +48,7 @@ class PdfToJsonl(TextToJsonl):
         if not previous_entries:
             entries_with_ids = list(enumerate(current_entries))
         else:
-            entries_with_ids = self.mark_entries_for_update(
+            entries_with_ids = TextToJsonl.mark_entries_for_update(
                 current_entries, previous_entries, key="compiled", logger=logger
             )

@@ -60,8 +60,9 @@ class TextToJsonl(ABC):

         return chunked_entries

+    @staticmethod
     def mark_entries_for_update(
-        self, current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None
+        current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None
     ) -> List[Tuple[int, Entry]]:
         # Hash all current and previous entries to identify new entries
         with timer("Hash previous, current entries", logger):

@@ -121,6 +121,17 @@ def search(
         with timer("Collating results took", logger):
             results = text_search.collate_results(hits, entries, results_count)

+    elif (t == SearchType.Github or t == None) and state.model.github_search:
+        # query github embeddings
+        with timer("Query took", logger):
+            hits, entries = text_search.query(
+                user_query, state.model.github_search, rank_results=r, score_threshold=score_threshold, dedupe=dedupe
+            )
+
+        # collate and return results
+        with timer("Collating results took", logger):
+            results = text_search.collate_results(hits, entries, results_count)
+
     elif (t == SearchType.Ledger or t == None) and state.model.ledger_search:
         # query transactions
         with timer("Query took", logger):

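Once the backend has indexed the repository, the new content type is queryable like any other. A quick sketch against the search endpoint (the base URL is a placeholder for wherever the Khoj backend runs; each result carries the `entry` field the web interface renders above):

```python
# Sketch: query Khoj's /api/search endpoint for the new "github" content type.
import json
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({"q": "how to install", "t": "github", "n": 5})
with urllib.request.urlopen(f"http://localhost:8000/api/search?{params}") as response:
    results = json.load(response)

for result in results:
    print(result["entry"])  # the matched markdown entry
```
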
@@ -23,6 +23,7 @@ class SearchType(str, Enum):
     Markdown = "markdown"
     Image = "image"
     Pdf = "pdf"
+    Github = "github"


 class ProcessorType(str, Enum):

@@ -64,6 +65,7 @@ class SearchModels:
     markdown_search: TextSearchModel = None
     pdf_search: TextSearchModel = None
     image_search: ImageSearchModel = None
+    github_search: TextSearchModel = None
     plugin_search: Dict[str, TextSearchModel] = None

@@ -47,6 +47,14 @@ default_config = {
             "compressed-jsonl": "~/.khoj/content/music/music.jsonl.gz",
             "embeddings-file": "~/.khoj/content/music/music_embeddings.pt",
         },
+        "github": {
+            "pat-token": None,
+            "repo-name": None,
+            "repo-owner": None,
+            "repo-branch": "master",
+            "compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz",
+            "embeddings-file": "~/.khoj/content/github/github_embeddings.pt",
+        },
     },
     "search-type": {
         "symmetric": {

@@ -32,6 +32,15 @@ class TextContentConfig(ConfigBase):
         return input_filter


+class GithubContentConfig(ConfigBase):
+    pat_token: str
+    repo_name: str
+    repo_owner: str
+    repo_branch: Optional[str] = "master"
+    compressed_jsonl: Path
+    embeddings_file: Path
+
+
 class ImageContentConfig(ConfigBase):
     input_directories: Optional[List[Path]]
     input_filter: Optional[List[str]]

@@ -57,6 +66,7 @@ class ContentConfig(ConfigBase):
     music: Optional[TextContentConfig]
     markdown: Optional[TextContentConfig]
     pdf: Optional[TextContentConfig]
+    github: Optional[GithubContentConfig]
     plugins: Optional[Dict[str, TextContentConfig]]