Add a plugin which allows users to index their Notion pages (#284)

* For the demo instance, re-instate the scheduler, but infrequently for api updates
- In constants, determine the cadence based on whether it's a demo instance or not
- This allows us to collect telemetry again. This will also allow us to save the chat session
* Conditionally skip updating the index altogether if it's a demo instance
* Add backend support for Notion data parsing
- Add a NotionToJsonl class which parses the text of Notion documents made accessible to the API token
- Make corresponding updates to the default config, raw config to support the new notion addition
* Add corresponding views to support configuring Notion from the web-based settings page
- Support backend APIs for deleting/configuring notion setup as well
- Streamline some of the index updating code
* Use defaults for search and chat queries results count
* Update pagination of retrieving pages from Notion
* Update state conversation processor when update is hit
* frequency_penalty should be passed to gpt through kwargs
* Add check for notion in render_multiple method
* Add headings to Notion render
* Revert results count slider and split Notion files by blocks
* Clean/fix misc things in the function to update index
- Use the successText and errorText variables appropriately
- Name parameters in function calls
- Add emojis, woohoo
* Clean up and further modularize code for processing data in Notion
This commit is contained in:
sabaimran 2023-07-09 15:29:26 -07:00 committed by GitHub
parent 77755c0284
commit 62704cac09
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 520 additions and 64 deletions

View file

@ -17,6 +17,7 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
from khoj.processor.github.github_to_jsonl import GithubToJsonl
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
from khoj.search_type import image_search, text_search
from khoj.utils import constants, state
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
@ -169,6 +170,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
# Initialize Notion Search
if (t == None or t in state.SearchType) and config.content_type.notion:
logger.info("🔌 Setting up search for notion")
model.notion_search = text_search.setup(
NotionToJsonl,
config.content_type.notion,
search_config=config.search_type.asymmetric,
regenerate=regenerate,
filters=[DateFilter(), WordFilter(), FileFilter()],
)
except Exception as e:
logger.error("🚨 Failed to setup search")
raise e
@ -248,7 +261,7 @@ def save_chat_session():
@schedule.repeat(schedule.every(59).minutes)
def upload_telemetry():
if not state.config or not state.config.app.should_log_telemetry or not state.telemetry:
if not state.config or not state.config.app or not state.config.app.should_log_telemetry or not state.telemetry:
message = "📡 No telemetry to upload" if not state.telemetry else "📡 Telemetry logging disabled"
logger.debug(message)
return

View file

@ -0,0 +1,4 @@
<svg width="100" height="100" viewBox="0 0 100 100" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M6.017 4.313l55.333 -4.087c6.797 -0.583 8.543 -0.19 12.817 2.917l17.663 12.443c2.913 2.14 3.883 2.723 3.883 5.053v68.243c0 4.277 -1.553 6.807 -6.99 7.193L24.467 99.967c-4.08 0.193 -6.023 -0.39 -8.16 -3.113L3.3 79.94c-2.333 -3.113 -3.3 -5.443 -3.3 -8.167V11.113c0 -3.497 1.553 -6.413 6.017 -6.8z" fill="#fff"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M61.35 0.227l-55.333 4.087C1.553 4.7 0 7.617 0 11.113v60.66c0 2.723 0.967 5.053 3.3 8.167l13.007 16.913c2.137 2.723 4.08 3.307 8.16 3.113l64.257 -3.89c5.433 -0.387 6.99 -2.917 6.99 -7.193V20.64c0 -2.21 -0.873 -2.847 -3.443 -4.733L74.167 3.143c-4.273 -3.107 -6.02 -3.5 -12.817 -2.917zM25.92 19.523c-5.247 0.353 -6.437 0.433 -9.417 -1.99L8.927 11.507c-0.77 -0.78 -0.383 -1.753 1.557 -1.947l53.193 -3.887c4.467 -0.39 6.793 1.167 8.54 2.527l9.123 6.61c0.39 0.197 1.36 1.36 0.193 1.36l-54.933 3.307 -0.68 0.047zM19.803 88.3V30.367c0 -2.53 0.777 -3.697 3.103 -3.893L86 22.78c2.14 -0.193 3.107 1.167 3.107 3.693v57.547c0 2.53 -0.39 4.67 -3.883 4.863l-60.377 3.5c-3.493 0.193 -5.043 -0.97 -5.043 -4.083zm59.6 -54.827c0.387 1.75 0 3.5 -1.75 3.7l-2.91 0.577v42.773c-2.527 1.36 -4.853 2.137 -6.797 2.137 -3.107 0 -3.883 -0.973 -6.21 -3.887l-19.03 -29.94v28.967l6.02 1.363s0 3.5 -4.857 3.5l-13.39 0.777c-0.39 -0.78 0 -2.723 1.357 -3.11l3.497 -0.97v-38.3L30.48 40.667c-0.39 -1.75 0.58 -4.277 3.3 -4.473l14.367 -0.967 19.8 30.327v-26.83l-5.047 -0.58c-0.39 -2.143 1.163 -3.7 3.103 -3.89l13.4 -0.78z" fill="#000"/>
</svg>

After

Width:  |  Height:  |  Size: 1.5 KiB

View file

@ -93,6 +93,7 @@
// Decode message chunk from stream
const chunk = decoder.decode(value, { stream: true });
if (chunk.includes("### compiled references:")) {
const additionalResponse = chunk.split("### compiled references:")[0];
new_response_text.innerHTML += additionalResponse;

View file

@ -14,7 +14,6 @@
<img id="configured-icon-github" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
{% endif %}
</h3>
</div>
<div class="card-description-row">
<p class="card-description">Set repositories for Khoj to index</p>
@ -37,6 +36,37 @@
</div>
{% endif %}
</div>
<div class="card">
    {# Notion content source card: shows configured state, a setup/update link and a disable button #}
    <div class="card-title-row">
        <img class="card-icon" src="/static/assets/icons/notion.svg" alt="Notion">
        <h3 class="card-title">
            Notion
            {% if current_config.content_type.notion %}
            <img id="configured-icon-notion" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
            {% endif %}
        </h3>
    </div>
    <div class="card-description-row">
        <p class="card-description">Configure your settings from Notion</p>
    </div>
    <div class="card-action-row">
        <a class="card-button" href="/config/content_type/notion">
            {# Label depends on whether Notion is configured, using the same check as the icon above and the disable button below #}
            {% if current_config.content_type.notion %}
            Update
            {% else %}
            Setup
            {% endif %}
            <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
        </a>
    </div>
    {% if current_config.content_type.notion %}
    <div id="clear-notion" class="card-action-row">
        <button class="card-button" onclick="clearContentType('notion')">
            Disable
        </button>
    </div>
    {% endif %}
</div>
<div class="card">
<div class="card-title-row">
<img class="card-icon" src="/static/assets/icons/markdown.svg" alt="markdown">
@ -224,40 +254,32 @@
var configure = document.getElementById("configure");
configure.addEventListener("click", function(event) {
event.preventDefault();
configure.disabled = true;
configure.innerHTML = "Configuring...";
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
fetch('/api/update?&client=web', {
method: 'GET',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': csrfToken
}
})
.then(response => response.json())
.then(data => {
console.log('Success:', data);
document.getElementById("status").innerHTML = "Configured successfully!";
document.getElementById("status").style.display = "block";
configure.disabled = false;
configure.innerHTML = "⚙️ Configured";
})
.catch((error) => {
console.error('Error:', error);
document.getElementById("status").innerHTML = "Unable to save configuration. Raise issue on Khoj Discord or Github.";
document.getElementById("status").style.display = "block";
configure.disabled = false;
configure.innerHTML = "⚙️ Configure";
});
updateIndex(
force=false,
successText="Configured successfully!",
errorText="Unable to configure. Raise issue on Khoj <a href='https://github.com/khoj-ai/khoj/issues'>Github</a> or <a href='https://discord.gg/BDgyabRM6e'>Discord</a>.",
button=configure,
loadingText="Configuring...",
emoji="⚙️");
});
var reinitialize = document.getElementById("reinitialize");
reinitialize.addEventListener("click", function(event) {
event.preventDefault();
reinitialize.disabled = true;
reinitialize.innerHTML = "Reinitializing...";
updateIndex(
force=true,
successText="Reinitialized successfully!",
errorText="Unable to reinitialize. Raise issue on Khoj <a href='https://github.com/khoj-ai/khoj/issues'>Github</a> or <a href='https://discord.gg/BDgyabRM6e'>Discord</a>.",
button=reinitialize,
loadingText="Reinitializing...",
emoji="🔄");
});
function updateIndex(force, successText, errorText, button, loadingText, emoji) {
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
fetch('/api/update?&client=web&force=True', {
button.disabled = true;
button.innerHTML = emoji + loadingText;
fetch('/api/update?&client=web&force=' + force, {
method: 'GET',
headers: {
'Content-Type': 'application/json',
@ -267,19 +289,22 @@
.then(response => response.json())
.then(data => {
console.log('Success:', data);
document.getElementById("status").innerHTML = "Reinitialized successfully!";
if (data.detail != null) {
throw new Error(data.detail);
}
document.getElementById("status").innerHTML = emoji + successText;
document.getElementById("status").style.display = "block";
reinitialize.disabled = false;
reinitialize.innerHTML = "🔄 Reinitialized";
button.disabled = false;
button.innerHTML = '✅ Done!';
})
.catch((error) => {
console.error('Error:', error);
document.getElementById("status").innerHTML = "Unable to reinitialize. Raise issue on Khoj Discord or Github.";
document.getElementById("status").innerHTML = emoji + errorText
document.getElementById("status").style.display = "block";
reinitialize.disabled = false;
reinitialize.innerHTML = "🔄 Reinitialize";
button.disabled = false;
button.innerHTML = '⚠️ Unsuccessful';
});
});
}
// Setup the results count slider
const resultsCountSlider = document.getElementById('results-count-slider');

View file

@ -0,0 +1,86 @@
{% extends "base_config.html" %}
{% block content %}
<div class="page">
<div class="section">
<h2 class="section-title">
<img class="card-icon" src="/static/assets/icons/notion.svg" alt="Notion">
<span class="card-title-text">Notion</span>
</h2>
<form>
<table>
<tr>
<td>
<label for="token">Token</label>
</td>
<td>
<input type="text" id="token" name="pat" value="{{ current_config['token'] }}">
</td>
</tr>
</table>
<table style="display: none;" >
<tr>
<td>
<label for="compressed-jsonl">Compressed JSONL (Output)</label>
</td>
<td>
<input type="text" id="compressed-jsonl" name="compressed-jsonl" value="{{ current_config['compressed_jsonl'] }}">
</td>
</tr>
<tr>
<td>
<label for="embeddings-file">Embeddings File (Output)</label>
</td>
<td>
<input type="text" id="embeddings-file" name="embeddings-file" value="{{ current_config['embeddings_file'] }}">
</td>
</tr>
</table>
<div class="section">
<div id="success" style="display: none;"></div>
<button id="submit" type="submit">Save</button>
</div>
</form>
</div>
</div>
<script>
// Save the Notion settings entered in the form via the config API
const submit = document.getElementById("submit");
submit.addEventListener("click", function(event) {
    // Keep the browser from submitting the form; the request is sent with fetch below
    event.preventDefault();

    // Show a message in the status element under the form
    function showStatus(message) {
        const statusElement = document.getElementById("success");
        statusElement.innerHTML = message;
        statusElement.style.display = "block";
    }

    const compressed_jsonl = document.getElementById("compressed-jsonl").value;
    const embeddings_file = document.getElementById("embeddings-file").value;
    const token = document.getElementById("token").value;

    if (token == "") {
        showStatus("❌ Please enter a Notion Token.");
        return;
    }

    const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
    fetch('/api/config/data/content_type/notion', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'X-CSRFToken': csrfToken,
        },
        body: JSON.stringify({
            "token": token,
            "compressed_jsonl": compressed_jsonl,
            "embeddings_file": embeddings_file,
        })
    })
    .then(response => response.json())
    .then(data => {
        if (data["status"] == "ok") {
            showStatus("✅ Successfully updated. Go to your <a href='/config'>settings page</a> to complete setup.");
        } else {
            showStatus("⚠️ Failed to update settings.");
        }
    })
    .catch((error) => {
        // Network failures and non-JSON responses land here; surface them instead of failing silently
        console.error('Error:', error);
        showStatus("⚠️ Failed to update settings.");
    });
});
</script>
{% endblock %}

View file

@ -71,6 +71,8 @@
html += render_markdown(query, [item]);
} else if (item.additional.file.endsWith(".pdf")) {
html += render_pdf(query, [item]);
} else if (item.additional.file.includes("notion.so")) {
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
}
});
return html;
@ -86,7 +88,7 @@
results = data.map(render_image).join('');
} else if (type === "pdf") {
results = render_pdf(query, data);
} else if (type === "github" || type === "all") {
} else if (type === "github" || type === "all" || type === "notion") {
results = render_multiple(query, data, type);
} else {
results = data.map((item) => `<div class="results-plugin">` + `<p>${item.entry}</p>` + `</div>`).join("\n")
@ -127,7 +129,7 @@
setQueryFieldInUrl(query);
// Execute Search and Render Results
url = createRequestUrl(query, type, results_count, rerank);
url = createRequestUrl(query, type, results_count || 5, rerank);
fetch(url)
.then(response => response.json())
.then(data => {
@ -347,6 +349,7 @@
white-space: pre-wrap;
}
.results-pdf,
.results-notion,
.results-plugin {
text-align: left;
white-space: pre-line;
@ -404,6 +407,7 @@
div#results-error,
div.results-markdown,
div.results-notion,
div.results-org,
div.results-pdf {
text-align: left;

View file

@ -32,8 +32,7 @@ def summarize(session, model, api_key=None, temperature=0.5, max_tokens=200):
model_name=model,
temperature=temperature,
max_tokens=max_tokens,
frequency_penalty=0.2,
model_kwargs={"stop": ['"""']},
model_kwargs={"stop": ['"""'], "frequency_penalty": 0.2},
openai_api_key=api_key,
)

View file

@ -0,0 +1,243 @@
# Standard Packages
import logging
# External Packages
import requests
# Internal Packages
from khoj.utils.helpers import timer
from khoj.utils.rawconfig import Entry, NotionContentConfig
from khoj.processor.text_to_jsonl import TextToJsonl
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
from khoj.utils.rawconfig import Entry
from enum import Enum
logger = logging.getLogger(__name__)
class NotionBlockType(Enum):
    """Block type identifiers used when parsing Notion content.

    Each member's value is the string compared against a block's "type" field.
    """

    # Text-bearing blocks
    PARAGRAPH = "paragraph"
    HEADING_1 = "heading_1"
    HEADING_2 = "heading_2"
    HEADING_3 = "heading_3"
    BULLETED_LIST_ITEM = "bulleted_list_item"
    NUMBERED_LIST_ITEM = "numbered_list_item"
    TO_DO = "to_do"
    TOGGLE = "toggle"
    CHILD_PAGE = "child_page"

    # Remaining block types
    UNSUPPORTED = "unsupported"
    BOOKMARK = "bookmark"
    DIVIDER = "divider"
    PDF = "pdf"
    IMAGE = "image"
    EMBED = "embed"
    VIDEO = "video"
    FILE = "file"
    SYNCED_BLOCK = "synced_block"
    TABLE_OF_CONTENTS = "table_of_contents"
    COLUMN = "column"
    EQUATION = "equation"
    LINK_PREVIEW = "link_preview"
    COLUMN_LIST = "column_list"
    QUOTE = "quote"
    BREADCRUMB = "breadcrumb"
    LINK_TO_PAGE = "link_to_page"
    CHILD_DATABASE = "child_database"
    TEMPLATE = "template"
    CALLOUT = "callout"
class NotionToJsonl(TextToJsonl):
    """Extract text from the Notion pages visible to the configured API token and write it as JSONL entries."""

    def __init__(self, config: NotionContentConfig):
        """Create an HTTP session authenticated with the token from the Notion content config."""
        super().__init__(config)
        self.config = config
        self.session = requests.Session()
        self.session.headers.update({"Authorization": f"Bearer {config.token}", "Notion-Version": "2022-02-22"})
        # Types for which process_text returns an empty string
        self.unsupported_block_types = [
            NotionBlockType.BOOKMARK.value,
            NotionBlockType.DIVIDER.value,
            NotionBlockType.CHILD_DATABASE.value,
            NotionBlockType.TEMPLATE.value,
            NotionBlockType.CALLOUT.value,
            NotionBlockType.UNSUPPORTED.value,
        ]
        # Types whose text process_text wraps in newlines so it renders on its own line
        self.display_block_block_types = [
            NotionBlockType.PARAGRAPH.value,
            NotionBlockType.HEADING_1.value,
            NotionBlockType.HEADING_2.value,
            NotionBlockType.HEADING_3.value,
            NotionBlockType.BULLETED_LIST_ITEM.value,
            NotionBlockType.NUMBERED_LIST_ITEM.value,
            NotionBlockType.TO_DO.value,
            NotionBlockType.TOGGLE.value,
            NotionBlockType.CHILD_PAGE.value,
            NotionBlockType.BOOKMARK.value,
            NotionBlockType.DIVIDER.value,
        ]

    def process(self, previous_entries=None):
        """Fetch all pages returned by the search endpoint, convert them to entries and write them to JSONL.

        Args:
            previous_entries: Entries from an earlier run, used to mark which entries are new.

        Returns:
            The (id, entry) pairs produced by update_entries_with_ids.
        """
        current_entries = []

        # Get all pages
        with timer("Getting all pages via search endpoint", logger=logger):
            responses = []
            next_cursor = None
            while True:
                # The cursor goes in the request body. Storing it in session.params would
                # also attach it to every later request made with this session.
                search_body = {"page_size": 100}
                if next_cursor:
                    search_body["start_cursor"] = next_cursor
                result = self.session.post("https://api.notion.com/v1/search", json=search_body).json()

                if "results" not in result:
                    # Not a list response (e.g. an error payload); stop paginating
                    logger.error(f"Unexpected response from Notion search endpoint: {result}")
                    break
                responses.append(result)

                next_cursor = result.get("next_cursor")
                if not result.get("has_more") or not next_cursor:
                    break

        for response in responses:
            with timer("Processing response", logger=logger):
                pages_or_databases = response["results"]

                # Get all pages content
                for p_or_d in pages_or_databases:
                    with timer(f"Processing {p_or_d['object']} {p_or_d['id']}", logger=logger):
                        if p_or_d["object"] == "database":
                            # TODO: Handle databases
                            continue
                        elif p_or_d["object"] == "page":
                            page_entries = self.process_page(p_or_d)
                            current_entries.extend(page_entries)

        return self.update_entries_with_ids(current_entries, previous_entries)

    def process_page(self, page):
        """Convert one page into entries, one per text-bearing non-heading block.

        Heading blocks produce no entry of their own. The most recent heading text is
        prepended to each following block for context.

        Returns:
            A list of Entry objects; empty when the page title or content could not be retrieved.
        """
        page_id = page["id"]
        title, content = self.get_page_content(page_id)

        if title is None or content is None:
            return []

        current_entries = []
        curr_heading = ""
        for block in content.get("results", []):
            block_type = block.get("type")
            if block_type is None:
                continue

            block_data = block[block_type]
            if not block_data.get("rich_text"):
                # There's no text to handle here.
                continue

            if block_type in ["heading_1", "heading_2", "heading_3"]:
                # Remember the heading; it is attached to the blocks that follow it
                curr_heading = block_data["rich_text"][0]["plain_text"]
                continue

            raw_content = ""
            if curr_heading != "":
                # Add the last known heading to the content for additional context
                raw_content = self.process_heading(curr_heading)
            for text in block_data["rich_text"]:
                raw_content += self.process_text(text)

            if block.get("has_children", False):
                raw_content += "\n"
                raw_content = self.process_nested_children(
                    self.get_block_children(block["id"]), raw_content, block_type
                )

            if raw_content != "":
                current_entries.append(
                    Entry(
                        compiled=raw_content,
                        raw=raw_content,
                        heading=title,
                        file=page["url"],
                    )
                )
        return current_entries

    def process_heading(self, heading):
        """Render heading text in bold on its own line."""
        return f"\n<b>{heading}</b>\n"

    def process_nested_children(self, children, raw_content, block_type=None):
        """Append the text of every nested child block, depth first, to raw_content.

        Args:
            children: Response from the block children endpoint.
            raw_content: Text accumulated so far.
            block_type: Type of the top-level block, forwarded to process_text.

        Returns:
            raw_content extended with the text of all descendants.
        """
        for child in children.get("results", []):
            child_type = child.get("type")
            if child_type is None:
                continue

            child_data = child[child_type]
            for text in child_data.get("rich_text") or []:
                raw_content += self.process_text(text, block_type)

            # has_children is read from the block itself, not its type-specific payload.
            # Accumulate instead of returning so the remaining siblings are still processed.
            if child.get("has_children", False):
                raw_content = self.process_nested_children(
                    self.get_block_children(child["id"]), raw_content, block_type
                )

        return raw_content

    def process_text(self, text, block_type=None):
        """Render one rich text fragment as a link, a standalone line, or plain text."""
        text_type = text.get("type", None)
        if text_type in self.unsupported_block_types:
            return ""
        if text.get("href", None):
            return f"<a href='{text['href']}'>{text['plain_text']}</a>"
        raw_text = text["plain_text"]
        if text_type in self.display_block_block_types or block_type in self.display_block_block_types:
            return f"\n{raw_text}\n"
        return raw_text

    def get_block_children(self, block_id):
        """Fetch the children of a block as parsed JSON.

        NOTE(review): a single request is made; if the API paginates children, later pages are not fetched.
        """
        return self.session.get(f"https://api.notion.com/v1/blocks/{block_id}/children").json()

    def get_page(self, page_id):
        """Fetch a page object as parsed JSON."""
        return self.session.get(f"https://api.notion.com/v1/pages/{page_id}").json()

    def get_page_children(self, page_id):
        """Fetch the top-level blocks of a page as parsed JSON."""
        return self.session.get(f"https://api.notion.com/v1/blocks/{page_id}/children").json()

    def get_page_content(self, page_id):
        """Fetch a page's title and its top-level blocks.

        Returns:
            (title, content), or (None, None) when the page cannot be fetched or has no usable title.
        """
        try:
            page = self.get_page(page_id)
            content = self.get_page_children(page_id)
        except Exception as e:
            logger.error(f"Error getting page {page_id}: {e}")
            return None, None

        title = self._extract_title(page)
        if title is None:
            # Skip the page instead of letting one bad page abort the whole indexing run
            logger.warning(f"Skipping page {page_id}: no title found")
            return None, None
        return title, content

    @staticmethod
    def _extract_title(page):
        """Return the page title as plain text, or None when the page has no non-empty title property."""
        properties = page.get("properties") or {}

        # Prefer the conventional property names, then fall back to any property of type "title"
        candidates = [properties.get("Title"), properties.get("title")]
        candidates += [
            prop for prop in properties.values() if isinstance(prop, dict) and prop.get("type") == "title"
        ]

        for prop in candidates:
            if not isinstance(prop, dict):
                continue
            fragments = prop.get("title") or []
            # A title can be split into several rich text fragments; join them all
            title = "".join(fragment.get("plain_text", "") for fragment in fragments)
            if title:
                return title
        return None

    def update_entries_with_ids(self, current_entries, previous_entries):
        """Pair entries with ids, marking new ones against previous_entries, and write them to the JSONL file.

        Returns:
            A list of (id, entry) tuples.
        """
        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = TextToJsonl.mark_entries_for_update(
                    current_entries, previous_entries, key="compiled", logger=logger
                )

        with timer("Write Notion entries to JSONL file", logger):
            # Process Each Entry from all Notion entries
            entries = [entry for _, entry in entries_with_ids]
            jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)

            # Compress JSONL formatted Data
            if self.config.compressed_jsonl.suffix == ".gz":
                compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
            elif self.config.compressed_jsonl.suffix == ".jsonl":
                dump_jsonl(jsonl_data, self.config.compressed_jsonl)

        return entries_with_ids

View file

@ -62,7 +62,7 @@ class TextToJsonl(ABC):
@staticmethod
def mark_entries_for_update(
current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None
current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger: logging.Logger = None
) -> List[Tuple[int, Entry]]:
# Hash all current and previous entries to identify new entries
with timer("Hash previous, current entries", logger):
@ -90,3 +90,8 @@ class TextToJsonl(ABC):
entries_with_ids = existing_entries_sorted + new_entries
return entries_with_ids
@staticmethod
def convert_text_maps_to_jsonl(entries: List[Entry]) -> str:
    """Serialize entries to a JSONL string: one JSON document per entry, each followed by a newline."""
    jsonl_lines = (f"{entry.to_json()}\n" for entry in entries)
    return "".join(jsonl_lines)

View file

@ -28,6 +28,7 @@ from khoj.utils.rawconfig import (
TextContentConfig,
ConversationProcessorConfig,
GithubContentConfig,
NotionContentConfig,
)
from khoj.utils.state import SearchType
from khoj.utils import state, constants
@ -45,6 +46,11 @@ logger = logging.getLogger(__name__)
# If it's a demo instance, prevent updating any of the configuration.
if not state.demo:
def _initialize_config():
    """Create the app config with the default search-type settings when no config exists yet."""
    if state.config is not None:
        # A config is already present; leave it untouched
        return

    fresh_config = FullConfig()
    state.config = fresh_config
    fresh_config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
@api.get("/config/data", response_model=FullConfig)
def get_config_data():
return state.config
@ -59,9 +65,7 @@ if not state.demo:
@api.post("/config/data/content_type/github", status_code=200)
async def set_content_config_github_data(updated_config: Union[GithubContentConfig, None]):
if not state.config:
state.config = FullConfig()
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
_initialize_config()
if not state.config.content_type:
state.config.content_type = ContentConfig(**{"github": updated_config})
@ -74,6 +78,21 @@ if not state.demo:
except Exception as e:
return {"status": "error", "message": str(e)}
@api.post("/config/data/content_type/notion", status_code=200)
async def set_content_config_notion_data(updated_config: Union[NotionContentConfig, None]):
    """Store the submitted Notion content config in app state and save the config.

    Returns {"status": "ok"} on success, or {"status": "error", "message": ...} when saving raises.
    """
    _initialize_config()

    if state.config.content_type:
        state.config.content_type.notion = updated_config
    else:
        # No content config yet: create one that holds only the Notion settings
        state.config.content_type = ContentConfig(**{"notion": updated_config})

    try:
        save_config_to_file_updated_state()
    except Exception as e:
        return {"status": "error", "message": str(e)}
    return {"status": "ok"}
@api.post("/delete/config/data/content_type/{content_type}", status_code=200)
async def remove_content_config_data(content_type: str):
if not state.config or not state.config.content_type:
@ -84,6 +103,8 @@ if not state.demo:
if content_type == "github":
state.model.github_search = None
elif content_type == "notion":
state.model.notion_search = None
elif content_type == "plugins":
state.model.plugin_search = None
elif content_type == "pdf":
@ -114,9 +135,7 @@ if not state.demo:
@api.post("/config/data/content_type/{content_type}", status_code=200)
async def set_content_config_data(content_type: str, updated_config: Union[TextContentConfig, None]):
if not state.config:
state.config = FullConfig()
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
_initialize_config()
if not state.config.content_type:
state.config.content_type = ContentConfig(**{content_type: updated_config})
@ -131,9 +150,8 @@ if not state.demo:
@api.post("/config/data/processor/conversation", status_code=200)
async def set_processor_conversation_config_data(updated_config: Union[ConversationProcessorConfig, None]):
if not state.config:
state.config = FullConfig()
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
_initialize_config()
state.config.processor = ProcessorConfig(conversation=updated_config)
state.processor_config = configure_processor(state.config.processor)
try:
@ -312,6 +330,20 @@ async def search(
)
]
if (t == SearchType.Notion or t == SearchType.All) and state.model.notion_search:
# query notion pages
search_futures += [
executor.submit(
text_search.query,
user_query,
state.model.notion_search,
question_embedding=encoded_asymmetric_query,
rank_results=r or False,
score_threshold=score_threshold,
dedupe=dedupe or True,
)
]
# Query across each requested content types in parallel
with timer("Query took", logger):
for search_future in concurrent.futures.as_completed(search_futures):

View file

@ -63,6 +63,28 @@ if not state.demo:
"content_type_github_input.html", context={"request": request, "current_config": current_config}
)
@web_client.get("/config/content_type/notion", response_class=HTMLResponse)
def notion_config_page(request: Request):
    """Render the Notion settings page, pre-filled with the saved Notion config or the default output paths."""
    notion_defaults = constants.default_config.copy()["content-type"]["notion"]  # type: ignore
    fallback_config = TextContentConfig(
        compressed_jsonl=notion_defaults["compressed-jsonl"],
        embeddings_file=notion_defaults["embeddings-file"],
    )

    # Use the saved Notion config only when every level of the config chain is set
    has_saved_config = state.config and state.config.content_type and state.config.content_type.notion
    selected_config = state.config.content_type.notion if has_saved_config else fallback_config

    return templates.TemplateResponse(
        "content_type_notion_input.html",
        context={"request": request, "current_config": json.loads(selected_config.json())},
    )
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
def content_config_page(request: Request, content_type: str):
if content_type not in VALID_TEXT_CONTENT_TYPES:

View file

@ -15,7 +15,7 @@ from khoj.utils import state
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model, timer
from khoj.utils.config import TextSearchModel
from khoj.utils.models import BaseEncoder
from khoj.utils.rawconfig import SearchResponse, TextSearchConfig, TextContentConfig, Entry
from khoj.utils.rawconfig import SearchResponse, TextSearchConfig, TextConfigBase, Entry
from khoj.utils.jsonl import load_jsonl
@ -159,7 +159,11 @@ def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]
{
"entry": entries[hit["corpus_id"]].raw,
"score": f"{hit.get('cross-score') or hit.get('score')}",
"additional": {"file": entries[hit["corpus_id"]].file, "compiled": entries[hit["corpus_id"]].compiled},
"additional": {
"file": entries[hit["corpus_id"]].file,
"compiled": entries[hit["corpus_id"]].compiled,
"heading": entries[hit["corpus_id"]].heading,
},
}
)
for hit in hits[0:count]
@ -168,7 +172,7 @@ def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]
def setup(
text_to_jsonl: Type[TextToJsonl],
config: TextContentConfig,
config: TextConfigBase,
search_config: TextSearchConfig,
regenerate: bool,
filters: List[BaseFilter] = [],
@ -186,7 +190,8 @@ def setup(
# Extract Updated Entries
entries = extract_entries(config.compressed_jsonl)
if is_none_or_empty(entries):
raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}")
config_params = ", ".join([f"{key}={value}" for key, value in config.dict().items()])
raise ValueError(f"No valid entries found in specified files: {config_params}")
top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus
# Compute or Load Embeddings

View file

@ -3,7 +3,7 @@ from __future__ import annotations # to avoid quoting type hints
from enum import Enum
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List
from typing import TYPE_CHECKING, Dict, List, Union
# External Packages
import torch
@ -23,6 +23,7 @@ class SearchType(str, Enum):
Image = "image"
Pdf = "pdf"
Github = "github"
Notion = "notion"
class ProcessorType(str, Enum):
@ -58,12 +59,13 @@ class ImageSearchModel:
@dataclass
class SearchModels:
org_search: TextSearchModel = None
markdown_search: TextSearchModel = None
pdf_search: TextSearchModel = None
image_search: ImageSearchModel = None
github_search: TextSearchModel = None
plugin_search: Dict[str, TextSearchModel] = None
org_search: Union[TextSearchModel, None] = None
markdown_search: Union[TextSearchModel, None] = None
pdf_search: Union[TextSearchModel, None] = None
image_search: Union[ImageSearchModel, None] = None
github_search: Union[TextSearchModel, None] = None
notion_search: Union[TextSearchModel, None] = None
plugin_search: Union[Dict[str, TextSearchModel], None] = None
class ConversationProcessorConfigModel:
@ -78,4 +80,4 @@ class ConversationProcessorConfigModel:
@dataclass
class ProcessorConfigModel:
conversation: ConversationProcessorConfigModel = None
conversation: Union[ConversationProcessorConfigModel, None] = None

View file

@ -41,6 +41,11 @@ default_config = {
"compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz",
"embeddings-file": "~/.khoj/content/github/github_embeddings.pt",
},
"notion": {
"token": None,
"compressed-jsonl": "~/.khoj/content/notion/notion.jsonl.gz",
"embeddings-file": "~/.khoj/content/notion/notion_embeddings.pt",
},
},
"search-type": {
"symmetric": {

View file

@ -52,6 +52,10 @@ class GithubContentConfig(TextConfigBase):
repos: List[GithubRepoConfig]
# Content config for the Notion source: the TextConfigBase fields plus the API token.
class NotionContentConfig(TextConfigBase):
    # Sent as the Bearer credential in the Authorization header by NotionToJsonl
    token: str
class ImageContentConfig(ConfigBase):
input_directories: Optional[List[Path]]
input_filter: Optional[List[str]]
@ -77,6 +81,7 @@ class ContentConfig(ConfigBase):
pdf: Optional[TextContentConfig]
github: Optional[GithubContentConfig]
plugins: Optional[Dict[str, TextContentConfig]]
notion: Optional[NotionContentConfig]
class TextSearchConfig(ConfigBase):
@ -148,4 +153,9 @@ class Entry:
@classmethod
def from_dict(cls, dictionary: dict):
return cls(raw=dictionary["raw"], compiled=dictionary["compiled"], file=dictionary.get("file", None))
return cls(
raw=dictionary["raw"],
compiled=dictionary["compiled"],
file=dictionary.get("file", None),
heading=dictionary.get("heading", None),
)