mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 23:48:56 +01:00
Add a plugin which allows users to index their Notion pages (#284)
* For the demo instance, re-instate the scheduler, but infrequently for api updates - In constants, determine the cadence based on whether it's a demo instance or not - This allows us to collect telemetry again. This will also allow us to save the chat session * Conditionally skip updating the index altogether if it's a demo instance * Add backend support for Notion data parsing - Add a NotionToJsonl class which parses the text of Notion documents made accessible to the API token - Make corresponding updates to the default config, raw config to support the new notion addition * Add corresponding views to support configuring Notion from the web-based settings page - Support backend APIs for deleting/configuring notion setup as well - Streamline some of the index updating code * Use defaults for search and chat queries results count * Update pagination of retrieving pages from Notion * Update state conversation processor when update is hit * frequency_penalty should be passed to gpt through kwargs * Add check for notion in render_multiple method * Add headings to Notion render * Revert results count slider and split Notion files by blocks * Clean/fix misc things in the function to update index - Use the successText and errorText variables appropriately - Name parameters in function calls - Add emojis, woohoo * Clean up and further modularize code for processing data in Notion
This commit is contained in:
parent
77755c0284
commit
62704cac09
15 changed files with 520 additions and 64 deletions
|
@ -17,6 +17,7 @@ from khoj.processor.markdown.markdown_to_jsonl import MarkdownToJsonl
|
|||
from khoj.processor.org_mode.org_to_jsonl import OrgToJsonl
|
||||
from khoj.processor.pdf.pdf_to_jsonl import PdfToJsonl
|
||||
from khoj.processor.github.github_to_jsonl import GithubToJsonl
|
||||
from khoj.processor.notion.notion_to_jsonl import NotionToJsonl
|
||||
from khoj.search_type import image_search, text_search
|
||||
from khoj.utils import constants, state
|
||||
from khoj.utils.config import SearchType, SearchModels, ProcessorConfigModel, ConversationProcessorConfigModel
|
||||
|
@ -169,6 +170,18 @@ def configure_search(model: SearchModels, config: FullConfig, regenerate: bool,
|
|||
regenerate=regenerate,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
# Initialize Notion Search
|
||||
if (t == None or t in state.SearchType) and config.content_type.notion:
|
||||
logger.info("🔌 Setting up search for notion")
|
||||
model.notion_search = text_search.setup(
|
||||
NotionToJsonl,
|
||||
config.content_type.notion,
|
||||
search_config=config.search_type.asymmetric,
|
||||
regenerate=regenerate,
|
||||
filters=[DateFilter(), WordFilter(), FileFilter()],
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("🚨 Failed to setup search")
|
||||
raise e
|
||||
|
@ -248,7 +261,7 @@ def save_chat_session():
|
|||
|
||||
@schedule.repeat(schedule.every(59).minutes)
|
||||
def upload_telemetry():
|
||||
if not state.config or not state.config.app.should_log_telemetry or not state.telemetry:
|
||||
if not state.config or not state.config.app or not state.config.app.should_log_telemetry or not state.telemetry:
|
||||
message = "📡 No telemetry to upload" if not state.telemetry else "📡 Telemetry logging disabled"
|
||||
logger.debug(message)
|
||||
return
|
||||
|
|
4
src/khoj/interface/web/assets/icons/notion.svg
Normal file
4
src/khoj/interface/web/assets/icons/notion.svg
Normal file
|
@ -0,0 +1,4 @@
|
|||
<svg width="100" height="100" viewBox="0 0 100 100" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M6.017 4.313l55.333 -4.087c6.797 -0.583 8.543 -0.19 12.817 2.917l17.663 12.443c2.913 2.14 3.883 2.723 3.883 5.053v68.243c0 4.277 -1.553 6.807 -6.99 7.193L24.467 99.967c-4.08 0.193 -6.023 -0.39 -8.16 -3.113L3.3 79.94c-2.333 -3.113 -3.3 -5.443 -3.3 -8.167V11.113c0 -3.497 1.553 -6.413 6.017 -6.8z" fill="#fff"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M61.35 0.227l-55.333 4.087C1.553 4.7 0 7.617 0 11.113v60.66c0 2.723 0.967 5.053 3.3 8.167l13.007 16.913c2.137 2.723 4.08 3.307 8.16 3.113l64.257 -3.89c5.433 -0.387 6.99 -2.917 6.99 -7.193V20.64c0 -2.21 -0.873 -2.847 -3.443 -4.733L74.167 3.143c-4.273 -3.107 -6.02 -3.5 -12.817 -2.917zM25.92 19.523c-5.247 0.353 -6.437 0.433 -9.417 -1.99L8.927 11.507c-0.77 -0.78 -0.383 -1.753 1.557 -1.947l53.193 -3.887c4.467 -0.39 6.793 1.167 8.54 2.527l9.123 6.61c0.39 0.197 1.36 1.36 0.193 1.36l-54.933 3.307 -0.68 0.047zM19.803 88.3V30.367c0 -2.53 0.777 -3.697 3.103 -3.893L86 22.78c2.14 -0.193 3.107 1.167 3.107 3.693v57.547c0 2.53 -0.39 4.67 -3.883 4.863l-60.377 3.5c-3.493 0.193 -5.043 -0.97 -5.043 -4.083zm59.6 -54.827c0.387 1.75 0 3.5 -1.75 3.7l-2.91 0.577v42.773c-2.527 1.36 -4.853 2.137 -6.797 2.137 -3.107 0 -3.883 -0.973 -6.21 -3.887l-19.03 -29.94v28.967l6.02 1.363s0 3.5 -4.857 3.5l-13.39 0.777c-0.39 -0.78 0 -2.723 1.357 -3.11l3.497 -0.97v-38.3L30.48 40.667c-0.39 -1.75 0.58 -4.277 3.3 -4.473l14.367 -0.967 19.8 30.327v-26.83l-5.047 -0.58c-0.39 -2.143 1.163 -3.7 3.103 -3.89l13.4 -0.78z" fill="#000"/>
|
||||
</svg>
|
After Width: | Height: | Size: 1.5 KiB |
|
@ -93,6 +93,7 @@
|
|||
|
||||
// Decode message chunk from stream
|
||||
const chunk = decoder.decode(value, { stream: true });
|
||||
|
||||
if (chunk.includes("### compiled references:")) {
|
||||
const additionalResponse = chunk.split("### compiled references:")[0];
|
||||
new_response_text.innerHTML += additionalResponse;
|
||||
|
|
|
@ -14,7 +14,6 @@
|
|||
<img id="configured-icon-github" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
|
||||
{% endif %}
|
||||
</h3>
|
||||
|
||||
</div>
|
||||
<div class="card-description-row">
|
||||
<p class="card-description">Set repositories for Khoj to index</p>
|
||||
|
@ -37,6 +36,37 @@
|
|||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="card-title-row">
|
||||
<img class="card-icon" src="/static/assets/icons/notion.svg" alt="Notion">
|
||||
<h3 class="card-title">
|
||||
Notion
|
||||
{% if current_config.content_type.notion %}
|
||||
<img id="configured-icon-notion" class="configured-icon" src="/static/assets/icons/confirm-icon.svg" alt="Configured">
|
||||
{% endif %}
|
||||
</h3>
|
||||
</div>
|
||||
<div class="card-description-row">
|
||||
<p class="card-description">Configure your settings from Notion</p>
|
||||
</div>
|
||||
<div class="card-action-row">
|
||||
<a class="card-button" href="/config/content_type/notion">
|
||||
{% if current_config.content_type.content %}
|
||||
Update
|
||||
{% else %}
|
||||
Setup
|
||||
{% endif %}
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M5 12h14M12 5l7 7-7 7"></path></svg>
|
||||
</a>
|
||||
</div>
|
||||
{% if current_config.content_type.notion %}
|
||||
<div id="clear-notion" class="card-action-row">
|
||||
<button class="card-button" onclick="clearContentType('notion')">
|
||||
Disable
|
||||
</button>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="card">
|
||||
<div class="card-title-row">
|
||||
<img class="card-icon" src="/static/assets/icons/markdown.svg" alt="markdown">
|
||||
|
@ -224,40 +254,32 @@
|
|||
var configure = document.getElementById("configure");
|
||||
configure.addEventListener("click", function(event) {
|
||||
event.preventDefault();
|
||||
configure.disabled = true;
|
||||
configure.innerHTML = "Configuring...";
|
||||
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
||||
fetch('/api/update?&client=web', {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': csrfToken
|
||||
}
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
console.log('Success:', data);
|
||||
document.getElementById("status").innerHTML = "Configured successfully!";
|
||||
document.getElementById("status").style.display = "block";
|
||||
configure.disabled = false;
|
||||
configure.innerHTML = "⚙️ Configured";
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error('Error:', error);
|
||||
document.getElementById("status").innerHTML = "Unable to save configuration. Raise issue on Khoj Discord or Github.";
|
||||
document.getElementById("status").style.display = "block";
|
||||
configure.disabled = false;
|
||||
configure.innerHTML = "⚙️ Configure";
|
||||
});
|
||||
updateIndex(
|
||||
force=false,
|
||||
successText="Configured successfully!",
|
||||
errorText="Unable to configure. Raise issue on Khoj <a href='https://github.com/khoj-ai/khoj/issues'>Github</a> or <a href='https://discord.gg/BDgyabRM6e'>Discord</a>.",
|
||||
button=configure,
|
||||
loadingText="Configuring...",
|
||||
emoji="⚙️");
|
||||
});
|
||||
|
||||
var reinitialize = document.getElementById("reinitialize");
|
||||
reinitialize.addEventListener("click", function(event) {
|
||||
event.preventDefault();
|
||||
reinitialize.disabled = true;
|
||||
reinitialize.innerHTML = "Reinitializing...";
|
||||
updateIndex(
|
||||
force=true,
|
||||
successText="Reinitialized successfully!",
|
||||
errorText="Unable to reinitialize. Raise issue on Khoj <a href='https://github.com/khoj-ai/khoj/issues'>Github</a> or <a href='https://discord.gg/BDgyabRM6e'>Discord</a>.",
|
||||
button=reinitialize,
|
||||
loadingText="Reinitializing...",
|
||||
emoji="🔄");
|
||||
});
|
||||
|
||||
function updateIndex(force, successText, errorText, button, loadingText, emoji) {
|
||||
const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
|
||||
fetch('/api/update?&client=web&force=True', {
|
||||
button.disabled = true;
|
||||
button.innerHTML = emoji + loadingText;
|
||||
fetch('/api/update?&client=web&force=' + force, {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
|
@ -267,19 +289,22 @@
|
|||
.then(response => response.json())
|
||||
.then(data => {
|
||||
console.log('Success:', data);
|
||||
document.getElementById("status").innerHTML = "Reinitialized successfully!";
|
||||
if (data.detail != null) {
|
||||
throw new Error(data.detail);
|
||||
}
|
||||
document.getElementById("status").innerHTML = emoji + successText;
|
||||
document.getElementById("status").style.display = "block";
|
||||
reinitialize.disabled = false;
|
||||
reinitialize.innerHTML = "🔄 Reinitialized";
|
||||
button.disabled = false;
|
||||
button.innerHTML = '✅ Done!';
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error('Error:', error);
|
||||
document.getElementById("status").innerHTML = "Unable to reinitialize. Raise issue on Khoj Discord or Github.";
|
||||
document.getElementById("status").innerHTML = emoji + errorText
|
||||
document.getElementById("status").style.display = "block";
|
||||
reinitialize.disabled = false;
|
||||
reinitialize.innerHTML = "🔄 Reinitialize";
|
||||
button.disabled = false;
|
||||
button.innerHTML = '⚠️ Unsuccessful';
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Setup the results count slider
|
||||
const resultsCountSlider = document.getElementById('results-count-slider');
|
||||
|
|
86
src/khoj/interface/web/content_type_notion_input.html
Normal file
86
src/khoj/interface/web/content_type_notion_input.html
Normal file
|
@ -0,0 +1,86 @@
|
|||
{% extends "base_config.html" %}
{% block content %}
<div class="page">
    <div class="section">
        <h2 class="section-title">
            <img class="card-icon" src="/static/assets/icons/notion.svg" alt="Notion">
            <span class="card-title-text">Notion</span>
        </h2>
        <form>
            <table>
                <tr>
                    <td>
                        <label for="token">Token</label>
                    </td>
                    <td>
                        <!-- Fix: name was "pat", left over from the Github config template.
                             The form is never natively submitted (JS intercepts below), so
                             renaming to "token" is safe and keeps the markup consistent. -->
                        <input type="text" id="token" name="token" value="{{ current_config['token'] }}">
                    </td>
                </tr>
            </table>
            <!-- Output file locations are pre-filled with defaults and hidden from the user -->
            <table style="display: none;" >
                <tr>
                    <td>
                        <label for="compressed-jsonl">Compressed JSONL (Output)</label>
                    </td>
                    <td>
                        <input type="text" id="compressed-jsonl" name="compressed-jsonl" value="{{ current_config['compressed_jsonl'] }}">
                    </td>
                </tr>
                <tr>
                    <td>
                        <label for="embeddings-file">Embeddings File (Output)</label>
                    </td>
                    <td>
                        <input type="text" id="embeddings-file" name="embeddings-file" value="{{ current_config['embeddings_file'] }}">
                    </td>
                </tr>
            </table>
            <div class="section">
                <div id="success" style="display: none;"></div>
                <button id="submit" type="submit">Save</button>
            </div>
        </form>
    </div>
</div>
<script>
    const submit = document.getElementById("submit");

    // Intercept the form submission and POST the Notion config as JSON instead
    submit.addEventListener("click", function(event) {
        event.preventDefault();

        const compressed_jsonl = document.getElementById("compressed-jsonl").value;
        const embeddings_file = document.getElementById("embeddings-file").value;
        const token = document.getElementById("token").value;

        if (token == "") {
            document.getElementById("success").innerHTML = "❌ Please enter a Notion Token.";
            document.getElementById("success").style.display = "block";
            return;
        }

        const csrfToken = document.cookie.split('; ').find(row => row.startsWith('csrftoken'))?.split('=')[1];
        fetch('/api/config/data/content_type/notion', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'X-CSRFToken': csrfToken,
            },
            body: JSON.stringify({
                "token": token,
                "compressed_jsonl": compressed_jsonl,
                "embeddings_file": embeddings_file,
            })
        })
        .then(response => response.json())
        .then(data => {
            if (data["status"] == "ok") {
                document.getElementById("success").innerHTML = "✅ Successfully updated. Go to your <a href='/config'>settings page</a> to complete setup.";
                document.getElementById("success").style.display = "block";
            } else {
                document.getElementById("success").innerHTML = "⚠️ Failed to update settings.";
                document.getElementById("success").style.display = "block";
            }
        })
        // Fix: surface network/JSON-parse failures instead of failing silently
        // (the original chain had no .catch, leaving the user with no feedback)
        .catch(error => {
            console.error('Error:', error);
            document.getElementById("success").innerHTML = "⚠️ Failed to update settings.";
            document.getElementById("success").style.display = "block";
        });
    });
</script>
{% endblock %}
|
|
@ -71,6 +71,8 @@
|
|||
html += render_markdown(query, [item]);
|
||||
} else if (item.additional.file.endsWith(".pdf")) {
|
||||
html += render_pdf(query, [item]);
|
||||
} else if (item.additional.file.includes("notion.so")) {
|
||||
html += `<div class="results-notion">` + `<b><a href="${item.additional.file}">${item.additional.heading}</a></b>` + `<p>${item.entry}</p>` + `</div>`;
|
||||
}
|
||||
});
|
||||
return html;
|
||||
|
@ -86,7 +88,7 @@
|
|||
results = data.map(render_image).join('');
|
||||
} else if (type === "pdf") {
|
||||
results = render_pdf(query, data);
|
||||
} else if (type === "github" || type === "all") {
|
||||
} else if (type === "github" || type === "all" || type === "notion") {
|
||||
results = render_multiple(query, data, type);
|
||||
} else {
|
||||
results = data.map((item) => `<div class="results-plugin">` + `<p>${item.entry}</p>` + `</div>`).join("\n")
|
||||
|
@ -127,7 +129,7 @@
|
|||
setQueryFieldInUrl(query);
|
||||
|
||||
// Execute Search and Render Results
|
||||
url = createRequestUrl(query, type, results_count, rerank);
|
||||
url = createRequestUrl(query, type, results_count || 5, rerank);
|
||||
fetch(url)
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
|
@ -347,6 +349,7 @@
|
|||
white-space: pre-wrap;
|
||||
}
|
||||
.results-pdf,
|
||||
.results-notion,
|
||||
.results-plugin {
|
||||
text-align: left;
|
||||
white-space: pre-line;
|
||||
|
@ -404,6 +407,7 @@
|
|||
|
||||
div#results-error,
|
||||
div.results-markdown,
|
||||
div.results-notion,
|
||||
div.results-org,
|
||||
div.results-pdf {
|
||||
text-align: left;
|
||||
|
|
|
@ -32,8 +32,7 @@ def summarize(session, model, api_key=None, temperature=0.5, max_tokens=200):
|
|||
model_name=model,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
frequency_penalty=0.2,
|
||||
model_kwargs={"stop": ['"""']},
|
||||
model_kwargs={"stop": ['"""'], "frequency_penalty": 0.2},
|
||||
openai_api_key=api_key,
|
||||
)
|
||||
|
||||
|
|
243
src/khoj/processor/notion/notion_to_jsonl.py
Normal file
243
src/khoj/processor/notion/notion_to_jsonl.py
Normal file
|
@ -0,0 +1,243 @@
|
|||
# Standard Packages
|
||||
import logging
|
||||
|
||||
# External Packages
|
||||
import requests
|
||||
|
||||
# Internal Packages
|
||||
from khoj.utils.helpers import timer
|
||||
from khoj.utils.rawconfig import Entry, NotionContentConfig
|
||||
from khoj.processor.text_to_jsonl import TextToJsonl
|
||||
from khoj.utils.jsonl import dump_jsonl, compress_jsonl_data
|
||||
from khoj.utils.rawconfig import Entry
|
||||
|
||||
from enum import Enum
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NotionBlockType(Enum):
    """Block object types returned by the Notion API (a block's `type` field).

    Membership in the processor's `unsupported_block_types` /
    `display_block_block_types` lists (built from these values) decides
    whether a block's rich text is dropped or rendered on its own line.
    """

    PARAGRAPH = "paragraph"
    HEADING_1 = "heading_1"
    HEADING_2 = "heading_2"
    HEADING_3 = "heading_3"
    BULLETED_LIST_ITEM = "bulleted_list_item"
    NUMBERED_LIST_ITEM = "numbered_list_item"
    TO_DO = "to_do"
    TOGGLE = "toggle"
    CHILD_PAGE = "child_page"
    UNSUPPORTED = "unsupported"
    BOOKMARK = "bookmark"
    DIVIDER = "divider"
    PDF = "pdf"
    IMAGE = "image"
    EMBED = "embed"
    VIDEO = "video"
    FILE = "file"
    SYNCED_BLOCK = "synced_block"
    TABLE_OF_CONTENTS = "table_of_contents"
    COLUMN = "column"
    EQUATION = "equation"
    LINK_PREVIEW = "link_preview"
    COLUMN_LIST = "column_list"
    QUOTE = "quote"
    BREADCRUMB = "breadcrumb"
    LINK_TO_PAGE = "link_to_page"
    CHILD_DATABASE = "child_database"
    TEMPLATE = "template"
    CALLOUT = "callout"
|
||||
|
||||
|
||||
class NotionToJsonl(TextToJsonl):
    """Extract text from Notion pages accessible to the configured API token
    and convert them into JSONL entries for indexing.

    Walks every page returned by the Notion search endpoint, renders each
    text-bearing block (plus its nested children) to a string and emits one
    Entry per block.
    """

    def __init__(self, config: NotionContentConfig):
        super().__init__(config)
        self.config = config
        self.session = requests.Session()
        self.session.headers.update({"Authorization": f"Bearer {config.token}", "Notion-Version": "2022-02-22"})

        # Rich text carried by these block types is dropped entirely
        self.unsupported_block_types = [
            NotionBlockType.BOOKMARK.value,
            NotionBlockType.DIVIDER.value,
            NotionBlockType.CHILD_DATABASE.value,
            NotionBlockType.TEMPLATE.value,
            NotionBlockType.CALLOUT.value,
            NotionBlockType.UNSUPPORTED.value,
        ]

        # Rich text of these block types is rendered on its own line
        self.display_block_block_types = [
            NotionBlockType.PARAGRAPH.value,
            NotionBlockType.HEADING_1.value,
            NotionBlockType.HEADING_2.value,
            NotionBlockType.HEADING_3.value,
            NotionBlockType.BULLETED_LIST_ITEM.value,
            NotionBlockType.NUMBERED_LIST_ITEM.value,
            NotionBlockType.TO_DO.value,
            NotionBlockType.TOGGLE.value,
            NotionBlockType.CHILD_PAGE.value,
            NotionBlockType.BOOKMARK.value,
            NotionBlockType.DIVIDER.value,
        ]

    def process(self, previous_entries=None):
        """Fetch all accessible pages, convert them to entries, then merge with
        previous entries and persist to the configured JSONL file."""
        current_entries = []

        # Page through the search endpoint to collect all accessible pages/databases
        with timer("Getting all pages via search endpoint", logger=logger):
            responses = []
            request_body = {"page_size": 100}

            while True:
                result = self.session.post(
                    "https://api.notion.com/v1/search",
                    json=request_body,
                ).json()
                responses.append(result)
                if not result.get("has_more"):
                    break
                # Fix: the search endpoint expects start_cursor in the request
                # body. The original set it as a session-wide query param, which
                # also leaked onto every subsequent GET made through this session.
                request_body["start_cursor"] = result["next_cursor"]

        for response in responses:
            with timer("Processing response", logger=logger):
                pages_or_databases = response["results"]

                # Get all pages content
                for p_or_d in pages_or_databases:
                    with timer(f"Processing {p_or_d['object']} {p_or_d['id']}", logger=logger):
                        if p_or_d["object"] == "database":
                            # TODO: Handle databases
                            continue
                        elif p_or_d["object"] == "page":
                            page_entries = self.process_page(p_or_d)
                            current_entries.extend(page_entries)

        return self.update_entries_with_ids(current_entries, previous_entries)

    def process_page(self, page):
        """Convert a single Notion page into a list of Entry objects, one per
        text-bearing block. Returns [] when the page cannot be fetched."""
        page_id = page["id"]
        title, content = self.get_page_content(page_id)

        if title is None or content is None:
            return []

        current_entries = []
        curr_heading = ""
        for block in content["results"]:
            block_type = block.get("type")

            if block_type is None:
                continue
            block_data = block[block_type]

            if not block_data.get("rich_text"):
                # There's no text to handle here.
                continue

            raw_content = ""
            if block_type in ["heading_1", "heading_2", "heading_3"]:
                # Remember the heading so subsequent blocks get it prepended for
                # context. (The original also had an entry-flush here that was
                # unreachable — raw_content was always "" at that point — so
                # removing it is behavior-preserving.)
                curr_heading = block_data["rich_text"][0]["plain_text"]
            else:
                if curr_heading != "":
                    # Add the last known heading to the content for additional context
                    raw_content = self.process_heading(curr_heading)
                for text in block_data["rich_text"]:
                    raw_content += self.process_text(text)

            # Fix: default has_children to False. Defaulting to True forced a
            # children-fetch API call for any block missing the field.
            if block.get("has_children", False):
                raw_content += "\n"
                raw_content = self.process_nested_children(
                    self.get_block_children(block["id"]), raw_content, block_type
                )

            if raw_content != "":
                current_entries.append(
                    Entry(
                        compiled=raw_content,
                        raw=raw_content,
                        heading=title,
                        file=page["url"],
                    )
                )
        return current_entries

    def process_heading(self, heading):
        """Render a heading as bold HTML for inclusion in entry content."""
        return f"\n<b>{heading}</b>\n"

    def process_nested_children(self, children, raw_content, block_type=None):
        """Recursively append the rich text of all child blocks to raw_content."""
        for child in children["results"]:
            child_type = child.get("type")
            if child_type is None:
                continue
            child_data = child[child_type]
            if child_data.get("rich_text"):
                for text in child_data["rich_text"]:
                    raw_content += self.process_text(text, block_type)
            # Fix: has_children lives on the block object itself, not on its
            # typed payload (where the key never exists, so the original's True
            # default always recursed). Also do NOT return from inside the loop:
            # the original's early return dropped all remaining sibling children.
            if child.get("has_children", False):
                raw_content = self.process_nested_children(
                    self.get_block_children(child["id"]), raw_content, block_type
                )

        return raw_content

    def process_text(self, text, block_type=None):
        """Render one rich-text object to a string: linkify hrefs, surround
        display-block text with newlines, drop unsupported types."""
        text_type = text.get("type", None)
        if text_type in self.unsupported_block_types:
            return ""
        if text.get("href", None):
            return f"<a href='{text['href']}'>{text['plain_text']}</a>"
        raw_text = text["plain_text"]
        if text_type in self.display_block_block_types or block_type in self.display_block_block_types:
            return f"\n{raw_text}\n"
        return raw_text

    def get_block_children(self, block_id):
        """Fetch the direct children of a block."""
        return self.session.get(f"https://api.notion.com/v1/blocks/{block_id}/children").json()

    def get_page(self, page_id):
        """Fetch a page object (properties, url, ...)."""
        return self.session.get(f"https://api.notion.com/v1/pages/{page_id}").json()

    def get_page_children(self, page_id):
        """Fetch the top-level blocks of a page."""
        return self.session.get(f"https://api.notion.com/v1/blocks/{page_id}/children").json()

    def get_page_content(self, page_id):
        """Return (title, children) for a page, or (None, None) on any failure."""
        try:
            page = self.get_page(page_id)
            content = self.get_page_children(page_id)
        except Exception as e:
            logger.error(f"Error getting page {page_id}: {e}")
            return None, None

        properties = page.get("properties", {})
        title_field = "Title" if "Title" in properties else "title"
        try:
            title = properties[title_field]["title"][0]["text"]["content"]
        except (KeyError, IndexError) as e:
            # Fix: untitled pages have an empty `title` list; the original
            # extracted the title outside the try block and crashed on them.
            logger.error(f"Error getting title of page {page_id}: {e}")
            return None, None
        return title, content

    def update_entries_with_ids(self, current_entries, previous_entries):
        """Mark new/updated entries, write all entries to the configured JSONL
        file (gzipped if it has a .gz suffix) and return them with their ids."""
        # Identify, mark and merge any new entries with previous entries
        with timer("Identify new or updated entries", logger):
            if not previous_entries:
                entries_with_ids = list(enumerate(current_entries))
            else:
                entries_with_ids = TextToJsonl.mark_entries_for_update(
                    current_entries, previous_entries, key="compiled", logger=logger
                )

        with timer("Write Notion entries to JSONL file", logger):
            # Process Each Entry from all Notion entries
            entries = list(map(lambda entry: entry[1], entries_with_ids))
            jsonl_data = TextToJsonl.convert_text_maps_to_jsonl(entries)

            # Compress JSONL formatted Data
            if self.config.compressed_jsonl.suffix == ".gz":
                compress_jsonl_data(jsonl_data, self.config.compressed_jsonl)
            elif self.config.compressed_jsonl.suffix == ".jsonl":
                dump_jsonl(jsonl_data, self.config.compressed_jsonl)

        return entries_with_ids
|
|
@ -62,7 +62,7 @@ class TextToJsonl(ABC):
|
|||
|
||||
@staticmethod
|
||||
def mark_entries_for_update(
|
||||
current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger=None
|
||||
current_entries: List[Entry], previous_entries: List[Entry], key="compiled", logger: logging.Logger = None
|
||||
) -> List[Tuple[int, Entry]]:
|
||||
# Hash all current and previous entries to identify new entries
|
||||
with timer("Hash previous, current entries", logger):
|
||||
|
@ -90,3 +90,8 @@ class TextToJsonl(ABC):
|
|||
entries_with_ids = existing_entries_sorted + new_entries
|
||||
|
||||
return entries_with_ids
|
||||
|
||||
@staticmethod
|
||||
def convert_text_maps_to_jsonl(entries: List[Entry]) -> str:
|
||||
# Convert each entry to JSON and write to JSONL file
|
||||
return "".join([f"{entry.to_json()}\n" for entry in entries])
|
||||
|
|
|
@ -28,6 +28,7 @@ from khoj.utils.rawconfig import (
|
|||
TextContentConfig,
|
||||
ConversationProcessorConfig,
|
||||
GithubContentConfig,
|
||||
NotionContentConfig,
|
||||
)
|
||||
from khoj.utils.state import SearchType
|
||||
from khoj.utils import state, constants
|
||||
|
@ -45,6 +46,11 @@ logger = logging.getLogger(__name__)
|
|||
# If it's a demo instance, prevent updating any of the configuration.
|
||||
if not state.demo:
|
||||
|
||||
def _initialize_config():
|
||||
if state.config is None:
|
||||
state.config = FullConfig()
|
||||
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
|
||||
|
||||
@api.get("/config/data", response_model=FullConfig)
|
||||
def get_config_data():
|
||||
return state.config
|
||||
|
@ -59,9 +65,7 @@ if not state.demo:
|
|||
|
||||
@api.post("/config/data/content_type/github", status_code=200)
|
||||
async def set_content_config_github_data(updated_config: Union[GithubContentConfig, None]):
|
||||
if not state.config:
|
||||
state.config = FullConfig()
|
||||
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
|
||||
_initialize_config()
|
||||
|
||||
if not state.config.content_type:
|
||||
state.config.content_type = ContentConfig(**{"github": updated_config})
|
||||
|
@ -74,6 +78,21 @@ if not state.demo:
|
|||
except Exception as e:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/config/data/content_type/notion", status_code=200)
|
||||
async def set_content_config_notion_data(updated_config: Union[NotionContentConfig, None]):
|
||||
_initialize_config()
|
||||
|
||||
if not state.config.content_type:
|
||||
state.config.content_type = ContentConfig(**{"notion": updated_config})
|
||||
else:
|
||||
state.config.content_type.notion = updated_config
|
||||
|
||||
try:
|
||||
save_config_to_file_updated_state()
|
||||
return {"status": "ok"}
|
||||
except Exception as e:
|
||||
return {"status": "error", "message": str(e)}
|
||||
|
||||
@api.post("/delete/config/data/content_type/{content_type}", status_code=200)
|
||||
async def remove_content_config_data(content_type: str):
|
||||
if not state.config or not state.config.content_type:
|
||||
|
@ -84,6 +103,8 @@ if not state.demo:
|
|||
|
||||
if content_type == "github":
|
||||
state.model.github_search = None
|
||||
elif content_type == "notion":
|
||||
state.model.notion_search = None
|
||||
elif content_type == "plugins":
|
||||
state.model.plugin_search = None
|
||||
elif content_type == "pdf":
|
||||
|
@ -114,9 +135,7 @@ if not state.demo:
|
|||
|
||||
@api.post("/config/data/content_type/{content_type}", status_code=200)
|
||||
async def set_content_config_data(content_type: str, updated_config: Union[TextContentConfig, None]):
|
||||
if not state.config:
|
||||
state.config = FullConfig()
|
||||
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
|
||||
_initialize_config()
|
||||
|
||||
if not state.config.content_type:
|
||||
state.config.content_type = ContentConfig(**{content_type: updated_config})
|
||||
|
@ -131,9 +150,8 @@ if not state.demo:
|
|||
|
||||
@api.post("/config/data/processor/conversation", status_code=200)
|
||||
async def set_processor_conversation_config_data(updated_config: Union[ConversationProcessorConfig, None]):
|
||||
if not state.config:
|
||||
state.config = FullConfig()
|
||||
state.config.search_type = SearchConfig.parse_obj(constants.default_config["search-type"])
|
||||
_initialize_config()
|
||||
|
||||
state.config.processor = ProcessorConfig(conversation=updated_config)
|
||||
state.processor_config = configure_processor(state.config.processor)
|
||||
try:
|
||||
|
@ -312,6 +330,20 @@ async def search(
|
|||
)
|
||||
]
|
||||
|
||||
if (t == SearchType.Notion or t == SearchType.All) and state.model.notion_search:
|
||||
# query notion pages
|
||||
search_futures += [
|
||||
executor.submit(
|
||||
text_search.query,
|
||||
user_query,
|
||||
state.model.notion_search,
|
||||
question_embedding=encoded_asymmetric_query,
|
||||
rank_results=r or False,
|
||||
score_threshold=score_threshold,
|
||||
dedupe=dedupe or True,
|
||||
)
|
||||
]
|
||||
|
||||
# Query across each requested content types in parallel
|
||||
with timer("Query took", logger):
|
||||
for search_future in concurrent.futures.as_completed(search_futures):
|
||||
|
|
|
@ -63,6 +63,28 @@ if not state.demo:
|
|||
"content_type_github_input.html", context={"request": request, "current_config": current_config}
|
||||
)
|
||||
|
||||
@web_client.get("/config/content_type/notion", response_class=HTMLResponse)
|
||||
def notion_config_page(request: Request):
|
||||
default_copy = constants.default_config.copy()
|
||||
default_notion = default_copy["content-type"]["notion"] # type: ignore
|
||||
|
||||
default_config = TextContentConfig(
|
||||
compressed_jsonl=default_notion["compressed-jsonl"],
|
||||
embeddings_file=default_notion["embeddings-file"],
|
||||
)
|
||||
|
||||
current_config = (
|
||||
state.config.content_type.notion
|
||||
if state.config and state.config.content_type and state.config.content_type.notion
|
||||
else default_config
|
||||
)
|
||||
|
||||
current_config = json.loads(current_config.json())
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"content_type_notion_input.html", context={"request": request, "current_config": current_config}
|
||||
)
|
||||
|
||||
@web_client.get("/config/content_type/{content_type}", response_class=HTMLResponse)
|
||||
def content_config_page(request: Request, content_type: str):
|
||||
if content_type not in VALID_TEXT_CONTENT_TYPES:
|
||||
|
|
|
@ -15,7 +15,7 @@ from khoj.utils import state
|
|||
from khoj.utils.helpers import get_absolute_path, is_none_or_empty, resolve_absolute_path, load_model, timer
|
||||
from khoj.utils.config import TextSearchModel
|
||||
from khoj.utils.models import BaseEncoder
|
||||
from khoj.utils.rawconfig import SearchResponse, TextSearchConfig, TextContentConfig, Entry
|
||||
from khoj.utils.rawconfig import SearchResponse, TextSearchConfig, TextConfigBase, Entry
|
||||
from khoj.utils.jsonl import load_jsonl
|
||||
|
||||
|
||||
|
@ -159,7 +159,11 @@ def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]
|
|||
{
|
||||
"entry": entries[hit["corpus_id"]].raw,
|
||||
"score": f"{hit.get('cross-score') or hit.get('score')}",
|
||||
"additional": {"file": entries[hit["corpus_id"]].file, "compiled": entries[hit["corpus_id"]].compiled},
|
||||
"additional": {
|
||||
"file": entries[hit["corpus_id"]].file,
|
||||
"compiled": entries[hit["corpus_id"]].compiled,
|
||||
"heading": entries[hit["corpus_id"]].heading,
|
||||
},
|
||||
}
|
||||
)
|
||||
for hit in hits[0:count]
|
||||
|
@ -168,7 +172,7 @@ def collate_results(hits, entries: List[Entry], count=5) -> List[SearchResponse]
|
|||
|
||||
def setup(
|
||||
text_to_jsonl: Type[TextToJsonl],
|
||||
config: TextContentConfig,
|
||||
config: TextConfigBase,
|
||||
search_config: TextSearchConfig,
|
||||
regenerate: bool,
|
||||
filters: List[BaseFilter] = [],
|
||||
|
@ -186,7 +190,8 @@ def setup(
|
|||
# Extract Updated Entries
|
||||
entries = extract_entries(config.compressed_jsonl)
|
||||
if is_none_or_empty(entries):
|
||||
raise ValueError(f"No valid entries found in specified files: {config.input_files} or {config.input_filter}")
|
||||
config_params = ", ".join([f"{key}={value}" for key, value in config.dict().items()])
|
||||
raise ValueError(f"No valid entries found in specified files: {config_params}")
|
||||
top_k = min(len(entries), top_k) # top_k hits can't be more than the total entries in corpus
|
||||
|
||||
# Compute or Load Embeddings
|
||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import annotations # to avoid quoting type hints
|
|||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Dict, List
|
||||
from typing import TYPE_CHECKING, Dict, List, Union
|
||||
|
||||
# External Packages
|
||||
import torch
|
||||
|
@ -23,6 +23,7 @@ class SearchType(str, Enum):
|
|||
Image = "image"
|
||||
Pdf = "pdf"
|
||||
Github = "github"
|
||||
Notion = "notion"
|
||||
|
||||
|
||||
class ProcessorType(str, Enum):
|
||||
|
@ -58,12 +59,13 @@ class ImageSearchModel:
|
|||
|
||||
@dataclass
|
||||
class SearchModels:
|
||||
org_search: TextSearchModel = None
|
||||
markdown_search: TextSearchModel = None
|
||||
pdf_search: TextSearchModel = None
|
||||
image_search: ImageSearchModel = None
|
||||
github_search: TextSearchModel = None
|
||||
plugin_search: Dict[str, TextSearchModel] = None
|
||||
org_search: Union[TextSearchModel, None] = None
|
||||
markdown_search: Union[TextSearchModel, None] = None
|
||||
pdf_search: Union[TextSearchModel, None] = None
|
||||
image_search: Union[ImageSearchModel, None] = None
|
||||
github_search: Union[TextSearchModel, None] = None
|
||||
notion_search: Union[TextSearchModel, None] = None
|
||||
plugin_search: Union[Dict[str, TextSearchModel], None] = None
|
||||
|
||||
|
||||
class ConversationProcessorConfigModel:
|
||||
|
@ -78,4 +80,4 @@ class ConversationProcessorConfigModel:
|
|||
|
||||
@dataclass
|
||||
class ProcessorConfigModel:
|
||||
conversation: ConversationProcessorConfigModel = None
|
||||
conversation: Union[ConversationProcessorConfigModel, None] = None
|
||||
|
|
|
@ -41,6 +41,11 @@ default_config = {
|
|||
"compressed-jsonl": "~/.khoj/content/github/github.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/github/github_embeddings.pt",
|
||||
},
|
||||
"notion": {
|
||||
"token": None,
|
||||
"compressed-jsonl": "~/.khoj/content/notion/notion.jsonl.gz",
|
||||
"embeddings-file": "~/.khoj/content/notion/notion_embeddings.pt",
|
||||
},
|
||||
},
|
||||
"search-type": {
|
||||
"symmetric": {
|
||||
|
|
|
@ -52,6 +52,10 @@ class GithubContentConfig(TextConfigBase):
|
|||
repos: List[GithubRepoConfig]
|
||||
|
||||
|
||||
class NotionContentConfig(TextConfigBase):
|
||||
token: str
|
||||
|
||||
|
||||
class ImageContentConfig(ConfigBase):
|
||||
input_directories: Optional[List[Path]]
|
||||
input_filter: Optional[List[str]]
|
||||
|
@ -77,6 +81,7 @@ class ContentConfig(ConfigBase):
|
|||
pdf: Optional[TextContentConfig]
|
||||
github: Optional[GithubContentConfig]
|
||||
plugins: Optional[Dict[str, TextContentConfig]]
|
||||
notion: Optional[NotionContentConfig]
|
||||
|
||||
|
||||
class TextSearchConfig(ConfigBase):
|
||||
|
@ -148,4 +153,9 @@ class Entry:
|
|||
|
||||
@classmethod
|
||||
def from_dict(cls, dictionary: dict):
|
||||
return cls(raw=dictionary["raw"], compiled=dictionary["compiled"], file=dictionary.get("file", None))
|
||||
return cls(
|
||||
raw=dictionary["raw"],
|
||||
compiled=dictionary["compiled"],
|
||||
file=dictionary.get("file", None),
|
||||
heading=dictionary.get("heading", None),
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue