From 78630603f45fc2136291f2a981e8f3adf1e65b48 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Fri, 8 Nov 2024 17:27:42 -0800 Subject: [PATCH 01/12] Delete the fact checker application --- .../app/factchecker/factChecker.module.css | 172 ----- src/interface/web/app/factchecker/layout.tsx | 33 - src/interface/web/app/factchecker/page.tsx | 676 ------------------ src/khoj/routers/web_client.py | 10 - 4 files changed, 891 deletions(-) delete mode 100644 src/interface/web/app/factchecker/factChecker.module.css delete mode 100644 src/interface/web/app/factchecker/layout.tsx delete mode 100644 src/interface/web/app/factchecker/page.tsx diff --git a/src/interface/web/app/factchecker/factChecker.module.css b/src/interface/web/app/factchecker/factChecker.module.css deleted file mode 100644 index 9882b2a3..00000000 --- a/src/interface/web/app/factchecker/factChecker.module.css +++ /dev/null @@ -1,172 +0,0 @@ -input.factVerification { - width: 100%; - display: block; - padding: 12px 20px; - margin: 8px 0; - border: none; - box-sizing: border-box; - border-radius: 4px; - text-align: left; - margin: auto; - margin-top: 8px; - margin-bottom: 8px; - font-size: large; -} - -div.factCheckerContainer { - width: 75vw; - margin: auto; -} - -input.factVerification:focus { - outline: none; - box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2); -} - -div.responseText { - margin: 0; - padding: 0; - border-radius: 8px; -} - -div.response { - margin-bottom: 12px; -} - -a.titleLink { - color: #333; - font-weight: bold; -} - -a.subLinks { - color: #333; - text-decoration: none; - font-weight: small; - border-radius: 4px; - font-size: small; -} - -div.subLinks { - display: flex; - flex-direction: row; - gap: 8px; - flex-wrap: wrap; -} - -div.reference { - padding: 12px; - margin: 8px; - border-radius: 8px; -} - -footer.footer { - width: 100%; - background: transparent; - text-align: left; -} - -div.reportActions { - display: flex; - flex-direction: row; - gap: 8px; - justify-content: space-between; - 
margin-top: 8px; -} - -button.factCheckButton { - border: none; - cursor: pointer; - width: 100%; - border-radius: 4px; - margin: 8px; - padding-left: 1rem; - padding-right: 1rem; - line-height: 1.25rem; -} - -button.factCheckButton:hover { - box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2); -} - -div.spinner { - margin: 20px; - width: 40px; - height: 40px; - position: relative; - text-align: center; - - -webkit-animation: sk-rotate 2.0s infinite linear; - animation: sk-rotate 2.0s infinite linear; -} - -div.inputFields { - width: 100%; - display: grid; - grid-template-columns: 1fr auto; - grid-gap: 8px; -} - - -/* Loading Animation */ -div.dot1, -div.dot2 { - width: 60%; - height: 60%; - display: inline-block; - position: absolute; - top: 0; - border-radius: 100%; - - -webkit-animation: sk-bounce 2.0s infinite ease-in-out; - animation: sk-bounce 2.0s infinite ease-in-out; -} - -div.dot2 { - top: auto; - bottom: 0; - -webkit-animation-delay: -1.0s; - animation-delay: -1.0s; -} - -@media screen and (max-width: 768px) { - div.factCheckerContainer { - width: 95vw; - } -} - -@-webkit-keyframes sk-rotate { - 100% { - -webkit-transform: rotate(360deg) - } -} - -@keyframes sk-rotate { - 100% { - transform: rotate(360deg); - -webkit-transform: rotate(360deg) - } -} - -@-webkit-keyframes sk-bounce { - 0%, - 100% { - -webkit-transform: scale(0.0) - } - - 50% { - -webkit-transform: scale(1.0) - } -} - -@keyframes sk-bounce { - 0%, - 100% { - transform: scale(0.0); - -webkit-transform: scale(0.0); - } - - 50% { - transform: scale(1.0); - -webkit-transform: scale(1.0); - } -} diff --git a/src/interface/web/app/factchecker/layout.tsx b/src/interface/web/app/factchecker/layout.tsx deleted file mode 100644 index 8f6a7662..00000000 --- a/src/interface/web/app/factchecker/layout.tsx +++ /dev/null @@ -1,33 +0,0 @@ -import type { Metadata } from "next"; - -export const metadata: Metadata = { - title: "Khoj AI - Fact Checker", - description: - "Use the Fact Checker with Khoj AI for verifying 
statements. It can research the internet for you, either refuting or confirming the statement using fresh data.", - icons: { - icon: "/static/assets/icons/khoj_lantern.ico", - apple: "/static/assets/icons/khoj_lantern_256x256.png", - }, - openGraph: { - siteName: "Khoj AI", - title: "Khoj AI - Fact Checker", - description: "Your Second Brain.", - url: "https://app.khoj.dev/factchecker", - type: "website", - images: [ - { - url: "https://assets.khoj.dev/khoj_lantern_256x256.png", - width: 256, - height: 256, - }, - ], - }, -}; - -export default function RootLayout({ - children, -}: Readonly<{ - children: React.ReactNode; -}>) { - return
{children}
; -} diff --git a/src/interface/web/app/factchecker/page.tsx b/src/interface/web/app/factchecker/page.tsx deleted file mode 100644 index fc79f3c8..00000000 --- a/src/interface/web/app/factchecker/page.tsx +++ /dev/null @@ -1,676 +0,0 @@ -"use client"; - -import styles from "./factChecker.module.css"; -import { useAuthenticatedData } from "@/app/common/auth"; -import { useState, useEffect } from "react"; - -import ChatMessage, { - CodeContext, - Context, - OnlineContext, - OnlineContextData, - WebPage, -} from "../components/chatMessage/chatMessage"; -import { ModelPicker, Model } from "../components/modelPicker/modelPicker"; -import ShareLink from "../components/shareLink/shareLink"; - -import { Input } from "@/components/ui/input"; -import { Button } from "@/components/ui/button"; - -import { Card, CardContent, CardFooter, CardHeader, CardTitle } from "@/components/ui/card"; -import Link from "next/link"; -import SidePanel from "../components/sidePanel/chatHistorySidePanel"; -import { useIsMobileWidth } from "../common/utils"; - -const chatURL = "/api/chat"; -const verificationPrecursor = - "Limit your search to reputable sources. Search the internet for relevant supporting or refuting information. Do not reference my notes. Refuse to answer any queries that are not falsifiable by informing me that you will not answer the question. You're not permitted to ask follow-up questions, so do the best with what you have. Respond with **TRUE** or **FALSE** or **INCONCLUSIVE**, then provide your justification. Fact Check:"; - -const LoadingSpinner = () => ( -
-
- Researching... -
-
-
-
-
-
-); - -interface SupplementReferences { - additionalLink: string; - response: string; - linkTitle: string; -} - -interface ResponseWithReferences { - context?: Context[]; - online?: OnlineContext; - code?: CodeContext; - response?: string; -} - -function handleCompiledReferences(chunk: string, currentResponse: string) { - const rawReference = chunk.split("### compiled references:")[1]; - const rawResponse = chunk.split("### compiled references:")[0]; - let references: ResponseWithReferences = {}; - - // Set the initial response - references.response = currentResponse + rawResponse; - - const rawReferenceAsJson = JSON.parse(rawReference); - if (rawReferenceAsJson instanceof Array) { - references.context = rawReferenceAsJson; - } else if (typeof rawReferenceAsJson === "object" && rawReferenceAsJson !== null) { - references.online = rawReferenceAsJson; - } - - return references; -} - -async function verifyStatement( - message: string, - conversationId: string, - setIsLoading: (loading: boolean) => void, - setInitialResponse: (response: string) => void, - setInitialReferences: (references: ResponseWithReferences) => void, -) { - setIsLoading(true); - // Construct the verification payload - let verificationMessage = `${verificationPrecursor} ${message}`; - const apiURL = `${chatURL}?client=web`; - const requestBody = { - q: verificationMessage, - conversation_id: conversationId, - stream: true, - }; - - try { - // Send a message to the chat server to verify the fact - const response = await fetch(apiURL, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify(requestBody), - }); - if (!response.body) throw new Error("No response body found"); - - const reader = response.body?.getReader(); - let decoder = new TextDecoder(); - let result = ""; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - let chunk = decoder.decode(value, { stream: true }); - - if (chunk.includes("### compiled 
references:")) { - const references = handleCompiledReferences(chunk, result); - if (references.response) { - result = references.response; - setInitialResponse(references.response); - setInitialReferences(references); - } - } else { - result += chunk; - setInitialResponse(result); - } - } - } catch (error) { - console.error("Error verifying statement: ", error); - } finally { - setIsLoading(false); - } -} - -async function spawnNewConversation(setConversationID: (conversationID: string) => void) { - let createURL = `/api/chat/sessions?client=web`; - - const response = await fetch(createURL, { method: "POST" }); - - const data = await response.json(); - setConversationID(data.conversation_id); -} - -interface ReferenceVerificationProps { - message: string; - additionalLink: string; - conversationId: string; - linkTitle: string; - setChildReferencesCallback: ( - additionalLink: string, - response: string, - linkTitle: string, - ) => void; - prefilledResponse?: string; -} - -function ReferenceVerification(props: ReferenceVerificationProps) { - const [initialResponse, setInitialResponse] = useState(""); - const [isLoading, setIsLoading] = useState(true); - const verificationStatement = `${props.message}. Use this link for reference: ${props.additionalLink}`; - const isMobileWidth = useIsMobileWidth(); - - useEffect(() => { - if (props.prefilledResponse) { - setInitialResponse(props.prefilledResponse); - setIsLoading(false); - } else { - verifyStatement( - verificationStatement, - props.conversationId, - setIsLoading, - setInitialResponse, - () => {}, - ); - } - }, [verificationStatement, props.conversationId, props.prefilledResponse]); - - useEffect(() => { - if (initialResponse === "") return; - if (props.prefilledResponse) return; - - if (!isLoading) { - // Only set the child references when it's done loading and if the initial response is not prefilled (i.e. 
it was fetched from the server) - props.setChildReferencesCallback( - props.additionalLink, - initialResponse, - props.linkTitle, - ); - } - }, [initialResponse, isLoading, props]); - - return ( -
- {isLoading && } - {}} - conversationId={props.conversationId} - /> -
- ); -} - -interface SupplementalReferenceProps { - onlineData?: OnlineContextData; - officialFactToVerify: string; - conversationId: string; - additionalLink: string; - setChildReferencesCallback: ( - additionalLink: string, - response: string, - linkTitle: string, - ) => void; - prefilledResponse?: string; - linkTitle?: string; -} - -function SupplementalReference(props: SupplementalReferenceProps) { - const linkTitle = props.linkTitle || props.onlineData?.organic?.[0]?.title || "Reference"; - const linkAsWebpage = { link: props.additionalLink } as WebPage; - return ( - - - - {linkTitle} - - - - - - - - ); -} - -const WebPageLink = (webpage: WebPage) => { - const webpageDomain = new URL(webpage.link).hostname; - return ( -
- - {webpageDomain} - -
- ); -}; - -export default function FactChecker() { - const [factToVerify, setFactToVerify] = useState(""); - const [officialFactToVerify, setOfficialFactToVerify] = useState(""); - const [isLoading, setIsLoading] = useState(false); - const [initialResponse, setInitialResponse] = useState(""); - const [clickedVerify, setClickedVerify] = useState(false); - const [initialReferences, setInitialReferences] = useState(); - const [childReferences, setChildReferences] = useState(); - const [modelUsed, setModelUsed] = useState(); - const isMobileWidth = useIsMobileWidth(); - - const [conversationID, setConversationID] = useState(""); - const [runId, setRunId] = useState(""); - const [loadedFromStorage, setLoadedFromStorage] = useState(false); - - const [initialModel, setInitialModel] = useState(); - - function setChildReferencesCallback( - additionalLink: string, - response: string, - linkTitle: string, - ) { - const newReferences = childReferences || []; - const exists = newReferences.find( - (reference) => reference.additionalLink === additionalLink, - ); - if (exists) return; - newReferences.push({ additionalLink, response, linkTitle }); - setChildReferences(newReferences); - } - - let userData = useAuthenticatedData(); - - function storeData() { - const data = { - factToVerify, - response: initialResponse, - references: initialReferences, - childReferences, - runId, - modelUsed, - }; - - fetch(`/api/chat/store/factchecker`, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - runId: runId, - storeData: data, - }), - }); - } - - useEffect(() => { - if (factToVerify) { - document.title = `AI Fact Check: ${factToVerify}`; - } else { - document.title = "AI Fact Checker"; - } - }, [factToVerify]); - - useEffect(() => { - const storedFact = localStorage.getItem("factToVerify"); - if (storedFact) { - setFactToVerify(storedFact); - } - - // Get query params from the URL - const urlParams = new 
URLSearchParams(window.location.search); - const factToVerifyParam = urlParams.get("factToVerify"); - - if (factToVerifyParam) { - setFactToVerify(factToVerifyParam); - } - - const runIdParam = urlParams.get("runId"); - if (runIdParam) { - setRunId(runIdParam); - - // Define an async function to fetch data - const fetchData = async () => { - const storedDataURL = `/api/chat/store/factchecker?runId=${runIdParam}`; - try { - const response = await fetch(storedDataURL); - if (response.status !== 200) { - throw new Error("Failed to fetch stored data"); - } - const storedData = JSON.parse(await response.json()); - if (storedData) { - setOfficialFactToVerify(storedData.factToVerify); - setInitialResponse(storedData.response); - setInitialReferences(storedData.references); - setChildReferences(storedData.childReferences); - setInitialModel(storedData.modelUsed); - } - setLoadedFromStorage(true); - } catch (error) { - console.error("Error fetching stored data: ", error); - } - }; - - // Call the async function - fetchData(); - } - }, []); - - function onClickVerify() { - if (clickedVerify) return; - - // Perform validation checks on the fact to verify - if (!factToVerify) { - alert("Please enter a fact to verify."); - return; - } - - setClickedVerify(true); - if (!userData) { - let currentURL = window.location.href; - window.location.href = `/login?next=${currentURL}`; - } - - setInitialReferences(undefined); - setInitialResponse(""); - - spawnNewConversation(setConversationID); - - // Set the runId to a random 12-digit alphanumeric string - const newRunId = [...Array(16)].map(() => Math.random().toString(36)[2]).join(""); - setRunId(newRunId); - window.history.pushState( - {}, - document.title, - window.location.pathname + `?runId=${newRunId}`, - ); - - setOfficialFactToVerify(factToVerify); - setClickedVerify(false); - } - - useEffect(() => { - if (!conversationID) return; - verifyStatement( - officialFactToVerify, - conversationID, - setIsLoading, - setInitialResponse, 
- setInitialReferences, - ); - }, [conversationID, officialFactToVerify]); - - // Store factToVerify in localStorage whenever it changes - useEffect(() => { - localStorage.setItem("factToVerify", factToVerify); - }, [factToVerify]); - - // Update the meta tags for the description and og:description - useEffect(() => { - let metaTag = document.querySelector('meta[name="description"]'); - if (metaTag) { - metaTag.setAttribute("content", initialResponse); - } - let metaOgTag = document.querySelector('meta[property="og:description"]'); - if (!metaOgTag) { - metaOgTag = document.createElement("meta"); - metaOgTag.setAttribute("property", "og:description"); - document.getElementsByTagName("head")[0].appendChild(metaOgTag); - } - metaOgTag.setAttribute("content", initialResponse); - }, [initialResponse]); - - const renderReferences = ( - conversationId: string, - initialReferences: ResponseWithReferences, - officialFactToVerify: string, - loadedFromStorage: boolean, - childReferences?: SupplementReferences[], - ) => { - if (loadedFromStorage && childReferences) { - return renderSupplementalReferences(childReferences); - } - - const seenLinks = new Set(); - - // Any links that are present in webpages should not be searched again - Object.entries(initialReferences.online || {}).map(([key, onlineData], index) => { - const webpages = onlineData?.webpages || []; - // Webpage can be a list or a single object - if (webpages instanceof Array) { - for (let i = 0; i < webpages.length; i++) { - const webpage = webpages[i]; - const additionalLink = webpage.link || ""; - if (seenLinks.has(additionalLink)) { - return null; - } - seenLinks.add(additionalLink); - } - } else { - let singleWebpage = webpages as WebPage; - const additionalLink = singleWebpage.link || ""; - if (seenLinks.has(additionalLink)) { - return null; - } - seenLinks.add(additionalLink); - } - }); - - return Object.entries(initialReferences.online || {}) - .map(([key, onlineData], index) => { - let additionalLink = 
""; - - // Loop through organic links until we find one that hasn't been searched - for (let i = 0; i < onlineData?.organic?.length; i++) { - const webpage = onlineData?.organic?.[i]; - additionalLink = webpage.link || ""; - - if (!seenLinks.has(additionalLink)) { - break; - } - } - - seenLinks.add(additionalLink); - - if (additionalLink === "") return null; - - return ( - - ); - }) - .filter(Boolean); - }; - - const renderSupplementalReferences = (references: SupplementReferences[]) => { - return references.map((reference, index) => { - return ( - - ); - }); - }; - - const renderWebpages = (webpages: WebPage[] | WebPage) => { - if (webpages instanceof Array) { - return webpages.map((webpage, index) => { - return WebPageLink(webpage); - }); - } else { - return WebPageLink(webpages); - } - }; - - function constructShareUrl() { - const url = new URL(window.location.href); - url.searchParams.set("runId", runId); - return url.href; - } - - return ( - <> -
- -
-
-

- AI Fact Checker -

-
- This is an experimental AI tool. It may make mistakes. -
- {initialResponse && initialReferences && childReferences ? ( -
- - {} : storeData} - /> -
- ) : ( -
-
- setFactToVerify(e.target.value)} - value={factToVerify} - onKeyDown={(e) => { - if (e.key === "Enter") { - onClickVerify(); - } - }} - onFocus={(e) => (e.target.placeholder = "")} - onBlur={(e) => - (e.target.placeholder = - "Enter a falsifiable statement to verify") - } - /> - -
-

- Try with a particular model. You must be{" "} - - subscribed - {" "} - to configure the model. -

-
- )} - - {isLoading && ( -
- -
- )} - {initialResponse && ( - - - {officialFactToVerify} - - -
- {}} - isMobileWidth={isMobileWidth} - /> -
-
- - {initialReferences && - initialReferences.online && - Object.keys(initialReferences.online).length > 0 && ( -
- {Object.entries(initialReferences.online).map( - ([key, onlineData], index) => { - const webpages = onlineData?.webpages || []; - return renderWebpages(webpages); - }, - )} -
- )} -
-
- )} - {initialReferences && ( -
-

Supplements

-
- {initialReferences.online !== undefined && - renderReferences( - conversationID, - initialReferences, - officialFactToVerify, - loadedFromStorage, - childReferences, - )} -
-
- )} -
- - ); -} diff --git a/src/khoj/routers/web_client.py b/src/khoj/routers/web_client.py index 1ac360d9..76ce7dd4 100644 --- a/src/khoj/routers/web_client.py +++ b/src/khoj/routers/web_client.py @@ -51,16 +51,6 @@ def chat_page(request: Request): ) -@web_client.get("/factchecker", response_class=FileResponse) -def fact_checker_page(request: Request): - return templates.TemplateResponse( - "factchecker/index.html", - context={ - "request": request, - }, - ) - - @web_client.get("/login", response_class=FileResponse) def login_page(request: Request): next_url = get_next_url(request) From ceb29eae744f9ce483463fd3d6dacd418610475f Mon Sep 17 00:00:00 2001 From: sabaimran Date: Sat, 9 Nov 2024 12:25:36 -0800 Subject: [PATCH 02/12] Add phone number verification and remove telemetry update call from place where authentication middleware isn't yet installed (in the middleware itself). --- src/khoj/configure.py | 6 ------ src/khoj/database/admin.py | 7 ++++++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/khoj/configure.py b/src/khoj/configure.py index df0760fb..a1f4a7db 100644 --- a/src/khoj/configure.py +++ b/src/khoj/configure.py @@ -168,12 +168,6 @@ class UserAuthenticationBackend(AuthenticationBackend): if create_if_not_exists: user, is_new = await aget_or_create_user_by_phone_number(phone_number) if user and is_new: - update_telemetry_state( - request=request, - telemetry_type="api", - api="create_user", - metadata={"server_id": str(user.uuid)}, - ) logger.log(logging.INFO, f"🥳 New User Created: {user.uuid}") else: user = await aget_user_by_phone_number(phone_number) diff --git a/src/khoj/database/admin.py b/src/khoj/database/admin.py index b4059e8c..f7c140c1 100644 --- a/src/khoj/database/admin.py +++ b/src/khoj/database/admin.py @@ -78,7 +78,12 @@ class KhojUserAdmin(UserAdmin): search_fields = ("email", "username", "phone_number", "uuid") filter_horizontal = ("groups", "user_permissions") - fieldsets = (("Personal info", {"fields": ("phone_number", 
"email_verification_code")}),) + UserAdmin.fieldsets + fieldsets = ( + ( + "Personal info", + {"fields": ("phone_number", "email_verification_code", "verified_phone_number", "verified_email")}, + ), + ) + UserAdmin.fieldsets actions = ["get_email_login_url"] From 84a8088c2b1f7b9d505a9f00ddfd093520039c26 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 7 Nov 2024 15:23:30 -0800 Subject: [PATCH 03/12] Only evaluate non-empty responses to reduce eval script latency, cost Empty responses by Khoj will always be an incorrect response, so no need to make call to an evaluator agent to check that --- tests/eval_frames.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/eval_frames.py b/tests/eval_frames.py index 8a1d849e..ae44f63e 100644 --- a/tests/eval_frames.py +++ b/tests/eval_frames.py @@ -113,7 +113,11 @@ def process_batch(batch, counter, results, dataset_length): agent_response = get_agent_response(prompt) # Evaluate response - evaluation = evaluate_response(prompt, agent_response, answer) + if agent_response is None or agent_response.strip() == "": + evaluation["decision"] = False + evaluation["explanation"] = "Agent response is empty. This maybe due to a service error." 
+ else: + evaluation = evaluate_response(prompt, agent_response, answer) # Store results results.append( From f967bdf7020c383f41fb3c0d35b4d1beb326ba19 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 8 Nov 2024 15:46:44 -0800 Subject: [PATCH 04/12] Show correct example index being currently processed in frames eval Previously the batch start index wasn't being passed so all batches started in parallel were showing the same processing example index This change doesn't impact the evaluation itself, just the index shown of the example currently being evaluated --- tests/eval_frames.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/eval_frames.py b/tests/eval_frames.py index ae44f63e..102da469 100644 --- a/tests/eval_frames.py +++ b/tests/eval_frames.py @@ -101,10 +101,10 @@ def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dic return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"} -def process_batch(batch, counter, results, dataset_length): - for prompt, answer, reasoning_type in batch: - counter += 1 - logger.info(f"Processing example: {counter}/{dataset_length}") +def process_batch(batch, batch_start, results, dataset_length): + for idx, (prompt, answer, reasoning_type) in enumerate(batch): + current_index = batch_start + idx + logger.info(f"Processing example: {current_index}/{dataset_length}") # Trigger research mode if enabled prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt @@ -122,7 +122,7 @@ def process_batch(batch, counter, results, dataset_length): # Store results results.append( { - "index": counter, + "index": current_index, "prompt": prompt, "ground_truth": answer, "agent_response": agent_response, @@ -169,12 +169,13 @@ def main(): with concurrent.futures.ThreadPoolExecutor() as executor: futures = [] for i in range(0, dataset_length, BATCH_SIZE): + batch_start = i batch = zip( dataset["Prompt"][i : i + BATCH_SIZE], dataset["Answer"][i : i + BATCH_SIZE], 
dataset["reasoning_types"][i : i + BATCH_SIZE], ) - futures.append(executor.submit(process_batch, batch, counter, results, dataset_length)) + futures.append(executor.submit(process_batch, batch, batch_start, results, dataset_length)) # Wait for all futures to complete concurrent.futures.wait(futures) From 80ee35b9b1a731d894df632618bbed18566c5240 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Thu, 7 Nov 2024 15:56:43 -0800 Subject: [PATCH 05/12] Wrap messages in web, obsidian UI to stay within screen when long links Wrap long links etc. in chat messages and train of thought lists on web app app and obsidian plugin by breaking them into newlines by word --- src/interface/obsidian/styles.css | 1 + .../web/app/components/chatMessage/chatMessage.module.css | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/interface/obsidian/styles.css b/src/interface/obsidian/styles.css index 42c1b3ce..b02b2ff3 100644 --- a/src/interface/obsidian/styles.css +++ b/src/interface/obsidian/styles.css @@ -78,6 +78,7 @@ If your plugin does not need CSS, delete this file. 
user-select: text; color: var(--text-normal); background-color: var(--active-bg); + word-break: break-word; } /* color chat bubble by khoj blue */ .khoj-chat-message-text.khoj { diff --git a/src/interface/web/app/components/chatMessage/chatMessage.module.css b/src/interface/web/app/components/chatMessage/chatMessage.module.css index ba9372b8..b055d0a5 100644 --- a/src/interface/web/app/components/chatMessage/chatMessage.module.css +++ b/src/interface/web/app/components/chatMessage/chatMessage.module.css @@ -4,6 +4,7 @@ div.chatMessageContainer { margin: 12px; border-radius: 16px; padding: 8px 16px 0 16px; + word-break: break-word; } div.chatMessageWrapper { @@ -170,6 +171,7 @@ div.trainOfThoughtElement { div.trainOfThoughtElement ol, div.trainOfThoughtElement ul { margin: auto; + word-break: break-word; } @media screen and (max-width: 768px) { From d892ab3174e0fb4d3265594dd3bae8a4fd337586 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sat, 9 Nov 2024 11:30:55 -0800 Subject: [PATCH 06/12] Fix handling of command rate limit and improve rate limit messages Command rate limit wouldn't be shown to user as server wouldn't be able to handle HTTP exception in the middle of streaming. Catch exception and render it as LLM response message instead for visibility into command rate limiting to user on client Log rate limmit messages for all rate limit events on server as info messages Convert exception messages into first person responses by Khoj to prevent breaking the third wall and provide more details on wht happened and possible ways to resolve them. 
--- src/khoj/routers/api_chat.py | 13 ++++---- src/khoj/routers/helpers.py | 63 ++++++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index a20982ea..8646a695 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -738,8 +738,13 @@ async def chat( conversation_commands.append(mode) for cmd in conversation_commands: - await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd) - q = q.replace(f"/{cmd.value}", "").strip() + try: + await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd) + q = q.replace(f"/{cmd.value}", "").strip() + except HTTPException as e: + async for result in send_llm_response(str(e.detail)): + yield result + return defiltered_query = defilter_query(q) @@ -775,10 +780,6 @@ async def chat( # researched_results = await extract_relevant_info(q, researched_results, agent) logger.info(f"Researched Results: {researched_results}") - for cmd in conversation_commands: - await conversation_command_rate_limiter.update_and_check_if_valid(request, cmd) - q = q.replace(f"/{cmd.value}", "").strip() - used_slash_summarize = conversation_commands == [ConversationCommand.Summarize] file_filters = conversation.file_filters if conversation else [] # Skip trying to summarize if diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index 990fa33f..c7a90fc3 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1306,25 +1306,28 @@ class ApiUserRateLimiter: # Check if the user has exceeded the rate limit if subscribed and count_requests >= self.subscribed_requests: logger.info( - f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}. Limit is {self.subscribed_requests} requests." - ) - raise HTTPException(status_code=429, detail="Slow down! 
Too Many Requests") - if not subscribed and count_requests >= self.requests: - if self.requests >= self.subscribed_requests: - logger.info( - f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}. Limit is {self.subscribed_requests} requests." - ) - raise HTTPException( - status_code=429, - detail="Slow down! Too Many Requests", - ) - - logger.info( - f"Rate limit: {count_requests} requests in {self.window} seconds for user: {user}. Limit is {self.subscribed_requests} requests." + f"Rate limit: {count_requests}/{self.subscribed_requests} requests not allowed in {self.window} seconds for subscribed user: {user}." ) raise HTTPException( status_code=429, - detail="I'm glad you're enjoying interacting with me! But you've exceeded your usage limit for today. Come back tomorrow or subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings).", + detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. But let's chat more tomorrow?", + ) + if not subscribed and count_requests >= self.requests: + if self.requests >= self.subscribed_requests: + logger.info( + f"Rate limit: {count_requests}/{self.subscribed_requests} requests not allowed in {self.window} seconds for user: {user}." + ) + raise HTTPException( + status_code=429, + detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. But let's chat more tomorrow?", + ) + + logger.info( + f"Rate limit: {count_requests}/{self.requests} requests not allowed in {self.window} seconds for user: {user}." + ) + raise HTTPException( + status_code=429, + detail="I'm glad you're enjoying interacting with me! You've unfortunately exceeded your usage limit for today. 
You can subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings) or we can continue our conversation tomorrow?", ) # Add the current request to the cache @@ -1350,6 +1353,7 @@ class ApiImageRateLimiter: # Check number of images if len(body.images) > self.max_images: + logger.info(f"Rate limit: {len(body.images)}/{self.max_images} images not allowed per message.") raise HTTPException( status_code=429, detail=f"Those are way too many images for me! I can handle up to {self.max_images} images per message.", @@ -1370,6 +1374,7 @@ class ApiImageRateLimiter: total_size_mb += len(image_bytes) / (1024 * 1024) # Convert bytes to MB if total_size_mb > self.max_combined_size_mb: + logger.info(f"Data limit: {total_size_mb}MB/{self.max_combined_size_mb}MB size not allowed per message.") raise HTTPException( status_code=429, detail=f"Those images are way too large for me! I can handle up to {self.max_combined_size_mb}MB of images per message.", @@ -1405,13 +1410,19 @@ class ConversationCommandRateLimiter: if subscribed and count_requests >= self.subscribed_rate_limit: logger.info( - f"Rate limit: {count_requests} requests in 24 hours for user: {user}. Limit is {self.subscribed_rate_limit} requests." + f"Rate limit: {count_requests}/{self.subscribed_rate_limit} requests not allowed in 24 hours for subscribed user: {user}." ) - raise HTTPException(status_code=429, detail="Slow down! Too Many Requests") - if not subscribed and count_requests >= self.trial_rate_limit: raise HTTPException( status_code=429, - detail=f"We're glad you're enjoying Khoj! You've exceeded your `/{conversation_command.value}` command usage limit for today. Subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings).", + detail=f"I'm glad you're enjoying interacting with me! You've unfortunately exceeded your `/{conversation_command.value}` command usage limit for today. 
Maybe we can talk about something else for today?", + ) + if not subscribed and count_requests >= self.trial_rate_limit: + logger.info( + f"Rate limit: {count_requests}/{self.trial_rate_limit} requests not allowed in 24 hours for user: {user}." + ) + raise HTTPException( + status_code=429, + detail=f"I'm glad you're enjoying interacting with me! You've unfortunately exceeded your `/{conversation_command.value}` command usage limit for today. You can subscribe to increase your usage limit via [your settings](https://app.khoj.dev/settings) or we can talk about something else for today?", ) await UserRequests.objects.acreate(user=user, slug=command_slug) return @@ -1457,16 +1468,28 @@ class ApiIndexedDataLimiter: logger.info(f"Deleted {num_deleted_entries} entries for user: {user}.") if subscribed and incoming_data_size_mb >= self.subscribed_num_entries_size: + logger.info( + f"Data limit: {incoming_data_size_mb}MB incoming will exceed {self.subscribed_num_entries_size}MB allowed for subscribed user: {user}." + ) raise HTTPException(status_code=429, detail="Too much data indexed.") if not subscribed and incoming_data_size_mb >= self.num_entries_size: + logger.info( + f"Data limit: {incoming_data_size_mb}MB incoming will exceed {self.num_entries_size}MB allowed for user: {user}." + ) raise HTTPException( status_code=429, detail="Too much data indexed. Subscribe to increase your data index limit." ) user_size_data = EntryAdapters.get_size_of_indexed_data_in_mb(user) if subscribed and user_size_data + incoming_data_size_mb >= self.subscribed_total_entries_size: + logger.info( + f"Data limit: {incoming_data_size_mb}MB incoming + {user_size_data}MB existing will exceed {self.subscribed_total_entries_size}MB allowed for subscribed user: {user}." 
+            )
             raise HTTPException(status_code=429, detail="Too much data indexed.")
         if not subscribed and user_size_data + incoming_data_size_mb >= self.total_entries_size_limit:
+            logger.info(
+                f"Data limit: {incoming_data_size_mb}MB incoming + {user_size_data}MB existing will exceed {self.total_entries_size_limit}MB allowed for non subscribed user: {user}."
+            )
             raise HTTPException(
                 status_code=429, detail="Too much data indexed. Subscribe to increase your data index limit."
             )

From 8ef7892c5e1c896774a27bcd9a3aab1ec1a3e0eb Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Sun, 10 Nov 2024 13:42:55 -0800
Subject: [PATCH 07/12] Exclude non-dictionary doc context from chat history
 sent to chat models

This fixes chat with old chat sessions.

Fixes issue with old Whatsapp users can't chat with Khoj because chat
history doc context was stored as a list earlier
---
 src/khoj/processor/conversation/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py
index 74c464d9..c6b3bc2f 100644
--- a/src/khoj/processor/conversation/utils.py
+++ b/src/khoj/processor/conversation/utils.py
@@ -338,7 +338,11 @@ def generate_chatml_messages_with_context(
             message_context += chat.get("intent").get("inferred-queries")[0]
         if not is_none_or_empty(chat.get("context")):
             references = "\n\n".join(
-                {f"# File: {item['file']}\n## {item['compiled']}\n" for item in chat.get("context") or []}
+                {
+                    f"# File: {item['file']}\n## {item['compiled']}\n"
+                    for item in chat.get("context") or []
+                    if isinstance(item, dict)
+                }
             )
             message_context += f"{prompts.notes_conversation.format(references=references)}\n\n"
         if not is_none_or_empty(chat.get("onlineContext")):

From eb492f3025d05ace145f22aeb372d84cdecf4b2f Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Sun, 10 Nov 2024 13:19:24 -0800
Subject: [PATCH 08/12] Only keep webpage content requested, even if Jina API
 gets more data

Jina search API returns content of
all webpages in search results.
Previously code wouldn't remove content beyond max_webpages_to_read
limit set. Now, webpage content in organic results are explicitly
removed beyond the requested max_webpages_to_read limit.

This should align behavior of online results from Jina with other
online search providers. And restrict llm context to a reasonable size
when using Jina for online search.
---
 src/khoj/processor/tools/online_search.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index c6fc7c20..34c4911a 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -95,17 +95,21 @@ async def search_online(
     response_dict = {subquery: search_result for subquery, search_result in search_results}
 
     # Gather distinct web pages from organic results for subqueries without an instant answer.
-    # Content of web pages is directly available when Jina is used for search.
     webpages: Dict[str, Dict] = {}
     for subquery in response_dict:
         if "answerBox" in response_dict[subquery]:
             continue
-        for organic in response_dict[subquery].get("organic", [])[:max_webpages_to_read]:
+        for idx, organic in enumerate(response_dict[subquery].get("organic", [])):
             link = organic.get("link")
-            if link in webpages:
+            if link in webpages and idx < max_webpages_to_read:
                 webpages[link]["queries"].add(subquery)
-            else:
+            # Content of web pages is directly available when Jina is used for search.
+            elif idx < max_webpages_to_read:
                 webpages[link] = {"queries": {subquery}, "content": organic.get("content")}
+            # Only keep webpage content for up to max_webpages_to_read organic results.
+ if idx >= max_webpages_to_read and not is_none_or_empty(organic.get("content")): + organic["content"] = None + response_dict[subquery]["organic"][idx] = organic # Read, extract relevant info from the retrieved web pages if webpages: From 306f7a21327e705a5e3af008022b40d7063c7718 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Sun, 10 Nov 2024 13:14:46 -0800 Subject: [PATCH 09/12] Show error in picking next tool to researcher llm in next iteration Previously the whole research mode response would fail if the pick next tool call to chat model failed. Now instead of it completely failing, the researcher actor is told to try again in next iteration. This allows for a more graceful degradation in answering a research question even if a (few?) calls to the chat model fail. --- src/khoj/routers/research.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/khoj/routers/research.py b/src/khoj/routers/research.py index 4f9c6b4e..46d4c424 100644 --- a/src/khoj/routers/research.py +++ b/src/khoj/routers/research.py @@ -87,15 +87,24 @@ async def apick_next_tool( max_iterations=max_iterations, ) - with timer("Chat actor: Infer information sources to refer", logger): - response = await send_message_to_model_wrapper( - query=query, - context=function_planning_prompt, - response_type="json_object", - user=user, - query_images=query_images, - tracer=tracer, + try: + with timer("Chat actor: Infer information sources to refer", logger): + response = await send_message_to_model_wrapper( + query=query, + context=function_planning_prompt, + response_type="json_object", + user=user, + query_images=query_images, + tracer=tracer, + ) + except Exception as e: + logger.error(f"Failed to infer information sources to refer: {e}", exc_info=True) + yield InformationCollectionIteration( + tool=None, + query=None, + warning="Failed to infer information sources to refer. Skipping iteration. 
Try again.", ) + return try: response = clean_json(response) From 137687ee4907c27bb2d7beb259ef733e8690b216 Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 8 Nov 2024 10:43:02 -0800 Subject: [PATCH 10/12] Deduplicate searches in normal mode & across research iterations - Deduplicate online, doc search queries across research iterations. This avoids running previously run online, doc searches again and dedupes online, doc context seen by model to generate response. - Deduplicate online search queries generated by chat model for each user query. - Do not pass online, docs, code context separately when generate response in research mode. These are already collected in the meta research passed with the user query - Improve formatting of context passed to generate research response - Use xml tags to delimit context. Pass per iteration queries in each iteration result - Put user query before meta research results in user message passed for generating response This deduplications will improve speed, cost & quality of research mode --- src/khoj/processor/conversation/utils.py | 2 + src/khoj/processor/tools/online_search.py | 30 ++++++---- src/khoj/routers/api.py | 4 +- src/khoj/routers/api_chat.py | 3 +- src/khoj/routers/helpers.py | 21 ++++--- src/khoj/routers/research.py | 68 ++++++++++++++++------- 6 files changed, 86 insertions(+), 42 deletions(-) diff --git a/src/khoj/processor/conversation/utils.py b/src/khoj/processor/conversation/utils.py index c6b3bc2f..0aaaa4b3 100644 --- a/src/khoj/processor/conversation/utils.py +++ b/src/khoj/processor/conversation/utils.py @@ -112,6 +112,7 @@ class InformationCollectionIteration: onlineContext: dict = None, codeContext: dict = None, summarizedResult: str = None, + warning: str = None, ): self.tool = tool self.query = query @@ -119,6 +120,7 @@ class InformationCollectionIteration: self.onlineContext = onlineContext self.codeContext = codeContext self.summarizedResult = summarizedResult + self.warning = warning def 
construct_iteration_history( diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py index 34c4911a..d2d8c685 100644 --- a/src/khoj/processor/tools/online_search.py +++ b/src/khoj/processor/tools/online_search.py @@ -4,7 +4,7 @@ import logging import os import urllib.parse from collections import defaultdict -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import aiohttp from bs4 import BeautifulSoup @@ -66,6 +66,7 @@ async def search_online( custom_filters: List[str] = [], max_webpages_to_read: int = DEFAULT_MAX_WEBPAGES_TO_READ, query_images: List[str] = None, + previous_subqueries: Set = set(), agent: Agent = None, tracer: dict = {}, ): @@ -76,19 +77,24 @@ async def search_online( return # Breakdown the query into subqueries to get the correct answer - subqueries = await generate_online_subqueries( + new_subqueries = await generate_online_subqueries( query, conversation_history, location, user, query_images=query_images, agent=agent, tracer=tracer ) - response_dict = {} + subqueries = list(new_subqueries - previous_subqueries) + response_dict: Dict[str, Dict[str, List[Dict] | Dict]] = {} - if subqueries: - logger.info(f"🌐 Searching the Internet for {list(subqueries)}") - if send_status_func: - subqueries_str = "\n- " + "\n- ".join(list(subqueries)) - async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"): - yield {ChatEvent.STATUS: event} + if is_none_or_empty(subqueries): + logger.info("No new subqueries to search online") + yield response_dict + return - with timer(f"Internet searches for {list(subqueries)} took", logger): + logger.info(f"🌐 Searching the Internet for {subqueries}") + if send_status_func: + subqueries_str = "\n- " + "\n- ".join(subqueries) + async for event in send_status_func(f"**Searching the Internet for**: {subqueries_str}"): + yield {ChatEvent.STATUS: event} + + with 
timer(f"Internet searches for {subqueries} took", logger): search_func = search_with_google if SERPER_DEV_API_KEY else search_with_jina search_tasks = [search_func(subquery, location) for subquery in subqueries] search_results = await asyncio.gather(*search_tasks) @@ -119,7 +125,9 @@ async def search_online( async for event in send_status_func(f"**Reading web pages**: {webpage_links_str}"): yield {ChatEvent.STATUS: event} tasks = [ - read_webpage_and_extract_content(data["queries"], link, data["content"], user=user, agent=agent, tracer=tracer) + read_webpage_and_extract_content( + data["queries"], link, data.get("content"), user=user, agent=agent, tracer=tracer + ) for link, data in webpages.items() ] results = await asyncio.gather(*tasks) diff --git a/src/khoj/routers/api.py b/src/khoj/routers/api.py index bed7c27b..3eb2dea5 100644 --- a/src/khoj/routers/api.py +++ b/src/khoj/routers/api.py @@ -6,7 +6,7 @@ import os import threading import time import uuid -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Set, Union import cron_descriptor import pytz @@ -349,6 +349,7 @@ async def extract_references_and_questions( location_data: LocationData = None, send_status_func: Optional[Callable] = None, query_images: Optional[List[str]] = None, + previous_inferred_queries: Set = set(), agent: Agent = None, tracer: dict = {}, ): @@ -477,6 +478,7 @@ async def extract_references_and_questions( ) # Collate search results as context for GPT + inferred_queries = list(set(inferred_queries) - previous_inferred_queries) with timer("Searching knowledge base took", logger): search_results = [] logger.info(f"🔍 Searching knowledge base with queries: {inferred_queries}") diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py index 8646a695..c30f4cf8 100644 --- a/src/khoj/routers/api_chat.py +++ b/src/khoj/routers/api_chat.py @@ -778,7 +778,8 @@ async def chat( yield research_result # researched_results = await 
extract_relevant_info(q, researched_results, agent) - logger.info(f"Researched Results: {researched_results}") + if state.verbose > 1: + logger.debug(f"Researched Results: {researched_results}") used_slash_summarize = conversation_commands == [ConversationCommand.Summarize] file_filters = conversation.file_filters if conversation else [] diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index c7a90fc3..d89ed147 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -20,6 +20,7 @@ from typing import ( Iterator, List, Optional, + Set, Tuple, Union, ) @@ -494,7 +495,7 @@ async def generate_online_subqueries( query_images: List[str] = None, agent: Agent = None, tracer: dict = {}, -) -> List[str]: +) -> Set[str]: """ Generate subqueries from the given query """ @@ -529,14 +530,14 @@ async def generate_online_subqueries( try: response = clean_json(response) response = json.loads(response) - response = [q.strip() for q in response["queries"] if q.strip()] - if not isinstance(response, list) or not response or len(response) == 0: + response = {q.strip() for q in response["queries"] if q.strip()} + if not isinstance(response, set) or not response or len(response) == 0: logger.error(f"Invalid response for constructing subqueries: {response}. Returning original query: {q}") - return [q] + return {q} return response except Exception as e: logger.error(f"Invalid response for constructing subqueries: {response}. 
Returning original query: {q}") - return [q] + return {q} async def schedule_query( @@ -1128,9 +1129,6 @@ def generate_chat_response( metadata = {} agent = AgentAdapters.get_conversation_agent_by_id(conversation.agent.id) if conversation.agent else None - query_to_run = q - if meta_research: - query_to_run = f"AI Research: {meta_research} {q}" try: partial_completion = partial( save_to_conversation_log, @@ -1148,6 +1146,13 @@ def generate_chat_response( train_of_thought=train_of_thought, ) + query_to_run = q + if meta_research: + query_to_run = f"{q}\n\n{meta_research}\n" + compiled_references = [] + online_results = {} + code_results = {} + conversation_config = ConversationAdapters.get_valid_conversation_config(user, conversation) vision_available = conversation_config.vision_enabled if not vision_available and query_images: diff --git a/src/khoj/routers/research.py b/src/khoj/routers/research.py index 46d4c424..1caf7c96 100644 --- a/src/khoj/routers/research.py +++ b/src/khoj/routers/research.py @@ -43,38 +43,35 @@ async def apick_next_tool( location: LocationData = None, user_name: str = None, agent: Agent = None, - previous_iterations_history: str = None, + previous_iterations: List[InformationCollectionIteration] = [], max_iterations: int = 5, send_status_func: Optional[Callable] = None, tracer: dict = {}, ): - """ - Given a query, determine which of the available tools the agent should use in order to answer appropriately. One at a time, and it's able to use subsequent iterations to refine the answer. 
- """ + """Given a query, determine which of the available tools the agent should use in order to answer appropriately.""" + # Construct tool options for the agent to choose from tool_options = dict() tool_options_str = "" - agent_tools = agent.input_tools if agent else [] - for tool, description in function_calling_description_for_llm.items(): tool_options[tool.value] = description if len(agent_tools) == 0 or tool.value in agent_tools: tool_options_str += f'- "{tool.value}": "{description}"\n' + # Construct chat history with user and iteration history with researcher agent for context chat_history = construct_chat_history(conversation_history, agent_name=agent.name if agent else "Khoj") + previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration) if query_images: query = f"[placeholder for user attached images]\n{query}" + today = datetime.today() + location_data = f"{location}" if location else "Unknown" personality_context = ( prompts.personality_context.format(personality=agent.personality) if agent and agent.personality else "" ) - # Extract Past User Message and Inferred Questions from Conversation Log - today = datetime.today() - location_data = f"{location}" if location else "Unknown" - function_planning_prompt = prompts.plan_function_execution.format( tools=tool_options_str, chat_history=chat_history, @@ -112,8 +109,15 @@ async def apick_next_tool( selected_tool = response.get("tool", None) generated_query = response.get("query", None) scratchpad = response.get("scratchpad", None) + warning = None logger.info(f"Response for determining relevant tools: {response}") - if send_status_func: + + # Detect selection of previously used query, tool combination. + previous_tool_query_combinations = {(i.tool, i.query) for i in previous_iterations} + if (selected_tool, generated_query) in previous_tool_query_combinations: + warning = f"Repeated tool, query combination detected. Skipping iteration. 
Try something different." + # Only send client status updates if we'll execute this iteration + elif send_status_func: determined_tool_message = "**Determined Tool**: " determined_tool_message += f"{selected_tool}({generated_query})." if selected_tool else "respond." determined_tool_message += f"\nReason: {scratchpad}" if scratchpad else "" @@ -123,13 +127,14 @@ async def apick_next_tool( yield InformationCollectionIteration( tool=selected_tool, query=generated_query, + warning=warning, ) - except Exception as e: logger.error(f"Invalid response for determining relevant tools: {response}. {e}", exc_info=True) yield InformationCollectionIteration( tool=None, query=None, + warning=f"Invalid response for determining relevant tools: {response}. Skipping iteration. Fix error: {e}", ) @@ -156,7 +161,6 @@ async def execute_information_collection( document_results: List[Dict[str, str]] = [] summarize_files: str = "" this_iteration = InformationCollectionIteration(tool=None, query=query) - previous_iterations_history = construct_iteration_history(previous_iterations, prompts.previous_iteration) async for result in apick_next_tool( query, @@ -166,7 +170,7 @@ async def execute_information_collection( location, user_name, agent, - previous_iterations_history, + previous_iterations, MAX_ITERATIONS, send_status_func, tracer=tracer, @@ -176,9 +180,16 @@ async def execute_information_collection( elif isinstance(result, InformationCollectionIteration): this_iteration = result - if this_iteration.tool == ConversationCommand.Notes: + # Skip running iteration if warning present in iteration + if this_iteration.warning: + logger.warning(f"Research mode: {this_iteration.warning}.") + + elif this_iteration.tool == ConversationCommand.Notes: this_iteration.context = [] document_results = [] + previous_inferred_queries = { + c["query"] for iteration in previous_iterations if iteration.context for c in iteration.context + } async for result in extract_references_and_questions( request, 
construct_tool_chat_history(previous_iterations, ConversationCommand.Notes), @@ -190,6 +201,7 @@ async def execute_information_collection( location, send_status_func, query_images, + previous_inferred_queries=previous_inferred_queries, agent=agent, tracer=tracer, ): @@ -213,6 +225,12 @@ async def execute_information_collection( logger.error(f"Error extracting document references: {e}", exc_info=True) elif this_iteration.tool == ConversationCommand.Online: + previous_subqueries = { + subquery + for iteration in previous_iterations + if iteration.onlineContext + for subquery in iteration.onlineContext.keys() + } async for result in search_online( this_iteration.query, construct_tool_chat_history(previous_iterations, ConversationCommand.Online), @@ -222,11 +240,16 @@ async def execute_information_collection( [], max_webpages_to_read=0, query_images=query_images, + previous_subqueries=previous_subqueries, agent=agent, tracer=tracer, ): if isinstance(result, dict) and ChatEvent.STATUS in result: yield result[ChatEvent.STATUS] + elif is_none_or_empty(result): + this_iteration.warning = ( + "Detected previously run online search queries. Skipping iteration. Try something different." 
+ ) else: online_results: Dict[str, Dict] = result # type: ignore this_iteration.onlineContext = online_results @@ -311,16 +334,19 @@ async def execute_information_collection( current_iteration += 1 - if document_results or online_results or code_results or summarize_files: - results_data = f"**Results**:\n" + if document_results or online_results or code_results or summarize_files or this_iteration.warning: + results_data = f"\n{current_iteration}\n{this_iteration.tool}\n{this_iteration.query}\n" if document_results: - results_data += f"**Document References**:\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" + results_data += f"\n\n{yaml.dump(document_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" if online_results: - results_data += f"**Online Results**:\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" + results_data += f"\n\n{yaml.dump(online_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" if code_results: - results_data += f"**Code Results**:\n{yaml.dump(code_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" + results_data += f"\n\n{yaml.dump(code_results, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" if summarize_files: - results_data += f"**Summarized Files**:\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" + results_data += f"\n\n{yaml.dump(summarize_files, allow_unicode=True, sort_keys=False, default_flow_style=False)}\n" + if this_iteration.warning: + results_data += f"\n\n{this_iteration.warning}\n" + results_data += "\n\n" # intermediate_result = await extract_relevant_info(this_iteration.query, results_data, agent) this_iteration.summarizedResult = results_data From 7468f6a6ed26215f44b0fd1906d698da301d921c Mon Sep 17 00:00:00 2001 From: Debanjum Date: Fri, 8 Nov 2024 14:49:09 -0800 Subject: [PATCH 11/12] 
Deduplicate online references returned by chat API to clients

This will ensure only unique online references are shown in all
clients.

The duplication issue was exacerbated in research mode as even with
different online search queries, you can get previously seen results.

This change does a global deduplication across all online results seen
across research iterations before returning them in client response.
---
 src/khoj/processor/tools/online_search.py | 22 ++++++++++++++++++++++
 src/khoj/routers/api_chat.py              |  9 +++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/khoj/processor/tools/online_search.py b/src/khoj/processor/tools/online_search.py
index d2d8c685..2ee6e72c 100644
--- a/src/khoj/processor/tools/online_search.py
+++ b/src/khoj/processor/tools/online_search.py
@@ -367,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
         for item in response_json["data"]
     ]
     return query, {"organic": parsed_response}
+
+
+def deduplicate_organic_results(online_results: dict) -> dict:
+    """Deduplicate organic search results based on links across all queries."""
+    # Keep track of seen links to filter out duplicates across queries
+    seen_links = set()
+    deduplicated_results = {}
+
+    # Process each query's results
+    for query, results in online_results.items():
+        # Filter organic results keeping only first occurrence of each link
+        filtered_organic = []
+        for result in results.get("organic", []):
+            link = result.get("link")
+            if link and link not in seen_links:
+                seen_links.add(link)
+                filtered_organic.append(result)
+
+        # Update results with deduplicated organic entries
+        deduplicated_results[query] = {**results, "organic": filtered_organic}
+
+    return deduplicated_results
diff --git a/src/khoj/routers/api_chat.py b/src/khoj/routers/api_chat.py
index c30f4cf8..648fb8dd 100644
--- a/src/khoj/routers/api_chat.py
+++ b/src/khoj/routers/api_chat.py
@@ -28,7 +28,11 @@ from khoj.processor.conversation.prompts import
help_message, no_entries_found from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log from khoj.processor.image.generate import text_to_image from khoj.processor.speech.text_to_speech import generate_text_to_speech -from khoj.processor.tools.online_search import read_webpages, search_online +from khoj.processor.tools.online_search import ( + deduplicate_organic_results, + read_webpages, + search_online, +) from khoj.processor.tools.run_code import run_code from khoj.routers.api import extract_references_and_questions from khoj.routers.email import send_query_feedback @@ -1026,12 +1030,13 @@ async def chat( ) ## Send Gathered References + unique_online_results = deduplicate_organic_results(online_results) async for result in send_event( ChatEvent.REFERENCES, { "inferredQueries": inferred_queries, "context": compiled_references, - "onlineContext": online_results, + "onlineContext": unique_online_results, "codeContext": code_results, }, ): From a5e2b9e745185ee88cbb685ff58c71d66ff2fe50 Mon Sep 17 00:00:00 2001 From: sabaimran Date: Sun, 10 Nov 2024 19:22:21 -0800 Subject: [PATCH 12/12] Exit early when running an automation if the conversation for the automation does not exist. --- src/khoj/routers/helpers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/khoj/routers/helpers.py b/src/khoj/routers/helpers.py index d89ed147..ca78ba35 100644 --- a/src/khoj/routers/helpers.py +++ b/src/khoj/routers/helpers.py @@ -1582,6 +1582,11 @@ def scheduled_chat( # encode the conversation_id to avoid any issues with special characters query_dict["conversation_id"] = [quote(str(conversation_id))] + # validate that the conversation id exists. If not, delete the automation and exit. 
+ if not ConversationAdapters.get_conversation_by_id(conversation_id): + AutomationAdapters.delete_automation(user, job_id) + return + # Restructure the original query_dict into a valid JSON payload for the chat API json_payload = {key: values[0] for key, values in query_dict.items()}