Mirror of https://github.com/khoj-ai/khoj.git (synced 2025-02-17 08:04:21 +00:00)
Merge branch 'master' of github.com:khoj-ai/khoj into features/include-full-file-in-convo-with-filter
Commit 623a97a9ee
9 changed files with 217 additions and 18 deletions
@@ -31,7 +31,4 @@ Using LiteLLM with Khoj makes it possible to turn any LLM behind an API into your personal AI agent.
    - Openai Config: `<the proxy config you created in step 3>`
    - Max prompt size: `20000` (replace with the max prompt size of your model)
    - Tokenizer: *Do not set for OpenAI, Mistral, Llama3 based models*
-5. Create a new [Server Chat Setting](http://localhost:42110/server/admin/database/serverchatsettings/add/) on your Khoj admin panel
-   - Default model: `<name of chat model option you created in step 4>`
-   - Summarizer model: `<name of chat model option you created in step 4>`
-6. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.
+5. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.

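Before pointing Khoj at the proxy, it can help to confirm the LiteLLM endpoint itself answers OpenAI-style requests. A minimal sketch follows; the base URL (LiteLLM's usual default of http://localhost:4000), the placeholder API key, and the model name are assumptions to adjust for your proxy config.

```python
# Sketch: sanity-check an OpenAI-compatible LiteLLM proxy before configuring Khoj.
# The base URL, api_key, and model name below are assumptions; match them to your proxy.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-anything")
response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # whichever model name your LiteLLM config exposes
    messages=[{"role": "user", "content": "Say hello"}],
)
print(response.choices[0].message.content)
```
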
@@ -24,7 +24,4 @@ LM Studio can expose an [OpenAI API compatible server](https://lmstudio.ai/docs/
    - Openai Config: `<the proxy config you created in step 3>`
    - Max prompt size: `20000` (replace with the max prompt size of your model)
    - Tokenizer: *Do not set for OpenAI, mistral, llama3 based models*
-5. Create a new [Server Chat Setting](http://localhost:42110/server/admin/database/serverchatsettings/add/) on your Khoj admin panel
-   - Default model: `<name of chat model option you created in step 4>`
-   - Summarizer model: `<name of chat model option you created in step 4>`
-6. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.
+5. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.

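To check which models LM Studio's local server is actually serving before adding one in Khoj, you can query its OpenAI-compatible model list. A rough sketch, assuming LM Studio's usual default local server port of 1234:

```python
# Sketch: list models exposed by LM Studio's OpenAI-compatible local server.
# Port 1234 is LM Studio's usual default; confirm it in the LM Studio server tab.
import requests

resp = requests.get("http://localhost:1234/v1/models")
resp.raise_for_status()
for model in resp.json().get("data", []):
    print(model["id"])
```
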
@@ -28,9 +28,6 @@ Ollama exposes a local [OpenAI API compatible server](https://github.com/ollama/
    - Model Type: `Openai`
    - Openai Config: `<the ollama config you created in step 3>`
    - Max prompt size: `20000` (replace with the max prompt size of your model)
-5. Create a new [Server Chat Setting](http://localhost:42110/server/admin/database/serverchatsettings/add/) on your Khoj admin panel
-   - Default model: `<name of chat model option you created in step 4>`
-   - Summarizer model: `<name of chat model option you created in step 4>`
-6. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.
+5. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.

 That's it! You should now be able to chat with your Ollama model from Khoj. If you want to add additional models running on Ollama, repeat step 6 for each model.

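If Khoj can't reach your Ollama model, it can be useful to hit Ollama's OpenAI-compatible endpoint directly first. A minimal sketch, assuming Ollama on its default port and a pulled model named "llama3":

```python
# Sketch: call Ollama's OpenAI-compatible chat endpoint directly.
# Assumes Ollama on its default port (11434) and that a model named "llama3" has been pulled.
import requests

resp = requests.post(
    "http://localhost:11434/v1/chat/completions",
    json={
        "model": "llama3",
        "messages": [{"role": "user", "content": "Say hello"}],
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```
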
@@ -31,7 +31,4 @@ For specific integrations, see our [Ollama](/advanced/ollama), [LMStudio](/advan
    - Openai Config: `<the proxy config you created in step 2>`
    - Max prompt size: `2000` (replace with the max prompt size of your model)
    - Tokenizer: *Do not set for OpenAI, mistral, llama3 based models*
-4. Create a new [Server Chat Setting](http://localhost:42110/server/admin/database/serverchatsettings/add/) on your Khoj admin panel
-   - Default model: `<name of chat model option you created in step 3>`
-   - Summarizer model: `<name of chat model option you created in step 3>`
-5. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.
+4. Go to [your config](http://localhost:42110/settings) and select the model you just created in the chat model dropdown.

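After selecting the model in the settings dropdown, one way to confirm it is actually wired up is to send a message through Khoj's own chat API, the same endpoint the new tests/eval_frames.py script calls. A sketch, assuming a local Khoj server and an API key if your setup requires one:

```python
# Sketch: confirm the configured chat model responds via Khoj's chat API.
# Uses the same /api/chat request shape as tests/eval_frames.py below.
import os
import requests

khoj_url = os.getenv("KHOJ_URL", "http://localhost:42110")
khoj_api_key = os.getenv("KHOJ_API_KEY")  # may be optional for a local, unauthenticated setup

resp = requests.post(
    f"{khoj_url}/api/chat",
    headers={"Content-Type": "application/json", "Authorization": f"Bearer {khoj_api_key}"},
    json={"q": "Hello! Which chat model are you using?", "create_new": True},
)
resp.raise_for_status()
print(resp.json().get("response", ""))
```
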
@@ -120,6 +120,8 @@ dev = [
     "black >= 23.1.0",
     "pre-commit >= 3.0.4",
     "gitpython ~= 3.1.43",
+    "datasets",
+    "pandas",
 ]

 [tool.hatch.version]

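The two new dev dependencies exist to support the evaluation script added below; a quick way to confirm they resolve together is to load a slice of the FRAMES benchmark into a DataFrame. A sketch, assuming the dev extras are installed:

```python
# Sketch: verify the new dev dependencies by loading a few FRAMES rows,
# the same dataset tests/eval_frames.py evaluates against.
import pandas as pd
from datasets import load_dataset

frames = load_dataset("google/frames-benchmark")
df = pd.DataFrame(frames["test"][:5])
print(df.columns.tolist())
```
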
@@ -607,7 +607,7 @@ export const ChatInputArea = forwardRef<HTMLTextAreaElement, ChatInputProps>((pr
         disabled={props.sendDisabled || recording}
       />
     </div>
-    <div className="flex items-center">
+    <div className="flex items-end pb-2">
       {recording ? (
         <TooltipProvider>
           <Tooltip>

@@ -49,7 +49,7 @@ def normalize_filename(filename):
         normalized_filename = f"~/{relpath(filename, start=Path.home())}"
     else:
         normalized_filename = filename
-    escaped_filename = f"{normalized_filename}".replace("[", "\[").replace("]", "\]")
+    escaped_filename = f"{normalized_filename}".replace("[", r"\[").replace("]", r"\]")
     return escaped_filename

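The only change here is switching the bracket-escape literals to raw strings. In Python, `\[` is not a valid escape sequence: it still evaluates to a backslash followed by a bracket, but newer interpreters emit a warning for it, which the raw-string form avoids. A small illustration:

```python
# Raw and non-raw forms produce the same two-character string (backslash + bracket),
# but only the raw string avoids Python's invalid-escape-sequence warning.
assert r"\[" == "\\["

def escape_brackets(filename: str) -> str:
    # Same escaping normalize_filename applies, written with raw strings
    return filename.replace("[", r"\[").replace("]", r"\]")

print(escape_brackets("notes [2024].org"))  # -> notes \[2024\].org
```
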
@@ -92,6 +92,8 @@ def extract_questions_anthropic(

     messages.append(ChatMessage(content=prompt, role="user"))

+    messages, system_prompt = format_messages_for_anthropic(messages, system_prompt)
+
     response = anthropic_completion_with_backoff(
         messages=messages,
         system_prompt=system_prompt,

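The new call normalizes the chat history before it reaches Anthropic's Messages API, which takes the system prompt as a separate field and expects the message list itself to stay in clean user/assistant turns. The real format_messages_for_anthropic lives elsewhere in the khoj codebase; the sketch below only illustrates that kind of normalization, not the actual implementation.

```python
# Illustrative sketch only; the actual format_messages_for_anthropic in khoj may differ.
def format_messages_for_anthropic_sketch(messages, system_prompt):
    formatted = []
    for message in messages:
        if message.role == "system":
            # Anthropic takes the system prompt separately, not as a chat turn
            system_prompt = f"{system_prompt}\n{message.content}".strip()
        elif formatted and formatted[-1]["role"] == message.role:
            # Merge consecutive turns with the same role to keep roles alternating
            formatted[-1]["content"] += f"\n{message.content}"
        else:
            formatted.append({"role": message.role, "content": message.content})
    return formatted, system_prompt
```
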
tests/eval_frames.py (new file, 207 lines)
@@ -0,0 +1,207 @@
import concurrent.futures
import json
import logging
import os
import time
from typing import Any, Dict

import pandas as pd
import requests
from datasets import load_dataset

from khoj.utils.helpers import timer

# Configure root logger
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)

# Configuration
KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
KHOJ_MODE = os.getenv("KHOJ_MODE")  # E.g research, general, notes etc.

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
GEMINI_API_URL = (
    f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_EVAL_MODEL}:generateContent?key={GEMINI_API_KEY}"
)

SAMPLE_SIZE = os.getenv("SAMPLE_SIZE")  # Number of examples to evaluate
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true"  # Randomize examples
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10))  # Number of examples to evaluate in parallel
SLEEP_SECONDS = 1  # Delay between API calls to avoid rate limiting


def load_frames_dataset():
    """Load the FRAMES benchmark dataset from HuggingFace"""
    try:
        dataset = load_dataset("google/frames-benchmark")
        dataset = dataset.shuffle() if RANDOMIZE else dataset
        # Use test split for evaluation. Sample and shuffle dataset if configured
        return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]

    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        return None


def get_agent_response(prompt: str) -> str:
    """Get response from the Khoj API"""
    try:
        response = requests.post(
            KHOJ_CHAT_API_URL,
            headers={"Content-Type": "application/json", "Authorization": f"Bearer {KHOJ_API_KEY}"},
            json={
                "q": prompt,
                "create_new": True,
            },
        )
        response.raise_for_status()
        return response.json().get("response", "")
    except Exception as e:
        logger.error(f"Error getting agent response: {e}")
        return ""

def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dict[str, Any]:
    """Evaluate Khoj response against benchmark ground truth using Gemini"""
    evaluation_prompt = f"""
    Compare the following agent response with the ground truth answer.
    Determine if the agent response contains the key information from the ground truth.
    Focus on factual correctness rather than exact wording.

    Query: {query}
    Agent Response: {agent_response}
    Ground Truth: {ground_truth}

    Provide your evaluation in the following json format:
    {{"explanation": "[How you made the decision?]", "decision": "[TRUE if response contains key information, FALSE otherwise]"}}
    """

    try:
        response = requests.post(
            GEMINI_API_URL,
            headers={"Content-Type": "application/json", "response_mime_type": "application/json"},
            json={"contents": [{"parts": [{"text": evaluation_prompt}]}]},
        )
        response.raise_for_status()

        # Parse evaluation response
        eval_response = json.loads(clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"]))
        if "decision" in eval_response and isinstance(eval_response["decision"], str):
            eval_response["decision"] = eval_response["decision"].upper() == "TRUE"
        # Extract decision and explanation from structured response
        return {
            "decision": eval_response.get("decision", False),
            "explanation": eval_response.get("explanation", ""),
        }
    except Exception as e:
        logger.error(f"Error in evaluation: {e}")
        return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"}

def process_batch(batch, counter, results, dataset_length):
    for prompt, answer, reasoning_type in batch:
        counter += 1
        logger.info(f"Processing example: {counter}/{dataset_length}")

        # Trigger research mode if enabled
        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt

        # Get agent response
        agent_response = get_agent_response(prompt)

        # Evaluate response
        evaluation = evaluate_response(prompt, agent_response, answer)

        # Store results
        results.append(
            {
                "index": counter,
                "prompt": prompt,
                "ground_truth": answer,
                "agent_response": agent_response,
                "evaluation_decision": evaluation["decision"],
                "evaluation_explanation": evaluation["explanation"],
                "reasoning_type": reasoning_type,
            }
        )

        # Color the decision based on its value
        decision = evaluation["decision"]
        decision_color = "green" if decision == True else "red"
        colored_decision = color_text(str(decision), decision_color)
        logger.info(
            f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
        )

        time.sleep(SLEEP_SECONDS)  # Rate limiting


def color_text(text, color):
    colors = {"red": "\033[91m", "green": "\033[92m", "reset": "\033[0m"}
    return f"{colors[color]}{text}{colors['reset']}"


def clean_json(response: str):
    """Remove any markdown json codeblock and newline formatting if present. Useful for non schema enforceable models"""
    return response.strip().replace("\n", "").removeprefix("```json").removesuffix("```")


def main():
    # Load dataset
    with timer("Loaded dataset in", logger):
        dataset = load_frames_dataset()
    if dataset is None:
        return

    # Initialize variables
    counter = 0
    results = []
    dataset_length = len(dataset["Prompt"])

    # Process examples in batches
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for i in range(0, dataset_length, BATCH_SIZE):
            batch = zip(
                dataset["Prompt"][i : i + BATCH_SIZE],
                dataset["Answer"][i : i + BATCH_SIZE],
                dataset["reasoning_types"][i : i + BATCH_SIZE],
            )
            futures.append(executor.submit(process_batch, batch, counter, results, dataset_length))

        # Wait for all futures to complete
        concurrent.futures.wait(futures)

    # Calculate metrics
    df = pd.DataFrame(results)
    accuracy = (df["evaluation_decision"] == True).mean()

    # Calculate accuracy by reasoning type
    reasoning_type_accuracy = df.groupby("reasoning_type")["evaluation_decision"].apply(lambda x: (x == True).mean())

    # Save results
    df.to_csv("frames_evaluation_results.csv", index=False)

    # Print summary
    logger.info(f"\nOverall Accuracy: {accuracy:.2%}")
    logger.info("\nAccuracy by Reasoning Type:")
    logger.info(reasoning_type_accuracy)

if __name__ == "__main__":
    """
    Evaluate Khoj on the Google FRAMES benchmark.
    Responses are evaluated by GEMINI_EVAL_MODEL (default: gemini-1.5-pro-002).

    Khoj should be running at KHOJ_URL, default at http://localhost:42110.
    The Gemini judge model is accessed via the Gemini API with your GEMINI_API_KEY.
    To evaluate Khoj in research mode, set the KHOJ_MODE environment variable to "research".

    Run the script using the following command:
    KHOJ_MODE="research" GEMINI_API_KEY="<your_gemini_api_key>" python eval_frames.py
    """
    with timer("Ran eval in", logger):
        main()