Improve error handling, display and configurability of eval script

- Default to an evaluation decision of None when either the agent or the evaluator LLM fails. This fixes accuracy calculations on errors (see the sketch below)
- Fix the color used to display a True decision
- Add arg flags to specify the output results file path
Debanjum 2024-11-13 03:13:36 -08:00
parent 15b0cfa3dd
commit f4e37209a2


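The first change above is easiest to see with a small, self-contained pandas sketch (illustrative numbers only, not from a real eval run): when a failed agent or judge call is recorded as None rather than False, it can be excluded from the accuracy denominator instead of being scored as a wrong answer.

# Sketch of the accuracy fix, with made-up decisions for 4 examples
import pandas as pd

# Suppose 4 examples were run and the last one failed (agent or judge error)
df = pd.DataFrame({"evaluation_decision": [True, False, True, None]})

# Naive accuracy treats the failed run as a wrong answer: 2/4 = 50%
naive_accuracy = (df["evaluation_decision"] == True).mean()

# The commit instead excludes undecided rows before scoring: 2/3 ~= 67%
eval_df = df.dropna(subset=["evaluation_decision"])
fixed_accuracy = (eval_df["evaluation_decision"] == True).mean()

print(f"naive={naive_accuracy:.2%} fixed={fixed_accuracy:.2%}")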
@@ -1,18 +1,20 @@
+import argparse
 import concurrent.futures
 import json
 import logging
 import os
 import time
+from datetime import datetime
 from typing import Any, Dict

 import pandas as pd
 import requests
 from datasets import load_dataset

-from khoj.utils.helpers import timer
+from khoj.utils.helpers import is_none_or_empty, timer

 # Configure root logger
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
+logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)

 # Configuration
@@ -82,23 +84,28 @@ def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dic
     try:
         response = requests.post(
             GEMINI_API_URL,
-            headers={"Content-Type": "application/json", "response_mime_type": "application/json"},
-            json={"contents": [{"parts": [{"text": evaluation_prompt}]}]},
+            headers={"Content-Type": "application/json"},
+            json={
+                "contents": [{"parts": [{"text": evaluation_prompt}]}],
+                "generationConfig": {"response_mime_type": "application/json"},
+            },
         )
         response.raise_for_status()

         # Parse evaluation response
-        eval_response = json.loads(clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"]))
-        if "decision" in eval_response and isinstance(eval_response["decision"], str):
-            eval_response["decision"] = eval_response["decision"].upper() == "TRUE"
+        eval_response: dict[str, str] = json.loads(
+            clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"])
+        )
+        decision = str(eval_response.get("decision", "")).upper() == "TRUE"
+        explanation = eval_response.get("explanation", "")
+
+        # Handle evaluation service errors
+        if "503 Service Error" in explanation:
+            decision = None

         # Extract decision and explanation from structured response
-        return {
-            "decision": eval_response.get("decision", False),
-            "explanation": eval_response.get("explanation", ""),
-        }
+        return decision, explanation
     except Exception as e:
         logger.error(f"Error in evaluation: {e}")
-        return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"}
+        return None, f"Evaluation failed: {str(e)}"


 def process_batch(batch, batch_start, results, dataset_length):
@@ -107,17 +114,17 @@ def process_batch(batch, batch_start, results, dataset_length):
         logger.info(f"Processing example: {current_index}/{dataset_length}")

         # Trigger research mode if enabled
-        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
+        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE and not prompt.startswith(f"/{KHOJ_MODE}") else prompt

         # Get agent response
         agent_response = get_agent_response(prompt)

         # Evaluate response
-        if agent_response is None or agent_response.strip() == "":
-            evaluation["decision"] = False
-            evaluation["explanation"] = "Agent response is empty. This maybe due to a service error."
+        if is_none_or_empty(agent_response):
+            decision = None
+            explanation = "Agent response is empty. This maybe due to a service error."
         else:
-            evaluation = evaluate_response(prompt, agent_response, answer)
+            decision, explanation = evaluate_response(prompt, agent_response, answer)

         # Store results
         results.append(
@@ -126,25 +133,29 @@ def process_batch(batch, batch_start, results, dataset_length):
                 "prompt": prompt,
                 "ground_truth": answer,
                 "agent_response": agent_response,
-                "evaluation_decision": evaluation["decision"],
-                "evaluation_explanation": evaluation["explanation"],
+                "evaluation_decision": decision,
+                "evaluation_explanation": explanation,
                 "reasoning_type": reasoning_type,
             }
         )

-        # Color the decision based on its value
-        decision = evaluation["decision"]
-        decision_color = "green" if decision == True else "red"
+        # Log results
+        decision_color = {True: "green", None: "blue", False: "red"}[decision]
         colored_decision = color_text(str(decision), decision_color)
         logger.info(
-            f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
+            f"Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {explanation}\n"
         )

         time.sleep(SLEEP_SECONDS)  # Rate limiting


 def color_text(text, color):
-    colors = {"red": "\033[91m", "green": "\033[92m", "reset": "\033[0m"}
+    colors = {
+        "red": "\033[91m",  # Bright red
+        "green": "\033[32m",  # Standard green
+        "blue": "\033[94m",  # Bright blue
+        "reset": "\033[0m",
+    }
     return f"{colors[color]}{text}{colors['reset']}"
@@ -153,7 +164,21 @@ def clean_json(response: str):
     return response.strip().replace("\n", "").removeprefix("```json").removesuffix("```")


+def parse_args():
+    parser = argparse.ArgumentParser(description="Evaluate Khoj on the Google FRAMES benchmark.")
+    parser.add_argument(
+        "--output",
+        "-o",
+        default=None,
+        help="Path to store evaluation results CSV (default: frames_evaluation_results_[datetime].csv)",
+    )
+    return parser.parse_args()
+
+
 def main():
+    # Initialize variables
+    args = parse_args()
+
     # Load dataset
     with timer("Loaded dataset in", logger):
         dataset = load_frames_dataset()
@@ -161,7 +186,6 @@ def main():
         return

     # Initialize variables
-    counter = 0
     results = []
     dataset_length = len(dataset["Prompt"])
@@ -182,18 +206,22 @@ def main():
     # Calculate metrics
     df = pd.DataFrame(results)
-    accuracy = (df["evaluation_decision"] == True).mean()
+    eval_df = df.dropna(subset=["evaluation_decision"])  # Exclude rows with missing evaluation decision
+    accuracy = (eval_df["evaluation_decision"] == True).mean()

     # Calculate accuracy by reasoning type
-    reasoning_type_accuracy = df.groupby("reasoning_type")["evaluation_decision"].apply(lambda x: (x == True).mean())
-
-    # Save results
-    df.to_csv("frames_evaluation_results.csv", index=False)
+    reasoning_type_accuracy = eval_df.groupby("reasoning_type")["evaluation_decision"].apply(
+        lambda x: (x == True).mean()
+    )

     # Print summary
     logger.info(f"\nOverall Accuracy: {accuracy:.2%}")
-    logger.info("\nAccuracy by Reasoning Type:")
-    logger.info(reasoning_type_accuracy)
+    logger.info(f"\nAccuracy by Reasoning Type:\n{reasoning_type_accuracy}")
+
+    # Save results
+    output_file = args.output or f"frames_evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
+    df.to_csv(output_file, index=False)
+    logger.info(f"Results saved to {output_file}")


 if __name__ == "__main__":
@@ -201,12 +229,14 @@ if __name__ == "__main__":
     Evaluate Khoj on the Google FRAMES benchmark.
     Response are evaluated by GEMINI_EVAL_MODEL (default: gemini-pro-1.5-002).

-    Khoj should be running at KHOJ_URL, default at http://localhost:42110.
+    Khoj should be running at KHOJ_URL (default: http://localhost:42110).
     The Gemini judge model is accessed via the Gemini API with your GEMINI_API_KEY.
     To evaluate Khoj in research mode, set the KHOJ_MODE environment variable to "research".

     Run the script using the following command:
     KHOJ_MODE="research" GEMINI_API_KEY="<your_gemini_api_key>" python eval_frames.py
     """
-    with timer("Ran eval in", logger):
+    logger.info(f"{datetime.now()} - Begin Quizzing Khoj on the FRAMES benchmark.")
+    with timer("Ran eval script in", logger, log_level=logging.INFO):
         main()
+    logger.info(f"{datetime.now()} - End Quizzing Khoj on the FRAMES benchmark.")