Improve error handling, display and configurability of eval script

- Default to an evaluation decision of None when either the agent or the evaluator LLM fails. This fixes accuracy calculations on errors (see the sketch below)
- Fix the color used to display a True decision
- Add arg flags to specify the output results file path
Debanjum 2024-11-13 03:13:36 -08:00
parent 15b0cfa3dd
commit f4e37209a2


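The first change above is easiest to see with a small, self-contained pandas sketch (illustrative numbers only, not from a real eval run): when a failed agent or judge call is recorded as None rather than False, it can be excluded from the accuracy denominator instead of being scored as a wrong answer.

# Sketch of the accuracy fix, with made-up decisions for 4 examples
import pandas as pd

# Suppose 4 examples were run and the last one failed (agent or judge error)
df = pd.DataFrame({"evaluation_decision": [True, False, True, None]})

# Naive accuracy treats the failed run as a wrong answer: 2/4 = 50%
naive_accuracy = (df["evaluation_decision"] == True).mean()

# The commit instead excludes undecided rows before scoring: 2/3 ~= 67%
eval_df = df.dropna(subset=["evaluation_decision"])
fixed_accuracy = (eval_df["evaluation_decision"] == True).mean()

print(f"naive={naive_accuracy:.2%} fixed={fixed_accuracy:.2%}")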
@@ -1,18 +1,20 @@
+import argparse
 import concurrent.futures
 import json
 import logging
 import os
 import time
+from datetime import datetime
 from typing import Any, Dict

 import pandas as pd
 import requests
 from datasets import load_dataset

-from khoj.utils.helpers import timer
+from khoj.utils.helpers import is_none_or_empty, timer

 # Configure root logger
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
+logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)

 # Configuration
@@ -82,23 +84,28 @@ def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dic
     try:
         response = requests.post(
             GEMINI_API_URL,
-            headers={"Content-Type": "application/json", "response_mime_type": "application/json"},
-            json={"contents": [{"parts": [{"text": evaluation_prompt}]}]},
+            headers={"Content-Type": "application/json"},
+            json={
+                "contents": [{"parts": [{"text": evaluation_prompt}]}],
+                "generationConfig": {"response_mime_type": "application/json"},
+            },
         )
         response.raise_for_status()

         # Parse evaluation response
-        eval_response = json.loads(clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"]))
-        if "decision" in eval_response and isinstance(eval_response["decision"], str):
-            eval_response["decision"] = eval_response["decision"].upper() == "TRUE"
+        eval_response: dict[str, str] = json.loads(
+            clean_json(response.json()["candidates"][0]["content"]["parts"][0]["text"])
+        )
+        decision = str(eval_response.get("decision", "")).upper() == "TRUE"
+        explanation = eval_response.get("explanation", "")
+
+        # Handle evaluation service errors
+        if "503 Service Error" in explanation:
+            decision = None

         # Extract decision and explanation from structured response
-        return {
-            "decision": eval_response.get("decision", False),
-            "explanation": eval_response.get("explanation", ""),
-        }
+        return decision, explanation
     except Exception as e:
         logger.error(f"Error in evaluation: {e}")
-        return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"}
+        return None, f"Evaluation failed: {str(e)}"


 def process_batch(batch, batch_start, results, dataset_length):
@@ -107,17 +114,17 @@ def process_batch(batch, batch_start, results, dataset_length):
         logger.info(f"Processing example: {current_index}/{dataset_length}")

         # Trigger research mode if enabled
-        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
+        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE and not prompt.startswith(f"/{KHOJ_MODE}") else prompt

         # Get agent response
         agent_response = get_agent_response(prompt)

         # Evaluate response
-        if agent_response is None or agent_response.strip() == "":
-            evaluation["decision"] = False
-            evaluation["explanation"] = "Agent response is empty. This maybe due to a service error."
+        if is_none_or_empty(agent_response):
+            decision = None
+            explanation = "Agent response is empty. This maybe due to a service error."
         else:
-            evaluation = evaluate_response(prompt, agent_response, answer)
+            decision, explanation = evaluate_response(prompt, agent_response, answer)

         # Store results
         results.append(
@@ -126,25 +133,29 @@ def process_batch(batch, batch_start, results, dataset_length):
                 "prompt": prompt,
                 "ground_truth": answer,
                 "agent_response": agent_response,
-                "evaluation_decision": evaluation["decision"],
-                "evaluation_explanation": evaluation["explanation"],
+                "evaluation_decision": decision,
+                "evaluation_explanation": explanation,
                 "reasoning_type": reasoning_type,
             }
         )

-        # Color the decision based on its value
-        decision = evaluation["decision"]
-        decision_color = "green" if decision == True else "red"
+        # Log results
+        decision_color = {True: "green", None: "blue", False: "red"}[decision]
         colored_decision = color_text(str(decision), decision_color)
         logger.info(
-            f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
+            f"Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {explanation}\n"
         )

         time.sleep(SLEEP_SECONDS)  # Rate limiting


 def color_text(text, color):
-    colors = {"red": "\033[91m", "green": "\033[92m", "reset": "\033[0m"}
+    colors = {
+        "red": "\033[91m",  # Bright red
+        "green": "\033[32m",  # Standard green
+        "blue": "\033[94m",  # Bright blue
+        "reset": "\033[0m",
+    }
     return f"{colors[color]}{text}{colors['reset']}"
@@ -153,7 +164,21 @@ def clean_json(response: str):
     return response.strip().replace("\n", "").removeprefix("```json").removesuffix("```")


+def parse_args():
+    parser = argparse.ArgumentParser(description="Evaluate Khoj on the Google FRAMES benchmark.")
+    parser.add_argument(
+        "--output",
+        "-o",
+        default=None,
+        help="Path to store evaluation results CSV (default: frames_evaluation_results_[datetime].csv)",
+    )
+    return parser.parse_args()
+
+
 def main():
+    # Initialize variables
+    args = parse_args()
+
     # Load dataset
     with timer("Loaded dataset in", logger):
         dataset = load_frames_dataset()
@@ -161,7 +186,6 @@ def main():
         return

     # Initialize variables
-    counter = 0
     results = []
     dataset_length = len(dataset["Prompt"])
@@ -182,18 +206,22 @@ def main():
     # Calculate metrics
     df = pd.DataFrame(results)
-    accuracy = (df["evaluation_decision"] == True).mean()
+    eval_df = df.dropna(subset=["evaluation_decision"])  # Exclude rows with missing evaluation decision
+    accuracy = (eval_df["evaluation_decision"] == True).mean()

     # Calculate accuracy by reasoning type
-    reasoning_type_accuracy = df.groupby("reasoning_type")["evaluation_decision"].apply(lambda x: (x == True).mean())
-
-    # Save results
-    df.to_csv("frames_evaluation_results.csv", index=False)
+    reasoning_type_accuracy = eval_df.groupby("reasoning_type")["evaluation_decision"].apply(
+        lambda x: (x == True).mean()
+    )

     # Print summary
     logger.info(f"\nOverall Accuracy: {accuracy:.2%}")
-    logger.info("\nAccuracy by Reasoning Type:")
-    logger.info(reasoning_type_accuracy)
+    logger.info(f"\nAccuracy by Reasoning Type:\n{reasoning_type_accuracy}")
+
+    # Save results
+    output_file = args.output or f"frames_evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
+    df.to_csv(output_file, index=False)
+    logger.info(f"Results saved to {output_file}")


 if __name__ == "__main__":
@@ -201,12 +229,14 @@ if __name__ == "__main__":
     Evaluate Khoj on the Google FRAMES benchmark.
     Response are evaluated by GEMINI_EVAL_MODEL (default: gemini-pro-1.5-002).

-    Khoj should be running at KHOJ_URL, default at http://localhost:42110.
+    Khoj should be running at KHOJ_URL (default: http://localhost:42110).
     The Gemini judge model is accessed via the Gemini API with your GEMINI_API_KEY.
     To evaluate Khoj in research mode, set the KHOJ_MODE environment variable to "research".

     Run the script using the following command:
     KHOJ_MODE="research" GEMINI_API_KEY="<your_gemini_api_key>" python eval_frames.py
     """
-    with timer("Ran eval in", logger):
+    logger.info(f"{datetime.now()} - Begin Quizzing Khoj on the FRAMES benchmark.")
+    with timer("Ran eval script in", logger, log_level=logging.INFO):
         main()
+    logger.info(f"{datetime.now()} - End Quizzing Khoj on the FRAMES benchmark.")