Enable evaluating Khoj on the OpenAI SimpleQA bench using eval script

- Just load the raw csv from OpenAI bucket. Normalize it into FRAMES format
- Improve docstring for frames datasets as well
- Log the load dataset perf timer at info level
This commit is contained in:
Debanjum 2024-11-14 15:55:00 -08:00
parent eb5bc6d9eb
commit d9d5884958

View file

@ -5,6 +5,7 @@ import logging
import os import os
import time import time
from datetime import datetime from datetime import datetime
from io import StringIO
from typing import Any, Dict from typing import Any, Dict
import pandas as pd import pandas as pd
@ -36,11 +37,21 @@ SLEEP_SECONDS = 1 # Delay between API calls to avoid rate limiting
def load_frames_dataset(): def load_frames_dataset():
"""Load the FRAMES benchmark dataset from HuggingFace""" """
Load the Google FRAMES benchmark dataset from HuggingFace
FRAMES is a benchmark dataset to evaluate retrieval and answering capabilities of agents.
It contains ~800 questions requiring multi-hop retrieval and reasoning across various topics.
### Data Fields
- Prompt: The question to be answered
- Answer: The ground truth answer
- reasoning_types: The type of reasoning required to answer the question
"""
try: try:
dataset = load_dataset("google/frames-benchmark") dataset = load_dataset("google/frames-benchmark")
dataset = dataset.shuffle() if RANDOMIZE else dataset
# Use test split for evaluation. Sample and shuffle dataset if configured # Use test split for evaluation. Sample and shuffle dataset if configured
dataset = dataset.shuffle() if RANDOMIZE else dataset
return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"] return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]
except Exception as e: except Exception as e:
@ -48,6 +59,49 @@ def load_frames_dataset():
return None return None
def _parse_simpleqa_topic(raw_metadata):
    """
    Extract the topic from a single SimpleQA metadata cell.

    The cell is expected to be a string holding a dict with a "topic" key.
    It appears to be written as a Python dict literal (single-quoted), so it is
    parsed as a Python literal first. The quote-swapped JSON parse is kept as a
    fallback for cells that are not valid Python literals.
    """
    import ast  # Local import: only this helper needs it

    try:
        metadata = ast.literal_eval(raw_metadata)
    except (ValueError, SyntaxError):
        # Naive quote swap; only works when no value contains an apostrophe
        metadata = json.loads(raw_metadata.replace("'", '"'))
    return metadata["topic"]


def load_simpleqa_dataset():
    """
    Load the OpenAI SimpleQA benchmark dataset from their public bucket.

    SimpleQA is a dataset of moderately difficult q&a for 2024 models to answer across various topics.
    It contains ~4000 human vetted questions and answers with additional metadata.
    Its usage can be seen in openai/simple-evals github repository as well.

    ### Data Fields
    - problem: The question to be answered
    - answer: The ground truth answer
    - metadata: Additional metadata including topic information

    ### Returns
    The benchmark normalized into FRAMES format (Prompt, Answer, reasoning_types) as a HF Dataset.
    It is shuffled if RANDOMIZE is set and truncated to at most SAMPLE_SIZE rows if SAMPLE_SIZE is set.
    Returns None if the dataset could not be downloaded or parsed.
    """
    try:
        # Load SimpleQA benchmark from OpenAI public bucket
        raw_url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
        # Bound the wait so a stalled connection cannot hang the eval run forever
        response = requests.get(raw_url, timeout=60)
        response.raise_for_status()

        # Parse benchmark from raw CSV response
        csv_data = pd.read_csv(StringIO(response.text))
        # Materialize the rows once instead of once per row
        records = csv_data.to_dict("records")

        # Normalize it into FRAMES format. Each row is tagged with its own topic
        formatted_data = [
            {
                "Prompt": record["problem"],
                "Answer": record["answer"],
                "reasoning_types": _parse_simpleqa_topic(record["metadata"]),
            }
            for record in records
        ]

        # Convert benchmark to HF Dataset
        dataset = Dataset.from_list(formatted_data)
        dataset = dataset.shuffle() if RANDOMIZE else dataset
        if SAMPLE_SIZE:
            # Clamp so an oversized sample request returns the whole dataset instead of failing
            sample_size = min(int(SAMPLE_SIZE), len(dataset))
            dataset = dataset.select(range(sample_size))

        return dataset

    except Exception as e:
        logger.error(f"Error loading simpleqa dataset: {e}")
        return None
def get_agent_response(prompt: str) -> str: def get_agent_response(prompt: str) -> str:
"""Get response from the Khoj API""" """Get response from the Khoj API"""
try: try:
@ -176,7 +230,7 @@ def parse_args():
"--dataset", "--dataset",
"-d", "-d",
default="frames", default="frames",
choices=["frames"], choices=["frames", "simpleqa"],
help="Dataset to use for evaluation (default: frames)", help="Dataset to use for evaluation (default: frames)",
) )
return parser.parse_args() return parser.parse_args()
@ -188,9 +242,11 @@ def main():
dataset = None dataset = None
# Load dataset # Load dataset
with timer(f"Loaded {args.dataset} dataset in", logger): with timer(f"Loaded {args.dataset} dataset in", logger, log_level=logging.INFO):
if args.dataset == "frames": if args.dataset == "frames":
dataset = load_frames_dataset() dataset = load_frames_dataset()
elif args.dataset == "simpleqa":
dataset = load_simpleqa_dataset()
if dataset is None: if dataset is None:
return return