Enable evaluating Khoj on the OpenAI SimpleQA bench using eval script

- Load the raw CSV from the OpenAI public bucket and normalize it into the FRAMES format
- Improve the docstring for the FRAMES dataset loader as well
- Log the dataset-load perf timer at INFO level
parent eb5bc6d9eb
commit d9d5884958

1 changed file with 60 additions and 4 deletions
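For orientation before the diff: the normalization maps each SimpleQA CSV row onto the three fields the FRAMES eval already consumes. A minimal sketch of the resulting record shape (the sample values are illustrative, not pulled from the dataset):

    # One SimpleQA row after normalization into the FRAMES schema
    normalized_row = {
        "Prompt": "Who received the IEEE Frank Rosenblatt Award in 2010?",  # SimpleQA "problem"
        "Answer": "Michio Sugeno",                                          # SimpleQA "answer"
        "reasoning_types": "Science and technology",                        # topic from SimpleQA "metadata"
    }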
@@ -5,6 +5,7 @@ import logging
 import os
 import time
 from datetime import datetime
+from io import StringIO
 from typing import Any, Dict
 
 import pandas as pd
@@ -36,11 +37,21 @@ SLEEP_SECONDS = 1  # Delay between API calls to avoid rate limiting
 
 
 def load_frames_dataset():
-    """Load the FRAMES benchmark dataset from HuggingFace"""
+    """
+    Load the Google FRAMES benchmark dataset from HuggingFace.
+
+    FRAMES is a benchmark dataset to evaluate the retrieval and answering capabilities of agents.
+    It contains ~800 questions requiring multi-hop retrieval and reasoning across various topics.
+
+    ### Data Fields
+    - Prompt: The question to be answered
+    - Answer: The ground truth answer
+    - reasoning_types: The type of reasoning required to answer the question
+    """
     try:
         dataset = load_dataset("google/frames-benchmark")
-        dataset = dataset.shuffle() if RANDOMIZE else dataset
         # Use test split for evaluation. Sample and shuffle dataset if configured
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
         return dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]
 
     except Exception as e:
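The reordering in this hunk also pins down why shuffling happens before slicing: sampling a shuffled split yields a random subset, while slicing first would always evaluate the same leading rows. A minimal standalone sketch of the pattern, with RANDOMIZE and SAMPLE_SIZE standing in for the script's configuration (how the script actually sources them is not shown in this diff):

    from datasets import load_dataset

    RANDOMIZE = True
    SAMPLE_SIZE = "10"  # assumed to arrive as a string, hence int() below

    dataset = load_dataset("google/frames-benchmark")
    # Shuffle first so the slice below is a random sample, not a fixed prefix
    dataset = dataset.shuffle() if RANDOMIZE else dataset
    subset = dataset["test"][: int(SAMPLE_SIZE)] if SAMPLE_SIZE else dataset["test"]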
@@ -48,6 +59,49 @@ def load_frames_dataset():
         return None
 
 
+def load_simpleqa_dataset():
+    """
+    Load the OpenAI SimpleQA benchmark dataset from their public bucket.
+
+    SimpleQA is a dataset of moderately difficult questions for 2024-era models to answer across various topics.
+    It contains ~4000 human-vetted questions and answers with additional metadata.
+    Its usage can also be seen in the openai/simple-evals GitHub repository.
+
+    ### Data Fields
+    - problem: The question to be answered
+    - answer: The ground truth answer
+    - metadata: Additional metadata including topic information
+    """
+
+    try:
+        # Load SimpleQA benchmark from OpenAI public bucket
+        raw_url = "https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv"
+        response = requests.get(raw_url)
+        response.raise_for_status()
+
+        # Parse benchmark from raw CSV response
+        csv_data = pd.read_csv(StringIO(response.text))
+        # Normalize benchmark rows into FRAMES format
+        formatted_data = [
+            {
+                "Prompt": d["problem"],
+                "Answer": d["answer"],
+                "reasoning_types": json.loads(d["metadata"].replace("'", '"'))["topic"],
+            }
+            for d in csv_data.to_dict("records")
+        ]
+
+        # Convert benchmark to HF Dataset
+        dataset = Dataset.from_list(formatted_data)
+        dataset = dataset.shuffle() if RANDOMIZE else dataset
+        dataset = dataset.select(range(int(SAMPLE_SIZE))) if SAMPLE_SIZE else dataset
+
+        return dataset
+    except Exception as e:
+        logger.error(f"Error loading simpleqa dataset: {e}")
+        return None
+
+
 def get_agent_response(prompt: str) -> str:
     """Get response from the Khoj API"""
     try:
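One fragile spot worth noting in the loader above: the metadata column holds Python-dict-style strings (single quotes), so the code rewrites quotes before handing the string to json.loads. That breaks if any value contains an apostrophe. A hedged alternative sketch using the standard library's ast.literal_eval, which parses such strings directly (the sample string is illustrative):

    import ast

    def parse_topic(metadata_str: str) -> str:
        # literal_eval parses Python-literal syntax, quotes and apostrophes included
        return ast.literal_eval(metadata_str)["topic"]

    print(parse_topic("{'topic': 'Art', 'answer_type': 'Person'}"))  # Art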
@@ -176,7 +230,7 @@ def parse_args():
         "--dataset",
         "-d",
         default="frames",
-        choices=["frames"],
+        choices=["frames", "simpleqa"],
         help="Dataset to use for evaluation (default: frames)",
     )
     return parser.parse_args()
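With the new choice registered, argparse validates the dataset flag for free. A small self-contained sketch mirroring the parser change (the surrounding parser construction is assumed, not shown in this diff):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset",
        "-d",
        default="frames",
        choices=["frames", "simpleqa"],
        help="Dataset to use for evaluation (default: frames)",
    )

    args = parser.parse_args(["-d", "simpleqa"])
    print(args.dataset)  # simpleqa; any other value makes argparse exit with an error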
@@ -188,9 +242,11 @@ def main():
     dataset = None
 
     # Load dataset
-    with timer(f"Loaded {args.dataset} dataset in", logger):
+    with timer(f"Loaded {args.dataset} dataset in", logger, log_level=logging.INFO):
         if args.dataset == "frames":
             dataset = load_frames_dataset()
+        elif args.dataset == "simpleqa":
+            dataset = load_simpleqa_dataset()
     if dataset is None:
         return
 
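The timer call now passes log_level=logging.INFO so the dataset-load timing shows up without enabling debug logs. Khoj's actual timer helper is not shown in this diff; a minimal sketch of a context manager with that signature might look like:

    import logging
    import time
    from contextlib import contextmanager

    @contextmanager
    def timer(message: str, logger: logging.Logger, log_level: int = logging.DEBUG):
        # Time the wrapped block and emit one log record at the requested level
        start = time.perf_counter()
        yield
        logger.log(log_level, f"{message} {time.perf_counter() - start:.3f} seconds")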