Run prompt batches in parallel for faster eval runs

Debanjum 2024-11-02 04:58:03 -07:00
parent 96904e0769
commit 791eb205f6


@@ -1,3 +1,4 @@
+import concurrent.futures
 import json
 import os
 import time
@@ -90,6 +91,43 @@ def evaluate_response(query: str, agent_response: str, ground_truth: str) -> Dict:
         return {"decision": "FALSE", "explanation": f"Evaluation failed: {str(e)}"}
+
+def process_batch(batch, counter, results, dataset_length):
+    for prompt, answer, reasoning_type in batch:
+        counter += 1
+        print(f"Processing example: {counter}/{dataset_length}")
+
+        # Trigger research mode if enabled
+        prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
+
+        # Get agent response
+        agent_response = get_agent_response(prompt)
+
+        # Evaluate response
+        evaluation = evaluate_response(prompt, agent_response, answer)
+
+        # Store results
+        results.append(
+            {
+                "index": counter,
+                "prompt": prompt,
+                "ground_truth": answer,
+                "agent_response": agent_response,
+                "evaluation_decision": evaluation["decision"],
+                "evaluation_explanation": evaluation["explanation"],
+                "reasoning_type": reasoning_type,
+            }
+        )
+
+        # Color the decision based on its value
+        decision_color = "green" if evaluation["decision"] == True else "red"
+        colored_decision = color_text(evaluation["decision"], decision_color)
+        print(
+            f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
+        )
+
+        time.sleep(SLEEP_SECONDS)  # Rate limiting
+
 def color_text(text, color):
     colors = {"red": "\033[91m", "green": "\033[92m", "reset": "\033[0m"}
     return f"{colors[color]}{text}{colors['reset']}"
@@ -109,49 +147,21 @@ def main():
     # Initialize variables
     counter = 0
     results = []
+    dataset_length = len(dataset["Prompt"])

     # Process examples in batches
-    for i in range(0, len(dataset), BATCH_SIZE):
-        batch = zip(
-            dataset["Prompt"][i : i + BATCH_SIZE],
-            dataset["Answer"][i : i + BATCH_SIZE],
-            dataset["reasoning_types"][i : i + BATCH_SIZE],
-        )
-
-        for prompt, answer, reasoning_type in batch:
-            counter += 1
-            print(f'Processing example: {counter}/{len(dataset["Prompt"])}')
-
-            # Trigger research mode if enabled
-            prompt = f"/{KHOJ_MODE} {prompt}" if KHOJ_MODE else prompt
-
-            # Get agent response
-            agent_response = get_agent_response(prompt)
-
-            # Evaluate response
-            evaluation = evaluate_response(agent_response, answer)
-
-            # Store results
-            results.append(
-                {
-                    "index": i,
-                    "prompt": prompt,
-                    "ground_truth": answer,
-                    "agent_response": agent_response,
-                    "evaluation_decision": evaluation["decision"],
-                    "evaluation_explanation": evaluation["explanation"],
-                    "reasoning_type": reasoning_type,
-                }
-            )
-
-            # Color the decision based on its value
-            decision_color = "green" if evaluation["decision"] == True else "red"
-            colored_decision = color_text(evaluation["decision"], decision_color)
-            print(
-                f'Decision: {colored_decision}\nQuestion: {prompt}\nExpected Answer: {answer}\nAgent Answer: {agent_response}\nExplanation: {evaluation["explanation"]}\n'
-            )
-
-            time.sleep(SLEEP_SECONDS)  # Rate limiting
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(0, dataset_length, BATCH_SIZE):
+            batch = zip(
+                dataset["Prompt"][i : i + BATCH_SIZE],
+                dataset["Answer"][i : i + BATCH_SIZE],
+                dataset["reasoning_types"][i : i + BATCH_SIZE],
+            )
+            futures.append(executor.submit(process_batch, batch, counter, results, dataset_length))
+
+        # Wait for all futures to complete
+        concurrent.futures.wait(futures)

     # Calculate metrics
     df = pd.DataFrame(results)
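
For reference, the concurrency model this commit adopts is the standard concurrent.futures submit-and-wait pattern: slice the dataset into batches, hand each batch to a ThreadPoolExecutor, and block until every future completes. Below is a minimal, self-contained sketch of that pattern, not code from the commit; score_item, score_batch, items, and batch_size are hypothetical stand-ins for the eval script's process_batch and dataset.

import concurrent.futures

def score_item(item):
    # Hypothetical per-example work, standing in for the agent call + evaluation
    return {"item": item, "length": len(item)}

def score_batch(batch):
    # Batch wrapper, analogous to process_batch in the commit
    return [score_item(item) for item in batch]

items = ["alpha", "beta", "gamma", "delta", "epsilon"]
batch_size = 2
results = []

with concurrent.futures.ThreadPoolExecutor() as executor:
    # One future per batch, mirroring executor.submit(process_batch, ...)
    futures = [
        executor.submit(score_batch, items[i : i + batch_size])
        for i in range(0, len(items), batch_size)
    ]
    # Block until every batch finishes, as concurrent.futures.wait(futures) does above
    done, _ = concurrent.futures.wait(futures)
    for future in done:
        results.extend(future.result())

print(results)

One difference worth noting: the sketch collects rows from each future's return value, whereas the commit passes the shared results list into every worker and appends from the threads; under CPython's GIL, list.append is atomic, so both approaches gather all rows.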