mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Add GitHub workflow to quiz Khoj across modes and specified evals (#982)
- Evaluate Khoj on 200 random questions from each of the Google FRAMES and OpenAI SimpleQA benchmarks across the *general*, *default* and *research* modes
- Run the eval with Gemini 1.5 Flash as the test giver model and Gemini 1.5 Pro as the test evaluator model
- Trigger the eval workflow on release or manually
- Make the dataset, Khoj mode and sample size configurable when triggered via manual workflow
- Enable the web search and webpage read tools during evaluation
This commit is contained in:
parent
f75085dc7a
commit
7c0fd71bfd
2 changed files with 127 additions and 3 deletions
122
.github/workflows/run_evals.yml
vendored
Normal file
122
.github/workflows/run_evals.yml
vendored
Normal file
|
@ -0,0 +1,122 @@
|
||||||
|
# Quiz a locally started Khoj server against the selected eval datasets.
#
# Triggers:
#   - every published release: runs every khoj_mode x dataset combination
#   - manual dispatch: runs only the mode/dataset picked in the GitHub UI
#
# Each matrix job starts a pgvector Postgres service, installs Khoj from the
# checked-out source, launches the server in the background, runs
# tests/evals/eval.py against it and uploads the resulting CSV as an artifact.
name: Run Khoj Evals

on:
  # Run on every release
  release:
    types: [published]
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      dataset:
        description: 'Dataset to evaluate (frames/simpleqa)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number

jobs:
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Use input from manual trigger if available, else run all combinations
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}

    services:
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
        ports:
          - "5432:5432"
        options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5

    steps:
      - uses: actions/checkout@v3
        with:
          # Full history, not a shallow clone
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Must be quoted: an unquoted 3.10 is a YAML float and becomes 3.1
          python-version: '3.10'

      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # Steps on the hosted runner run as a non-root user, so apt needs sudo.
          # -E keeps DEBIAN_FRONTEND from the step environment.
          sudo -E apt update && sudo -E apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          sudo -E apt install -y postgresql postgresql-client && sudo -E apt install -y postgresql-server-dev-14
          python -m ensurepip --upgrade
          python -m pip install --upgrade pip

      - name: ⬇️ Install Application
        run: |
          # Pin the dynamic version in pyproject.toml to the version hatch reported
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]

      - name: 📝 Run Evals
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          # `inputs` is only populated on manual dispatch; default to 200 on release
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          SERPER_DEV_API_KEY: ${{ secrets.SERPER_DEV_API_KEY }}
          OLOSTEP_API_KEY: ${{ secrets.OLOSTEP_API_KEY }}
          # The job runs directly on the runner (no `container:`), so the
          # postgres service is reached through the port mapped onto localhost
          POSTGRES_HOST: localhost
          POSTGRES_PORT: "5432"
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_DEBUG: "False"  # To disable prompt tracer
          KHOJ_TELEMETRY_DISABLE: "True"  # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &

          # Wait for server to be ready, polling every 2s for up to 120s
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done

          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always()  # Upload results even if tests fail
        # v3 of upload-artifact is retired; artifact names below are already
        # unique per matrix job, as v4 requires
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: "*_evaluation_results_*.csv"
|
|
@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
|
||||||
KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
|
KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
|
||||||
KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
|
KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
|
||||||
KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
|
KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
|
||||||
KHOJ_MODE = os.getenv("KHOJ_MODE") # E.g research, general, notes etc.
|
KHOJ_MODE = os.getenv("KHOJ_MODE", "default") # E.g research, general, notes etc.
|
||||||
|
|
||||||
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
||||||
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
|
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
|
||||||
|
@ -32,8 +32,10 @@ GEMINI_API_URL = (
|
||||||
|
|
||||||
SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
|
SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
|
||||||
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
|
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
|
||||||
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10)) # Number of examples to evaluate in parallel
|
BATCH_SIZE = int(
|
||||||
SLEEP_SECONDS = 1 # Delay between API calls to avoid rate limiting
|
os.getenv("BATCH_SIZE", int(SAMPLE_SIZE) / 10 if SAMPLE_SIZE else 10)
|
||||||
|
) # Examples to evaluate in each batch
|
||||||
|
SLEEP_SECONDS = 3 if KHOJ_MODE == "general" else 1 # Sleep between API calls to avoid rate limiting
|
||||||
|
|
||||||
|
|
||||||
def load_frames_dataset():
|
def load_frames_dataset():
|
||||||
|
|
Loading…
Reference in a new issue