mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-11-23 15:38:55 +01:00
Add GitHub workflow to quiz Khoj across modes and specified evals (#982)
- Evaluate khoj on 200 random questions from each of the google frames and openai simpleqa benchmarks across *general*, *default* and *research* modes
- Run eval with Gemini 1.5 Flash as test giver and Gemini 1.5 Pro as test evaluator models
- Trigger eval workflow on release or manually
- Make dataset, khoj mode and sample size configurable when triggered via manual workflow
- Enable web search and webpage read tools during evaluation
This commit is contained in:
parent
f75085dc7a
commit
7c0fd71bfd
2 changed files with 127 additions and 3 deletions
122
.github/workflows/run_evals.yml
vendored
Normal file
122
.github/workflows/run_evals.yml
vendored
Normal file
|
@ -0,0 +1,122 @@
|
|||
name: Run Khoj Evals
|
||||
|
||||
on:
|
||||
# Run on every release
|
||||
release:
|
||||
types: [published]
|
||||
# Allow manual triggers from GitHub UI
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
khoj_mode:
|
||||
description: 'Khoj Mode (general/default/research)'
|
||||
required: true
|
||||
default: 'default'
|
||||
type: choice
|
||||
options:
|
||||
- general
|
||||
- default
|
||||
- research
|
||||
dataset:
|
||||
description: 'Dataset to evaluate (frames/simpleqa)'
|
||||
required: true
|
||||
default: 'frames'
|
||||
type: choice
|
||||
options:
|
||||
- frames
|
||||
- simpleqa
|
||||
sample_size:
|
||||
description: 'Number of samples to evaluate'
|
||||
required: false
|
||||
default: 200
|
||||
type: number
|
||||
|
||||
jobs:
|
||||
eval:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
# Use input from manual trigger if available, else run all combinations
|
||||
khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
|
||||
dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: ankane/pgvector
|
||||
env:
|
||||
POSTGRES_PASSWORD: postgres
|
||||
POSTGRES_USER: postgres
|
||||
ports:
|
||||
- 5432:5432
|
||||
options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.10"  # Quoted: a bare 3.10 is parsed by YAML as the float 3.1
|
||||
|
||||
- name: Get App Version
|
||||
id: hatch
|
||||
run: echo "version=$(pipx run hatch version)" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: ⏬️ Install Dependencies
|
||||
env:
|
||||
DEBIAN_FRONTEND: noninteractive
|
||||
run: |
|
||||
# sudo is required: this job runs directly on the hosted runner as a non-root user
sudo apt update && sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
|
||||
sudo apt install -y postgresql postgresql-client && sudo apt install -y postgresql-server-dev-14
|
||||
python -m ensurepip --upgrade
|
||||
python -m pip install --upgrade pip
|
||||
|
||||
- name: ⬇️ Install Application
|
||||
run: |
|
||||
sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
|
||||
pip install --upgrade .[dev]
|
||||
|
||||
- name: 📝 Run Evals
|
||||
env:
|
||||
KHOJ_MODE: ${{ matrix.khoj_mode }}
|
||||
SAMPLE_SIZE: ${{ inputs.sample_size || 200 }}  # inputs are empty on release triggers; fall back to 200 samples
|
||||
BATCH_SIZE: "20"
|
||||
RANDOMIZE: "True"
|
||||
KHOJ_URL: "http://localhost:42110"
|
||||
KHOJ_LLM_SEED: "42"
|
||||
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
|
||||
SERPER_DEV_API_KEY: ${{ secrets.SERPER_DEV_API_KEY }}
|
||||
OLOSTEP_API_KEY: ${{ secrets.OLOSTEP_API_KEY }}
|
||||
POSTGRES_HOST: localhost  # Job runs on the runner (no container), so the service is reached via its mapped port on localhost
|
||||
POSTGRES_PORT: 5432
|
||||
POSTGRES_USER: postgres
|
||||
POSTGRES_PASSWORD: postgres
|
||||
POSTGRES_DB: postgres
|
||||
KHOJ_DEBUG: "False" # To disable prompt tracer
|
||||
KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
|
||||
run: |
|
||||
# Start Khoj server in background
|
||||
khoj --anonymous-mode --non-interactive &
|
||||
|
||||
# Wait for server to be ready
|
||||
timeout=120
|
||||
while ! curl -s http://localhost:42110/api/health > /dev/null; do
|
||||
if [ $timeout -le 0 ]; then
|
||||
echo "Timed out waiting for Khoj server"
|
||||
exit 1
|
||||
fi
|
||||
echo "Waiting for Khoj server..."
|
||||
sleep 2
|
||||
timeout=$((timeout-2))
|
||||
done
|
||||
|
||||
# Run evals
|
||||
python tests/evals/eval.py -d ${{ matrix.dataset }}
|
||||
|
||||
- name: Upload Results
|
||||
if: always() # Upload results even if tests fail
|
||||
uses: actions/upload-artifact@v4  # v3 is deprecated; artifact names below are unique per matrix job as v4 requires
|
||||
with:
|
||||
name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
|
||||
path: "*_evaluation_results_*.csv"
|
|
@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
|
|||
KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
|
||||
KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
|
||||
KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
|
||||
KHOJ_MODE = os.getenv("KHOJ_MODE") # E.g research, general, notes etc.
|
||||
KHOJ_MODE = os.getenv("KHOJ_MODE", "default") # E.g research, general, notes etc.
|
||||
|
||||
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
||||
GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
|
||||
|
@ -32,8 +32,10 @@ GEMINI_API_URL = (
|
|||
|
||||
SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
|
||||
RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
|
||||
BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10)) # Number of examples to evaluate in parallel
|
||||
SLEEP_SECONDS = 1 # Delay between API calls to avoid rate limiting
|
||||
BATCH_SIZE = int(
|
||||
os.getenv("BATCH_SIZE", int(SAMPLE_SIZE) / 10 if SAMPLE_SIZE else 10)
|
||||
) # Examples to evaluate in each batch
|
||||
SLEEP_SECONDS = 3 if KHOJ_MODE == "general" else 1 # Sleep between API calls to avoid rate limiting
|
||||
|
||||
|
||||
def load_frames_dataset():
|
||||
|
|
Loading…
Reference in a new issue