mirror of
https://github.com/khoj-ai/khoj.git
synced 2024-12-18 18:47:11 +00:00
29e801c381
Evaluate simpler MATH500 responses with gemini 1.5 flash. This improves both the speed and cost of running this eval.
157 lines
5.4 KiB
YAML
157 lines
5.4 KiB
YAML
name: eval

# Triggers: every pushed tag (release) and manual runs from the GitHub UI.
on:
  # Run on every release
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      # Selected value becomes the single khoj_mode entry of the job matrix
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      # Selected value becomes the single dataset entry of the job matrix.
      # Keep the description in sync with the options listed below.
      dataset:
        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      # Exported to the eval step as SAMPLE_SIZE on manual runs
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number

jobs:
  # Starts a Khoj server and code sandbox on the runner, then runs
  # tests/evals/eval.py once per (khoj_mode, dataset) matrix combination.
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Manual runs evaluate only the mode and dataset chosen in the UI.
        # Tag pushes evaluate every mode against the frames and simpleqa datasets.
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}

    services:
      # Postgres (pgvector image) published on localhost:5432; the Run Eval
      # step points the Khoj server at it through the POSTGRES_* variables.
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - 5432:5432
        # Mark the service healthy only once pg_isready succeeds
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # Exposes the hatch-computed version as steps.hatch.outputs.version
      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> $GITHUB_OUTPUT

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # install postgres and other dependencies
          # The job runs directly on the hosted runner as a non-root user, so apt needs sudo.
          # -E keeps DEBIAN_FRONTEND from the step environment.
          sudo -E apt update && sudo -E apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          # NOTE(review): postgresql-server-dev-14 is tied to one Postgres major version;
          # confirm it is still installable on the image ubuntu-latest resolves to.
          sudo -E apt install -y postgresql postgresql-client && sudo -E apt install -y postgresql-server-dev-14
          # upgrade pip
          python -m ensurepip --upgrade && python -m pip install --upgrade pip
          # install terrarium for code sandbox
          git clone https://github.com/cohere-ai/cohere-terrarium.git && cd cohere-terrarium && npm install && mkdir pyodide_cache

      - name: ⬇️ Install Application
        run: |
          # Replace the dynamic version declaration with the version resolved above, then install
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]

      - name: 📝 Run Eval
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # These two keys are withheld for the math500 dataset.
          # `cond && secret` alone evaluates to boolean false when cond fails, which is
          # exported as the string "false"; `|| ''` leaves the variable empty instead.
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || '' }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          POSTGRES_HOST: localhost
          POSTGRES_PORT: 5432
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &

          # Start code sandbox
          npm run dev --prefix cohere-terrarium &

          # Wait for server to be ready (poll every 2s, give up after 120s)
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done

          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always() # Upload results even if tests fail
        # v3 of this action is deprecated. v4 needs a unique artifact name per job,
        # which the version/mode/dataset suffix below provides.
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt

      - name: Display Results
        if: always()
        run: |
          # Read and display summary
          # First line of the summary file becomes the bold headline; the rest goes in a code fence
          echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
          echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY
          echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY
          echo "- Chat Model: Gemini 1.5 Flash 002" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
          tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY

          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat *_evaluation_summary_*.txt