# Mirror of https://github.com/khoj-ai/khoj.git
# Synced 2025-01-05 11:08:08 +00:00
# Upstream file at sync time: 157 lines, 5.4 KiB, YAML
# Evaluation workflow: benchmarks a locally started Khoj server against a
# question-answering dataset and publishes the results as artifacts.
name: eval

on:
  # Run on every release
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      # Chat mode to evaluate; exposed to the eval step as KHOJ_MODE
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      # Dataset passed to tests/evals/eval.py via its -d flag
      dataset:
        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      # Exposed to the eval step as SAMPLE_SIZE
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number

jobs:
  # Starts Khoj plus its code sandbox on the runner, runs the eval script
  # against it and publishes the resulting CSV/summary files.
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Manual trigger: run only the selected mode and dataset.
        # Tag push: run every mode against the frames and simpleqa datasets.
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}

    services:
      # Postgres with pgvector; credentials match the POSTGRES_* variables
      # given to the "Run Eval" step below.
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - "5432:5432"
        # Folded into a single option string for the service container
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      # Full history is fetched (fetch-depth: 0)
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      # Exposes the hatch-reported version as steps.hatch.outputs.version
      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> $GITHUB_OUTPUT

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # install postgres and other dependencies
          sudo apt update && sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          sudo apt install -y postgresql postgresql-client && sudo apt install -y postgresql-server-dev-14
          # upgrade pip
          python -m ensurepip --upgrade && python -m pip install --upgrade pip
          # install terrarium for code sandbox
          git clone https://github.com/khoj-ai/terrarium.git && cd terrarium && npm install --legacy-peer-deps && mkdir pyodide_cache

      - name: ⬇️ Install Application
        run: |
          # Replace the dynamic version in pyproject.toml with the resolved one
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]

      - name: 📝 Run Eval
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # Withhold the search/scrape keys for math500. The trailing `|| ''`
          # matters: without it the expression yields boolean false, which
          # would be exported as the literal string "false".
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || '' }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          POSTGRES_HOST: localhost
          POSTGRES_PORT: "5432"
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &

          # Start code sandbox
          npm run dev --prefix terrarium &

          # Wait for server to be ready (poll every 2s, give up after 120s)
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done

          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always() # Upload results even if tests fail
        # v3 of this action is deprecated; v4 requires unique artifact names
        # per run, which the mode/dataset suffix below provides.
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt

      - name: Display Results
        if: always()
        run: |
          # This step also runs after a failed eval; if no summary was written
          # there is nothing to show, so stop without adding a second failure.
          if ! compgen -G "*_evaluation_summary_*.txt" > /dev/null; then
            echo "No evaluation summary file found; skipping result display."
            exit 0
          fi

          # Read and display summary
          echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
          echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY
          echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY
          echo "- Chat Model: Gemini 1.5 Flash 002" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
          tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY

          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat *_evaluation_summary_*.txt