Add GitHub workflow to quiz Khoj across modes and specified evals (#982)

- Evaluate Khoj on 200 random questions from each of the Google FRAMES and OpenAI SimpleQA benchmarks across the *general*, *default* and *research* modes
- Run the eval with Gemini 1.5 Flash as the test giver and Gemini 1.5 Pro as the test evaluator model
- Trigger the eval workflow on release or manually
- Make the dataset, Khoj mode and sample size configurable when triggered via manual workflow
- Enable web search and webpage read tools during evaluation
2024-12-03 12:23:02 +01:00 · 2024-11-18 02:19:30 -08:00 · 2024-11-18 02:19:30 -08:00 · 7c0fd71bfd
commit 7c0fd71bfd
parent f75085dc7a
2 changed files with 127 additions and 3 deletions
--- a/.github/workflows/run_evals.yml
+++ b/.github/workflows/run_evals.yml
@ -0,0 +1,122 @@
 name: Run Khoj Evals
 on:
  # Automatic trigger: fires whenever a release is published
  release:
    types:
      - published
  # Manual trigger: started from the GitHub UI with the inputs below
  workflow_dispatch:
    inputs:
      # Which Khoj mode to evaluate
      khoj_mode:
        type: choice
        description: "Khoj Mode (general/default/research)"
        required: true
        default: "default"
        options: [general, default, research]
      # Which benchmark dataset to run
      dataset:
        type: choice
        description: "Dataset to evaluate (frames/simpleqa)"
        required: true
        default: "frames"
        options: [frames, simpleqa]
      # How many examples to evaluate
      sample_size:
        type: number
        description: "Number of samples to evaluate"
        required: false
        default: 200
 jobs:
  # Starts a Khoj server on the runner and runs tests/evals/eval.py against it,
  # once per (khoj_mode, dataset) combination in the matrix.
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Use input from manual trigger if available, else run all combinations
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}
    services:
      # Postgres service container (pgvector image), published on runner port 5432
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
        ports:
          - 5432:5432
        options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted on purpose: an unquoted 3.10 is a YAML float and is read as 3.1
          python-version: "3.10"
      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"
      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        # Steps on GitHub-hosted runners run as a non-root user, so apt needs sudo.
        # NOTE(review): postgresql-server-dev-14 is pinned while runs-on floats with
        # ubuntu-latest; confirm the package exists on the current runner image.
        run: |
          sudo apt update && sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          sudo apt install -y postgresql postgresql-client && sudo apt install -y postgresql-server-dev-14
          python -m ensurepip --upgrade
          python -m pip install --upgrade pip
      - name: ⬇️ Install Application
        run: |
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]
      - name: 📝 Run Evals
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          # inputs.* is only populated for manual runs; fall back to 200 on release
          # so the eval does not silently run with an empty sample size
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          SERPER_DEV_API_KEY: ${{ secrets.SERPER_DEV_API_KEY }}
          OLOSTEP_API_KEY: ${{ secrets.OLOSTEP_API_KEY }}
          # The job runs directly on the runner (no job container), so the postgres
          # service is reached through its published port on localhost
          POSTGRES_HOST: localhost
          POSTGRES_PORT: "5432"
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_DEBUG: "False"             # To disable prompt tracer
          KHOJ_TELEMETRY_DISABLE: "True"  # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &
          # Wait for server to be ready
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done
          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}
      - name: Upload Results
        if: always()  # Upload results even if tests fail
        # v4 requires a unique artifact name per job; the name below already
        # includes the matrix khoj_mode and dataset
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: "*_evaluation_results_*.csv"
--- a/tests/evals/eval.py
+++ b/tests/evals/eval.py
@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
 KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
 KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
 KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
-KHOJ_MODE = os.getenv("KHOJ_MODE")  # E.g research, general, notes etc.
+KHOJ_MODE = os.getenv("KHOJ_MODE", "default")  # E.g research, general, notes etc.
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
@ -32,8 +32,10 @@ GEMINI_API_URL = (
 SAMPLE_SIZE = os.getenv("SAMPLE_SIZE")  # Number of examples to evaluate
 RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true"  # Randomize examples
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10))  # Number of examples to evaluate in parallel
+BATCH_SIZE = int(
-SLEEP_SECONDS = 1  # Delay between API calls to avoid rate limiting
+    os.getenv("BATCH_SIZE", int(SAMPLE_SIZE) / 10 if SAMPLE_SIZE else 10)
 )  # Examples to evaluate in each batch
 SLEEP_SECONDS = 3 if KHOJ_MODE == "general" else 1  # Sleep between API calls to avoid rate limiting
 def load_frames_dataset():