khoj/.github/workflows/run_evals.yml

name: eval

on:
  # Run on every release
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      dataset:
        description: 'Dataset to evaluate (frames/simpleqa)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number

jobs:
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Use input from manual trigger if available, else run all combinations
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}

    services:
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - 5432:5432
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> $GITHUB_OUTPUT

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # install postgres and other dependencies
          apt update && apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          apt install -y postgresql postgresql-client && apt install -y postgresql-server-dev-14
          # upgrade pip
          python -m ensurepip --upgrade && python -m pip install --upgrade pip
          # install terrarium for code sandbox
          git clone https://github.com/cohere-ai/cohere-terrarium.git && cd cohere-terrarium && npm install && mkdir pyodide_cache

      - name: ⬇️ Install Application
        run: |
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]

      - name: 📝 Run Eval
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          POSTGRES_HOST: localhost
          POSTGRES_PORT: 5432
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_TELEMETRY_DISABLE: "True"  # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &

          # Start code sandbox
          npm run dev --prefix cohere-terrarium &

          # Wait for server to be ready
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ $timeout -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done

          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always()  # Upload results even if tests fail
        uses: actions/upload-artifact@v3
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt

      - name: Display Results
        if: always()
        run: |
          # Read and display summary
          echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY
          echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY
          echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY
          echo "- Chat Model: Gemini 1.5 Flash 002" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
          tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY

          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat *_evaluation_summary_*.txt