From 7c0fd71bfd2803f40b1c39937748b4ab67108c28 Mon Sep 17 00:00:00 2001
From: Debanjum
Date: Mon, 18 Nov 2024 02:19:30 -0800
Subject: [PATCH] Add GitHub workflow to quiz Khoj across modes and specified
 evals (#982)

- Evaluate khoj on random 200 questions from each of google frames and
  openai simpleqa benchmarks across *general*, *default* and *research* modes
- Run eval with Gemini 1.5 Flash as test giver and Gemini 1.5 Pro as test
  evaluator models
- Trigger eval workflow on release or manually
- Make dataset, khoj mode and sample size configurable when triggered via
  manual workflow
- Enable Web search, webpage read tools during evaluation
---
 .github/workflows/run_evals.yml | 122 ++++++++++++++++++++++++++++++++
 tests/evals/eval.py             |   8 ++-
 2 files changed, 127 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/run_evals.yml

diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml
new file mode 100644
index 00000000..37d4fff9
--- /dev/null
+++ b/.github/workflows/run_evals.yml
@@ -0,0 +1,122 @@
+name: Run Khoj Evals
+
+on:
+  # Run on every release
+  release:
+    types: [published]
+  # Allow manual triggers from GitHub UI
+  workflow_dispatch:
+    inputs:
+      khoj_mode:
+        description: 'Khoj Mode (general/default/research)'
+        required: true
+        default: 'default'
+        type: choice
+        options:
+          - general
+          - default
+          - research
+      dataset:
+        description: 'Dataset to evaluate (frames/simpleqa)'
+        required: true
+        default: 'frames'
+        type: choice
+        options:
+          - frames
+          - simpleqa
+      sample_size:
+        description: 'Number of samples to evaluate'
+        required: false
+        default: 200
+        type: number
+
+jobs:
+  eval:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Use input from manual trigger if available, else run all combinations
+        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
+        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}
+
+    services:
+      postgres:
+        image: ankane/pgvector
+        env:
+          POSTGRES_PASSWORD: postgres
+          POSTGRES_USER: postgres
+        ports:
+          - 5432:5432
+        options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"  # quoted; bare 3.10 is parsed as YAML float 3.1
+
+      - name: Get App Version
+        id: hatch
+        run: echo "version=$(pipx run hatch version)" >> $GITHUB_OUTPUT
+
+      - name: ⏬️ Install Dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          sudo apt update && sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
+          sudo apt install -y postgresql postgresql-client && sudo apt install -y postgresql-server-dev-14
+          python -m ensurepip --upgrade
+          python -m pip install --upgrade pip
+
+      - name: ⬇️ Install Application
+        run: |
+          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
+          pip install --upgrade .[dev]
+
+      - name: 📝 Run Evals
+        env:
+          KHOJ_MODE: ${{ matrix.khoj_mode }}
+          SAMPLE_SIZE: ${{ inputs.sample_size || 200 }}  # inputs empty on release trigger; default to 200
+          BATCH_SIZE: "20"
+          RANDOMIZE: "True"
+          KHOJ_URL: "http://localhost:42110"
+          KHOJ_LLM_SEED: "42"
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          SERPER_DEV_API_KEY: ${{ secrets.SERPER_DEV_API_KEY }}
+          OLOSTEP_API_KEY: ${{ secrets.OLOSTEP_API_KEY }}
+          POSTGRES_HOST: localhost  # job is not containerized, so service is reachable only via mapped port on localhost
+          POSTGRES_PORT: 5432
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+          POSTGRES_DB: postgres
+          KHOJ_DEBUG: "False" # To disable prompt tracer
+          KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
+        run: |
+          # Start Khoj server in background
+          khoj --anonymous-mode --non-interactive &
+
+          # Wait for server to be ready
+          timeout=120
+          while ! curl -s http://localhost:42110/api/health > /dev/null; do
+            if [ $timeout -le 0 ]; then
+              echo "Timed out waiting for Khoj server"
+              exit 1
+            fi
+            echo "Waiting for Khoj server..."
+            sleep 2
+            timeout=$((timeout-2))
+          done
+
+          # Run evals
+          python tests/evals/eval.py -d ${{ matrix.dataset }}
+
+      - name: Upload Results
+        if: always() # Upload results even if tests fail
+        uses: actions/upload-artifact@v3
+        with:
+          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
+          path: "*_evaluation_results_*.csv"
diff --git a/tests/evals/eval.py b/tests/evals/eval.py
index 8716e4d1..e3f3b7d0 100644
--- a/tests/evals/eval.py
+++ b/tests/evals/eval.py
@@ -22,7 +22,7 @@ logger = logging.getLogger(__name__)
 KHOJ_URL = os.getenv("KHOJ_URL", "http://localhost:42110")
 KHOJ_CHAT_API_URL = f"{KHOJ_URL}/api/chat"
 KHOJ_API_KEY = os.getenv("KHOJ_API_KEY")
-KHOJ_MODE = os.getenv("KHOJ_MODE") # E.g research, general, notes etc.
+KHOJ_MODE = os.getenv("KHOJ_MODE", "default") # E.g research, general, notes etc.
 
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
 GEMINI_EVAL_MODEL = os.getenv("GEMINI_EVAL_MODEL", "gemini-1.5-pro-002")
@@ -32,8 +32,10 @@ GEMINI_API_URL = (
 
 SAMPLE_SIZE = os.getenv("SAMPLE_SIZE") # Number of examples to evaluate
 RANDOMIZE = os.getenv("RANDOMIZE", "false").lower() == "true" # Randomize examples
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", 10)) # Number of examples to evaluate in parallel
-SLEEP_SECONDS = 1 # Delay between API calls to avoid rate limiting
+BATCH_SIZE = int(
+    os.getenv("BATCH_SIZE", max(1, int(SAMPLE_SIZE) // 10) if SAMPLE_SIZE else 10)
+) # Examples to evaluate in each batch; floor of 1 avoids a zero batch for small samples
+SLEEP_SECONDS = 3 if KHOJ_MODE == "general" else 1 # Sleep between API calls to avoid rate limiting
 
 
 def load_frames_dataset():