diff --git a/.github/workflows/run_evals.yml b/.github/workflows/run_evals.yml index 37d4fff9..fdca6f12 100644 --- a/.github/workflows/run_evals.yml +++ b/.github/workflows/run_evals.yml @@ -45,9 +45,14 @@ jobs: env: POSTGRES_PASSWORD: postgres POSTGRES_USER: postgres + POSTGRES_DB: postgres ports: - 5432:5432 - options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 steps: - uses: actions/checkout@v3 @@ -57,7 +62,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: '3.10' - name: Get App Version id: hatch @@ -88,7 +93,9 @@ jobs: GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} SERPER_DEV_API_KEY: ${{ secrets.SERPER_DEV_API_KEY }} OLOSTEP_API_KEY: ${{ secrets.OLOSTEP_API_KEY }} - POSTGRES_HOST: postgres + KHOJ_ADMIN_EMAIL: khoj + KHOJ_ADMIN_PASSWORD: khoj + POSTGRES_HOST: localhost POSTGRES_PORT: 5432 POSTGRES_USER: postgres POSTGRES_PASSWORD: postgres @@ -119,4 +126,23 @@ jobs: uses: actions/upload-artifact@v3 with: name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }} - path: "*_evaluation_results_*.csv" + path: | + *_evaluation_results_*.csv + *_evaluation_summary_*.txt + + - name: Display Results + if: always() + run: | + # Read and display summary + echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> $GITHUB_STEP_SUMMARY + echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> $GITHUB_STEP_SUMMARY + echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> $GITHUB_STEP_SUMMARY + echo "- Chat Model: Gemini 1.5 Flash 002" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + tail -n +2 *_evaluation_summary_*.txt >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY + + # Display in logs too + echo "===== EVALUATION RESULTS =====" + cat *_evaluation_summary_*.txt diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c7eace70..2583f351 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,6 +5,7 @@ on: paths: - src/khoj/** - tests/** + - '!tests/evals/**' - config/** - pyproject.toml - .pre-commit-config.yml @@ -15,6 +16,7 @@ on: paths: - src/khoj/** - tests/** + - '!tests/evals/**' - config/** - pyproject.toml - .pre-commit-config.yml diff --git a/tests/evals/eval.py b/tests/evals/eval.py index e3f3b7d0..9c018b76 100644 --- a/tests/evals/eval.py +++ b/tests/evals/eval.py @@ -286,10 +286,23 @@ def main(): logger.info(f"\nOverall Accuracy: {colored_accuracy}") logger.info(f"\nAccuracy by Reasoning Type:\n{reasoning_type_accuracy}") - # Save results + # Save summary to file + sample_type = f"Sampling Type: {SAMPLE_SIZE} samples." if SAMPLE_SIZE else "Whole dataset." + sample_type += " Randomized." if RANDOMIZE else "" + summary = ( + f"Overall Accuracy: {accuracy:.2%}\n\nAccuracy by Reasoning Type:\n{reasoning_type_accuracy}\n\n{sample_type}\n" + ) + summary_file = args.output.replace(".csv", ".txt") if args.output else None + summary_file = ( + summary_file or f"{args.dataset}_evaluation_summary_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.txt" + ) + with open(summary_file, "w") as f: + f.write(summary) + + # Save raw results to file output_file = args.output or f"{args.dataset}_evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv" df.to_csv(output_file, index=False) - logger.info(f"Results saved to {output_file}") + logger.info(f"Results saved to {summary_file}, {output_file}") if __name__ == "__main__":