CoPilot LLM Performance Evaluation #290

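# CoPilot LLM performance evaluation workflow: builds the test image, writes the
# per-provider LLM/database config files from repository secrets, and runs the
# evaluation suite inside a container. Triggers: pushes to "build-test-ci",
# labeling a pull request against "main" or "dev", a weekly cron
# ('45 22 * * 6' = Saturdays at 22:45 UTC), and manual workflow_dispatch.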
name: CoPilot LLM Performance Evaluation
on:
  push:
    branches: [ "build-test-ci" ]
  pull_request:
    branches: [ "main", "dev" ]
    types: [ labeled ]
  schedule:
    - cron: '45 22 * * 6'
  workflow_dispatch:
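# A single build-and-test job runs on a self-hosted runner labeled "dind"
# (Docker-in-Docker), so the docker build/run/exec calls below talk to the
# runner's local Docker daemon.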
jobs:
  build-and-test:
    runs-on: [ self-hosted, dind ]
    steps:
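      # Check out the repo, set up Buildx, and build the test image from
      # copilot/Dockerfile.tests. With push: false and load: true the image is only
      # loaded into the runner's local Docker daemon, tagged nlqs/tests:0.1.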
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build Docker.tests image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: copilot/Dockerfile.tests
          push: false
          load: true
          tags: nlqs/tests:0.1
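      # Materialize database, vector-store, and per-model LLM config files from
      # repository secrets. The resulting configs/ directory is later bind-mounted
      # into the test container at /code/app/configs. Each *_config.json holds the
      # settings for one provider; the exact JSON schema is defined by the test
      # harness, not by this workflow.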
      - name: Create db config
        run: |
          mkdir configs
          echo "$DB_CONFIG" > configs/db_config.json
          echo "$LLM_CONFIG_OPENAI_GPT4" > configs/openai_gpt4_config.json
          echo "$LLM_CONFIG_OPENAI_GPT4O" > configs/openai_gpt4o_config.json
          echo "$LLM_CONFIG_AZURE_GPT35" > configs/azure_llm_config.json
          echo "$LLM_CONFIG_OPENAI_GPT35" > configs/openai_gpt3.5-turbo_config.json
          echo "$LLM_CONFIG_GCP_TEXT_BISON" > configs/gcp_text-bison_config.json
          echo "$LLM_CONFIG_HUGGINGFACE_PHI3" > configs/huggingface_severless_endpoint_phi3_config.json
          echo "$GCP_CREDS_CONFIG" > configs/GCP_CREDS.json
          echo "$LLM_TEST_EVALUATOR" > configs/test_evaluation_model_config.json
          echo "$LLM_CONFIG_GROQ_MIXTRAL" > configs/groq_mixtral_config.json
          echo "$LLM_CONFIG_BEDROCK_CLAUDE3" > configs/bedrock_config.json
          echo "$LLM_CONFIG_HUGGINGFACE_LLAMA70B" > configs/huggingface_llama70b_config.json
          echo "$LLM_CONFIG_WATSONX_MISTRAL_LARGE" > configs/ibm_watsonx_config.json
          echo "$MILVUS_CONFIG" > configs/milvus_config.json
        env:
          DB_CONFIG: ${{ secrets.DB_CONFIG }}
          LLM_CONFIG_OPENAI_GPT4: ${{ secrets.LLM_CONFIG_OPENAI_GPT4 }}
          LLM_CONFIG_AZURE_GPT35: ${{ secrets.LLM_CONFIG_AZURE_GPT35 }}
          LLM_CONFIG_GCP_TEXT_BISON: ${{ secrets.LLM_CONFIG_GCP_TEXT_BISON }}
          LLM_CONFIG_OPENAI_GPT35: ${{ secrets.LLM_CONFIG_OPENAI_GPT35 }}
          LLM_CONFIG_GROQ_MIXTRAL: ${{ secrets.LLM_CONFIG_GROQ_MIXTRAL }}
          LLM_CONFIG_BEDROCK_CLAUDE3: ${{ secrets.LLM_CONFIG_BEDROCK_CLAUDE3 }}
          LLM_CONFIG_HUGGINGFACE_PHI3: ${{ secrets.LLM_CONFIG_HUGGINGFACE_PHI3 }}
          LLM_CONFIG_OPENAI_GPT4O: ${{ secrets.LLM_CONFIG_OPENAI_GPT4O }}
          LLM_CONFIG_HUGGINGFACE_LLAMA70B: ${{ secrets.LLM_CONFIG_HUGGINGFACE_LLAMA70B }}
          LLM_CONFIG_WATSONX_MISTRAL_LARGE: ${{ secrets.LLM_CONFIG_WATSONX_MISTRAL_LARGE }}
          GCP_CREDS_CONFIG: ${{ secrets.GCP_CREDS_CONFIG }}
          LLM_TEST_EVALUATOR: ${{ secrets.LLM_TEST_EVALUATOR }}
          MILVUS_CONFIG: ${{ secrets.MILVUS_CONFIG }}
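      # Start a detached test container with the generated configs/ directory mounted
      # at /code/app/configs. GOOGLE_APPLICATION_CREDENTIALS points at the mounted GCP
      # key file, WANDB_API_KEY is presumably used by the harness to publish results to
      # Weights & Biases, and PR_NUMBER labels the run (the pull request number here,
      # "DailyRegression" for scheduled runs).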
      - name: Run Docker Container for PR
        if: github.event_name == 'pull_request'
        run: |
          docker rm -f nlqs-tests || true
          docker run -it -v $(pwd)/configs/:/code/app/configs -e GOOGLE_APPLICATION_CREDENTIALS=/code/app/configs/GCP_CREDS.json -e WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} -e PR_NUMBER=${{ github.event.number }} --name nlqs-tests -d nlqs/tests:0.1
      - name: Run Docker Container for Regress
        if: github.event_name == 'schedule'
        run: |
          docker rm -f nlqs-tests || true
          docker run -it -v $(pwd)/configs/:/code/app/configs -e GOOGLE_APPLICATION_CREDENTIALS=/code/app/configs/GCP_CREDS.json -e WANDB_API_KEY=${{ secrets.WANDB_API_KEY }} -e PR_NUMBER="DailyRegression" --name nlqs-tests -d nlqs/tests:0.1
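      # run_tests.sh is executed inside the container. For PR runs the first argument
      # is the label that triggered the workflow (used by the script to select what to
      # evaluate; the exact argument semantics live in run_tests.sh) and the second is
      # "all". The script writes the published report URL to /code/tests/report_url.txt,
      # which is exported as REPORT_URL for later steps.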
      - name: Execute PR Tests
        if: github.event_name == 'pull_request'
        run: |
          status=0
          docker exec nlqs-tests bash -c "./run_tests.sh ${{ github.event.label.name }} all" || status=$?
          if [ $status -ne 0 ]; then
            echo "test failed with status $status"
            exit $status
          fi
          url=$(docker exec nlqs-tests bash -c "cat /code/tests/report_url.txt")
          echo "REPORT_URL=${url}" >> $GITHUB_ENV
      - name: Execute Regress Tests
        if: github.event_name == 'schedule'
        run: |
          status=0
          docker exec nlqs-tests bash -c "./run_tests.sh all all" || status=$?
          if [ $status -ne 0 ]; then
            echo "test failed with status $status"
            exit $status
          fi
          url=$(docker exec nlqs-tests bash -c "cat /code/tests/report_url.txt")
          echo "REPORT_URL=${url}" >> $GITHUB_ENV
      - name: Add PR Comment
        if: github.event_name == 'pull_request'
        uses: mshick/add-pr-comment@v2.8.1
        with:
          message: |
            Test Report: ${{ env.REPORT_URL }}
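      # Remove the test container and log the report URL. This step has no
      # "if: always()" guard, so it only runs when the preceding steps succeed.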
      - name: Cleanup
        run: |
          docker rm -f nlqs-tests
          echo ${{ env.REPORT_URL }}