From 4b533608b607a71e6383a962b70fee868ba1f50a Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 24 Mar 2026 18:21:32 -0700 Subject: [PATCH] Python: Update sample validation scripts (#4870) * Update sample validation scripts * Adjust prompt * Update autogen-migration samples * Add fix suggestion * Split jobs * Add .env * Create trend report * Add timestamp * Add more env vars * Comments * force node24 * force node24 * force node22 --- .../sample-validation-setup/action.yml | 4 +- .../workflows/python-sample-validation.yml | 528 +++++++++++++++++- .../chat_client/built_in_chat_clients.py | 16 +- .../01_round_robin_group_chat.py | 26 +- .../orchestrations/02_selector_group_chat.py | 22 +- .../orchestrations/03_swarm.py | 21 +- .../orchestrations/04_magentic_one.py | 21 +- .../single_agent/01_basic_assistant_agent.py | 19 +- .../02_assistant_agent_with_tool.py | 20 +- .../03_assistant_agent_thread_and_stream.py | 19 +- .../single_agent/04_agent_as_tool.py | 20 +- python/scripts/sample_validation/README.md | 13 +- python/scripts/sample_validation/__main__.py | 10 +- python/scripts/sample_validation/aggregate.py | 224 ++++++++ .../create_dynamic_workflow_executor.py | 104 ++-- python/scripts/sample_validation/discovery.py | 24 +- python/scripts/sample_validation/models.py | 20 +- python/scripts/sample_validation/report.py | 16 +- ...un_dynamic_validation_workflow_executor.py | 3 +- 19 files changed, 928 insertions(+), 202 deletions(-) create mode 100644 python/scripts/sample_validation/aggregate.py diff --git a/.github/actions/sample-validation-setup/action.yml b/.github/actions/sample-validation-setup/action.yml index 3736348579..2920aaa5bd 100644 --- a/.github/actions/sample-validation-setup/action.yml +++ b/.github/actions/sample-validation-setup/action.yml @@ -24,7 +24,9 @@ runs: using: "composite" steps: - name: Set up Node.js environment - uses: actions/setup-node@v4 + uses: actions/setup-node@v6 + with: + node-version: 22 - name: Install Copilot CLI shell: bash 
diff --git a/.github/workflows/python-sample-validation.yml b/.github/workflows/python-sample-validation.yml index 4a14e6b41b..90fecec6a2 100644 --- a/.github/workflows/python-sample-validation.yml +++ b/.github/workflows/python-sample-validation.yml @@ -41,6 +41,13 @@ jobs: azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} os: ${{ runner.os }} + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env + echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env + - name: Run sample validation run: | cd scripts && uv run python -m sample_validation --subdir 01-get-started --save-report --report-name 01-get-started @@ -50,7 +57,7 @@ jobs: if: always() with: name: validation-report-01-get-started - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ validate-02-agents: name: Validate 02-agents @@ -64,10 +71,13 @@ jobs: AZURE_OPENAI_ENDPOINT: ${{ vars.AZUREOPENAI__ENDPOINT }} AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__CHATDEPLOYMENTNAME }} AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }} + AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__EMBEDDINGDEPLOYMENTNAME }} # OpenAI configuration OPENAI_API_KEY: ${{ secrets.OPENAI__APIKEY }} OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI__CHATMODELID }} OPENAI_RESPONSES_MODEL_ID: ${{ vars.OPENAI__RESPONSESMODELID }} + # GitHub MCP + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} # Observability ENABLE_INSTRUMENTATION: "true" defaults: @@ -84,16 +94,420 @@ jobs: azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} os: ${{ runner.os }} + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo 
"AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env + echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=$AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME" >> .env + echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env + echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env + echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env + echo "GITHUB_PAT=$GITHUB_PAT" >> .env + - name: Run sample validation run: | - cd scripts && uv run python -m sample_validation --subdir 02-agents --save-report --report-name 02-agents + cd scripts && uv run python -m sample_validation --subdir 02-agents --exclude providers --save-report --report-name 02-agents - name: Upload validation report uses: actions/upload-artifact@v7 if: always() with: name: validation-report-02-agents - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ + + validate-02-agents-openai: + name: Validate 02-agents/providers/openai + runs-on: ubuntu-latest + environment: integration + env: + OPENAI_API_KEY: ${{ secrets.OPENAI__APIKEY }} + OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI__CHATMODELID }} + OPENAI_RESPONSES_MODEL_ID: ${{ vars.OPENAI__RESPONSESMODELID }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Create .env for samples + run: | + echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env + echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env + echo 
"OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/openai --save-report --report-name 02-agents-openai + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-openai + path: python/samples/sample_validation/reports/ + + validate-02-agents-azure-openai: + name: Validate 02-agents/providers/azure_openai + runs-on: ubuntu-latest + environment: integration + env: + AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }} + AZURE_OPENAI_ENDPOINT: ${{ vars.AZUREOPENAI__ENDPOINT }} + AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__CHATDEPLOYMENTNAME }} + AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env + echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/azure_openai --save-report --report-name 02-agents-azure-openai + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-azure-openai + path: python/samples/sample_validation/reports/ + + 
validate-02-agents-azure-ai: + name: Validate 02-agents/providers/azure_ai + runs-on: ubuntu-latest + environment: integration + env: + AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }} + AZURE_AI_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }} + AZURE_AI_CHAT_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__CHATDEPLOYMENTNAME }} + AZURE_AI_EMBEDDING_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__EMBEDDINGDEPLOYMENTNAME }} + BING_CONNECTION_ID: ${{ secrets.BING_CONNECTION_ID }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env + echo "AZURE_AI_CHAT_MODEL_DEPLOYMENT_NAME=$AZURE_AI_CHAT_MODEL_DEPLOYMENT_NAME" >> .env + echo "AZURE_AI_EMBEDDING_MODEL_DEPLOYMENT_NAME=$AZURE_AI_EMBEDDING_MODEL_DEPLOYMENT_NAME" >> .env + echo "BING_CONNECTION_ID=$BING_CONNECTION_ID" >> .env + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/azure_ai --save-report --report-name 02-agents-azure-ai + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-azure-ai + path: python/samples/sample_validation/reports/ + + validate-02-agents-azure-ai-agent: + name: Validate 02-agents/providers/azure_ai_agent + runs-on: ubuntu-latest + environment: integration + env: + AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }} + AZURE_AI_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }} + defaults: + 
run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/azure_ai_agent --save-report --report-name 02-agents-azure-ai-agent + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-azure-ai-agent + path: python/samples/sample_validation/reports/ + + validate-02-agents-anthropic: + name: Validate 02-agents/providers/anthropic + runs-on: ubuntu-latest + environment: integration + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + ANTHROPIC_CHAT_MODEL_ID: ${{ vars.ANTHROPIC_CHAT_MODEL_ID }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Create .env for samples + run: | + echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env + echo "ANTHROPIC_CHAT_MODEL_ID=$ANTHROPIC_CHAT_MODEL_ID" >> .env + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/anthropic --save-report --report-name 02-agents-anthropic + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: 
validation-report-02-agents-anthropic + path: python/samples/sample_validation/reports/ + + validate-02-agents-github-copilot: + name: Validate 02-agents/providers/github_copilot + runs-on: ubuntu-latest + environment: integration + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/github_copilot --save-report --report-name 02-agents-github-copilot + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-github-copilot + path: python/samples/sample_validation/reports/ + + validate-02-agents-amazon: + name: Validate 02-agents/providers/amazon + if: false # Temporarily disabled - requires AWS credentials + runs-on: ubuntu-latest + environment: integration + env: + BEDROCK_CHAT_MODEL_ID: ${{ vars.BEDROCK__CHATMODELID }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/amazon --save-report --report-name 02-agents-amazon + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-amazon + path: python/samples/sample_validation/reports/ + + validate-02-agents-ollama: + name: 
Validate 02-agents/providers/ollama + if: false # Temporarily disabled - requires local Ollama server + runs-on: ubuntu-latest + environment: integration + env: + OLLAMA_MODEL: ${{ vars.OLLAMA__MODEL }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/ollama --save-report --report-name 02-agents-ollama + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-ollama + path: python/samples/sample_validation/reports/ + + validate-02-agents-foundry-local: + name: Validate 02-agents/providers/foundry_local + if: false # Temporarily disabled - requires local Foundry setup + runs-on: ubuntu-latest + environment: integration + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/foundry_local --save-report --report-name 02-agents-foundry-local + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-foundry-local + path: python/samples/sample_validation/reports/ + + validate-02-agents-copilotstudio: + name: Validate 02-agents/providers/copilotstudio + if: false # Temporarily disabled 
- requires Copilot Studio setup + runs-on: ubuntu-latest + environment: integration + env: + COPILOTSTUDIOAGENT__ENVIRONMENTID: ${{ secrets.COPILOTSTUDIOAGENT__ENVIRONMENTID }} + COPILOTSTUDIOAGENT__SCHEMANAME: ${{ secrets.COPILOTSTUDIOAGENT__SCHEMANAME }} + COPILOTSTUDIOAGENT__TENANTID: ${{ secrets.COPILOTSTUDIOAGENT__TENANTID }} + COPILOTSTUDIOAGENT__AGENTAPPID: ${{ secrets.COPILOTSTUDIOAGENT__AGENTAPPID }} + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ runner.os }} + + - name: Create .env for samples + run: | + echo "COPILOTSTUDIOAGENT__ENVIRONMENTID=$COPILOTSTUDIOAGENT__ENVIRONMENTID" >> .env + echo "COPILOTSTUDIOAGENT__SCHEMANAME=$COPILOTSTUDIOAGENT__SCHEMANAME" >> .env + echo "COPILOTSTUDIOAGENT__TENANTID=$COPILOTSTUDIOAGENT__TENANTID" >> .env + echo "COPILOTSTUDIOAGENT__AGENTAPPID=$COPILOTSTUDIOAGENT__AGENTAPPID" >> .env + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/copilotstudio --save-report --report-name 02-agents-copilotstudio + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-copilotstudio + path: python/samples/sample_validation/reports/ + + validate-02-agents-custom: + name: Validate 02-agents/providers/custom + runs-on: ubuntu-latest + environment: integration + defaults: + run: + working-directory: python + steps: + - uses: actions/checkout@v6 + + - name: Setup environment + uses: ./.github/actions/sample-validation-setup + with: + azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} + azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} + azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} + os: ${{ 
runner.os }} + + - name: Run sample validation + run: | + cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/custom --save-report --report-name 02-agents-custom + + - name: Upload validation report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-report-02-agents-custom + path: python/samples/sample_validation/reports/ validate-03-workflows: name: Validate 03-workflows @@ -121,6 +535,14 @@ jobs: azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} os: ${{ runner.os }} + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env + echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env + - name: Run sample validation run: | cd scripts && uv run python -m sample_validation --subdir 03-workflows --save-report --report-name 03-workflows @@ -130,7 +552,7 @@ jobs: if: always() with: name: validation-report-03-workflows - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ validate-04-hosting: name: Validate 04-hosting @@ -169,7 +591,7 @@ jobs: if: always() with: name: validation-report-04-hosting - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ validate-05-end-to-end: name: Validate 05-end-to-end @@ -213,7 +635,7 @@ jobs: if: always() with: name: validation-report-05-end-to-end - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ validate-autogen-migration: name: Validate autogen-migration @@ -244,6 +666,16 @@ jobs: azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} os: ${{ runner.os }} + - name: Create .env for samples + run: | + echo 
"AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env + echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env + echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env + echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env + echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env + - name: Run sample validation run: | cd scripts && uv run python -m sample_validation --subdir autogen-migration --save-report --report-name autogen-migration @@ -253,7 +685,7 @@ jobs: if: always() with: name: validation-report-autogen-migration - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ validate-semantic-kernel-migration: name: Validate semantic-kernel-migration @@ -290,6 +722,21 @@ jobs: azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} os: ${{ runner.os }} + - name: Create .env for samples + run: | + echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env + echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env + echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env + echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env + echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env + echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env + echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env + echo "COPILOTSTUDIOAGENT__ENVIRONMENTID=$COPILOTSTUDIOAGENT__ENVIRONMENTID" >> .env + echo "COPILOTSTUDIOAGENT__SCHEMANAME=$COPILOTSTUDIOAGENT__SCHEMANAME" >> .env + echo "COPILOTSTUDIOAGENT__TENANTID=$COPILOTSTUDIOAGENT__TENANTID" >> .env + echo "COPILOTSTUDIOAGENT__AGENTAPPID=$COPILOTSTUDIOAGENT__AGENTAPPID" >> .env + - name: Run sample validation run: | cd scripts && uv run 
python -m sample_validation --subdir semantic-kernel-migration --save-report --report-name semantic-kernel-migration @@ -299,4 +746,69 @@ jobs: if: always() with: name: validation-report-semantic-kernel-migration - path: python/scripts/sample_validation/reports/ + path: python/samples/sample_validation/reports/ + + aggregate-results: + name: Aggregate Results + runs-on: ubuntu-latest + if: always() + needs: + - validate-01-get-started + - validate-02-agents + - validate-02-agents-openai + - validate-02-agents-azure-openai + - validate-02-agents-azure-ai + - validate-02-agents-azure-ai-agent + - validate-02-agents-anthropic + - validate-02-agents-github-copilot + - validate-02-agents-amazon + - validate-02-agents-ollama + - validate-02-agents-foundry-local + - validate-02-agents-copilotstudio + - validate-02-agents-custom + - validate-03-workflows + - validate-04-hosting + - validate-05-end-to-end + - validate-autogen-migration + - validate-semantic-kernel-migration + steps: + - uses: actions/checkout@v6 + + - name: Download all validation reports + uses: actions/download-artifact@v7 + with: + pattern: validation-report-* + path: reports/ + merge-multiple: true + + - name: Restore validation history + id: cache-restore + uses: actions/cache/restore@v4 + with: + path: validation-history/ + key: validation-history-${{ github.run_id }} + restore-keys: | + validation-history- + + - name: Aggregate results and generate trend report + run: | + python3 python/scripts/sample_validation/aggregate.py \ + reports/ \ + validation-history/history.json \ + trend-report.md + + - name: Write trend report to job summary + run: cat trend-report.md >> "$GITHUB_STEP_SUMMARY" + + - name: Save validation history + uses: actions/cache/save@v4 + with: + path: validation-history/ + key: validation-history-${{ github.run_id }} + + - name: Upload trend report + uses: actions/upload-artifact@v7 + if: always() + with: + name: validation-trend-report + path: trend-report.md diff --git 
a/python/samples/02-agents/chat_client/built_in_chat_clients.py b/python/samples/02-agents/chat_client/built_in_chat_clients.py index 8560afcf4f..21d6a0f81a 100644 --- a/python/samples/02-agents/chat_client/built_in_chat_clients.py +++ b/python/samples/02-agents/chat_client/built_in_chat_clients.py @@ -5,7 +5,7 @@ import os from random import randint from typing import Annotated, Any, Literal -from agent_framework import SupportsChatGetResponse, tool +from agent_framework import Message, SupportsChatGetResponse, tool from agent_framework.azure import ( AzureAIAgentClient, AzureOpenAIAssistantsClient, @@ -117,35 +117,37 @@ async def main(client_name: ClientName = "openai_chat") -> None: client = get_client(client_name) # 1. Configure prompt and streaming mode. - message = "What's the weather in Amsterdam and in Paris?" + message = Message("user", text="What's the weather in Amsterdam and in Paris?") stream = os.getenv("STREAM", "false").lower() == "true" print(f"Client: {client_name}") - print(f"User: {message}") + print(f"User: {message.text}") # 2. Run with context-managed clients. if isinstance(client, OpenAIAssistantsClient | AzureOpenAIAssistantsClient | AzureAIAgentClient): async with client: if stream: - response_stream = client.get_response(message, stream=True, options={"tools": get_weather}) + response_stream = client.get_response([message], stream=True, options={"tools": get_weather}) print("Assistant: ", end="") async for chunk in response_stream: if chunk.text: print(chunk.text, end="") print("") else: - print(f"Assistant: {await client.get_response(message, stream=False, options={'tools': get_weather})}") + print( + f"Assistant: {await client.get_response([message], stream=False, options={'tools': get_weather})}" + ) return # 3. Run with non-context-managed clients. 
if stream: - response_stream = client.get_response(message, stream=True, options={"tools": get_weather}) + response_stream = client.get_response([message], stream=True, options={"tools": get_weather}) print("Assistant: ", end="") async for chunk in response_stream: if chunk.text: print(chunk.text, end="") print("") else: - print(f"Assistant: {await client.get_response(message, stream=False, options={'tools': get_weather})}") + print(f"Assistant: {await client.get_response([message], stream=False, options={'tools': get_weather})}") if __name__ == "__main__": diff --git a/python/samples/autogen-migration/orchestrations/01_round_robin_group_chat.py b/python/samples/autogen-migration/orchestrations/01_round_robin_group_chat.py index e5c6bd09f8..8b883a07b9 100644 --- a/python/samples/autogen-migration/orchestrations/01_round_robin_group_chat.py +++ b/python/samples/autogen-migration/orchestrations/01_round_robin_group_chat.py @@ -1,25 +1,17 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/orchestrations/01_round_robin_group_chat.py - # Copyright (c) Microsoft. All rights reserved. -"""AutoGen RoundRobinGroupChat vs Agent Framework GroupChatBuilder/SequentialBuilder. -Demonstrates sequential agent orchestration where agents take turns processing -the task in a round-robin fashion. -""" import asyncio from agent_framework import Message from dotenv import load_dotenv +"""AutoGen RoundRobinGroupChat vs Agent Framework GroupChatBuilder/SequentialBuilder. + +Demonstrates sequential agent orchestration where agents take turns processing +the task in a round-robin fashion. 
+""" + # Load environment variables from .env file load_dotenv() @@ -98,7 +90,7 @@ async def run_agent_framework() -> None: print("[Agent Framework] Sequential conversation:") async for event in workflow.run("Create a brief summary about electric vehicles", stream=True): if event.type == "output" and isinstance(event.data, list): - for message in event.data: + for message in event.data: # type: ignore if isinstance(message, Message) and message.role == "assistant" and message.text: print(f"---------- {message.author_name} ----------") print(message.text) @@ -144,9 +136,7 @@ async def run_agent_framework_with_cycle() -> None: if last_message and "APPROVED" in last_message.text: await context.yield_output("Content approved.") else: - await context.send_message( - AgentExecutorRequest(messages=response.full_conversation, should_respond=True) - ) + await context.send_message(AgentExecutorRequest(messages=response.full_conversation, should_respond=True)) workflow = ( WorkflowBuilder(start_executor=researcher) diff --git a/python/samples/autogen-migration/orchestrations/02_selector_group_chat.py b/python/samples/autogen-migration/orchestrations/02_selector_group_chat.py index 6f16e1dea9..485f3793e5 100644 --- a/python/samples/autogen-migration/orchestrations/02_selector_group_chat.py +++ b/python/samples/autogen-migration/orchestrations/02_selector_group_chat.py @@ -1,25 +1,17 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/orchestrations/02_selector_group_chat.py - # Copyright (c) Microsoft. All rights reserved. -"""AutoGen SelectorGroupChat vs Agent Framework GroupChatBuilder. -Demonstrates LLM-based speaker selection where an orchestrator decides -which agent should speak next based on the conversation context. 
-""" import asyncio from agent_framework import Message from dotenv import load_dotenv +"""AutoGen SelectorGroupChat vs Agent Framework GroupChatBuilder. + +Demonstrates LLM-based speaker selection where an orchestrator decides +which agent should speak next based on the conversation context. +""" + # Load environment variables from .env file load_dotenv() @@ -113,7 +105,7 @@ async def run_agent_framework() -> None: print("[Agent Framework] Group chat conversation:") async for event in workflow.run("How do I connect to a PostgreSQL database using Python?", stream=True): if event.type == "output" and isinstance(event.data, list): - for message in event.data: + for message in event.data: # type: ignore if isinstance(message, Message) and message.role == "assistant" and message.text: print(f"---------- {message.author_name} ----------") print(message.text) diff --git a/python/samples/autogen-migration/orchestrations/03_swarm.py b/python/samples/autogen-migration/orchestrations/03_swarm.py index a178ffcffe..e2a8688b10 100644 --- a/python/samples/autogen-migration/orchestrations/03_swarm.py +++ b/python/samples/autogen-migration/orchestrations/03_swarm.py @@ -1,19 +1,4 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/orchestrations/03_swarm.py - # Copyright (c) Microsoft. All rights reserved. -"""AutoGen Swarm pattern vs Agent Framework HandoffBuilder. - -Demonstrates agent handoff coordination where agents can transfer control -to other specialized agents based on the task requirements. -""" import asyncio from typing import Any @@ -21,6 +6,12 @@ from typing import Any from agent_framework import AgentResponseUpdate, WorkflowEvent from dotenv import load_dotenv +"""AutoGen Swarm pattern vs Agent Framework HandoffBuilder. 
+ +Demonstrates agent handoff coordination where agents can transfer control +to other specialized agents based on the task requirements. +""" + # Load environment variables from .env file load_dotenv() diff --git a/python/samples/autogen-migration/orchestrations/04_magentic_one.py b/python/samples/autogen-migration/orchestrations/04_magentic_one.py index b6728b0e46..58ec95e492 100644 --- a/python/samples/autogen-migration/orchestrations/04_magentic_one.py +++ b/python/samples/autogen-migration/orchestrations/04_magentic_one.py @@ -1,19 +1,4 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/orchestrations/04_magentic_one.py - # Copyright (c) Microsoft. All rights reserved. -"""AutoGen MagenticOneGroupChat vs Agent Framework MagenticBuilder. - -Demonstrates orchestrated multi-agent workflows with a central coordinator -managing specialized agents for complex tasks. -""" import asyncio import json @@ -27,6 +12,12 @@ from agent_framework import ( from agent_framework.orchestrations import MagenticProgressLedger from dotenv import load_dotenv +"""AutoGen MagenticOneGroupChat vs Agent Framework MagenticBuilder. + +Demonstrates orchestrated multi-agent workflows with a central coordinator +managing specialized agents for complex tasks. 
+""" + # Load environment variables from .env file load_dotenv() diff --git a/python/samples/autogen-migration/single_agent/01_basic_assistant_agent.py b/python/samples/autogen-migration/single_agent/01_basic_assistant_agent.py index 73a3caba02..fad39f7719 100644 --- a/python/samples/autogen-migration/single_agent/01_basic_assistant_agent.py +++ b/python/samples/autogen-migration/single_agent/01_basic_assistant_agent.py @@ -1,14 +1,9 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/single_agent/01_basic_assistant_agent.py - # Copyright (c) Microsoft. All rights reserved. + +import asyncio + +from dotenv import load_dotenv + """Basic AutoGen AssistantAgent vs Agent Framework Agent. Both samples expect OpenAI-compatible environment variables (OPENAI_API_KEY or @@ -16,10 +11,6 @@ Azure OpenAI configuration). Update the prompts or client wiring to match your model of choice before running. """ -import asyncio - -from dotenv import load_dotenv - # Load environment variables from .env file load_dotenv() diff --git a/python/samples/autogen-migration/single_agent/02_assistant_agent_with_tool.py b/python/samples/autogen-migration/single_agent/02_assistant_agent_with_tool.py index aca868b9f2..af7ebaf03b 100644 --- a/python/samples/autogen-migration/single_agent/02_assistant_agent_with_tool.py +++ b/python/samples/autogen-migration/single_agent/02_assistant_agent_with_tool.py @@ -1,24 +1,14 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-core", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/single_agent/02_assistant_agent_with_tool.py - # Copyright (c) Microsoft. All rights reserved. -"""AutoGen AssistantAgent vs Agent Framework Agent with function tools. 
- -Demonstrates how to create and attach tools to agents in both frameworks. -""" import asyncio from dotenv import load_dotenv +"""AutoGen AssistantAgent vs Agent Framework Agent with function tools. + +Demonstrates how to create and attach tools to agents in both frameworks. +""" + # Load environment variables from .env file load_dotenv() diff --git a/python/samples/autogen-migration/single_agent/03_assistant_agent_thread_and_stream.py b/python/samples/autogen-migration/single_agent/03_assistant_agent_thread_and_stream.py index c544880cb1..9610f47ad2 100644 --- a/python/samples/autogen-migration/single_agent/03_assistant_agent_thread_and_stream.py +++ b/python/samples/autogen-migration/single_agent/03_assistant_agent_thread_and_stream.py @@ -1,23 +1,14 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/single_agent/03_assistant_agent_thread_and_stream.py - # Copyright (c) Microsoft. All rights reserved. -"""AutoGen vs Agent Framework: Thread management and streaming responses. - -Demonstrates conversation state management and streaming in both frameworks. -""" import asyncio from dotenv import load_dotenv +"""AutoGen vs Agent Framework: Thread management and streaming responses. + +Demonstrates conversation state management and streaming in both frameworks. 
+""" + # Load environment variables from .env file load_dotenv() diff --git a/python/samples/autogen-migration/single_agent/04_agent_as_tool.py b/python/samples/autogen-migration/single_agent/04_agent_as_tool.py index 489ec74c01..74a9fb3463 100644 --- a/python/samples/autogen-migration/single_agent/04_agent_as_tool.py +++ b/python/samples/autogen-migration/single_agent/04_agent_as_tool.py @@ -1,24 +1,15 @@ -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "autogen-agentchat", -# "autogen-ext[openai]", -# ] -# /// -# Run with any PEP 723 compatible runner, e.g.: -# uv run samples/autogen-migration/single_agent/04_agent_as_tool.py - # Copyright (c) Microsoft. All rights reserved. + +import asyncio + +from dotenv import load_dotenv + """AutoGen vs Agent Framework: Agent-as-a-Tool pattern. Demonstrates hierarchical agent architectures where one agent delegates work to specialized sub-agents wrapped as tools. """ -import asyncio - -from dotenv import load_dotenv - # Load environment variables from .env file load_dotenv() @@ -107,6 +98,7 @@ async def run_agent_framework() -> None: if content.type == "function_call": # Accumulate function call content as it streams in call_id = content.call_id + assert call_id is not None, "Function call content must have a call_id" if call_id in accumulated_calls: # Add to existing call (arguments stream in gradually) accumulated_calls[call_id] = accumulated_calls[call_id] + content diff --git a/python/scripts/sample_validation/README.md b/python/scripts/sample_validation/README.md index 064d9752da..d7d9f0a08a 100644 --- a/python/scripts/sample_validation/README.md +++ b/python/scripts/sample_validation/README.md @@ -165,18 +165,17 @@ Produces: ## Report Status Codes -| Status | Label | Description | -| ------- | --------- | ----------------------------------------- | -| SUCCESS | [PASS] | Sample ran to completion with exit code 0 | -| FAILURE | [FAIL] | Sample exited with non-zero code | -| TIMEOUT | [TIMEOUT] | Sample 
exceeded timeout limit | -| ERROR | [ERROR] | Exception during execution | +| Status | Label | Description | +| ------------- | --------------- | ----------------------------------------- | +| SUCCESS | [PASS] | Sample ran to completion with exit code 0 | +| FAILURE | [FAIL] | Sample did not complete successfully (non-zero exit code) | +| MISSING_SETUP | [MISSING_SETUP] | Sample skipped due to missing setup | ## Troubleshooting ### Agent output parsing errors -If an agent returns non-JSON content, that sample is marked as `ERROR` with parser details in the report. +If an agent returns non-JSON content, that sample is marked as `FAILURE` with parser details in the report. ### GitHub Copilot authentication or CLI issues diff --git a/python/scripts/sample_validation/__main__.py b/python/scripts/sample_validation/__main__.py index 5d222b94b9..948fed3a30 100644 --- a/python/scripts/sample_validation/__main__.py +++ b/python/scripts/sample_validation/__main__.py @@ -75,6 +75,13 @@ Examples: help="Custom name for the report files (without extension). 
If not provided, uses timestamp.", ) + parser.add_argument( + "--exclude", + nargs="+", + type=str, + help="Subdirectory paths to exclude (relative to the search directory set by --subdir)", + ) + return parser.parse_args() @@ -104,6 +111,7 @@ async def main() -> int: samples_dir=samples_dir, python_root=python_root, subdir=args.subdir, + exclude=args.exclude, max_parallel_workers=max(1, args.max_parallel_workers), ) @@ -138,7 +146,7 @@ async def main() -> int: print(f" JSON: {json_path}") # Return appropriate exit code - failed = report.failure_count + report.timeout_count + report.error_count + failed = report.failure_count + report.missing_setup_count return 1 if failed > 0 else 0 diff --git a/python/scripts/sample_validation/aggregate.py b/python/scripts/sample_validation/aggregate.py new file mode 100644 index 0000000000..478bfeafdb --- /dev/null +++ b/python/scripts/sample_validation/aggregate.py @@ -0,0 +1,224 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Aggregate validation reports across runs and produce a trend report. + +Reads JSON reports from individual validation jobs, combines them with +cached history from previous runs, and produces a markdown trend report +showing per-sample status over the last 5 runs. + +Usage: + python aggregate.py <reports_dir> <history_path> <output_path> +""" + +import json +import sys +from datetime import datetime +from pathlib import Path +from typing import Any + +MAX_HISTORY = 5 + +STATUS_EMOJI = { + "success": "✅", + "failure": "❌", + "missing_setup": "⚠️", +} + + +def _format_run_label(timestamp: str) -> str: + """Format a run timestamp as a compact column label (e.g. 
'03-24 18:05').""" + try: + dt = datetime.fromisoformat(timestamp) + return dt.strftime("%m-%d %H:%M") + except (ValueError, TypeError): + return timestamp[:16] + + +def load_current_run(reports_dir: Path) -> dict[str, Any]: + """Load all JSON report files from the current run and merge them.""" + combined_results: dict[str, str] = {} + total = success = failure = missing = 0 + + json_files = sorted(reports_dir.glob("*.json")) + if not json_files: + print(f"Warning: No JSON report files found in {reports_dir}") + return { + "timestamp": datetime.now().isoformat(), + "summary": { + "total_samples": 0, + "success_count": 0, + "failure_count": 0, + "missing_setup_count": 0, + }, + "results": {}, + } + + for json_file in json_files: + print(f" Loading report: {json_file.name}") + with open(json_file, encoding="utf-8") as f: + report = json.load(f) + for result in report["results"]: + combined_results[result["path"]] = result["status"] + summary = report["summary"] + total += summary["total_samples"] + success += summary["success_count"] + failure += summary["failure_count"] + missing += summary["missing_setup_count"] + + return { + "timestamp": datetime.now().isoformat(), + "summary": { + "total_samples": total, + "success_count": success, + "failure_count": failure, + "missing_setup_count": missing, + }, + "results": combined_results, + } + + +def load_history(history_path: Path) -> list[dict[str, Any]]: + """Load previous run history from cache.""" + if history_path.exists(): + with open(history_path, encoding="utf-8") as f: + data = json.load(f) + runs = data.get("runs", []) + print(f" Loaded {len(runs)} previous run(s) from history") + return runs + print(" No previous history found") + return [] + + +def save_history(history_path: Path, runs: list[dict[str, Any]]) -> None: + """Save run history, keeping only the last MAX_HISTORY entries.""" + history_path.parent.mkdir(parents=True, exist_ok=True) + trimmed = runs[-MAX_HISTORY:] + with open(history_path, "w", 
encoding="utf-8") as f: + json.dump({"runs": trimmed}, f, indent=2) + print(f" Saved {len(trimmed)} run(s) to history") + + +def generate_trend_report(runs: list[dict[str, Any]]) -> str: + """Generate a markdown trend report from run history.""" + lines = [ + "# Sample Validation Trend Report", + "", + f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M UTC')}*", + "", + ] + + # --- Overall status table (most recent first) --- + lines.append("## Overall Status (Last 5 Runs)") + lines.append("") + lines.append("| Run | Success | Failure | Missing Setup | Total |") + lines.append("|-----|---------|---------|---------------|-------|") + + for run in reversed(runs): + s = run["summary"] + label = _format_run_label(run["timestamp"]) + lines.append( + f"| {label} | {s['success_count']}/{s['total_samples']} " + f"| {s['failure_count']}/{s['total_samples']} " + f"| {s['missing_setup_count']}/{s['total_samples']} " + f"| {s['total_samples']} |" + ) + + # Pad with N/A rows if fewer than 5 runs + for _ in range(MAX_HISTORY - len(runs)): + lines.append("| N/A | N/A | N/A | N/A | N/A |") + + lines.append("") + + # --- Per-sample results table --- + lines.append("## Per-Sample Results") + lines.append("") + + # Collect all sample paths across all runs + all_paths: set[str] = set() + for run in runs: + all_paths.update(run["results"].keys()) + + if not all_paths: + lines.append("*No sample results available.*") + return "\n".join(lines) + + # Build header (most recent run first) + header = "| Sample |" + separator = "|--------|" + for run in reversed(runs): + label = _format_run_label(run["timestamp"]) + header += f" {label} |" + separator += "------------|" + for _ in range(MAX_HISTORY - len(runs)): + header += " N/A |" + separator += "-----|" + + lines.append(header) + lines.append(separator) + + for path in sorted(all_paths): + row = f"| `{path}` |" + for run in reversed(runs): + status = run["results"].get(path, "N/A") + emoji = STATUS_EMOJI.get(status, "N/A") + row += f" 
{emoji} |" + for _ in range(MAX_HISTORY - len(runs)): + row += " N/A |" + lines.append(row) + + lines.append("") + lines.append("**Legend:** ✅ Success · ❌ Failure · ⚠️ Missing Setup · N/A Not available") + lines.append("") + + return "\n".join(lines) + + +def main() -> int: + if len(sys.argv) != 4: + print("Usage: python aggregate.py ") + return 1 + + reports_dir = Path(sys.argv[1]) + history_path = Path(sys.argv[2]) + output_path = Path(sys.argv[3]) + + print("Aggregating validation results...") + + # Load current run's reports + print(f"\nLoading reports from {reports_dir}:") + current_run = load_current_run(reports_dir) + s = current_run["summary"] + print( + f" Current run: {s['success_count']} success, " + f"{s['failure_count']} failure, " + f"{s['missing_setup_count']} missing setup " + f"(total: {s['total_samples']})" + ) + + # Load history and append current run + print(f"\nLoading history from {history_path}:") + runs = load_history(history_path) + runs.append(current_run) + runs = runs[-MAX_HISTORY:] + + # Save updated history + print(f"\nSaving history to {history_path}:") + save_history(history_path, runs) + + # Generate trend report + print("\nGenerating trend report...") + report = generate_trend_report(runs) + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(report, encoding="utf-8") + print(f"Trend report written to {output_path}") + + # Also print the report to stdout + print("\n" + "=" * 80) + print(report) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/scripts/sample_validation/create_dynamic_workflow_executor.py b/python/scripts/sample_validation/create_dynamic_workflow_executor.py index 69c5cc9a5e..4cffd5c71b 100644 --- a/python/scripts/sample_validation/create_dynamic_workflow_executor.py +++ b/python/scripts/sample_validation/create_dynamic_workflow_executor.py @@ -14,7 +14,8 @@ from agent_framework import ( handler, ) from agent_framework.github import GitHubCopilotAgent 
-from copilot.types import PermissionRequest, PermissionRequestResult +from copilot.generated.session_events import PermissionRequest +from copilot.types import PermissionRequestResult from pydantic import BaseModel from typing_extensions import Never @@ -36,6 +37,7 @@ class AgentResponseFormat(BaseModel): status: str output: str error: str + fix: str @dataclass @@ -54,15 +56,20 @@ class BatchCompletion: AgentInstruction = ( "You are validating exactly one Python sample.\n" - "Analyze the sample code and execute it. Based on the execution result, determine if it " - "runs successfully, fails, or times out. Feel free to install any required dependencies.\n" + "Analyze the sample code and execute it as it is. Based on the execution result, determine " + "if it runs successfully, fails, or is missing_setup. Use `missing_setup` if the sample reports " + "missing required environment variables. The environment you're given should contain the necessary " + "variables. Don't create new environment variables nor modify the sample code.\n" + "Feel free to install any required dependencies if needed.\n" "The sample can be interactive. If it is interactive, respond to the sample when prompted " "based on your analysis of the code. 
You do not need to consult human on what to respond.\n" + "If the sample fails, investigate the error and suggest a fix.\n" "Return ONLY valid JSON with this schema:\n" "{\n" - ' "status": "success|failure|timeout|error",\n' + ' "status": "success|failure|missing_setup",\n' ' "output": "short summary of the result and what you did if the sample was interactive",\n' - ' "error": "error details or empty string"\n' + ' "error": "error details or empty string",\n' + ' "fix": "suggested code fix if the sample failed, otherwise empty string"\n' "}\n\n" ) @@ -87,16 +94,15 @@ def status_from_text(value: str) -> RunStatus: for status in RunStatus: if status.value == normalized: return status - return RunStatus.ERROR + return RunStatus.FAILURE def prompt_permission( request: PermissionRequest, context: dict[str, str] ) -> PermissionRequestResult: """Permission handler that always approves.""" - kind = request.get("kind", "unknown") logger.debug( - f"[Permission Request: {kind}] ({context})Automatically approved for sample validation." + f"[Permission Request: {request.kind}] ({context})Automatically approved for sample validation." ) return PermissionRequestResult(kind="approved") @@ -108,39 +114,73 @@ class CustomAgentExecutor(Executor): returned as error responses, otherwise an exception in one agent could crash the entire workflow. """ + # Retry in case GitHub Copilot agent encounters transient errors unrelated to the sample execution. 
+ RETRY_COUNT = 1 + def __init__(self, agent: GitHubCopilotAgent): super().__init__(id=agent.id) self.agent = agent + self._session = agent.create_session() @handler async def handle_task( self, sample: SampleInfo, ctx: WorkflowContext[WorkerFreed | RunResult] ) -> None: """Execute one sample task and notify collector + coordinator.""" - try: - response = await self.agent.run( - [ - Message( - role="user", - text=f"Validate the following sample:\n\n{sample.relative_path}", + current_retry = 0 + while True: + try: + response = await self.agent.run( + [ + Message( + role="user", + text=f"Validate the following sample:\n\n{sample.relative_path}", + ) + ], + session=self._session, + ) + result_payload = parse_agent_json(response.text) + result = RunResult( + sample=sample, + status=status_from_text(result_payload.status), + output=result_payload.output, + error=result_payload.error, + fix=result_payload.fix, + ) + break + except Exception as ex: + if current_retry < self.RETRY_COUNT: + logger.warning( + f"Error executing agent {self.agent.id} (attempt {current_retry + 1}/{self.RETRY_COUNT}): {ex}. Retrying..." ) - ] - ) - result_payload = parse_agent_json(response.text) - result = RunResult( - sample=sample, - status=status_from_text(result_payload.status), - output=result_payload.output, - error=result_payload.error, - ) - except Exception as ex: - logger.error(f"Error executing agent {self.agent.id}: {ex}") - result = RunResult( - sample=sample, - status=RunStatus.ERROR, - output="", - error=str(ex), - ) + try: + current_retry += 1 + await self.agent.stop() + await self.agent.start() + self._session = self.agent.create_session() # Reset session for retry + continue + except Exception as restart_ex: + logger.error( + f"Error restarting agent {self.agent.id}: {restart_ex}. No more retries." + ) + result = RunResult( + sample=sample, + status=RunStatus.FAILURE, + output="", + error=f"Original error: {ex}. 
Restart error: {restart_ex}", + fix="", + ) + break + + logger.error(f"Error executing agent {self.agent.id}: {ex}") + result = RunResult( + sample=sample, + status=RunStatus.FAILURE, + output="", + error=str(ex), + fix="", + ) + break await ctx.send_message(result, target_id="collector") await ctx.send_message(WorkerFreed(worker_id=self.id), target_id="coordinator") @@ -252,7 +292,7 @@ class CreateConcurrentValidationWorkflowExecutor(Executor): instructions=AgentInstruction, default_options={ "on_permission_request": prompt_permission, - "timeout": 180, + "timeout": 60, }, # type: ignore ) agents.append(agent) diff --git a/python/scripts/sample_validation/discovery.py b/python/scripts/sample_validation/discovery.py index 78eb1c9bfa..c5424dd6ee 100644 --- a/python/scripts/sample_validation/discovery.py +++ b/python/scripts/sample_validation/discovery.py @@ -52,13 +52,18 @@ def _has_main_entrypoint_guard(path: Path) -> bool: ) -def discover_samples(samples_dir: Path, subdir: str | None = None) -> list[SampleInfo]: +def discover_samples( + samples_dir: Path, + subdir: str | None = None, + exclude: list[str] | None = None, +) -> list[SampleInfo]: """ Find all Python sample files in the samples directory. 
Args: samples_dir: Root samples directory subdir: Optional subdirectory to filter to + exclude: Optional list of subdirectory paths (relative to the search directory) to exclude Returns: List of SampleInfo objects for each discovered sample @@ -72,12 +77,21 @@ def discover_samples(samples_dir: Path, subdir: str | None = None) -> list[Sampl else: search_dir = samples_dir + # Resolve excluded paths to absolute for reliable comparison + exclude_paths = {(search_dir / exc).resolve() for exc in (exclude or [])} + python_files: list[Path] = [] # Walk through all subdirectories and find .py files for root, dirs, files in os.walk(search_dir): - # Skip directories that start with _ (like _sample_validation) - dirs[:] = [d for d in dirs if not d.startswith("_") and d != "__pycache__"] + # Skip directories that start with _, __pycache__, or excluded paths + dirs[:] = [ + d + for d in dirs + if not d.startswith("_") + and d != "__pycache__" + and (Path(root) / d).resolve() not in exclude_paths + ] for file in files: # Skip files that start with _ and include only scripts with a main entrypoint guard @@ -113,8 +127,10 @@ class DiscoverSamplesExecutor(Executor): print(f"🔍 Discovering samples in {self.config.samples_dir}") if self.config.subdir: print(f" Filtering to subdirectory: {self.config.subdir}") + if self.config.exclude: + print(f" Excluding: {', '.join(self.config.exclude)}") - samples = discover_samples(self.config.samples_dir, self.config.subdir) + samples = discover_samples(self.config.samples_dir, self.config.subdir, self.config.exclude) print(f" Found {len(samples)} samples") await ctx.send_message(DiscoveryResult(samples=samples)) diff --git a/python/scripts/sample_validation/models.py b/python/scripts/sample_validation/models.py index ca9f26adab..ff45b5909b 100644 --- a/python/scripts/sample_validation/models.py +++ b/python/scripts/sample_validation/models.py @@ -18,6 +18,7 @@ class ValidationConfig: samples_dir: Path python_root: Path subdir: str | None = None + 
exclude: list[str] | None = None max_parallel_workers: int = 10 @@ -60,8 +61,7 @@ class RunStatus(Enum): SUCCESS = "success" FAILURE = "failure" - TIMEOUT = "timeout" - ERROR = "error" + MISSING_SETUP = "missing_setup" @dataclass @@ -72,6 +72,7 @@ class RunResult: status: RunStatus output: str error: str + fix: str @dataclass @@ -89,8 +90,7 @@ class Report: total_samples: int success_count: int failure_count: int - timeout_count: int - error_count: int + missing_setup_count: int results: list[RunResult] = field(default_factory=list) # type: ignore def to_markdown(self) -> str: @@ -107,15 +107,14 @@ class Report: f"| Total Samples | {self.total_samples} |", f"| [PASS] Success | {self.success_count} |", f"| [FAIL] Failure | {self.failure_count} |", - f"| [TIMEOUT] Timeout | {self.timeout_count} |", - f"| [ERROR] Error | {self.error_count} |", + f"| [MISSING_SETUP] Missing Setup | {self.missing_setup_count} |", "", "## Detailed Results", "", ] # Group by status - for status in [RunStatus.FAILURE, RunStatus.TIMEOUT, RunStatus.ERROR, RunStatus.SUCCESS]: + for status in [RunStatus.FAILURE, RunStatus.MISSING_SETUP, RunStatus.SUCCESS]: status_results = [r for r in self.results if r.status == status] if not status_results: continue @@ -123,8 +122,7 @@ class Report: status_label = { RunStatus.SUCCESS: "[PASS]", RunStatus.FAILURE: "[FAIL]", - RunStatus.TIMEOUT: "[TIMEOUT]", - RunStatus.ERROR: "[ERROR]", + RunStatus.MISSING_SETUP: "[MISSING_SETUP]", } lines.append(f"### {status_label[status]} {status.value.title()} ({len(status_results)})") @@ -148,8 +146,7 @@ class Report: "total_samples": self.total_samples, "success_count": self.success_count, "failure_count": self.failure_count, - "timeout_count": self.timeout_count, - "error_count": self.error_count, + "missing_setup_count": self.missing_setup_count, }, "results": [ { @@ -157,6 +154,7 @@ class Report: "status": r.status.value, "output": r.output, "error": r.error, + "fix": r.fix, } for r in self.results ], diff --git 
a/python/scripts/sample_validation/report.py b/python/scripts/sample_validation/report.py index db8eddeed1..10c4ff0406 100644 --- a/python/scripts/sample_validation/report.py +++ b/python/scripts/sample_validation/report.py @@ -22,12 +22,11 @@ def generate_report(results: list[RunResult]) -> Report: Returns: Report object with aggregated statistics """ - # Sort results: failures, timeouts, errors first, then successes + # Sort results: failures, missing setup first, then successes status_priority = { RunStatus.FAILURE: 0, - RunStatus.TIMEOUT: 1, - RunStatus.ERROR: 2, - RunStatus.SUCCESS: 3, + RunStatus.MISSING_SETUP: 1, + RunStatus.SUCCESS: 2, } sorted_results = sorted(results, key=lambda r: status_priority[r.status]) @@ -36,8 +35,7 @@ def generate_report(results: list[RunResult]) -> Report: total_samples=len(results), success_count=sum(1 for r in results if r.status == RunStatus.SUCCESS), failure_count=sum(1 for r in results if r.status == RunStatus.FAILURE), - timeout_count=sum(1 for r in results if r.status == RunStatus.TIMEOUT), - error_count=sum(1 for r in results if r.status == RunStatus.ERROR), + missing_setup_count=sum(1 for r in results if r.status == RunStatus.MISSING_SETUP), results=sorted_results, ) @@ -86,8 +84,7 @@ def print_summary(report: Report) -> None: if ( report.failure_count == 0 - and report.timeout_count == 0 - and report.error_count == 0 + and report.missing_setup_count == 0 ): print("[PASS] ALL SAMPLES PASSED!") else: @@ -98,8 +95,7 @@ def print_summary(report: Report) -> None: print("Results:") print(f" [PASS] Success: {report.success_count}") print(f" [FAIL] Failure: {report.failure_count}") - print(f" [TIMEOUT] Timeout: {report.timeout_count}") - print(f" [ERR] Errors: {report.error_count}") + print(f" [MISSING_SETUP] Missing Setup: {report.missing_setup_count}") print("=" * 80) # Print JSON output for GitHub Actions visibility diff --git a/python/scripts/sample_validation/run_dynamic_validation_workflow_executor.py 
b/python/scripts/sample_validation/run_dynamic_validation_workflow_executor.py index 6f28dc9244..c7244cff2a 100644 --- a/python/scripts/sample_validation/run_dynamic_validation_workflow_executor.py +++ b/python/scripts/sample_validation/run_dynamic_validation_workflow_executor.py @@ -66,9 +66,10 @@ class RunDynamicValidationWorkflowExecutor(Executor): fallback_results = [ RunResult( sample=sample, - status=RunStatus.ERROR, + status=RunStatus.FAILURE, output="", error="Nested workflow did not return an ExecutionResult.", + fix="", ) for sample in creation.samples ]