Python: Update sample validation scripts (#4870)

* Update sample validation scripts

* Adjust prompt

* Update autogen-migration samples

* Add fix suggestion

* Split jobs

* Add .env

* Create trend report

* Add timestamp

* Add more env vars

* Comments

* force node24

* force node24

* force node22
This commit is contained in:
Tao Chen
2026-03-24 18:21:32 -07:00
committed by GitHub
Unverified
parent 2c000b032d
commit 4b533608b6
19 changed files with 928 additions and 202 deletions
@@ -24,7 +24,9 @@ runs:
using: "composite"
steps:
- name: Set up Node.js environment
uses: actions/setup-node@v4
uses: actions/setup-node@v6
with:
node-version: 22
- name: Install Copilot CLI
shell: bash
+520 -8
View File
@@ -41,6 +41,13 @@ jobs:
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
os: ${{ runner.os }}
- name: Create .env for samples
run: |
echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env
echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env
- name: Run sample validation
run: |
cd scripts && uv run python -m sample_validation --subdir 01-get-started --save-report --report-name 01-get-started
@@ -50,7 +57,7 @@ jobs:
if: always()
with:
name: validation-report-01-get-started
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
validate-02-agents:
name: Validate 02-agents
@@ -64,10 +71,13 @@ jobs:
AZURE_OPENAI_ENDPOINT: ${{ vars.AZUREOPENAI__ENDPOINT }}
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__CHATDEPLOYMENTNAME }}
AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }}
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__EMBEDDINGDEPLOYMENTNAME }}
# OpenAI configuration
OPENAI_API_KEY: ${{ secrets.OPENAI__APIKEY }}
OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI__CHATMODELID }}
OPENAI_RESPONSES_MODEL_ID: ${{ vars.OPENAI__RESPONSESMODELID }}
# GitHub MCP
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
# Observability
ENABLE_INSTRUMENTATION: "true"
defaults:
@@ -84,16 +94,420 @@ jobs:
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
os: ${{ runner.os }}
- name: Create .env for samples
run: |
echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env
echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME=$AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME" >> .env
echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env
echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env
echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env
echo "GITHUB_PAT=$GITHUB_PAT" >> .env
- name: Run sample validation
run: |
cd scripts && uv run python -m sample_validation --subdir 02-agents --save-report --report-name 02-agents
cd scripts && uv run python -m sample_validation --subdir 02-agents --exclude providers --save-report --report-name 02-agents
- name: Upload validation report
uses: actions/upload-artifact@v7
if: always()
with:
name: validation-report-02-agents
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
validate-02-agents-openai:
  # Validates the samples under 02-agents/providers/openai and publishes the
  # generated report as a build artifact.
  name: Validate 02-agents/providers/openai
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    OPENAI_API_KEY: ${{ secrets.OPENAI__APIKEY }}
    OPENAI_CHAT_MODEL_ID: ${{ vars.OPENAI__CHATMODELID }}
    OPENAI_RESPONSES_MODEL_ID: ${{ vars.OPENAI__RESPONSESMODELID }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    # Append the job-level OpenAI settings to python/.env.
    - name: Create .env for samples
      run: |
        echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env
        echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env
        echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/openai --save-report --report-name 02-agents-openai

    # Publish the report even when the validation step failed.
    # The artifact name matches the `validation-report-*` pattern that
    # aggregate-results downloads.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-openai
        path: python/samples/sample_validation/reports/
validate-02-agents-azure-openai:
  # Validates the samples under 02-agents/providers/azure_openai and publishes
  # the generated report as a build artifact.
  name: Validate 02-agents/providers/azure_openai
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }}
    AZURE_OPENAI_ENDPOINT: ${{ vars.AZUREOPENAI__ENDPOINT }}
    AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__CHATDEPLOYMENTNAME }}
    AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    # Append the job-level Azure OpenAI settings to python/.env.
    - name: Create .env for samples
      run: |
        echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
        echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env
        echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env
        echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/azure_openai --save-report --report-name 02-agents-azure-openai

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-azure-openai
        path: python/samples/sample_validation/reports/
validate-02-agents-azure-ai:
  # Validates the samples under 02-agents/providers/azure_ai and publishes the
  # generated report as a build artifact.
  name: Validate 02-agents/providers/azure_ai
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }}
    # The Azure AI deployment names are filled from the AZUREOPENAI__*
    # repository variables.
    AZURE_AI_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }}
    AZURE_AI_CHAT_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__CHATDEPLOYMENTNAME }}
    AZURE_AI_EMBEDDING_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__EMBEDDINGDEPLOYMENTNAME }}
    BING_CONNECTION_ID: ${{ secrets.BING_CONNECTION_ID }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    # Append the job-level Azure AI settings to python/.env.
    - name: Create .env for samples
      run: |
        echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
        echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env
        echo "AZURE_AI_CHAT_MODEL_DEPLOYMENT_NAME=$AZURE_AI_CHAT_MODEL_DEPLOYMENT_NAME" >> .env
        echo "AZURE_AI_EMBEDDING_MODEL_DEPLOYMENT_NAME=$AZURE_AI_EMBEDDING_MODEL_DEPLOYMENT_NAME" >> .env
        echo "BING_CONNECTION_ID=$BING_CONNECTION_ID" >> .env

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/azure_ai --save-report --report-name 02-agents-azure-ai

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-azure-ai
        path: python/samples/sample_validation/reports/
validate-02-agents-azure-ai-agent:
  # Validates the samples under 02-agents/providers/azure_ai_agent and
  # publishes the generated report as a build artifact.
  name: Validate 02-agents/providers/azure_ai_agent
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    AZURE_AI_PROJECT_ENDPOINT: ${{ vars.AZURE_AI_PROJECT_ENDPOINT }}
    # Filled from the Azure OpenAI "responses" deployment variable.
    AZURE_AI_MODEL_DEPLOYMENT_NAME: ${{ vars.AZUREOPENAI__RESPONSESDEPLOYMENTNAME }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    # Append the job-level Azure AI settings to python/.env.
    - name: Create .env for samples
      run: |
        echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
        echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/azure_ai_agent --save-report --report-name 02-agents-azure-ai-agent

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-azure-ai-agent
        path: python/samples/sample_validation/reports/
validate-02-agents-anthropic:
  # Validates the samples under 02-agents/providers/anthropic and publishes the
  # generated report as a build artifact.
  name: Validate 02-agents/providers/anthropic
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    ANTHROPIC_CHAT_MODEL_ID: ${{ vars.ANTHROPIC_CHAT_MODEL_ID }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    # Append the job-level Anthropic settings to python/.env.
    - name: Create .env for samples
      run: |
        echo "ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY" >> .env
        echo "ANTHROPIC_CHAT_MODEL_ID=$ANTHROPIC_CHAT_MODEL_ID" >> .env

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/anthropic --save-report --report-name 02-agents-anthropic

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-anthropic
        path: python/samples/sample_validation/reports/
validate-02-agents-github-copilot:
  # Validates the samples under 02-agents/providers/github_copilot and
  # publishes the generated report as a build artifact.
  # This job defines no job-level env and writes no .env file.
  # NOTE(review): confirm the setup action supplies everything the Copilot
  # samples need to authenticate.
  name: Validate 02-agents/providers/github_copilot
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/github_copilot --save-report --report-name 02-agents-github-copilot

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-github-copilot
        path: python/samples/sample_validation/reports/
validate-02-agents-amazon:
  # Validates the samples under 02-agents/providers/amazon and publishes the
  # generated report as a build artifact. Currently switched off.
  # NOTE(review): unlike the enabled provider jobs, this job never writes its
  # env value to .env — confirm whether that step is needed before re-enabling.
  name: Validate 02-agents/providers/amazon
  if: false # Temporarily disabled - requires AWS credentials
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    BEDROCK_CHAT_MODEL_ID: ${{ vars.BEDROCK__CHATMODELID }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/amazon --save-report --report-name 02-agents-amazon

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-amazon
        path: python/samples/sample_validation/reports/
validate-02-agents-ollama:
  # Validates the samples under 02-agents/providers/ollama and publishes the
  # generated report as a build artifact. Currently switched off.
  # NOTE(review): unlike the enabled provider jobs, this job never writes its
  # env value to .env — confirm whether that step is needed before re-enabling.
  name: Validate 02-agents/providers/ollama
  if: false # Temporarily disabled - requires local Ollama server
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    OLLAMA_MODEL: ${{ vars.OLLAMA__MODEL }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/ollama --save-report --report-name 02-agents-ollama

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-ollama
        path: python/samples/sample_validation/reports/
validate-02-agents-foundry-local:
  # Validates the samples under 02-agents/providers/foundry_local and publishes
  # the generated report as a build artifact. Currently switched off.
  name: Validate 02-agents/providers/foundry_local
  if: false # Temporarily disabled - requires local Foundry setup
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/foundry_local --save-report --report-name 02-agents-foundry-local

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-foundry-local
        path: python/samples/sample_validation/reports/
validate-02-agents-copilotstudio:
  # Validates the samples under 02-agents/providers/copilotstudio and publishes
  # the generated report as a build artifact. Currently switched off.
  name: Validate 02-agents/providers/copilotstudio
  if: false # Temporarily disabled - requires Copilot Studio setup
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  env:
    COPILOTSTUDIOAGENT__ENVIRONMENTID: ${{ secrets.COPILOTSTUDIOAGENT__ENVIRONMENTID }}
    COPILOTSTUDIOAGENT__SCHEMANAME: ${{ secrets.COPILOTSTUDIOAGENT__SCHEMANAME }}
    COPILOTSTUDIOAGENT__TENANTID: ${{ secrets.COPILOTSTUDIOAGENT__TENANTID }}
    COPILOTSTUDIOAGENT__AGENTAPPID: ${{ secrets.COPILOTSTUDIOAGENT__AGENTAPPID }}
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    # Append the job-level Copilot Studio settings to python/.env.
    - name: Create .env for samples
      run: |
        echo "COPILOTSTUDIOAGENT__ENVIRONMENTID=$COPILOTSTUDIOAGENT__ENVIRONMENTID" >> .env
        echo "COPILOTSTUDIOAGENT__SCHEMANAME=$COPILOTSTUDIOAGENT__SCHEMANAME" >> .env
        echo "COPILOTSTUDIOAGENT__TENANTID=$COPILOTSTUDIOAGENT__TENANTID" >> .env
        echo "COPILOTSTUDIOAGENT__AGENTAPPID=$COPILOTSTUDIOAGENT__AGENTAPPID" >> .env

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/copilotstudio --save-report --report-name 02-agents-copilotstudio

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-copilotstudio
        path: python/samples/sample_validation/reports/
validate-02-agents-custom:
  # Validates the samples under 02-agents/providers/custom and publishes the
  # generated report as a build artifact.
  # This job defines no job-level env and writes no .env file.
  name: Validate 02-agents/providers/custom
  runs-on: ubuntu-latest
  environment: integration
  defaults:
    run:
      # Every `run` step in this job starts in python/.
      working-directory: python
  steps:
    - uses: actions/checkout@v6

    - name: Setup environment
      uses: ./.github/actions/sample-validation-setup
      with:
        azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
        azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
        azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
        os: ${{ runner.os }}

    - name: Run sample validation
      run: |
        cd scripts && uv run python -m sample_validation --subdir 02-agents/providers/custom --save-report --report-name 02-agents-custom

    # Publish the report even when the validation step failed.
    - name: Upload validation report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-report-02-agents-custom
        path: python/samples/sample_validation/reports/
validate-03-workflows:
name: Validate 03-workflows
@@ -121,6 +535,14 @@ jobs:
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
os: ${{ runner.os }}
- name: Create .env for samples
run: |
echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env
echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env
- name: Run sample validation
run: |
cd scripts && uv run python -m sample_validation --subdir 03-workflows --save-report --report-name 03-workflows
@@ -130,7 +552,7 @@ jobs:
if: always()
with:
name: validation-report-03-workflows
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
validate-04-hosting:
name: Validate 04-hosting
@@ -169,7 +591,7 @@ jobs:
if: always()
with:
name: validation-report-04-hosting
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
validate-05-end-to-end:
name: Validate 05-end-to-end
@@ -213,7 +635,7 @@ jobs:
if: always()
with:
name: validation-report-05-end-to-end
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
validate-autogen-migration:
name: Validate autogen-migration
@@ -244,6 +666,16 @@ jobs:
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
os: ${{ runner.os }}
- name: Create .env for samples
run: |
echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env
echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env
echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env
echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env
echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env
- name: Run sample validation
run: |
cd scripts && uv run python -m sample_validation --subdir autogen-migration --save-report --report-name autogen-migration
@@ -253,7 +685,7 @@ jobs:
if: always()
with:
name: validation-report-autogen-migration
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
validate-semantic-kernel-migration:
name: Validate semantic-kernel-migration
@@ -290,6 +722,21 @@ jobs:
azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
os: ${{ runner.os }}
- name: Create .env for samples
run: |
echo "AZURE_AI_PROJECT_ENDPOINT=$AZURE_AI_PROJECT_ENDPOINT" >> .env
echo "AZURE_AI_MODEL_DEPLOYMENT_NAME=$AZURE_AI_MODEL_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_ENDPOINT=$AZURE_OPENAI_ENDPOINT" >> .env
echo "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME=$AZURE_OPENAI_CHAT_DEPLOYMENT_NAME" >> .env
echo "AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME=$AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME" >> .env
echo "OPENAI_API_KEY=$OPENAI_API_KEY" >> .env
echo "OPENAI_CHAT_MODEL_ID=$OPENAI_CHAT_MODEL_ID" >> .env
echo "OPENAI_RESPONSES_MODEL_ID=$OPENAI_RESPONSES_MODEL_ID" >> .env
echo "COPILOTSTUDIOAGENT__ENVIRONMENTID=$COPILOTSTUDIOAGENT__ENVIRONMENTID" >> .env
echo "COPILOTSTUDIOAGENT__SCHEMANAME=$COPILOTSTUDIOAGENT__SCHEMANAME" >> .env
echo "COPILOTSTUDIOAGENT__TENANTID=$COPILOTSTUDIOAGENT__TENANTID" >> .env
echo "COPILOTSTUDIOAGENT__AGENTAPPID=$COPILOTSTUDIOAGENT__AGENTAPPID" >> .env
- name: Run sample validation
run: |
cd scripts && uv run python -m sample_validation --subdir semantic-kernel-migration --save-report --report-name semantic-kernel-migration
@@ -299,4 +746,69 @@ jobs:
if: always()
with:
name: validation-report-semantic-kernel-migration
path: python/scripts/sample_validation/reports/
path: python/samples/sample_validation/reports/
aggregate-results:
  # Gathers the report artifacts uploaded by the validation jobs, combines them
  # with the history kept in the Actions cache, and publishes a trend report
  # (job summary + artifact).
  name: Aggregate Results
  runs-on: ubuntu-latest
  # Run regardless of the outcome of the jobs listed in `needs`, so failed or
  # skipped validation jobs do not suppress the report.
  if: always()
  needs:
    - validate-01-get-started
    - validate-02-agents
    - validate-02-agents-openai
    - validate-02-agents-azure-openai
    - validate-02-agents-azure-ai
    - validate-02-agents-azure-ai-agent
    - validate-02-agents-anthropic
    - validate-02-agents-github-copilot
    - validate-02-agents-amazon
    - validate-02-agents-ollama
    - validate-02-agents-foundry-local
    - validate-02-agents-copilotstudio
    - validate-02-agents-custom
    - validate-03-workflows
    - validate-04-hosting
    - validate-05-end-to-end
    - validate-autogen-migration
    - validate-semantic-kernel-migration
  steps:
    - uses: actions/checkout@v6

    # Pull every `validation-report-*` artifact into a single reports/ folder.
    - name: Download all validation reports
      uses: actions/download-artifact@v7
      with:
        pattern: validation-report-*
        path: reports/
        merge-multiple: true

    # The exact key is unique to this run attempt, so the restore is served by
    # the `validation-history-` prefix in `restore-keys`.
    - name: Restore validation history
      id: cache-restore
      uses: actions/cache/restore@v4
      with:
        path: validation-history/
        key: validation-history-${{ github.run_id }}-${{ github.run_attempt }}
        restore-keys: |
          validation-history-

    # Positional arguments: reports directory, history file, output report.
    - name: Aggregate results and generate trend report
      run: |
        python3 python/scripts/sample_validation/aggregate.py \
          reports/ \
          validation-history/history.json \
          trend-report.md

    - name: Write trend report to job summary
      run: cat trend-report.md >> "$GITHUB_STEP_SUMMARY"

    # Cache entries cannot be overwritten once written. Including the run
    # attempt in the key lets a re-run of this workflow save its updated
    # history instead of colliding with the entry from the first attempt.
    - name: Save validation history
      uses: actions/cache/save@v4
      with:
        path: validation-history/
        key: validation-history-${{ github.run_id }}-${{ github.run_attempt }}

    - name: Upload trend report
      if: always()
      uses: actions/upload-artifact@v7
      with:
        name: validation-trend-report
        path: trend-report.md
@@ -5,7 +5,7 @@ import os
from random import randint
from typing import Annotated, Any, Literal
from agent_framework import SupportsChatGetResponse, tool
from agent_framework import Message, SupportsChatGetResponse, tool
from agent_framework.azure import (
AzureAIAgentClient,
AzureOpenAIAssistantsClient,
@@ -117,35 +117,37 @@ async def main(client_name: ClientName = "openai_chat") -> None:
client = get_client(client_name)
# 1. Configure prompt and streaming mode.
message = "What's the weather in Amsterdam and in Paris?"
message = Message("user", text="What's the weather in Amsterdam and in Paris?")
stream = os.getenv("STREAM", "false").lower() == "true"
print(f"Client: {client_name}")
print(f"User: {message}")
print(f"User: {message.text}")
# 2. Run with context-managed clients.
if isinstance(client, OpenAIAssistantsClient | AzureOpenAIAssistantsClient | AzureAIAgentClient):
async with client:
if stream:
response_stream = client.get_response(message, stream=True, options={"tools": get_weather})
response_stream = client.get_response([message], stream=True, options={"tools": get_weather})
print("Assistant: ", end="")
async for chunk in response_stream:
if chunk.text:
print(chunk.text, end="")
print("")
else:
print(f"Assistant: {await client.get_response(message, stream=False, options={'tools': get_weather})}")
print(
f"Assistant: {await client.get_response([message], stream=False, options={'tools': get_weather})}"
)
return
# 3. Run with non-context-managed clients.
if stream:
response_stream = client.get_response(message, stream=True, options={"tools": get_weather})
response_stream = client.get_response([message], stream=True, options={"tools": get_weather})
print("Assistant: ", end="")
async for chunk in response_stream:
if chunk.text:
print(chunk.text, end="")
print("")
else:
print(f"Assistant: {await client.get_response(message, stream=False, options={'tools': get_weather})}")
print(f"Assistant: {await client.get_response([message], stream=False, options={'tools': get_weather})}")
if __name__ == "__main__":
@@ -1,25 +1,17 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/orchestrations/01_round_robin_group_chat.py
# Copyright (c) Microsoft. All rights reserved.
"""AutoGen RoundRobinGroupChat vs Agent Framework GroupChatBuilder/SequentialBuilder.
Demonstrates sequential agent orchestration where agents take turns processing
the task in a round-robin fashion.
"""
import asyncio
from agent_framework import Message
from dotenv import load_dotenv
"""AutoGen RoundRobinGroupChat vs Agent Framework GroupChatBuilder/SequentialBuilder.
Demonstrates sequential agent orchestration where agents take turns processing
the task in a round-robin fashion.
"""
# Load environment variables from .env file
load_dotenv()
@@ -98,7 +90,7 @@ async def run_agent_framework() -> None:
print("[Agent Framework] Sequential conversation:")
async for event in workflow.run("Create a brief summary about electric vehicles", stream=True):
if event.type == "output" and isinstance(event.data, list):
for message in event.data:
for message in event.data: # type: ignore
if isinstance(message, Message) and message.role == "assistant" and message.text:
print(f"---------- {message.author_name} ----------")
print(message.text)
@@ -144,9 +136,7 @@ async def run_agent_framework_with_cycle() -> None:
if last_message and "APPROVED" in last_message.text:
await context.yield_output("Content approved.")
else:
await context.send_message(
AgentExecutorRequest(messages=response.full_conversation, should_respond=True)
)
await context.send_message(AgentExecutorRequest(messages=response.full_conversation, should_respond=True))
workflow = (
WorkflowBuilder(start_executor=researcher)
@@ -1,25 +1,17 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/orchestrations/02_selector_group_chat.py
# Copyright (c) Microsoft. All rights reserved.
"""AutoGen SelectorGroupChat vs Agent Framework GroupChatBuilder.
Demonstrates LLM-based speaker selection where an orchestrator decides
which agent should speak next based on the conversation context.
"""
import asyncio
from agent_framework import Message
from dotenv import load_dotenv
"""AutoGen SelectorGroupChat vs Agent Framework GroupChatBuilder.
Demonstrates LLM-based speaker selection where an orchestrator decides
which agent should speak next based on the conversation context.
"""
# Load environment variables from .env file
load_dotenv()
@@ -113,7 +105,7 @@ async def run_agent_framework() -> None:
print("[Agent Framework] Group chat conversation:")
async for event in workflow.run("How do I connect to a PostgreSQL database using Python?", stream=True):
if event.type == "output" and isinstance(event.data, list):
for message in event.data:
for message in event.data: # type: ignore
if isinstance(message, Message) and message.role == "assistant" and message.text:
print(f"---------- {message.author_name} ----------")
print(message.text)
@@ -1,19 +1,4 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/orchestrations/03_swarm.py
# Copyright (c) Microsoft. All rights reserved.
"""AutoGen Swarm pattern vs Agent Framework HandoffBuilder.
Demonstrates agent handoff coordination where agents can transfer control
to other specialized agents based on the task requirements.
"""
import asyncio
from typing import Any
@@ -21,6 +6,12 @@ from typing import Any
from agent_framework import AgentResponseUpdate, WorkflowEvent
from dotenv import load_dotenv
"""AutoGen Swarm pattern vs Agent Framework HandoffBuilder.
Demonstrates agent handoff coordination where agents can transfer control
to other specialized agents based on the task requirements.
"""
# Load environment variables from .env file
load_dotenv()
@@ -1,19 +1,4 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/orchestrations/04_magentic_one.py
# Copyright (c) Microsoft. All rights reserved.
"""AutoGen MagenticOneGroupChat vs Agent Framework MagenticBuilder.
Demonstrates orchestrated multi-agent workflows with a central coordinator
managing specialized agents for complex tasks.
"""
import asyncio
import json
@@ -27,6 +12,12 @@ from agent_framework import (
from agent_framework.orchestrations import MagenticProgressLedger
from dotenv import load_dotenv
"""AutoGen MagenticOneGroupChat vs Agent Framework MagenticBuilder.
Demonstrates orchestrated multi-agent workflows with a central coordinator
managing specialized agents for complex tasks.
"""
# Load environment variables from .env file
load_dotenv()
@@ -1,14 +1,9 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/single_agent/01_basic_assistant_agent.py
# Copyright (c) Microsoft. All rights reserved.
import asyncio
from dotenv import load_dotenv
"""Basic AutoGen AssistantAgent vs Agent Framework Agent.
Both samples expect OpenAI-compatible environment variables (OPENAI_API_KEY or
@@ -16,10 +11,6 @@ Azure OpenAI configuration). Update the prompts or client wiring to match your
model of choice before running.
"""
import asyncio
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
@@ -1,24 +1,14 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-core",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/single_agent/02_assistant_agent_with_tool.py
# Copyright (c) Microsoft. All rights reserved.
"""AutoGen AssistantAgent vs Agent Framework Agent with function tools.
Demonstrates how to create and attach tools to agents in both frameworks.
"""
import asyncio
from dotenv import load_dotenv
"""AutoGen AssistantAgent vs Agent Framework Agent with function tools.
Demonstrates how to create and attach tools to agents in both frameworks.
"""
# Load environment variables from .env file
load_dotenv()
@@ -1,23 +1,14 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/single_agent/03_assistant_agent_thread_and_stream.py
# Copyright (c) Microsoft. All rights reserved.
"""AutoGen vs Agent Framework: Thread management and streaming responses.
Demonstrates conversation state management and streaming in both frameworks.
"""
import asyncio
from dotenv import load_dotenv
"""AutoGen vs Agent Framework: Thread management and streaming responses.
Demonstrates conversation state management and streaming in both frameworks.
"""
# Load environment variables from .env file
load_dotenv()
@@ -1,24 +1,15 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "autogen-agentchat",
# "autogen-ext[openai]",
# ]
# ///
# Run with any PEP 723 compatible runner, e.g.:
# uv run samples/autogen-migration/single_agent/04_agent_as_tool.py
# Copyright (c) Microsoft. All rights reserved.
import asyncio
from dotenv import load_dotenv
"""AutoGen vs Agent Framework: Agent-as-a-Tool pattern.
Demonstrates hierarchical agent architectures where one agent delegates
work to specialized sub-agents wrapped as tools.
"""
import asyncio
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
@@ -107,6 +98,7 @@ async def run_agent_framework() -> None:
if content.type == "function_call":
# Accumulate function call content as it streams in
call_id = content.call_id
assert call_id is not None, "Function call content must have a call_id"
if call_id in accumulated_calls:
# Add to existing call (arguments stream in gradually)
accumulated_calls[call_id] = accumulated_calls[call_id] + content
+6 -7
View File
@@ -165,18 +165,17 @@ Produces:
## Report Status Codes
| Status | Label | Description |
| ------- | --------- | ----------------------------------------- |
| SUCCESS | [PASS] | Sample ran to completion with exit code 0 |
| FAILURE | [FAIL] | Sample exited with non-zero code |
| TIMEOUT | [TIMEOUT] | Sample exceeded timeout limit |
| ERROR | [ERROR] | Exception during execution |
| Status | Label | Description |
| ------------- | --------------- | ----------------------------------------- |
| SUCCESS | [PASS] | Sample ran to completion with exit code 0 |
| FAILURE | [FAIL] | Sample did not complete successfully (non-zero exit code) |
| MISSING_SETUP | [MISSING_SETUP] | Sample skipped due to missing setup |
## Troubleshooting
### Agent output parsing errors
If an agent returns non-JSON content, that sample is marked as `ERROR` with parser details in the report.
If an agent returns non-JSON content, that sample is marked as `FAILURE` with parser details in the report.
### GitHub Copilot authentication or CLI issues
+9 -1
View File
@@ -75,6 +75,13 @@ Examples:
help="Custom name for the report files (without extension). If not provided, uses timestamp.",
)
parser.add_argument(
"--exclude",
nargs="+",
type=str,
help="Subdirectory paths to exclude (relative to the search directory set by --subdir)",
)
return parser.parse_args()
@@ -104,6 +111,7 @@ async def main() -> int:
samples_dir=samples_dir,
python_root=python_root,
subdir=args.subdir,
exclude=args.exclude,
max_parallel_workers=max(1, args.max_parallel_workers),
)
@@ -138,7 +146,7 @@ async def main() -> int:
print(f" JSON: {json_path}")
# Return appropriate exit code
failed = report.failure_count + report.timeout_count + report.error_count
failed = report.failure_count + report.missing_setup_count
return 1 if failed > 0 else 0
@@ -0,0 +1,224 @@
# Copyright (c) Microsoft. All rights reserved.
"""Aggregate validation reports across runs and produce a trend report.
Reads JSON reports from individual validation jobs, combines them with
cached history from previous runs, and produces a markdown trend report
showing per-sample status over the last 5 runs.
Usage:
python aggregate.py <reports-dir> <history-file> <output-file>
"""
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any
# Number of most-recent runs kept in the history file and shown in the report.
MAX_HISTORY = 5

# Table-cell symbol for each status value found in the JSON reports. These must
# match the legend printed at the bottom of the trend report; an empty symbol
# would render as a blank cell.
STATUS_EMOJI = {
    "success": "✅",
    "failure": "❌",
    "missing_setup": "⚠️",
}
def _format_run_label(timestamp: str) -> str:
"""Format a run timestamp as a compact column label (e.g. '03-24 18:05')."""
try:
dt = datetime.fromisoformat(timestamp)
return dt.strftime("%m-%d %H:%M")
except (ValueError, TypeError):
return timestamp[:16]
def load_current_run(reports_dir: Path) -> dict[str, Any]:
    """Merge every JSON report in ``reports_dir`` into a single run record.

    Args:
        reports_dir: Directory whose top-level ``*.json`` files are the
            reports written by the individual validation jobs.

    Returns:
        A dict with the current time as an ISO ``timestamp``, a ``summary`` of
        counts summed across all reports, and ``results`` mapping each sample
        path to its status string. With no report files, the counts are all
        zero and ``results`` is empty.
    """
    status_by_path: dict[str, str] = {}
    # Running totals, keyed exactly like the "summary" section of each report.
    totals = {
        "total_samples": 0,
        "success_count": 0,
        "failure_count": 0,
        "missing_setup_count": 0,
    }

    report_files = sorted(reports_dir.glob("*.json"))
    if not report_files:
        print(f"Warning: No JSON report files found in {reports_dir}")

    # Files are visited in sorted order, so for a path that appears in more
    # than one report the status from the last file wins.
    for report_file in report_files:
        print(f" Loading report: {report_file.name}")
        report = json.loads(report_file.read_text(encoding="utf-8"))
        for entry in report["results"]:
            status_by_path[entry["path"]] = entry["status"]
        report_summary = report["summary"]
        for counter in totals:
            totals[counter] += report_summary[counter]

    return {
        "timestamp": datetime.now().isoformat(),
        "summary": totals,
        "results": status_by_path,
    }
def load_history(history_path: Path) -> list[dict[str, Any]]:
    """Load previous run history from cache.

    Args:
        history_path: JSON file of the form ``{"runs": [...]}``.

    Returns:
        The stored list of runs, or an empty list when the file is missing,
        is not valid JSON, or does not have the expected shape.
    """
    if not history_path.exists():
        print(" No previous history found")
        return []

    try:
        with open(history_path, encoding="utf-8") as f:
            data = json.load(f)
    except ValueError as ex:
        # Covers json.JSONDecodeError and UnicodeDecodeError. A damaged history
        # file should not abort the aggregation of the current run, so report
        # it and start over with an empty history.
        print(f" Warning: could not parse history ({ex}); starting fresh")
        return []

    runs = data.get("runs", []) if isinstance(data, dict) else None
    if not isinstance(runs, list):
        print(" Warning: history file has an unexpected format; starting fresh")
        return []

    print(f" Loaded {len(runs)} previous run(s) from history")
    return runs
def save_history(history_path: Path, runs: list[dict[str, Any]]) -> None:
    """Write the newest ``MAX_HISTORY`` runs to ``history_path`` as JSON.

    Args:
        history_path: Destination file; missing parent directories are created.
        runs: Run records, of which only the trailing ``MAX_HISTORY`` entries
            are written.
    """
    kept_runs = runs[-MAX_HISTORY:]
    history_path.parent.mkdir(parents=True, exist_ok=True)
    with history_path.open("w", encoding="utf-8") as history_file:
        json.dump({"runs": kept_runs}, history_file, indent=2)
    print(f" Saved {len(kept_runs)} run(s) to history")
def generate_trend_report(runs: list[dict[str, Any]]) -> str:
    """Generate a markdown trend report from run history.

    Args:
        runs: Run records ordered oldest to newest, each holding ``timestamp``,
            ``summary`` and ``results`` entries.

    Returns:
        Markdown text with an overall status table followed by a per-sample
        table. Both list the most recent run first and are padded with ``N/A``
        entries when fewer than ``MAX_HISTORY`` runs are available. When no run
        has any sample result, the per-sample table is replaced by a short
        notice.
    """
    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    # The header is labelled "UTC", so take the time in UTC rather than the
    # machine's local zone.
    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    newest_first = list(reversed(runs))
    padding = max(0, MAX_HISTORY - len(runs))

    lines = [
        "# Sample Validation Trend Report",
        "",
        f"*Generated: {generated_at}*",
        "",
    ]

    # --- Overall status table (most recent first) ---
    lines.append(f"## Overall Status (Last {MAX_HISTORY} Runs)")
    lines.append("")
    lines.append("| Run | Success | Failure | Missing Setup | Total |")
    lines.append("|-----|---------|---------|---------------|-------|")
    for run in newest_first:
        s = run["summary"]
        total = s["total_samples"]
        label = _format_run_label(run["timestamp"])
        lines.append(
            f"| {label} | {s['success_count']}/{total} "
            f"| {s['failure_count']}/{total} "
            f"| {s['missing_setup_count']}/{total} "
            f"| {total} |"
        )
    # Pad with N/A rows if fewer than MAX_HISTORY runs
    lines.extend(["| N/A | N/A | N/A | N/A | N/A |"] * padding)
    lines.append("")

    # --- Per-sample results table ---
    lines.append("## Per-Sample Results")
    lines.append("")

    # Collect all sample paths across all runs
    all_paths: set[str] = set()
    for run in runs:
        all_paths.update(run["results"].keys())

    if not all_paths:
        lines.append("*No sample results available.*")
        return "\n".join(lines)

    # Build header (most recent run first), then N/A columns for missing runs
    header = "| Sample |"
    separator = "|--------|"
    for run in newest_first:
        header += f" {_format_run_label(run['timestamp'])} |"
        separator += "------------|"
    header += " N/A |" * padding
    separator += "-----|" * padding
    lines.append(header)
    lines.append(separator)

    for path in sorted(all_paths):
        # A sample absent from a run, or carrying an unknown status, shows N/A.
        cells = [
            STATUS_EMOJI.get(run["results"].get(path, "N/A"), "N/A")
            for run in newest_first
        ]
        cells.extend(["N/A"] * padding)
        lines.append(f"| `{path}` |" + "".join(f" {cell} |" for cell in cells))

    lines.append("")
    lines.append("**Legend:** ✅ Success · ❌ Failure · ⚠️ Missing Setup · N/A Not available")
    lines.append("")
    return "\n".join(lines)
def main() -> int:
    """Aggregate the current run's reports with history and write the trend report.

    Expects exactly three command-line arguments: the reports directory, the
    history file, and the output file.

    Returns:
        1 when the argument count is wrong (after printing usage), otherwise 0.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python aggregate.py <reports-dir> <history-file> <output-file>")
        return 1
    reports_dir, history_path, output_path = (Path(arg) for arg in cli_args)

    print("Aggregating validation results...")

    # Merge the report files produced by this run
    print(f"\nLoading reports from {reports_dir}:")
    current_run = load_current_run(reports_dir)
    counts = current_run["summary"]
    print(
        f" Current run: {counts['success_count']} success, "
        f"{counts['failure_count']} failure, "
        f"{counts['missing_setup_count']} missing setup "
        f"(total: {counts['total_samples']})"
    )

    # Previous runs followed by the current one, capped at MAX_HISTORY entries
    print(f"\nLoading history from {history_path}:")
    runs = [*load_history(history_path), current_run][-MAX_HISTORY:]

    # Persist the updated history before rendering the report
    print(f"\nSaving history to {history_path}:")
    save_history(history_path, runs)

    # Render the markdown report and write it out
    print("\nGenerating trend report...")
    report = generate_trend_report(runs)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(report, encoding="utf-8")
    print(f"Trend report written to {output_path}")

    # Echo the report to stdout as well
    print("\n" + "=" * 80)
    print(report)

    return 0
if __name__ == "__main__":
sys.exit(main())
@@ -14,7 +14,8 @@ from agent_framework import (
handler,
)
from agent_framework.github import GitHubCopilotAgent
from copilot.types import PermissionRequest, PermissionRequestResult
from copilot.generated.session_events import PermissionRequest
from copilot.types import PermissionRequestResult
from pydantic import BaseModel
from typing_extensions import Never
@@ -36,6 +37,7 @@ class AgentResponseFormat(BaseModel):
status: str
output: str
error: str
fix: str
@dataclass
@@ -54,15 +56,20 @@ class BatchCompletion:
AgentInstruction = (
"You are validating exactly one Python sample.\n"
"Analyze the sample code and execute it. Based on the execution result, determine if it "
"runs successfully, fails, or times out. Feel free to install any required dependencies.\n"
"Analyze the sample code and execute it as it is. Based on the execution result, determine "
"if it runs successfully, fails, or is missing_setup. Use `missing_setup` if the sample reports "
"missing required environment variables. The environment you're given should contain the necessary "
"variables. Don't create new environment variables nor modify the sample code.\n"
"Feel free to install any required dependencies if needed.\n"
"The sample can be interactive. If it is interactive, respond to the sample when prompted "
"based on your analysis of the code. You do not need to consult human on what to respond.\n"
"If the sample fails, investigate the error and suggest a fix.\n"
"Return ONLY valid JSON with this schema:\n"
"{\n"
' "status": "success|failure|timeout|error",\n'
' "status": "success|failure|missing_setup",\n'
' "output": "short summary of the result and what you did if the sample was interactive",\n'
' "error": "error details or empty string"\n'
' "error": "error details or empty string",\n'
' "fix": "suggested code fix if the sample failed, otherwise empty string"\n'
"}\n\n"
)
@@ -87,16 +94,15 @@ def status_from_text(value: str) -> RunStatus:
for status in RunStatus:
if status.value == normalized:
return status
return RunStatus.ERROR
return RunStatus.FAILURE
def prompt_permission(
request: PermissionRequest, context: dict[str, str]
) -> PermissionRequestResult:
"""Permission handler that always approves."""
kind = request.get("kind", "unknown")
logger.debug(
f"[Permission Request: {kind}] ({context})Automatically approved for sample validation."
f"[Permission Request: {request.kind}] ({context})Automatically approved for sample validation."
)
return PermissionRequestResult(kind="approved")
@@ -108,39 +114,73 @@ class CustomAgentExecutor(Executor):
returned as error responses, otherwise an exception in one agent could crash the entire workflow.
"""
# Retry in case GitHub Copilot agent encounters transient errors unrelated to the sample execution.
RETRY_COUNT = 1
def __init__(self, agent: GitHubCopilotAgent):
super().__init__(id=agent.id)
self.agent = agent
self._session = agent.create_session()
@handler
async def handle_task(
self, sample: SampleInfo, ctx: WorkflowContext[WorkerFreed | RunResult]
) -> None:
"""Execute one sample task and notify collector + coordinator."""
try:
response = await self.agent.run(
[
Message(
role="user",
text=f"Validate the following sample:\n\n{sample.relative_path}",
current_retry = 0
while True:
try:
response = await self.agent.run(
[
Message(
role="user",
text=f"Validate the following sample:\n\n{sample.relative_path}",
)
],
session=self._session,
)
result_payload = parse_agent_json(response.text)
result = RunResult(
sample=sample,
status=status_from_text(result_payload.status),
output=result_payload.output,
error=result_payload.error,
fix=result_payload.fix,
)
break
except Exception as ex:
if current_retry < self.RETRY_COUNT:
logger.warning(
f"Error executing agent {self.agent.id} (attempt {current_retry + 1}/{self.RETRY_COUNT}): {ex}. Retrying..."
)
]
)
result_payload = parse_agent_json(response.text)
result = RunResult(
sample=sample,
status=status_from_text(result_payload.status),
output=result_payload.output,
error=result_payload.error,
)
except Exception as ex:
logger.error(f"Error executing agent {self.agent.id}: {ex}")
result = RunResult(
sample=sample,
status=RunStatus.ERROR,
output="",
error=str(ex),
)
try:
current_retry += 1
await self.agent.stop()
await self.agent.start()
self._session = self.agent.create_session() # Reset session for retry
continue
except Exception as restart_ex:
logger.error(
f"Error restarting agent {self.agent.id}: {restart_ex}. No more retries."
)
result = RunResult(
sample=sample,
status=RunStatus.FAILURE,
output="",
error=f"Original error: {ex}. Restart error: {restart_ex}",
fix="",
)
break
logger.error(f"Error executing agent {self.agent.id}: {ex}")
result = RunResult(
sample=sample,
status=RunStatus.FAILURE,
output="",
error=str(ex),
fix="",
)
break
await ctx.send_message(result, target_id="collector")
await ctx.send_message(WorkerFreed(worker_id=self.id), target_id="coordinator")
@@ -252,7 +292,7 @@ class CreateConcurrentValidationWorkflowExecutor(Executor):
instructions=AgentInstruction,
default_options={
"on_permission_request": prompt_permission,
"timeout": 180,
"timeout": 60,
}, # type: ignore
)
agents.append(agent)
+20 -4
View File
@@ -52,13 +52,18 @@ def _has_main_entrypoint_guard(path: Path) -> bool:
)
def discover_samples(samples_dir: Path, subdir: str | None = None) -> list[SampleInfo]:
def discover_samples(
samples_dir: Path,
subdir: str | None = None,
exclude: list[str] | None = None,
) -> list[SampleInfo]:
"""
Find all Python sample files in the samples directory.
Args:
samples_dir: Root samples directory
subdir: Optional subdirectory to filter to
exclude: Optional list of subdirectory paths (relative to the search directory) to exclude
Returns:
List of SampleInfo objects for each discovered sample
@@ -72,12 +77,21 @@ def discover_samples(samples_dir: Path, subdir: str | None = None) -> list[Sampl
else:
search_dir = samples_dir
# Resolve excluded paths to absolute for reliable comparison
exclude_paths = {(search_dir / exc).resolve() for exc in (exclude or [])}
python_files: list[Path] = []
# Walk through all subdirectories and find .py files
for root, dirs, files in os.walk(search_dir):
# Skip directories that start with _ (like _sample_validation)
dirs[:] = [d for d in dirs if not d.startswith("_") and d != "__pycache__"]
# Skip directories that start with _, __pycache__, or excluded paths
dirs[:] = [
d
for d in dirs
if not d.startswith("_")
and d != "__pycache__"
and (Path(root) / d).resolve() not in exclude_paths
]
for file in files:
# Skip files that start with _ and include only scripts with a main entrypoint guard
@@ -113,8 +127,10 @@ class DiscoverSamplesExecutor(Executor):
print(f"🔍 Discovering samples in {self.config.samples_dir}")
if self.config.subdir:
print(f" Filtering to subdirectory: {self.config.subdir}")
if self.config.exclude:
print(f" Excluding: {', '.join(self.config.exclude)}")
samples = discover_samples(self.config.samples_dir, self.config.subdir)
samples = discover_samples(self.config.samples_dir, self.config.subdir, self.config.exclude)
print(f" Found {len(samples)} samples")
await ctx.send_message(DiscoveryResult(samples=samples))
+9 -11
View File
@@ -18,6 +18,7 @@ class ValidationConfig:
samples_dir: Path
python_root: Path
subdir: str | None = None
exclude: list[str] | None = None
max_parallel_workers: int = 10
@@ -60,8 +61,7 @@ class RunStatus(Enum):
SUCCESS = "success"
FAILURE = "failure"
TIMEOUT = "timeout"
ERROR = "error"
MISSING_SETUP = "missing_setup"
@dataclass
@@ -72,6 +72,7 @@ class RunResult:
status: RunStatus
output: str
error: str
fix: str
@dataclass
@@ -89,8 +90,7 @@ class Report:
total_samples: int
success_count: int
failure_count: int
timeout_count: int
error_count: int
missing_setup_count: int
results: list[RunResult] = field(default_factory=list) # type: ignore
def to_markdown(self) -> str:
@@ -107,15 +107,14 @@ class Report:
f"| Total Samples | {self.total_samples} |",
f"| [PASS] Success | {self.success_count} |",
f"| [FAIL] Failure | {self.failure_count} |",
f"| [TIMEOUT] Timeout | {self.timeout_count} |",
f"| [ERROR] Error | {self.error_count} |",
f"| [MISSING_SETUP] Missing Setup | {self.missing_setup_count} |",
"",
"## Detailed Results",
"",
]
# Group by status
for status in [RunStatus.FAILURE, RunStatus.TIMEOUT, RunStatus.ERROR, RunStatus.SUCCESS]:
for status in [RunStatus.FAILURE, RunStatus.MISSING_SETUP, RunStatus.SUCCESS]:
status_results = [r for r in self.results if r.status == status]
if not status_results:
continue
@@ -123,8 +122,7 @@ class Report:
status_label = {
RunStatus.SUCCESS: "[PASS]",
RunStatus.FAILURE: "[FAIL]",
RunStatus.TIMEOUT: "[TIMEOUT]",
RunStatus.ERROR: "[ERROR]",
RunStatus.MISSING_SETUP: "[MISSING_SETUP]",
}
lines.append(f"### {status_label[status]} {status.value.title()} ({len(status_results)})")
@@ -148,8 +146,7 @@ class Report:
"total_samples": self.total_samples,
"success_count": self.success_count,
"failure_count": self.failure_count,
"timeout_count": self.timeout_count,
"error_count": self.error_count,
"missing_setup_count": self.missing_setup_count,
},
"results": [
{
@@ -157,6 +154,7 @@ class Report:
"status": r.status.value,
"output": r.output,
"error": r.error,
"fix": r.fix,
}
for r in self.results
],
+6 -10
View File
@@ -22,12 +22,11 @@ def generate_report(results: list[RunResult]) -> Report:
Returns:
Report object with aggregated statistics
"""
# Sort results: failures, timeouts, errors first, then successes
# Sort results: failures, missing setup first, then successes
status_priority = {
RunStatus.FAILURE: 0,
RunStatus.TIMEOUT: 1,
RunStatus.ERROR: 2,
RunStatus.SUCCESS: 3,
RunStatus.MISSING_SETUP: 1,
RunStatus.SUCCESS: 2,
}
sorted_results = sorted(results, key=lambda r: status_priority[r.status])
@@ -36,8 +35,7 @@ def generate_report(results: list[RunResult]) -> Report:
total_samples=len(results),
success_count=sum(1 for r in results if r.status == RunStatus.SUCCESS),
failure_count=sum(1 for r in results if r.status == RunStatus.FAILURE),
timeout_count=sum(1 for r in results if r.status == RunStatus.TIMEOUT),
error_count=sum(1 for r in results if r.status == RunStatus.ERROR),
missing_setup_count=sum(1 for r in results if r.status == RunStatus.MISSING_SETUP),
results=sorted_results,
)
@@ -86,8 +84,7 @@ def print_summary(report: Report) -> None:
if (
report.failure_count == 0
and report.timeout_count == 0
and report.error_count == 0
and report.missing_setup_count == 0
):
print("[PASS] ALL SAMPLES PASSED!")
else:
@@ -98,8 +95,7 @@ def print_summary(report: Report) -> None:
print("Results:")
print(f" [PASS] Success: {report.success_count}")
print(f" [FAIL] Failure: {report.failure_count}")
print(f" [TIMEOUT] Timeout: {report.timeout_count}")
print(f" [ERR] Errors: {report.error_count}")
print(f" [MISSING_SETUP] Missing Setup: {report.missing_setup_count}")
print("=" * 80)
# Print JSON output for GitHub Actions visibility
@@ -66,9 +66,10 @@ class RunDynamicValidationWorkflowExecutor(Executor):
fallback_results = [
RunResult(
sample=sample,
status=RunStatus.ERROR,
status=RunStatus.FAILURE,
output="",
error="Nested workflow did not return an ExecutionResult.",
fix="",
)
for sample in creation.samples
]