Python: Automate sample validation (#4193)

* Automate sample validation: part 1

* Automate sample validation: part 2

* Create GH workflow

* comments

* Fix mypy
This commit is contained in:
Tao Chen
2026-02-23 17:08:16 -08:00
committed by GitHub
Unverified
parent 55398e21df
commit b7efaae709
14 changed files with 1408 additions and 308 deletions
@@ -31,6 +31,9 @@ class UserMemoryProvider(BaseContextProvider):
DEFAULT_SOURCE_ID = "user_memory"
def __init__(self):
super().__init__(self.DEFAULT_SOURCE_ID)
async def before_run(
self,
*,
-304
View File
@@ -1,304 +0,0 @@
# Copyright (c) Microsoft. All rights reserved.
"""
Script to run all Python samples in the samples directory concurrently.
This script will run all samples and report results at the end.
Note: This script is AI generated. This is for internal validation purposes only.
Samples that require human interaction are known to fail.
Usage:
python run_all_samples.py # Run all samples using uv run (concurrent)
python run_all_samples.py --direct # Run all samples directly (concurrent,
# assumes environment is set up)
python run_all_samples.py --subdir <directory> # Run samples only in specific subdirectory
python run_all_samples.py --subdir getting_started/workflows # Example: run only workflow samples
"""
import argparse
import os
import subprocess
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
def find_python_samples(samples_dir: Path, subdir: str | None = None) -> list[Path]:
"""Find all Python sample files in the samples directory or a subdirectory."""
python_files: list[Path] = []
# Determine the search directory
if subdir:
search_dir = samples_dir / subdir
if not search_dir.exists():
print(f"Warning: Subdirectory '{subdir}' does not exist in {samples_dir}")
return []
print(f"Searching in subdirectory: {search_dir}")
else:
search_dir = samples_dir
print(f"Searching in all samples: {search_dir}")
# Walk through all subdirectories and find .py files
for root, dirs, files in os.walk(search_dir):
# Skip __pycache__ directories
dirs[:] = [d for d in dirs if d != "__pycache__"]
for file in files:
if file.endswith(".py") and not file.startswith("_") and file != "_run_all_samples.py":
python_files.append(Path(root) / file)
# Sort files for consistent execution order
return sorted(python_files)
def run_sample(
sample_path: Path,
use_uv: bool = True,
python_root: Path | None = None,
) -> tuple[bool, str, str, str]:
"""
Run a single sample file using subprocess and return (success, output, error_info, error_type).
Args:
sample_path: Path to the sample file
use_uv: Whether to use uv run
python_root: Root directory for uv run
Returns:
Tuple of (success, output, error_info, error_type)
error_type can be: "timeout", "input_hang", "execution_error", "exception"
"""
if use_uv and python_root:
cmd = ["uv", "run", "python", str(sample_path)]
cwd = python_root
else:
cmd = [sys.executable, sample_path.name]
cwd = sample_path.parent
# Set environment variables to handle Unicode properly
env = os.environ.copy()
env["PYTHONIOENCODING"] = "utf-8" # Force Python to use UTF-8 for I/O
env["PYTHONUTF8"] = "1" # Enable UTF-8 mode in Python 3.7+
try:
# Use Popen for better timeout handling with stdin for samples that may wait for input
# Popen gives us more control over process lifecycle compared to subprocess.run()
process = subprocess.Popen(
cmd, # Command to execute as a list [program, arg1, arg2, ...]
cwd=cwd, # Working directory for the subprocess
stdout=subprocess.PIPE, # Capture stdout so we can read the output
stderr=subprocess.PIPE, # Capture stderr so we can read error messages
stdin=subprocess.PIPE, # Create a pipe for stdin so we can send input
text=True, # Handle input/output as text strings (not bytes)
encoding="utf-8", # Use UTF-8 encoding to handle Unicode characters like emojis
errors="replace", # Replace problematic characters instead of failing
env=env, # Pass environment variables for proper Unicode handling
)
try:
# communicate() sends input to stdin and waits for process to complete
# input="" sends an empty string to stdin, which causes input() calls to
# immediately receive EOFError (End Of File) since there's no data to read.
# This prevents the process from hanging indefinitely waiting for user input.
stdout, stderr = process.communicate(input="", timeout=60)
except subprocess.TimeoutExpired:
# If the process doesn't complete within the timeout period, we need to
# forcibly terminate it. This is especially important for processes that
# ignore EOFError and continue to hang on input() calls.
# First attempt: Send SIGKILL (immediate termination) on Unix or TerminateProcess on Windows
process.kill()
try:
# Give the process a few seconds to clean up after being killed
stdout, stderr = process.communicate(timeout=5)
except subprocess.TimeoutExpired:
# If the process is still alive after kill(), use terminate() as a last resort
# terminate() sends SIGTERM (graceful termination request) which may work
# when kill() doesn't on some systems
process.terminate()
stdout, stderr = "", "Process forcibly terminated"
return False, "", f"TIMEOUT: {sample_path.name} (exceeded 60 seconds)", "timeout"
if process.returncode == 0:
output = stdout.strip() if stdout.strip() else "No output"
return True, output, "", "success"
error_info = f"Exit code: {process.returncode}"
if stderr.strip():
error_info += f"\nSTDERR: {stderr}"
# Check if this looks like an input/interaction related error
error_type = "execution_error"
stderr_safe = stderr.encode("utf-8", errors="replace").decode("utf-8") if stderr else ""
if "EOFError" in stderr_safe or "input" in stderr_safe.lower() or "stdin" in stderr_safe.lower():
error_type = "input_hang"
elif "UnicodeEncodeError" in stderr_safe and ("charmap" in stderr_safe or "codec can't encode" in stderr_safe):
error_type = "input_hang" # Unicode errors often indicate interactive samples with emojis
return False, stdout.strip() if stdout.strip() else "", error_info, error_type
except Exception as e:
return False, "", f"ERROR: {sample_path.name} - Exception: {str(e)}", "exception"
def parse_arguments() -> argparse.Namespace:
    """Build the command line interface of the sample runner and parse ``sys.argv``.

    Returns:
        Namespace with ``direct`` (bool), ``subdir`` (str or None) and
        ``max_workers`` (int, default 16).
    """
    usage_examples = """
Examples:
python run_all_samples.py # Run all samples
python run_all_samples.py --direct # Run all samples directly
python run_all_samples.py --subdir getting_started # Run only getting_started samples
python run_all_samples.py --subdir getting_started/workflows # Run only workflow samples
python run_all_samples.py --subdir semantic-kernel-migration # Run only SK migration samples
"""
    cli = argparse.ArgumentParser(
        description="Run Python samples concurrently",
        # Raw formatter keeps the line breaks of the examples above.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        "--direct",
        action="store_true",
        help="Run samples directly with python instead of using uv run",
    )
    cli.add_argument(
        "--subdir",
        type=str,
        help="Run samples only in the specified subdirectory (relative to samples/)",
    )
    cli.add_argument(
        "--max-workers",
        type=int,
        default=16,
        help="Maximum number of concurrent workers (default: 16)",
    )
    return cli.parse_args()
def main() -> None:
    """Run all discovered samples concurrently and print a categorized report.

    Samples are located with ``find_python_samples`` and executed through
    ``run_sample`` on a thread pool. Progress is printed as runs finish, followed
    by per-sample details and a summary. The process exits with status 1 when at
    least one sample failed; it returns normally otherwise (including when no
    samples are found).
    """
    args = parse_arguments()

    # Get the samples directory (assuming this script is in the samples directory)
    samples_dir = Path(__file__).parent
    python_root = samples_dir.parent  # Go up to the python/ directory

    print("Python samples runner")
    print(f"Samples directory: {samples_dir}")
    if args.direct:
        print("Running samples directly (assuming environment is set up)")
    else:
        print(f"Using uv run from: {python_root}")
    if args.subdir:
        print(f"Filtering to subdirectory: {args.subdir}")
    print("🚀 Running samples concurrently...")

    sample_files = find_python_samples(samples_dir, args.subdir)
    if not sample_files:
        print("No Python sample files found!")
        return
    print(f"Found {len(sample_files)} Python sample files")

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so clamp the
    # user-supplied value instead of crashing before any sample runs.
    max_workers = max(1, args.max_workers)

    results: list[tuple[Path, bool, str, str, str]] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_sample = {
            executor.submit(run_sample, sample_path, not args.direct, python_root): sample_path
            for sample_path in sample_files
        }

        # Report progress in completion order; results are re-sorted afterwards.
        for future in as_completed(future_to_sample):
            sample_path = future_to_sample[future]
            relative_path = sample_path.relative_to(samples_dir)
            try:
                success, output, error_info, error_type = future.result()
            except Exception as e:
                results.append((sample_path, False, "", f"Future exception: {str(e)}", "exception"))
                print(f"{relative_path} - EXCEPTION")
                continue

            results.append((sample_path, success, output, error_info, error_type))
            if success:
                print(f"{relative_path}")
            else:
                # Show error type in progress display
                error_display = f"{error_type.upper()}" if error_type != "execution_error" else "ERROR"
                print(f"{relative_path} - {error_display}")

    # Sort results by original file order for consistent reporting
    sample_to_index = {path: i for i, path in enumerate(sample_files)}
    results.sort(key=lambda x: sample_to_index[x[0]])

    successful_runs = sum(1 for _, success, _, _, _ in results if success)
    failed_runs = len(results) - successful_runs

    def count_failures(kind: str) -> int:
        """Count failed results whose error type equals ``kind``."""
        return sum(1 for r in results if not r[1] and r[4] == kind)

    # Print detailed results
    print(f"\n{'=' * 80}")
    print("DETAILED RESULTS:")
    print(f"{'=' * 80}")

    for sample_path, success, output, error_info, error_type in results:
        relative_path = sample_path.relative_to(samples_dir)
        if success:
            print(f"{relative_path}")
            if output and output != "No output":
                print(f" Output preview: {output[:100]}{'...' if len(output) > 100 else ''}")
            continue

        # Display error with type indicator
        if error_type == "timeout":
            print(f"⏱️ {relative_path} - TIMEOUT (likely waiting for input)")
        elif error_type == "input_hang":
            print(f"⌨️ {relative_path} - INPUT ERROR (interactive sample)")
        elif error_type == "exception":
            print(f"💥 {relative_path} - EXCEPTION")
        else:
            print(f"{relative_path} - EXECUTION ERROR")
        print(f" Error: {error_info}")

    # Print categorized summary
    print(f"\n{'=' * 80}")
    if failed_runs == 0:
        print("🎉 ALL SAMPLES COMPLETED SUCCESSFULLY!")
    else:
        print(f"{failed_runs} SAMPLE(S) FAILED!")
    print(f"Successful runs: {successful_runs}")
    print(f"Failed runs: {failed_runs}")

    if failed_runs > 0:
        print("\nFailure breakdown:")
        breakdown = (
            (" ⏱️ Timeouts (likely interactive)", "timeout"),
            (" ⌨️ Input errors (interactive)", "input_hang"),
            (" ❌ Execution errors", "execution_error"),
            (" 💥 Exceptions", "exception"),
        )
        for label, kind in breakdown:
            count = count_failures(kind)
            if count > 0:
                print(f"{label}: {count}")

    if args.subdir:
        print(f"Subdirectory filter: {args.subdir}")
    print(f"{'=' * 80}")

    # Exit with error code if any samples failed
    if failed_runs > 0:
        sys.exit(1)
# Run the sample runner only when this file is executed as a script.
if __name__ == "__main__":
    main()
+183
View File
@@ -0,0 +1,183 @@
# Sample Validation System
An AI-powered workflow system for validating Python samples by discovering them, creating a nested batched workflow, and producing a report.
## Architecture
```
┌─────────────────────────────────────────────────────────────────────┐
│ Sample Validation Workflow │
│ (Sequential - 4 Executors) │
└─────────────────────────────────────────────────────────────────────┘
┌──────────────────────────┼──────────────────────────┐
▼ ▼ ▼
┌───────────────┐ ┌─────────────────┐ ┌─────────────────┐
│ Discover │ ──► │ Create Dynamic │ ──► │ Run Nested │
│ Samples │ │ Batched Flow │ │ Workflow │
└───────────────┘ └─────────────────┘ └─────────────────┘
│ │ │
▼ ▼ ▼
List[SampleInfo] WorkflowCreationResult ExecutionResult
(workers + coordinator) │
┌─────────────────┐
│ Generate Report │
└─────────────────┘
Report
```
### Nested Workflow Strategy
```
┌─────────────────────────────────────────────────────────────────────┐
│ Nested Batched Workflow (coordinator + workers) │
├─────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────┐ │
│ │ WorkflowBuilder + fan-out/fan-in edges │ │
│ │ - Coordinator dispatches tasks in bounded batches │ │
│ │ - Worker executors run GitHub Copilot agents │ │
│ │ - Collector aggregates per-sample RunResult messages │ │
│ │ - Max in-flight workers set by --max-parallel-workers │ │
│ └─────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────────┘
```
## File Structure
```
samples/
├── _sample_validation/
│ ├── __init__.py # Package exports
│ ├── README.md # This file
│ ├── models.py # Data classes
│ │ ├── SampleInfo # Discovered sample metadata
│ │ ├── RunResult # Execution result
│ │ └── Report # Final validation report
│ ├── discovery.py # Sample discovery
│ │ ├── discover_samples() # Finds all .py files
│ │ └── DiscoverSamplesExecutor
│ ├── report.py # Report generation
│ │ ├── generate_report() # Create Report from results
│ │ ├── save_report() # Write to markdown/JSON
│ │ ├── print_summary() # Console output
│ │ └── GenerateReportExecutor
│ ├── create_dynamic_workflow_executor.py # Coordinator, workers, collector, CreateConcurrentValidationWorkflowExecutor
│ ├── run_dynamic_validation_workflow_executor.py # RunDynamicValidationWorkflowExecutor
│ ├── workflow.py # Workflow assembly entrypoint
│ └── __main__.py # CLI entry point (run with `python -m _sample_validation`)
```
## Dependencies
### Required
- **agent-framework** - Core workflow and agent functionality
- **agent-framework-github-copilot** - GitHub Copilot agent integration
### Optional
- Set the `GITHUB_COPILOT_MODEL` environment variable to override the default Copilot model selection.
## Environment Variables
No required environment variables. Optional:
| Variable | Description | Required |
| ------------------------ | --------------------------------- | -------- |
| `GITHUB_COPILOT_MODEL` | Copilot model override | No |
| `GITHUB_COPILOT_TIMEOUT` | Copilot request timeout (seconds) | No |
## Usage
### Basic Usage
```bash
# Validate all samples
uv run python -m _sample_validation
# Validate specific subdirectory
uv run python -m _sample_validation --subdir 03-workflows
# Save reports to files
uv run python -m _sample_validation --save-report --output-dir ./reports
```
### Configuration Options
```bash
uv run python -m _sample_validation [OPTIONS]
Options:
--subdir TEXT Subdirectory to validate (relative to samples/)
--output-dir TEXT Report output directory, used with --save-report (default: ./_sample_validation/reports)
--max-parallel-workers INT Max in-flight workers per batch (default: 10)
--save-report Save reports to files
--report-name TEXT Report file name without extension (default: timestamp)
```
### Examples
```bash
# Quick validation of a small directory
uv run python -m _sample_validation --subdir 03-workflows/_start-here
# Limit parallel workers for large sample sets
uv run python -m _sample_validation --subdir 02-agents --max-parallel-workers 8
# Save report artifacts
uv run python -m _sample_validation --save-report
```
## How It Works
### 1. Discovery
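Walks the samples directory and finds all `.py` files that:
- Don't start with `_` (excludes private files)
- Aren't in `__pycache__` directories
- Aren't in directories starting with `_` (excludes `_sample_validation`)
- Contain a top-level `if __name__ == "__main__":` guard (files without one are not treated as runnable samples)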
### 2. Dynamic Workflow Creation
Creates a nested workflow with:
- A coordinator executor
- One worker executor per discovered sample
- A collector executor
### 3. Nested Workflow Execution
The coordinator sends initial work to the first `max_parallel_workers` workers. As each worker finishes, it notifies
the coordinator, which dispatches the next queued sample. Workers also send result items to the collector, which emits
the final `ExecutionResult` once all samples are processed.
### 4. Report Generation
Produces:
- **Console summary** - Pass/fail counts with emoji indicators
- **Markdown report** - Detailed results grouped by status
- **JSON report** - Machine-readable for CI integration
## Report Status Codes
| Status | Label | Description |
| ------- | --------- | ----------------------------------------- |
| SUCCESS | [PASS] | Sample ran to completion with exit code 0 |
| FAILURE | [FAIL] | Sample exited with non-zero code |
| TIMEOUT | [TIMEOUT] | Sample exceeded timeout limit |
| ERROR | [ERROR] | Exception during execution |
## Troubleshooting
### Agent output parsing errors
If an agent returns non-JSON content, that sample is marked as `ERROR` with parser details in the report.
### GitHub Copilot authentication or CLI issues
Ensure GitHub Copilot is authenticated in your environment and the Copilot CLI is available.
@@ -0,0 +1,25 @@
# Copyright (c) Microsoft. All rights reserved.
"""
Sample Validation System
A workflow-based system for validating Python samples by:
1. Discovering all sample files
2. Creating a dynamic nested concurrent workflow (one GitHub agent per sample)
3. Running the nested workflow
4. Generating a validation report
Usage:
uv run python -m _sample_validation
uv run python -m _sample_validation --subdir 01-get-started
"""
from _sample_validation.models import Report, RunResult, SampleInfo
from _sample_validation.workflow import create_validation_workflow
# Names re-exported as the package's public API (also governs `import *`).
__all__ = [
    "SampleInfo",
    "RunResult",
    "Report",
    "create_validation_workflow",
]
@@ -0,0 +1,143 @@
# Copyright (c) Microsoft. All rights reserved.
"""
Sample Validation Script
Validates all Python samples in the samples directory using a workflow that:
1. Discovers all sample files
2. Builds a nested concurrent workflow with one GitHub agent per sample
3. Runs the nested workflow
4. Generates a validation report
Usage:
uv run python -m _sample_validation
uv run python -m _sample_validation --subdir 03-workflows
uv run python -m _sample_validation --save-report --output-dir ./reports
"""
import argparse
import asyncio
import os
import sys
import time
from pathlib import Path
# Add the samples directory to the path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from _sample_validation.models import Report
from _sample_validation.report import save_report
from _sample_validation.workflow import ValidationConfig, create_validation_workflow
def parse_arguments() -> argparse.Namespace:
    """Parse the command line arguments of the validation CLI.

    Returns:
        Namespace with ``subdir`` (str or None), ``output_dir`` (str),
        ``save_report`` (bool), ``max_parallel_workers`` (int, default 10) and
        ``report_name`` (str or None).
    """
    parser = argparse.ArgumentParser(
        description="Validate Python samples using a dynamic nested concurrent workflow",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # Reports are only written when --save-report is given, so the example
        # that shows --output-dir must include it.
        epilog="""
Examples:
uv run python -m _sample_validation # Validate all samples
uv run python -m _sample_validation --subdir 03-workflows # Validate only workflows
uv run python -m _sample_validation --save-report --output-dir ./reports # Save reports to custom dir
""",
    )

    parser.add_argument(
        "--subdir",
        type=str,
        help="Validate samples only in the specified subdirectory (relative to samples/)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./_sample_validation/reports",
        help=(
            "Directory to save validation reports; only used with --save-report "
            "(default: ./_sample_validation/reports)"
        ),
    )
    parser.add_argument(
        "--save-report",
        action="store_true",
        help="Save the validation report to files",
    )
    parser.add_argument(
        "--max-parallel-workers",
        type=int,
        default=10,
        help="Maximum number of samples to run in parallel per batch (default: 10)",
    )
    parser.add_argument(
        "--report-name",
        type=str,
        help="Custom name for the report files (without extension). If not provided, uses timestamp.",
    )

    return parser.parse_args()
async def main() -> int:
    """Run the sample validation workflow and translate the report into an exit code.

    Returns:
        ``1`` when the workflow yields no output or the report counts at least one
        failure, timeout or error; ``0`` otherwise.
    """
    args = parse_arguments()

    # This module lives inside the package, so the samples directory is two levels up.
    samples_dir = Path(__file__).parent.parent
    python_root = samples_dir.parent

    banner = "=" * 80
    print(banner)
    print("SAMPLE VALIDATION WORKFLOW")
    print(banner)
    print(f"Samples directory: {samples_dir}")
    print(f"Python root: {python_root}")

    model_override = os.environ.get("GITHUB_COPILOT_MODEL")
    if model_override:
        print(f"Using GitHub Copilot model override: {model_override}")

    workflow = create_validation_workflow(
        ValidationConfig(
            samples_dir=samples_dir,
            python_root=python_root,
            subdir=args.subdir,
            # Guard against zero or negative values from the command line.
            max_parallel_workers=max(1, args.max_parallel_workers),
        )
    )

    print("\nStarting validation workflow...")
    print("-" * 80)

    started_at = time.perf_counter()
    try:
        events = await workflow.run("start")
    finally:
        # Report the elapsed time even when the run raises.
        elapsed = time.perf_counter() - started_at
        print(f"\nWorkflow run completed in {elapsed:.2f}s")

    outputs = events.get_outputs()
    if not outputs:
        print("\n[ERROR] Workflow did not produce any output")
        return 1

    report: Report = outputs[0]

    if args.save_report:
        md_path, json_path = save_report(report, samples_dir / args.output_dir, name=args.report_name)
        print("\nReports saved:")
        print(f" Markdown: {md_path}")
        print(f" JSON: {json_path}")

    # Any non-successful sample makes the whole run fail.
    unsuccessful = report.failure_count + report.timeout_count + report.error_count
    return int(unsuccessful > 0)
# Script entry point: drive the async main() and exit with the status it returns.
if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
@@ -0,0 +1,3 @@
# Copyright (c) Microsoft. All rights reserved.
# Event type used for the WorkflowEvent a worker executor adds after finishing a sample.
WORKER_COMPLETED = "worker_completed"
@@ -0,0 +1,252 @@
# Copyright (c) Microsoft. All rights reserved.
import logging
from collections import deque
from dataclasses import dataclass
from _sample_validation.const import WORKER_COMPLETED
from _sample_validation.discovery import DiscoveryResult
from _sample_validation.models import (
ExecutionResult,
RunResult,
RunStatus,
SampleInfo,
ValidationConfig,
WorkflowCreationResult,
)
from agent_framework import (
Executor,
Message,
Workflow,
WorkflowBuilder,
WorkflowContext,
WorkflowEvent,
handler,
)
from agent_framework.github import GitHubCopilotAgent
from copilot.types import PermissionRequest, PermissionRequestResult
from pydantic import BaseModel
from typing_extensions import Never
logger = logging.getLogger(__name__)
class AgentResponseFormat(BaseModel):
    """Schema of the JSON object a validation agent is instructed to return."""

    # One of "success", "failure", "timeout" or "error" per the agent instructions.
    status: str
    # Short summary of the result (and of what the agent did for interactive samples).
    output: str
    # Error details, or an empty string.
    error: str
@dataclass
class CoordinatorStart:
    """Message handled by the coordinator's start handler to begin dispatching."""

    # Samples to validate; the coordinator copies them into its pending queue.
    samples: list[SampleInfo]
@dataclass
class WorkerFreed:
    """Message a worker sends to the coordinator after it finishes a sample."""

    # Executor id of the worker that is available for the next sample.
    worker_id: str
class BatchCompletion:
    """Marker message the coordinator sends to the collector once no work remains."""
# System instructions given to every validation agent. The JSON schema described
# here mirrors AgentResponseFormat, and the status values mirror RunStatus.
AgentInstruction = (
    "You are validating exactly one Python sample.\n"
    "Analyze the sample code and execute it. Determine if it runs successfully, fails, or times out.\n"
    "The sample can be interactive. If it is interactive, respond to the sample when prompted "
    "based on your analysis of the code. You do not need to consult human on what to respond\n"
    "Return ONLY valid JSON with this schema:\n"
    "{\n"
    ' "status": "success|failure|timeout|error",\n'
    ' "output": "short summary of the result and what you did if the sample was interactive",\n'
    ' "error": "error details or empty string"\n'
    "}\n\n"
)
def parse_agent_json(text: str) -> AgentResponseFormat:
    """Extract and validate the JSON object contained in an agent response.

    The payload is taken to be everything from the first ``{`` to the last ``}``,
    which tolerates prose or code fences around the object.

    Args:
        text: Raw text returned by the agent.

    Returns:
        The validated ``AgentResponseFormat``.

    Raises:
        ValueError: When no ``{...}`` span exists in ``text``. Validation errors
            raised by ``model_validate_json`` propagate unchanged.
    """
    candidate = text.strip()
    first_brace = candidate.find("{")
    last_brace = candidate.rfind("}")

    # A missing closing brace yields -1, which is always smaller than a found opener.
    if first_brace == -1 or last_brace < first_brace:
        raise ValueError("No JSON object found in response")

    payload = candidate[first_brace : last_brace + 1]
    return AgentResponseFormat.model_validate_json(payload)
def status_from_text(value: str) -> RunStatus:
    """Map a status string to ``RunStatus``, falling back to ``RunStatus.ERROR``.

    Matching ignores surrounding whitespace and letter case.
    """
    normalized = value.strip().lower()
    try:
        # Enum lookup by value; unknown strings raise ValueError.
        return RunStatus(normalized)
    except ValueError:
        return RunStatus.ERROR
def prompt_permission(request: PermissionRequest, context: dict[str, str]) -> PermissionRequestResult:
    """Approve every permission request raised by an agent.

    Args:
        request: The permission request; only its ``kind`` entry is read, for logging.
        context: Extra context supplied with the request; logged as-is.

    Returns:
        A ``PermissionRequestResult`` whose kind is ``"approved"``.
    """
    request_kind = request.get("kind", "unknown")
    # Lazy %-style arguments defer the formatting until the record is emitted.
    logger.debug(
        "[Permission Request: %s] (%s)Automatically approved for sample validation.",
        request_kind,
        context,
    )
    return PermissionRequestResult(kind="approved")
class CustomAgentExecutor(Executor):
    """Worker executor that validates one sample at a time with a GitHub Copilot agent.

    The agent call is wrapped so that any exception becomes an ``ERROR`` result
    for that sample instead of crashing the entire workflow.
    """

    def __init__(self, agent: GitHubCopilotAgent):
        # The executor reuses the agent's id so messages can be targeted at it.
        super().__init__(id=agent.id)
        self.agent = agent

    async def _validate(self, sample: SampleInfo) -> RunResult:
        """Ask the agent to validate ``sample`` and convert its reply to a RunResult."""
        try:
            request = Message(role="user", text=f"Validate the following sample:\n\n{sample.relative_path}")
            response = await self.agent.run([request])
            parsed = parse_agent_json(response.text)
            return RunResult(
                sample=sample,
                status=status_from_text(parsed.status),
                output=parsed.output,
                error=parsed.error,
            )
        except Exception as ex:
            logger.error(f"Error executing agent {self.agent.id}: {ex}")
            return RunResult(
                sample=sample,
                status=RunStatus.ERROR,
                output="",
                error=str(ex),
            )

    @handler
    async def handle_task(self, sample: SampleInfo, ctx: WorkflowContext[WorkerFreed | RunResult]) -> None:
        """Execute one sample task and notify collector + coordinator."""
        outcome = await self._validate(sample)

        # The result goes to the collector, then the coordinator learns this worker is free.
        await ctx.send_message(outcome, target_id="collector")
        await ctx.send_message(WorkerFreed(worker_id=self.id), target_id="coordinator")
        await ctx.add_event(WorkflowEvent(WORKER_COMPLETED, sample))  # type: ignore
class BatchCoordinatorExecutor(Executor):
    """Dispatch sample tasks to worker executors in bounded batches.

    Only the first ``max_parallel_workers`` worker ids receive the initial wave;
    each of them is handed the next pending sample whenever it reports back via
    ``WorkerFreed``. When the queue is empty and nothing is in flight, a single
    ``BatchCompletion`` is sent to the collector.
    """

    def __init__(self, worker_ids: list[str], max_parallel_workers: int) -> None:
        """Store the worker ids and the (clamped to >= 1) parallelism limit."""
        super().__init__(id="coordinator")
        self._worker_ids = worker_ids
        self._max_parallel_workers = max(1, max_parallel_workers)
        self._pending: deque[SampleInfo] = deque()  # Samples not yet dispatched
        self._inflight: set[str] = set()  # Ids of workers currently holding a sample

    async def _assign_next(self, worker_id: str, ctx: WorkflowContext[SampleInfo | BatchCompletion]) -> None:
        """Give ``worker_id`` the next pending sample, or signal completion when done."""
        if not self._pending:
            # No more samples to assign
            if not self._inflight:
                # All tasks are completed, notify collector and exit
                await ctx.send_message(BatchCompletion(), target_id="collector")
            return

        sample = self._pending.popleft()
        self._inflight.add(worker_id)
        # Messages will get queued in the runner until the next superstep when all workers are freed,
        # thus achieving automatic batching without needing complex synchronization logic
        await ctx.send_message(sample, target_id=worker_id)

    @handler
    async def on_start(self, start: CoordinatorStart, ctx: WorkflowContext[SampleInfo | BatchCompletion]) -> None:
        """Initialize queue and dispatch first wave of tasks."""
        self._pending = deque(start.samples)
        self._inflight.clear()

        if not self._pending:
            # Nothing to dispatch: complete exactly once. Looping over the workers
            # here would emit one completion per worker, or none without workers.
            await ctx.send_message(BatchCompletion(), target_id="collector")
            return

        for worker_id in self._worker_ids[: self._max_parallel_workers]:
            await self._assign_next(worker_id, ctx)

    @handler
    async def on_worker_freed(self, freed: WorkerFreed, ctx: WorkflowContext[SampleInfo | BatchCompletion]) -> None:
        """Dispatch next queued sample when a worker finishes."""
        self._inflight.discard(freed.worker_id)
        await self._assign_next(freed.worker_id, ctx)
class CollectorExecutor(Executor):
    """Collect per-sample results and emit the final execution result."""

    def __init__(self) -> None:
        super().__init__(id="collector")
        # Results received from workers, in arrival order.
        self._results: list[RunResult] = []

    @handler
    async def on_all(self, batch_completion: BatchCompletion, ctx: WorkflowContext[Never, ExecutionResult]) -> None:
        """Yield the accumulated results as the workflow output.

        Triggered by a ``BatchCompletion`` message; the message itself carries no data.
        """
        await ctx.yield_output(ExecutionResult(results=self._results))

    @handler
    async def on_item(self, item: RunResult, ctx: WorkflowContext) -> None:
        """Record one sample's result; the output itself is emitted by ``on_all``."""
        self._results.append(item)
class CreateConcurrentValidationWorkflowExecutor(Executor):
    """Executor that builds a nested concurrent workflow with one agent per sample."""

    def __init__(self, config: ValidationConfig):
        super().__init__(id="create_dynamic_workflow")
        self.config = config

    @handler
    async def create(
        self,
        discovery: DiscoveryResult,
        ctx: WorkflowContext[WorkflowCreationResult],
    ) -> None:
        """Build the coordinator/worker/collector workflow for the discovered samples.

        Sends a ``WorkflowCreationResult`` downstream. When no samples were
        discovered, the result carries no workflow and no agents.
        """
        samples = discovery.samples
        print(f"\nCreating nested batched workflow for {len(samples)} samples...")

        if not samples:
            await ctx.send_message(WorkflowCreationResult(samples=[], workflow=None, agents=[]))
            return

        def make_agent(position: int, sample: SampleInfo) -> GitHubCopilotAgent:
            """Create the Copilot agent dedicated to ``sample``."""
            agent_id = f"sample_validator_{position}({sample.relative_path})"
            return GitHubCopilotAgent(
                id=agent_id,
                name=agent_id,
                instructions=AgentInstruction,
                default_options={"on_permission_request": prompt_permission, "timeout": 180},  # type: ignore
            )

        agents: list[GitHubCopilotAgent] = [
            make_agent(position, sample) for position, sample in enumerate(samples, start=1)
        ]
        workers: list[CustomAgentExecutor] = [CustomAgentExecutor(agent) for agent in agents]

        coordinator = BatchCoordinatorExecutor(
            worker_ids=[worker.id for worker in workers],
            max_parallel_workers=self.config.max_parallel_workers,
        )
        collector = CollectorExecutor()

        builder = WorkflowBuilder(start_executor=coordinator, output_executors=[collector])
        # The coordinator reaches the collector directly to signal completion.
        builder.add_edge(coordinator, collector)
        for worker in workers:
            builder.add_edge(coordinator, worker)  # task dispatch
            builder.add_edge(worker, coordinator)  # "worker freed" notification
            builder.add_edge(worker, collector)  # per-sample result

        nested_workflow: Workflow = builder.build()

        await ctx.send_message(
            WorkflowCreationResult(
                samples=samples,
                workflow=nested_workflow,
                agents=agents,
            )
        )
@@ -0,0 +1,116 @@
# Copyright (c) Microsoft. All rights reserved.
"""Sample discovery module."""
import ast
import os
from pathlib import Path
from _sample_validation.models import DiscoveryResult, SampleInfo, ValidationConfig
from agent_framework import Executor, WorkflowContext, handler
def _is_main_entrypoint_guard(test: ast.expr) -> bool:
    """Tell whether ``test`` is the comparison ``__name__ == "__main__"``.

    Both operand orders are accepted; chained or non-equality comparisons are not.
    """
    if not isinstance(test, ast.Compare):
        return False
    # Exactly one `==` with exactly one right-hand operand.
    if len(test.ops) != 1 or not isinstance(test.ops[0], ast.Eq) or len(test.comparators) != 1:
        return False

    def is_dunder_name(node: ast.expr) -> bool:
        return isinstance(node, ast.Name) and node.id == "__name__"

    def is_main_literal(node: ast.expr) -> bool:
        return isinstance(node, ast.Constant) and node.value == "__main__"

    lhs, rhs = test.left, test.comparators[0]
    if is_dunder_name(lhs) and is_main_literal(rhs):
        return True
    return is_dunder_name(rhs) and is_main_literal(lhs)
def _has_main_entrypoint_guard(path: Path) -> bool:
    """Report whether the file at ``path`` has a top-level ``__main__`` guard.

    Files that cannot be read or parsed are treated as having no guard.
    """
    try:
        module = ast.parse(path.read_text(encoding="utf-8"))
    except Exception:
        # Best effort: unreadable or syntactically invalid files are simply skipped.
        return False

    # Only module-level statements count; nested guards are ignored.
    for statement in module.body:
        if isinstance(statement, ast.If) and _is_main_entrypoint_guard(statement.test):
            return True
    return False
def discover_samples(samples_dir: Path, subdir: str | None = None) -> list[SampleInfo]:
    """Discover runnable sample scripts and load them as ``SampleInfo`` objects.

    A file qualifies when it ends in ``.py``, its name does not start with ``_``,
    it is not below a directory whose name starts with ``_``, and it contains a
    top-level ``__main__`` guard.

    Args:
        samples_dir: Root samples directory.
        subdir: Optional subdirectory (relative to ``samples_dir``) to restrict to.

    Returns:
        One ``SampleInfo`` per discovered sample, ordered by path. Empty when
        ``subdir`` does not exist (a warning is printed in that case).
    """
    search_dir = samples_dir / subdir if subdir else samples_dir
    if subdir and not search_dir.exists():
        print(f"Warning: Subdirectory '{subdir}' does not exist in {samples_dir}")
        return []

    candidates: list[Path] = []
    for current_dir, child_dirs, filenames in os.walk(search_dir):
        # Prune underscore-prefixed directories in place (covers __pycache__ and
        # _sample_validation) so os.walk never descends into them.
        child_dirs[:] = [name for name in child_dirs if not name.startswith("_")]
        for filename in filenames:
            if not filename.endswith(".py") or filename.startswith("_"):
                continue
            candidate = Path(current_dir) / filename
            if _has_main_entrypoint_guard(candidate):
                candidates.append(candidate)

    samples: list[SampleInfo] = []
    # Sorted for a consistent execution order.
    for candidate in sorted(candidates):
        try:
            samples.append(SampleInfo.from_path(candidate, samples_dir))
        except Exception as e:
            print(f"Warning: Could not read {candidate}: {e}")
    return samples
class DiscoverSamplesExecutor(Executor):
    """Workflow step that runs sample discovery for the configured directory."""

    def __init__(self, config: ValidationConfig):
        super().__init__(id="discover_samples")
        self.config = config

    @handler
    async def discover(self, _: str, ctx: WorkflowContext[DiscoveryResult]) -> None:
        """Discover samples and forward them as a ``DiscoveryResult``.

        The incoming string only triggers the step; its value is ignored.
        """
        config = self.config
        print(f"🔍 Discovering samples in {config.samples_dir}")
        if config.subdir:
            print(f" Filtering to subdirectory: {config.subdir}")

        found = discover_samples(config.samples_dir, config.subdir)
        print(f" Found {len(found)} samples")

        await ctx.send_message(DiscoveryResult(samples=found))
+163
View File
@@ -0,0 +1,163 @@
# Copyright (c) Microsoft. All rights reserved.
"""Data models for the sample validation system."""
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from agent_framework import Workflow
from agent_framework.github import GitHubCopilotAgent
@dataclass
class ValidationConfig:
"""Configuration for the validation workflow."""
samples_dir: Path
python_root: Path
subdir: str | None = None
max_parallel_workers: int = 10
@dataclass
class SampleInfo:
    """A sample file found during discovery, together with its source text."""

    # Location of the sample file, exactly as handed to from_path.
    path: Path
    # The same location expressed relative to the samples root, as a string.
    relative_path: str
    # Full text of the file, decoded as UTF-8.
    code: str

    @classmethod
    def from_path(cls, path: Path, samples_dir: Path) -> "SampleInfo":
        """Build a SampleInfo by reading ``path`` from disk.

        Args:
            path: Sample file to load
            samples_dir: Root directory that ``path`` must be located under

        Returns:
            SampleInfo holding the path, its path relative to ``samples_dir``,
            and the file contents

        Raises:
            ValueError: If ``path`` is not located under ``samples_dir``
            OSError: If the file cannot be read
            UnicodeDecodeError: If the file is not valid UTF-8
        """
        relative = path.relative_to(samples_dir)
        source_text = path.read_text(encoding="utf-8")
        return cls(path=path, relative_path=str(relative), code=source_text)
@dataclass
class DiscoveryResult:
    """Message carrying the outcome of the sample discovery step."""

    # Every sample that discovery found, in the order discovery returned them.
    samples: list[SampleInfo]
@dataclass
class WorkflowCreationResult:
    """Message describing the nested per-sample concurrent workflow that was built."""

    # Samples the nested workflow is expected to validate.
    samples: list[SampleInfo]
    # The nested workflow to run, or None when no workflow was created.
    workflow: Workflow | None
    # GitHub Copilot agents associated with the nested workflow.
    agents: list[GitHubCopilotAgent]
class RunStatus(Enum):
    """Outcome categories for a single sample run.

    The string values are what ends up in serialized reports.
    """

    SUCCESS = "success"
    FAILURE = "failure"
    TIMEOUT = "timeout"
    ERROR = "error"
@dataclass
class RunResult:
    """Outcome of validating one sample."""

    # The sample this result belongs to.
    sample: SampleInfo
    # Outcome category of the run.
    status: RunStatus
    # Output text recorded for the run; may be empty.
    output: str
    # Error text recorded for the run; empty when there is nothing to report.
    error: str
@dataclass
class ExecutionResult:
    """Message carrying the per-sample results of an execution step."""

    # One RunResult per sample that was processed.
    results: list[RunResult]
@dataclass
class Report:
    """Final validation report.

    Holds the aggregate count for each RunStatus together with the individual
    run results, and renders them either as markdown or as a plain dictionary
    suitable for JSON serialization.
    """

    # Moment the report was generated.
    timestamp: datetime
    # Number of samples covered by the report.
    total_samples: int
    # Per-status tallies shown in the summary.
    success_count: int
    failure_count: int
    timeout_count: int
    error_count: int
    # Individual results backing the tallies above.
    results: list[RunResult] = field(default_factory=list)  # type: ignore

    @staticmethod
    def _error_preview(error: str, limit: int = 200) -> str:
        """Return a single-line, backtick-free excerpt of ``error`` capped at ``limit`` characters.

        The excerpt is embedded in a markdown inline code span inside a list
        item, so line breaks (typical for tracebacks) and backticks would
        otherwise terminate the span or the bullet early.
        """
        flattened = " ".join(error.split()).replace("`", "'")
        if len(flattened) > limit:
            return flattened[:limit] + "..."
        return flattened

    def to_markdown(self) -> str:
        """Generate a markdown report.

        Returns:
            Markdown text with a summary table followed by the results grouped
            by status. Problem statuses are listed before successes, and
            statuses without any result are omitted. Each error is shown as a
            shortened one-line preview; the full text is only available through
            ``to_dict``.
        """
        lines = [
            "# Sample Validation Report",
            "",
            f"**Generated:** {self.timestamp.isoformat()}",
            "",
            "## Summary",
            "",
            "| Metric | Count |",
            "|--------|-------|",
            f"| Total Samples | {self.total_samples} |",
            f"| [PASS] Success | {self.success_count} |",
            f"| [FAIL] Failure | {self.failure_count} |",
            f"| [TIMEOUT] Timeout | {self.timeout_count} |",
            f"| [ERROR] Error | {self.error_count} |",
            "",
            "## Detailed Results",
            "",
        ]

        # Built once; the mapping does not depend on the loop below.
        status_labels = {
            RunStatus.SUCCESS: "[PASS]",
            RunStatus.FAILURE: "[FAIL]",
            RunStatus.TIMEOUT: "[TIMEOUT]",
            RunStatus.ERROR: "[ERROR]",
        }

        # Group by status, problems first so they are seen without scrolling.
        for status in (RunStatus.FAILURE, RunStatus.TIMEOUT, RunStatus.ERROR, RunStatus.SUCCESS):
            status_results = [r for r in self.results if r.status == status]
            if not status_results:
                continue

            lines.append(f"### {status_labels[status]} {status.value.title()} ({len(status_results)})")
            lines.append("")

            for result in status_results:
                lines.append(f"- **{result.sample.relative_path}**")
                preview = self._error_preview(result.error) if result.error else ""
                if preview:
                    # Two-space indent so the bullet nests under the sample entry.
                    lines.append(f"  - Error: `{preview}`")

            lines.append("")

        return "\n".join(lines)

    def to_dict(self) -> dict[str, object]:
        """Convert report to dictionary for JSON serialization.

        Returns:
            Dictionary with the ISO-formatted timestamp, a ``summary`` of the
            counts, and one entry per result carrying the untruncated output
            and error text.
        """
        return {
            "timestamp": self.timestamp.isoformat(),
            "summary": {
                "total_samples": self.total_samples,
                "success_count": self.success_count,
                "failure_count": self.failure_count,
                "timeout_count": self.timeout_count,
                "error_count": self.error_count,
            },
            "results": [
                {
                    "path": r.sample.relative_path,
                    "status": r.status.value,
                    "output": r.output,
                    "error": r.error,
                }
                for r in self.results
            ],
        }
+105
View File
@@ -0,0 +1,105 @@
# Copyright (c) Microsoft. All rights reserved.
"""Report generation for sample validation results."""
import json
from datetime import datetime
from pathlib import Path
from _sample_validation.models import ExecutionResult, Report, RunResult, RunStatus
from agent_framework import Executor, WorkflowContext, handler
from typing_extensions import Never
def generate_report(results: list[RunResult]) -> Report:
    """
    Aggregate individual run results into a Report.

    Args:
        results: RunResult objects produced by sample execution

    Returns:
        Report stamped with the current time, carrying one count per status
        and the results themselves
    """

    def count_with(status: RunStatus) -> int:
        """Number of results whose status equals ``status``."""
        return len([r for r in results if r.status == status])

    generated_at = datetime.now()
    return Report(
        timestamp=generated_at,
        total_samples=len(results),
        success_count=count_with(RunStatus.SUCCESS),
        failure_count=count_with(RunStatus.FAILURE),
        timeout_count=count_with(RunStatus.TIMEOUT),
        error_count=count_with(RunStatus.ERROR),
        results=results,
    )
def save_report(report: Report, output_dir: Path, name: str | None = None) -> tuple[Path, Path]:
    """
    Write the report to disk as a markdown file and a JSON file.

    Args:
        report: The report to save
        output_dir: Directory for the report files; created if it does not exist
        name: Optional base file name without extension. When omitted or empty,
            a name derived from the report timestamp is used.

    Returns:
        Tuple of (markdown_path, json_path)
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = name or "validation_report_" + report.timestamp.strftime("%Y%m%d_%H%M%S")
    markdown_path = output_dir / f"{stem}.md"
    json_path = output_dir / f"{stem}.json"

    markdown_path.write_text(report.to_markdown(), encoding="utf-8")
    json_path.write_text(json.dumps(report.to_dict(), indent=2), encoding="utf-8")

    return markdown_path, json_path
def print_summary(report: Report) -> None:
    """Write a console summary of the validation report to stdout.

    The headline reads as passed only when the failure, timeout and error
    counts are all zero.
    """
    rule = "=" * 80
    problem_counts = (report.failure_count, report.timeout_count, report.error_count)
    all_passed = all(count == 0 for count in problem_counts)
    headline = "[PASS] ALL SAMPLES PASSED!" if all_passed else "[FAIL] SOME SAMPLES FAILED"

    print("\n" + rule)
    print("SAMPLE VALIDATION SUMMARY")
    print(rule)
    print(headline)

    print(f"\nTotal samples: {report.total_samples}")
    print()
    print("Results:")
    print(f" [PASS] Success: {report.success_count}")
    print(f" [FAIL] Failure: {report.failure_count}")
    print(f" [TIMEOUT] Timeout: {report.timeout_count}")
    print(f" [ERROR] Error: {report.error_count}")
    print(rule)
class GenerateReportExecutor(Executor):
    """Workflow step that turns execution results into the final Report."""

    def __init__(self) -> None:
        """Register the executor under the id ``generate_report``."""
        super().__init__(id="generate_report")

    @handler
    async def generate(self, execution: ExecutionResult, ctx: WorkflowContext[Never, Report]) -> None:
        """Build the report, print its summary, and yield it as workflow output."""
        print("\nGenerating report...")

        final_report = generate_report(execution.results)
        print_summary(final_report)

        await ctx.yield_output(final_report)
@@ -0,0 +1,64 @@
# Copyright (c) Microsoft. All rights reserved.
from collections.abc import Sequence
from _sample_validation.const import WORKER_COMPLETED
from _sample_validation.create_dynamic_workflow_executor import CoordinatorStart
from _sample_validation.models import ExecutionResult, RunResult, RunStatus, SampleInfo, WorkflowCreationResult
from agent_framework import Executor, WorkflowContext, handler
from agent_framework.github import GitHubCopilotAgent
async def stop_agents(agents: Sequence[GitHubCopilotAgent]) -> None:
    """Stop all GitHub Copilot agents used by the nested workflow.

    Shutdown is best-effort: a failure to stop one agent is reported as a
    warning, does not prevent the remaining agents from being stopped, and is
    not propagated to the caller.

    Args:
        agents: Agents to stop; may be empty
    """
    for agent in agents:
        try:
            await agent.stop()
        except Exception as e:
            # Keep going so one failing agent cannot block the others, but
            # surface the failure instead of hiding it.
            print(f"Warning: Could not stop agent: {e}")
class RunDynamicValidationWorkflowExecutor(Executor):
    """Executor that runs the nested workflow created in the previous step."""

    def __init__(self) -> None:
        """Register the executor under the id ``run_dynamic_workflow``."""
        super().__init__(id="run_dynamic_workflow")

    @handler
    async def run(self, creation: WorkflowCreationResult, ctx: WorkflowContext[ExecutionResult]) -> None:
        """Run the nested workflow and emit execution results.

        The nested workflow is streamed: every WORKER_COMPLETED event prints a
        progress line, and the ExecutionResult carried by an output event is
        forwarded downstream once the stream ends (the last one wins if several
        arrive).

        Special cases:
            - No workflow was created: an empty ExecutionResult is sent.
            - The workflow ends without producing an ExecutionResult: every
              sample is reported with status ERROR.

        The agents in ``creation`` are stopped on every path, including the
        no-workflow case and when the nested workflow raises.
        """
        try:
            # Checked inside the try block so the cleanup in ``finally`` also
            # covers this early exit.
            if creation.workflow is None:
                await ctx.send_message(ExecutionResult(results=[]))
                return

            print("\nRunning nested batched workflow...")
            print("-" * 80)

            remaining = len(creation.samples)
            result: ExecutionResult | None = None

            async for event in creation.workflow.run(CoordinatorStart(samples=creation.samples), stream=True):
                if event.type == "output" and isinstance(event.data, ExecutionResult):
                    result = event.data  # type: ignore
                elif event.type == WORKER_COMPLETED and isinstance(event.data, SampleInfo):  # type: ignore
                    remaining -= 1
                    print(
                        f"Completed validation for sample: {event.data.relative_path:<80} | "
                        f"Remaining: {remaining:>4}"
                    )

            if result is None:
                # Nothing came back: report every sample as an error rather
                # than silently producing an empty report.
                result = ExecutionResult(
                    results=[
                        RunResult(
                            sample=sample,
                            status=RunStatus.ERROR,
                            output="",
                            error="Nested workflow did not return an ExecutionResult.",
                        )
                        for sample in creation.samples
                    ]
                )

            await ctx.send_message(result)
        finally:
            await stop_agents(creation.agents)
@@ -0,0 +1,42 @@
# Copyright (c) Microsoft. All rights reserved.
"""
Sample Validation Workflow using Microsoft Agent Framework.
Workflow composition for sample validation.
"""
from _sample_validation.create_dynamic_workflow_executor import CreateConcurrentValidationWorkflowExecutor
from _sample_validation.discovery import DiscoverSamplesExecutor, ValidationConfig
from _sample_validation.report import GenerateReportExecutor
from _sample_validation.run_dynamic_validation_workflow_executor import RunDynamicValidationWorkflowExecutor
from agent_framework import Workflow, WorkflowBuilder
def create_validation_workflow(
    config: ValidationConfig,
) -> Workflow:
    """
    Assemble the sample validation workflow.

    The workflow is a linear chain of four executors: discover samples,
    create the nested validation workflow, run it, and generate the report.

    Args:
        config: Validation configuration, handed to the discovery and
            workflow-creation executors

    Returns:
        Configured Workflow instance whose start executor is the discovery step
    """
    discover = DiscoverSamplesExecutor(config)
    create_dynamic_workflow = CreateConcurrentValidationWorkflowExecutor(config)
    run_dynamic_workflow = RunDynamicValidationWorkflowExecutor()
    generate = GenerateReportExecutor()

    builder = WorkflowBuilder(start_executor=discover)
    builder = builder.add_edge(discover, create_dynamic_workflow)
    builder = builder.add_edge(create_dynamic_workflow, run_dynamic_workflow)
    builder = builder.add_edge(run_dynamic_workflow, generate)
    return builder.build()
# Public names of this module; ValidationConfig is re-exported alongside the workflow factory.
__all__ = ["ValidationConfig", "create_validation_workflow"]