Python: Automate sample validation (#4193)

* Automate sample validation: part 1 * Automate sample validation: part 2 * Create GH workflow * comments * Fix mypy
2026-06-16 21:04:09 +08:00 · 2026-02-23 17:08:16 -08:00
parent 55398e21df
commit b7efaae709
14 changed files with 1408 additions and 308 deletions
@@ -31,6 +31,9 @@ class UserMemoryProvider(BaseContextProvider):

    DEFAULT_SOURCE_ID = "user_memory"

+    def __init__(self) -> None:
+        """Initialize the base provider with ``DEFAULT_SOURCE_ID``."""
+        super().__init__(self.DEFAULT_SOURCE_ID)
+
    async def before_run(
        self,
        *,
@@ -1,304 +0,0 @@
-# Copyright (c) Microsoft. All rights reserved.
-
-"""
-Script to run all Python samples in the samples directory concurrently.
-This script will run all samples and report results at the end.
-
-Note: This script is AI generated. This is for internal validation purposes only.
-
-Samples that require human interaction are known to fail.
-
-Usage:
-    python run_all_samples.py                          # Run all samples using uv run (concurrent)
-    python run_all_samples.py --direct                 # Run all samples directly (concurrent,
-                                                       # assumes environment is set up)
-    python run_all_samples.py --subdir <directory>     # Run samples only in specific subdirectory
-    python run_all_samples.py --subdir getting_started/workflows  # Example: run only workflow samples
-"""
-
-import argparse
-import os
-import subprocess
-import sys
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from pathlib import Path
-
-
-def find_python_samples(samples_dir: Path, subdir: str | None = None) -> list[Path]:
-    """Find all Python sample files in the samples directory or a subdirectory."""
-    python_files: list[Path] = []
-
-    # Determine the search directory
-    if subdir:
-        search_dir = samples_dir / subdir
-        if not search_dir.exists():
-            print(f"Warning: Subdirectory '{subdir}' does not exist in {samples_dir}")
-            return []
-        print(f"Searching in subdirectory: {search_dir}")
-    else:
-        search_dir = samples_dir
-        print(f"Searching in all samples: {search_dir}")
-
-    # Walk through all subdirectories and find .py files
-    for root, dirs, files in os.walk(search_dir):
-        # Skip __pycache__ directories
-        dirs[:] = [d for d in dirs if d != "__pycache__"]
-
-        for file in files:
-            if file.endswith(".py") and not file.startswith("_") and file != "_run_all_samples.py":
-                python_files.append(Path(root) / file)
-
-    # Sort files for consistent execution order
-    return sorted(python_files)
-
-
-def run_sample(
-    sample_path: Path,
-    use_uv: bool = True,
-    python_root: Path | None = None,
-) -> tuple[bool, str, str, str]:
-    """
-    Run a single sample file using subprocess and return (success, output, error_info, error_type).
-
-    Args:
-        sample_path: Path to the sample file
-        use_uv: Whether to use uv run
-        python_root: Root directory for uv run
-
-    Returns:
-        Tuple of (success, output, error_info, error_type)
-        error_type can be: "timeout", "input_hang", "execution_error", "exception"
-    """
-    if use_uv and python_root:
-        cmd = ["uv", "run", "python", str(sample_path)]
-        cwd = python_root
-    else:
-        cmd = [sys.executable, sample_path.name]
-        cwd = sample_path.parent
-
-    # Set environment variables to handle Unicode properly
-    env = os.environ.copy()
-    env["PYTHONIOENCODING"] = "utf-8"  # Force Python to use UTF-8 for I/O
-    env["PYTHONUTF8"] = "1"  # Enable UTF-8 mode in Python 3.7+
-
-    try:
-        # Use Popen for better timeout handling with stdin for samples that may wait for input
-        # Popen gives us more control over process lifecycle compared to subprocess.run()
-        process = subprocess.Popen(
-            cmd,  # Command to execute as a list [program, arg1, arg2, ...]
-            cwd=cwd,  # Working directory for the subprocess
-            stdout=subprocess.PIPE,  # Capture stdout so we can read the output
-            stderr=subprocess.PIPE,  # Capture stderr so we can read error messages
-            stdin=subprocess.PIPE,  # Create a pipe for stdin so we can send input
-            text=True,  # Handle input/output as text strings (not bytes)
-            encoding="utf-8",  # Use UTF-8 encoding to handle Unicode characters like emojis
-            errors="replace",  # Replace problematic characters instead of failing
-            env=env,  # Pass environment variables for proper Unicode handling
-        )
-
-        try:
-            # communicate() sends input to stdin and waits for process to complete
-            # input="" sends an empty string to stdin, which causes input() calls to
-            # immediately receive EOFError (End Of File) since there's no data to read.
-            # This prevents the process from hanging indefinitely waiting for user input.
-            stdout, stderr = process.communicate(input="", timeout=60)
-        except subprocess.TimeoutExpired:
-            # If the process doesn't complete within the timeout period, we need to
-            # forcibly terminate it. This is especially important for processes that
-            # ignore EOFError and continue to hang on input() calls.
-
-            # First attempt: Send SIGKILL (immediate termination) on Unix or TerminateProcess on Windows
-            process.kill()
-            try:
-                # Give the process a few seconds to clean up after being killed
-                stdout, stderr = process.communicate(timeout=5)
-            except subprocess.TimeoutExpired:
-                # If the process is still alive after kill(), use terminate() as a last resort
-                # terminate() sends SIGTERM (graceful termination request) which may work
-                # when kill() doesn't on some systems
-                process.terminate()
-                stdout, stderr = "", "Process forcibly terminated"
-            return False, "", f"TIMEOUT: {sample_path.name} (exceeded 60 seconds)", "timeout"
-
-        if process.returncode == 0:
-            output = stdout.strip() if stdout.strip() else "No output"
-            return True, output, "", "success"
-
-        error_info = f"Exit code: {process.returncode}"
-        if stderr.strip():
-            error_info += f"\nSTDERR: {stderr}"
-
-        # Check if this looks like an input/interaction related error
-        error_type = "execution_error"
-        stderr_safe = stderr.encode("utf-8", errors="replace").decode("utf-8") if stderr else ""
-        if "EOFError" in stderr_safe or "input" in stderr_safe.lower() or "stdin" in stderr_safe.lower():
-            error_type = "input_hang"
-        elif "UnicodeEncodeError" in stderr_safe and ("charmap" in stderr_safe or "codec can't encode" in stderr_safe):
-            error_type = "input_hang"  # Unicode errors often indicate interactive samples with emojis
-
-        return False, stdout.strip() if stdout.strip() else "", error_info, error_type
-    except Exception as e:
-        return False, "", f"ERROR: {sample_path.name} - Exception: {str(e)}", "exception"
-
-
-def parse_arguments() -> argparse.Namespace:
-    """Parse command line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Run Python samples concurrently",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  python run_all_samples.py                                    # Run all samples
-  python run_all_samples.py --direct                           # Run all samples directly
-  python run_all_samples.py --subdir getting_started           # Run only getting_started samples
-  python run_all_samples.py --subdir getting_started/workflows # Run only workflow samples
-  python run_all_samples.py --subdir semantic-kernel-migration # Run only SK migration samples
-        """,
-    )
-
-    parser.add_argument(
-        "--direct", action="store_true", help="Run samples directly with python instead of using uv run"
-    )
-
-    parser.add_argument(
-        "--subdir", type=str, help="Run samples only in the specified subdirectory (relative to samples/)"
-    )
-
-    parser.add_argument(
-        "--max-workers", type=int, default=16, help="Maximum number of concurrent workers (default: 16)"
-    )
-
-    return parser.parse_args()
-
-
-def main() -> None:
-    """Main function to run all samples concurrently."""
-    args = parse_arguments()
-
-    # Get the samples directory (assuming this script is in the samples directory)
-    samples_dir = Path(__file__).parent
-    python_root = samples_dir.parent  # Go up to the python/ directory
-
-    print("Python samples runner")
-    print(f"Samples directory: {samples_dir}")
-
-    if args.direct:
-        print("Running samples directly (assuming environment is set up)")
-    else:
-        print(f"Using uv run from: {python_root}")
-
-    if args.subdir:
-        print(f"Filtering to subdirectory: {args.subdir}")
-
-    print("🚀 Running samples concurrently...")
-
-    # Find all Python sample files
-    sample_files = find_python_samples(samples_dir, args.subdir)
-
-    if not sample_files:
-        print("No Python sample files found!")
-        return
-
-    print(f"Found {len(sample_files)} Python sample files")
-
-    # Run samples concurrently
-    results: list[tuple[Path, bool, str, str, str]] = []
-
-    with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
-        # Submit all tasks
-        future_to_sample = {
-            executor.submit(run_sample, sample_path, not args.direct, python_root): sample_path
-            for sample_path in sample_files
-        }
-
-        # Collect results as they complete
-        for future in as_completed(future_to_sample):
-            sample_path = future_to_sample[future]
-            try:
-                success, output, error_info, error_type = future.result()
-                results.append((sample_path, success, output, error_info, error_type))
-
-                # Print progress - show relative path from samples directory
-                relative_path = sample_path.relative_to(samples_dir)
-                if success:
-                    print(f"✅ {relative_path}")
-                else:
-                    # Show error type in progress display
-                    error_display = f"{error_type.upper()}" if error_type != "execution_error" else "ERROR"
-                    print(f"❌ {relative_path} - {error_display}")
-
-            except Exception as e:
-                error_info = f"Future exception: {str(e)}"
-                results.append((sample_path, False, "", error_info, "exception"))
-                relative_path = sample_path.relative_to(samples_dir)
-                print(f"❌ {relative_path} - EXCEPTION")
-
-    # Sort results by original file order for consistent reporting
-    sample_to_index = {path: i for i, path in enumerate(sample_files)}
-    results.sort(key=lambda x: sample_to_index[x[0]])
-
-    successful_runs = sum(1 for _, success, _, _, _ in results if success)
-    failed_runs = len(results) - successful_runs
-
-    # Categorize failures by type
-    timeout_failures = [r for r in results if not r[1] and r[4] == "timeout"]
-    input_hang_failures = [r for r in results if not r[1] and r[4] == "input_hang"]
-    execution_errors = [r for r in results if not r[1] and r[4] == "execution_error"]
-    exceptions = [r for r in results if not r[1] and r[4] == "exception"]
-
-    # Print detailed results
-    print(f"\n{'=' * 80}")
-    print("DETAILED RESULTS:")
-    print(f"{'=' * 80}")
-
-    for sample_path, success, output, error_info, error_type in results:
-        relative_path = sample_path.relative_to(samples_dir)
-        if success:
-            print(f"✅ {relative_path}")
-            if output and output != "No output":
-                print(f"   Output preview: {output[:100]}{'...' if len(output) > 100 else ''}")
-        else:
-            # Display error with type indicator
-            if error_type == "timeout":
-                print(f"⏱️  {relative_path} - TIMEOUT (likely waiting for input)")
-            elif error_type == "input_hang":
-                print(f"⌨️  {relative_path} - INPUT ERROR (interactive sample)")
-            elif error_type == "exception":
-                print(f"💥 {relative_path} - EXCEPTION")
-            else:
-                print(f"❌ {relative_path} - EXECUTION ERROR")
-            print(f"   Error: {error_info}")
-
-    # Print categorized summary
-    print(f"\n{'=' * 80}")
-    if failed_runs == 0:
-        print("🎉 ALL SAMPLES COMPLETED SUCCESSFULLY!")
-    else:
-        print(f"❌ {failed_runs} SAMPLE(S) FAILED!")
-
-    print(f"Successful runs: {successful_runs}")
-    print(f"Failed runs: {failed_runs}")
-
-    if failed_runs > 0:
-        print("\nFailure breakdown:")
-        if len(timeout_failures) > 0:
-            print(f"  ⏱️  Timeouts (likely interactive): {len(timeout_failures)}")
-        if len(input_hang_failures) > 0:
-            print(f"  ⌨️  Input errors (interactive): {len(input_hang_failures)}")
-        if len(execution_errors) > 0:
-            print(f"  ❌ Execution errors: {len(execution_errors)}")
-        if len(exceptions) > 0:
-            print(f"  💥 Exceptions: {len(exceptions)}")
-
-    if args.subdir:
-        print(f"Subdirectory filter: {args.subdir}")
-
-    print(f"{'=' * 80}")
-
-    # Exit with error code if any samples failed
-    if failed_runs > 0:
-        sys.exit(1)
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,183 @@
+# Sample Validation System
+
+An AI-powered workflow system for validating Python samples by discovering them, creating a nested batched workflow, and producing a report.
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│                    Sample Validation Workflow                        │
+│                    (Sequential - 4 Executors)                        │
+└─────────────────────────────────────────────────────────────────────┘
+                                   │
+        ┌──────────────────────────┼──────────────────────────┐
+        ▼                          ▼                          ▼
+┌───────────────┐        ┌─────────────────┐        ┌─────────────────┐
+│   Discover    │   ──►  │ Create Dynamic  │   ──►  │ Run Nested      │
+│   Samples     │        │ Batched Flow    │        │ Workflow        │
+└───────────────┘        └─────────────────┘        └─────────────────┘
+        │                          │                          │
+        ▼                          ▼                          ▼
+  List[SampleInfo]          WorkflowCreationResult      ExecutionResult
+                        (workers + coordinator)              │
+                                                             ▼
+                                                    ┌─────────────────┐
+                                                    │ Generate Report │
+                                                    └─────────────────┘
+                                                             │
+                                                             ▼
+                                                          Report
+```
+
+### Nested Workflow Strategy
+
+```
+┌─────────────────────────────────────────────────────────────────────┐
+│             Nested Batched Workflow (coordinator + workers)          │
+├─────────────────────────────────────────────────────────────────────┤
+│                                                                     │
+│  ┌─────────────────────────────────────────────────────────────┐   │
+│  │ WorkflowBuilder + fan-out/fan-in edges                      │   │
+│  │ - Coordinator dispatches tasks in bounded batches           │   │
+│  │ - Worker executors run GitHub Copilot agents               │   │
+│  │ - Collector aggregates per-sample RunResult messages       │   │
+│  │ - Max in-flight workers set by --max-parallel-workers      │   │
+│  └─────────────────────────────────────────────────────────────┘   │
+└─────────────────────────────────────────────────────────────────────┘
+```
+
+## File Structure
+
+```
+samples/
+├── _sample_validation/
+│   ├── __init__.py              # Package exports
+│   ├── README.md                # This file
+│   ├── models.py                # Data classes
+│   │   ├── SampleInfo           # Discovered sample metadata
+│   │   ├── RunResult            # Execution result
+│   │   └── Report               # Final validation report
+│   ├── discovery.py             # Sample discovery
+│   │   ├── discover_samples()   # Finds all .py files
+│   │   └── DiscoverSamplesExecutor
+│   ├── report.py                # Report generation
+│   │   ├── generate_report()    # Create Report from results
+│   │   ├── save_report()        # Write to markdown/JSON
+│   │   ├── print_summary()      # Console output
+│   │   └── GenerateReportExecutor
+│   ├── create_dynamic_workflow_executor.py # Coordinator, workers, collector, CreateConcurrentValidationWorkflowExecutor
+│   ├── run_dynamic_validation_workflow_executor.py # RunDynamicValidationWorkflowExecutor
+│   ├── const.py                 # Shared constants (workflow event names)
+│   ├── workflow.py              # Workflow assembly entrypoint
+│   └── __main__.py              # CLI entry point (`python -m _sample_validation`)
+```
+
+## Dependencies
+
+### Required
+
+- **agent-framework** - Core workflow and agent functionality
+- **agent-framework-github-copilot** - GitHub Copilot agent integration
+
+### Optional
+
+- No optional packages. The `GITHUB_COPILOT_MODEL` environment variable can be set to override the default Copilot model selection (see below).
+
+## Environment Variables
+
+No required environment variables. Optional:
+
+| Variable                 | Description                       | Required |
+| ------------------------ | --------------------------------- | -------- |
+| `GITHUB_COPILOT_MODEL`   | Copilot model override            | No       |
+| `GITHUB_COPILOT_TIMEOUT` | Copilot request timeout (seconds) | No       |
+
+## Usage
+
+### Basic Usage
+
+```bash
+# Validate all samples
+uv run python -m _sample_validation
+
+# Validate specific subdirectory
+uv run python -m _sample_validation --subdir 03-workflows
+
+# Save reports to files
+uv run python -m _sample_validation --save-report --output-dir ./reports
+```
+
+### Configuration Options
+
+```bash
+uv run python -m _sample_validation [OPTIONS]
+
+Options:
+  --subdir TEXT                Subdirectory to validate (relative to samples/)
+  --output-dir TEXT            Report output directory (default: ./_sample_validation/reports)
+  --max-parallel-workers INT   Max in-flight workers per batch (default: 10)
+  --save-report                Save reports to files
+  --report-name TEXT           Report file name without extension (default: timestamp)
+```
+
+### Examples
+
+```bash
+# Quick validation of a small directory
+uv run python -m _sample_validation --subdir 03-workflows/_start-here
+
+# Limit parallel workers for large sample sets
+uv run python -m _sample_validation --subdir 02-agents --max-parallel-workers 8
+
+# Save report artifacts
+uv run python -m _sample_validation --save-report
+```
+
+## How It Works
+
+### 1. Discovery
+
+Walks the samples directory and finds all `.py` files that:
+
+- Don't start with `_` (excludes private files)
+- Aren't in `__pycache__` directories
+- Aren't in directories starting with `_` (excludes `_sample_validation`)
+
+### 2. Dynamic Workflow Creation
+
+Creates a nested workflow with:
+
+- A coordinator executor
+- One worker executor per discovered sample
+- A collector executor
+
+### 3. Nested Workflow Execution
+
+The coordinator sends initial work to the first `max_parallel_workers` workers. As each worker finishes, it notifies
+the coordinator, which dispatches the next queued sample. Workers also send result items to the collector, which emits
+the final `ExecutionResult` once all samples are processed.
+
+### 4. Report Generation
+
+Produces:
+
+- **Console summary** - Pass/fail counts with emoji indicators
+- **Markdown report** - Detailed results grouped by status
+- **JSON report** - Machine-readable for CI integration
+
+## Report Status Codes
+
+| Status  | Label     | Description                               |
+| ------- | --------- | ----------------------------------------- |
+| SUCCESS | [PASS]    | Sample ran to completion with exit code 0 |
+| FAILURE | [FAIL]    | Sample exited with non-zero code          |
+| TIMEOUT | [TIMEOUT] | Sample exceeded timeout limit             |
+| ERROR   | [ERROR]   | Exception during execution                |
+
+## Troubleshooting
+
+### Agent output parsing errors
+
+If an agent returns non-JSON content, that sample is marked as `ERROR` with parser details in the report.
+
+### GitHub Copilot authentication or CLI issues
+
+Ensure GitHub Copilot is authenticated in your environment and the Copilot CLI is available.
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""
+Sample Validation System
+
+A workflow-based system for validating Python samples by:
+1. Discovering all sample files
+2. Creating a dynamic nested concurrent workflow (one GitHub agent per sample)
+3. Running the nested workflow
+4. Generating a validation report
+
+Usage:
+    uv run python -m _sample_validation
+    uv run python -m _sample_validation --subdir 01-get-started
+"""
+
+from _sample_validation.models import Report, RunResult, SampleInfo
+from _sample_validation.workflow import create_validation_workflow
+
+__all__ = [
+    "SampleInfo",
+    "RunResult",
+    "Report",
+    "create_validation_workflow",
+]
@@ -0,0 +1,143 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""
+Sample Validation Script
+
+Validates all Python samples in the samples directory using a workflow that:
+1. Discovers all sample files
+2. Builds a nested concurrent workflow with one GitHub agent per sample
+3. Runs the nested workflow
+4. Generates a validation report
+
+Usage:
+    uv run python -m _sample_validation
+    uv run python -m _sample_validation --subdir 03-workflows
+    uv run python -m _sample_validation --save-report --output-dir ./reports
+"""
+
+import argparse
+import asyncio
+import os
+import sys
+import time
+from pathlib import Path
+
+# Add the samples directory to the path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from _sample_validation.models import Report
+from _sample_validation.report import save_report
+from _sample_validation.workflow import ValidationConfig, create_validation_workflow
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse command line arguments.
+
+    Returns:
+        Namespace with ``subdir``, ``output_dir``, ``save_report``,
+        ``max_parallel_workers`` and ``report_name`` attributes.
+    """
+    parser = argparse.ArgumentParser(
+        description="Validate Python samples using a dynamic nested concurrent workflow",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  uv run python -m _sample_validation                        # Validate all samples
+  uv run python -m _sample_validation --subdir 03-workflows  # Validate only workflows
+  uv run python -m _sample_validation --output-dir ./reports # Save reports to custom dir
+        """,
+    )
+
+    parser.add_argument(
+        "--subdir",
+        type=str,
+        help="Validate samples only in the specified subdirectory (relative to samples/)",
+    )
+
+    # Only consulted by main() when --save-report is also given.
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="./_sample_validation/reports",
+        help="Directory to save validation reports (default: ./_sample_validation/reports)",
+    )
+
+    parser.add_argument(
+        "--save-report",
+        action="store_true",
+        help="Save the validation report to files",
+    )
+
+    # Not range-checked here; main() clamps the value to at least 1.
+    parser.add_argument(
+        "--max-parallel-workers",
+        type=int,
+        default=10,
+        help="Maximum number of samples to run in parallel per batch (default: 10)",
+    )
+
+    # Only consulted by main() when --save-report is also given.
+    parser.add_argument(
+        "--report-name",
+        type=str,
+        help="Custom name for the report files (without extension). If not provided, uses timestamp.",
+    )
+
+    return parser.parse_args()
+
+
+async def main() -> int:
+    """Main entry point.
+
+    Builds the validation workflow from the CLI arguments, runs it, optionally
+    saves the resulting report, and converts the report counts into an exit code.
+
+    Returns:
+        ``0`` when the report has no failures, timeouts or errors; ``1`` otherwise,
+        or when the workflow produced no output at all.
+    """
+    args = parse_arguments()
+
+    # Determine paths
+    # This file lives inside the _sample_validation package, so two levels up is samples/.
+    samples_dir = Path(__file__).parent.parent
+    python_root = samples_dir.parent
+
+    print("=" * 80)
+    print("SAMPLE VALIDATION WORKFLOW")
+    print("=" * 80)
+    print(f"Samples directory: {samples_dir}")
+    print(f"Python root: {python_root}")
+
+    if os.environ.get("GITHUB_COPILOT_MODEL"):
+        print(f"Using GitHub Copilot model override: {os.environ['GITHUB_COPILOT_MODEL']}")
+
+    # Create validation config
+    config = ValidationConfig(
+        samples_dir=samples_dir,
+        python_root=python_root,
+        subdir=args.subdir,
+        # Clamp so that zero or negative CLI values still allow one worker.
+        max_parallel_workers=max(1, args.max_parallel_workers),
+    )
+
+    # Create and run the workflow
+    workflow = create_validation_workflow(config)
+
+    print("\nStarting validation workflow...")
+    print("-" * 80)
+
+    # Run the workflow
+    run_start = time.perf_counter()
+    try:
+        events = await workflow.run("start")
+    finally:
+        # Runs on both success and failure, so the duration is printed even when
+        # workflow.run() raises; the exception then propagates to the caller.
+        run_duration = time.perf_counter() - run_start
+        print(f"\nWorkflow run completed in {run_duration:.2f}s")
+
+    outputs = events.get_outputs()
+
+    if not outputs:
+        print("\n[ERROR] Workflow did not produce any output")
+        return 1
+
+    # NOTE(review): assumes the first workflow output is the Report; the executor
+    # that yields it is defined outside this file.
+    report: Report = outputs[0]
+
+    # Save report if requested
+    if args.save_report:
+        # A relative --output-dir is resolved against the samples directory,
+        # not the current working directory.
+        output_dir = samples_dir / args.output_dir
+        md_path, json_path = save_report(report, output_dir, name=args.report_name)
+        print("\nReports saved:")
+        print(f"   Markdown: {md_path}")
+        print(f"   JSON: {json_path}")
+
+    # Return appropriate exit code
+    failed = report.failure_count + report.timeout_count + report.error_count
+    return 1 if failed > 0 else 0
+
+
+if __name__ == "__main__":
+    # Propagate main()'s return value as the process exit status.
+    exit_code = asyncio.run(main())
+    sys.exit(exit_code)
@@ -0,0 +1,3 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+# Event type passed to WorkflowEvent by worker executors once a sample has been handled.
+WORKER_COMPLETED = "worker_completed"
@@ -0,0 +1,252 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import logging
+from collections import deque
+from dataclasses import dataclass
+
+from _sample_validation.const import WORKER_COMPLETED
+from _sample_validation.discovery import DiscoveryResult
+from _sample_validation.models import (
+    ExecutionResult,
+    RunResult,
+    RunStatus,
+    SampleInfo,
+    ValidationConfig,
+    WorkflowCreationResult,
+)
+from agent_framework import (
+    Executor,
+    Message,
+    Workflow,
+    WorkflowBuilder,
+    WorkflowContext,
+    WorkflowEvent,
+    handler,
+)
+from agent_framework.github import GitHubCopilotAgent
+from copilot.types import PermissionRequest, PermissionRequestResult
+from pydantic import BaseModel
+from typing_extensions import Never
+
+logger = logging.getLogger(__name__)
+
+
+class AgentResponseFormat(BaseModel):
+    """JSON payload the validation agent is instructed to return (see ``AgentInstruction``)."""
+
+    # Requested values are "success", "failure", "timeout" or "error"; not enforced here.
+    status: str
+    # Short summary of the result written by the agent.
+    output: str
+    # Error details, or an empty string.
+    error: str
+
+
+@dataclass
+class CoordinatorStart:
+    """Start message for ``BatchCoordinatorExecutor`` carrying the samples to queue."""
+
+    samples: list[SampleInfo]
+
+
+@dataclass
+class WorkerFreed:
+    """Message a worker sends to the coordinator after it has handled a sample."""
+
+    # Executor id of the worker that is available again.
+    worker_id: str
+
+
+class BatchCompletion:
+    """Marker the coordinator sends to the collector when nothing is pending or in flight."""
+
+    pass
+
+
+# Instructions given to every per-sample GitHub Copilot agent. The JSON schema
+# described here mirrors the fields of AgentResponseFormat, which is used to
+# parse the agent's reply.
+AgentInstruction = (
+    "You are validating exactly one Python sample.\n"
+    "Analyze the sample code and execute it. Determine if it runs successfully, fails, or times out.\n"
+    "The sample can be interactive. If it is interactive, respond to the sample when prompted "
+    "based on your analysis of the code. You do not need to consult human on what to respond\n"
+    "Return ONLY valid JSON with this schema:\n"
+    "{\n"
+    '  "status": "success|failure|timeout|error",\n'
+    '  "output": "short summary of the result and what you did if the sample was interactive",\n'
+    '  "error": "error details or empty string"\n'
+    "}\n\n"
+)
+
+
+def parse_agent_json(text: str) -> AgentResponseFormat:
+    """Parse JSON object from an agent response.
+
+    Args:
+        text: Raw response text; the JSON object may be surrounded by other text.
+
+    Returns:
+        The validated ``AgentResponseFormat``.
+
+    Raises:
+        ValueError: If the text contains no ``{`` ... ``}`` span.
+
+    Errors raised by ``model_validate_json`` for malformed or mismatching JSON
+    are not caught here.
+    """
+    stripped = text.strip()
+    # Fast path: the whole response is the JSON object.
+    if stripped.startswith("{") and stripped.endswith("}"):
+        return AgentResponseFormat.model_validate_json(stripped)
+
+    # Otherwise take everything from the first "{" to the last "}", which skips
+    # any text the agent put before or after the object.
+    start = stripped.find("{")
+    end = stripped.rfind("}")
+    if start == -1 or end == -1 or end <= start:
+        raise ValueError("No JSON object found in response")
+
+    return AgentResponseFormat.model_validate_json(stripped[start : end + 1])
+
+
+def status_from_text(value: str) -> RunStatus:
+    """Convert a string value to RunStatus with safe fallback.
+
+    The value is stripped and lower-cased before being compared with each
+    member's ``value``; anything that matches no member maps to ``RunStatus.ERROR``.
+    """
+    normalized = value.strip().lower()
+    for status in RunStatus:
+        if status.value == normalized:
+            return status
+    return RunStatus.ERROR
+
+
+def prompt_permission(request: PermissionRequest, context: dict[str, str]) -> PermissionRequestResult:
+    """Permission handler that always approves.
+
+    Every request is approved unconditionally; the request kind and the context
+    are only logged at debug level.
+    """
+    kind = request.get("kind", "unknown")
+    logger.debug(f"[Permission Request: {kind}] ({context})Automatically approved for sample validation.")
+    return PermissionRequestResult(kind="approved")
+
+
+class CustomAgentExecutor(Executor):
+    """Executor that runs a GitHub Copilot agent and returns its response.
+
+    We need the custom executor to wrap the agent call in a try/except to ensure that any exceptions are caught and
+    returned as error responses, otherwise an exception in one agent could crash the entire workflow.
+    """
+
+    def __init__(self, agent: GitHubCopilotAgent) -> None:
+        """Wrap ``agent``; the executor id is the agent's id."""
+        super().__init__(id=agent.id)
+        self.agent = agent
+
+    @handler
+    async def handle_task(self, sample: SampleInfo, ctx: WorkflowContext[WorkerFreed | RunResult]) -> None:
+        """Execute one sample task and notify collector + coordinator."""
+        try:
+            # The agent only receives the sample's relative path in the prompt.
+            response = await self.agent.run([
+                Message(role="user", text=f"Validate the following sample:\n\n{sample.relative_path}")
+            ])
+            result_payload = parse_agent_json(response.text)
+            result = RunResult(
+                sample=sample,
+                status=status_from_text(result_payload.status),
+                output=result_payload.output,
+                error=result_payload.error,
+            )
+        except Exception as ex:
+            # Agent failures and unparsable responses both become an ERROR result
+            # so that one bad sample does not abort the whole run.
+            logger.error(f"Error executing agent {self.agent.id}: {ex}")
+            result = RunResult(
+                sample=sample,
+                status=RunStatus.ERROR,
+                output="",
+                error=str(ex),
+            )
+
+        # The result goes to the collector; the coordinator is told this worker is free.
+        await ctx.send_message(result, target_id="collector")
+        await ctx.send_message(WorkerFreed(worker_id=self.id), target_id="coordinator")
+
+        await ctx.add_event(WorkflowEvent(WORKER_COMPLETED, sample))  # type: ignore
+
+
+class BatchCoordinatorExecutor(Executor):
+    """Dispatch sample tasks to worker executors in bounded batches.
+
+    Only the first ``max_parallel_workers`` worker ids receive the initial wave.
+    Each later sample is sent to whichever worker just reported itself free, so
+    a worker may handle several samples and workers beyond the first wave stay idle.
+    """
+
+    def __init__(self, worker_ids: list[str], max_parallel_workers: int) -> None:
+        """Store the worker ids and the parallelism limit (clamped to at least 1)."""
+        super().__init__(id="coordinator")
+        self._worker_ids = worker_ids
+        self._max_parallel_workers = max(1, max_parallel_workers)
+        # Samples not yet handed to a worker.
+        self._pending: deque[SampleInfo] = deque()
+        # Ids of workers that currently hold a sample.
+        self._inflight: set[str] = set()
+
+    async def _assign_next(self, worker_id: str, ctx: WorkflowContext[SampleInfo | BatchCompletion]) -> None:
+        """Send the next pending sample to ``worker_id``, or signal completion.
+
+        When the queue is empty nothing is sent to the worker; a ``BatchCompletion``
+        goes to the collector only if no worker is still in flight.
+        """
+        if not self._pending:
+            # No more samples to assign
+            if not self._inflight:
+                # All tasks are completed, notify collector and exit
+                await ctx.send_message(BatchCompletion(), target_id="collector")
+            return
+
+        sample = self._pending.popleft()
+        self._inflight.add(worker_id)
+        # Messages will get queued in the runner until the next superstep when all workers are freed,
+        # thus achieving automatic batching without needing complex synchronization logic
+        await ctx.send_message(sample, target_id=worker_id)
+
+    @handler
+    async def on_start(self, start: CoordinatorStart, ctx: WorkflowContext[SampleInfo | BatchCompletion]) -> None:
+        """Initialize queue and dispatch first wave of tasks."""
+        self._pending = deque(start.samples)
+        self._inflight.clear()
+
+        for worker_id in self._worker_ids[: self._max_parallel_workers]:
+            await self._assign_next(worker_id, ctx)
+
+    @handler
+    async def on_worker_freed(self, freed: WorkerFreed, ctx: WorkflowContext[SampleInfo | BatchCompletion]) -> None:
+        """Dispatch next queued sample when a worker finishes."""
+        # Remove the worker before assigning so the emptiness check in
+        # _assign_next sees the last worker as finished.
+        self._inflight.discard(freed.worker_id)
+        await self._assign_next(freed.worker_id, ctx)
+
+
+class CollectorExecutor(Executor):
+    """Collect per-sample results and emit the final execution result."""
+
+    def __init__(self) -> None:
+        """Start with an empty result list."""
+        super().__init__(id="collector")
+        self._results: list[RunResult] = []
+
+    @handler
+    async def on_all(self, batch_completion: BatchCompletion, ctx: WorkflowContext[Never, ExecutionResult]) -> None:
+        """Yield the results recorded so far as the final ``ExecutionResult``.
+
+        Triggered by the ``BatchCompletion`` marker; the marker itself carries no data.
+        """
+        await ctx.yield_output(ExecutionResult(results=self._results))
+
+    @handler
+    async def on_item(self, item: RunResult, ctx: WorkflowContext) -> None:
+        """Record one per-sample result; output is emitted by ``on_all``, not here."""
+        self._results.append(item)
+
+
+class CreateConcurrentValidationWorkflowExecutor(Executor):
+    """Executor that builds a nested concurrent workflow with one agent per sample."""
+
+    def __init__(self, config: ValidationConfig) -> None:
+        """Keep the validation config; only ``max_parallel_workers`` is read by ``create``."""
+        super().__init__(id="create_dynamic_workflow")
+        self.config = config
+
+    @handler
+    async def create(
+        self,
+        discovery: DiscoveryResult,
+        ctx: WorkflowContext[WorkflowCreationResult],
+    ) -> None:
+        """Create a nested workflow with a coordinator + worker fan-out/fan-in.
+
+        Sends a ``WorkflowCreationResult`` downstream. When no samples were
+        discovered, the result has ``workflow=None`` and empty sample/agent lists.
+        """
+        sample_count = len(discovery.samples)
+        print(f"\nCreating nested batched workflow for {sample_count} samples...")
+
+        if sample_count == 0:
+            await ctx.send_message(WorkflowCreationResult(samples=[], workflow=None, agents=[]))
+            return
+
+        agents: list[GitHubCopilotAgent] = []
+        workers: list[CustomAgentExecutor] = []
+
+        for index, sample in enumerate(discovery.samples, start=1):
+            # The id embeds the index and relative path and doubles as the worker executor id.
+            agent_id = f"sample_validator_{index}({sample.relative_path})"
+            agent = GitHubCopilotAgent(
+                id=agent_id,
+                name=agent_id,
+                instructions=AgentInstruction,
+                # NOTE(review): timeout is presumably in seconds -- confirm against the agent options.
+                default_options={"on_permission_request": prompt_permission, "timeout": 180},  # type: ignore
+            )
+            agents.append(agent)
+
+            workers.append(CustomAgentExecutor(agent))
+
+        coordinator = BatchCoordinatorExecutor(
+            worker_ids=[worker.id for worker in workers],
+            max_parallel_workers=self.config.max_parallel_workers,
+        )
+        collector = CollectorExecutor()
+
+        nested_builder = WorkflowBuilder(start_executor=coordinator, output_executors=[collector])
+        # coordinator -> collector carries the BatchCompletion marker.
+        nested_builder.add_edge(coordinator, collector)
+        for worker in workers:
+            # coordinator -> worker: samples; worker -> coordinator: WorkerFreed;
+            # worker -> collector: RunResult.
+            nested_builder.add_edge(coordinator, worker)
+            nested_builder.add_edge(worker, coordinator)
+            nested_builder.add_edge(worker, collector)
+        nested_workflow: Workflow = nested_builder.build()
+
+        await ctx.send_message(
+            WorkflowCreationResult(
+                samples=discovery.samples,
+                workflow=nested_workflow,
+                agents=agents,
+            )
+        )
@@ -0,0 +1,116 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Sample discovery module."""
+
+import ast
+import os
+from pathlib import Path
+
+from _sample_validation.models import DiscoveryResult, SampleInfo, ValidationConfig
+from agent_framework import Executor, WorkflowContext, handler
+
+
+def _is_main_entrypoint_guard(test: ast.expr) -> bool:
+    """Check whether an expression is ``__name__ == '__main__'``."""
+    if not isinstance(test, ast.Compare):
+        return False
+
+    if len(test.ops) != 1 or not isinstance(test.ops[0], ast.Eq):
+        return False
+
+    if len(test.comparators) != 1:
+        return False
+
+    left = test.left
+    right = test.comparators[0]
+
+    return (
+        isinstance(left, ast.Name)
+        and left.id == "__name__"
+        and isinstance(right, ast.Constant)
+        and right.value == "__main__"
+    ) or (
+        isinstance(right, ast.Name)
+        and right.id == "__name__"
+        and isinstance(left, ast.Constant)
+        and left.value == "__main__"
+    )
+
+
+def _has_main_entrypoint_guard(path: Path) -> bool:
+    """Check whether a Python file defines a top-level main entrypoint guard."""
+    try:
+        source = path.read_text(encoding="utf-8")
+        tree = ast.parse(source)
+    except Exception:
+        return False
+
+    return any(isinstance(node, ast.If) and _is_main_entrypoint_guard(node.test) for node in tree.body)
+
+
+def discover_samples(samples_dir: Path, subdir: str | None = None) -> list[SampleInfo]:
+    """
+    Find all Python sample files in the samples directory.
+
+    Args:
+        samples_dir: Root samples directory
+        subdir: Optional subdirectory to filter to
+
+    Returns:
+        List of SampleInfo objects for each discovered sample
+    """
+    # Determine the search directory
+    if subdir:
+        search_dir = samples_dir / subdir
+        if not search_dir.exists():
+            print(f"Warning: Subdirectory '{subdir}' does not exist in {samples_dir}")
+            return []
+    else:
+        search_dir = samples_dir
+
+    python_files: list[Path] = []
+
+    # Walk through all subdirectories and find .py files
+    for root, dirs, files in os.walk(search_dir):
+        # Skip directories that start with _ (like _sample_validation)
+        dirs[:] = [d for d in dirs if not d.startswith("_") and d != "__pycache__"]
+
+        for file in files:
+            # Skip files that start with _ and include only scripts with a main entrypoint guard
+            if file.endswith(".py") and not file.startswith("_"):
+                file_path = Path(root) / file
+                if _has_main_entrypoint_guard(file_path):
+                    python_files.append(file_path)
+
+    # Sort files for consistent execution order
+    python_files = sorted(python_files)
+
+    # Convert to SampleInfo objects
+    samples: list[SampleInfo] = []
+    for path in python_files:
+        try:
+            samples.append(SampleInfo.from_path(path, samples_dir))
+        except Exception as e:
+            print(f"Warning: Could not read {path}: {e}")
+
+    return samples
+
+
+class DiscoverSamplesExecutor(Executor):
+    """Executor that discovers all samples in the samples directory."""
+
+    def __init__(self, config: ValidationConfig):
+        super().__init__(id="discover_samples")
+        self.config = config
+
+    @handler
+    async def discover(self, _: str, ctx: WorkflowContext[DiscoveryResult]) -> None:
+        """Discover all Python samples."""
+        print(f"🔍 Discovering samples in {self.config.samples_dir}")
+        if self.config.subdir:
+            print(f"   Filtering to subdirectory: {self.config.subdir}")
+
+        samples = discover_samples(self.config.samples_dir, self.config.subdir)
+        print(f"   Found {len(samples)} samples")
+
+        await ctx.send_message(DiscoveryResult(samples=samples))
@@ -0,0 +1,163 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Data models for the sample validation system."""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+
+from agent_framework import Workflow
+from agent_framework.github import GitHubCopilotAgent
+
+
+@dataclass
+class ValidationConfig:
+    """Configuration for the validation workflow."""
+
+    # Root samples directory that sample discovery searches.
+    samples_dir: Path
+    # NOTE(review): presumably the root of the Python project; its use is not visible here - confirm.
+    python_root: Path
+    # Optional subdirectory of ``samples_dir`` to restrict discovery to; ``None`` searches the whole root.
+    subdir: str | None = None
+    # Handed to the batch coordinator as its ``max_parallel_workers`` setting.
+    max_parallel_workers: int = 10
+
+
+@dataclass
+class SampleInfo:
+    """Information about a discovered sample file."""
+
+    path: Path
+    relative_path: str
+    code: str
+
+    @classmethod
+    def from_path(cls, path: Path, samples_dir: Path) -> "SampleInfo":
+        """Create SampleInfo from a file path."""
+        return cls(
+            path=path,
+            relative_path=str(path.relative_to(samples_dir)),
+            code=path.read_text(encoding="utf-8"),
+        )
+
+
+@dataclass
+class DiscoveryResult:
+    """Result of sample discovery."""
+
+    # Samples found by discovery; may be empty.
+    samples: list[SampleInfo]
+
+
+@dataclass
+class WorkflowCreationResult:
+    """Result of creating a nested per-sample concurrent workflow."""
+
+    # Samples the nested workflow was built for.
+    samples: list[SampleInfo]
+    # The built nested workflow, or ``None`` when there is nothing to run.
+    workflow: Workflow | None
+    # Agents created for the nested workflow, kept so they can be stopped afterwards.
+    agents: list[GitHubCopilotAgent]
+
+
+class RunStatus(Enum):
+    """Status of a sample run.
+
+    The member values are the strings written to the report for each result.
+    """
+
+    SUCCESS = "success"
+    FAILURE = "failure"
+    TIMEOUT = "timeout"
+    ERROR = "error"
+
+
+@dataclass
+class RunResult:
+    """Result of running a single sample."""
+
+    # The sample this result belongs to.
+    sample: SampleInfo
+    # Outcome category of the run.
+    status: RunStatus
+    # NOTE(review): presumably the captured output of the run - confirm against the producer.
+    output: str
+    # Error text; an empty string means no error detail is shown in the report.
+    error: str
+
+
+@dataclass
+class ExecutionResult:
+    """Result of sample execution."""
+
+    # One entry per sample run; the report is aggregated from this list.
+    results: list[RunResult]
+
+
+@dataclass
+class Report:
+    """Final validation report."""
+
+    timestamp: datetime
+    total_samples: int
+    success_count: int
+    failure_count: int
+    timeout_count: int
+    error_count: int
+    results: list[RunResult] = field(default_factory=list)  # type: ignore
+
+    def to_markdown(self) -> str:
+        """Generate a markdown report."""
+        lines = [
+            "# Sample Validation Report",
+            "",
+            f"**Generated:** {self.timestamp.isoformat()}",
+            "",
+            "## Summary",
+            "",
+            "| Metric | Count |",
+            "|--------|-------|",
+            f"| Total Samples | {self.total_samples} |",
+            f"| [PASS] Success | {self.success_count} |",
+            f"| [FAIL] Failure | {self.failure_count} |",
+            f"| [TIMEOUT] Timeout | {self.timeout_count} |",
+            f"| [ERROR] Error | {self.error_count} |",
+            "",
+            "## Detailed Results",
+            "",
+        ]
+
+        # Group by status
+        for status in [RunStatus.FAILURE, RunStatus.TIMEOUT, RunStatus.ERROR, RunStatus.SUCCESS]:
+            status_results = [r for r in self.results if r.status == status]
+            if not status_results:
+                continue
+
+            status_label = {
+                RunStatus.SUCCESS: "[PASS]",
+                RunStatus.FAILURE: "[FAIL]",
+                RunStatus.TIMEOUT: "[TIMEOUT]",
+                RunStatus.ERROR: "[ERROR]",
+            }
+
+            lines.append(f"### {status_label[status]} {status.value.title()} ({len(status_results)})")
+            lines.append("")
+
+            for result in status_results:
+                lines.append(f"- **{result.sample.relative_path}**")
+                if result.error:
+                    # Truncate long errors
+                    error_preview = result.error[:200] + "..." if len(result.error) > 200 else result.error
+                    lines.append(f"  - Error: `{error_preview}`")
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def to_dict(self) -> dict[str, object]:
+        """Convert report to dictionary for JSON serialization."""
+        return {
+            "timestamp": self.timestamp.isoformat(),
+            "summary": {
+                "total_samples": self.total_samples,
+                "success_count": self.success_count,
+                "failure_count": self.failure_count,
+                "timeout_count": self.timeout_count,
+                "error_count": self.error_count,
+            },
+            "results": [
+                {
+                    "path": r.sample.relative_path,
+                    "status": r.status.value,
+                    "output": r.output,
+                    "error": r.error,
+                }
+                for r in self.results
+            ],
+        }
@@ -0,0 +1,105 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Report generation for sample validation results."""
+
+import json
+from datetime import datetime
+from pathlib import Path
+
+from _sample_validation.models import ExecutionResult, Report, RunResult, RunStatus
+from agent_framework import Executor, WorkflowContext, handler
+from typing_extensions import Never
+
+
+def generate_report(results: list[RunResult]) -> Report:
+    """
+    Generate a validation report from run results.
+
+    Args:
+        results: List of RunResult objects from sample execution
+
+    Returns:
+        Report object with aggregated statistics
+    """
+
+    return Report(
+        timestamp=datetime.now(),
+        total_samples=len(results),
+        success_count=sum(1 for r in results if r.status == RunStatus.SUCCESS),
+        failure_count=sum(1 for r in results if r.status == RunStatus.FAILURE),
+        timeout_count=sum(1 for r in results if r.status == RunStatus.TIMEOUT),
+        error_count=sum(1 for r in results if r.status == RunStatus.ERROR),
+        results=results,
+    )
+
+
+def save_report(report: Report, output_dir: Path, name: str | None = None) -> tuple[Path, Path]:
+    """
+    Save the report to markdown and JSON files.
+
+    Args:
+        report: The report to save
+        output_dir: Directory to save the report files
+        name: Optional custom name for the report files (without extension)
+
+    Returns:
+        Tuple of (markdown_path, json_path)
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    if name:
+        base_name = name
+    else:
+        timestamp_str = report.timestamp.strftime("%Y%m%d_%H%M%S")
+        base_name = f"validation_report_{timestamp_str}"
+
+    # Save markdown
+    md_path = output_dir / f"{base_name}.md"
+    md_path.write_text(report.to_markdown(), encoding="utf-8")
+
+    # Save JSON
+    json_path = output_dir / f"{base_name}.json"
+    json_path.write_text(
+        json.dumps(report.to_dict(), indent=2),
+        encoding="utf-8",
+    )
+
+    return md_path, json_path
+
+
+def print_summary(report: Report) -> None:
+    """Print a summary of the validation report to console."""
+    print("\n" + "=" * 80)
+    print("SAMPLE VALIDATION SUMMARY")
+    print("=" * 80)
+
+    if report.failure_count == 0 and report.timeout_count == 0 and report.error_count == 0:
+        print("[PASS] ALL SAMPLES PASSED!")
+    else:
+        print("[FAIL] SOME SAMPLES FAILED")
+
+    print(f"\nTotal samples: {report.total_samples}")
+    print()
+    print("Results:")
+    print(f"  [PASS] Success: {report.success_count}")
+    print(f"  [FAIL] Failure: {report.failure_count}")
+    print(f"  [TIMEOUT] Timeout: {report.timeout_count}")
+    print(f"  [ERROR] Error: {report.error_count}")
+    print("=" * 80)
+
+
+class GenerateReportExecutor(Executor):
+    """Executor that generates the final validation report."""
+
+    def __init__(self) -> None:
+        """Register the executor under the id ``generate_report``."""
+        super().__init__(id="generate_report")
+
+    @handler
+    async def generate(self, execution: ExecutionResult, ctx: WorkflowContext[Never, Report]) -> None:
+        """Generate the validation report from fan-in results.
+
+        Builds a ``Report`` from ``execution.results``, prints its console
+        summary, and yields the report as the workflow output.
+        """
+        print("\nGenerating report...")
+
+        report = generate_report(execution.results)
+        print_summary(report)
+
+        # The report is yielded as output rather than sent on as a message.
+        await ctx.yield_output(report)
@@ -0,0 +1,64 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from collections.abc import Sequence
+
+from _sample_validation.const import WORKER_COMPLETED
+from _sample_validation.create_dynamic_workflow_executor import CoordinatorStart
+from _sample_validation.models import ExecutionResult, RunResult, RunStatus, SampleInfo, WorkflowCreationResult
+from agent_framework import Executor, WorkflowContext, handler
+from agent_framework.github import GitHubCopilotAgent
+
+
+async def stop_agents(agents: Sequence[GitHubCopilotAgent]) -> None:
+    """Stop all GitHub Copilot agents used by the nested workflow."""
+    for agent in agents:
+        try:
+            await agent.stop()
+        except Exception:
+            continue
+
+
+class RunDynamicValidationWorkflowExecutor(Executor):
+    """Executor that runs the nested workflow created in the previous step."""
+
+    def __init__(self) -> None:
+        super().__init__(id="run_dynamic_workflow")
+
+    @handler
+    async def run(self, creation: WorkflowCreationResult, ctx: WorkflowContext[ExecutionResult]) -> None:
+        """Run the nested workflow and emit execution results."""
+        if creation.workflow is None:
+            await ctx.send_message(ExecutionResult(results=[]))
+            return
+
+        print("\nRunning nested batched workflow...")
+        print("-" * 80)
+
+        try:
+            remaining_sample_counts = len(creation.samples)
+            result: ExecutionResult | None = None
+            async for event in creation.workflow.run(CoordinatorStart(samples=creation.samples), stream=True):
+                if event.type == "output" and isinstance(event.data, ExecutionResult):
+                    result = event.data  # type: ignore
+                elif event.type == WORKER_COMPLETED and isinstance(event.data, SampleInfo):  # type: ignore
+                    remaining_sample_counts -= 1
+                    print(
+                        f"Completed validation for sample: {event.data.relative_path:<80} | "
+                        f"Remaining: {remaining_sample_counts:>4}"
+                    )
+
+            if result is not None:
+                await ctx.send_message(result)
+            else:
+                fallback_results = [
+                    RunResult(
+                        sample=sample,
+                        status=RunStatus.ERROR,
+                        output="",
+                        error="Nested workflow did not return an ExecutionResult.",
+                    )
+                    for sample in creation.samples
+                ]
+                await ctx.send_message(ExecutionResult(results=fallback_results))
+        finally:
+            await stop_agents(creation.agents)
@@ -0,0 +1,42 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""
+Sample Validation Workflow using Microsoft Agent Framework.
+
+Workflow composition for sample validation.
+"""
+
+from _sample_validation.create_dynamic_workflow_executor import CreateConcurrentValidationWorkflowExecutor
+from _sample_validation.discovery import DiscoverSamplesExecutor, ValidationConfig
+from _sample_validation.report import GenerateReportExecutor
+from _sample_validation.run_dynamic_validation_workflow_executor import RunDynamicValidationWorkflowExecutor
+from agent_framework import Workflow, WorkflowBuilder
+
+
+def create_validation_workflow(
+    config: ValidationConfig,
+) -> Workflow:
+    """
+    Create the sample validation workflow.
+
+    Args:
+        config: Validation configuration
+
+    Returns:
+        Configured Workflow instance
+    """
+    discover = DiscoverSamplesExecutor(config)
+    create_dynamic_workflow = CreateConcurrentValidationWorkflowExecutor(config)
+    run_dynamic_workflow = RunDynamicValidationWorkflowExecutor()
+    generate = GenerateReportExecutor()
+
+    return (
+        WorkflowBuilder(start_executor=discover)
+        .add_edge(discover, create_dynamic_workflow)
+        .add_edge(create_dynamic_workflow, run_dynamic_workflow)
+        .add_edge(run_dynamic_workflow, generate)
+        .build()
+    )
+
+
+# Public API of this module; ``ValidationConfig`` is re-exported from the discovery import above.
+__all__ = ["ValidationConfig", "create_validation_workflow"]