Python: Tuning auto sample validation workflow (#4218)

* Tuning validate-01-get-started * Add gh token * Add model * enable debug log * bump up timeout for testing purposes * Test cli is working * Fix end quote * Run gh auth * Run gh auth trial 2 * Run gh auth trial 3 * Test token * Add zcure login * Add zcure login 2 * Add zcure login 3 * Add zcure login 4 * Extract common actions * Extract common actions 2 * Correct env vars * Print outputs to action console * Disable end-to-end samples * Fix ruff errors * Fix ruff errors 2 * Revert workflow changes to fix tests * Revert workflow changes to fix tests 2 * Revert workflow changes to fix tests 3 * Revert workflow changes to fix tests 4
2026-06-16 21:04:09 +08:00 · 2026-02-26 18:45:10 -08:00
parent 54c0bea3b6
commit c45d47d4b2
9 changed files with 171 additions and 112 deletions
@@ -53,9 +53,10 @@ class BatchCompletion:

 AgentInstruction = (
    "You are validating exactly one Python sample.\n"
-    "Analyze the sample code and execute it. Determine if it runs successfully, fails, or times out.\n"
+    "Analyze the sample code and execute it. Based on the execution result, determine if it "
+    "runs successfully, fails, or times out. Feel free to install any required dependencies.\n"
    "The sample can be interactive. If it is interactive, respond to the sample when prompted "
-    "based on your analysis of the code. You do not need to consult human on what to respond\n"
+    "based on your analysis of the code. You do not need to consult human on what to respond.\n"
    "Return ONLY valid JSON with this schema:\n"
    "{\n"
    '  "status": "success|failure|timeout|error",\n'
@@ -21,6 +21,14 @@ def generate_report(results: list[RunResult]) -> Report:
    Returns:
        Report object with aggregated statistics
    """
+    # Sort results: failures, timeouts, errors first, then successes
+    status_priority = {
+        RunStatus.FAILURE: 0,
+        RunStatus.TIMEOUT: 1,
+        RunStatus.ERROR: 2,
+        RunStatus.SUCCESS: 3,
+    }
+    sorted_results = sorted(results, key=lambda r: status_priority[r.status])

    return Report(
        timestamp=datetime.now(),
@@ -29,7 +37,7 @@ def generate_report(results: list[RunResult]) -> Report:
        failure_count=sum(1 for r in results if r.status == RunStatus.FAILURE),
        timeout_count=sum(1 for r in results if r.status == RunStatus.TIMEOUT),
        error_count=sum(1 for r in results if r.status == RunStatus.ERROR),
-        results=results,
+        results=sorted_results,
    )


@@ -84,9 +92,13 @@ def print_summary(report: Report) -> None:
    print(f"  [PASS] Success: {report.success_count}")
    print(f"  [FAIL] Failure: {report.failure_count}")
    print(f"  [TIMEOUT] Timeout: {report.timeout_count}")
-    print(f"  [ERROR] Error: {report.error_count}")
+    print(f"  [ERR] Errors: {report.error_count}")
    print("=" * 80)

+    # Print JSON output for GitHub Actions visibility
+    print("\nJSON Report:")
+    print(json.dumps(report.to_dict(), indent=2))
+

 class GenerateReportExecutor(Executor):
    """Executor that generates the final validation report."""