Python: Flaky test report (#5342)

* Add flaky test trend reporting to CI workflows Parse JUnit XML (pytest.xml) from each integration test job and aggregate results into a markdown trend report showing per-test pass/fail/skip status across the last 5 runs. Changes: - Add python/scripts/flaky_report/ package (JUnit XML parser + trend report generator following the sample_validation pattern) - Add upload-artifact steps to all 6 integration test jobs in both python-merge-tests.yml and python-integration-tests.yml - Add python-flaky-test-report aggregation job with history caching - Add --junitxml=pytest.xml to integration-tests.yml jobs (already present in merge-tests.yml) - Fix Cosmos job --junitxml path (use absolute path since uv run --directory changes cwd) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Fix flaky report: handle missing test results gracefully - Guard against missing reports directory in load_current_run() - Only run report job when at least one integration test job completed (skip when all jobs are skipped, e.g. 
on pull_request events) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Address PR review: fix provider names and if-expression precedence - Use explicit provider name mapping in _derive_provider() so OpenAI renders correctly instead of 'Openai' - Fix operator precedence in workflow if-expressions by wrapping success/failure checks in parentheses Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Add File column and xfail detection to flaky test report - Add File column showing module name (e.g., test_openai_chat_client) to disambiguate tests with the same function name across files - Detect pytest xfail tests in JUnit XML (type=pytest.xfail) and show them with a distinct warning emoji instead of skip emoji - Update legend to include xfail explanation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Add Foundry embedding env vars to merge-tests workflow Sync the Foundry integration job in python-merge-tests.yml with python-integration-tests.yml by adding FOUNDRY_MODELS_ENDPOINT, FOUNDRY_MODELS_API_KEY, FOUNDRY_EMBEDDING_MODEL, and FOUNDRY_IMAGE_EMBEDDING_MODEL. Once the repo variables/secrets are configured, the embedding integration test will run in CI. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Fix File column showing class name instead of module name When a test is inside a class, pytest writes the classname as e.g. 'pkg.test_file.TestClass'. The previous rsplit logic extracted 'TestClass' instead of 'test_file'. Now detect uppercase-starting segments as class names and use the preceding segment instead. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> * Address PR review: UTC timestamps, XML error handling, summary fix, docstring - Use datetime.now(timezone.utc) for accurate UTC timestamps - Catch ET.ParseError per-file so corrupt XML doesn't crash the report - Remove separate 'error' key from summary (errors folded into 'failed') - Fix _short_name docstring to show actual dotted classname::name format Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-06-16 21:04:09 +08:00 · 2026-04-22 13:16:50 -07:00
parent d75f874d78
commit 3f23e1dfbf
5 changed files with 651 additions and 2 deletions
@@ -87,6 +87,14 @@ jobs:
          -n logical --dist worksteal
          --timeout=120 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 5
+          --junitxml=pytest.xml
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-openai
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Azure OpenAI integration tests
  python-tests-azure-openai:
@@ -130,6 +138,14 @@ jobs:
          -n logical --dist worksteal
          --timeout=120 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 5
+          --junitxml=pytest.xml
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-azure-openai
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Misc integration tests (Anthropic, Hyperlight, Ollama, MCP)
  python-tests-misc-integration:
@@ -173,6 +189,14 @@ jobs:
          -n logical --dist worksteal
          --timeout=120 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 30
+          --junitxml=pytest.xml
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-misc
+          path: ./python/pytest.xml
+          if-no-files-found: ignore
      - name: Stop local MCP server
        if: always()
        shell: bash
@@ -249,6 +273,14 @@ jobs:
          -x
          --timeout=360 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 5
+          --junitxml=pytest.xml
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-functions
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Foundry integration tests
  python-tests-foundry:
@@ -295,6 +327,14 @@ jobs:
          -n logical --dist worksteal
          --timeout=120 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 5
+          --junitxml=pytest.xml
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-foundry
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Azure Cosmos integration tests
  python-tests-cosmos:
@@ -339,7 +379,80 @@ jobs:
          echo "Cosmos DB emulator did not become ready in time." >&2
          exit 1
      - name: Test with pytest (Cosmos integration)
-        run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5
+        run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 --junitxml=${{ github.workspace }}/python/pytest.xml
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-cosmos
+          path: ./python/pytest.xml
+          if-no-files-found: ignore
+
+  # Flaky test trend report (aggregates per-job JUnit XML results).
+  # Runs regardless of upstream failures, but only when at least one of the
+  # integration test jobs reported success or failure.
+  python-flaky-test-report:
+    name: Flaky Test Report
+    if: >
+      always() &&
+      (contains(join(needs.*.result, ','), 'success') ||
+       contains(join(needs.*.result, ','), 'failure'))
+    needs:
+      [
+        python-tests-openai,
+        python-tests-azure-openai,
+        python-tests-misc-integration,
+        python-tests-functions,
+        python-tests-foundry,
+        python-tests-cosmos,
+      ]
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: python
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          ref: ${{ inputs.checkout-ref }}
+          persist-credentials: false
+      - name: Set up python and install the project
+        uses: ./.github/actions/python-setup
+        with:
+          python-version: ${{ env.UV_PYTHON }}
+          os: ${{ runner.os }}
+      # `uses:` steps do not honor defaults.run.working-directory, so the
+      # paths given to actions below are relative to the workspace root,
+      # while `run:` steps execute inside ./python (hence ../test-results/).
+      - name: Download all test results from current run
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-results-*
+          path: test-results/
+      # Cache entries are immutable, so every run attempt needs its own key;
+      # restore-keys falls back to the most recent history by prefix.
+      - name: Restore flaky report history cache
+        uses: actions/cache/restore@v4
+        with:
+          path: python/flaky-report-history.json
+          key: flaky-report-history-integration-${{ github.run_id }}-${{ github.run_attempt }}
+          restore-keys: |
+            flaky-report-history-integration-
+      - name: Generate trend report
+        run: >
+          uv run python scripts/flaky_report/aggregate.py
+          ../test-results/
+          flaky-report-history.json
+          flaky-test-report.md
+      # Runs even if report generation failed, so guard against a missing
+      # file instead of adding a second, misleading step failure.
+      - name: Post to Job Summary
+        if: always()
+        run: |
+          if [ -f flaky-test-report.md ]; then
+            cat flaky-test-report.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "Flaky test report was not generated." >> "$GITHUB_STEP_SUMMARY"
+          fi
+      # Only save when a history file actually exists on disk.
+      - name: Save flaky report history cache
+        if: always() && hashFiles('python/flaky-report-history.json') != ''
+        uses: actions/cache/save@v4
+        with:
+          path: python/flaky-report-history.json
+          key: flaky-report-history-integration-${{ github.run_id }}-${{ github.run_attempt }}
+      - name: Upload unified trend report
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: flaky-test-report
+          path: |
+            python/flaky-test-report.md
+            python/flaky-report-history.json

  python-integration-tests-check:
    if: always()
@@ -181,6 +181,13 @@ jobs:
          display-options: fEX
          fail-on-empty: false
          title: OpenAI integration test results
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-openai
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Azure OpenAI integration tests
  python-tests-azure-openai:
@@ -244,6 +251,13 @@ jobs:
          display-options: fEX
          fail-on-empty: false
          title: Azure OpenAI integration test results
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-azure-openai
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Misc integration tests (Anthropic, Ollama, MCP)
  python-tests-misc-integration:
@@ -321,6 +335,13 @@ jobs:
          display-options: fEX
          fail-on-empty: false
          title: Misc integration test results
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-misc
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # Azure Functions + Durable Task integration tests
  python-tests-functions:
@@ -392,6 +413,13 @@ jobs:
          display-options: fEX
          fail-on-empty: false
          title: Functions integration test results
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-functions
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  python-tests-foundry:
    name: Python Integration Tests - Foundry
@@ -409,6 +437,10 @@ jobs:
      FOUNDRY_MODEL: ${{ vars.FOUNDRY_MODEL }}
      FOUNDRY_AGENT_NAME: ${{ vars.FOUNDRY_AGENT_NAME }}
      FOUNDRY_AGENT_VERSION: ${{ vars.FOUNDRY_AGENT_VERSION }}
+      FOUNDRY_MODELS_ENDPOINT: ${{ vars.FOUNDRY_MODELS_ENDPOINT || '' }}
+      FOUNDRY_MODELS_API_KEY: ${{ secrets.FOUNDRY_MODELS_API_KEY || '' }}
+      FOUNDRY_EMBEDDING_MODEL: ${{ vars.FOUNDRY_EMBEDDING_MODEL || '' }}
+      FOUNDRY_IMAGE_EMBEDDING_MODEL: ${{ vars.FOUNDRY_IMAGE_EMBEDDING_MODEL || '' }}
      LOCAL_MCP_URL: ${{ vars.LOCAL_MCP__URL }}
    defaults:
      run:
@@ -448,6 +480,13 @@ jobs:
          display-options: fEX
          fail-on-empty: false
          title: Test results
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-foundry
+          path: ./python/pytest.xml
+          if-no-files-found: ignore

  # TODO: Add python-tests-lab

@@ -497,7 +536,7 @@ jobs:
          echo "Cosmos DB emulator did not become ready in time." >&2
          exit 1
      - name: Test with pytest (Cosmos integration)
-        run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 --junitxml=pytest.xml
+        run: uv run --directory packages/azure-cosmos poe integration-tests -n logical --dist worksteal --timeout=120 --session-timeout=900 --timeout_method thread --retries 2 --retry-delay 5 --junitxml=${{ github.workspace }}/python/pytest.xml
        working-directory: ./python
      - name: Surface failing tests
        if: always()
@@ -508,6 +547,76 @@ jobs:
          display-options: fEX
          fail-on-empty: false
          title: Cosmos integration test results
+      - name: Upload test results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: test-results-cosmos
+          path: ./python/pytest.xml
+          if-no-files-found: ignore
+
+  # Flaky test trend report (aggregates per-job JUnit XML results).
+  # Runs regardless of upstream failures, but only when at least one of the
+  # integration test jobs reported success or failure.
+  python-flaky-test-report:
+    name: Flaky Test Report
+    if: >
+      always() &&
+      (contains(join(needs.*.result, ','), 'success') ||
+       contains(join(needs.*.result, ','), 'failure'))
+    needs:
+      [
+        python-tests-openai,
+        python-tests-azure-openai,
+        python-tests-misc-integration,
+        python-tests-functions,
+        python-tests-foundry,
+        python-tests-cosmos,
+      ]
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: python
+    steps:
+      # This job only reads the repository, so do not leave the token in
+      # the local git config.
+      - uses: actions/checkout@v6
+        with:
+          persist-credentials: false
+      - name: Set up python and install the project
+        uses: ./.github/actions/python-setup
+        with:
+          python-version: ${{ env.UV_PYTHON }}
+          os: ${{ runner.os }}
+      # `uses:` steps do not honor defaults.run.working-directory, so the
+      # paths given to actions below are relative to the workspace root,
+      # while `run:` steps execute inside ./python (hence ../test-results/).
+      - name: Download all test results from current run
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-results-*
+          path: test-results/
+      # Cache entries are immutable, so every run attempt needs its own key;
+      # restore-keys falls back to the most recent history by prefix.
+      - name: Restore flaky report history cache
+        uses: actions/cache/restore@v4
+        with:
+          path: python/flaky-report-history.json
+          key: flaky-report-history-merge-${{ github.run_id }}-${{ github.run_attempt }}
+          restore-keys: |
+            flaky-report-history-merge-
+      - name: Generate trend report
+        run: >
+          uv run python scripts/flaky_report/aggregate.py
+          ../test-results/
+          flaky-report-history.json
+          flaky-test-report.md
+      # Runs even if report generation failed, so guard against a missing
+      # file instead of adding a second, misleading step failure.
+      - name: Post to Job Summary
+        if: always()
+        run: |
+          if [ -f flaky-test-report.md ]; then
+            cat flaky-test-report.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "Flaky test report was not generated." >> "$GITHUB_STEP_SUMMARY"
+          fi
+      # Only save when a history file actually exists on disk.
+      - name: Save flaky report history cache
+        if: always() && hashFiles('python/flaky-report-history.json') != ''
+        uses: actions/cache/save@v4
+        with:
+          path: python/flaky-report-history.json
+          key: flaky-report-history-merge-${{ github.run_id }}-${{ github.run_attempt }}
+      - name: Upload unified trend report
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: flaky-test-report
+          path: |
+            python/flaky-test-report.md
+            python/flaky-report-history.json

  python-integration-tests-check:
    if: always()