Python: support checkpoints for workflow orchestrations and sub-workflows (#863)

* Magentic checkpoint wip * Magentic checkpoint updates * Support checkpointing for magentic orchestration. * Checkpointing for sub-workflows * Use _execute_contexts instead of _pending_requests * Remove unnecessary type ignores * Support checkpoints for other orchestrations, refactor some code. * Regenerate uv.lock
2026-06-16 21:04:09 +08:00 · 2025-09-26 11:21:06 +09:00
parent 4b743ea62a
commit 2cd7ab342b
22 changed files with 2005 additions and 277 deletions
@@ -0,0 +1,299 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import json
+from pathlib import Path
+
+from agent_framework import (
+    ChatAgent,
+    FileCheckpointStorage,
+    MagenticBuilder,
+    MagenticPlanReviewDecision,
+    MagenticPlanReviewReply,
+    MagenticPlanReviewRequest,
+    RequestInfoEvent,
+    WorkflowCheckpoint,
+    WorkflowOutputEvent,
+    WorkflowRunState,
+    WorkflowStatusEvent,
+)
+from agent_framework.openai import OpenAIChatClient
+
+"""
+Sample: Magentic Orchestration + Checkpointing
+
+The goal of this sample is to show the exact mechanics needed to pause a Magentic
+workflow that requires human plan review, persist the outstanding request via a
+checkpoint, and later resume the workflow by feeding in the saved response.
+
+Concepts highlighted here:
+1. **Deterministic executor IDs** - the orchestrator and plan-review request executor
+   must keep stable IDs so the checkpoint state aligns when we rebuild the graph.
+2. **Executor snapshotting** - checkpoints capture the `RequestInfoExecutor` state,
+   specifically the pending plan-review request map, at superstep boundaries.
+3. **Resume with responses** - `Workflow.run_stream_from_checkpoint` accepts a
+   `responses` mapping so we can inject the stored human reply during restoration.
+
+Prerequisites:
+- OpenAI environment variables configured for `OpenAIChatClient`.
+"""
+
+TASK = (
+    "Draft a concise internal brief describing how our research and implementation teams should collaborate "
+    "to launch a beta feature for data-driven email summarization. Highlight the key milestones, "
+    "risks, and communication cadence."
+)
+
+# Dedicated folder for captured checkpoints. Keeping it under the sample directory
+# makes it easy to inspect the JSON blobs produced by each run.
+CHECKPOINT_DIR = Path(__file__).parent / "tmp" / "magentic_checkpoints"
+
+
+def build_workflow(checkpoint_storage: FileCheckpointStorage):
+    """Construct the Magentic workflow graph with checkpointing enabled."""
+
+    # Two vanilla ChatAgents act as participants in the orchestration. They do not need
+    # extra state handling because their inputs/outputs are fully described by chat messages.
+    researcher = ChatAgent(
+        name="ResearcherAgent",
+        description="Collects background facts and references for the project.",
+        instructions=("You are the research lead. Gather crisp bullet points the team should know."),
+        chat_client=OpenAIChatClient(),
+    )
+
+    writer = ChatAgent(
+        name="WriterAgent",
+        description="Synthesizes the final brief for stakeholders.",
+        instructions=("You convert the research notes into a structured brief with milestones and risks."),
+        chat_client=OpenAIChatClient(),
+    )
+
+    # The builder wires in the Magentic orchestrator, sets the plan review path, and
+    # stores the checkpoint backend so the runtime knows where to persist snapshots.
+    return (
+        MagenticBuilder()
+        .participants(researcher=researcher, writer=writer)
+        .with_plan_review()
+        .with_standard_manager(
+            chat_client=OpenAIChatClient(),
+            max_round_count=10,
+            max_stall_count=3,
+        )
+        .with_checkpointing(checkpoint_storage)
+        .build()
+    )
+
+
+async def main() -> None:
+    # Stage 0: make sure the checkpoint folder is empty so we inspect only checkpoints
+    # written by this invocation. This prevents stale files from previous runs from
+    # confusing the analysis.
+    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
+    for file in CHECKPOINT_DIR.glob("*.json"):
+        file.unlink()
+
+    checkpoint_storage = FileCheckpointStorage(CHECKPOINT_DIR)
+
+    print("\n=== Stage 1: run until plan review request (checkpointing active) ===")
+    workflow = build_workflow(checkpoint_storage)
+
+    # Run the workflow until the first RequestInfoEvent is surfaced. The event carries the
+    # request_id we must reuse on resume. In a real system this is where the UI would present
+    # the plan for human review.
+    plan_review_request_id: str | None = None
+    async for event in workflow.run_stream(TASK):
+        if isinstance(event, RequestInfoEvent) and event.request_type is MagenticPlanReviewRequest:
+            plan_review_request_id = event.request_id
+            print(f"Captured plan review request: {plan_review_request_id}")
+
+        if isinstance(event, WorkflowStatusEvent) and event.state is WorkflowRunState.IDLE_WITH_PENDING_REQUESTS:
+            break
+
+    if plan_review_request_id is None:
+        print("No plan review request emitted; nothing to resume.")
+        return
+
+    checkpoints = await checkpoint_storage.list_checkpoints(workflow.workflow.id)
+    if not checkpoints:
+        print("No checkpoints persisted.")
+        return
+
+    resume_checkpoint = max(
+        checkpoints,
+        key=lambda cp: (cp.iteration_count, cp.timestamp),
+    )
+    print(f"Using checkpoint {resume_checkpoint.checkpoint_id} at iteration {resume_checkpoint.iteration_count}")
+
+    # Show that the checkpoint JSON indeed contains the pending plan-review request record.
+    checkpoint_path = checkpoint_storage.storage_path / f"{resume_checkpoint.checkpoint_id}.json"
+    if checkpoint_path.exists():
+        with checkpoint_path.open() as f:
+            snapshot = json.load(f)
+        request_map = snapshot.get("executor_states", {}).get("magentic_plan_review", {}).get("request_events", {})
+        print(f"Pending plan-review requests persisted in checkpoint: {list(request_map.keys())}")
+
+    print("\n=== Stage 2: resume from checkpoint and approve plan ===")
+    resumed_workflow = build_workflow(checkpoint_storage)
+
+    approval = MagenticPlanReviewReply(decision=MagenticPlanReviewDecision.APPROVE)
+    # Resume execution and supply the recorded approval in a single call.
+    # `run_stream_from_checkpoint` rebuilds executor state, applies the provided responses,
+    # and then continues the workflow. Because we only captured the initial plan review
+    # checkpoint, the resumed run should complete almost immediately.
+    final_event: WorkflowOutputEvent | None = None
+    async for event in resumed_workflow.workflow.run_stream_from_checkpoint(
+        resume_checkpoint.checkpoint_id,
+        responses={plan_review_request_id: approval},
+    ):
+        if isinstance(event, WorkflowOutputEvent):
+            final_event = event
+
+    if final_event is None:
+        print("Workflow did not complete after resume.")
+        return
+
+    # Final sanity check: display the assistant's answer as proof the orchestration reached
+    # a natural completion after resuming from the checkpoint.
+    result = final_event.data
+    if not result:
+        print("No result data from workflow.")
+        return
+    text = getattr(result, "text", None) or str(result)
+    print("\n=== Final Answer ===")
+    print(text)
+
+    # ------------------------------------------------------------------
+    # Stage 3: demonstrate resuming from a later checkpoint (post-plan)
+    # ------------------------------------------------------------------
+
+    def _pending_message_count(cp: WorkflowCheckpoint) -> int:
+        return sum(len(msg_list) for msg_list in cp.messages.values() if isinstance(msg_list, list))
+
+    all_checkpoints = await checkpoint_storage.list_checkpoints(resume_checkpoint.workflow_id)
+    later_checkpoints_with_messages = [
+        cp
+        for cp in all_checkpoints
+        if cp.iteration_count > resume_checkpoint.iteration_count and _pending_message_count(cp) > 0
+    ]
+
+    if later_checkpoints_with_messages:
+        post_plan_checkpoint = max(
+            later_checkpoints_with_messages,
+            key=lambda cp: (cp.iteration_count, cp.timestamp),
+        )
+    else:
+        later_checkpoints = [cp for cp in all_checkpoints if cp.iteration_count > resume_checkpoint.iteration_count]
+
+        if not later_checkpoints:
+            print("\nNo additional checkpoints recorded beyond plan approval; sample complete.")
+            return
+
+        post_plan_checkpoint = max(
+            later_checkpoints,
+            key=lambda cp: (cp.iteration_count, cp.timestamp),
+        )
+    print("\n=== Stage 3: resume from post-plan checkpoint ===")
+    pending_messages = _pending_message_count(post_plan_checkpoint)
+    print(
+        f"Resuming from checkpoint {post_plan_checkpoint.checkpoint_id} at iteration "
+        f"{post_plan_checkpoint.iteration_count} (pending messages: {pending_messages})"
+    )
+    if pending_messages == 0:
+        print("Checkpoint has no pending messages; no additional work expected on resume.")
+
+    final_event_post: WorkflowOutputEvent | None = None
+    post_emitted_events = False
+    post_plan_workflow = build_workflow(checkpoint_storage)
+    async for event in post_plan_workflow.workflow.run_stream_from_checkpoint(
+        post_plan_checkpoint.checkpoint_id,
+        responses={},
+    ):
+        post_emitted_events = True
+        if isinstance(event, WorkflowOutputEvent):
+            final_event_post = event
+
+    if final_event_post is None:
+        if not post_emitted_events:
+            print("No new events were emitted; checkpoint already captured a completed run.")
+            print("\n=== Final Answer (post-plan resume) ===")
+            print(text)
+            return
+        print("Workflow did not complete after post-plan resume.")
+        return
+
+    post_result = final_event_post.data
+    if not post_result:
+        print("No result data from post-plan resume.")
+        return
+
+    post_text = getattr(post_result, "text", None) or str(post_result)
+    print("\n=== Final Answer (post-plan resume) ===")
+    print(post_text)
+
+    """
+    Sample Output:
+
+    === Stage 1: run until plan review request (checkpointing active) ===
+    Captured plan review request: 3a1a4a09-4ed1-4c90-9cf6-9ac488d452c0
+    Using checkpoint 4c76d77a-6ff8-4d2b-84f6-824771ffac7e at iteration 1
+    Pending plan-review requests persisted in checkpoint: ['3a1a4a09-4ed1-4c90-9cf6-9ac488d452c0']
+
+    === Stage 2: resume from checkpoint and approve plan ===
+
+    === Final Answer ===
+    Certainly! Here's your concise internal brief on how the research and implementation teams should collaborate for
+    the beta launch of the data-driven email summarization feature:
+
+    ---
+
+    **Internal Brief: Collaboration Plan for Data-driven Email Summarization Beta Launch**
+
+    **Collaboration Approach**
+    - **Joint Kickoff:** Research and Implementation teams hold a project kickoff to align on objectives, requirements,
+        and success metrics.
+    - **Ongoing Coordination:** Teams collaborate closely; researchers share model developments and insights, while
+        implementation ensures smooth integration and user experience.
+    - **Real-time Feedback Loop:** Implementation provides early feedback on technical integration and UX, while
+        Research evaluates initial performance and user engagement signals post-integration.
+
+    **Key Milestones**
+    1. **Requirement Finalization & Scoping** - Define MVP feature set and success criteria.
+    2. **Model Prototyping & Evaluation** - Researchers develop and validate summarization models with agreed metrics.
+    3. **Integration & Internal Testing** - Implementation team integrates the model; internal alpha testing and
+        compliance checks.
+    4. **Beta User Onboarding** - Recruit a select cohort of beta users and guide them through onboarding.
+    5. **Beta Launch & Monitoring** - Soft-launch for beta group, with active monitoring of usage, feedback,
+      and performance.
+    6. **Iterative Improvements** - Address issues, refine features, and prepare for possible broader rollout.
+
+    **Top Risks**
+    - **Data Privacy & Compliance:** Strict protocols and compliance reviews to prevent data leakage.
+    - **Model Quality (Bias, Hallucination):** Careful monitoring of summary accuracy; rapid iterations if critical
+        errors occur.
+    - **User Adoption:** Ensuring the beta solves genuine user needs, collecting actionable feedback early.
+    - **Feedback Quality & Quantity:** Proactively schedule user outreach to ensure substantive beta feedback.
+
+    **Communication Cadence**
+    - **Weekly Team Syncs:** Short all-hands progress and blockers meeting.
+    - **Bi-Weekly Stakeholder Check-ins:** Leadership and project leads address escalations and strategic decisions.
+    - **Dedicated Slack Channel:** For real-time queries and updates.
+    - **Documentation Hub:** Up-to-date project docs and FAQs on a shared internal wiki.
+    - **Post-Milestone Retrospectives:** After critical phases (e.g., alpha, beta), reviewing what worked and what needs
+        improvement.
+
+    **Summary**
+    Clear alignment, consistent communication, and iterative feedback are key to a successful beta. All team members are
+        expected to surface issues quickly and keep documentation current as we drive toward launch.
+    ---
+
+    === Stage 3: resume from post-plan checkpoint ===
+    Resuming from checkpoint 9a3b... at iteration 3 (pending messages: 0)
+    No new events were emitted; checkpoint already captured a completed run.
+
+    === Final Answer (post-plan resume) ===
+    (same brief as above)
+    """
+
+
+if __name__ == "__main__":
+    asyncio.run(main())