mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
Python: support checkpoints for workflow orchestrations and sub-workflows (#863)
* Magentic checkpoint wip * Magentic checkpoint updates * Support checkpointing for magentic orchestration. * Checkpointing for sub-workflows * Use _execute_contexts instead of _pending_requests * Remove unnecessary type ignores * Support checkpoints for other orchestrations, refactor some code. * Regenerate uv.lock
This commit is contained in:
committed by
GitHub
Unverified
parent
4b743ea62a
commit
2cd7ab342b
@@ -46,6 +46,7 @@ Once comfortable with these, explore the rest of the samples below.
|
||||
|---|---|---|
|
||||
| Checkpoint & Resume | [checkpoint/checkpoint_with_resume.py](./checkpoint/checkpoint_with_resume.py) | Create checkpoints, inspect them, and resume execution |
|
||||
| Checkpoint & HITL Resume | [checkpoint/checkpoint_with_human_in_the_loop.py](./checkpoint/checkpoint_with_human_in_the_loop.py) | Combine checkpointing with human approvals and resume pending HITL requests |
|
||||
| Checkpointed Sub-Workflow | [checkpoint/sub_workflow_checkpoint.py](./checkpoint/sub_workflow_checkpoint.py) | Save and resume a sub-workflow that pauses for human approval |
|
||||
|
||||
### composition
|
||||
|
||||
@@ -87,9 +88,12 @@ Once comfortable with these, explore the rest of the samples below.
|
||||
| Concurrent Orchestration (Custom Agent Executors) | [orchestration/concurrent_custom_agent_executors.py](./orchestration/concurrent_custom_agent_executors.py) | Child executors own ChatAgents; concurrent fan-out/fan-in via ConcurrentBuilder |
|
||||
| Magentic Workflow (Multi-Agent) | [orchestration/magentic.py](./orchestration/magentic.py) | Orchestrate multiple agents with Magentic manager and streaming |
|
||||
| Magentic + Human Plan Review | [orchestration/magentic_human_plan_update.py](./orchestration/magentic_human_plan_update.py) | Human reviews/updates the plan before execution |
|
||||
| Magentic + Checkpoint Resume | [orchestration/magentic_checkpoint.py](./orchestration/magentic_checkpoint.py) | Resume Magentic orchestration from saved checkpoints |
|
||||
| Sequential Orchestration (Agents) | [orchestration/sequential_agents.py](./orchestration/sequential_agents.py) | Chain agents sequentially with shared conversation context |
|
||||
| Sequential Orchestration (Custom Executor) | [orchestration/sequential_custom_executors.py](./orchestration/sequential_custom_executors.py) | Mix agents with a summarizer that appends a compact summary |
|
||||
|
||||
**Magentic checkpointing tip**: Treat `MagenticBuilder.participants` keys as stable identifiers. When resuming from a checkpoint, the rebuilt workflow must reuse the same participant names; otherwise the checkpoint cannot be applied and the run will fail fast.
|
||||
|
||||
### parallelism
|
||||
|
||||
| Sample | File | Concepts |
|
||||
|
||||
@@ -249,7 +249,7 @@ async def main():
|
||||
line += f" messages={msg_count}"
|
||||
print(line)
|
||||
|
||||
user_input = input(
|
||||
user_input = input( # noqa: ASYNC250
|
||||
"\nEnter checkpoint index (or paste checkpoint id) to resume from, or press Enter to skip resume: "
|
||||
).strip()
|
||||
|
||||
|
||||
@@ -0,0 +1,370 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
import json
|
||||
from dataclasses import dataclass, field, replace
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
from agent_framework import (
|
||||
Executor,
|
||||
FileCheckpointStorage,
|
||||
RequestInfoEvent,
|
||||
RequestInfoExecutor,
|
||||
RequestInfoMessage,
|
||||
RequestResponse,
|
||||
Workflow,
|
||||
WorkflowBuilder,
|
||||
WorkflowContext,
|
||||
WorkflowExecutor,
|
||||
WorkflowOutputEvent,
|
||||
WorkflowRunState,
|
||||
WorkflowStatusEvent,
|
||||
handler,
|
||||
)
|
||||
|
||||
# Checkpoint JSON files for this sample are written to ./tmp/sub_workflow_checkpoints
# next to this script.
CHECKPOINT_DIR = Path(__file__).parent / "tmp" / "sub_workflow_checkpoints"
|
||||
|
||||
"""
|
||||
Sample: Checkpointing for workflows that embed sub-workflows.
|
||||
|
||||
This sample shows how a parent workflow that wraps a sub-workflow can:
|
||||
- run until the sub-workflow emits a human approval request via RequestInfoExecutor
|
||||
- persist a checkpoint that captures the pending request (including complex payloads)
|
||||
- resume later, supplying the human decision directly at restore time
|
||||
|
||||
It is intentionally similar in spirit to the orchestration checkpoint sample but
|
||||
uses ``WorkflowExecutor`` so we exercise the full parent/sub-workflow round-trip.
|
||||
"""
|
||||
|
||||
|
||||
def _utc_now() -> datetime:
|
||||
return datetime.now()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Messages exchanged inside the sub-workflow
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@dataclass
class DraftTask:
    """Unit of work the parent workflow hands to the sub-workflow's writer."""

    # Subject the draft should cover.
    topic: str
    # Deadline attached to the task by whoever created it.
    due: datetime
    # 1-based revision counter; incremented each time a draft is sent back.
    iteration: int = 1
|
||||
|
||||
|
||||
@dataclass
class DraftPackage:
    """Draft text produced by the writer and passed on for review."""

    # Subject of the draft.
    topic: str
    # Full draft body.
    content: str
    # Revision number this draft corresponds to.
    iteration: int
    # Stamped automatically via ``_utc_now`` when the package is constructed.
    created_at: datetime = field(default_factory=_utc_now)
|
||||
|
||||
|
||||
@dataclass
class FinalDraft:
    """Approved deliverable that the sub-workflow hands back to its parent."""

    # Subject of the approved draft.
    topic: str
    # Final approved text.
    content: str
    # Number of iterations it took to reach approval.
    iterations: int
    # Moment the approval was recorded.
    approved_at: datetime
|
||||
|
||||
|
||||
@dataclass
class ReviewRequest(RequestInfoMessage):
    """Payload describing a draft that needs a human approval decision.

    Every field has a default so the dataclass can extend ``RequestInfoMessage``.
    """

    # Subject of the draft under review.
    topic: str = ""
    # Revision number of the draft under review.
    iteration: int = 1
    # Short preview of the draft shown to the reviewer.
    draft_excerpt: str = ""
    # ISO-8601 timestamp string supplied by the sender.
    due_iso: str = ""
    # Checklist items for the reviewer; a fresh list is created per instance.
    reviewer_guidance: list[str] = field(default_factory=list)  # type: ignore
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sub-workflow executors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class DraftWriter(Executor):
    """Sub-workflow entry point: turns a ``DraftTask`` into a ``DraftPackage``."""

    def __init__(self) -> None:
        # Fixed executor id; other executors address this one by name.
        super().__init__(id="draft_writer")

    @handler
    async def create_draft(self, task: DraftTask, ctx: WorkflowContext[DraftPackage]) -> None:
        """Build a canned draft for ``task`` and send it to the review router.

        Args:
            task: Topic and iteration to write a draft for.
            ctx: Workflow context used to dispatch the resulting package.
        """
        body = (
            f"Launch plan for {task.topic}.\n\n"
            "- Outline the customer message.\n"
            "- Highlight three differentiators.\n"
            "- Close with a next-step CTA.\n"
            f"(iteration {task.iteration})"
        )
        package = DraftPackage(topic=task.topic, content=body, iteration=task.iteration)
        await ctx.send_message(package, target_id="draft_review")
|
||||
|
||||
|
||||
class DraftReviewRouter(Executor):
    """Bridges drafts and human reviewers inside the sub-workflow.

    Incoming ``DraftPackage`` messages are converted into ``ReviewRequest``
    payloads and sent to the ``sub_review_requests`` executor; review decisions
    that come back are forwarded unchanged to ``draft_finaliser``.
    """

    def __init__(self) -> None:
        # Fixed executor id; ``DraftWriter`` targets this executor by name.
        super().__init__(id="draft_review")

    @handler
    async def request_review(self, draft: DraftPackage, ctx: WorkflowContext[ReviewRequest]) -> None:
        """Create a human review request for ``draft``.

        Args:
            draft: The draft to be reviewed.
            ctx: Workflow context used to dispatch the request.
        """
        # An empty draft has no lines at all; fall back to an empty excerpt
        # instead of raising IndexError on ``splitlines()[0]``.
        lines = draft.content.splitlines()
        excerpt = lines[0] if lines else ""

        request = ReviewRequest(
            topic=draft.topic,
            iteration=draft.iteration,
            draft_excerpt=excerpt,
            # NOTE(review): ``due_iso`` is filled from the draft's creation time
            # because ``DraftPackage`` carries no due date - confirm this is intended.
            due_iso=draft.created_at.isoformat(),
            reviewer_guidance=[
                "Ensure tone matches launch messaging",
                "Confirm CTA is action-oriented",
            ],
        )
        await ctx.send_message(request, target_id="sub_review_requests")

    @handler
    async def forward_decision(
        self,
        decision: RequestResponse[ReviewRequest, str],
        ctx: WorkflowContext[RequestResponse[ReviewRequest, str]],
    ) -> None:
        """Pass a review decision straight through to the finaliser.

        Args:
            decision: The response paired with its original ``ReviewRequest``.
            ctx: Workflow context used to forward the message.
        """
        await ctx.send_message(decision, target_id="draft_finaliser")
|
||||
|
||||
|
||||
class DraftFinaliser(Executor):
    """Acts on the reviewer's verdict: finish the draft or request a revision."""

    def __init__(self) -> None:
        # Fixed executor id; the review router targets this executor by name.
        super().__init__(id="draft_finaliser")

    @handler
    async def on_review_decision(
        self,
        decision: RequestResponse[ReviewRequest, str],
        ctx: WorkflowContext[DraftTask, FinalDraft],
    ) -> None:
        """Emit a ``FinalDraft`` on approval, otherwise queue another draft.

        Only the exact reply ``"approve"`` (after trimming whitespace and
        lower-casing) counts as approval; any other reply, including an empty
        one, starts a new iteration.

        Args:
            decision: Reviewer response together with the original request.
            ctx: Workflow context used to yield output or send a follow-up task.
        """
        verdict = (decision.data or "").strip().lower()

        # Fall back to placeholder values when the original request is missing.
        request = decision.original_request
        if request:
            topic, iteration = request.topic, request.iteration
        else:
            topic, iteration = "unknown topic", 1

        if verdict == "approve":
            approved = FinalDraft(
                topic=topic,
                content=f"Approved launch narrative for {topic} (iteration {iteration}).",
                iterations=iteration,
                approved_at=_utc_now(),
            )
            await ctx.yield_output(approved)
            return

        # Not approved: loop back with a follow-up task. A real workflow would
        # fold the human guidance in; this sample only bumps the counter.
        retry = DraftTask(
            topic=topic,
            due=_utc_now() + timedelta(hours=1),
            iteration=iteration + 1,
        )
        await ctx.send_message(retry, target_id="draft_writer")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parent workflow executors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class LaunchCoordinator(Executor):
    """Parent-workflow executor that starts the job and records the result."""

    def __init__(self) -> None:
        super().__init__(id="launch_coordinator")
        # Populated by ``collect_final`` once an approved draft arrives.
        self._final: FinalDraft | None = None

    @handler
    async def kick_off(self, topic: str, ctx: WorkflowContext[DraftTask]) -> None:
        """Create the initial ``DraftTask`` (due in two hours) and send it to the sub-workflow.

        Args:
            topic: Subject to write a launch draft about.
            ctx: Workflow context used to dispatch the task.
        """
        deadline = _utc_now() + timedelta(hours=2)
        await ctx.send_message(DraftTask(topic=topic, due=deadline), target_id="launch_subworkflow")

    @handler
    async def collect_final(self, draft: FinalDraft, ctx: WorkflowContext[None, FinalDraft]) -> None:
        """Store, print, and yield the approved draft.

        Args:
            draft: The approved draft received from the sub-workflow.
            ctx: Workflow context used to yield the workflow output.
        """
        stamp = draft.approved_at
        result = draft

        # ``approved_at`` may arrive as a string rather than a datetime
        # (presumably after a serialisation round-trip - confirm). Try to parse
        # it; on failure keep the value exactly as received.
        if isinstance(stamp, str):
            try:
                parsed = datetime.fromisoformat(stamp)
                result = replace(draft, approved_at=parsed)
                stamp = parsed
            except ValueError:
                pass

        self._final = result

        if hasattr(stamp, "isoformat"):
            approved_display = stamp.isoformat()
        else:
            approved_display = str(stamp)

        print("\n>>> Parent workflow received approved draft:")
        print(f"- Topic: {result.topic}")
        print(f"- Iterations: {result.iterations}")
        print(f"- Approved at: {approved_display}")
        print(f"- Content: {result.content}\n")

        await ctx.yield_output(result)

    @property
    def final_result(self) -> FinalDraft | None:
        """The draft captured by ``collect_final``, or ``None`` if none has arrived."""
        return self._final
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Workflow construction helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def build_sub_workflow() -> WorkflowExecutor:
    """Assemble the draft/review sub-workflow and wrap it as a single executor.

    Returns:
        A ``WorkflowExecutor`` with id ``"launch_subworkflow"`` wrapping the
        writer -> router -> request-info -> finaliser graph.
    """
    draft_writer = DraftWriter()
    review_router = DraftReviewRouter()
    review_gateway = RequestInfoExecutor(id="sub_review_requests")
    draft_finaliser = DraftFinaliser()

    graph = WorkflowBuilder().set_start_executor(draft_writer)
    graph = graph.add_edge(draft_writer, review_router)
    graph = graph.add_edge(review_router, review_gateway)
    # Only RequestResponse messages travel along the two conditional edges.
    graph = graph.add_edge(review_gateway, review_router, condition=lambda msg: isinstance(msg, RequestResponse))
    graph = graph.add_edge(review_router, draft_finaliser, condition=lambda msg: isinstance(msg, RequestResponse))
    graph = graph.add_edge(review_gateway, draft_finaliser)
    graph = graph.add_edge(draft_finaliser, draft_writer)  # permits revision loops

    return WorkflowExecutor(graph.build(), id="launch_subworkflow")
|
||||
|
||||
|
||||
def build_parent_workflow(storage: FileCheckpointStorage) -> tuple[LaunchCoordinator, Workflow]:
    """Build the checkpoint-enabled parent workflow around the sub-workflow.

    Args:
        storage: Checkpoint backend passed to ``with_checkpointing``.

    Returns:
        The coordinator instance (so callers can read ``final_result``) and the
        built parent workflow.
    """
    coordinator = LaunchCoordinator()
    sub_workflow_executor = build_sub_workflow()
    review_gateway = RequestInfoExecutor(id="parent_review_gateway")

    graph = WorkflowBuilder().set_start_executor(coordinator)
    graph = graph.add_edge(coordinator, sub_workflow_executor)
    # Approved drafts go back to the coordinator ...
    graph = graph.add_edge(
        sub_workflow_executor,
        coordinator,
        condition=lambda msg: isinstance(msg, FinalDraft),
    )
    # ... while request-info messages are routed to the parent's gateway.
    graph = graph.add_edge(
        sub_workflow_executor,
        review_gateway,
        condition=lambda msg: isinstance(msg, RequestInfoMessage),
    )
    graph = graph.add_edge(review_gateway, sub_workflow_executor)
    workflow = graph.with_checkpointing(storage).build()

    return coordinator, workflow
|
||||
|
||||
|
||||
async def main() -> None:
    """Run the sample: pause at the review request, then resume from a checkpoint.

    Stage 1 runs the parent workflow until it is idle with a pending request
    and records the request id. Stage 2 rebuilds the workflow and resumes from
    the most recent checkpoint, supplying ``"approve"`` as the response.
    """
    # Start from a clean directory so only this run's checkpoints are listed.
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
    for stale in CHECKPOINT_DIR.glob("*.json"):
        stale.unlink()

    storage = FileCheckpointStorage(CHECKPOINT_DIR)
    _, workflow = build_parent_workflow(storage)

    print("\n=== Stage 1: run until sub-workflow requests human review ===")
    request_id: str | None = None
    async for event in workflow.run_stream("Contoso Gadget Launch"):
        # Remember only the first request id that is surfaced.
        if request_id is None and isinstance(event, RequestInfoEvent):
            request_id = event.request_id
            print(f"Captured review request id: {request_id}")
        if isinstance(event, WorkflowStatusEvent) and event.state is WorkflowRunState.IDLE_WITH_PENDING_REQUESTS:
            break

    if request_id is None:
        print("Sub-workflow completed without requesting review.")
        return

    checkpoints = await storage.list_checkpoints(workflow.id)
    if not checkpoints:
        print("No checkpoints written.")
        return

    # Resume from the checkpoint with the latest timestamp.
    resume_checkpoint = sorted(checkpoints, key=lambda cp: cp.timestamp)[-1]
    print(f"Using checkpoint {resume_checkpoint.checkpoint_id} at iteration {resume_checkpoint.iteration_count}")

    # Peek into the raw checkpoint JSON to show which executor holds the pending request.
    checkpoint_path = storage.storage_path / f"{resume_checkpoint.checkpoint_id}.json"
    if checkpoint_path.exists():
        exec_states = json.loads(checkpoint_path.read_text()).get("executor_states", {})
        sub_pending = exec_states.get("sub_review_requests", {}).get("request_events", {})
        parent_pending = exec_states.get("parent_review_gateway", {}).get("request_events", {})
        print(f"Pending review requests (sub executor snapshot): {list(sub_pending.keys())}")
        print(f"Pending review requests (parent executor snapshot): {list(parent_pending.keys())}")

    print("\n=== Stage 2: resume from checkpoint and approve draft ===")
    # Rebuild fresh instances to mimic a separate process resuming
    coordinator2, workflow2 = build_parent_workflow(storage)

    final_event: WorkflowOutputEvent | None = None
    resumed_events = workflow2.run_stream_from_checkpoint(
        resume_checkpoint.checkpoint_id,
        responses={request_id: "approve"},
    )
    async for event in resumed_events:
        # Keep the last output event seen.
        if isinstance(event, WorkflowOutputEvent):
            final_event = event

    if final_event is None:
        print("Workflow did not complete after resume.")
        return

    print("\n=== Final Draft (from resumed run) ===")
    print(final_event.data)

    if coordinator2.final_result is not None:
        print("Coordinator stored final draft successfully.")
    else:
        print("Coordinator did not capture final result via handler.")
|
||||
|
||||
"""
|
||||
Sample Output:
|
||||
|
||||
=== Stage 1: run until sub-workflow requests human review ===
|
||||
Captured review request id: 032c9f3a-ad1b-4a52-89be-a168d6663011
|
||||
Using checkpoint 54f376c2-f849-44e4-9d8d-e627fd27ab96 at iteration 2
|
||||
Pending review requests (sub executor snapshot): []
|
||||
Pending review requests (parent executor snapshot): ['032c9f3a-ad1b-4a52-89be-a168d6663011']
|
||||
|
||||
=== Stage 2: resume from checkpoint and approve draft ===
|
||||
|
||||
>>> Parent workflow received approved draft:
|
||||
- Topic: Contoso Gadget Launch
|
||||
- Iterations: 1
|
||||
- Approved at: 2025-09-25T14:29:34.479164
|
||||
- Content: Approved launch narrative for Contoso Gadget Launch (iteration 1).
|
||||
|
||||
|
||||
=== Final Draft (from resumed run) ===
|
||||
FinalDraft(topic='Contoso Gadget Launch', content='Approved launch narrative for Contoso
|
||||
Gadget Launch (iteration 1).', iterations=1, approved_at=datetime.datetime(2025, 9, 25, 14, 29, 34, 479164))
|
||||
Coordinator stored final draft successfully.
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -4,8 +4,6 @@ import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from typing_extensions import Never
|
||||
|
||||
from agent_framework import (
|
||||
Executor,
|
||||
WorkflowBuilder,
|
||||
@@ -14,6 +12,7 @@ from agent_framework import (
|
||||
WorkflowExecutor,
|
||||
handler,
|
||||
)
|
||||
from typing_extensions import Never
|
||||
|
||||
"""
|
||||
Sample: Sub-Workflows (Basics)
|
||||
|
||||
+4
-3
@@ -4,8 +4,6 @@ import asyncio
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from typing_extensions import Never
|
||||
|
||||
from agent_framework import (
|
||||
Executor,
|
||||
RequestInfoExecutor,
|
||||
@@ -16,6 +14,7 @@ from agent_framework import (
|
||||
WorkflowExecutor,
|
||||
handler,
|
||||
)
|
||||
from typing_extensions import Never
|
||||
|
||||
"""
|
||||
Sample: Sub-workflow with parallel request handling by specialized interceptors
|
||||
@@ -170,7 +169,9 @@ class ResourceRequester(Executor):
|
||||
|
||||
@handler
|
||||
async def handle_policy_response(
|
||||
self, response: RequestResponse[PolicyCheckRequest, PolicyResponse], ctx: WorkflowContext[Never, RequestFinished]
|
||||
self,
|
||||
response: RequestResponse[PolicyCheckRequest, PolicyResponse],
|
||||
ctx: WorkflowContext[Never, RequestFinished],
|
||||
) -> None:
|
||||
"""Handle policy check response."""
|
||||
if response.data:
|
||||
|
||||
@@ -0,0 +1,299 @@
|
||||
# Copyright (c) Microsoft. All rights reserved.
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from agent_framework import (
|
||||
ChatAgent,
|
||||
FileCheckpointStorage,
|
||||
MagenticBuilder,
|
||||
MagenticPlanReviewDecision,
|
||||
MagenticPlanReviewReply,
|
||||
MagenticPlanReviewRequest,
|
||||
RequestInfoEvent,
|
||||
WorkflowCheckpoint,
|
||||
WorkflowOutputEvent,
|
||||
WorkflowRunState,
|
||||
WorkflowStatusEvent,
|
||||
)
|
||||
from agent_framework.openai import OpenAIChatClient
|
||||
|
||||
"""
|
||||
Sample: Magentic Orchestration + Checkpointing
|
||||
|
||||
The goal of this sample is to show the exact mechanics needed to pause a Magentic
|
||||
workflow that requires human plan review, persist the outstanding request via a
|
||||
checkpoint, and later resume the workflow by feeding in the saved response.
|
||||
|
||||
Concepts highlighted here:
|
||||
1. **Deterministic executor IDs** - the orchestrator and plan-review request executor
|
||||
must keep stable IDs so the checkpoint state aligns when we rebuild the graph.
|
||||
2. **Executor snapshotting** - checkpoints capture the `RequestInfoExecutor` state,
|
||||
specifically the pending plan-review request map, at superstep boundaries.
|
||||
3. **Resume with responses** - `Workflow.run_stream_from_checkpoint` accepts a
|
||||
`responses` mapping so we can inject the stored human reply during restoration.
|
||||
|
||||
Prerequisites:
|
||||
- OpenAI environment variables configured for `OpenAIChatClient`.
|
||||
"""
|
||||
|
||||
TASK = (
|
||||
"Draft a concise internal brief describing how our research and implementation teams should collaborate "
|
||||
"to launch a beta feature for data-driven email summarization. Highlight the key milestones, "
|
||||
"risks, and communication cadence."
|
||||
)
|
||||
|
||||
# Dedicated folder for captured checkpoints. Keeping it under the sample directory
|
||||
# makes it easy to inspect the JSON blobs produced by each run.
|
||||
CHECKPOINT_DIR = Path(__file__).parent / "tmp" / "magentic_checkpoints"
|
||||
|
||||
|
||||
def build_workflow(checkpoint_storage: FileCheckpointStorage):
    """Create the Magentic workflow with plan review and checkpointing enabled.

    Args:
        checkpoint_storage: Backend handed to ``with_checkpointing``.

    Returns:
        The object produced by ``MagenticBuilder.build()``.
    """
    # Two plain ChatAgents take part in the orchestration. They need no extra
    # state handling because their inputs/outputs are fully described by chat messages.
    researcher = ChatAgent(
        name="ResearcherAgent",
        description="Collects background facts and references for the project.",
        instructions=("You are the research lead. Gather crisp bullet points the team should know."),
        chat_client=OpenAIChatClient(),
    )
    writer = ChatAgent(
        name="WriterAgent",
        description="Synthesizes the final brief for stakeholders.",
        instructions=("You convert the research notes into a structured brief with milestones and risks."),
        chat_client=OpenAIChatClient(),
    )

    # Register participants, enable the plan-review path, configure the standard
    # manager, and attach the checkpoint backend before building.
    builder = MagenticBuilder()
    builder = builder.participants(researcher=researcher, writer=writer)
    builder = builder.with_plan_review()
    builder = builder.with_standard_manager(
        chat_client=OpenAIChatClient(),
        max_round_count=10,
        max_stall_count=3,
    )
    builder = builder.with_checkpointing(checkpoint_storage)
    return builder.build()
|
||||
|
||||
|
||||
async def main() -> None:
    """Run the three-stage Magentic checkpoint demonstration.

    Stage 1 runs until a plan-review request is pending. Stage 2 resumes from
    the newest checkpoint and approves the plan. Stage 3 resumes once more from
    a checkpoint recorded after the plan review, if one exists.
    """
    # Stage 0: empty the checkpoint folder so only checkpoints written by this
    # invocation are inspected; stale files from earlier runs would confuse the analysis.
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
    for stale in CHECKPOINT_DIR.glob("*.json"):
        stale.unlink()

    checkpoint_storage = FileCheckpointStorage(CHECKPOINT_DIR)

    def _recency(cp: WorkflowCheckpoint):
        # Orders checkpoints by iteration first, then by timestamp.
        return (cp.iteration_count, cp.timestamp)

    def _pending_message_count(cp: WorkflowCheckpoint) -> int:
        # Counts messages across all list-valued entries of the checkpoint's message map.
        return sum(len(queued) for queued in cp.messages.values() if isinstance(queued, list))

    print("\n=== Stage 1: run until plan review request (checkpointing active) ===")
    workflow = build_workflow(checkpoint_storage)

    # Run until the plan-review RequestInfoEvent is surfaced. Its request_id must be
    # reused on resume. In a real system this is where a UI would present the plan.
    plan_review_request_id: str | None = None
    async for event in workflow.run_stream(TASK):
        if isinstance(event, RequestInfoEvent) and event.request_type is MagenticPlanReviewRequest:
            plan_review_request_id = event.request_id
            print(f"Captured plan review request: {plan_review_request_id}")

        if isinstance(event, WorkflowStatusEvent) and event.state is WorkflowRunState.IDLE_WITH_PENDING_REQUESTS:
            break

    if plan_review_request_id is None:
        print("No plan review request emitted; nothing to resume.")
        return

    checkpoints = await checkpoint_storage.list_checkpoints(workflow.workflow.id)
    if not checkpoints:
        print("No checkpoints persisted.")
        return

    resume_checkpoint = max(checkpoints, key=_recency)
    print(f"Using checkpoint {resume_checkpoint.checkpoint_id} at iteration {resume_checkpoint.iteration_count}")

    # Show that the checkpoint JSON contains the pending plan-review request record.
    checkpoint_path = checkpoint_storage.storage_path / f"{resume_checkpoint.checkpoint_id}.json"
    if checkpoint_path.exists():
        with checkpoint_path.open() as f:
            snapshot = json.load(f)
        request_map = snapshot.get("executor_states", {}).get("magentic_plan_review", {}).get("request_events", {})
        print(f"Pending plan-review requests persisted in checkpoint: {list(request_map.keys())}")

    print("\n=== Stage 2: resume from checkpoint and approve plan ===")
    resumed_workflow = build_workflow(checkpoint_storage)

    approval = MagenticPlanReviewReply(decision=MagenticPlanReviewDecision.APPROVE)
    # Resume and supply the approval in a single call: the checkpoint id selects the
    # state to restore and ``responses`` maps the pending request id to its reply.
    final_event: WorkflowOutputEvent | None = None
    stage2_events = resumed_workflow.workflow.run_stream_from_checkpoint(
        resume_checkpoint.checkpoint_id,
        responses={plan_review_request_id: approval},
    )
    async for event in stage2_events:
        if isinstance(event, WorkflowOutputEvent):
            final_event = event

    if final_event is None:
        print("Workflow did not complete after resume.")
        return

    # Display the answer as proof the orchestration completed after resuming.
    result = final_event.data
    if not result:
        print("No result data from workflow.")
        return
    text = getattr(result, "text", None) or str(result)
    print("\n=== Final Answer ===")
    print(text)

    # ------------------------------------------------------------------
    # Stage 3: demonstrate resuming from a later checkpoint (post-plan)
    # ------------------------------------------------------------------
    all_checkpoints = await checkpoint_storage.list_checkpoints(resume_checkpoint.workflow_id)
    later_checkpoints = [cp for cp in all_checkpoints if cp.iteration_count > resume_checkpoint.iteration_count]
    # Prefer later checkpoints that still hold pending messages; otherwise use any later one.
    candidates = [cp for cp in later_checkpoints if _pending_message_count(cp) > 0] or later_checkpoints
    if not candidates:
        print("\nNo additional checkpoints recorded beyond plan approval; sample complete.")
        return
    post_plan_checkpoint = max(candidates, key=_recency)

    print("\n=== Stage 3: resume from post-plan checkpoint ===")
    pending_messages = _pending_message_count(post_plan_checkpoint)
    print(
        f"Resuming from checkpoint {post_plan_checkpoint.checkpoint_id} at iteration "
        f"{post_plan_checkpoint.iteration_count} (pending messages: {pending_messages})"
    )
    if pending_messages == 0:
        print("Checkpoint has no pending messages; no additional work expected on resume.")

    final_event_post: WorkflowOutputEvent | None = None
    saw_any_event = False
    post_plan_workflow = build_workflow(checkpoint_storage)
    stage3_events = post_plan_workflow.workflow.run_stream_from_checkpoint(
        post_plan_checkpoint.checkpoint_id,
        responses={},
    )
    async for event in stage3_events:
        saw_any_event = True
        if isinstance(event, WorkflowOutputEvent):
            final_event_post = event

    if final_event_post is None:
        if saw_any_event:
            print("Workflow did not complete after post-plan resume.")
            return
        # Nothing was emitted at all: reuse the Stage 2 answer.
        print("No new events were emitted; checkpoint already captured a completed run.")
        print("\n=== Final Answer (post-plan resume) ===")
        print(text)
        return

    post_result = final_event_post.data
    if not post_result:
        print("No result data from post-plan resume.")
        return

    post_text = getattr(post_result, "text", None) or str(post_result)
    print("\n=== Final Answer (post-plan resume) ===")
    print(post_text)
|
||||
|
||||
"""
|
||||
Sample Output:
|
||||
|
||||
=== Stage 1: run until plan review request (checkpointing active) ===
|
||||
Captured plan review request: 3a1a4a09-4ed1-4c90-9cf6-9ac488d452c0
|
||||
Using checkpoint 4c76d77a-6ff8-4d2b-84f6-824771ffac7e at iteration 1
|
||||
Pending plan-review requests persisted in checkpoint: ['3a1a4a09-4ed1-4c90-9cf6-9ac488d452c0']
|
||||
|
||||
=== Stage 2: resume from checkpoint and approve plan ===
|
||||
|
||||
=== Final Answer ===
|
||||
Certainly! Here's your concise internal brief on how the research and implementation teams should collaborate for
|
||||
the beta launch of the data-driven email summarization feature:
|
||||
|
||||
---
|
||||
|
||||
**Internal Brief: Collaboration Plan for Data-driven Email Summarization Beta Launch**
|
||||
|
||||
**Collaboration Approach**
|
||||
- **Joint Kickoff:** Research and Implementation teams hold a project kickoff to align on objectives, requirements,
|
||||
and success metrics.
|
||||
- **Ongoing Coordination:** Teams collaborate closely; researchers share model developments and insights, while
|
||||
implementation ensures smooth integration and user experience.
|
||||
- **Real-time Feedback Loop:** Implementation provides early feedback on technical integration and UX, while
|
||||
Research evaluates initial performance and user engagement signals post-integration.
|
||||
|
||||
**Key Milestones**
|
||||
1. **Requirement Finalization & Scoping** - Define MVP feature set and success criteria.
|
||||
2. **Model Prototyping & Evaluation** - Researchers develop and validate summarization models with agreed metrics.
|
||||
3. **Integration & Internal Testing** - Implementation team integrates the model; internal alpha testing and
|
||||
compliance checks.
|
||||
4. **Beta User Onboarding** - Recruit a select cohort of beta users and guide them through onboarding.
|
||||
5. **Beta Launch & Monitoring** - Soft-launch for beta group, with active monitoring of usage, feedback,
|
||||
and performance.
|
||||
6. **Iterative Improvements** - Address issues, refine features, and prepare for possible broader rollout.
|
||||
|
||||
**Top Risks**
|
||||
- **Data Privacy & Compliance:** Strict protocols and compliance reviews to prevent data leakage.
|
||||
- **Model Quality (Bias, Hallucination):** Careful monitoring of summary accuracy; rapid iterations if critical
|
||||
errors occur.
|
||||
- **User Adoption:** Ensuring the beta solves genuine user needs, collecting actionable feedback early.
|
||||
- **Feedback Quality & Quantity:** Proactively schedule user outreach to ensure substantive beta feedback.
|
||||
|
||||
**Communication Cadence**
|
||||
- **Weekly Team Syncs:** Short all-hands progress and blockers meeting.
|
||||
- **Bi-Weekly Stakeholder Check-ins:** Leadership and project leads address escalations and strategic decisions.
|
||||
- **Dedicated Slack Channel:** For real-time queries and updates.
|
||||
- **Documentation Hub:** Up-to-date project docs and FAQs on a shared internal wiki.
|
||||
- **Post-Milestone Retrospectives:** After critical phases (e.g., alpha, beta), reviewing what worked and what needs
|
||||
improvement.
|
||||
|
||||
**Summary**
|
||||
Clear alignment, consistent communication, and iterative feedback are key to a successful beta. All team members are
|
||||
expected to surface issues quickly and keep documentation current as we drive toward launch.
|
||||
---
|
||||
|
||||
=== Stage 3: resume from post-plan checkpoint ===
|
||||
Resuming from checkpoint 9a3b... at iteration 3 (pending messages: 0)
|
||||
No new events were emitted; checkpoint already captured a completed run.
|
||||
|
||||
=== Final Answer (post-plan resume) ===
|
||||
(same brief as above)
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user