mirror of
https://github.com/microsoft/agent-framework.git
synced 2026-06-16 21:04:09 +08:00
a2856d3b92
* restructure: Python samples into progressive 01-05 layout - 01-get-started/: 6 numbered steps (hello agent → hosting) - 02-agents/: all agent concept samples (tools, middleware, providers, etc.) - 03-workflows/: ALL existing workflow samples preserved as-is - 04-hosting/: azure-functions, durabletask, a2a - 05-end-to-end/: demos, evaluation, hosted agents - Old files moved to _to_delete/ for review - Added AGENTS.md with structure documentation - autogen-migration/ and semantic-kernel-migration/ preserved at root * fix: switch to AzureOpenAI Foundry, fix CI failures - Switch all 01-get-started samples to AzureOpenAIResponsesClient with Azure AI Foundry project endpoint (AZURE_AI_PROJECT_ENDPOINT + AZURE_OPENAI_RESPONSES_DEPLOYMENT_NAME + AzureCliCredential) - Add _to_delete/ and 05-end-to-end/ to pyrightconfig.samples.json excludes - Fix test paths in packages/ that referenced old getting_started/ dirs: durabletask conftest + streaming test, azurefunctions conftest, devui conftest + capture_messages + openai_sdk_integration - Fix workflow_as_agent_human_in_the_loop.py import (sibling import) - Update hosting READMEs and tool comment paths - Replace root README.md with new structure overview - Update AGENTS.md to document Azure OpenAI Foundry as default provider * cleanup: remove _to_delete folder, copy resource files to active dirs All files in _to_delete/ were either: - Exact duplicates of files in the new structure (240 files) - Same file with only comment path updates (100 files) - One import-fix diff (workflow_as_agent_human_in_the_loop.py) - One superseded minimal_sample.py Resource files (sample.pdf, countries.json, employees.pdf, weather.json) copied to 02-agents/sample_assets/ and 02-agents/resources/ since active samples reference them. * fix: address PR review comments, centralize resources, remove root duplicates - Fix type annotation in 04_memory.py (string union -> proper types) - Fix old sample paths in observability files - Fix grammar/spelling in observability samples - Move sample_assets/ and resources/ to shared/ folder - Remove 8 duplicate observability files from 02-agents root - Update resource path references in multimodal_input and provider samples * fix: update broken links from old getting_started paths to new structure - Update relative paths in READMEs: getting_started/ → 01-get-started/, 02-agents/, 03-workflows/, 04-hosting/, 05-end-to-end/ - Fix absolute GitHub URLs in package READMEs - Fix broken link in ollama package README * fix: convert absolute GitHub URLs to relative paths for link checker Absolute URLs to python/samples/ on main branch 404 until PR merges. Converted to relative paths that linkspector can verify locally. * fix: update link for handoff sample moved to orchestrations/ * fix: update chatkit-integration README path from demos/ to 05-end-to-end/ * fix: update broken links in orchestrations README to match flat directory structure
158 lines
5.7 KiB
Python
158 lines
5.7 KiB
Python
# Copyright (c) Microsoft. All rights reserved.
|
|
|
|
"""
|
|
Sample: Checkpointing and Resuming a Workflow
|
|
|
|
Purpose:
|
|
This sample shows how to enable checkpointing for a long-running workflow
|
|
that can be paused and resumed.
|
|
|
|
What you learn:
|
|
- How to configure checkpointing storage (InMemoryCheckpointStorage for testing)
|
|
- How to resume a workflow from a checkpoint after interruption
|
|
- How to implement executor state management with checkpoint hooks
|
|
- How to handle workflow interruptions and automatic recovery
|
|
|
|
Pipeline:
|
|
This sample shows a workflow that computes factor pairs for numbers up to a given limit:
|
|
1) A start executor that receives the upper limit and creates the initial task
|
|
2) A worker executor that processes each number to find its factor pairs
|
|
3) The worker uses checkpoint hooks to save/restore its internal state
|
|
|
|
Prerequisites:
|
|
- Basic understanding of workflow concepts, including executors, edges, events, etc.
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from random import random
|
|
from typing import Any
|
|
|
|
from agent_framework import (
|
|
Executor,
|
|
InMemoryCheckpointStorage,
|
|
WorkflowBuilder,
|
|
WorkflowCheckpoint,
|
|
WorkflowContext,
|
|
handler,
|
|
)
|
|
|
|
if sys.version_info >= (3, 12):
|
|
from typing import override # type: ignore # pragma: no cover
|
|
else:
|
|
from typing_extensions import override # type: ignore[import] # pragma: no cover
|
|
|
|
|
|
@dataclass
|
|
class ComputeTask:
|
|
"""Task containing the list of numbers remaining to be processed."""
|
|
|
|
remaining_numbers: list[int]
|
|
|
|
|
|
class StartExecutor(Executor):
|
|
"""Initiates the workflow by providing the upper limit for factor pair computation."""
|
|
|
|
@handler
|
|
async def start(self, upper_limit: int, ctx: WorkflowContext[ComputeTask]) -> None:
|
|
"""Start the workflow with a list of numbers to process."""
|
|
print(f"StartExecutor: Starting factor pair computation up to {upper_limit}")
|
|
await ctx.send_message(ComputeTask(remaining_numbers=list(range(1, upper_limit + 1))))
|
|
|
|
|
|
class WorkerExecutor(Executor):
|
|
"""Processes numbers to compute their factor pairs and manages executor state for checkpointing."""
|
|
|
|
def __init__(self, id: str) -> None:
|
|
super().__init__(id=id)
|
|
self._composite_number_pairs: dict[int, list[tuple[int, int]]] = {}
|
|
|
|
@handler
|
|
async def compute(
|
|
self,
|
|
task: ComputeTask,
|
|
ctx: WorkflowContext[ComputeTask, dict[int, list[tuple[int, int]]]],
|
|
) -> None:
|
|
"""Process the next number in the task, computing its factor pairs."""
|
|
next_number = task.remaining_numbers.pop(0)
|
|
|
|
print(f"WorkerExecutor: Computing factor pairs for {next_number}")
|
|
pairs: list[tuple[int, int]] = []
|
|
for i in range(1, next_number):
|
|
if next_number % i == 0:
|
|
pairs.append((i, next_number // i))
|
|
self._composite_number_pairs[next_number] = pairs
|
|
|
|
if not task.remaining_numbers:
|
|
# All numbers processed - output the results
|
|
await ctx.yield_output(self._composite_number_pairs)
|
|
else:
|
|
# More numbers to process - continue with remaining task
|
|
await ctx.send_message(task)
|
|
|
|
@override
|
|
async def on_checkpoint_save(self) -> dict[str, Any]:
|
|
"""Save the executor's internal state for checkpointing."""
|
|
return {"composite_number_pairs": self._composite_number_pairs}
|
|
|
|
@override
|
|
async def on_checkpoint_restore(self, state: dict[str, Any]) -> None:
|
|
"""Restore the executor's internal state from a checkpoint."""
|
|
self._composite_number_pairs = state.get("composite_number_pairs", {})
|
|
|
|
|
|
async def main():
|
|
# Build workflow with checkpointing enabled
|
|
checkpoint_storage = InMemoryCheckpointStorage()
|
|
start = StartExecutor(id="start")
|
|
worker = WorkerExecutor(id="worker")
|
|
workflow_builder = (
|
|
WorkflowBuilder(start_executor=start, checkpoint_storage=checkpoint_storage)
|
|
.add_edge(start, worker)
|
|
.add_edge(worker, worker) # Self-loop for iterative processing
|
|
)
|
|
|
|
# Run workflow with automatic checkpoint recovery
|
|
latest_checkpoint: WorkflowCheckpoint | None = None
|
|
while True:
|
|
workflow = workflow_builder.build()
|
|
|
|
# Start from checkpoint or fresh execution
|
|
print(f"\n** Workflow {workflow.id} started **")
|
|
event_stream = (
|
|
workflow.run(message=10, stream=True)
|
|
if latest_checkpoint is None
|
|
else workflow.run(checkpoint_id=latest_checkpoint.checkpoint_id, stream=True)
|
|
)
|
|
|
|
output: str | None = None
|
|
async for event in event_stream:
|
|
if event.type == "output":
|
|
output = event.data
|
|
break
|
|
if event.type == "superstep_completed" and random() < 0.5:
|
|
# Randomly simulate system interruptions
|
|
# The type="superstep_completed" event ensures we only interrupt after
|
|
# the current super-step is fully complete and checkpointed.
|
|
# If we interrupt mid-step, the workflow may resume from an earlier point.
|
|
print("\n** Simulating workflow interruption. Stopping execution. **")
|
|
break
|
|
|
|
# Find the latest checkpoint to resume from
|
|
latest_checkpoint = await checkpoint_storage.get_latest(workflow_name=workflow.name)
|
|
if not latest_checkpoint:
|
|
raise RuntimeError("No checkpoints available to resume from.")
|
|
print(
|
|
f"Checkpoint {latest_checkpoint.checkpoint_id}: "
|
|
f"(iter={latest_checkpoint.iteration_count}, messages={latest_checkpoint.messages})"
|
|
)
|
|
|
|
if output is not None:
|
|
print(f"\nWorkflow completed successfully with output: {output}")
|
|
break
|
|
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(main())
|