# agent-framework/python/packages/lab/lightning/samples/train_math_agent.py

# Copyright (c) Microsoft. All rights reserved.

"""This sample demonstrates the basic usage pattern of agent-framework-lab-lightning.

It trains a math agent using a dataset in `data/math/` to solve mathematical problems
using an MCP calculator tool.

One GPU with 40GB of memory is sufficient for this sample.
"""

from __future__ import annotations

import argparse
import asyncio
import json
import math
import os
import re
import string
from typing import TypedDict, cast

import sympy  # type: ignore[import-untyped,reportMissingImports]
from agent_framework import AgentResponse, ChatAgent, MCPStdioTool
from agent_framework.lab.lightning import AgentFrameworkTracer
from agent_framework.openai import OpenAIChatClient
from agentlightning import LLM, Dataset, Trainer, rollout
from agentlightning.algorithm.verl import VERL


class MathProblem(TypedDict):
    """Schema of a single training or evaluation sample.

    A task structure has to carry everything needed for:

    - the agent to work on the task (here: 'question')
    - scoring the outcome (here: 'result', the ground truth)

    Declaring this type is optional; the example works without it.
    """

    # Every key below mirrors a field of the dataset records.
    id: str
    # The math problem handed to the agent.
    question: str
    # Step-by-step solution (not used in training).
    chain: str
    # Ground-truth answer used for evaluation.
    result: str
    source: str


def _load_jsonl(file_path: str) -> Dataset[MathProblem]:
    """Load a JSON Lines file as a list of task samples.

    Each non-blank line is parsed as one JSON object whose keys are passed as
    keyword arguments to ``MathProblem``, so every sample should match that
    task structure.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        The parsed samples as a list, typed as ``Dataset[MathProblem]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so loading does not depend on the platform's default encoding.
    with open(file_path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) are skipped; json.loads would reject them.
        raw_data = [MathProblem(**json.loads(line)) for line in f if line.strip()]
    return cast(Dataset[MathProblem], raw_data)


# Evaluation logic
# These functions evaluate whether the agent's answer matches the ground truth.
# Robust evaluation is crucial for RL training - the reward signal guides learning.


def _normalize_option(option: str) -> str:
    return re.sub(r"(\s+|\(|\))", "", option)


def _is_option_result(result: str) -> bool:
    """Tell whether ``result`` is a choice label: exactly one ASCII letter once normalized."""
    label = _normalize_option(result)
    # The length check keeps the substring test equivalent to single-letter membership.
    return len(label) == 1 and label in string.ascii_letters


def _float_eval(input_str: str) -> float:
    """Evaluate a textual math expression to a Python float.

    Everything from the first " = around " marker onwards is discarded; the
    remaining text is parsed with sympy and evaluated numerically.

    NOTE(review): sympy documents ``parse_expr`` as being built on ``eval``, and
    the text parsed here can come from the agent's answer - confirm that is
    acceptable for the environment this runs in.
    """
    # partition() yields the whole string as the head when the marker is absent.
    expression_text, _, _ = input_str.partition(" = around ")
    parsed = sympy.parse_expr(expression_text, evaluate=True)
    return float(parsed.evalf())


def _scalar_are_results_same(pred_result: str, true_result: str, rel_tol: float) -> bool:
    pred_result = str(pred_result) if pred_result is not None else ""
    true_result = str(true_result) if true_result is not None else ""

    if pred_result.strip() == true_result.strip():
        return True

    if _is_option_result(true_result):
        # The task is to select correct option
        true_result = _normalize_option(true_result)
        pred_result = _normalize_option(pred_result)
        return pred_result == true_result

    # The task is to calculate the result as a number
    try:
        pred_float = _float_eval(pred_result)
        true_float = _float_eval(true_result)
        return math.isclose(pred_float, true_float, rel_tol=rel_tol)
    except Exception:  # noqa: S110
        pass

    return False


def _is_result_correct(prediction: str, ground_truth: str) -> float:
    """Score a prediction against the ground truth: 1.0 on a match, 0.0 otherwise.

    Numeric answers are compared with a relative tolerance of 1e-2.
    """
    matches = _scalar_are_results_same(prediction, ground_truth, 1e-2)
    return 1.0 if matches else 0.0


def evaluate(result: AgentResponse, ground_truth: str) -> float:
    """Turn an agent response into a reward for RL training.

    Steps:
    1. Take the text of the last message in the response
    2. Pull out the answer that follows the ### marker
    3. Compare it with the ground truth and return the score (0.0 or 1.0)

    A response without messages, or without an extractable answer, scores 0.0.
    The reward signal is critical - it directly influences what the model learns.
    """
    # Nothing to grade if the agent produced no message at all
    message_count = len(result.messages)
    if message_count == 0:
        print("No response from agent. Assuming incorrect.")
        return 0.0

    # The answer is expected after the ### marker (as specified in agent instructions)
    last_text = result.messages[-1].text
    marker_match = re.search(r"###\s*(.+?)(\s*###|$)", last_text)
    if marker_match is None:
        print("No answer can be extracted from agent's response. Assuming incorrect.")
        return 0.0

    # Grade the extracted answer against the ground truth
    reward = _is_result_correct(marker_match.group(1), ground_truth)
    print(f"Reward: {reward}")
    return reward


# Agent Logic

# Clear instructions are important for consistent agent behavior.
# The ### format helps with reliable answer extraction during evaluation:
# `evaluate` searches the agent's last message for the text that follows `###`.
# `.strip()` removes the leading and trailing newlines of the triple-quoted literal.
AGENT_INSTRUCTION = """
Solve the following math problem. Use the calculator tool to help you calculate math expressions.

Output the answer when you are ready. The answer should be after three sharps (`###`), with no extra punctuations or texts. For example: ### 123
""".strip()  # noqa: E501


# The @rollout decorator is the key integration point with agent-lightning.
# It tells the training system that this function defines a trainable agent.
@rollout
async def math_agent(task: MathProblem, llm: LLM) -> float:
    """Run the math agent on one task and return its reward.

    This is the trainable agent function. What it needs to satisfy:

    1. It is decorated with @rollout
    2. It takes a task sample and an LLM object as parameters
    3. It returns a float reward score (0.0 to 1.0 typically)
    4. The LLM object carries the model being trained and its configuration

    During training:
    - llm.model: The model checkpoint being trained
    - llm.endpoint: vLLM server endpoint for inference
    - llm.sampling_parameters: Temperature, etc.
    """
    # MCPStdioTool provides calculator functionality via MCP protocol
    calculator = MCPStdioTool(name="calculator", command="uvx", args=["mcp-server-calculator"])
    async with calculator as mcp_server:
        # The chat client is built only after the calculator tool has been entered
        chat_client = OpenAIChatClient(
            model_id=llm.model,  # This is the model being trained
            api_key=os.getenv("OPENAI_API_KEY") or "dummy",  # Can be dummy when connecting to training LLM
            base_url=llm.endpoint,  # vLLM server endpoint provided by agent-lightning
        )
        chat_agent = ChatAgent(
            chat_client=chat_client,
            name="MathAgent",
            instructions=AGENT_INSTRUCTION,
            temperature=llm.sampling_parameters.get("temperature", 0.0),
        )
        async with chat_agent as agent:
            question = task["question"]
            print(f"Task: {question[:10]}...")
            # Let the agent work on the task with the calculator tool available
            result = await agent.run(question, tools=mcp_server)
            print(f"Agent responses: {result}")

            # The reward returned here is what drives RL training
            return evaluate(result, task["result"])


def main() -> None:
    """Train the math agent with the VERL algorithm.

    Builds the RL training configuration, loads the train and validation
    datasets from ``data/math/``, prints a short preview of each, and then
    starts training with ``Trainer.fit``.
    """
    # Configure RL training
    # This configuration controls all aspects of the RL training process.
    # Key sections: algorithm, data, actor_rollout_ref (rollout, actor, ref, model), trainer
    rl_training_config = {
        "algorithm": {
            # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
            "adv_estimator": "grpo"
        },
        "data": {
            # Uses this many tasks from the dataset to perform rollouts
            "train_batch_size": 8,
            # Used to filter out the over-long prompt-response pairs
            "max_prompt_length": 4096,
            "max_response_length": 1024,
        },
        "actor_rollout_ref": {
            # Controls the rollout process
            "rollout": {
                # Set to 1 unless you want to use TP in multiple GPUs
                "tensor_model_parallel_size": 1,
                # Repeat each task N times. Required by G(rouped)RPO
                "n": 4,
                # Controls the batch size per GPU when computing the log-prob
                "log_prob_micro_batch_size_per_gpu": 2,
                # Controls the multi-turn format (this is bound to the LLM used)
                # See https://docs.vllm.ai/en/stable/features/tool_calling.html
                "multi_turn": {"format": "hermes"},
                # Only vllm is supported for now
                "name": "vllm",
                # Controls the GPU memory utilization of vLLM
                # You might want to set this to under 0.8 to prevent OOM
                "gpu_memory_utilization": 0.7,
            },
            "actor": {
                # Split each sample into sub-batches of this size for PPO
                "ppo_mini_batch_size": 8,
                # Local per-GPU micro batch size
                "ppo_micro_batch_size_per_gpu": 2,
                # Optimizer configuration
                "optim": {"lr": 1e-6},
                # Whether to use KL loss during training
                "use_kl_loss": False,
                # PPO clipping ratios for policy updates
                "clip_ratio_low": 0.2,
                "clip_ratio_high": 0.3,
                # FSDP (Fully Sharded Data Parallel) configuration for memory efficiency
                # Useful when you don't have enough GPU memory
                "fsdp_config": {
                    # Whether to offload parameters to CPU
                    "param_offload": True,
                    # Whether to offload optimizer state to CPU
                    "optimizer_offload": True,
                },
            },
            # Reference model config
            "ref": {
                # Controls the batch size per GPU when computing log-prob for reference model
                "log_prob_micro_batch_size_per_gpu": 2,
                "fsdp_config": {"param_offload": True},
            },
            # Common configs for the model
            "model": {
                # Huggingface model path.
                # If you want to train a different model, change the path here.
                "path": "Qwen/Qwen2.5-1.5B-Instruct",
                # Whether to remove padding tokens in inputs during training
                "use_remove_padding": True,
                # Enable gradient checkpointing for memory efficiency
                "enable_gradient_checkpointing": True,
            },
        },
        # Config for the trainer
        "trainer": {
            # Number of GPUs per node
            "n_gpus_per_node": 1,
            # Whether to run validation before training begins
            "val_before_train": True,
            # Logging backends to use: "console", "wandb", etc.
            "logger": ["console"],
            # Number of nodes used in the training
            "nnodes": 1,
            # Validation frequency (in training iterations)
            "test_freq": 4,
            # Number of epochs in training
            "total_epochs": 2,
        },
    }

    # Load your datasets
    train_dataset = _load_jsonl("data/math/train.jsonl")
    val_dataset = _load_jsonl("data/math/test.jsonl")

    # Preview the data to ensure it's loaded correctly.
    # The preview is capped at the dataset size so that a dataset with fewer
    # than 5 rows does not raise IndexError before training even starts.
    print("First 5 rows of train dataset:")
    for i in range(min(5, len(train_dataset))):
        print(train_dataset[i])
    print("First 5 rows of val dataset:")
    for i in range(min(5, len(val_dataset))):
        print(val_dataset[i])

    # Create trainer with VERL algorithm and start training
    # n_workers: Number of rollout workers (processes) for parallel data collection
    trainer = Trainer(algorithm=VERL(rl_training_config), tracer=AgentFrameworkTracer(), n_workers=2)

    # This starts the actual RL training loop:
    # 1. Collect rollouts using current model
    # 2. Compute advantages and train the model
    # 3. Repeat for specified number of epochs
    trainer.fit(math_agent, train_dataset, val_dataset=val_dataset)


def debug():
    """Run the agent function once on the first training sample, without training.

    Always run debug mode first before starting expensive RL training!
    Requires both OPENAI_API_KEY and OPENAI_BASE_URL to be set.
    """
    first_task = _load_jsonl("data/math/train.jsonl")[0]

    # Debugging uses a known good model (not the one being trained)
    debug_llm_name = "gpt-4o-mini"
    endpoint = os.environ.get("OPENAI_BASE_URL")
    if os.environ.get("OPENAI_API_KEY") is None:
        raise ValueError("OPENAI_API_KEY must be set")
    if endpoint is None:
        raise ValueError("OPENAI_BASE_URL must be set")

    # Exercise the agent function on the sample task
    debug_llm = LLM(model=debug_llm_name, endpoint=endpoint)
    asyncio.run(math_agent(first_task, debug_llm))  # type: ignore


if __name__ == "__main__":
    # `--debug` runs the agent once on a single sample; without it, training starts.
    cli = argparse.ArgumentParser()
    cli.add_argument("--debug", action="store_true")
    entrypoint = debug if cli.parse_args().debug else main
    entrypoint()