# agent-framework/python/scripts/check_md_code_blocks.py

# Copyright (c) Microsoft. All rights reserved.

"""Check code blocks in Markdown files for syntax errors."""

import argparse
from enum import Enum
import glob
import logging
import os
import tempfile
import subprocess  # nosec

from pygments import highlight  # type: ignore
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonLexer

# Module logger: INFO and above are emitted through a dedicated stream handler
# so progress and failure reports show up without any external logging setup.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())


class Colors(str, Enum):
    """ANSI SGR escape sequences used to colorize terminal output.

    Each member's value is the raw escape string (``"\\33"`` is the ESC
    character); ``with_color`` reads it through ``.value``.
    """

    # Reset all attributes back to the terminal default.
    CEND = "\33[0m"
    # Red foreground.
    CRED = "\33[31m"
    # Red background.
    CREDBG = "\33[41m"
    # Green foreground.
    CGREEN = "\33[32m"
    # Green background.
    CGREENBG = "\33[42m"
    # Magenta ("violet") foreground.
    CVIOLET = "\33[35m"
    # Bright-black ("grey") foreground.
    CGREY = "\33[90m"


def with_color(text: str, color: Colors) -> str:
    """Return ``text`` wrapped in the escape sequence for ``color``.

    The result starts with the color's escape code and ends with the reset
    code (``Colors.CEND``). Nothing is printed; the caller decides where the
    string goes.
    """
    start = color.value
    reset = Colors.CEND.value
    return f"{start}{text}{reset}"


def expand_file_patterns(patterns: list[str], skip_glob: bool = False) -> list[str]:
    """Resolve file arguments to a sorted, de-duplicated list of paths.

    Args:
        patterns: Glob patterns, or literal file paths when ``skip_glob`` is True.
        skip_glob: When False, every pattern is expanded with recursive glob
            matching (``**`` is supported) and all matches are kept. When True,
            each entry is taken as a literal path: it is kept only if it ends
            with ``.md`` and exists on disk, and glob metacharacters in the
            name (``*``, ``?``, ``[``) are not interpreted.

    Returns:
        The matching paths, without duplicates, in sorted order.
    """
    found: set[str] = set()
    for pattern in patterns:
        if not skip_glob:
            # Handle both relative and absolute paths with glob expansion.
            found.update(glob.glob(pattern, recursive=True))
        elif pattern.endswith(".md") and os.path.lexists(pattern):
            # Literal mode: check existence directly instead of going through
            # glob, so a file such as "notes[1].md" is found under its real
            # name. lexists matches what glob does for magic-free patterns.
            found.add(pattern)
    return sorted(found)


def extract_python_code_blocks(markdown_file_path: str) -> list[tuple[str, int]]:
    """Extract Python code blocks from a Markdown file.

    A block starts at a line whose stripped text begins with ```` ```python ````
    and ends at the next line whose stripped text begins with ```` ``` ````.
    Fences of any other language are skipped entirely. A Python block that is
    never closed before the end of the file is not returned.

    Args:
        markdown_file_path: Path of the Markdown file, read as UTF-8.

    Returns:
        One ``(code, line_no)`` tuple per Python block, in file order. ``code``
        is the block's lines exactly as they appear in the file (each keeps its
        own line ending); ``line_no`` is the 1-based line number of the block's
        first code line (the line after the opening fence).
    """
    with open(markdown_file_path, encoding="utf-8") as file:
        lines = file.readlines()

    code_blocks: list[tuple[str, int]] = []
    in_code_block = False
    current_block: list[str] = []

    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped.startswith("```python"):
            in_code_block = True
            current_block = []
        elif stripped.startswith("```"):
            # Only a fence that closes an open Python block yields a result;
            # fences belonging to other languages must not re-emit the
            # previous block or produce empty ones.
            if in_code_block:
                # Lines from readlines() already end with a newline, so they
                # are concatenated as-is rather than joined with "\n".
                code_blocks.append(("".join(current_block), i - len(current_block) + 1))
            in_code_block = False
        elif in_code_block:
            current_block.append(line)

    return code_blocks


def _should_check_block(code_block: str) -> bool:
    """Return True when a snippet imports agent_framework and does not mention agent_framework.lab."""
    imports_framework = "import agent_framework" in code_block or "from agent_framework" in code_block
    return imports_framework and "agent_framework.lab" not in code_block


def _run_pyright(code_block: str) -> "subprocess.CompletedProcess[str]":
    """Write a snippet into a throwaway project directory and run pyright on it via ``uv run``."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Use the same rules as pyrightconfig.samples.json:
        # typeCheckingMode=off, only reportMissingImports and reportAttributeAccessIssue enabled.
        pyright_cfg = os.path.join(tmp_dir, "pyrightconfig.json")
        with open(pyright_cfg, "w", encoding="utf-8") as cfg:
            cfg.write(
                '{"include":["."],"typeCheckingMode":"off",'
                '"reportMissingImports":"error","reportAttributeAccessIssue":"error"}'
            )
        tmp_file = os.path.join(tmp_dir, "snippet.py")
        with open(tmp_file, "w", encoding="utf-8") as f:
            f.write(code_block)

        return subprocess.run(["uv", "run", "pyright", "-p", tmp_dir], capture_output=True, text=True, cwd=".")  # nosec


def check_code_blocks(markdown_file_paths: list[str], exclude_patterns: list[str] | None = None) -> None:
    """Run pyright over the Python code blocks of the given Markdown files.

    Each block that imports ``agent_framework`` (and does not mention
    ``agent_framework.lab``) is written to a temporary directory and checked
    with ``uv run pyright``. Only ``reportMissingImports`` diagnostics that
    mention ``agent_framework`` and ``reportAttributeAccessIssue`` diagnostics
    count as failures. Progress and failure reports are written to the module
    logger.

    Args:
        markdown_file_paths: Paths of the Markdown files to check.
        exclude_patterns: Substrings; a file whose path contains any of them
            is skipped. ``None`` means nothing is excluded.

    Raises:
        RuntimeError: If pyright itself could not run for a block (exit code
            other than 0 or 1), or, after all files were processed, if any
            file contained a failing block.
    """
    files_with_errors: list[str] = []
    exclude_patterns = exclude_patterns or []

    for markdown_file_path in markdown_file_paths:
        # Skip files that match any exclude pattern
        if any(pattern in markdown_file_path for pattern in exclude_patterns):
            logger.info(f"Skipping {markdown_file_path} (matches exclude pattern)")
            continue

        had_errors = False
        for code_block, line_no in extract_python_code_blocks(markdown_file_path):
            markdown_file_path_with_line_no = f"{markdown_file_path}:{line_no}"
            logger.info("Checking a code block in %s...", markdown_file_path_with_line_no)

            # Skip blocks that don't import agent_framework modules or import lab modules
            if not _should_check_block(code_block):
                logger.info(f' {with_color("OK[ignored]", Colors.CGREENBG)}')
                continue

            result = _run_pyright(code_block)
            # pyright exits with 0 (clean) or 1 (diagnostics reported). Any other
            # code means the tool could not do its job; without this check the
            # empty output would be mistaken for a passing snippet.
            if result.returncode not in (0, 1):
                raise RuntimeError(
                    f"pyright failed to run for {markdown_file_path_with_line_no} "
                    f"(exit code {result.returncode}):\n{result.stderr or result.stdout}"
                )

            # Filter to only errors from our config rules; syntax-level errors
            # (top-level await, etc.) are expected in README documentation snippets.
            # Only flag reportMissingImports for agent_framework modules, not third-party packages.
            relevant_errors = [
                line
                for line in result.stdout.splitlines()
                if ("reportMissingImports" in line and "agent_framework" in line)
                or "reportAttributeAccessIssue" in line
            ]
            if not relevant_errors:
                logger.info(f" {with_color('OK', Colors.CGREENBG)}")
                continue

            highlighted_code = highlight(code_block, PythonLexer(), TerminalFormatter())  # type: ignore
            logger.info(
                f" {with_color('FAIL', Colors.CREDBG)}\n"
                f"{with_color('========================================================', Colors.CGREY)}\n"
                f"{with_color('Error', Colors.CRED)}: Pyright found issues in {with_color(markdown_file_path_with_line_no, Colors.CVIOLET)}:\n"
                f"{with_color('--------------------------------------------------------', Colors.CGREY)}\n"
                f"{highlighted_code}\n"
                f"{with_color('--------------------------------------------------------', Colors.CGREY)}\n"
                "\n"
                f"{with_color('pyright output:', Colors.CVIOLET)}\n"
                f"{with_color(result.stdout, Colors.CRED)}"
                f"{with_color('========================================================', Colors.CGREY)}\n"
            )
            had_errors = True

        if had_errors:
            files_with_errors.append(markdown_file_path)

    if files_with_errors:
        raise RuntimeError("Syntax errors found in the following files:\n" + "\n".join(files_with_errors))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check code blocks in Markdown files for syntax errors.")
    # Argument is a list of markdown files containing glob patterns
    parser.add_argument("markdown_files", nargs="+", help="Markdown files to check (supports glob patterns).")
    parser.add_argument("--exclude", action="append", help="Exclude files containing this pattern.")
    parser.add_argument("--no-glob", action="store_true", help="Treat file arguments as literal paths (no glob expansion).")
    args = parser.parse_args()

    # Expand glob patterns to actual file paths (or skip if --no-glob)
    expanded_files = expand_file_patterns(args.markdown_files, skip_glob=args.no_glob)
    check_code_blocks(expanded_files, args.exclude)