diff --git a/.github/workflows/markdown-link-check.yml b/.github/workflows/markdown-link-check.yml index 0e59e4254f..3f6a66fd4d 100644 --- a/.github/workflows/markdown-link-check.yml +++ b/.github/workflows/markdown-link-check.yml @@ -23,6 +23,14 @@ jobs: with: persist-credentials: false + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Install Chrome for Puppeteer + run: npx puppeteer browsers install chrome + # Checks the status of hyperlinks in all files - name: Run linkspector uses: umbrelladocs/action-linkspector@963b6264d7de32c904942a70b488d3407453049e # v1 diff --git a/python/packages/core/AGENTS.md b/python/packages/core/AGENTS.md index ed47f363a2..ffb6b3e2c5 100644 --- a/python/packages/core/AGENTS.md +++ b/python/packages/core/AGENTS.md @@ -76,6 +76,14 @@ agent_framework/ - **`SkillScriptRunner`** - Protocol for file-based script execution. Any callable matching `(skill, script, args) -> Any` satisfies it. Code-defined scripts do not use a runner. - **`SkillsProvider`** - Context provider (extends `ContextProvider`) that discovers file-based skills from `SKILL.md` files and/or accepts code-defined `Skill` instances. Follows progressive disclosure: advertise → load → read resources / run scripts. +### File Access Harness (`_harness/_file_access.py`) + +- **`AgentFileStore`** - Abstract async store backing the file-access harness. Implementations expose `write_file`, `read_file`, `delete_file`, `list_files`, `file_exists`, `search_files`, and `create_directory` over forward-slash relative paths. +- **`InMemoryAgentFileStore`** - Dict-backed store suitable for tests and lightweight scenarios. +- **`FileSystemAgentFileStore`** - Disk-backed store rooted under a configurable directory. Enforces relative-path normalization, root containment, and rejects symlink/reparse-point segments to prevent escape. +- **`FileSearchResult`** / **`FileSearchMatch`** - `SerializationMixin` DTOs returned by `search_files`, carrying the matching file name, a context snippet, and the matching lines with 1-based line numbers. +- **`FileAccessProvider`** - `ContextProvider` that adds shared file-access tools (`file_access_save_file`, `file_access_read_file`, `file_access_delete_file`, `file_access_list_files`, `file_access_search_files`) plus default usage instructions to each invocation. Unlike `MemoryContextProvider`, the store is intentionally shared across sessions and agents. + ### Workflows (`_workflows/`) - **`Workflow`** - Graph-based workflow definition diff --git a/python/packages/core/agent_framework/__init__.py b/python/packages/core/agent_framework/__init__.py index d199388b17..cc517d993e 100644 --- a/python/packages/core/agent_framework/__init__.py +++ b/python/packages/core/agent_framework/__init__.py @@ -90,6 +90,16 @@ from ._harness._background_agents import ( BackgroundTaskInfo, BackgroundTaskStatus, ) +from ._harness._file_access import ( + DEFAULT_FILE_ACCESS_INSTRUCTIONS, + DEFAULT_FILE_ACCESS_SOURCE_ID, + AgentFileStore, + FileAccessProvider, + FileSearchMatch, + FileSearchResult, + FileSystemAgentFileStore, + InMemoryAgentFileStore, +) from ._harness._memory import ( DEFAULT_MEMORY_SOURCE_ID, MemoryContextProvider, @@ -309,6 +319,8 @@ __all__ = [ "APP_INFO", "COMPACTION_STATE_KEY", "DEFAULT_BACKGROUND_AGENTS_SOURCE_ID", + "DEFAULT_FILE_ACCESS_INSTRUCTIONS", + "DEFAULT_FILE_ACCESS_SOURCE_ID", "DEFAULT_HARNESS_INSTRUCTIONS", "DEFAULT_MAX_ITERATIONS", "DEFAULT_MEMORY_SOURCE_ID", @@ -334,6 +346,7 @@ __all__ = [ "AgentExecutor", "AgentExecutorRequest", "AgentExecutorResponse", + "AgentFileStore", "AgentFrameworkException", "AgentMiddleware", "AgentMiddlewareLayer", @@ -393,11 +406,15 @@ __all__ = [ "ExperimentalFeature", "FanInEdgeGroup", "FanOutEdgeGroup", + "FileAccessProvider", "FileCheckpointStorage", "FileHistoryProvider", + "FileSearchMatch", + "FileSearchResult", "FileSkill", "FileSkillScript", "FileSkillsSource", + "FileSystemAgentFileStore", "FilteringSkillsSource", "FinalT", "FinishReason", @@ -414,6 +431,7 @@ __all__ = [ "GeneratedEmbeddings", "GraphConnectivityError", "HistoryProvider", + "InMemoryAgentFileStore", "InMemoryCheckpointStorage", "InMemoryHistoryProvider", "InMemorySkillsSource", diff --git a/python/packages/core/agent_framework/_harness/_file_access.py b/python/packages/core/agent_framework/_harness/_file_access.py new file mode 100644 index 0000000000..08632827c9 --- /dev/null +++ b/python/packages/core/agent_framework/_harness/_file_access.py @@ -0,0 +1,1018 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""File-access harness provider exposing CRUD/search tools backed by an ``AgentFileStore``. + +Unlike :class:`~agent_framework.MemoryContextProvider`, which provides +session-scoped memory that may be isolated per session, :class:`FileAccessProvider` +operates on a shared, persistent storage area whose contents are visible across +sessions and agents. The provider exposes five tools — ``file_access_save_file``, +``file_access_read_file``, ``file_access_delete_file``, ``file_access_list_files``, +and ``file_access_search_files`` — by registering them on the per-invocation +:class:`~agent_framework.SessionContext` in :meth:`FileAccessProvider.before_run`. + +The store abstraction is generic so callers can plug in in-memory, local-disk, or +remote-blob backends. Two backends are shipped here: + +* :class:`InMemoryAgentFileStore` — dict-backed, suitable for tests. +* :class:`FileSystemAgentFileStore` — disk-backed, with traversal and symlink + protections. +""" + +from __future__ import annotations + +import asyncio +import errno +import fnmatch +import logging +import os +import re +from abc import ABC, abstractmethod +from collections.abc import Callable, Mapping, MutableMapping +from pathlib import Path +from typing import Any, cast + +from .._feature_stage import ExperimentalFeature, experimental +from .._serialization import SerializationMixin +from .._sessions import AgentSession, ContextProvider, SessionContext +from .._tools import ApprovalMode, tool + +logger = logging.getLogger(__name__) + +DEFAULT_FILE_ACCESS_SOURCE_ID = "file_access" +DEFAULT_FILE_ACCESS_INSTRUCTIONS = ( + "## File Access\n" + "You have access to a shared file storage area via the `file_access_*` tools " + "for reading, writing, and managing files.\n" + "These files persist beyond the current session and may be shared across " + "sessions or agents.\n" + "Use these tools to read input data provided by the user, write output " + "artifacts, and manage any files the user has asked you to work with.\n\n" + "- Never delete or overwrite existing files unless the user has explicitly " + "asked you to do so." +) + +# Maximum number of characters of context to include on either side of the first +# regex match when building a result snippet. +_SEARCH_SNIPPET_RADIUS = 50 + +# Hard cap on the length of a user-supplied search regex. Python's ``re`` module +# has no built-in timeout, so a catastrophic-backtracking pattern (such as +# ``(a+)+$``) submitted by the model could spin the CPU indefinitely. The cap +# alone does not stop short pathological patterns, so :meth:`search_files` +# additionally executes the regex scan in a worker thread and bounds the wall +# clock with :data:`_SEARCH_TIMEOUT_SECONDS`. The thread itself cannot be +# safely interrupted from Python, so a runaway scan continues until the +# regex engine returns, but the caller and event loop stay responsive. +_MAX_SEARCH_PATTERN_LENGTH = 256 +_SEARCH_TIMEOUT_SECONDS = 10.0 + +# Errno raised by POSIX ``open`` when ``O_NOFOLLOW`` was requested and the +# leaf path component is a symbolic link. Used to translate the kernel-level +# refusal into the same :class:`ValueError` the static probe raises so the +# caller can treat the two cases uniformly. +_ELOOP = errno.ELOOP + + +def _compile_search_regex(pattern: str) -> re.Pattern[str]: + """Compile a case-insensitive search regex, enforcing the length cap. + + Raises: + ValueError: When ``pattern`` exceeds ``_MAX_SEARCH_PATTERN_LENGTH`` characters. + re.error: When ``pattern`` is not a valid regular expression. + """ + if len(pattern) > _MAX_SEARCH_PATTERN_LENGTH: + raise ValueError( + f"Regex pattern is too long ({len(pattern)} characters). " + f"Maximum supported length is {_MAX_SEARCH_PATTERN_LENGTH} characters." + ) + return re.compile(pattern, flags=re.IGNORECASE) + + +async def _run_search_with_timeout( + fn: Callable[[], list[FileSearchResult]], +) -> list[FileSearchResult]: + """Run ``fn`` in a worker thread with a bounded wall-clock timeout. + + Raises: + ValueError: When the search does not complete within + :data:`_SEARCH_TIMEOUT_SECONDS` seconds. + """ + try: + return await asyncio.wait_for(asyncio.to_thread(fn), timeout=_SEARCH_TIMEOUT_SECONDS) + except asyncio.TimeoutError as exc: + # On Python 3.10 ``asyncio.wait_for`` raises ``asyncio.TimeoutError`` + # which is distinct from the builtin ``TimeoutError`` (the two were + # unified in 3.11). Catching the asyncio alias works on every + # supported version. + raise ValueError( + f"Regex search did not complete within {_SEARCH_TIMEOUT_SECONDS:g} seconds. " + "Use a more specific pattern (avoid nested quantifiers such as '(a+)+')." + ) from exc + + +def _normalize_relative_path(path: str, *, is_directory: bool = False) -> str: + """Normalize and validate a relative store path. + + Trims surrounding whitespace, replaces backslashes with forward slashes, + collapses repeated separators, and rejects rooted paths, drive letters, and + ``.``/``..`` segments. When ``is_directory`` is True, an empty result is + allowed and represents the root; otherwise an empty result is rejected and + trailing separators are not accepted (so ``"foo/"`` does not silently + become the file path ``"foo"``). + + Args: + path: The relative path to normalize. + + Keyword Args: + is_directory: Whether the path represents a directory (allows empty + results and trailing separators) or a file (rejects empty results + and trailing separators). + + Returns: + The normalized forward-slash relative path. + + Raises: + ValueError: When the path is rooted, starts with a drive letter, contains + ``.``/``..`` segments, is empty for a file path, or ends with a + separator for a file path. + """ + if not path or not path.strip(): + if not is_directory: + raise ValueError("A file path must not be empty or whitespace-only.") + return "" + + # Trim surrounding whitespace so spaces never leak into file segments. + path = path.strip() + converted = path.replace("\\", "/") + + # For file paths reject trailing separators so a directory-shaped string + # such as ``"foo/"`` is never silently treated as the file ``"foo"``. + if not is_directory and converted.endswith("/"): + raise ValueError(f"Invalid path: {path!r}. A file path must not end with a path separator.") + + normalized = converted.strip("/") + + if ( + os.path.isabs(path) + or path.startswith(("/", "\\")) + or (len(normalized) >= 2 and normalized[0].isalpha() and normalized[1] == ":") + ): + raise ValueError( + f"Invalid path: {path!r}. Paths must be relative and must not start with '/', '\\', or a drive root." + ) + + clean_segments: list[str] = [] + for segment in normalized.split("/"): + if not segment: + continue + if segment in (".", ".."): + raise ValueError(f"Invalid path: {path!r}. Paths must not contain '.' or '..' segments.") + clean_segments.append(segment) + + result = "/".join(clean_segments) + if not is_directory and not result: + raise ValueError(f"Invalid path: {path!r}. A file path must not be empty.") + return result + + +def _matches_glob(file_name: str, pattern: str | None) -> bool: + """Return whether ``file_name`` matches the optional glob pattern (case-insensitive). + + When ``pattern`` is ``None`` or blank this returns True so callers can skip + filtering by passing nothing. Matching uses :func:`fnmatch.fnmatchcase` over a + lowercased pattern/name pair to give consistent results across operating + systems (``fnmatch.fnmatch`` is case-sensitive on POSIX but not on Windows). + """ + if pattern is None or not pattern.strip(): + return True + return fnmatch.fnmatchcase(file_name.lower(), pattern.lower()) + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileSearchMatch(SerializationMixin): + """Represent one line within a file that matched a search pattern.""" + + line_number: int + line: str + __slots__ = ("line", "line_number") + + def __init__(self, line_number: int, line: str) -> None: + r"""Initialize one search match. + + Args: + line_number: The 1-based line number where the match was found. + line: The content of the matching line (trailing ``\r`` removed). + """ + if line_number < 1: + raise ValueError("line_number must be a positive integer.") + self.line_number = line_number + self.line = line + + def to_dict(self, *, exclude: set[str] | None = None, exclude_none: bool = True) -> dict[str, Any]: + """Serialize this match to a JSON-compatible dictionary. + + Overrides :meth:`SerializationMixin.to_dict` because this DTO is + declared with ``__slots__``: the base implementation iterates + ``self.__dict__`` which is empty for slotted classes and would emit + only the auto-injected ``type`` field. The ``exclude`` / + ``exclude_none`` arguments are accepted (and discarded) so the + signature remains drop-in compatible with the mixin — callers like + :meth:`SerializationMixin.to_json` always forward them. + """ + del exclude, exclude_none + return {"line_number": self.line_number, "line": self.line} + + @classmethod + def from_dict( + cls, raw_match: MutableMapping[str, Any], /, *, dependencies: MutableMapping[str, Any] | None = None + ) -> FileSearchMatch: + """Parse one search match from its dict representation.""" + del dependencies + line_number = raw_match.get("line_number") + line = raw_match.get("line", "") + if not isinstance(line_number, int) or isinstance(line_number, bool): + raise ValueError("FileSearchMatch.line_number must be an integer.") + if not isinstance(line, str): + raise ValueError("FileSearchMatch.line must be a string.") + return cls(line_number=line_number, line=line) + + def __eq__(self, other: object) -> bool: + """Return whether two matches have the same values.""" + return isinstance(other, FileSearchMatch) and self.to_dict() == other.to_dict() + + def __repr__(self) -> str: + """Return a helpful debug representation.""" + return f"FileSearchMatch(line_number={self.line_number!r}, line={self.line!r})" + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileSearchResult(SerializationMixin): + """Represent the search result for one file: the file name, a snippet, and the matching lines.""" + + file_name: str + snippet: str + matching_lines: list[FileSearchMatch] + __slots__ = ("file_name", "matching_lines", "snippet") + + def __init__( + self, + file_name: str, + snippet: str = "", + matching_lines: list[FileSearchMatch] | None = None, + ) -> None: + """Initialize one search result. + + Args: + file_name: The name of the file that matched the search. + snippet: A short context snippet around the first match. + matching_lines: The list of matching lines within the file. + """ + self.file_name = file_name + self.snippet = snippet + self.matching_lines = list(matching_lines) if matching_lines is not None else [] + + def to_dict(self, *, exclude: set[str] | None = None, exclude_none: bool = True) -> dict[str, Any]: + """Serialize this result to a JSON-compatible dictionary. + + Overrides :meth:`SerializationMixin.to_dict` for the same reason as + :meth:`FileSearchMatch.to_dict`: this DTO uses ``__slots__`` so the + base implementation cannot introspect the payload. The ``exclude`` / + ``exclude_none`` arguments are accepted and ignored to preserve + signature compatibility with the mixin. + """ + del exclude, exclude_none + return { + "file_name": self.file_name, + "snippet": self.snippet, + "matching_lines": [match.to_dict() for match in self.matching_lines], + } + + @classmethod + def from_dict( + cls, raw_result: MutableMapping[str, Any], /, *, dependencies: MutableMapping[str, Any] | None = None + ) -> FileSearchResult: + """Parse one search result from its dict representation.""" + del dependencies + file_name = raw_result.get("file_name", "") + snippet = raw_result.get("snippet", "") + raw_matching_lines = raw_result.get("matching_lines", []) + if not isinstance(file_name, str): + raise ValueError("FileSearchResult.file_name must be a string.") + if not isinstance(snippet, str): + raise ValueError("FileSearchResult.snippet must be a string.") + if not isinstance(raw_matching_lines, list): + raise ValueError("FileSearchResult.matching_lines must be a list.") + matching_lines: list[FileSearchMatch] = [] + for item in cast(list[object], raw_matching_lines): + if not isinstance(item, Mapping): + raise ValueError("FileSearchResult.matching_lines elements must be mappings.") + matching_lines.append(FileSearchMatch.from_dict(cast(MutableMapping[str, Any], item))) + return cls(file_name=file_name, snippet=snippet, matching_lines=matching_lines) + + def __eq__(self, other: object) -> bool: + """Return whether two results have the same values.""" + return isinstance(other, FileSearchResult) and self.to_dict() == other.to_dict() + + def __repr__(self) -> str: + """Return a helpful debug representation.""" + return ( + "FileSearchResult(" + f"file_name={self.file_name!r}, snippet={self.snippet!r}, matching_lines={self.matching_lines!r})" + ) + + +def _search_file_content(file_name: str, content: str, regex: re.Pattern[str]) -> FileSearchResult | None: + r"""Search one file's content and return a :class:`FileSearchResult` if any lines match. + + Lines are split on ``\n`` (so ``\r`` at the end of each line is stripped on + the matching line itself). A snippet of up to ``±_SEARCH_SNIPPET_RADIUS`` + characters around the first match is included. Returns ``None`` when no + lines match. + """ + lines = content.split("\n") + matching_lines: list[FileSearchMatch] = [] + first_snippet: str | None = None + line_start_offset = 0 + + for line_number, line in enumerate(lines, start=1): + match = regex.search(line) + if match is not None: + matching_lines.append(FileSearchMatch(line_number=line_number, line=line.rstrip("\r"))) + if first_snippet is None: + char_index = line_start_offset + match.start() + snippet_start = max(0, char_index - _SEARCH_SNIPPET_RADIUS) + snippet_end = min(len(content), char_index + (match.end() - match.start()) + _SEARCH_SNIPPET_RADIUS) + first_snippet = content[snippet_start:snippet_end] + # Advance past this line and the implied '\n' separator. + line_start_offset += len(line) + 1 + + if not matching_lines: + return None + return FileSearchResult( + file_name=file_name, + snippet=first_snippet or "", + matching_lines=matching_lines, + ) + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class AgentFileStore(ABC): + """Abstract base class for file storage operations used by :class:`FileAccessProvider`. + + All paths are relative to an implementation-defined root. Implementations may + map these paths to a local file system, in-memory store, remote blob storage, + or other mechanisms. Paths use forward slashes as separators and must not + escape the root (e.g., via ``..`` segments). Implementations are responsible + for enforcing that invariant. + """ + + @abstractmethod + async def write_file(self, path: str, content: str, *, overwrite: bool = True) -> None: + """Write ``content`` to the file at ``path``. + + Args: + path: The relative path of the file to write. + content: The content to write to the file. + + Keyword Args: + overwrite: When ``True`` (default) any existing file is replaced. + When ``False`` the implementation must perform an atomic + exclusive create and raise :class:`FileExistsError` if a file + already exists at ``path``. + + Raises: + FileExistsError: When ``overwrite`` is ``False`` and a file already + exists at ``path``. + """ + + @abstractmethod + async def read_file(self, path: str) -> str | None: + """Read the content of the file at ``path``. + + Args: + path: The relative path of the file to read. + + Returns: + The file content, or ``None`` if the file does not exist. + """ + + @abstractmethod + async def delete_file(self, path: str) -> bool: + """Delete the file at ``path``. + + Args: + path: The relative path of the file to delete. + + Returns: + ``True`` if the file was deleted; ``False`` if it did not exist. + """ + + @abstractmethod + async def list_files(self, directory: str = "") -> list[str]: + """List the direct child files of ``directory``. + + Args: + directory: The relative directory path to list. Use ``""`` for the root. + + Returns: + The list of file names (not full paths) in the specified directory. + """ + + @abstractmethod + async def file_exists(self, path: str) -> bool: + """Return whether a file exists at ``path``. + + Args: + path: The relative path of the file to check. + """ + + @abstractmethod + async def search_files( + self, + directory: str, + regex_pattern: str, + file_pattern: str | None = None, + ) -> list[FileSearchResult]: + """Search files in ``directory`` for content matching ``regex_pattern``. + + Args: + directory: The relative directory to search. Use ``""`` for the root. + regex_pattern: A regular expression matched against file contents + (case-insensitive). For example, ``"error|warning"`` matches lines + containing ``"error"`` or ``"warning"``. + file_pattern: An optional glob pattern (case-insensitive) used to + filter which files are searched. When ``None`` or blank, every + file in the directory is searched. + + Returns: + The list of files whose content matched, with snippet and matching + line metadata. + """ + + @abstractmethod + async def create_directory(self, path: str) -> None: + """Ensure ``path`` exists as a directory, creating it if necessary.""" + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class InMemoryAgentFileStore(AgentFileStore): + """An in-memory :class:`AgentFileStore` backed by a dict. + + Suitable for tests and lightweight scenarios where persistence is not + required. Directory concepts are simulated using path prefixes — no explicit + directory structure is maintained. + """ + + def __init__(self) -> None: + """Initialize an empty in-memory file store.""" + # Keys are case-insensitive (normalized + lowercased) so the store + # behaves consistently on case-insensitive deployments. Each entry + # also records the *original* normalized path so ``list_files`` and + # ``search_files`` return display names that match what the caller + # wrote, mirroring how :class:`FileSystemAgentFileStore` preserves the + # on-disk casing. + self._files: dict[str, tuple[str, str]] = {} + self._lock = asyncio.Lock() + + @staticmethod + def _key(path: str) -> str: + return _normalize_relative_path(path).lower() + + async def write_file(self, path: str, content: str, *, overwrite: bool = True) -> None: + """Write ``content`` to the file at ``path``. + + When ``overwrite`` is ``False`` the check-and-write happens under the + store lock so concurrent callers cannot both observe a missing file + and race to create it. + """ + display = _normalize_relative_path(path) + key = display.lower() + async with self._lock: + if not overwrite and key in self._files: + raise FileExistsError(f"File already exists: {path!r}") + self._files[key] = (display, content) + + async def read_file(self, path: str) -> str | None: + """Return the file content, or ``None`` if the file does not exist.""" + key = self._key(path) + async with self._lock: + entry = self._files.get(key) + return entry[1] if entry is not None else None + + async def delete_file(self, path: str) -> bool: + """Delete the file and return whether anything was removed.""" + key = self._key(path) + async with self._lock: + return self._files.pop(key, None) is not None + + async def list_files(self, directory: str = "") -> list[str]: + """Return the direct child files of ``directory``. + + Returns the *original-case* file names that were written, so a caller + that does ``write_file("Plan.MD", ...)`` then ``list_files()`` gets + back ``["Plan.MD"]`` rather than ``["plan.md"]``. This matches the + behaviour of :class:`FileSystemAgentFileStore` on case-preserving + filesystems. + """ + prefix = _normalize_relative_path(directory, is_directory=True).lower() + if prefix and not prefix.endswith("/"): + prefix += "/" + async with self._lock: + entries = [(key, display) for key, (display, _) in self._files.items()] + results: list[str] = [] + for key, display in entries: + if not key.startswith(prefix): + continue + if "/" in key[len(prefix) :]: + continue + # ``display`` is the original-case normalized path; strip the + # directory prefix using the same length we matched on ``key``. + results.append(display[len(prefix) :]) + return results + + async def file_exists(self, path: str) -> bool: + """Return whether the file exists.""" + key = self._key(path) + async with self._lock: + return key in self._files + + async def search_files( + self, + directory: str, + regex_pattern: str, + file_pattern: str | None = None, + ) -> list[FileSearchResult]: + """Search file contents for ``regex_pattern`` matches. + + Snapshots the entries under the store lock and offloads the regex scan + to a worker thread with a bounded timeout so a pathological pattern + cannot stall the event loop. Returned :class:`FileSearchResult` + instances use the *original-case* file names so the result mirrors + what :class:`FileSystemAgentFileStore` would produce. + """ + prefix = _normalize_relative_path(directory, is_directory=True).lower() + if prefix and not prefix.endswith("/"): + prefix += "/" + regex = _compile_search_regex(regex_pattern) + + async with self._lock: + entries = [(key, display, content) for key, (display, content) in self._files.items()] + + def scan() -> list[FileSearchResult]: + results: list[FileSearchResult] = [] + for key, display, file_content in entries: + if not key.startswith(prefix): + continue + relative_key = key[len(prefix) :] + if "/" in relative_key: + continue + relative_display = display[len(prefix) :] + if not _matches_glob(relative_display, file_pattern): + continue + result = _search_file_content(relative_display, file_content, regex) + if result is not None: + results.append(result) + return results + + return await _run_search_with_timeout(scan) + + async def create_directory(self, path: str) -> None: + """No-op: directories are implicit from file paths in the in-memory store.""" + del path + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileSystemAgentFileStore(AgentFileStore): + """A disk-backed :class:`AgentFileStore` rooted under a configurable directory. + + All paths are resolved relative to the root directory provided at + construction time. Lexical path traversal attempts (for example, via ``..`` + segments or absolute paths) are rejected with :class:`ValueError`. The root + directory is created automatically if it does not already exist. + + Symbolic links and reparse points anywhere along the resolved path are + rejected on read, write, delete, list, and existence checks. The check is + a probe followed by an open: on POSIX the open also passes ``O_NOFOLLOW`` + so the kernel refuses if the leaf segment becomes a symlink between the + probe and the open. On Windows the protection is best-effort: it covers + the static case (a symlink already exists when the call is made) but + cannot cover an adversarial caller that swaps a parent directory for a + symlink between the probe and the file open. This store is designed for + single-tenant or co-operating-tenant use; it is not a sandbox against a + hostile process that shares the root directory. + """ + + def __init__(self, root_directory: str | os.PathLike[str]) -> None: + """Initialize the file-system store. + + Args: + root_directory: The directory under which all files are stored. + Created if it does not exist. + """ + raw_root = os.fspath(root_directory) + if not raw_root or not raw_root.strip(): + raise ValueError("root_directory must not be empty or whitespace-only.") + root_path = Path(raw_root).resolve() + root_path.mkdir(parents=True, exist_ok=True) + self._root_path = root_path + + @property + def root_path(self) -> Path: + """Return the resolved root directory.""" + return self._root_path + + def _resolve_safe_path(self, relative_path: str) -> Path: + """Resolve a relative file path safely under the root directory. + + Symbolic links and reparse points are detected on the *unresolved* path + before any call to :meth:`~pathlib.Path.resolve`. ``Path.resolve`` + collapses symbolic links, so probing for them on a resolved path would + either silently follow in-root symlinks or produce a misleading + "escapes the root" error for out-of-root targets. Checking the + unresolved candidate first keeps the rejection deterministic and gives + the caller the specific symlink error. + + The probe is followed by the actual open, so there is an unavoidable + race window in which a concurrent writer on the host can swap an + intermediate path segment for a symlink. The open path mitigates the + common case by passing ``O_NOFOLLOW`` on POSIX so the kernel refuses + if the *leaf* segment becomes a symlink between the probe and the + open. The Windows ``open`` API has no equivalent flag and intermediate + directory swaps cannot be closed from user space on either platform. + """ + normalized = _normalize_relative_path(relative_path) + candidate = self._root_path / normalized + self._throw_if_contains_symlink(candidate) + resolved = candidate.resolve() + try: + resolved.relative_to(self._root_path) + except ValueError as exc: + raise ValueError(f"Invalid path: {relative_path!r}. The resolved path escapes the root directory.") from exc + return resolved + + def _resolve_safe_directory_path(self, relative_directory: str) -> Path: + """Resolve a relative directory path safely under the root directory. + + Empty and whitespace-only inputs both resolve to the root directory, + matching the behavior of ``_normalize_relative_path(..., is_directory=True)`` + and the convention used by :class:`InMemoryAgentFileStore`. + """ + normalized = _normalize_relative_path(relative_directory, is_directory=True) + if not normalized: + return self._root_path + return self._resolve_safe_path(normalized) + + def _throw_if_contains_symlink(self, candidate: Path) -> None: + """Reject any segment between the root and ``candidate`` that is a symlink/reparse point. + + Walks each ancestor down from the root on the *unresolved* candidate so + ``Path.is_symlink`` observes the on-disk entries instead of their + canonical targets. Stops once a segment does not exist on disk so write + scenarios remain allowed. ``Path.is_symlink`` detects both POSIX + symlinks and Windows reparse points (junctions). + """ + try: + relative_parts = candidate.relative_to(self._root_path).parts + except ValueError: + # ``_resolve_safe_path`` already validates containment; an + # unrelated path here would mean we were called with a path that + # never belonged to the root in the first place. + raise ValueError("Invalid path: the resolved path is not under the root directory.") from None + + current = self._root_path + for segment in relative_parts: + current = current / segment + try: + is_link = current.is_symlink() + except OSError as exc: + # Fail closed: if we cannot verify whether a segment is a + # symlink/reparse point we refuse the operation rather than + # silently allow access that may escape the root. + raise ValueError( + f"Invalid path: unable to verify whether '{segment}' is a symbolic link or reparse point." + ) from exc + if is_link: + raise ValueError("Invalid path: the resolved path contains a symbolic link or reparse point.") + if not current.exists(): + break + + async def write_file(self, path: str, content: str, *, overwrite: bool = True) -> None: + """Write ``content`` to the file at ``path``. + + When ``overwrite`` is ``False`` the file is created using + ``O_CREAT | O_EXCL`` so the underlying ``open`` call performs an + atomic exclusive create and raises :class:`FileExistsError` if a file + already exists. On POSIX the open additionally passes ``O_NOFOLLOW`` + so the kernel refuses to overwrite or replace a leaf symlink, closing + the obvious probe-then-open race for the file itself. + """ + full_path = self._resolve_safe_path(path) + await asyncio.to_thread(self._write_file_sync, full_path, content, overwrite) + + @staticmethod + def _write_file_sync(full_path: Path, content: str, overwrite: bool) -> None: + full_path.parent.mkdir(parents=True, exist_ok=True) + encoded = content.encode("utf-8") + flags = os.O_WRONLY | os.O_CREAT + if overwrite: + flags |= os.O_TRUNC + else: + flags |= os.O_EXCL + # ``O_NOFOLLOW`` is POSIX-only; on Windows ``Path.is_symlink`` / + # reparse-point detection in :meth:`_throw_if_contains_symlink` is the + # only line of defence for the leaf segment. + nofollow = getattr(os, "O_NOFOLLOW", 0) + flags |= nofollow + try: + fd = os.open(full_path, flags, 0o644) + except OSError as exc: + if not overwrite and isinstance(exc, FileExistsError): + raise + # ``ELOOP`` (POSIX): the open refused because the leaf is a + # symlink. Surface the same message as the static symlink probe so + # the caller's exception-handling path is uniform. + if nofollow and getattr(exc, "errno", None) == _ELOOP: + raise ValueError("Invalid path: the resolved path contains a symbolic link or reparse point.") from exc + raise + with os.fdopen(fd, "wb") as handle: + handle.write(encoded) + + async def read_file(self, path: str) -> str | None: + """Return the file content, or ``None`` if the file does not exist. + + Raises :class:`ValueError` if the file exists but its bytes are not + valid UTF-8. Tooling that calls this on possibly-binary content should + catch :class:`ValueError` and present the failure to the agent as a + recoverable string response rather than a stack trace. + """ + full_path = self._resolve_safe_path(path) + return await asyncio.to_thread(self._read_file_sync, full_path) + + @staticmethod + def _read_file_sync(full_path: Path) -> str | None: + if not full_path.is_file(): + return None + nofollow = getattr(os, "O_NOFOLLOW", 0) + try: + fd = os.open(full_path, os.O_RDONLY | nofollow) + except OSError as exc: + if nofollow and getattr(exc, "errno", None) == _ELOOP: + raise ValueError("Invalid path: the resolved path contains a symbolic link or reparse point.") from exc + raise + with os.fdopen(fd, "rb") as handle: + raw = handle.read() + try: + return raw.decode("utf-8") + except UnicodeDecodeError as exc: + raise ValueError(f"File '{full_path.name}' is not UTF-8 text and cannot be read.") from exc + + async def delete_file(self, path: str) -> bool: + """Delete the file and return whether anything was removed.""" + full_path = self._resolve_safe_path(path) + return await asyncio.to_thread(self._delete_file_sync, full_path) + + @staticmethod + def _delete_file_sync(full_path: Path) -> bool: + if not full_path.is_file(): + return False + full_path.unlink() + return True + + async def list_files(self, directory: str = "") -> list[str]: + """Return the direct child files of ``directory``.""" + full_dir = self._resolve_safe_directory_path(directory) + return await asyncio.to_thread(self._list_files_sync, full_dir) + + @staticmethod + def _list_files_sync(full_dir: Path) -> list[str]: + if not full_dir.is_dir(): + return [] + names: list[str] = [] + for entry in full_dir.iterdir(): + if entry.is_symlink(): + continue + if entry.is_file(): + names.append(entry.name) + return names + + async def file_exists(self, path: str) -> bool: + """Return whether the file exists.""" + full_path = self._resolve_safe_path(path) + return await asyncio.to_thread(self._file_exists_sync, full_path) + + @staticmethod + def _file_exists_sync(full_path: Path) -> bool: + return full_path.is_file() + + async def search_files( + self, + directory: str, + regex_pattern: str, + file_pattern: str | None = None, + ) -> list[FileSearchResult]: + """Search file contents for ``regex_pattern`` matches. + + Files whose bytes are not valid UTF-8 are skipped (so a single binary + file does not abort the whole directory search). Each skip is logged at + ``WARNING`` level and a summary is logged at ``INFO`` so operators can + tell the difference between "no matches" and "the corpus was largely + not searchable". + """ + full_dir = self._resolve_safe_directory_path(directory) + regex = _compile_search_regex(regex_pattern) + return await _run_search_with_timeout(lambda: self._search_files_sync(full_dir, regex, file_pattern)) + + @staticmethod + def _search_files_sync(full_dir: Path, regex: re.Pattern[str], file_pattern: str | None) -> list[FileSearchResult]: + if not full_dir.is_dir(): + return [] + results: list[FileSearchResult] = [] + skipped: list[str] = [] + for entry in full_dir.iterdir(): + if entry.is_symlink() or not entry.is_file(): + continue + file_name = entry.name + if not _matches_glob(file_name, file_pattern): + continue + try: + file_content = entry.read_text(encoding="utf-8") + except UnicodeDecodeError: + # Skip binary or otherwise non-UTF-8 files so a single + # un-decodable entry doesn't abort the whole directory search. + # Log per file so operators can audit which files were skipped. + logger.warning("Skipping non-UTF-8 file during search: %s", entry) + skipped.append(file_name) + continue + result = _search_file_content(file_name, file_content, regex) + if result is not None: + results.append(result) + if skipped: + logger.info( + "Search under %s skipped %d non-UTF-8 file(s) (matched %d).", + full_dir, + len(skipped), + len(results), + ) + return results + + async def create_directory(self, path: str) -> None: + """Ensure the directory at ``path`` exists, creating it if necessary.""" + full_path = self._resolve_safe_directory_path(path) + await asyncio.to_thread(lambda: full_path.mkdir(parents=True, exist_ok=True)) + + +@experimental(feature_id=ExperimentalFeature.HARNESS) +class FileAccessProvider(ContextProvider): + """Context provider that gives an agent CRUD/search access to a shared file store. + + The provider exposes five tools to the agent via the per-invocation + :class:`~agent_framework.SessionContext`: + + - ``file_access_save_file`` — Save a file (refuses to overwrite by default). + - ``file_access_read_file`` — Read the content of a file by name. + - ``file_access_delete_file`` — Delete a file by name. + - ``file_access_list_files`` — List all file names at the store root. + - ``file_access_search_files`` — Search file contents using a case-insensitive + regex, optionally filtered by a glob pattern over file names. + + Unlike :class:`~agent_framework.MemoryContextProvider`, which provides + session-scoped memory that may be isolated per session, + :class:`FileAccessProvider` operates on a shared, persistent store whose + contents are visible across sessions and agents. The store is passed in by + the caller and should already be scoped to the desired folder or storage + location. + """ + + def __init__( + self, + store: AgentFileStore, + *, + source_id: str = DEFAULT_FILE_ACCESS_SOURCE_ID, + instructions: str | None = None, + require_delete_approval: bool = True, + ) -> None: + """Initialize the file access provider. + + Args: + store: The file store implementation used for storage operations. + The store should already be scoped to the desired folder or + storage location. + + Keyword Args: + source_id: Unique source ID for the provider. + instructions: Optional instruction override. When ``None`` the + default file-access instructions are used. + require_delete_approval: When ``True`` (the default) the + ``file_access_delete_file`` tool is registered with + ``approval_mode="always_require"`` so the host must approve every + delete the model proposes. Set to ``False`` to opt out and allow + the agent to delete files autonomously (matching the .NET + ``FileAccessProvider``, which has no approval mechanism). + """ + super().__init__(source_id) + self.store = store + self.instructions = instructions or DEFAULT_FILE_ACCESS_INSTRUCTIONS + self.require_delete_approval = require_delete_approval + + async def before_run( + self, + *, + agent: Any, + session: AgentSession, + context: SessionContext, + state: dict[str, Any], + ) -> None: + """Inject file-access tools and instructions before the model runs.""" + del agent, session, state + + @tool(name="file_access_save_file", approval_mode="never_require") + async def file_access_save_file(file_name: str, content: str, overwrite: bool = False) -> str: + """Save a file with the given name and content. By default, does not overwrite an existing file unless overwrite is set to true.""" # noqa: E501 + try: + normalized = _normalize_relative_path(file_name) + await self.store.write_file(normalized, content, overwrite=overwrite) + except FileExistsError: + return f"File '{file_name}' already exists. To replace it, save again with overwrite set to true." + except ValueError as exc: + return f"Could not save file '{file_name}': {exc}" + except OSError as exc: + return f"Could not save file '{file_name}': {exc.strerror or exc}" + return f"File '{file_name}' saved." + + @tool(name="file_access_read_file", approval_mode="never_require") + async def file_access_read_file(file_name: str) -> str: + """Read the content of a file by name. Returns the file content or a message indicating the file could not be read.""" # noqa: E501 + try: + normalized = _normalize_relative_path(file_name) + content = await self.store.read_file(normalized) + except ValueError as exc: + return f"Could not read file '{file_name}': {exc}" + except OSError as exc: + return f"Could not read file '{file_name}': {exc.strerror or exc}" + return content if content is not None else f"File '{file_name}' not found." + + delete_approval_mode: ApprovalMode = "always_require" if self.require_delete_approval else "never_require" + + @tool(name="file_access_delete_file", approval_mode=delete_approval_mode) + async def file_access_delete_file(file_name: str) -> str: + """Delete a file by name.""" + try: + normalized = _normalize_relative_path(file_name) + deleted = await self.store.delete_file(normalized) + except ValueError as exc: + return f"Could not delete file '{file_name}': {exc}" + except OSError as exc: + return f"Could not delete file '{file_name}': {exc.strerror or exc}" + return f"File '{file_name}' deleted." if deleted else f"File '{file_name}' not found." + + @tool(name="file_access_list_files", approval_mode="never_require") + async def file_access_list_files(directory: str | None = None) -> list[str] | str: + """List the direct child file names of a directory. Omit ``directory`` (or pass an empty string) to list the root. To enumerate files in a subdirectory, pass its relative path, for example ``"reports"`` or ``"reports/2024"``.""" # noqa: E501 + target = directory if directory and directory.strip() else "" + try: + return await self.store.list_files(target) + except ValueError as exc: + return f"Could not list directory '{directory or ''}': {exc}" + except OSError as exc: + return f"Could not list directory '{directory or ''}': {exc.strerror or exc}" + + @tool(name="file_access_search_files", approval_mode="never_require") + async def file_access_search_files( + regex_pattern: str, + file_pattern: str | None = None, + directory: str | None = None, + ) -> list[dict[str, Any]] | str: + """Search file contents using a regular expression pattern (case-insensitive). Optionally filter which files to search using a glob pattern (e.g., "*.md", "research*"). Optionally scope the search to a subdirectory by passing its relative path; omit ``directory`` (or pass an empty string) to search the root. Returns matching file names, snippets, and matching lines with line numbers. The regex_pattern must be 256 characters or fewer.""" # noqa: E501 + pattern = file_pattern if file_pattern and file_pattern.strip() else None + target = directory if directory and directory.strip() else "" + try: + results = await self.store.search_files(target, regex_pattern, pattern) + except ValueError as exc: + return f"Could not search files: {exc}" + except OSError as exc: + return f"Could not search files: {exc.strerror or exc}" + return [result.to_dict() for result in results] + + context.extend_instructions(self.source_id, [self.instructions]) + context.extend_tools( + self.source_id, + [ + file_access_save_file, + file_access_read_file, + file_access_delete_file, + file_access_list_files, + file_access_search_files, + ], + ) + + +__all__ = [ + "DEFAULT_FILE_ACCESS_INSTRUCTIONS", + "DEFAULT_FILE_ACCESS_SOURCE_ID", + "AgentFileStore", + "FileAccessProvider", + "FileSearchMatch", + "FileSearchResult", + "FileSystemAgentFileStore", + "InMemoryAgentFileStore", +] diff --git a/python/packages/core/tests/core/test_harness_file_access.py b/python/packages/core/tests/core/test_harness_file_access.py new file mode 100644 index 0000000000..c9599ccbf3 --- /dev/null +++ b/python/packages/core/tests/core/test_harness_file_access.py @@ -0,0 +1,713 @@ +# Copyright (c) Microsoft. All rights reserved. + +from __future__ import annotations + +import json +import re +import time +from pathlib import Path + +import pytest + +from agent_framework import ( + Agent, + AgentFileStore, + AgentSession, + ExperimentalFeature, + FileAccessProvider, + FileSearchMatch, + FileSearchResult, + FileSystemAgentFileStore, + InMemoryAgentFileStore, + Message, + SupportsChatGetResponse, +) +from agent_framework._harness import _file_access as _file_access_module +from agent_framework._harness._file_access import ( + DEFAULT_FILE_ACCESS_INSTRUCTIONS, + DEFAULT_FILE_ACCESS_SOURCE_ID, + _matches_glob, + _normalize_relative_path, + _run_search_with_timeout, +) + + +def _tool_by_name(tools: list[object], name: str) -> object: + """Return the tool with the requested name from a prepared tool list.""" + for tool in tools: + if getattr(tool, "name", None) == name: + return tool + raise AssertionError(f"Tool {name!r} was not found.") + + +def test_normalize_relative_path_collapses_and_validates() -> None: + """The path normalizer should accept relative forward/backslash paths and reject unsafe ones.""" + assert _normalize_relative_path("foo/bar.txt") == "foo/bar.txt" + assert _normalize_relative_path("foo\\bar.txt") == "foo/bar.txt" + assert _normalize_relative_path("foo//bar.txt") == "foo/bar.txt" + assert _normalize_relative_path(" foo/bar.txt ") == "foo/bar.txt" + assert _normalize_relative_path("", is_directory=True) == "" + assert _normalize_relative_path(" ", is_directory=True) == "" + assert _normalize_relative_path("sub/", is_directory=True) == "sub" + assert _normalize_relative_path("sub\\", is_directory=True) == "sub" + + with pytest.raises(ValueError, match="must not be empty"): + _normalize_relative_path("") + with pytest.raises(ValueError, match="must not be empty"): + _normalize_relative_path(" ") + with pytest.raises(ValueError, match="must not end with a path separator"): + _normalize_relative_path("foo/") + with pytest.raises(ValueError, match="must not end with a path separator"): + _normalize_relative_path("foo\\") + with pytest.raises(ValueError, match="'..' segments"): + _normalize_relative_path("foo/../bar.txt") + with pytest.raises(ValueError, match="'..' segments"): + _normalize_relative_path("./bar.txt") + with pytest.raises(ValueError, match="must be relative"): + _normalize_relative_path("C:/abs/path") + with pytest.raises(ValueError, match="must be relative"): + _normalize_relative_path("\\rooted") + with pytest.raises(ValueError, match="must be relative"): + _normalize_relative_path("/foo/bar.txt") + + +def test_matches_glob_is_case_insensitive_and_optional() -> None: + """The glob matcher should be case-insensitive and treat missing patterns as match-all.""" + assert _matches_glob("notes.MD", "*.md") + assert _matches_glob("research_one.txt", "research*") + assert not _matches_glob("plan.txt", "*.md") + assert _matches_glob("anything", None) + assert _matches_glob("anything", "") + assert _matches_glob("anything", " ") + + +def test_file_search_match_round_trips() -> None: + """File search match values should serialize and validate cleanly.""" + raw_match = {"line_number": 3, "line": "error: boom"} + + match = FileSearchMatch.from_dict(raw_match) + assert match == FileSearchMatch(line_number=3, line="error: boom") + assert match.to_dict() == raw_match + assert "FileSearchMatch(" in repr(match) + + with pytest.raises(ValueError, match="positive integer"): + FileSearchMatch(line_number=0, line="oops") + with pytest.raises(ValueError, match="must be an integer"): + FileSearchMatch.from_dict({"line_number": "1", "line": "oops"}) + with pytest.raises(ValueError, match="must be a string"): + FileSearchMatch.from_dict({"line_number": 1, "line": 42}) + + +def test_file_search_result_round_trips() -> None: + """File search result values should serialize the matching-line list correctly.""" + raw_result = { + "file_name": "notes.md", + "snippet": "hello error world", + "matching_lines": [{"line_number": 2, "line": "error one"}], + } + + result = FileSearchResult.from_dict(raw_result) + assert result.file_name == "notes.md" + assert result.snippet == "hello error world" + assert result.matching_lines == [FileSearchMatch(line_number=2, line="error one")] + assert result.to_dict() == raw_result + assert json.loads(result.to_json()) == raw_result + + with pytest.raises(ValueError, match="matching_lines must be a list"): + FileSearchResult.from_dict({"file_name": "x", "snippet": "", "matching_lines": {}}) + + with pytest.raises(ValueError, match="elements must be mappings"): + FileSearchResult.from_dict({"file_name": "x", "snippet": "", "matching_lines": ["not-a-dict"]}) + + +async def test_in_memory_store_round_trips_files() -> None: + """The in-memory store should support write/read/exists/delete/list operations.""" + store = InMemoryAgentFileStore() + + await store.write_file("a.txt", "alpha") + await store.write_file("sub/b.txt", "beta") + + assert await store.file_exists("a.txt") + assert not await store.file_exists("missing.txt") + assert await store.read_file("a.txt") == "alpha" + assert await store.read_file("missing.txt") is None + + assert sorted(await store.list_files()) == ["a.txt"] # subdirs are not direct children + assert sorted(await store.list_files("sub")) == ["b.txt"] + + assert await store.delete_file("a.txt") is True + assert await store.delete_file("a.txt") is False + assert sorted(await store.list_files()) == [] + + +async def test_in_memory_store_search_returns_matches_with_snippets() -> None: + """The in-memory store should search file content case-insensitively and respect glob filters.""" + store = InMemoryAgentFileStore() + await store.write_file("a.md", "line one\nThis line has ERROR inside\nline three\r") + await store.write_file("b.md", "no match here") + await store.write_file("notes.txt", "ERROR but wrong extension") + + results = await store.search_files("", "error", "*.md") + assert [result.file_name for result in results] == ["a.md"] + matching_lines = results[0].matching_lines + assert matching_lines == [FileSearchMatch(line_number=2, line="This line has ERROR inside")] + assert "ERROR" in results[0].snippet + + # No glob -> searches every file. + results_all = await store.search_files("", "error") + assert {result.file_name for result in results_all} == {"a.md", "notes.txt"} + + +async def test_in_memory_store_search_rejects_invalid_and_oversize_regex() -> None: + """``search_files`` should surface clean errors for bad regex input.""" + store = InMemoryAgentFileStore() + await store.write_file("a.md", "hello") + + with pytest.raises(re.error): + await store.search_files("", "[unclosed") + + with pytest.raises(ValueError, match="too long"): + await store.search_files("", "a" * 257) + + +async def test_in_memory_store_normalizes_paths() -> None: + """Path normalization should reject traversal in the in-memory store too.""" + store = InMemoryAgentFileStore() + for bad in ("../escape.txt", "/abs/path.txt", "."): + with pytest.raises(ValueError): + await store.write_file(bad, "boom") + + +async def test_filesystem_store_round_trips_files(tmp_path: Path) -> None: + """The filesystem store should round-trip files on disk and create parents on write.""" + store = FileSystemAgentFileStore(tmp_path) + + await store.write_file("nested/a.txt", "alpha") + assert (tmp_path / "nested" / "a.txt").read_text(encoding="utf-8") == "alpha" + + assert await store.read_file("nested/a.txt") == "alpha" + assert await store.read_file("missing.txt") is None + assert await store.file_exists("nested/a.txt") + assert not await store.file_exists("missing.txt") + assert sorted(await store.list_files("nested")) == ["a.txt"] + assert sorted(await store.list_files()) == [] # root only contains the directory + + assert await store.delete_file("nested/a.txt") is True + assert await store.delete_file("nested/a.txt") is False + + +async def test_filesystem_store_rejects_traversal_and_rooted_paths(tmp_path: Path) -> None: + """The filesystem store should refuse paths that escape the configured root.""" + store = FileSystemAgentFileStore(tmp_path) + + for bad in ("../escape.txt", "/etc/passwd", "C:/Windows/System32/notepad.exe", ".", ".."): + with pytest.raises(ValueError): + await store.write_file(bad, "boom") + + +async def test_filesystem_store_rejects_symlinks_into_root(tmp_path: Path) -> None: + """The filesystem store should refuse to read through a symlink target.""" + target = tmp_path / "outside.txt" + target.write_text("outside", encoding="utf-8") + root = tmp_path / "root" + root.mkdir() + link = root / "link.txt" + try: + link.symlink_to(target) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"Symbolic links are not supported in this environment: {exc!r}") + + store = FileSystemAgentFileStore(root) + with pytest.raises(ValueError, match="symbolic link"): + await store.read_file("link.txt") + with pytest.raises(ValueError, match="symbolic link"): + await store.write_file("link.txt", "stomp") + + # List operations should silently skip the symlink entry rather than raise. + assert await store.list_files() == [] + + +async def test_filesystem_store_rejects_in_root_symlinks(tmp_path: Path) -> None: + """Symlinks whose target lives under the root must still be rejected. + + ``Path.resolve`` collapses the symlink, so a naive resolved-path check + would silently follow it. The symlink probe must operate on the + unresolved candidate for this case to fail closed. + """ + root = tmp_path / "root" + root.mkdir() + real = root / "real.txt" + real.write_text("payload", encoding="utf-8") + link = root / "alias.txt" + try: + link.symlink_to(real) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"Symbolic links are not supported in this environment: {exc!r}") + + store = FileSystemAgentFileStore(root) + with pytest.raises(ValueError, match="symbolic link"): + await store.read_file("alias.txt") + # The non-symlinked sibling must still be readable. + assert await store.read_file("real.txt") == "payload" + + +async def test_filesystem_store_search_matches_lines_and_filters_globs(tmp_path: Path) -> None: + """The filesystem store should search files on disk and apply glob filters by file name.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("a.md", "hello\nERROR happens\nbye\r") + await store.write_file("b.txt", "ERROR happens too") + await store.write_file("c.md", "nothing here") + + results = await store.search_files("", "error", "*.md") + assert [result.file_name for result in results] == ["a.md"] + assert results[0].matching_lines == [FileSearchMatch(line_number=2, line="ERROR happens")] + assert "ERROR" in results[0].snippet + + results_all = await store.search_files("", "error") + assert {result.file_name for result in results_all} == {"a.md", "b.txt"} + + +async def test_filesystem_store_search_skips_non_utf8_files(tmp_path: Path) -> None: + """The filesystem store should silently skip non-UTF-8 files instead of aborting the search.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("notes.md", "ERROR happens here") + (tmp_path / "blob.bin").write_bytes(b"\x80\x81\x82\x83") + + results = await store.search_files("", "error") + assert [result.file_name for result in results] == ["notes.md"] + + +async def test_filesystem_store_create_directory(tmp_path: Path) -> None: + """The filesystem store should create directories under the configured root.""" + store = FileSystemAgentFileStore(tmp_path) + await store.create_directory("nested/inner") + assert (tmp_path / "nested" / "inner").is_dir() + + +async def test_filesystem_store_list_files_accepts_blank_directory(tmp_path: Path) -> None: + """Whitespace-only directory inputs should resolve to the root, matching the in-memory store.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("a.txt", "alpha") + assert sorted(await store.list_files("")) == ["a.txt"] + assert sorted(await store.list_files(" ")) == ["a.txt"] + + +def test_filesystem_store_requires_non_empty_root() -> None: + """The filesystem store constructor should refuse blank root paths.""" + with pytest.raises(ValueError, match="must not be empty"): + FileSystemAgentFileStore("") + with pytest.raises(ValueError, match="must not be empty"): + FileSystemAgentFileStore(" ") + + +async def test_file_access_provider_registers_tools_and_instructions( + chat_client_base: SupportsChatGetResponse, +) -> None: + """``FileAccessProvider.before_run`` should add the canonical instructions and five tools.""" + session = AgentSession(session_id="session-1") + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + + tools = options["tools"] + assert isinstance(tools, list) + expected_names = { + "file_access_save_file", + "file_access_read_file", + "file_access_delete_file", + "file_access_list_files", + "file_access_search_files", + } + assert {getattr(tool, "name", None) for tool in tools} >= expected_names + + instructions = options.get("instructions") + if isinstance(instructions, str): + assert DEFAULT_FILE_ACCESS_INSTRUCTIONS in instructions + else: + assert any(DEFAULT_FILE_ACCESS_INSTRUCTIONS in chunk for chunk in (instructions or [])) + + +async def test_file_access_provider_delete_approval_defaults_to_always_require( + chat_client_base: SupportsChatGetResponse, +) -> None: + """By default ``file_access_delete_file`` should require host approval.""" + session = AgentSession(session_id="session-1") + provider = FileAccessProvider(store=InMemoryAgentFileStore()) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + + tools = options["tools"] + assert isinstance(tools, list) + delete_file = _tool_by_name(tools, "file_access_delete_file") + assert delete_file.approval_mode == "always_require" + # The non-destructive tools should remain autonomous. + for name in ( + "file_access_save_file", + "file_access_read_file", + "file_access_list_files", + "file_access_search_files", + ): + assert _tool_by_name(tools, name).approval_mode == "never_require" + + +async def test_file_access_provider_delete_approval_opt_out( + chat_client_base: SupportsChatGetResponse, +) -> None: + """``require_delete_approval=False`` should drop delete to ``never_require``.""" + session = AgentSession(session_id="session-1") + provider = FileAccessProvider(store=InMemoryAgentFileStore(), require_delete_approval=False) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + + delete_file = _tool_by_name(options["tools"], "file_access_delete_file") # type: ignore[arg-type] + assert delete_file.approval_mode == "never_require" + + +async def test_file_access_provider_tools_round_trip_files( + chat_client_base: SupportsChatGetResponse, +) -> None: + """The provider's tools should drive save/read/list/search/delete flows on an ``InMemoryAgentFileStore``.""" + session = AgentSession(session_id="session-1") + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + tools = options["tools"] + assert isinstance(tools, list) + + save_file = _tool_by_name(tools, "file_access_save_file") + read_file = _tool_by_name(tools, "file_access_read_file") + delete_file = _tool_by_name(tools, "file_access_delete_file") + list_files = _tool_by_name(tools, "file_access_list_files") + search_files = _tool_by_name(tools, "file_access_search_files") + + saved = await save_file.invoke(arguments={"file_name": "plan.md", "content": "step 1\nERROR step 2"}) + assert "plan.md" in saved[0].text and "saved" in saved[0].text + + # Default overwrite=False should refuse the second save. + refused = await save_file.invoke(arguments={"file_name": "plan.md", "content": "stomp"}) + assert "already exists" in refused[0].text + + # overwrite=True should succeed. + overwritten = await save_file.invoke( + arguments={"file_name": "plan.md", "content": "stomp\nERROR replaced", "overwrite": True} + ) + assert "saved" in overwritten[0].text + + read_back = await read_file.invoke(arguments={"file_name": "plan.md"}) + assert read_back[0].text == "stomp\nERROR replaced" + + listed = await list_files.invoke() + assert json.loads(listed[0].text) == ["plan.md"] + + # The list tool should accept an optional directory argument so agents can + # enumerate nested folders (not only the root). + await save_file.invoke(arguments={"file_name": "reports/2024.md", "content": "annual"}) + listed_nested = await list_files.invoke(arguments={"directory": "reports"}) + assert json.loads(listed_nested[0].text) == ["2024.md"] + # Blank / whitespace directory should fall back to the root listing. + listed_blank = await list_files.invoke(arguments={"directory": " "}) + assert sorted(json.loads(listed_blank[0].text)) == ["plan.md"] + + missing = await read_file.invoke(arguments={"file_name": "missing.md"}) + assert "not found" in missing[0].text + + search_payload = await search_files.invoke(arguments={"regex_pattern": "error", "file_pattern": "*.md"}) + parsed = json.loads(search_payload[0].text) + assert parsed[0]["file_name"] == "plan.md" + assert parsed[0]["matching_lines"][0]["line"] == "ERROR replaced" + + # The search tool should likewise accept an optional directory argument so + # agents can scope a search to a subfolder. + await save_file.invoke(arguments={"file_name": "reports/issues.md", "content": "ERROR nested"}) + scoped = await search_files.invoke( + arguments={"regex_pattern": "error", "file_pattern": "*.md", "directory": "reports"} + ) + scoped_parsed = json.loads(scoped[0].text) + assert [entry["file_name"] for entry in scoped_parsed] == ["issues.md"] + + deleted = await delete_file.invoke(arguments={"file_name": "plan.md"}) + assert "deleted" in deleted[0].text + + missing_delete = await delete_file.invoke(arguments={"file_name": "plan.md"}) + assert "not found" in missing_delete[0].text + + +async def test_file_access_provider_accepts_custom_instructions() -> None: + """Custom instructions should override the default banner.""" + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store, instructions="custom-banner") + assert provider.instructions == "custom-banner" + assert provider.source_id == DEFAULT_FILE_ACCESS_SOURCE_ID + + +async def test_in_memory_store_write_file_raises_when_exists_and_no_overwrite() -> None: + """The atomic exclusive-create path should raise ``FileExistsError`` under the lock.""" + store = InMemoryAgentFileStore() + await store.write_file("plan.md", "v1") + + with pytest.raises(FileExistsError): + await store.write_file("plan.md", "v2", overwrite=False) + + # The original content is preserved. + assert await store.read_file("plan.md") == "v1" + + # Default ``overwrite=True`` still replaces. + await store.write_file("plan.md", "v3") + assert await store.read_file("plan.md") == "v3" + + +async def test_filesystem_store_write_file_raises_when_exists_and_no_overwrite(tmp_path: Path) -> None: + """The filesystem store should use exclusive-create semantics when ``overwrite=False``.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("plan.md", "v1") + + with pytest.raises(FileExistsError): + await store.write_file("plan.md", "v2", overwrite=False) + + assert (tmp_path / "plan.md").read_text(encoding="utf-8") == "v1" + + await store.write_file("plan.md", "v3", overwrite=True) + assert (tmp_path / "plan.md").read_text(encoding="utf-8") == "v3" + + +async def test_run_search_with_timeout_raises_value_error(monkeypatch: pytest.MonkeyPatch) -> None: + """A scan that exceeds the timeout should surface a clean ``ValueError``.""" + monkeypatch.setattr(_file_access_module, "_SEARCH_TIMEOUT_SECONDS", 0.01) + + def slow() -> list[FileSearchResult]: + time.sleep(0.5) + return [] + + with pytest.raises(ValueError, match="did not complete"): + await _run_search_with_timeout(slow) + + +async def test_filesystem_store_symlink_probe_fails_closed_on_oserror( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """If ``Path.is_symlink`` raises during the probe, the operation must be refused.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("ok.txt", "content") + + def boom(self: Path) -> bool: + raise PermissionError("access denied") + + monkeypatch.setattr(Path, "is_symlink", boom) + + with pytest.raises(ValueError, match="symbolic link or reparse point"): + await store.read_file("ok.txt") + + +def test_file_access_harness_classes_are_marked_experimental() -> None: + """File-access harness public classes should expose HARNESS experimental metadata.""" + assert AgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value + assert InMemoryAgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileSystemAgentFileStore.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileSearchMatch.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileSearchResult.__feature_id__ == ExperimentalFeature.HARNESS.value + assert FileAccessProvider.__feature_id__ == ExperimentalFeature.HARNESS.value + assert ".. warning:: Experimental" in (FileAccessProvider.__doc__ or "") + + +async def test_in_memory_store_preserves_original_case_on_list_and_search() -> None: + """``list_files`` / ``search_files`` should return original-case names, not lowercased keys. + + Matches :class:`FileSystemAgentFileStore` on case-preserving filesystems so + tests written against the in-memory backend cannot encode a contract that + will diverge in production. + """ + store = InMemoryAgentFileStore() + await store.write_file("Plan.MD", "ERROR happens here\n") + await store.write_file("Reports/Q1.MD", "alpha") + + # list_files keeps the original case + assert sorted(await store.list_files()) == ["Plan.MD"] + assert sorted(await store.list_files("Reports")) == ["Q1.MD"] + + # case-insensitive directory lookup still works + assert sorted(await store.list_files("reports")) == ["Q1.MD"] + + # search_files emits the original-case file name in FileSearchResult + results = await store.search_files("", "error", "*.MD") + assert [r.file_name for r in results] == ["Plan.MD"] + + # read_file remains case-insensitive + assert await store.read_file("plan.md") == "ERROR happens here\n" + + +async def test_filesystem_store_read_file_raises_value_error_on_non_utf8(tmp_path: Path) -> None: + """Binary / non-UTF-8 files should raise a clean ``ValueError`` rather than ``UnicodeDecodeError``. + + The tool-layer wrapper relies on this contract to convert the failure into + a recoverable string response for the agent. + """ + store = FileSystemAgentFileStore(tmp_path) + (tmp_path / "blob.bin").write_bytes(b"\x80\x81\x82\x83") + + with pytest.raises(ValueError, match="not UTF-8 text"): + await store.read_file("blob.bin") + + +async def test_filesystem_store_search_logs_skipped_non_utf8_files( + tmp_path: Path, caplog: pytest.LogCaptureFixture +) -> None: + """``search_files`` skips non-UTF-8 files but logs per-file and a summary so operators have signal.""" + store = FileSystemAgentFileStore(tmp_path) + await store.write_file("notes.md", "ERROR happens here") + (tmp_path / "blob.bin").write_bytes(b"\x80\x81\x82\x83") + + with caplog.at_level("INFO", logger="agent_framework._harness._file_access"): + results = await store.search_files("", "error") + + assert [r.file_name for r in results] == ["notes.md"] + assert any("Skipping non-UTF-8 file during search" in rec.message for rec in caplog.records) + assert any("skipped 1 non-UTF-8 file" in rec.message for rec in caplog.records) + + +async def test_file_access_tool_wrappers_surface_value_error_as_message( + chat_client_base: SupportsChatGetResponse, +) -> None: + """Recoverable failures (bad path, oversized regex, non-UTF-8 read) should be returned as strings. + + Without these wrappers the model sees a raw stack trace for "you used ``..``" + but a polite message for "the file already exists", which is the opposite + of what is recoverable. + """ + session = AgentSession(session_id="session-1") + store = InMemoryAgentFileStore() + provider = FileAccessProvider(store=store) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["work with files"])], + ) + tools = options["tools"] + assert isinstance(tools, list) + + save_file = _tool_by_name(tools, "file_access_save_file") + read_file = _tool_by_name(tools, "file_access_read_file") + delete_file = _tool_by_name(tools, "file_access_delete_file") + list_files = _tool_by_name(tools, "file_access_list_files") + search_files = _tool_by_name(tools, "file_access_search_files") + + # Path-traversal attempts on each tool should return a clean string, not raise. + saved = await save_file.invoke(arguments={"file_name": "../escape.txt", "content": "x"}) + assert "Could not save" in saved[0].text and "escape" in saved[0].text.lower() + read = await read_file.invoke(arguments={"file_name": "../escape.txt"}) + assert "Could not read" in read[0].text + deleted = await delete_file.invoke(arguments={"file_name": "../escape.txt"}) + assert "Could not delete" in deleted[0].text + listed = await list_files.invoke(arguments={"directory": "../escape"}) + assert "Could not list" in listed[0].text + + # Regex length cap should also be returned to the model as text. + too_long = "a" * 1024 + searched = await search_files.invoke(arguments={"regex_pattern": too_long}) + assert "Could not search files" in searched[0].text + + +async def test_file_access_tool_read_file_wrapper_surfaces_non_utf8( + tmp_path: Path, chat_client_base: SupportsChatGetResponse +) -> None: + """The read-file tool wrapper should convert a non-UTF-8 ``ValueError`` into a readable string.""" + store = FileSystemAgentFileStore(tmp_path) + (tmp_path / "blob.bin").write_bytes(b"\x80\x81\x82\x83") + + session = AgentSession(session_id="session-1") + provider = FileAccessProvider(store=store) + agent = Agent(client=chat_client_base, context_providers=[provider]) + + _, options = await agent._prepare_session_and_messages( # type: ignore[reportPrivateUsage] + session=session, + input_messages=[Message(role="user", contents=["read it"])], + ) + read_file = _tool_by_name(options["tools"], "file_access_read_file") + response = await read_file.invoke(arguments={"file_name": "blob.bin"}) + assert "Could not read" in response[0].text and "UTF-8" in response[0].text + + +_NEEDS_SYMLINK = "Symbolic links are not supported in this environment" + + +async def test_filesystem_store_rejects_symlink_on_delete_search_and_list(tmp_path: Path) -> None: + """The same symlink probe must front delete/search/list, not just read/write.""" + target = tmp_path / "outside.txt" + target.write_text("outside", encoding="utf-8") + root = tmp_path / "root" + root.mkdir() + link = root / "link.txt" + try: + link.symlink_to(target) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"{_NEEDS_SYMLINK}: {exc!r}") + + store = FileSystemAgentFileStore(root) + + with pytest.raises(ValueError, match="symbolic link"): + await store.delete_file("link.txt") + + # search_files of the root never touches the symlink leaf directly, but + # search_files of a symlinked *directory* path must be rejected by the + # safe-directory resolver. + dir_link = root / "alias_dir" + other_dir = tmp_path / "outside_dir" + other_dir.mkdir() + try: + dir_link.symlink_to(other_dir) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"{_NEEDS_SYMLINK}: {exc!r}") + + with pytest.raises(ValueError, match="symbolic link"): + await store.search_files("alias_dir", "anything") + with pytest.raises(ValueError, match="symbolic link"): + await store.list_files("alias_dir") + + +async def test_filesystem_store_rejects_symlinked_intermediate_directory(tmp_path: Path) -> None: + """A symlink used as a non-leaf path segment must still be rejected. + + The classic escape vector is ``root/aliased_dir/file.txt`` where + ``aliased_dir`` is a symlink to somewhere outside the root. The + ``_throw_if_contains_symlink`` walk must check every segment, not only + the leaf. + """ + outside = tmp_path / "outside_dir" + outside.mkdir() + (outside / "secret.txt").write_text("payload", encoding="utf-8") + root = tmp_path / "root" + root.mkdir() + link = root / "aliased_dir" + try: + link.symlink_to(outside) + except (OSError, NotImplementedError) as exc: + pytest.skip(f"{_NEEDS_SYMLINK}: {exc!r}") + + store = FileSystemAgentFileStore(root) + + for op in ("read", "write", "delete"): + with pytest.raises(ValueError, match="symbolic link"): + if op == "read": + await store.read_file("aliased_dir/secret.txt") + elif op == "write": + await store.write_file("aliased_dir/secret.txt", "stomp") + else: + await store.delete_file("aliased_dir/secret.txt") diff --git a/python/samples/02-agents/context_providers/README.md b/python/samples/02-agents/context_providers/README.md index 04f3a1395f..e49a472f39 100644 --- a/python/samples/02-agents/context_providers/README.md +++ b/python/samples/02-agents/context_providers/README.md @@ -8,6 +8,7 @@ These samples demonstrate how to use context providers to enrich agent conversat |---------------|-------------| | [`simple_context_provider.py`](simple_context_provider.py) | Implement a custom context provider by extending `ContextProvider` to extract and inject structured user information across turns. | | [`azure_ai_foundry_memory.py`](azure_ai_foundry_memory.py) | Use `FoundryMemoryProvider` to add semantic memory — automatically retrieves, searches, and stores memories via Azure AI Foundry. | +| [`file_access_data_processing/`](file_access_data_processing/) | Use `FileAccessProvider` with `FileSystemAgentFileStore` to give an agent read/write/search access to a folder of CSV data files. See its own [README](file_access_data_processing/README.md). | | [`azure_ai_search/`](azure_ai_search/) | Retrieval Augmented Generation (RAG) with Azure AI Search in semantic and agentic modes. See its own [README](azure_ai_search/README.md). | | [`mem0/`](mem0/) | Memory-powered context using the Mem0 integration (open-source and managed). See its own [README](mem0/README.md). | | [`redis/`](redis/) | Redis-backed context providers for conversation memory and sessions. See its own [README](redis/README.md). | @@ -25,4 +26,9 @@ These samples demonstrate how to use context providers to enrich agent conversat - `AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME`: Embedding model deployment name (e.g., `text-embedding-ada-002`) - Azure CLI authentication (`az login`) +**For `file_access_data_processing/`:** +- `FOUNDRY_PROJECT_ENDPOINT`: Your Azure AI Foundry project endpoint +- `FOUNDRY_MODEL`: Chat model deployment name +- Azure CLI authentication (`az login`) + See each subfolder's README for provider-specific prerequisites. diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/README.md b/python/samples/02-agents/context_providers/file_access_data_processing/README.md new file mode 100644 index 0000000000..9984b1f13b --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/README.md @@ -0,0 +1,62 @@ +# File Access Data Processing + +This sample demonstrates how to give an `Agent` access to a folder of data files +by attaching `FileAccessProvider` (backed by `FileSystemAgentFileStore`) as a +context provider. + +The agent is given a `working/` folder containing `sales.csv` — ~50 rows of +sales transaction data — and is driven through a short scripted conversation +that exercises every tool the provider exposes: + +| Step | Prompt | Tool(s) used | +|---|---|---| +| 1 | "What files do you have access to?" | `file_access_list_files` | +| 2 | "Read sales.csv and summarize…" | `file_access_read_file` | +| 3 | "Calculate the total revenue per region…" | (uses previously read data) | +| 4 | "Save a markdown report named `region_totals.md`…" | `file_access_save_file` | +| 5 | "List the files again so I can confirm…" | `file_access_list_files` | + +After the run, the sample prints the final contents of `working/` so the +written file is easy to spot. + +## Prerequisites + +| Variable | Description | +|---|---| +| `FOUNDRY_PROJECT_ENDPOINT` | Your Azure AI Foundry project endpoint. | +| `FOUNDRY_MODEL` | Chat model deployment name (e.g. `gpt-4o`). | + +Run `az login` before executing the sample so `AzureCliCredential` can +authenticate. + +## Running the sample + +From `python/`: + +```bash +uv run --package agent-framework-core python samples/02-agents/context_providers/file_access_data_processing/data_processing.py +``` + +Or directly: + +```bash +python samples/02-agents/context_providers/file_access_data_processing/data_processing.py +``` + +## Sample data + +`working/sales.csv` contains January–March 2025 sales transactions with these +columns: + +| Column | Description | +|---|---| +| `date` | Transaction date (YYYY-MM-DD) | +| `product` | Product name | +| `category` | Product category (Electronics, Furniture, Stationery) | +| `quantity` | Units sold | +| `unit_price` | Price per unit | +| `region` | Sales region (North, South, West) | +| `salesperson` | Name of the salesperson | + +The sample writes `region_totals.md` into the same folder. Delete it between +runs if you want a clean state. diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py b/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py new file mode 100644 index 0000000000..0d4e67ff38 --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/data_processing.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft. All rights reserved. + +"""Sample: use ``FileAccessProvider`` to give an agent access to a folder of CSV data files. + +This sample demonstrates how to attach :class:`FileAccessProvider` (backed by +:class:`FileSystemAgentFileStore`) to an ``Agent`` so the model can read input +data, perform analysis, and write summary output back to the same folder via +the ``file_access_*`` tools. + +The sibling ``working/`` folder contains ``sales.csv`` — ~50 rows of sales +transactions (date, product, category, quantity, unit_price, region, +salesperson). The agent is asked, in a single session, to: list available +files, inspect the data, compute regional totals, and save a markdown summary. + +Prerequisites: + - ``FOUNDRY_PROJECT_ENDPOINT``: Your Azure AI Foundry project endpoint. + - ``FOUNDRY_MODEL``: Chat model deployment name. + - Run ``az login`` before executing the sample. +""" + +import asyncio +import os +from pathlib import Path + +from agent_framework import Agent, FileAccessProvider, FileSystemAgentFileStore +from agent_framework.foundry import FoundryChatClient +from azure.identity import AzureCliCredential +from dotenv import load_dotenv + +# Load python/.env (python-dotenv walks up from this file by default). Pass +# override=True so values from .env take precedence over any pre-existing OS +# environment variables — without this, OS-level values silently win. +load_dotenv(override=True) + +INSTRUCTIONS = """ +You are a data analyst assistant. You have access to a folder of data files via +the file_access_* tools. + +## Getting started +- Start by listing available files with file_access_list_files to see what data + is available. +- Read the files to understand their structure and contents. + +## Working with data +- When asked to analyze data, read the relevant files first, then perform the + analysis. +- Show your analysis clearly with tables, summaries, and key insights. +- When calculations are needed, work through them step by step and show your + reasoning. + +## Writing output +- When asked to produce output files (e.g., reports, summaries, filtered data), + use file_access_save_file to write them. +- Use appropriate file formats: CSV for tabular data, Markdown for reports. +- Confirm what you wrote and where. + +## Important +- Never modify or delete the original input data files unless explicitly asked + to do so. +- If asked about data you haven't read yet, read it first before answering. +- Always explain your reasoning between tool calls so the user can follow along. +""" + +PROMPTS = [ + "What files do you have access to?", + "Read sales.csv and summarize what columns it contains and how many rows it has.", + "Calculate the total revenue (quantity * unit_price) per region and show the result as a table.", + ( + "Save a markdown report named region_totals.md that contains the regional totals " + "and a one-paragraph summary of which region performed best." + ), + "List the files again so I can confirm region_totals.md was created.", +] + + +async def main() -> None: + # 1. Resolve the working directory bundled alongside this script. + working_dir = Path(__file__).parent / "working" + + # 2. Build the chat client. + client = FoundryChatClient( + project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"], + model=os.environ["FOUNDRY_MODEL"], + credential=AzureCliCredential(), + ) + + # 3. Wire up the file access provider against a file-system-backed store + # rooted at the sample's working/ folder. The provider injects its + # default instructions plus exposes five file_access_* tools to the + # agent for the duration of each run. + file_access = FileAccessProvider(store=FileSystemAgentFileStore(working_dir)) + + # 4. Create the agent and attach the provider. + async with Agent( + client=client, + name="DataAnalyst", + description="A data analyst assistant that reads, analyzes, and processes data files.", + instructions=INSTRUCTIONS, + context_providers=[file_access], + ) as agent: + # 5. Run all prompts inside one session so the conversation remains + # coherent across turns. + session = agent.create_session() + for prompt in PROMPTS: + print(f"\nUser: {prompt}") + response = await agent.run(prompt, session=session) + print(f"Assistant: {response}") + + # 6. Show the final folder contents so the side effects of the run are + # visible to the reader. + print("\nFinal contents of working/:") + for path in sorted(working_dir.iterdir()): + print(f" - {path.name} ({path.stat().st_size} bytes)") + + +if __name__ == "__main__": + asyncio.run(main()) + + +# Sample output (truncated): +# +# User: What files do you have access to? +# Assistant: I can see one file in the working directory: sales.csv. +# +# User: Read sales.csv and summarize what columns it contains and how many rows it has. +# Assistant: sales.csv has 50 data rows and 7 columns: date, product, category, +# quantity, unit_price, region, salesperson. +# +# User: Calculate the total revenue (quantity * unit_price) per region and show the result as a table. +# Assistant: +# | Region | Total Revenue | +# |--------|---------------| +# | North | $X,XXX.XX | +# | South | $X,XXX.XX | +# | West | $X,XXX.XX | +# +# User: Save a markdown report named region_totals.md ... +# Assistant: I wrote region_totals.md to the working folder. +# +# User: List the files again so I can confirm region_totals.md was created. +# Assistant: The working folder now contains: region_totals.md, sales.csv. +# +# Final contents of working/: +# - region_totals.md (NNN bytes) +# - sales.csv (3175 bytes) diff --git a/python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv b/python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv new file mode 100644 index 0000000000..50a2369942 --- /dev/null +++ b/python/samples/02-agents/context_providers/file_access_data_processing/working/sales.csv @@ -0,0 +1,50 @@ +date,product,category,quantity,unit_price,region,salesperson +2025-01-03,Laptop Pro 15,Electronics,2,1299.99,North,Alice +2025-01-05,Ergonomic Chair,Furniture,5,349.50,South,Bob +2025-01-07,Wireless Mouse,Electronics,12,24.99,North,Alice +2025-01-08,Standing Desk,Furniture,1,599.00,West,Carol +2025-01-10,USB-C Hub,Electronics,8,45.99,North,David +2025-01-12,Monitor 27in,Electronics,3,429.00,South,Bob +2025-01-14,Desk Lamp,Furniture,6,79.95,West,Carol +2025-01-15,Keyboard Mech,Electronics,4,149.99,North,Alice +2025-01-17,Filing Cabinet,Furniture,2,189.00,South,David +2025-01-20,Webcam HD,Electronics,10,89.99,West,Bob +2025-01-22,Laptop Pro 15,Electronics,1,1299.99,South,Carol +2025-01-24,Ergonomic Chair,Furniture,3,349.50,North,Alice +2025-01-25,Notebook Pack,Stationery,20,12.99,South,David +2025-01-27,Wireless Mouse,Electronics,15,24.99,West,Carol +2025-01-28,Whiteboard,Stationery,4,129.00,North,Bob +2025-01-30,Standing Desk,Furniture,2,599.00,South,Alice +2025-02-02,USB-C Hub,Electronics,6,45.99,West,David +2025-02-04,Monitor 27in,Electronics,2,429.00,North,Carol +2025-02-05,Desk Lamp,Furniture,8,79.95,South,Bob +2025-02-07,Keyboard Mech,Electronics,5,149.99,West,Alice +2025-02-09,Filing Cabinet,Furniture,1,189.00,North,David +2025-02-11,Webcam HD,Electronics,7,89.99,South,Carol +2025-02-13,Laptop Pro 15,Electronics,3,1299.99,West,Bob +2025-02-15,Notebook Pack,Stationery,30,12.99,North,Alice +2025-02-17,Ergonomic Chair,Furniture,4,349.50,South,David +2025-02-19,Wireless Mouse,Electronics,20,24.99,North,Carol +2025-02-20,Whiteboard,Stationery,2,129.00,West,Bob +2025-02-22,Standing Desk,Furniture,1,599.00,North,Alice +2025-02-24,USB-C Hub,Electronics,10,45.99,South,David +2025-02-26,Monitor 27in,Electronics,4,429.00,West,Carol +2025-02-28,Desk Lamp,Furniture,3,79.95,North,Bob +2025-03-02,Keyboard Mech,Electronics,6,149.99,South,Alice +2025-03-04,Filing Cabinet,Furniture,3,189.00,West,David +2025-03-06,Webcam HD,Electronics,9,89.99,North,Carol +2025-03-08,Laptop Pro 15,Electronics,2,1299.99,South,Bob +2025-03-10,Notebook Pack,Stationery,25,12.99,West,Alice +2025-03-12,Ergonomic Chair,Furniture,6,349.50,North,David +2025-03-14,Wireless Mouse,Electronics,18,24.99,South,Carol +2025-03-15,Whiteboard,Stationery,5,129.00,North,Bob +2025-03-17,Standing Desk,Furniture,3,599.00,West,Alice +2025-03-19,USB-C Hub,Electronics,7,45.99,North,David +2025-03-21,Monitor 27in,Electronics,5,429.00,South,Carol +2025-03-23,Desk Lamp,Furniture,4,79.95,West,Bob +2025-03-25,Keyboard Mech,Electronics,3,149.99,North,Alice +2025-03-27,Filing Cabinet,Furniture,2,189.00,South,David +2025-03-28,Webcam HD,Electronics,11,89.99,West,Carol +2025-03-29,Laptop Pro 15,Electronics,1,1299.99,North,Bob +2025-03-30,Notebook Pack,Stationery,15,12.99,South,Alice +2025-03-31,Ergonomic Chair,Furniture,2,349.50,West,David diff --git a/python/scripts/task_runner.py b/python/scripts/task_runner.py index 617b4d58b0..8bd038359a 100644 --- a/python/scripts/task_runner.py +++ b/python/scripts/task_runner.py @@ -8,6 +8,7 @@ filters the same way. """ import concurrent.futures +import contextlib import glob import os import subprocess @@ -17,6 +18,16 @@ from collections.abc import Sequence from fnmatch import fnmatch from pathlib import Path +# On Windows, stdout defaults to cp1252 under non-interactive callers (e.g. +# prek / pre-commit hooks). Reconfigure to UTF-8 before importing rich so +# unicode glyphs like ``\u2713`` don't raise ``UnicodeEncodeError``. +if sys.platform == "win32": + for _stream in (sys.stdout, sys.stderr): + reconfigure = getattr(_stream, "reconfigure", None) + if callable(reconfigure): + with contextlib.suppress(OSError, ValueError): + reconfigure(encoding="utf-8") + import tomli from rich import print @@ -122,8 +133,7 @@ def project_filter_matches(project: Path | str, pattern: str, aliases: Sequence[ """ normalized_pattern = normalize_project_filter(pattern).lower() return any( - fnmatch(candidate, normalized_pattern) - for candidate in build_project_filter_candidates(project, aliases) + fnmatch(candidate, normalized_pattern) for candidate in build_project_filter_candidates(project, aliases) )