Python: Parse YAML block scalars in SKILL.md frontmatter (#5863)

The frontmatter parser previously matched only single-line `key: value` pairs, so block scalar indicators (`|` literal, `>` folded, with chomping `-`/`+`) were silently truncated to the indicator character. Multi-line descriptions like `description: >\n  ...` lost their content.

Add `_parse_yaml_scalar_value()` which detects block scalar indicators, collects indented continuation lines, strips the common leading indentation, joins per scalar style (newlines for `|`, spaces for `>`), and applies chomping per the YAML 1.2 spec. Update `_extract_frontmatter()` to use the helper for unquoted values.

Adds 15 unit tests covering literal/folded styles, all chomping variants, indentation handling, content containing colons, non-description fields, tab indentation, blank-line preservation, and a regression test for plain values.

Fixes #5713.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
SergeyMenshykh
2026-05-15 10:47:00 +01:00
committed by GitHub
Unverified
parent ad95f2f2fa
commit 19b2367366
2 changed files with 236 additions and 10 deletions
@@ -1513,6 +1513,97 @@ YAML_INDENTED_KV_RE = re.compile(
# must not start or end with a hyphen, and must not contain consecutive hyphens.
VALID_NAME_RE = re.compile(r"^[a-z0-9]([a-z0-9]*-[a-z0-9])*[a-z0-9]*$")
# Block scalar indicator characters recognised by the lightweight YAML parser.
_BLOCK_SCALAR_INDICATORS = ("|", ">")

# A well-formed block scalar header: the style indicator, an optional chomping
# indicator and/or indentation digit (in either order), and an optional
# trailing comment. Anything else (e.g. ``>= 3 items``) is a plain value that
# merely happens to start with an indicator character.
_BLOCK_SCALAR_HEADER_RE = re.compile(r"[|>](?P<mods>[+-]?[1-9]?|[1-9][+-]?)(?:[ \t]+#.*)?")


def _fold_block_lines(lines: list[str]) -> str:
    """Join de-indented block lines using YAML folded-scalar line folding.

    Adjacent non-empty lines are joined with a single space. A run of ``k``
    blank lines is rendered as ``k`` newline characters, so paragraph breaks
    survive folding.
    """
    pieces: list[str] = []
    pending_blank = 0
    for line in lines:
        if not line:
            pending_blank += 1
            continue
        if pieces or pending_blank:
            pieces.append("\n" * pending_blank if pending_blank else " ")
        pieces.append(line)
        pending_blank = 0
    return "".join(pieces)


def _parse_yaml_scalar_value(yaml_content: str, kv_match: re.Match[str]) -> str:
    """Resolve the scalar value for an unquoted YAML key-value match.

    If the captured value is a YAML block scalar header (``|`` or ``>``,
    optionally followed by a chomping indicator and/or a trailing comment),
    the function reads the subsequent indented continuation lines, strips
    their common leading indentation, and joins them according to the scalar
    style:

    * ``|`` (literal) keeps line breaks as written.
    * ``>`` (folded) joins adjacent lines with a space and turns each blank
      line into a newline. Special folding of more-indented lines is not
      implemented.

    Chomping indicators control the end of the value:

    * ``-`` (strip): no final line break, trailing blank lines dropped.
    * ``+`` (keep): final line break and trailing blank lines preserved.
    * default (clip): trailing blank lines dropped; a literal scalar keeps
      its final line break, whereas this parser omits it for folded scalars.

    Explicit indentation indicators (e.g. ``|2``) are accepted in the header
    but ignored; indentation is always auto-detected from the common leading
    whitespace (spaces or tabs) of the non-blank lines.

    Args:
        yaml_content: The full frontmatter text that ``kv_match`` was found in.
        kv_match: A key-value match whose group 3 holds the unquoted value.

    Returns:
        The captured text unchanged for plain values (including values that
        only start with ``|`` or ``>`` without being a valid header), the
        resolved block content for block scalars, or ``""`` when a block
        scalar has no content lines.
    """
    value: str = kv_match.group(3)
    if not value or value[0] not in _BLOCK_SCALAR_INDICATORS:
        return value

    header = _BLOCK_SCALAR_HEADER_RE.fullmatch(value.rstrip())
    if header is None:
        # Starts with an indicator character but is not a block scalar header;
        # treat it as ordinary text instead of swallowing the following lines.
        return value

    scalar_style = value[0]
    modifiers = header.group("mods")
    keep_trailing = "+" in modifiers
    strip_trailing = "-" in modifiers

    # Locate the end of the header line; the block starts on the next line.
    header_end = yaml_content.find("\n", kv_match.end())
    if header_end < 0:
        # The header is the last line of the document: an empty block scalar,
        # not the literal indicator text.
        return ""

    candidates = yaml_content[header_end + 1 :].split("\n")
    if candidates[-1] == "":
        # A terminating newline does not introduce an extra blank line.
        candidates.pop()

    # Collect indented continuation lines; blank lines belong to the block.
    block_lines: list[str] = []
    for raw in candidates:
        if not raw or raw.isspace():
            block_lines.append("")
        elif raw[0] in (" ", "\t"):
            block_lines.append(raw)
        else:
            # Non-indented, non-blank line: the next key ends the block.
            break

    # Trailing blank lines are not content, but keep chomping re-emits them.
    trailing_blank_count = 0
    while block_lines and not block_lines[-1]:
        block_lines.pop()
        trailing_blank_count += 1
    if not block_lines:
        return ""

    # Only space/tab characters count as indentation.
    common_indent = min(len(line) - len(line.lstrip(" \t")) for line in block_lines if line)
    normalized = [line[common_indent:] if line else "" for line in block_lines]

    parsed = "\n".join(normalized) if scalar_style == "|" else _fold_block_lines(normalized)

    if keep_trailing:
        return parsed + "\n" * (1 + trailing_blank_count)
    if strip_trailing:
        return parsed
    # Clip (default): literal gets a trailing newline, folded does not.
    return parsed + "\n" if scalar_style == "|" else parsed
# Default system prompt template for advertising available skills to the model.
# Use {skills} as the placeholder for the generated skills XML list.
@@ -2879,7 +2970,9 @@ class FileSkillsSource(SkillsSource):
for kv_match in YAML_KV_RE.finditer(yaml_content):
key = kv_match.group(1)
value = kv_match.group(2) if kv_match.group(2) is not None else kv_match.group(3)
value = (
kv_match.group(2) if kv_match.group(2) is not None else _parse_yaml_scalar_value(yaml_content, kv_match)
)
key_lower = key.lower()
if key_lower == "name":
+142 -9
View File
@@ -319,9 +319,7 @@ class TestDiscoverResourceFiles:
refs = skill_dir / "references"
refs.mkdir(parents=True)
(refs / "doc.md").write_text("content", encoding="utf-8")
resources = FileSkillsSource._discover_resource_files(
str(skill_dir), directories=("references", "references")
)
resources = FileSkillsSource._discover_resource_files(str(skill_dir), directories=("references", "references"))
assert resources == ["references/doc.md"]
def test_results_are_sorted(self, tmp_path: Path) -> None:
@@ -1675,9 +1673,7 @@ class TestValidateAndNormalizeDirectoryNames:
FileSkillsSource._validate_and_normalize_directory_names([" "])
def test_multiple_directories(self) -> None:
result = FileSkillsSource._validate_and_normalize_directory_names(
[".", "references", "assets", "scripts"]
)
result = FileSkillsSource._validate_and_normalize_directory_names([".", "references", "assets", "scripts"])
assert result == [".", "references", "assets", "scripts"]
def test_default_resource_directories(self) -> None:
@@ -2163,6 +2159,145 @@ class TestExtractFrontmatterEdgeCases:
assert result.description == desc
# ---------------------------------------------------------------------------
# Tests: _extract_frontmatter block scalar parsing
# ---------------------------------------------------------------------------
class TestExtractFrontmatterBlockScalars:
    """Exercise ``_extract_frontmatter`` on YAML block scalars (``|`` and ``>``)."""

    def test_literal_block_scalar(self) -> None:
        """A literal block keeps its line breaks and gains a final newline."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |\n"
            " Line one\n"
            " Line two\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Line one\nLine two\n"

    def test_folded_block_scalar(self) -> None:
        """A folded block joins its lines with single spaces."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: >\n"
            " This is a multi-line\n"
            " description block\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "This is a multi-line description block"

    def test_literal_strip_chomping(self) -> None:
        """``|-`` removes the final newline of a literal block."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |-\n"
            " No trailing newline\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "No trailing newline"

    def test_folded_strip_chomping(self) -> None:
        """``>-`` folds the lines and leaves no trailing newline."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: >-\n"
            " Folded with\n"
            " strip chomping\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Folded with strip chomping"

    def test_literal_keep_chomping(self) -> None:
        """``|+`` retains the final newline of a literal block."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |+\n"
            " Keep trailing\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Keep trailing\n"

    def test_folded_keep_chomping(self) -> None:
        """``>+`` folds the lines and retains a final newline."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: >+\n"
            " Keep trailing\n"
            " newline\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Keep trailing newline\n"

    def test_block_scalar_no_continuation_lines(self) -> None:
        """A block header with no indented lines yields an empty description."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |\n"
            "license: MIT\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        # An empty description is rejected by validation, so nothing is returned.
        assert frontmatter is None

    def test_block_scalar_varying_indentation(self) -> None:
        """The shared leading indentation is removed from every block line."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |\n"
            " Line with 4-space indent\n"
            " Line with 4-space indent\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Line with 4-space indent\nLine with 4-space indent\n"

    def test_folded_block_scalar_real_skill_format(self) -> None:
        """End-to-end check mirroring the layout of .github/skills/ SKILL.md files."""
        markdown = (
            "---\n"
            "name: python-development\n"
            "description: >\n"
            " Coding standards, conventions, and patterns for developing Python code in the\n"
            " Agent Framework repository. Use this when writing or modifying Python source\n"
            " files in the python/ directory.\n"
            "---\n"
            "\n"
            "# Python Development Standards\n"
        )
        expected = (
            "Coding standards, conventions, and patterns for developing Python code in the "
            "Agent Framework repository. Use this when writing or modifying Python source "
            "files in the python/ directory."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == expected

    def test_block_scalar_with_other_fields_after(self) -> None:
        """A key following the block ends it and is still parsed normally."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: >\n"
            " A folded\n"
            " description\n"
            "license: MIT\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "A folded description"
        assert frontmatter.license == "MIT"

    def test_plain_value_unchanged(self) -> None:
        """Regression guard: plain single-line values bypass block scalar handling."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: A simple description.\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "A simple description."

    def test_block_scalar_content_with_colons(self) -> None:
        """Block lines that resemble ``key: value`` pairs are kept verbatim."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |\n"
            " Some text with colon: in it\n"
            " Another: line here\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Some text with colon: in it\nAnother: line here\n"

    def test_block_scalar_on_license_field(self) -> None:
        """Block scalars are honoured on fields other than description."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: A skill.\n"
            "license: >\n"
            " Custom license\n"
            " spanning multiple lines\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.license == "Custom license spanning multiple lines"

    def test_block_scalar_tab_indentation(self) -> None:
        """Tab-indented continuation lines are accepted as block content."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |\n"
            "\tTab-indented line one\n"
            "\tTab-indented line two\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "Tab-indented line one\nTab-indented line two\n"

    def test_block_scalar_blank_line_within_block(self) -> None:
        """A blank line inside a literal block survives as a paragraph break."""
        markdown = (
            "---\n"
            "name: test-skill\n"
            "description: |\n"
            " First paragraph\n"
            "\n"
            " Second paragraph\n"
            "---\n"
            "Body."
        )
        frontmatter = FileSkillsSource._extract_frontmatter(markdown, "test.md")
        assert frontmatter is not None
        assert frontmatter.description == "First paragraph\n\nSecond paragraph\n"
# ---------------------------------------------------------------------------
# Tests: Skill spec fields (via SkillFrontmatter)
# ---------------------------------------------------------------------------
@@ -5498,9 +5633,7 @@ class TestArrayStyleScriptArgs:
return "ok"
assert isinstance(my_runner, SkillScriptRunner)
skill = FileSkill(
frontmatter=SkillFrontmatter(name="s", description="d"), content="c", path=f"{_ABS}/test"
)
skill = FileSkill(frontmatter=SkillFrontmatter(name="s", description="d"), content="c", path=f"{_ABS}/test")
script = FileSkillScript(name="run.py", full_path=f"{_ABS}/test/run.py")
result = my_runner(skill, script, args=["--flag", "value"])
assert result == "ok"