diff --git a/codex-rs/utils/path-uri/src/api_path_string.rs b/codex-rs/utils/path-uri/src/api_path_string.rs new file mode 100644 index 000000000..a80fbdae9 --- /dev/null +++ b/codex-rs/utils/path-uri/src/api_path_string.rs @@ -0,0 +1,399 @@ +use crate::PathUri; +use codex_utils_absolute_path::AbsolutePathBuf; +use schemars::JsonSchema; +use serde::Deserialize; +use serde::Serialize; +use serde::Serializer; +use std::fmt; +use thiserror::Error; +use ts_rs::TS; + +/// A UTF-8 path for preserving raw path compatibility at the app-server API +/// boundary while Codex migrates to [`PathUri`]. +/// +/// Supports storing arbitrary strings read from the API and converting to and +/// from [`PathUri`] using an explicitly selected native path convention. +/// +/// When converting from [`PathUri`], "native" refers to the supplied +/// [`PathConvention`], which may be foreign to the operating system running +/// this process. The inner string is private so path-producing code must use +/// [`Self::from_abs_path`] or [`Self::from_path_uri`] instead of bypassing the +/// intended conversion boundary. Non-UTF-8 paths are converted to UTF-8 +/// lossily because this API value is serialized as a JSON string. +/// +/// Deserialization accepts any UTF-8 string without interpreting or validating +/// it. That unrestricted construction path is intentionally available only to +/// serde: Codex-internal code cannot construct this type directly from a raw +/// `String` and is instead encouraged to convert through [`PathUri`] or +/// [`AbsolutePathBuf`]. Relative path text remains valid until an operation +/// such as [`Self::to_path_uri`] requires an absolute path. +#[derive(Clone, Debug, PartialEq, Eq, Hash, Deserialize, TS)] +#[serde(transparent)] +#[ts(type = "string")] +pub struct ApiPathString(String); + +impl ApiPathString { + /// Renders an absolute path using the current host's path convention. + pub fn from_abs_path(path: &AbsolutePathBuf) -> Self { + Self(path.to_string_lossy().into_owned()) + } + + /// Renders a path URI using the requested native path convention. + /// + /// Rendering fails when the URI shape does not match the convention, such + /// as a POSIX path rendered as Windows or a UNC path rendered as POSIX. It + /// also fails when an opaque fallback does not encode an absolute path for + /// the convention. Non-UTF-8 segments are rendered lossily, and encoded + /// separators are emitted as native path text. + pub fn from_path_uri( + path: &PathUri, + convention: PathConvention, + ) -> Result { + if let Some(path_bytes) = path.opaque_fallback_bytes() { + return render_opaque_fallback(path, &path_bytes, convention).map(Self); + } + match convention { + PathConvention::Posix => render_posix_path(path), + PathConvention::Windows => render_windows_path(path), + } + .map(Self) + } + + /// Parses this API string as an absolute path using the requested native + /// path convention and returns its canonical path URI. + pub fn to_path_uri(&self, convention: PathConvention) -> Result { + let path = match convention { + PathConvention::Posix => parse_posix_path(&self.0), + PathConvention::Windows => parse_windows_path(&self.0), + }; + path.ok_or_else(|| ApiPathStringError::InvalidNativePath { + path: self.0.clone(), + convention, + }) + } + + /// Infers the path convention of an absolute API path from its spelling. + /// + /// Relative paths and ambiguous spellings return `None`. In particular, + /// slash-prefixed paths are treated as POSIX even when they could also be + /// interpreted as slash-delimited Windows UNC paths. + pub fn infer_absolute_path_convention(&self) -> Option { + let bytes = self.0.as_bytes(); + let has_windows_drive_root = matches!( + bytes, + [drive, b':', separator, ..] + if drive.is_ascii_alphabetic() && is_windows_separator_byte(*separator) + ); + if has_windows_drive_root || self.0.starts_with(r"\\") { + Some(PathConvention::Windows) + } else if self.0.starts_with('/') { + Some(PathConvention::Posix) + } else { + None + } + } + + pub fn as_str(&self) -> &str { + &self.0 + } + + pub fn into_string(self) -> String { + self.0 + } +} + +fn parse_posix_path(path: &str) -> Option { + let path = path.strip_prefix('/')?; + if path.contains('\0') { + return Some(PathUri::from_opaque_path_bytes( + format!("/{path}").as_bytes(), + )); + } + path_uri_from_segments(/*host*/ None, path.split('/')) +} + +fn parse_windows_path(path: &str) -> Option { + let bytes = path.as_bytes(); + let uses_namespace = matches!( + bytes, + [first, second, namespace @ (b'.' | b'?'), separator, ..] + if is_windows_separator_byte(*first) + && is_windows_separator_byte(*second) + && is_windows_separator_byte(*separator) + && matches!(*namespace, b'.' | b'?') + ); + if uses_namespace || path.contains('\0') { + return Some(windows_opaque_path_uri(path)); + } + + if matches!( + bytes, + [drive, b':', separator, ..] + if drive.is_ascii_alphabetic() && is_windows_separator_byte(*separator) + ) { + return path_uri_from_segments( + /*host*/ None, + std::iter::once(&path[..2]).chain(path[3..].split(is_windows_separator_char)), + ); + } + + if matches!(bytes, [first, second, ..] + if is_windows_separator_byte(*first) && is_windows_separator_byte(*second)) + { + let mut components = path[2..].split(is_windows_separator_char); + let host = components.next().filter(|host| !host.is_empty())?; + let share = components.next().filter(|share| !share.is_empty())?; + return path_uri_from_segments(Some(host), std::iter::once(share).chain(components)) + .or_else(|| Some(windows_opaque_path_uri(path))); + } + + None +} + +fn path_uri_from_segments<'a>( + host: Option<&str>, + segments: impl Iterator, +) -> Option { + let mut url = url::Url::parse("file:///").ok()?; + if let Some(host) = host { + url.set_host(Some(host)).ok()?; + } + { + let mut url_segments = url.path_segments_mut().ok()?; + url_segments.clear(); + for segment in segments { + url_segments.push(segment); + } + } + PathUri::try_from(url).ok() +} + +fn windows_opaque_path_uri(path: &str) -> PathUri { + let path_bytes = path + .encode_utf16() + .flat_map(u16::to_le_bytes) + .collect::>(); + PathUri::from_opaque_path_bytes(&path_bytes) +} + +fn is_windows_separator_char(character: char) -> bool { + matches!(character, '\\' | '/') +} + +fn is_windows_separator_byte(character: u8) -> bool { + matches!(character, b'\\' | b'/') +} + +fn render_opaque_fallback( + path: &PathUri, + path_bytes: &[u8], + convention: PathConvention, +) -> Result { + let rendered = match convention { + PathConvention::Posix if path_bytes.starts_with(b"/") => { + Some(String::from_utf8_lossy(path_bytes).into_owned()) + } + PathConvention::Windows => render_windows_opaque_fallback(path_bytes), + PathConvention::Posix => None, + }; + rendered.ok_or_else(|| ApiPathStringError::OpaqueFallback { + path: path.to_string(), + }) +} + +fn render_windows_opaque_fallback(path_bytes: &[u8]) -> Option { + if !path_bytes.len().is_multiple_of(2) { + return None; + } + let path_wide = path_bytes + .chunks_exact(2) + .map(|bytes| u16::from_le_bytes([bytes[0], bytes[1]])) + .collect::>(); + + // Windows absolute paths either have a rooted drive prefix (`C:\\`) or a + // rooted namespace/UNC prefix (`\\server`, `\\.\\`, or `\\?\\`). + let has_drive_root = matches!( + path_wide.as_slice(), + [drive, colon, separator, ..] + if ((u16::from(b'A')..=u16::from(b'Z')).contains(drive) + || (u16::from(b'a')..=u16::from(b'z')).contains(drive)) + && *colon == u16::from(b':') + && is_windows_separator(*separator) + ); + let has_namespace_or_unc_root = matches!( + path_wide.as_slice(), + [first, second, ..] + if is_windows_separator(*first) && is_windows_separator(*second) + ); + (has_drive_root || has_namespace_or_unc_root).then(|| String::from_utf16_lossy(&path_wide)) +} + +fn is_windows_separator(character: u16) -> bool { + character == u16::from(b'\\') || character == u16::from(b'/') +} + +impl fmt::Display for ApiPathString { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl Serialize for ApiPathString { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&self.0) + } +} + +impl JsonSchema for ApiPathString { + fn schema_name() -> String { + "ApiPathString".to_string() + } + + fn json_schema(generator: &mut schemars::r#gen::SchemaGenerator) -> schemars::schema::Schema { + String::json_schema(generator) + } +} + +fn render_posix_path(path: &PathUri) -> Result { + let url = path.to_url(); + // POSIX file paths do not have a UNC authority, so `file://server/share` + // cannot be represented as `/share` without losing the server identity. + if url.host_str().is_some() { + return Err(incompatible_convention(path, PathConvention::Posix)); + } + + // URI segments are already separated with `/` on every host. Decode each + // one independently so `file:///a%20dir/file` becomes `/a dir/file`. + let mut rendered = String::new(); + for segment in path_segments(&url) { + rendered.push('/'); + rendered.push_str(&decode_native_segment(segment)); + } + Ok(rendered) +} + +fn render_windows_path(path: &PathUri) -> Result { + let url = path.to_url(); + let mut segments = path_segments(&url); + let mut rendered = String::new(); + if let Some(host) = url.host_str() { + // A URI authority selects the UNC form: `file://server/share/file` + // becomes `\\server\share\file`. The first segment is the share name, + // which must be present. + let Some(share) = segments.next() else { + return Err(incompatible_convention(path, PathConvention::Windows)); + }; + let share = decode_native_segment(share); + if share.is_empty() { + return Err(incompatible_convention(path, PathConvention::Windows)); + } + rendered.push_str(r"\\"); + rendered.push_str(host); + rendered.push('\\'); + rendered.push_str(&share); + } else { + // Without an authority, Windows requires a drive root. For example, + // `file:///C:/src/main.rs` begins with the `C:` URI segment and renders + // as `C:\src\main.rs`; a POSIX URI such as `file:///usr/bin` is rejected. + let Some(drive) = segments.next() else { + return Err(incompatible_convention(path, PathConvention::Windows)); + }; + let drive = decode_native_segment(drive); + let bytes = drive.as_bytes(); + if bytes.len() != 2 || !bytes[0].is_ascii_alphabetic() || bytes[1] != b':' { + return Err(incompatible_convention(path, PathConvention::Windows)); + } + rendered.push_str(&drive); + } + + for segment in segments { + // URL path separators become Windows separators after each component + // has been decoded. + let segment = decode_native_segment(segment); + rendered.push('\\'); + rendered.push_str(&segment); + } + // `file:///C:` and `file:///C:/` both identify the drive root, never the + // drive-relative path `C:`. + if rendered.len() == 2 && rendered.as_bytes()[1] == b':' { + rendered.push('\\'); + } + Ok(rendered) +} + +fn path_segments(url: &url::Url) -> std::str::Split<'_, char> { + url.path_segments() + .unwrap_or_else(|| unreachable!("validated file URLs have path segments")) +} + +fn decode_native_segment(segment: &str) -> String { + // Decode exactly once. Thus `%20` becomes a space and `%252F` becomes the + // literal text `%2F`, rather than being decoded a second time into `/`. + let bytes = urlencoding::decode_binary(segment.as_bytes()); + String::from_utf8_lossy(&bytes).into_owned() +} + +fn incompatible_convention(path: &PathUri, convention: PathConvention) -> ApiPathStringError { + ApiPathStringError::IncompatibleConvention { + path: path.to_string(), + convention, + } +} + +#[derive(Debug, Error, PartialEq, Eq)] +pub enum ApiPathStringError { + #[error("opaque fallback path URI `{path}` cannot be recovered as a native path")] + OpaqueFallback { path: String }, + #[error("path URI `{path}` cannot be rendered using {convention} path syntax")] + IncompatibleConvention { + path: String, + convention: PathConvention, + }, + #[error("path `{path}` is not absolute using {convention} path syntax")] + InvalidNativePath { + path: String, + convention: PathConvention, + }, +} + +/// Path syntax used to render a [`PathUri`] as an operating-system path. +/// +/// This describes path grammar rather than a specific operating system because +/// Linux and macOS share the POSIX representation relevant here. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema, TS)] +#[serde(rename_all = "snake_case")] +#[ts(rename_all = "snake_case")] +pub enum PathConvention { + Posix, + Windows, +} + +impl PathConvention { + /// Returns the path convention used by the current process. + #[cfg(windows)] + pub const fn native() -> Self { + Self::Windows + } + + /// Returns the path convention used by the current process. + #[cfg(unix)] + pub const fn native() -> Self { + Self::Posix + } +} + +impl fmt::Display for PathConvention { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Posix => f.write_str("POSIX"), + Self::Windows => f.write_str("Windows"), + } + } +} + +#[cfg(test)] +#[path = "api_path_string_tests.rs"] +mod tests; diff --git a/codex-rs/utils/path-uri/src/api_path_string_tests.rs b/codex-rs/utils/path-uri/src/api_path_string_tests.rs new file mode 100644 index 000000000..8b849f178 --- /dev/null +++ b/codex-rs/utils/path-uri/src/api_path_string_tests.rs @@ -0,0 +1,498 @@ +use super::*; +use crate::PathUri; +use codex_utils_absolute_path::AbsolutePathBuf; +use pretty_assertions::assert_eq; + +#[derive(Clone, Copy, Debug)] +struct RenderCase { + uri: &'static str, + convention: PathConvention, + expected: RenderExpectation, +} + +impl RenderCase { + const fn round_trips( + uri: &'static str, + convention: PathConvention, + rendered: &'static str, + ) -> Self { + Self { + uri, + convention, + expected: RenderExpectation::RoundTrip(rendered), + } + } + + const fn rejects(uri: &'static str, convention: PathConvention, error: ExpectedError) -> Self { + Self { + uri, + convention, + expected: RenderExpectation::Error(error), + } + } + + const fn renders_lossily( + uri: &'static str, + convention: PathConvention, + rendered: &'static str, + ) -> Self { + Self { + uri, + convention, + expected: RenderExpectation::RenderOnly(rendered), + } + } +} + +#[derive(Clone, Copy, Debug)] +enum RenderExpectation { + RoundTrip(&'static str), + RenderOnly(&'static str), + Error(ExpectedError), +} + +#[derive(Clone, Copy, Debug)] +enum ExpectedError { + OpaqueFallback, + IncompatibleConvention, +} + +const RENDER_CASES: &[RenderCase] = &[ + // POSIX paths. + RenderCase::round_trips("file:///", PathConvention::Posix, "/"), + RenderCase::round_trips( + "file:///home/alice/src/main.rs", + PathConvention::Posix, + "/home/alice/src/main.rs", + ), + RenderCase::round_trips( + "file:///home/alice/a%20file.rs", + PathConvention::Posix, + "/home/alice/a file.rs", + ), + RenderCase::round_trips( + "file:///workspace/src/lib.rs", + PathConvention::Posix, + "/workspace/src/lib.rs", + ), + RenderCase::round_trips( + "file:///workspace/tests/test.rs", + PathConvention::Posix, + "/workspace/tests/test.rs", + ), + RenderCase::round_trips("file:///etc", PathConvention::Posix, "/etc"), + RenderCase::round_trips("file:///tmp/", PathConvention::Posix, "/tmp/"), + RenderCase::round_trips("file:///C:/Project", PathConvention::Posix, "/C:/Project"), + RenderCase::round_trips("file:///C:", PathConvention::Posix, "/C:"), + RenderCase::round_trips("file:///tmp/%E2%98%83", PathConvention::Posix, "/tmp/☃"), + RenderCase::round_trips("file:///tmp/a%5Cb", PathConvention::Posix, "/tmp/a\\b"), + RenderCase::round_trips( + "file:///tmp/100%25/file", + PathConvention::Posix, + "/tmp/100%/file", + ), + RenderCase::round_trips( + "file:///tmp/a%3Fb%23c%25d", + PathConvention::Posix, + "/tmp/a?b#c%d", + ), + RenderCase::round_trips("file:///tmp/a%252Fb", PathConvention::Posix, "/tmp/a%2Fb"), + RenderCase::round_trips( + "file:///bad/path/L3RtcC9udWxsLQAt_y1ieXRl", + PathConvention::Posix, + "/bad/path/L3RtcC9udWxsLQAt_y1ieXRl", + ), + RenderCase::round_trips( + "FILE:///workspace/src", + PathConvention::Posix, + "/workspace/src", + ), + RenderCase::round_trips( + "file:/workspace/src", + PathConvention::Posix, + "/workspace/src", + ), + RenderCase::round_trips( + "file://localhost/workspace/src", + PathConvention::Posix, + "/workspace/src", + ), + RenderCase::round_trips( + "file://LOCALHOST/workspace/src", + PathConvention::Posix, + "/workspace/src", + ), + // Windows drive paths. + RenderCase::round_trips( + "file:///C:/Users/Alice%20Smith/src/main.rs", + PathConvention::Windows, + r"C:\Users\Alice Smith\src\main.rs", + ), + RenderCase::round_trips("file:///C:/", PathConvention::Windows, "C:\\"), + RenderCase::renders_lossily("file:///C:", PathConvention::Windows, "C:\\"), + RenderCase::round_trips("file:///C:/Users", PathConvention::Windows, r"C:\Users"), + RenderCase::round_trips("file:///C:/Windows", PathConvention::Windows, r"C:\Windows"), + RenderCase::round_trips( + "file:///d:/snowman/%E2%98%83", + PathConvention::Windows, + r"d:\snowman\☃", + ), + RenderCase::round_trips("file:///C:/tmp/", PathConvention::Windows, "C:\\tmp\\"), + RenderCase::round_trips( + "file:///C:/test%20with%20%25/path", + PathConvention::Windows, + r"C:\test with %\path", + ), + RenderCase::round_trips( + "file:///C:/test%20with%20%2525/c%23code", + PathConvention::Windows, + r"C:\test with %25\c#code", + ), + RenderCase::round_trips( + "file:///C:/Source/Z%C3%BCrich%20or%20Zurich%20(%CB%88zj%CA%8A%C9%99r%C9%AAk,/Code/resources/app/plugins/c%23/plugin.json", + PathConvention::Windows, + r"C:\Source\Zürich or Zurich (ˈzjʊərɪk,\Code\resources\app\plugins\c#\plugin.json", + ), + RenderCase::round_trips( + "file:///C:/project/owner's_file/database.sqlite", + PathConvention::Windows, + r"C:\project\owner's_file\database.sqlite", + ), + RenderCase::round_trips( + "file:///C:/project/%25A0.txt", + PathConvention::Windows, + r"C:\project\%A0.txt", + ), + RenderCase::round_trips( + "file:///C:/project/%252e.txt", + PathConvention::Windows, + r"C:\project\%2e.txt", + ), + // Windows UNC paths. + RenderCase::round_trips( + "file://server/share/src/main.rs", + PathConvention::Windows, + r"\\server\share\src\main.rs", + ), + RenderCase::round_trips( + "file://server/share", + PathConvention::Windows, + r"\\server\share", + ), + RenderCase::round_trips( + "file://server/share/", + PathConvention::Windows, + "\\\\server\\share\\", + ), + RenderCase::round_trips( + "file://shares/files/c%23/p.cs", + PathConvention::Windows, + r"\\shares\files\c#\p.cs", + ), + RenderCase::round_trips( + "file://monacotools1/certificates/SSL/", + PathConvention::Windows, + "\\\\monacotools1\\certificates\\SSL\\", + ), + // Opaque fallbacks rendered according to their source convention. + RenderCase::renders_lossily( + "file:///%00/bad/path/L3RtcC9udWxsLQAt_y1ieXRl", + PathConvention::Posix, + "/tmp/null-\0-�-byte", + ), + RenderCase::round_trips( + "file:///%00/bad/path/XABcAC4AXABDAE8ATQAxAFwA", + PathConvention::Windows, + r"\\.\COM1\", + ), + RenderCase::round_trips( + "file:///%00/bad/path/XABcAD8AXABWAG8AbAB1AG0AZQB7ADAAMAAwADAAMAAwADAAMAAtADAAMAAwADAALQAwADAAMAAwAC0AMAAwADAAMAAtADAAMAAwADAAMAAwADAAMAAwADAAMAAwAH0AXABmAGkAbABlAC4AcgBzAA", + PathConvention::Windows, + r"\\?\Volume{00000000-0000-0000-0000-000000000000}\file.rs", + ), + // Windows rendering preserves path text without filesystem validation. + RenderCase::round_trips("file:///C:/a%3Fb", PathConvention::Windows, "C:\\a?b"), + RenderCase::round_trips("file:///C:/a*b", PathConvention::Windows, "C:\\a*b"), + RenderCase::round_trips( + "file:///C:/trailing.", + PathConvention::Windows, + "C:\\trailing.", + ), + RenderCase::round_trips( + "file:///C:/trailing%20", + PathConvention::Windows, + "C:\\trailing ", + ), + RenderCase::round_trips( + "file:///C:/control-%01", + PathConvention::Windows, + "C:\\control-\u{1}", + ), + RenderCase::round_trips( + "file:///C:/file.txt:stream", + PathConvention::Windows, + "C:\\file.txt:stream", + ), + RenderCase::round_trips( + "file://server/sh%3Fare/file.rs", + PathConvention::Windows, + "\\\\server\\sh?are\\file.rs", + ), + // These renderings intentionally lose URI byte or segment boundaries. + RenderCase::renders_lossily( + "file:///tmp/non-utf8-%FF", + PathConvention::Posix, + "/tmp/non-utf8-�", + ), + RenderCase::renders_lossily( + "file:///tmp/non-utf8-%A0", + PathConvention::Posix, + "/tmp/non-utf8-�", + ), + RenderCase::renders_lossily("file:///tmp/a%2Fb", PathConvention::Posix, "/tmp/a/b"), + RenderCase::renders_lossily("file:///C:/a%2Fb", PathConvention::Windows, "C:\\a/b"), + RenderCase::renders_lossily("file:///C:/a%5Cb", PathConvention::Windows, "C:\\a\\b"), + // URI shapes that do not match the requested convention. + RenderCase::rejects( + "file://server/share/file.txt", + PathConvention::Posix, + ExpectedError::IncompatibleConvention, + ), + RenderCase::rejects( + "file://server/share/file.rs", + PathConvention::Posix, + ExpectedError::IncompatibleConvention, + ), + RenderCase::rejects( + "file:///usr/local/file.txt", + PathConvention::Windows, + ExpectedError::IncompatibleConvention, + ), + RenderCase::rejects( + "file:///home/alice/file.rs", + PathConvention::Windows, + ExpectedError::IncompatibleConvention, + ), + RenderCase::rejects( + "file://server/", + PathConvention::Windows, + ExpectedError::IncompatibleConvention, + ), + RenderCase::rejects( + "file:///_:/path", + PathConvention::Windows, + ExpectedError::IncompatibleConvention, + ), + // Invalid opaque fallback payloads. + RenderCase::rejects( + "file:///%00/bad/path/YQ", + PathConvention::Posix, + ExpectedError::OpaqueFallback, + ), + RenderCase::rejects( + "file:///%00/bad/path/L3RtcC9udWxsLQAt_y1ieXRl", + PathConvention::Windows, + ExpectedError::OpaqueFallback, + ), +]; + +#[test] +fn renders_native_paths_from_shared_cases() { + for case in RENDER_CASES { + let path = PathUri::parse(case.uri).expect("valid file URI"); + let expected = match case.expected { + RenderExpectation::RoundTrip(rendered) => Ok(ApiPathString(rendered.to_string())), + RenderExpectation::RenderOnly(rendered) => Ok(ApiPathString(rendered.to_string())), + RenderExpectation::Error(ExpectedError::OpaqueFallback) => { + Err(ApiPathStringError::OpaqueFallback { + path: path.to_string(), + }) + } + RenderExpectation::Error(ExpectedError::IncompatibleConvention) => { + Err(ApiPathStringError::IncompatibleConvention { + path: path.to_string(), + convention: case.convention, + }) + } + }; + let actual = ApiPathString::from_path_uri(&path, case.convention); + + assert_eq!(actual, expected, "rendering {case:?}"); + if let Ok(rendered) = &actual { + assert_eq!( + rendered.infer_absolute_path_convention(), + Some(case.convention), + "inferring {case:?}" + ); + } + + if let RenderExpectation::RoundTrip(rendered) = case.expected { + let api_path = serde_json::from_value::(serde_json::json!(rendered)) + .expect("native path should deserialize from API text"); + let reparsed = api_path + .to_path_uri(case.convention) + .expect("native path should parse using its convention"); + assert_eq!(reparsed, path, "parsing {case:?}"); + assert_eq!( + ApiPathString::from_path_uri(&reparsed, case.convention), + Ok(api_path), + "round-tripping {case:?}" + ); + } + } +} + +#[test] +fn relative_api_path_serializes_and_deserializes_unchanged() { + for raw_path in [".", "subdir", "subdir/file.rs"] { + let path = serde_json::from_value::(serde_json::json!(raw_path)) + .expect("relative API path should deserialize"); + + assert_eq!( + serde_json::to_value(path).expect("relative API path should serialize"), + serde_json::json!(raw_path) + ); + } +} + +#[test] +fn relative_api_path_is_invalid_when_converted_to_a_path_uri() { + let raw_path = "subdir"; + let path = serde_json::from_value::(serde_json::json!(raw_path)) + .expect("relative API path should deserialize"); + + assert_eq!(path.infer_absolute_path_convention(), None); + assert_eq!( + path.to_path_uri(PathConvention::Posix), + Err(ApiPathStringError::InvalidNativePath { + path: raw_path.to_string(), + convention: PathConvention::Posix, + }) + ); +} + +#[test] +fn other_non_absolute_api_paths_cannot_be_converted_to_path_uris() { + for (raw_path, convention) in [ + (r"workspace\file.rs", PathConvention::Windows), + (r"C:file.rs", PathConvention::Windows), + ] { + let path = serde_json::from_value::(serde_json::json!(raw_path)) + .expect("API path should deserialize without validation"); + + assert_eq!(path.infer_absolute_path_convention(), None); + assert_eq!( + path.to_path_uri(convention), + Err(ApiPathStringError::InvalidNativePath { + path: raw_path.to_string(), + convention, + }) + ); + } +} + +#[test] +fn infers_absolute_path_conventions_from_api_text() { + for (raw_path, expected) in [ + (r"C:\workspace\file.rs", Some(PathConvention::Windows)), + ("c:/workspace/file.rs", Some(PathConvention::Windows)), + (r"\\server\share\file.rs", Some(PathConvention::Windows)), + (r"\\?\C:\workspace\file.rs", Some(PathConvention::Windows)), + (r"\\.\COM1", Some(PathConvention::Windows)), + ("/workspace/file.rs", Some(PathConvention::Posix)), + ("/C:/workspace/file.rs", Some(PathConvention::Posix)), + ("//server/share/file.rs", Some(PathConvention::Posix)), + ("", None), + (".", None), + ("subdir/file.rs", None), + (r"subdir\file.rs", None), + (r"C:file.rs", None), + (r"\rooted-without-drive", None), + ] { + let path = serde_json::from_value::(serde_json::json!(raw_path)) + .expect("API path should deserialize without validation"); + + assert_eq!( + path.infer_absolute_path_convention(), + expected, + "inferring {raw_path:?}" + ); + } +} + +#[test] +fn foreign_absolute_syntax_deserializes_without_host_interpretation() { + for (raw_path, convention) in [ + (r"C:\workspace\file.rs", PathConvention::Windows), + ("/workspace/file.rs", PathConvention::Posix), + ] { + let path = serde_json::from_value::(serde_json::json!(raw_path)) + .expect("foreign API path should deserialize"); + + assert_eq!(path.as_str(), raw_path); + assert_eq!(path.infer_absolute_path_convention(), Some(convention)); + } +} + +#[test] +fn renders_an_absolute_path_using_the_host_convention() { + #[cfg(unix)] + let native_path = "/workspace/a file.rs"; + #[cfg(windows)] + let native_path = r"C:\workspace\a file.rs"; + let path = AbsolutePathBuf::from_absolute_path_checked(native_path) + .expect("native path should be absolute"); + + assert_eq!( + ApiPathString::from_abs_path(&path), + ApiPathString(native_path.to_string()) + ); +} + +#[cfg(windows)] +#[test] +fn renders_native_non_unicode_windows_fallback_lossily() { + use std::os::windows::ffi::OsStringExt; + + let native_path = std::path::PathBuf::from(std::ffi::OsString::from_wide( + &r"C:\bad\" + .encode_utf16() + .chain([0xd800]) + .collect::>(), + )); + let native_path = + AbsolutePathBuf::from_absolute_path_checked(native_path).expect("absolute native path"); + + assert_eq!( + ApiPathString::from_abs_path(&native_path), + ApiPathString(r"C:\bad\�".to_string()) + ); + + let path = PathUri::from_abs_path(&native_path); + + assert_eq!( + ApiPathString::from_path_uri(&path, PathConvention::Windows), + Ok(ApiPathString(r"C:\bad\�".to_string())) + ); + assert_eq!( + ApiPathString::from_path_uri(&path, PathConvention::Posix), + Err(ApiPathStringError::OpaqueFallback { + path: path.to_string(), + }) + ); +} + +#[test] +fn serializes_and_deserializes_as_a_string() { + let path = PathUri::parse("file:///workspace/src/lib.rs").expect("valid file URI"); + let rendered = ApiPathString::from_path_uri(&path, PathConvention::Posix) + .expect("POSIX URI should render"); + + let json = serde_json::to_string(&rendered).expect("rendered path should serialize"); + assert_eq!(json, r#""/workspace/src/lib.rs""#); + assert_eq!( + serde_json::from_str::(&json) + .expect("rendered path should deserialize from a string"), + rendered + ); +} diff --git a/codex-rs/utils/path-uri/src/lib.rs b/codex-rs/utils/path-uri/src/lib.rs index 8c36a9d4f..e9a00543f 100644 --- a/codex-rs/utils/path-uri/src/lib.rs +++ b/codex-rs/utils/path-uri/src/lib.rs @@ -17,6 +17,12 @@ use thiserror::Error; use ts_rs::TS; use url::Url; +mod api_path_string; + +pub use api_path_string::ApiPathString; +pub use api_path_string::ApiPathStringError; +pub use api_path_string::PathConvention; + pub const FILE_SCHEME: &str = "file"; const BAD_PATH_URI_PREFIX: &str = "file:///%00/bad/path/"; @@ -29,15 +35,14 @@ const BAD_PATH_URI_PREFIX: &str = "file:///%00/bad/path/"; /// created by [`Self::from_abs_path`] are opaque to these lexical operations. /// /// `file:` paths retain their URI spelling so they can be parsed independently -/// of the current host. In particular, `/C:/src` remains ambiguous between a -/// Windows drive path and a valid POSIX path until [`Self::to_abs_path`] -/// applies the current host's rules. A local POSIX `file:` URI can also retain +/// of the current host. A local POSIX `file:` URI can also retain /// percent-encoded non-UTF-8 bytes for lossless native round trips. /// /// Like [VS Code resources], path operations use `/` URI separators on every -/// host. They preserve a URL authority but do not infer Windows drive or UNC -/// roots from path text. Native path normalization, filesystem aliases, -/// symlinks, case sensitivity, and Unicode normalization are not resolved. +/// host. Lexical path operations preserve a URL authority without interpreting +/// Windows drive or UNC roots from path text. Native path normalization, +/// filesystem aliases, symlinks, case sensitivity, and Unicode normalization +/// are not resolved. /// /// Serde represents a `PathUri` as its canonical URI string. Deserialization /// also accepts an absolute native path for compatibility with fields that @@ -76,22 +81,24 @@ impl PathUri { } #[cfg(unix)] - let encoded_path = { + let path_bytes = { use std::os::unix::ffi::OsStrExt; - base64::engine::general_purpose::URL_SAFE_NO_PAD - .encode(path.as_path().as_os_str().as_bytes()) + path.as_path().as_os_str().as_bytes().to_vec() }; #[cfg(windows)] - let encoded_path = { + let path_bytes = { use std::os::windows::ffi::OsStrExt; - let path_bytes = path - .as_path() + path.as_path() .as_os_str() .encode_wide() .flat_map(u16::to_le_bytes) - .collect::>(); - base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(path_bytes) + .collect::>() }; + Self::from_opaque_path_bytes(&path_bytes) + } + + fn from_opaque_path_bytes(path_bytes: &[u8]) -> Self { + let encoded_path = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(path_bytes); let Ok(uri) = Self::parse(&format!("{BAD_PATH_URI_PREFIX}{encoded_path}")) else { unreachable!("URL-safe base64 always produces a valid fallback path URI"); }; @@ -117,6 +124,46 @@ impl PathUri { self.0.path() } + fn opaque_fallback_bytes(&self) -> Option> { + decode_bad_path_uri(&self.0) + } + + /// Infers the native path convention represented by this URI. + /// + /// A URI authority is treated as a Windows UNC host, and a leading + /// drive-letter segment such as `C:` is treated as a Windows drive. All + /// other ordinary file URIs are treated as POSIX paths. This deliberately + /// classifies `file:///C:/src` as Windows even though `/C:/src` is also a + /// valid POSIX path. In practice, POSIX paths with a drive-shaped first + /// component are rare enough that recognizing foreign Windows paths is the + /// more useful default. + /// + /// Opaque fallback URIs are inspected for an absolute POSIX byte prefix or + /// an absolute Windows UTF-16LE prefix. `None` is returned when their + /// payload does not identify either convention. + /// + /// TODO(anp): Once `PathUri` carries an environment identifier, prefer the + /// environment's declared convention over this spelling-based heuristic. + pub fn infer_path_convention(&self) -> Option { + if let Some(path_bytes) = self.opaque_fallback_bytes() { + return infer_opaque_path_convention(&path_bytes); + } + if self.0.host_str().is_some() { + return Some(PathConvention::Windows); + } + + let has_windows_drive = self + .0 + .path_segments() + .and_then(|mut segments| segments.find(|segment| !segment.is_empty())) + .is_some_and(is_windows_drive_uri_segment); + if has_windows_drive { + Some(PathConvention::Windows) + } else { + Some(PathConvention::Posix) + } + } + /// Returns the decoded final URI path segment, or `None` for the URI root /// or an opaque fallback URI created by [`Self::from_abs_path`]. /// @@ -377,6 +424,29 @@ fn decode_bad_path_uri(url: &Url) -> Option> { .then_some(path_bytes) } +fn is_windows_drive_uri_segment(segment: &str) -> bool { + matches!(segment.as_bytes(), [drive, b':'] if drive.is_ascii_alphabetic()) +} + +fn infer_opaque_path_convention(path_bytes: &[u8]) -> Option { + if path_bytes.starts_with(b"/") { + return Some(PathConvention::Posix); + } + if !path_bytes.len().is_multiple_of(2) { + return None; + } + + let mut path_wide = path_bytes + .chunks_exact(2) + .map(|bytes| u16::from_le_bytes([bytes[0], bytes[1]])); + let first = path_wide.next()?; + let second = path_wide.next()?; + let has_drive = u8::try_from(first).is_ok_and(|drive| drive.is_ascii_alphabetic()) + && second == u16::from(b':'); + let has_unc_prefix = first == u16::from(b'\\') && second == u16::from(b'\\'); + (has_drive || has_unc_prefix).then_some(PathConvention::Windows) +} + /// Rejects URI metadata that has no defined meaning for `file:` URIs. fn validate_common_known_uri(url: &Url) -> Result<(), PathUriParseError> { if !url.username().is_empty() || url.password().is_some() { diff --git a/codex-rs/utils/path-uri/src/tests.rs b/codex-rs/utils/path-uri/src/tests.rs index f4ad44ae9..fa0c5d8b0 100644 --- a/codex-rs/utils/path-uri/src/tests.rs +++ b/codex-rs/utils/path-uri/src/tests.rs @@ -58,6 +58,42 @@ fn file_uri_parses_a_windows_path_on_any_host() { ); } +#[test] +fn infers_path_conventions_from_uri_shape() { + for (uri, expected) in [ + ("file:///", Some(PathConvention::Posix)), + ("file:///home/alice/src", Some(PathConvention::Posix)), + ("file:///C:/Users/Alice/src", Some(PathConvention::Windows)), + ("file:///d:", Some(PathConvention::Windows)), + ("file://server/share/src", Some(PathConvention::Windows)), + // Opaque fallback for POSIX bytes `/tmp/null-\0-\xff-byte`. + ( + "file:///%00/bad/path/L3RtcC9udWxsLQAt_y1ieXRl", + Some(PathConvention::Posix), + ), + // Opaque fallback for Windows UTF-16LE `\\.\COM1\`. + ( + "file:///%00/bad/path/XABcAC4AXABDAE8ATQAxAFwA", + Some(PathConvention::Windows), + ), + ("file:///%00/bad/path/YQ", None), + ] { + let path = PathUri::parse(uri).expect("valid path URI"); + + assert_eq!(path.infer_path_convention(), expected, "inferring {uri}"); + } +} + +#[test] +fn drive_shaped_posix_uri_is_intentionally_inferred_as_windows() { + let path = PathUri::parse("file:///C:/actually/a/posix/path").expect("valid path URI"); + + // `/C:/...` is valid on POSIX, but treating this uncommon spelling as a + // Windows drive lets callers render the overwhelmingly more common foreign + // Windows URI without separately carrying its source convention. + assert_eq!(path.infer_path_convention(), Some(PathConvention::Windows)); +} + #[cfg(windows)] #[test] fn file_uri_falls_back_for_windows_prefixes_without_a_uri_representation() {