From a50fd786520f04e1e4df1db3c8cdaf586f2ca094 Mon Sep 17 00:00:00 2001 From: chuan Date: Fri, 27 Mar 2026 17:08:58 +0800 Subject: [PATCH] feat: add simple files --- Cargo.lock | 287 ++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 + README.md | 3 +- docs/prd.md | 190 ++++++++++++++++++++++++++++++++ src/commands.rs | 54 +++++++++ src/config.rs | 26 +++++ src/main.rs | 37 +++++-- watchdog.toml | 3 + 8 files changed, 594 insertions(+), 9 deletions(-) create mode 100644 docs/prd.md create mode 100644 src/commands.rs create mode 100644 src/config.rs create mode 100644 watchdog.toml diff --git a/Cargo.lock b/Cargo.lock index a1c0c66..39a5473 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,293 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indexmap" +version = "2.13.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_spanned" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"876ac351060d4f882bb1032b6369eb0aef79ad9df1ea8bc404874d8cc3d0cd98" +dependencies = [ + "serde_core", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "toml" +version = "1.1.0+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8195ca05e4eb728f4ba94f3e3291661320af739c4e43779cbdfae82ab239fcc" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime", + "toml_parser", + "toml_writer", + "winnow", +] + +[[package]] +name = "toml_datetime" +version = "1.1.0+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_parser" +version = "1.1.0+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" +dependencies = [ + "winnow", +] + +[[package]] +name = "toml_writer" +version = "1.1.0+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d282ade6016312faf3e41e57ebbba0c073e4056dab1232ab1cb624199648f8ed" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "watchdog" version = "0.1.0" +dependencies = [ + "clap", + "serde", + "toml", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "winnow" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" diff --git a/Cargo.toml b/Cargo.toml index 8578fc2..0057900 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,3 +4,6 @@ version = "0.1.0" edition = "2024" [dependencies] +toml = "*" +clap = { version = "4", features = ["derive"] } +serde = { version = "1", features = ["derive"] } diff --git a/README.md b/README.md index 4ffcea7..648fd12 100644 --- a/README.md +++ b/README.md @@ -17,5 +17,4 @@ sudo apt install -y gcc-aarch64-linux-gnu rustup target add aarch64-unknown-linux-musl sudo apt install -y musl-tools gcc-aarch64-linux-gnu -``` - +``` \ No newline at end of file diff --git a/docs/prd.md b/docs/prd.md new file mode 100644 index 0000000..2078c28 --- /dev/null +++ b/docs/prd.md @@ -0,0 +1,190 @@ +# Watchdog 项目 PRD + +## 1. 项目概述 + +本项目目标是实现一个运行于 Linux 平台的守护型看门狗程序,用于监控一个或多个目标程序的运行状态,并在目标程序异常退出、未启动或持续不稳定时,按照预定义策略执行重启与回退。该程序首先服务于 Linux arm64 部署场景,要求支持在 x64 开发环境中交叉编译到 arm64 设备运行。系统设计时需保留未来扩展到 Windows 的可能,但第一版本不承担 Windows 兼容实现。 + +该项目不负责业务程序本身的逻辑,不承担升级平台、远程编排、图形界面或复杂运维平台职责。其定位是一个本地进程守护与回退执行器,提供稳定、可预测、可测试的最小核心能力。 + +## 2. 项目目标 + +第一版本的核心目标如下: + +1. 支持监控多个目标程序。 +2. 支持通过轮询方式检测程序是否处于有效运行状态。 +3. 支持在目标程序未运行时自动拉起。 +4. 支持对频繁崩溃或持续启动失败的程序执行 fallback。 +5. 
fallback 顺序固定,但每一级 fallback 的目标内容由配置指定。 +6. fallback 为临时运行时切换,不持久化到磁盘。 +7. 支持固定 CLI 子命令执行。 +8. 支持 stdout/stderr 日志输出。 +9. 支持单元测试,保证核心状态机与策略逻辑可验证。 + +## 3. 非目标 + +第一版本明确不做以下内容: + +- 不支持任意 shell 指令编排。 +- 不支持动态插件系统。 +- 不支持配置热更新。 +- 不支持远程管理与网络 API。 +- 不支持安装包管理、升级平台和复杂版本治理。 +- 不支持集成测试框架自动化,仅要求单元测试。 +- 不支持所有极端失败场景的复杂恢复决策;若最终 fallback 目标仍无法启动,则系统持续尝试最后一级 fallback 即可。 + +## 4. 核心业务场景 + +### 4.1 目标程序未启动 +看门狗轮询检测目标程序状态;若检测失败,系统尝试执行该目标当前激活版本的启动命令。 + +### 4.2 目标程序启动后短时间退出 +若程序启动后在达到“稳定运行判定窗口”(当前约定为 5 秒)之前退出,则该次行为视为启动失败,不计入 crash。 + +### 4.3 目标程序稳定运行后退出 +若程序已连续存活至少 5 秒,再发生退出,则视为一次 crash 事件。看门狗根据配置统计检测窗口内 crash 次数,达到阈值后触发 fallback。 + +### 4.4 目标程序持续不稳定 +当一个目标在给定检测窗口内达到阈值,例如默认 1 分钟内 3 次 crash,则切换到固定顺序中的下一级 fallback 目标。 + +### 4.5 fallback 已触发 +一旦某目标切换到 fallback,除非整个 watchdog 进程重启,否则不主动恢复到主目标。系统继续守护当前 fallback 目标;若仍失败,则继续向后切换,直到最终 fallback 目标。最终 fallback 目标若仍无法启动,则持续尝试该最终目标。 + +## 5. 功能需求 + +### 5.1 多目标监控 +系统需支持多个目标程序并发受管。第一版本允许采用单线程轮询模型,即每一轮按顺序遍历所有目标,分别执行检测、重启和 fallback 决策。每个目标需维护独立运行状态,互不影响。 + +### 5.2 启动方式 +所有目标及 fallback 均采用 `exec` 方式启动。即配置明确给出: + +- 可执行文件路径 +- 参数列表 +- 可选工作目录 + +第一版本不通过 shell 解释器启动,不依赖 `sh -c` 或类似方式,以降低转义复杂度、不确定性和命令注入风险。 + +### 5.3 检测机制 +检测机制采用“provider + 配置项”的方式实现。第一版本不构建高度抽象的通用组合 DSL,而是定义若干固定检测方法,每个方法对应明确配置字段。一个目标可配置多个检测方法;所有检测方法均通过时,视为目标当前健康。 + +已确认需要支持或预留的检测维度包括: + +- 进程是否存在 +- 程序文件是否存在 +- PID 文件是否存在 +- 指定文件是否存在 + +其中“程序文件是否存在”检测的是可执行文件是否存在于磁盘上,属于检测条件之一,不等同于“进程是否存在”。 + +### 5.4 稳定运行与 crash 判定 +系统需定义稳定运行窗口。当前约定为 5 秒: + +- 启动后若存活不足 5 秒即退出,视为启动失败。 +- 启动后若连续运行至少 5 秒再退出,视为 crash。 + +看门狗需统计检测窗口中的 crash 事件次数。默认策略为“1 分钟内 3 次 crash 触发 fallback”,该阈值与窗口长度应允许通过配置覆盖。 + +### 5.5 fallback 机制 +fallback 顺序固定,第一版本定义为: + +1. 主目标 +2. 上一个版本 +3. 更新后的出厂版本 +4. 
出厂版本 + +配置文件负责指定每一级实际启动目标,不负责改写顺序。fallback 的本质是运行时切换当前激活目标,不修改持久化配置、不覆盖原始程序文件、不执行安装动作。切换后仅影响当前 watchdog 生命周期内的行为。 + +### 5.6 CLI 子命令 +程序需支持固定 CLI 子命令。第一版本至少应考虑以下命令集合: + +- `run`:启动 watchdog 主循环 +- `check`:执行一次检测 +- `start`:手动启动指定目标 +- `check-env`:检测运行环境 + +是否进一步开放 fallback 相关手工命令可在后续迭代决定,但不作为第一版本必需能力。 + +### 5.7 日志 +第一版本日志输出目标为 stdout/stderr。日志需要覆盖以下关键信息: + +- watchdog 启动与退出 +- 每个目标的检测结果 +- 启动尝试 +- 启动失败 +- 稳定运行判定通过 +- crash 事件记录 +- fallback 切换 +- 最终 fallback 持续尝试 + +后续如需写文件或接入 journald,应通过独立日志层扩展,而不影响核心状态机。 + +### 5.8 测试 +必须提供单元测试,覆盖以下核心逻辑: + +- 启动成功与启动失败判定 +- 稳定运行 5 秒后的 crash 判定 +- 检测窗口内 crash 计数 +- fallback 触发阈值 +- fallback 顺序推进 +- 多目标状态相互隔离 + +## 6. 状态机要求 + +每个目标必须维护独立状态。最少应包含以下运行态信息: + +- 当前激活层级(主目标或某一级 fallback) +- 当前进程状态 +- 最近一次启动时间 +- 是否已达到稳定运行门槛 +- 检测窗口内 crash 记录 +- 当前是否处于 fallback 模式 + +watchdog 主循环根据检测结果和状态决定下一步动作: + +1. 检测通过:保持当前状态。 +2. 检测失败且当前无进程:尝试启动当前激活目标。 +3. 启动后未满 5 秒退出:记为启动失败。 +4. 启动后满 5 秒退出:记为一次 crash。 +5. crash 在窗口内达到阈值:切换到下一级 fallback。 +6. 已在最终 fallback:持续尝试最终 fallback,不再设计额外终止分支。 + +## 7. 架构原则 + +第一版本应坚持以下架构原则: + +- Linux 优先,接口设计与平台实现分离,为未来 Windows 预留空间。 +- 业务状态机与系统调用分离。 +- 检测器、启动器、fallback 决策解耦。 +- 多目标共享框架、独立状态。 +- 配置驱动目标定义,避免业务逻辑硬编码。 +- 第一版本优先简单可维护,不提前引入过度抽象。 + +## 8. 实施计划 + +### 阶段一:基础框架 +完成项目骨架、CLI 入口、日志初始化、配置加载与最小多目标数据模型。 + +### 阶段二:核心运行闭环 +实现目标轮询、exec 启动、基础检测器、稳定窗口判定、crash 记录。 + +### 阶段三:fallback 机制 +实现固定顺序 fallback、目标切换、最终 fallback 持续尝试。 + +### 阶段四:测试与部署 +补充关键单元测试,完善 x64 到 arm64 musl 交叉编译与部署脚本。 + +## 9. 成功标准 + +第一版本交付时,应满足以下标准: + +1. 能在 Linux arm64 目标机上稳定运行。 +2. 能同时监控多个目标程序。 +3. 能对未启动程序自动拉起。 +4. 能识别“启动失败”和“稳定运行后 crash”。 +5. 能在检测窗口内依据 crash 次数触发 fallback。 +6. 能按照固定顺序切换 fallback 目标。 +7. 能输出足够定位问题的日志。 +8. 核心状态机具备单元测试覆盖。 + +## 10. 
结论
+
+该项目第一版本定位为一个本地、轻量、配置驱动的多目标 watchdog。其核心价值在于以明确、稳定、低复杂度的方式实现进程守护、稳定性判定与临时 fallback 切换。设计上应优先保证 Linux 场景落地与 arm64 部署稳定性,同时通过合理模块边界为后续平台扩展、检测器扩展与日志扩展保留空间。
\ No newline at end of file
diff --git a/src/commands.rs b/src/commands.rs
new file mode 100644
index 0000000..b80ae5c
--- /dev/null
+++ b/src/commands.rs
@@ -0,0 +1,77 @@
+use std::fs;
+use std::path::Path;
+
+use crate::config::Config;
+
+/// Loads and validates `watchdog.toml` from the current working directory.
+///
+/// # Errors
+///
+/// Returns an error if the file is missing, unreadable, empty or
+/// whitespace-only, not deserializable into [`Config`], or rejected by
+/// [`Config::validate`].
+fn load_file() -> Result<Config, Box<dyn std::error::Error>> {
+    let config_path = Path::new("watchdog.toml");
+
+    // Checked up front so a missing file gets a dedicated message rather
+    // than a bare I/O error from the read below.
+    if !config_path.exists() {
+        return Err(format!("missing config file: {}", config_path.display()).into());
+    }
+
+    let content = fs::read_to_string(config_path)?;
+
+    if content.trim().is_empty() {
+        return Err("watchdog.toml exists but is empty".into());
+    }
+
+    let config = toml::from_str::<Config>(&content)?;
+
+    config.validate()?;
+
+    Ok(config)
+}
+
+/// Handles the `run` subcommand; currently a stub that only prints a notice.
+pub fn run() {
+    println!("run: not implemented yet");
+}
+
+/// Handles the `check` subcommand: loads the config and prints the check
+/// command that would be run, without executing it.
+///
+/// # Errors
+///
+/// Propagates any error from loading or validating the config file.
+pub fn check() -> Result<(), Box<dyn std::error::Error>> {
+    let config = load_file()?;
+
+    println!("checking service: {}", config.service_name);
+    println!("would run: {}", config.check_command);
+
+    Ok(())
+}
+
+/// Handles the `start` subcommand; currently a stub that only prints a notice.
+pub fn start() {
+    println!("start: not implemented yet");
+}
+
+/// Handles the `check-env` subcommand: confirms the config file loads and
+/// prints the parsed values.
+///
+/// # Errors
+///
+/// Propagates any error from loading or validating the config file.
+pub fn check_env() -> Result<(), Box<dyn std::error::Error>> {
+    let config = load_file()?;
+
+    println!("environment ok: config parsed successfully");
+    println!("config = {:?}", config);
+
+    println!("interval = {}", config.interval);
+    println!("service_name = {}", config.service_name);
+    println!("check_command = {}", config.check_command);
+
+    Ok(())
+}
diff --git a/src/config.rs b/src/config.rs
new file mode 100644
index 0000000..fa23d99
--- /dev/null
+++ b/src/config.rs
@@ -0,0 +1,38 @@
+use serde::Deserialize;
+
+/// Watchdog settings deserialized from `watchdog.toml`.
+#[derive(Deserialize, Debug)]
+pub struct Config {
+    /// Interval value; must be greater than zero. The unit is not defined
+    /// in this file (presumably seconds; TODO confirm).
+    pub interval: u64,
+    /// Name of the monitored service; must not be blank.
+    pub service_name: String,
+    /// Command associated with the check; must not be blank.
+    pub check_command: String,
+}
+
+impl Config {
+    /// Checks that every field holds a usable value.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error naming the first offending field when `interval`
+    /// is zero, or when `service_name` or `check_command` is empty or
+    /// whitespace-only.
+    pub fn validate(&self) -> Result<(), Box<dyn std::error::Error>> {
+        if self.interval == 0 {
+            return Err("interval must be greater than 0".into());
+        }
+
+        if self.service_name.trim().is_empty() {
+            return Err("service_name must not be empty".into());
+        }
+
+        if self.check_command.trim().is_empty() {
+            return Err("check_command must not be empty".into());
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index a1b26ae..0303cab 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,10 +1,48 @@
-fn main() {
-    println!("Hello, world!");
+use clap::{Parser, Subcommand};
 
-    let a = 1;
-    let b = 2;
+mod commands;
+mod config;
 
-    let c = a + b;
-
-    println!("The sum of {} and {} is {}", a, b, c);
+// Top-level command-line interface. Kept as a plain comment (not `///`) so
+// the explicit `about` attribute remains the only source of the help text.
+#[derive(Parser, Debug)]
+#[command(name = "watchdog")]
+#[command(about = "A lightweight watchdog service", long_about = None)]
+struct Cli {
+    #[command(subcommand)]
+    command: Command,
+}
+
+// Subcommands of the CLI. The `///` comment on each variant doubles as the
+// description clap prints for that subcommand in `--help`.
+#[derive(Subcommand, Debug)]
+enum Command {
+    /// Run the watchdog (not implemented yet)
+    Run,
+    /// Load the config and print the check that would be performed
+    Check,
+    /// Manually start a target (not implemented yet)
+    Start,
+    /// Verify that the config file loads and print its values
+    CheckEnv,
+}
+
+/// Parses the command line and dispatches to the matching handler in
+/// `commands`.
+///
+/// # Errors
+///
+/// Returns the error of a failing `check` or `check-env` handler; returning
+/// it from `main` makes the process exit with a failure status.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let cli = Cli::parse();
+
+    match cli.command {
+        Command::Run => commands::run(),
+        Command::Check => commands::check()?,
+        Command::Start => commands::start(),
+        Command::CheckEnv => commands::check_env()?,
+    }
+
+    Ok(())
 }
diff --git a/watchdog.toml b/watchdog.toml
new file mode 100644
index 0000000..0dbb3d2
--- /dev/null
+++ b/watchdog.toml
@@ -0,0 +1,3 @@
+interval = 60
+service_name = "demo"
+check_command = "echo hello"
\ No newline at end of file