def parse_columns_from_create_sql(create_sql: str) -> list[tuple[str, str]]:
    """Best-effort extraction of ``(name, type)`` pairs from a CREATE statement.

    Used as a fallback when ``PRAGMA table_info`` cannot be executed, e.g. for
    FTS virtual tables whose custom tokenizer extension is not loaded.

    Args:
        create_sql: The ``sql`` text stored in ``sqlite_master`` for a table.

    Returns:
        Column ``(name, declared_type)`` tuples in declaration order. The type
        is upper-cased; a column without a declared type (including FTS
        columns, which may only carry ``UNINDEXED``) is reported as ``"TEXT"``.
        An empty list is returned when nothing can be parsed.
    """
    import re

    if not isinstance(create_sql, str) or not create_sql:
        return []

    start = create_sql.find("(")
    end = create_sql.rfind(")")
    if start == -1 or end <= start:
        return []

    # Split the definition list on top-level commas only: commas nested in
    # parentheses (DECIMAL(10, 2), CHECK(...)) or inside quoted text
    # (prefix='2,3', DEFAULT ',') must not start a new definition.
    parts: list[str] = []
    buf: list[str] = []
    depth = 0
    quote = ""
    for ch in create_sql[start + 1:end]:
        if quote:
            if ch == quote:
                quote = ""
        elif ch in "'\"`":
            quote = ch
        elif ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif ch == "," and depth == 0:
            parts.append("".join(buf).strip())
            buf = []
            continue
        buf.append(ch)
    parts.append("".join(buf).strip())

    # Table-level constraints. Matching the full keyword form (rather than a
    # bare prefix) keeps columns such as "checksum" or "unique_id".
    table_constraint = re.compile(
        r"constraint\s|primary\s+key\b|foreign\s+key\b|unique\s*\(|check\s*\(",
        re.IGNORECASE,
    )
    # FTS3/4/5 options always look like ``key = value`` (tokenize, prefix,
    # content, content_rowid, detail, columnsize, compress, ...).
    fts_option = re.compile(r"\w+\s*=")
    # Column name: "quoted", `quoted`, [quoted] or a bare word.
    name_pattern = re.compile(r'"((?:[^"]|"")*)"|`([^`]*)`|\[([^\]]*)\]|(\S+)')
    # Words that end the declared type and start a column constraint/option.
    non_type_keywords = {
        "CONSTRAINT", "PRIMARY", "NOT", "NULL", "UNIQUE", "CHECK", "DEFAULT",
        "COLLATE", "REFERENCES", "GENERATED", "AS", "UNINDEXED",
    }

    columns: list[tuple[str, str]] = []
    for part in parts:
        if not part or table_constraint.match(part) or fts_option.match(part):
            continue

        m = name_pattern.match(part)
        if m is None:
            continue
        double_quoted, backticked, bracketed, bare = m.groups()
        if double_quoted is not None:
            name = double_quoted.replace('""', '"')
        elif backticked is not None:
            name = backticked
        elif bracketed is not None:
            name = bracketed
        else:
            name = bare.strip("`\"[]")
        if not name:
            continue

        # The declared type is every word up to the first constraint keyword.
        type_words: list[str] = []
        for word in part[m.end():].split():
            keyword = word.split("(", 1)[0].upper()
            if keyword in non_type_keywords or "=" in word:
                break
            type_words.append(word.upper())

        columns.append((name, " ".join(type_words) or "TEXT"))
    return columns
pass + + # 兜底:从 sqlite_master.sql 解析 + try: + cursor.execute( + "SELECT sql FROM sqlite_master WHERE type='table' AND name=?", + (table_name,), + ) + row = cursor.fetchone() + create_sql = row[0] if row and len(row) > 0 else "" + return parse_columns_from_create_sql(create_sql or "") + except Exception: + return [] # 获取所有表名 cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") @@ -152,13 +229,10 @@ class ConfigTemplateGenerator: table_key = f"{prefix}_*" # 使用模式名 # 获取代表表的字段信息 - cursor.execute(f"PRAGMA table_info({representative_table})") - columns = cursor.fetchall() + columns = get_table_columns(representative_table) fields = {} - for col in columns: - field_name = col[1] - field_type = col[2] + for field_name, field_type in columns: fields[field_name] = { "type": field_type, "meaning": "", # 留空供用户填写 @@ -188,13 +262,10 @@ class ConfigTemplateGenerator: try: # 获取表字段信息 - cursor.execute(f"PRAGMA table_info({table_name})") - columns = cursor.fetchall() + columns = get_table_columns(table_name) fields = {} - for col in columns: - field_name = col[1] - field_type = col[2] + for field_name, field_type in columns: fields[field_name] = { "type": field_type, "meaning": "", # 留空供用户填写 @@ -219,16 +290,23 @@ class ConfigTemplateGenerator: finally: conn.close() - def generate_template(self, output_file: str = "wechat_db_config_template.json"): + def generate_template( + self, + output_file: str = "wechat_db_config_template.json", + *, + include_excluded: bool = False, + include_message_shards: bool = False, + exclude_db_stems: set[str] | None = None, + ): """生成配置模板""" print("开始生成微信数据库配置模板...") # 定义要排除的数据库模式和描述 - excluded_patterns = { - r'biz_message_\d+\.db$': '企业微信聊天记录数据库', - r'bizchat\.db$': '企业微信联系人数据库', - r'contact_fts\.db$': '搜索联系人数据库', - r'favorite_fts\.db$': '搜索收藏数据库' + excluded_patterns = {} if include_excluded else { + r'biz_message_\d+\.db$': '公众号/企业微信聊天记录数据库(通常不参与个人聊天分析)', + r'bizchat\.db$': '企业微信联系人/会话数据库(通常不参与个人聊天分析)', + r'contact_fts\.db$': 
'联系人搜索索引数据库(FTS)', + r'favorite_fts\.db$': '收藏搜索索引数据库(FTS)' } # 查找所有数据库文件 @@ -263,29 +341,38 @@ class ConfigTemplateGenerator: for excluded_file, description in excluded_files: print(f" - {excluded_file.name} ({description})") + # 显式排除指定 stem(不含 .db) + if exclude_db_stems: + before = len(db_files) + db_files = [p for p in db_files if p.stem not in exclude_db_stems] + after = len(db_files) + if before != after: + print(f"\n按 --exclude-db-stem 排除 {before - after} 个数据库: {sorted(exclude_db_stems)}") + print(f"\n实际处理 {len(db_files)} 个数据库文件") # 过滤message数据库,只保留倒数第二个(与主脚本逻辑一致) - message_numbered_dbs = [] - message_other_dbs = [] - - for db in db_files: - if re.match(r'message_\d+$', db.stem): # message_{数字}.db - message_numbered_dbs.append(db) - elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等 - message_other_dbs.append(db) - - if len(message_numbered_dbs) > 1: - # 按数字编号排序(提取数字进行排序) - message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1))) - # 选择倒数第二个(按编号排序) - selected_message_db = message_numbered_dbs[-2] # 倒数第二个 - print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库") - print(f"选择倒数第二个: {selected_message_db.name}") - - # 从db_files中移除其他message_{数字}.db数据库,但保留message_fts.db等 - db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)] - db_files.append(selected_message_db) + if not include_message_shards: + message_numbered_dbs = [] + message_other_dbs = [] + + for db in db_files: + if re.match(r'message_\d+$', db.stem): # message_{数字}.db + message_numbered_dbs.append(db) + elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等 + message_other_dbs.append(db) + + if len(message_numbered_dbs) > 1: + # 按数字编号排序(提取数字进行排序) + message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1))) + # 选择倒数第二个(按编号排序) + selected_message_db = message_numbered_dbs[-2] # 倒数第二个 + print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库") + 
def main():
    """Command-line entry point: parse the options and generate the template."""
    cli = argparse.ArgumentParser(description="微信数据库字段配置模板生成器")

    # Options that take a value.
    cli.add_argument(
        "--databases-path",
        default="output/databases",
        help="解密后的数据库根目录(按账号分目录)",
    )
    cli.add_argument(
        "--output",
        default="wechat_db_config_template.json",
        help="输出 JSON 模板路径",
    )

    # Boolean switches.
    cli.add_argument(
        "--include-excluded",
        action="store_true",
        help="包含默认会被排除的数据库(如 bizchat/contact_fts/favorite_fts 等)",
    )
    cli.add_argument(
        "--include-message-shards",
        action="store_true",
        help="包含所有 message_{n}.db(否则仅保留倒数第二个作代表)",
    )

    # Repeatable option; every occurrence adds one stem to the list.
    cli.add_argument(
        "--exclude-db-stem",
        action="append",
        default=[],
        help="按 stem(不含 .db)排除数据库,可重复,例如: --exclude-db-stem digital_twin",
    )
    options = cli.parse_args()

    print("微信数据库配置模板生成器")
    print("=" * 50)

    template_generator = ConfigTemplateGenerator(databases_path=options.databases_path)
    stems_to_skip = set(options.exclude_db_stem or [])
    template_generator.generate_template(
        output_file=options.output,
        include_excluded=bool(options.include_excluded),
        include_message_shards=bool(options.include_message_shards),
        exclude_db_stems=stems_to_skip,
    )
会将“同结构但表名仅数字不同”的重复表自动折叠为一个表组(常见于 FTS 分片/内部表)。 + +用法示例: + python tools/export_database_schema_markdown.py \ + --config wechat_db_config.json \ + --output docs/wechat_database_schema.md +""" + +from __future__ import annotations + +import argparse +import json +import re +from datetime import datetime +from pathlib import Path +from typing import Any + +ROOT = Path(__file__).resolve().parents[1] + + +_HASH_TABLE_RE = re.compile(r"^([A-Za-z0-9]+)_([0-9a-fA-F]{16,})$") + + +def _md_escape_cell(v: Any) -> str: + """Escape Markdown table cell content.""" + if v is None: + return "-" + s = str(v) + # Keep it one-line for tables. + s = s.replace("\r", " ").replace("\n", " ").strip() + # Escape pipe + s = s.replace("|", r"\|") + return s if s else "-" + + +def _mask_hash_table_name(name: str) -> str: + """ + Mask user-specific hash suffix table names: + Msg_00140f... -> Msg_ + """ + m = _HASH_TABLE_RE.match(name) + if not m: + return name + return f"{m.group(1)}_" + + +def _db_sort_key(db_name: str) -> tuple[int, int, str]: + """ + Roughly sort DBs by importance for readers. 
+ """ + # Core + if db_name == "contact": + return (10, 0, db_name) + if db_name == "session": + return (20, 0, db_name) + m = re.match(r"^message_(\d+)$", db_name) + if m: + return (30, int(m.group(1)), db_name) + if re.match(r"^biz_message_(\d+)$", db_name): + n = int(re.match(r"^biz_message_(\d+)$", db_name).group(1)) # type: ignore[union-attr] + return (31, n, db_name) + if db_name == "message_resource": + return (40, 0, db_name) + if db_name == "media_0": + return (41, 0, db_name) + if db_name == "hardlink": + return (42, 0, db_name) + if db_name == "head_image": + return (43, 0, db_name) + + # Social / content + if db_name == "sns": + return (50, 0, db_name) + if db_name == "favorite": + return (60, 0, db_name) + if db_name == "emoticon": + return (70, 0, db_name) + + # System / misc + if db_name in {"general", "unspportmsg"}: + return (80, 0, db_name) + + # Search / index + if db_name in {"chat_search_index", "message_fts"} or db_name.endswith("_fts"): + return (90, 0, db_name) + + # Others + return (100, 0, db_name) + + +def _render_message_type_map(message_types: dict[str, Any]) -> str: + # In Windows WeChat v4, `local_type` is commonly a 64-bit integer: + # raw = (sub_type << 32) | type + # Some configs may still store explicit (type, sub_type) pairs; handle both. + items: list[tuple[int, int, int, str]] = [] + for k, v in message_types.items(): + if k in {"_instructions", "examples"}: + continue + if not isinstance(k, str) or "," not in k: + continue + a, b = k.split(",", 1) + try: + a_i = int(a) + b_i = int(b) + except Exception: + continue + desc = str(v) + + if b_i != 0: + msg_type = a_i + msg_sub = b_i + raw = (msg_sub << 32) | (msg_type & 0xFFFFFFFF) + else: + raw = a_i + msg_type = raw & 0xFFFFFFFF + msg_sub = (raw >> 32) & 0xFFFFFFFF + + items.append((raw, msg_type, msg_sub, desc)) + + if not items: + return "" + + # Sort by decoded (type, sub_type), then raw value. 
+ items.sort(key=lambda x: (x[1], x[2], x[0])) + + out = "## 消息类型(local_type)速查\n\n" + out += "说明:Windows 微信 v4 的 `local_type` 常见为 64 位整型:`raw = (sub_type<<32) | type`。\n\n" + out += "| local_type(raw) | type(low32) | sub_type(high32) | 含义 |\n|---:|---:|---:|---|\n" + for raw, t, st, desc in items: + out += f"| {raw} | {t} | {st} | {_md_escape_cell(desc)} |\n" + return out + "\n" + + +def _table_schema_signature(table: dict[str, Any]) -> tuple[str, str, tuple[tuple[str, str, str, str], ...]]: + """ + Build a stable signature for a table schema in config. + + Used to fold tables which are structurally identical but only differ in name + (e.g. message_fts_v4_aux_0..3). + """ + t_type = str(table.get("type", "table")) + desc = str(table.get("description", "")) + fields = table.get("fields") or {} + + items: list[tuple[str, str, str, str]] = [] + if isinstance(fields, dict): + for field_name, fm in fields.items(): + if not isinstance(fm, dict): + fm = {} + items.append( + ( + str(field_name), + str(fm.get("type", "")), + str(fm.get("meaning", "")), + str(fm.get("notes", "")), + ) + ) + items.sort(key=lambda x: x[0]) + return (t_type, desc, tuple(items)) + + +def _name_family_key(name: str) -> str: + """Normalize a table name into a family key by replacing digit runs with {n}.""" + return re.sub(r"\d+", "{n}", name) + + +def _make_group_pattern(table_names: list[str]) -> str: + """ + Make a readable pattern for a group of similar table names: + + - Only varying numeric segments become `{n}` + - Constant numeric segments are kept as-is + + Example: + message_fts_v4_0/message_fts_v4_1 -> message_fts_v4_{n} + ImgFts0V0/ImgFts1V0 -> ImgFts{n}V0 + """ + if not table_names: + return "" + + tokenized = [re.split(r"(\d+)", n) for n in table_names] + base = tokenized[0] + + # Ensure token structures match; otherwise fall back to a simple normalization. 
+ for t in tokenized[1:]: + if len(t) != len(base): + return _name_family_key(table_names[0]) + for i in range(0, len(base), 2): + if t[i] != base[i]: + return _name_family_key(table_names[0]) + + out_parts: list[str] = [] + for i, part in enumerate(base): + if i % 2 == 0: + out_parts.append(part) + continue + nums = {t[i] for t in tokenized if i < len(t)} + out_parts.append(part if len(nums) == 1 else "{n}") + return "".join(out_parts) + + +def _fold_same_schema_tables_for_display( + tables: dict[str, Any], +) -> list[tuple[str, dict[str, Any]]]: + """ + Fold duplicated tables that share the same schema/signature but only differ in name. + + This is common in FTS shards, e.g.: + message_fts_v4_aux_0..3 + message_fts_v4_0..3 and their internal *_content/*_data/*_idx tables + ImgFts0V0..3 and their internal tables + + Returns a list of (display_name, table_dict) items sorted by the original table name order. + """ + if not tables: + return [] + + # (family_key, schema_sig) -> [table_name, ...] + groups: dict[tuple[str, tuple[str, str, tuple[tuple[str, str, str, str], ...]]], list[str]] = {} + for table_name, table in tables.items(): + if not isinstance(table, dict): + continue + if str(table.get("type", "table")) == "similar_group": + continue + family = _name_family_key(str(table_name)) + sig = _table_schema_signature(table) + groups.setdefault((family, sig), []).append(str(table_name)) + + consumed: set[str] = set() + items: list[tuple[str, str, dict[str, Any]]] = [] # (sort_key, display_name, table) + used_display_names: set[str] = set() + + # Create auto "similar_group" entries for groups > 1. 
def export_markdown(config_path: Path, output_path: Path) -> None:
    """Render the database field configuration as a single Markdown document.

    Reads the JSON config at ``config_path`` and writes a document with a
    header, a database overview table, one section per database (tables and
    their fields) and, when present in the config, quick-reference appendices
    for ``message_types`` and ``friend_types``.

    Only the highest-numbered ``message_{n}`` database gets a detailed
    section; the other shards are listed by name in that section.

    Args:
        config_path: Path of the JSON config file to read (UTF-8).
        output_path: Path of the Markdown file to write (UTF-8). Missing
            parent directories are created.
    """
    cfg = json.loads(config_path.read_text(encoding="utf-8"))
    meta = cfg.get("_metadata") or {}
    databases: dict[str, Any] = cfg.get("databases") or {}

    # message_{n}.db are typically shards with identical schema. Keep only the
    # last shard for detailed sections.
    message_shards: list[tuple[int, str]] = []
    for name in databases:
        m = re.match(r"^message_(\d+)$", str(name))
        if m:
            message_shards.append((int(m.group(1)), str(name)))
    message_shards.sort(key=lambda x: x[0])
    rep_message_db: str | None = message_shards[-1][1] if message_shards else None
    all_message_db_names = [n for _, n in message_shards]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    gen_time = meta.get("generated_time") or now

    # Document header.
    lines: list[str] = []
    lines.append("# Windows 微信数据库结构文档(自动生成)")
    lines.append("")
    lines.append(f"> 生成时间:{_md_escape_cell(gen_time)}")
    lines.append(f"> 本次导出:{now}")
    lines.append(f"> 配置来源:`{config_path.as_posix()}`(由 `tools/generate_wechat_db_config.py` 生成)")
    lines.append("")
    lines.append("参考资料:")
    lines.append("- `万字长文带你了解Windows微信.md`(目录结构/部分表结构与含义)")
    lines.append("- 本项目前端页面与后端解析逻辑(字段命名与用途)")
    lines.append("")
    lines.append("注意:")
    lines.append("- 本文档尽量覆盖“库/表/字段”,字段含义部分来自启发式与公开资料,可能存在不准确之处。")
    lines.append("- 为避免泄露个人数据,类似 `Msg_` 的哈希表名会脱敏显示。")
    lines.append("- 部分 FTS 虚表可能依赖微信自定义 tokenizer(如 `MMFtsTokenizer`),普通 sqlite 环境下查询会报错;本文档字段来自建表 SQL/模板解析。")
    lines.append("")

    # Overview table: one row per database, including every message shard.
    lines.append("## 数据库总览")
    lines.append("")
    lines.append("| 数据库 | 描述 | 表数量 |")
    lines.append("|---|---|---:|")

    for db_name in sorted(databases.keys(), key=_db_sort_key):
        db = databases.get(db_name) or {}
        if not isinstance(db, dict):
            continue
        desc = db.get("description", "")
        tables = db.get("tables") or {}
        table_total = len(tables) if isinstance(tables, dict) else 0
        lines.append(f"| `{db_name}.db` | {_md_escape_cell(desc)} | {table_total} |")
    lines.append("")

    lines.append("## 本项目(前端)功能与数据库大致对应")
    lines.append("")
    lines.append("- 联系人/群聊:`contact.db`(contact/chat_room/chatroom_member/label 等)")
    lines.append("- 会话列表/未读:`session.db`(通常为 SessionTable/ChatInfo 等)")
    lines.append("- 聊天记录:`message_*.db`(`Msg_*` 表组 + `Name2Id` 映射等)")
    lines.append("- 消息资源/媒体:`message_resource.db` / `hardlink.db` / `media_0.db` / `head_image.db`")
    lines.append("- 朋友圈:`sns.db`")
    lines.append("- 收藏:`favorite.db`")
    lines.append("- 表情包:`emoticon.db`")
    lines.append("- 搜索:`chat_search_index.db` / `message_fts.db` / `*_fts.db`(不同版本/实现可能不同)")
    lines.append("")

    # Detailed section per database.
    for db_name in sorted(databases.keys(), key=_db_sort_key):
        # Skip duplicated details for message shards; only keep the last shard
        # as representative.
        if rep_message_db and re.match(r"^message_\d+$", str(db_name)) and str(db_name) != rep_message_db:
            continue

        db = databases.get(db_name) or {}
        if not isinstance(db, dict):
            continue

        desc = db.get("description", "")
        tables = db.get("tables") or {}
        if not isinstance(tables, dict):
            tables = {}

        display_table_items = _fold_same_schema_tables_for_display(tables)
        display_table_count = len(display_table_items)

        lines.append(f"## {db_name}.db")
        lines.append("")
        lines.append(f"- 描述:{_md_escape_cell(desc)}")
        if display_table_count != len(tables):
            lines.append(f"- 表数量:{len(tables)}(同结构表折叠后展示 {display_table_count})")
        else:
            lines.append(f"- 表数量:{len(tables)}")
        lines.append("")

        # Extra note for message shards.
        if re.match(r"^message_\d+$", db_name):
            if rep_message_db and db_name == rep_message_db and len(all_message_db_names) > 1:
                others = [n for n in all_message_db_names if n != rep_message_db]
                # Keep it short; avoid blowing up the doc with too many names
                # if there are lots of shards.
                if len(others) <= 10:
                    other_list = ", ".join(f"`{n}.db`" for n in others)
                    lines.append(f"本节仅展示最后一个分片 `{rep_message_db}.db` 的结构;其它分片结构通常一致:{other_list}。")
                else:
                    lines.append(
                        f"本节仅展示最后一个分片 `{rep_message_db}.db` 的结构;其它分片({len(others)} 个)结构通常一致。"
                    )
            lines.append("说明:")
            lines.append("- `Msg_*` 表组通常对应“每个联系人/会话一个表”,常见命名为 `Msg_{md5(wxid)}`。")
            lines.append("- 可通过对 wxid 做 md5 计算定位具体会话表;或结合 `Name2Id`/`name2id` 映射表进行解析。")
            lines.append("")
            lines.append("示例(Python):")
            lines.append("")
            lines.append("```python")
            lines.append("import hashlib")
            lines.append("")
            lines.append("wxid = \"wxid_xxx\"")
            lines.append("md5_hex = hashlib.md5(wxid.encode(\"utf-8\")).hexdigest()")
            lines.append("table = f\"Msg_{md5_hex}\"")
            lines.append("print(table)")
            lines.append("```")
            lines.append("")

        # One sub-section per (possibly folded) table.
        for table_name, table in display_table_items:
            if not isinstance(table, dict):
                continue

            t_type = table.get("type", "table")
            t_desc = table.get("description", "")

            # Table header; hash-suffixed names are masked before display.
            display_table_name = _mask_hash_table_name(table_name)
            lines.append(f"### {display_table_name}")
            lines.append("")
            if t_desc:
                lines.append(f"- 描述:{_md_escape_cell(t_desc)}")
            if t_type == "similar_group":
                pat = table.get("pattern") or display_table_name
                rep = table.get("representative_table")
                table_count = table.get("table_count")
                lines.append(f"- 类型:相似表组(pattern: `{_md_escape_cell(pat)}`)")
                if table_count is not None:
                    lines.append(f"- 表数量:{_md_escape_cell(table_count)}")
                if rep:
                    rep_s = str(rep)
                    rep_masked = _mask_hash_table_name(rep_s)
                    rep_note = "(已脱敏)" if rep_masked != rep_s else ""
                    lines.append(f"- 代表表:`{_md_escape_cell(rep_masked)}`{rep_note}")

                members = table.get("table_names") or table.get("tables")
                if isinstance(members, list) and members:
                    member_names = [_mask_hash_table_name(str(x)) for x in members]
                    if len(member_names) <= 20:
                        show = member_names
                        suffix = ""
                    else:
                        # Long groups: show the first 10 and last 5 names only.
                        show = member_names[:10] + ["..."] + member_names[-5:]
                        suffix = f"(共 {len(member_names)} 个)"
                    parts = [f"`{_md_escape_cell(n)}`" if n != "..." else "..." for n in show]
                    lines.append(f"- 包含表:{', '.join(parts)}{suffix}")
            lines.append("")

            fields = table.get("fields") or {}
            if not isinstance(fields, dict) or not fields:
                lines.append("_无字段信息_\n")
                continue

            lines.append("| 字段 | 类型 | 含义 | 备注 |")
            lines.append("|---|---|---|---|")
            for field_name in sorted(fields.keys()):
                fm = fields.get(field_name) or {}
                if not isinstance(fm, dict):
                    fm = {}
                f_type = fm.get("type", "")
                meaning = fm.get("meaning", "")
                notes = fm.get("notes", "")
                lines.append(
                    f"| `{_md_escape_cell(field_name)}` | `{_md_escape_cell(f_type)}` | {_md_escape_cell(meaning)} | {_md_escape_cell(notes)} |"
                )
            lines.append("")

    # Appendices.
    message_types = cfg.get("message_types") or {}
    if isinstance(message_types, dict) and message_types:
        mt = _render_message_type_map(message_types)
        if mt:
            lines.append(mt)

    friend_types = cfg.get("friend_types") or {}
    if isinstance(friend_types, dict) and friend_types:
        # friend_types in config usually uses string keys; entries whose key
        # is not an integer are skipped.
        items: list[tuple[int, str]] = []
        for k, v in friend_types.items():
            if k in {"_instructions", "examples"}:
                continue
            try:
                items.append((int(str(k)), str(v)))
            except ValueError:
                continue
        items.sort(key=lambda x: x[0])

        if items:
            lines.append("## 联系人类型(friend_type)速查")
            lines.append("")
            # Header, delimiter and data rows must be consecutive lines: a
            # blank line after the delimiter row would end the Markdown table.
            lines.append("| 值 | 含义 |")
            lines.append("|---:|---|")
            for code, desc in items:
                lines.append(f"| {code} | {_md_escape_cell(desc)} |")
            lines.append("")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
def build_db_descriptions() -> dict[str, str]:
    """Return the description text for each known database, keyed by its stem."""
    descriptions: dict[str, str] = {
        "message": "聊天记录核心数据库",
        # message_{n}.db shards have no entry here; per the original note they
        # are matched by regex in fill_config instead.
        "message_fts": "聊天消息全文索引数据库(FTS)",
        "message_resource": "消息资源索引数据库(图片/文件/视频等)",
        "contact": "联系人数据库(好友/群/公众号基础信息)",
        "session": "会话数据库(会话列表与未读统计)",
        "sns": "朋友圈数据库(动态与互动)",
        "favorite": "收藏数据库",
        "favorite_fts": "收藏全文索引数据库(FTS)",
        "emoticon": "表情包数据库",
        "head_image": "头像数据数据库",
        "hardlink": "硬链接索引数据库(资源去重/快速定位)",
        "media_0": "媒体数据数据库(含语音SILK等)",
        "unspportmsg": "不支持消息数据库(客户端不支持的消息类型)",
        "general": "通用/系统数据库(新消息通知/支付等)",
        "contact_fts": "联系人全文索引数据库(FTS)",
        "chat_search_index": "(本项目生成)聊天记录全文检索索引库(FTS5,用于搜索)",
        "bizchat": "公众号/企业微信相关数据库(会话/联系人等)",
        "digital_twin": "(本项目生成)数字分身数据库(派生数据,非微信原始库)",
    }
    return descriptions
KNOWN_FIELD_MEANINGS = { "c4": "FTS列c4(内部结构)", "c5": "FTS列c5(内部结构)", "c6": "FTS列c6(内部结构)", + "c7": "FTS列c7(内部结构)", + "c8": "FTS列c8(内部结构)", + "c9": "FTS列c9(内部结构)", + "c10": "FTS列c10(内部结构)", + "c11": "FTS列c11(内部结构)", + "c12": "FTS列c12(内部结构)", "sz": "FTS文档大小信息", "_rowid_": "SQLite内部行ID", @@ -199,12 +215,483 @@ KNOWN_FIELD_MEANINGS = { "last_sender_display_name": "最后一条消息发送者显示名", "last_msg_ext_type": "最后一条消息扩展类型", + # 常见“Key-Value”配置表(多库复用) + "key": "键(Key-Value配置表)", + "valueint64": "整数值(int64)", + "valuedouble": "浮点值(double)", + "valuestdstr": "字符串值(std::string)", + "valueblob": "二进制值(blob)", + "k": "配置键(k)", + "v": "配置值(v)", + + # 常见保留字段 + "reserved0": "保留字段(reserved0)", + "reserved1": "保留字段(reserved1)", + "reserved2": "保留字段(reserved2)", + "reserved3": "保留字段(reserved3)", + + # 版本/位标志 + "version": "版本号(记录/结构版本,具体含义依表而定)", + "bit_flag": "位标志/开关(bit flags)", + + # 本项目索引/缓存库常见字段 + "render_type": "渲染类型(本项目定义:text/image/system/...)", + "db_stem": "来源数据库分片名(如 message_0)", + "table_name": "来源表名(如 Msg_xxx)", + "sender_username": "发送者username(解码后)", + "preview": "会话预览文本(用于会话列表展示)", + "built_at": "构建时间(Unix时间戳,秒)", + "tablename": "表名(tableName)", + "value": "值(value)", + "brand_user_name": "品牌/公众号username(brand_user_name)", + + # 常见业务字段(命名自解释) + "ticket": "票据/验证ticket(ticket)", + "delete_table_name": "删除记录关联的消息表名(delete_table_name)", + "res_path": "资源路径(res_path)", + "biz_username": "公众号username(biz_username)", + "search_key": "搜索键/索引字段(search_key)", + "click_type": "点击/热词类型(click_type)", + "a_group_remark": "群备注(FTS检索字段:a_group_remark)", + "op_code": "操作码(op_code)", + "query": "查询关键词(query)", + "score": "评分/权重(score)", + "keyword": "关键词(keyword)", + "pay_load_": "payload/扩展数据(pay_load_)", + "bill_no": "账单号(bill_no)", + "session_title": "会话标题(session_title)", + "unread_stat": "未读统计字段(unread_stat)", + "ui_type": "UI类型/发布类型(ui_type)", + "error_type": "错误类型(error_type)", + "tips_content": "提示内容(tips_content)", + "record_content": "记录内容(record_content)", + "business_type": 
"业务类型(business_type)", + "access_content_key": "访问内容key(access_content_key)", + "access_content_type": "访问内容类型(access_content_type)", + "range_type": "范围类型(range_type)", + "message_local_type": "消息类型(message_local_type)", + "message_origin_source": "消息来源标识(message_origin_source)", + + # 朋友圈(sns)常见拆分字段 + "tid_heigh_bit": "tid 高位拆分字段(heigh_bit,字段名原样保留)", + "tid_low_bit": "tid 低位拆分字段(low_bit)", + "break_flag": "断点/分页标志(0/1;用于分页/增量拉取水位)", + # WCDB 压缩控制 "WCDB_CT_message_content": "WCDB压缩标记(message_content列)", "WCDB_CT_source": "WCDB压缩标记(source列)", } +# 表级字段含义覆盖(优先级高于 KNOWN_FIELD_MEANINGS) +# key: table_name.lower() ; value: { field_name.lower(): meaning } +KNOWN_FIELD_MEANINGS_BY_TABLE: dict[str, dict[str, str]] = { + # contact.db + "contact": { + "id": "序号(通常与 name2id.rowid 对应)", + "username": "联系人的 wxid / 群聊 username(可唯一确定联系人)", + "local_type": "联系人类型:1=通讯录好友/公众号/已添加群聊;2=未添加到通讯录的群聊;3=群中的陌生人;5=企业微信好友;6=群聊中的陌生企业微信好友", + "alias": "微信号(微信里显示的微信号)", + "flag": "联系人标志位(需转二进制;常见:第7位星标,第12位置顶,第17位屏蔽朋友圈,第24位仅聊天)", + "head_img_md5": "头像md5(可通过 head_image.db 查询对应头像)", + "verify_flag": "认证标志(公众号/企业等;非0常表示公众号)", + "description": "描述字段(样本为空;用途待确认)", + "extra_buffer": "好友扩展信息(protobuf;包含性别/地区/签名等,本项目解析 gender/signature/country/province/city/source_scene)", + "chat_room_notify": "群消息通知相关设置(样本为0/1;疑似免打扰/通知开关,待确认)", + "is_in_chat_room": "群聊状态标记(样本为1/2;具体含义待确认)", + "chat_room_type": "群聊类型/标志(样本为0/2;具体含义待确认)", + }, + "stranger": { + "id": "序号(通常与 name2id.rowid 对应)", + "username": "联系人的 wxid / 群聊 username", + "local_type": "联系人类型:1=通讯录好友/公众号/已添加群聊;2=未添加到通讯录的群聊;3=群中的陌生人;5=企业微信好友;6=群聊中的陌生企业微信好友", + "alias": "微信号(微信里显示的微信号)", + "flag": "联系人标志位(需转二进制;常见:第7位星标,第12位置顶,第17位屏蔽朋友圈,第24位仅聊天)", + "head_img_md5": "头像md5(可通过 head_image.db 查询对应头像)", + "verify_flag": "认证标志(公众号/企业等;非0常表示公众号)", + "description": "描述字段(样本为空;用途待确认)", + "extra_buffer": "好友扩展信息(protobuf;包含性别/地区/签名等,本项目解析 gender/signature/country/province/city/source_scene)", + "chat_room_notify": "群消息通知相关设置(样本为0/1;疑似免打扰/通知开关,待确认)", + 
"is_in_chat_room": "群聊状态标记(样本为1/2;具体含义待确认)", + "chat_room_type": "群聊类型/标志(样本为0/2;具体含义待确认)", + }, + "biz_info": { + "id": "序号(与 name2id.rowid 对应,可唯一确定一个公众号)", + "username": "公众号username(原始 wxid/gh_xxx)", + "type": "公众号类型:1=公众号,0=订阅号(资料来源:万字长文)", + "accept_type": "接收类型(accept_type;含义待确认,样本常为0)", + "child_type": "子类型(child_type;含义待确认,样本常为0)", + "version": "版本号(含义待确认,样本常为0)", + "external_info": "公众号详细信息(常见 JSON;含底部菜单/交互配置等)", + "brand_info": "公众号品牌/菜单信息(常见 JSON:urls 等)", + "brand_list": "品牌列表/关联列表(格式待确认,可能为 JSON)", + "brand_flag": "品牌/能力标志位(含义待确认)", + "belong": "归属字段(含义待确认)", + "home_url": "主页链接(含义待确认)", + }, + "chat_room": { + "id": "序号(与 name2id.rowid 对应)", + "username": "群聊的username(xxx@chatroom)", + "owner": "群主username", + "ext_buffer": "群成员username与群昵称(protobuf:ChatRoomData.members 等)", + }, + "chat_room_info_detail": { + "room_id_": "序号(与 name2id.rowid 对应)", + "username_": "群聊的username(xxx@chatroom)", + "announcement_": "群公告(文本)", + "announcement_editor_": "群公告编辑者username", + "announcement_publish_time_": "群公告发布时间(时间戳)", + "chat_room_status_": "群状态/标志位(bitmask;样本常见 0x80000 等,具体位含义待确认)", + "xml_announcement_": "群公告(XML,可解析更多信息:图片/文件等)", + "ext_buffer_": "扩展信息(protobuf-like;样本长度较小,具体结构待确认)", + }, + "chatroom_member": { + "room_id": "群聊ID(对应 name2id.rowid)", + "member_id": "群成员ID(对应 name2id.rowid)", + }, + "contact_label": { + "label_id_": "标签ID", + "label_name_": "标签名称", + "sort_order_": "排序", + }, + + # message_*.db / biz_message_*.db + "msg_*": { + "local_id": "自增id(本地)", + "server_id": "服务端id(每条消息唯一)", + "local_type": "消息类型(local_type;低32位=type,高32位=sub_type;可用 (local_type & 0xFFFFFFFF) 与 (local_type >> 32) 拆分)", + "sort_seq": "排序字段(单会话内消息排序;样本≈create_time*1000)", + "real_sender_id": "发送者id(可通过 Name2Id.rowid 映射到 username)", + "create_time": "秒级时间戳", + "server_seq": "服务端接收顺序id(server_seq)", + "message_content": "消息内容:local_type=1 时为文本,其它类型多为 Zstandard 压缩后的XML/二进制", + "compress_content": "压缩后的内容(多见 Zstandard)", + "packed_info_data": 
"protobuf扩展信息(图片文件名/语音转文字/合并转发文件夹名等)", + }, + "name2id": { + "is_session": "是否会话名标记(1=会话/聊天对象;0=其它映射,如群成员ID)", + }, + + # session.db + "sessiontable": { + "type": "会话类型(样本为0;枚举待确认)", + "status": "会话状态(样本为0;枚举待确认)", + "unread_first_pat_msg_local_id": "未读拍一拍消息的本地ID(样本为0;含义待确认)", + "unread_first_pat_msg_sort_seq": "未读拍一拍消息的排序序号(样本为0;含义待确认)", + }, + "session_last_message": { + "username": "会话username", + "sort_seq": "最后一条消息sort_seq", + "local_id": "最后一条消息local_id", + "create_time": "最后一条消息create_time(秒级时间戳)", + "local_type": "最后一条消息local_type", + "sender_username": "最后一条消息发送者username", + "preview": "最后一条消息预览文本(用于会话列表)", + "db_stem": "来源消息库分片名(如 message_0)", + "table_name": "来源消息表名(如 Msg_xxx)", + "built_at": "构建时间(Unix时间戳,秒)", + }, + + # 本项目 chat_search_index.db + "message_fts": { + "text": "可检索文本(索引内容)", + "render_type": "渲染类型(text/system/image/voice/video/emoji/...,本项目定义)", + "db_stem": "来源消息库分片名(如 message_0)", + "table_name": "来源消息表名(如 Msg_xxx)", + "sender_username": "发送者username(解码后)", + }, + + # emoticon.db + "knonstoreemoticontable": { + "type": "表情类型(样本均为3;枚举含义待确认)", + "caption": "表情说明/标题(caption)", + "product_id": "表情包/产品ID(product_id)", + "aes_key": "AES密钥(用于CDN下载解密)", + "auth_key": "鉴权key(CDN下载)", + "extern_md5": "外部资源md5(extern_md5)", + }, + "kstoreemoticonpackagetable": { + "package_id_": "表情包ID(package_id)", + "package_name_": "表情包名称", + "payment_status_": "支付状态(payment_status)", + "download_status_": "下载状态(download_status)", + "install_time_": "安装时间(时间戳)", + "remove_time_": "移除时间(时间戳)", + "sort_order_": "排序", + "introduction_": "简介(introduction)", + "full_description_": "完整描述(full_description)", + "copyright_": "版权信息", + "author_": "作者信息", + "store_icon_url_": "商店图标URL", + "panel_url_": "面板/详情页URL", + }, + "kstoreemoticonfilestable": { + "package_id_": "表情包ID(package_id)", + "md5_": "表情md5", + "type_": "表情类型(type)", + "sort_order_": "排序", + "emoticon_size_": "表情文件大小(字节)", + "emoticon_offset_": "表情文件偏移(用于包内定位)", + "thumb_size_": "缩略图大小(字节)", + 
"thumb_offset_": "缩略图偏移(用于包内定位)", + }, + + # favorite.db + "fav_db_item": { + "version": "版本号(收藏条目结构/内容版本;样本为87)", + "fromusr": "来源用户username(收藏来源)", + "realchatname": "来源群聊username(若收藏来源于群聊)", + "upload_error_code": "上传错误码", + "trans_res_error_code": "资源转换错误码(trans_res_error_code)", + }, + + # general.db + "ilink_voip": { + "wx_chatroom_": "群聊username(xxx@chatroom)", + "millsecond_": "毫秒时间戳/时间标记(字段名推断)", + "group_id_": "ILink group_id(字段名推断)", + "room_id_": "房间ID(字段名推断)", + "room_key_": "房间key(字段名推断)", + "route_id_": "路由ID(字段名推断)", + "voice_status_": "通话状态(字段名推断)", + "talker_create_user_": "发起者username(字段名推断)", + "not_friend_user_list_": "非好友成员列表(字段名推断)", + "members_": "成员列表(字段名推断)", + "is_ilink_": "是否ilink通话(字段名推断)", + "ever_quit_chatroom_": "是否曾退出群聊(字段名推断)", + }, + "fmessagetable": { + "user_name_": "用户名(好友验证/陌生人会话用户名)", + "type_": "消息类型(好友验证/系统消息;样本为37)", + "timestamp_": "时间戳", + "encrypt_user_name_": "加密用户名", + "content_": "内容(验证消息/系统提示等)", + "is_sender_": "是否发送方(is_sender)", + "ticket_": "票据/验证ticket", + "scene_": "来源场景码(scene)", + "fmessage_detail_buf_": "详细信息(protobuf-like;包含验证文案/来源等信息)", + }, + "handoff_remind_v0": { + "item_id": "条目ID(item_id)", + "head_icon": "图标(URL/资源标识)", + "title": "标题", + "desc_type": "描述类型(desc_type)", + "create_time": "创建时间(时间戳)", + "start_time": "开始时间(时间戳)", + "expire_time": "过期时间(时间戳)", + "biz_type": "业务类型(biz_type)", + "version": "版本号(version)", + "url": "跳转URL", + "extra_info": "扩展信息(extra_info)", + }, + "transfertable": { + "transfer_id": "转账ID(transfer_id)", + "transcation_id": "交易ID(transaction_id,原字段拼写保留)", + "message_server_id": "关联消息server_id", + "second_message_server_id": "关联第二条转账消息server_id(可在 message_*.db::Msg_* 表的 server_id 对应到)", + "session_name": "会话username", + "pay_sub_type": "支付子类型(pay_sub_type)", + "pay_receiver": "收款方username", + "pay_payer": "付款方username", + "begin_transfer_time": "转账开始时间(时间戳)", + "last_modified_time": "最后修改时间(时间戳)", + "invalid_time": "失效时间(时间戳)", + "last_update_time": "最后更新时间(时间戳)", + 
"delay_confirm_flag": "延迟确认标志(delay_confirm_flag)", + "bubble_clicked_flag": "气泡点击标志(bubble_clicked_flag)", + }, + + # bizchat.db + "chat_group": { + "brand_user_name": "品牌/公众号username(brand_user_name)", + "bit_flag": "位标志/开关(bit_flag)", + "chat_name": "群组名称(chat_name)", + "user_list": "成员列表(常见为 ; 分隔的 user_id/username 列表;待确认)", + "reserved0": "保留字段(reserved0)", + "reserved1": "保留字段(reserved1)", + "reserved2": "保留字段(reserved2)", + "reserved3": "保留字段(reserved3)", + }, + "user_info": { + "brand_user_name": "品牌/公众号username(brand_user_name)", + "bit_flag": "位标志/开关(bit_flag)", + "reserved0": "保留字段(reserved0)", + "reserved1": "保留字段(reserved1)", + "reserved2": "保留字段(reserved2)", + "reserved3": "保留字段(reserved3)", + }, + + # sns.db + "snsmessage_tmp3": { + "from_username": "来源用户username(评论/点赞发起者)", + "from_nickname": "来源用户昵称(评论/点赞发起者)", + "to_username": "目标用户username(被回复/被@的人)", + "to_nickname": "目标用户昵称(被回复/被@的人)", + "comment_flag": "评论标志位(样本为0;具体 bit 含义待确认)", + }, + "snsadtimeline": { + "ad_content": "广告内容(ad_content,格式待确认)", + "remind_source_info": "提醒来源信息(remind_source_info,格式待确认)", + "remind_self_info": "提醒自身信息(remind_self_info,格式待确认)", + "extra_data": "扩展数据(extra_data,格式待确认)", + }, + + # unspportmsg.db + "unsupportmessage": { + "from_user": "发送者username", + "to_user": "接收者username", + "msg_source": "消息来源附加信息(msg_source)", + }, + + # contact.db + "openim_wording": { + "wording": "文案/提示语(wording)", + "pinyin": "拼音(pinyin)", + }, + + # message_*.db / biz_message_*.db (WCDB) + "wcdb_builtin_compression_record": { + "tablename": "表名(tableName)", + "columns": "被WCDB压缩的列列表(columns)", + }, + + # general.db + "revokemessage": { + "to_user_name": "会话username(撤回消息所在会话)", + "message_type": "消息类型(local_type)", + "at_user_list": "@用户列表(字段名推断)", + }, + "wcfinderlivestatus": { + "finder_username": "视频号作者username(finder_username)", + "charge_flag": "是否付费/收费标志(charge_flag)", + }, + "new_tips": { + "disable": "禁用标志(disable)", + "new_tips_content": "提示内容(new_tips_content)", + }, + 
"redenvelopetable": { + "sender_user_name": "红包发送者username", + "hb_type": "红包类型(hb_type)", + }, + "wacontact": { + "external_info": "外部信息(JSON;常见包含 BindWxaInfo/RegisterSource/WxaAppDynamic 等)", + "contact_pack_data": "联系人打包数据(protobuf-like;常含昵称/品牌名等)", + "wx_app_opt": "小程序/应用选项(wx_app_opt;位标志/开关;样本为0)", + }, + + # emoticon.db + "kstoreemoticoncaptionstable": { + "package_id_": "表情包ID(package_id)", + "md5_": "表情md5", + "language_": "语言(language)", + "caption_": "文案/标题(caption)", + }, +} + + +KNOWN_TABLE_DESCRIPTIONS: dict[str, str] = { + # contact.db + "biz_info": "公众号信息表(公众号类型/菜单/品牌信息等)", + "chat_room": "群聊基础信息表(群主/成员列表等扩展在 ext_buffer)", + "chat_room_info_detail": "群聊详细信息表(群公告/群状态等)", + "chatroom_member": "群聊成员映射表(room_id ↔ member_id)", + "contact": "联系人核心表(好友/群/公众号等基础信息)", + "contact_label": "联系人标签表(标签ID与名称)", + "name2id": "用户名(wxid/群id@chatroom 等)到内部数值ID映射表", + "encrypt_name2id": "加密用户名到内部数值ID映射表", + "stranger": "陌生人/临时会话信息表", + "ticket_info": "票据/会话票据信息表(用途待进一步确认)", + "stranger_ticket_info": "陌生人票据信息表(用途待进一步确认)", + "oplog": "操作/同步日志表(增量同步相关)", + "openim_appid": "OpenIM 应用ID表(企业微信/互通相关)", + "openim_acct_type": "OpenIM 账号类型表", + "openim_wording": "OpenIM 文案/提示语表", + + # session.db + "sessiontable": "会话列表表(会话展示/未读/置顶/隐藏等)", + "sessiondeletetable": "会话删除记录表", + "sessionunreadlisttable_1": "未读会话列表表(分表)", + "sessionunreadstattable_1": "未读统计表(分表)", + "sessionnocontactinfotable": "会话表(无联系人信息的会话)", + "session_last_message": "会话最后一条消息缓存/索引表(版本/实现差异)", + + # message_*.db / biz_message_*.db + "timestamp": "时间戳/增量同步辅助表", + "deleteinfo": "删除消息记录表(删除/撤回相关)", + "deleteresinfo": "删除资源记录表(资源删除相关)", + "sendinfo": "发送相关信息表(发送状态/队列等)", + "historysysmsginfo": "历史系统消息表", + "historyaddmsginfo": "历史新增消息表", + + # message_resource.db + "chatname2id": "会话名 → 会话ID 映射表(资源库维度)", + "sendername2id": "发送者名 → 发送者ID 映射表(资源库维度)", + "messageresourceinfo": "消息资源索引表(按消息/会话定位资源)", + "messageresourcedetail": "消息资源明细表(md5/路径/大小等)", + "ftsrange": "FTS 范围信息表(搜索/索引辅助)", + "ftsdeleteinfo": "FTS 删除记录表(索引维护)", 
+ + # media_0.db + "voiceinfo": "语音数据表(voice_data 等)", + + # hardlink.db + "db_info": "WCDB Key-Value 元信息表(FTS构建状态/版本/扫描时间等)", + "dir2id": "目录 → ID 映射表(硬链接索引)", + "image_hardlink_info_v4": "图片硬链接索引表(v4)", + "file_hardlink_info_v4": "文件硬链接索引表(v4)", + "video_hardlink_info_v4": "视频硬链接索引表(v4)", + "file_checkpoint_v4": "文件索引检查点(增量)", + "video_checkpoint_v4": "视频索引检查点(增量)", + "talker_checkpoint_v4": "会话索引检查点(增量)", + + # *_fts.db / message_fts.db + "table_info": "WCDB Key-Value 元信息表(索引范围/水位/时间戳等)", + + # head_image.db + "head_image": "头像缓存表(头像 md5/二进制缩略图等)", + + # favorite.db + "buff": "WCDB Key-Value 缓冲/配置表(收藏等模块的缓存)", + "fav_db_item": "收藏条目表", + "fav_tag_db_item": "收藏标签表", + "fav_bind_tag_db_item": "收藏条目与标签绑定表", + + # emoticon.db + "kcustomemoticonordertable": "自定义表情排序表(md5 列表)", + "kexpressrecentuseeemoticontable": "最近使用表情记录(Key-Value)", + "knonstoreemoticontable": "非商店表情表(用户收藏/外部表情资源;含CDN下载信息)", + "kstoreemoticonpackagetable": "商店表情包信息表(package 元数据)", + "kstoreemoticoncaptionstable": "商店表情文案表(多语言 caption)", + + # unspportmsg.db + "unsupportmessage": "不支持消息表(PC端无法直接展示的消息类型)", + + # bizchat.db + "chat_group": "BizChat 群组表(企业微信/公众号群组信息)", + "user_info": "BizChat 用户表(企业微信/公众号用户信息)", + "my_user_info": "BizChat 当前账号映射表(brand_user_name ↔ user_id)", + + # general.db + "forwardrecent": "最近转发会话记录表(username/时间)", + "transfertable": "转账记录表(转账ID/关联消息/状态等)", + "redenvelopetable": "红包记录表(关联消息/状态等)", + "ilink_voip": "iLink/群通话相关表(房间ID/成员/状态等)", + "fmessagetable": "好友验证/陌生人消息表(FMessage)", + "handoff_remind_v0": "跨设备接力/提醒项表(handoff_remind_v0)", + "biz_pay_status": "公众号文章付费状态表(url_id/is_paid 等)", + "biz_subscribe_status": "公众号订阅模板状态表(template_id/is_subscribe)", + "new_tips": "新提示/新功能提示表", + "reddot": "小红点提示表", + "reddot_record": "小红点记录表", + "wcfinderlivestatus": "视频号直播状态表", + "teenager_apply_access_agree_info": "青少年模式访问同意记录表", + + # chat_search_index.db(本项目生成) + "meta": "索引元数据表(schema_version/构建时间等)", + "message_fts": "全文索引表(fts5,用于搜索)", +} + + def simple_heuristic(field_name: str, 
table_name: str) -> str: """简易兜底启发式,避免完全空白""" f = field_name.lower() @@ -243,10 +730,17 @@ def simple_heuristic(field_name: str, table_name: str) -> str: def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str: - # 优先精确已知映射 + lt = table_name.lower() + lf = field_name.lower() + + # 1) 表级覆盖优先 + tmap = KNOWN_FIELD_MEANINGS_BY_TABLE.get(lt) + if tmap and lf in tmap: + return tmap[lf] + + # 2) 全局精确映射 if field_name in KNOWN_FIELD_MEANINGS: return KNOWN_FIELD_MEANINGS[field_name] - lf = field_name.lower() if lf in KNOWN_FIELD_MEANINGS: return KNOWN_FIELD_MEANINGS[lf] @@ -266,13 +760,44 @@ def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str: def guess_table_desc(analyzer, table_name: str) -> str: + # 简易猜测(优先命中已知表名) + tl = table_name.lower() + + # 已知表名(大小写不敏感) + if tl in KNOWN_TABLE_DESCRIPTIONS: + return KNOWN_TABLE_DESCRIPTIONS[tl] + + # SQLite / WCDB 内置 + if tl == "sqlite_sequence": + return "SQLite 自增序列表" + if tl.startswith("wcdb"): + return "WCDB 内置表(压缩/元数据等)" + + # FTS 内部表(多为 *_data/_idx/_config/_content/_docsize/_aux) + if "fts" in tl: + if tl.endswith("_data"): + return "全文检索(FTS)内部数据表" + if tl.endswith("_idx"): + return "全文检索(FTS)内部索引表" + if tl.endswith("_config"): + return "全文检索(FTS)内部配置表" + if tl.endswith("_content"): + return "全文检索(FTS)内部内容表" + if tl.endswith("_docsize"): + return "全文检索(FTS)内部文档长度表" + if tl.endswith("_aux") or "_aux_" in tl: + return "全文检索(FTS)辅助表" + return "全文检索(FTS)表/索引表" + + # 借助分析器的启发式(如果可用,且不是“未知功能表”) if analyzer is not None: try: - return analyzer.guess_table_function(table_name) + guessed = analyzer.guess_table_function(table_name) + if isinstance(guessed, str) and guessed.strip() and guessed.strip() != "未知功能表": + return guessed.strip() except Exception: pass - # 简易猜测 - tl = table_name.lower() + if tl == "msg" or tl.startswith("msg_"): return "某会话的消息表(聊天消息数据)" if "name2id" in tl: @@ -281,10 +806,18 @@ def guess_table_desc(analyzer, table_name: str) -> str: return "联系人/群聊信息表" if "session" in 
tl: return "会话信息/未读统计表" - if "fts" in tl: - return "全文检索(FTS)内部表" if "resource" in tl: return "消息资源/附件索引表" + if "voice" in tl: + return "语音相关数据表" + if "image" in tl or "img" in tl: + return "图片相关数据表" + if "video" in tl: + return "视频相关数据表" + if "file" in tl: + return "文件相关数据表" + if "sns" in tl: + return "朋友圈相关数据表" return "未知功能表" @@ -301,13 +834,38 @@ def fill_config(template: dict) -> dict: # 数据库描述补齐 db_desc_map = build_db_descriptions() + def guess_db_desc(db_name: str) -> str: + # 1) 精确映射优先 + if db_name in db_desc_map: + return db_desc_map[db_name] + + # 2) 常见分片/变体:message_{n}.db + m = re.match(r"^message_(\d+)$", db_name) + if m: + return f"聊天记录数据库分片(message_{m.group(1)}.db)" + + # 3) 公众号/企业微信消息库:biz_message_{n}.db(结构通常同 message_{n}.db) + m = re.match(r"^biz_message_(\d+)$", db_name) + if m: + return f"公众号消息记录数据库(biz_message_{m.group(1)}.db,结构通常同 message_{m.group(1)}.db)" + + # 4) FTS/索引类库:*_fts.db + if db_name.endswith("_fts"): + return "全文索引数据库(FTS)" + + # 5) 退化到 base 前缀 + base = db_name.split("_", 1)[0] + if base in db_desc_map: + return db_desc_map[base] + + return "未知用途数据库" + databases = template.get("databases", {}) for db_name, db in databases.items(): if isinstance(db, dict): # 数据库级描述 if not db.get("description"): - # 用已知映射或尝试推断 - db["description"] = db_desc_map.get(db_name, db.get("description", "")) or "未知用途数据库" + db["description"] = guess_db_desc(db_name) # 遍历表 tables = db.get("tables", {}) @@ -378,4 +936,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()