improvement(tools): 增强配置模板与字段含义生成

- generate_config_template: 增加 CLI 参数;FTS/PRAGMA 失败时从建表 SQL 兜底解析列
- generate_wechat_db_config: 扩充库描述/字段含义词典,并支持从 tools/ 目录运行
- 新增 export_database_schema_markdown:基于 wechat_db_config.json 导出 Markdown 文档
This commit is contained in:
2977094657
2026-02-15 14:34:15 +08:00
parent 68bdcf6369
commit 35a2266b1c
3 changed files with 1239 additions and 51 deletions

View File

@@ -6,6 +6,7 @@
import sqlite3 import sqlite3
import json import json
import argparse
from pathlib import Path from pathlib import Path
from typing import Dict, List, Any from typing import Dict, List, Any
from collections import defaultdict from collections import defaultdict
@@ -127,6 +128,82 @@ class ConfigTemplateGenerator:
try: try:
cursor = conn.cursor() cursor = conn.cursor()
def parse_columns_from_create_sql(create_sql: str) -> list[tuple[str, str]]:
    """Best-effort extraction of column definitions from a CREATE statement.

    Intended as a fallback when ``PRAGMA table_info`` cannot be used (for
    example FTS5 tables whose tokenizer extension is not available).

    Args:
        create_sql: The CREATE statement text; may be empty.

    Returns:
        ``(name, type)`` tuples in declaration order. The type is the
        upper-cased second token of the column definition, or ``"TEXT"`` when
        no type token is present. Returns an empty (or partial) list when the
        statement cannot be parsed.
    """
    # Table-level clauses are recognised by their *whole* leading keyword so
    # that columns such as ``checksum`` or ``unique_id`` are not mistaken for
    # CHECK / UNIQUE constraints.
    constraint_keywords = ("constraint", "primary", "unique", "foreign", "check")
    # FTS5 option names that appear as ``key = value`` inside the column list.
    fts_option_keys = ("tokenize", "prefix", "content", "content_rowid", "compress", "uncompress")

    out: list[tuple[str, str]] = []
    if not create_sql:
        return out
    try:
        start = create_sql.find("(")
        end = create_sql.rfind(")")
        if start == -1 or end == -1 or end <= start:
            return out
        inner = create_sql[start + 1:end]

        # Split on commas that are not nested inside parentheses, so that
        # e.g. ``DECIMAL(10, 2)`` or ``PRIMARY KEY (a, b)`` stay in one piece.
        parts: list[str] = []
        buf: list[str] = []
        depth = 0
        for ch in inner:
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
            if ch == "," and depth == 0:
                parts.append("".join(buf).strip())
                buf = []
            else:
                buf.append(ch)
        tail = "".join(buf).strip()
        if tail:
            parts.append(tail)

        for part in parts:
            token = part.strip()
            if not token:
                continue
            low = token.lower()
            # Skip table constraints / foreign keys. The keyword may be
            # followed directly by "(" (``CHECK(x > 0)``), so cut there first.
            lead = low.split("(", 1)[0].split()
            if lead and lead[0] in constraint_keywords:
                continue
            # Skip FTS5 options (tokenize/prefix/content/content_rowid ...).
            if "=" in token:
                key = token.split("=", 1)[0].strip().lower()
                if key in fts_option_keys:
                    continue
            tokens = token.split()
            if not tokens:
                continue
            name = tokens[0].strip("`\"[]")
            typ = tokens[1].upper() if len(tokens) > 1 and "=" not in tokens[1] else "TEXT"
            out.append((name, typ))
    except Exception:
        # Deliberately best-effort: return whatever was parsed so far.
        return out
    return out
def get_table_columns(table_name: str) -> list[tuple[str, str]]:
    """Return ``(column_name, column_type)`` pairs for *table_name*.

    Tries ``PRAGMA table_info`` first; when that fails or yields no rows,
    falls back to parsing the CREATE statement stored in ``sqlite_master``.
    Uses the ``cursor`` of the enclosing scope.

    Args:
        table_name: Name of the table to inspect.

    Returns:
        Column pairs in declaration order, or an empty list when neither
        strategy produces any column.
    """
    # Quote the name as an SQL identifier (doubling embedded quotes) so that
    # table names containing special characters are passed through intact.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'

    # First attempt: PRAGMA.
    try:
        cursor.execute(f"PRAGMA table_info({quoted_name})")
        columns = cursor.fetchall()
        if columns:
            # table_info rows: index 1 is the column name, index 2 its type.
            return [(col[1], col[2]) for col in columns]
    except Exception:
        # Best-effort: any failure here simply triggers the fallback below.
        pass

    # Fallback: parse the CREATE statement from sqlite_master.sql.
    try:
        cursor.execute(
            "SELECT sql FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        row = cursor.fetchone()
        create_sql = row[0] if row and len(row) > 0 else ""
        return parse_columns_from_create_sql(create_sql or "")
    except Exception:
        return []
# 获取所有表名 # 获取所有表名
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
@@ -152,13 +229,10 @@ class ConfigTemplateGenerator:
table_key = f"{prefix}_*" # 使用模式名 table_key = f"{prefix}_*" # 使用模式名
# 获取代表表的字段信息 # 获取代表表的字段信息
cursor.execute(f"PRAGMA table_info({representative_table})") columns = get_table_columns(representative_table)
columns = cursor.fetchall()
fields = {} fields = {}
for col in columns: for field_name, field_type in columns:
field_name = col[1]
field_type = col[2]
fields[field_name] = { fields[field_name] = {
"type": field_type, "type": field_type,
"meaning": "", # 留空供用户填写 "meaning": "", # 留空供用户填写
@@ -188,13 +262,10 @@ class ConfigTemplateGenerator:
try: try:
# 获取表字段信息 # 获取表字段信息
cursor.execute(f"PRAGMA table_info({table_name})") columns = get_table_columns(table_name)
columns = cursor.fetchall()
fields = {} fields = {}
for col in columns: for field_name, field_type in columns:
field_name = col[1]
field_type = col[2]
fields[field_name] = { fields[field_name] = {
"type": field_type, "type": field_type,
"meaning": "", # 留空供用户填写 "meaning": "", # 留空供用户填写
@@ -219,16 +290,23 @@ class ConfigTemplateGenerator:
finally: finally:
conn.close() conn.close()
def generate_template(self, output_file: str = "wechat_db_config_template.json"): def generate_template(
self,
output_file: str = "wechat_db_config_template.json",
*,
include_excluded: bool = False,
include_message_shards: bool = False,
exclude_db_stems: set[str] | None = None,
):
"""生成配置模板""" """生成配置模板"""
print("开始生成微信数据库配置模板...") print("开始生成微信数据库配置模板...")
# 定义要排除的数据库模式和描述 # 定义要排除的数据库模式和描述
excluded_patterns = { excluded_patterns = {} if include_excluded else {
r'biz_message_\d+\.db$': '企业微信聊天记录数据库', r'biz_message_\d+\.db$': '公众号/企业微信聊天记录数据库(通常不参与个人聊天分析)',
r'bizchat\.db$': '企业微信联系人数据库', r'bizchat\.db$': '企业微信联系人/会话数据库(通常不参与个人聊天分析)',
r'contact_fts\.db$': '搜索联系人数据库', r'contact_fts\.db$': '联系人搜索索引数据库FTS',
r'favorite_fts\.db$': '搜索收藏数据库' r'favorite_fts\.db$': '收藏搜索索引数据库FTS'
} }
# 查找所有数据库文件 # 查找所有数据库文件
@@ -263,29 +341,38 @@ class ConfigTemplateGenerator:
for excluded_file, description in excluded_files: for excluded_file, description in excluded_files:
print(f" - {excluded_file.name} ({description})") print(f" - {excluded_file.name} ({description})")
# 显式排除指定 stem不含 .db
if exclude_db_stems:
before = len(db_files)
db_files = [p for p in db_files if p.stem not in exclude_db_stems]
after = len(db_files)
if before != after:
print(f"\n按 --exclude-db-stem 排除 {before - after} 个数据库: {sorted(exclude_db_stems)}")
print(f"\n实际处理 {len(db_files)} 个数据库文件") print(f"\n实际处理 {len(db_files)} 个数据库文件")
# 过滤message数据库只保留倒数第二个与主脚本逻辑一致 # 过滤message数据库只保留倒数第二个与主脚本逻辑一致
message_numbered_dbs = [] if not include_message_shards:
message_other_dbs = [] message_numbered_dbs = []
message_other_dbs = []
for db in db_files:
if re.match(r'message_\d+$', db.stem): # message_{数字}.db for db in db_files:
message_numbered_dbs.append(db) if re.match(r'message_\d+$', db.stem): # message_{数字}.db
elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等 message_numbered_dbs.append(db)
message_other_dbs.append(db) elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等
message_other_dbs.append(db)
if len(message_numbered_dbs) > 1:
# 按数字编号排序(提取数字进行排序) if len(message_numbered_dbs) > 1:
message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1))) # 按数字编号排序(提取数字进行排序)
# 选择倒数第二个(按编号排序) message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1)))
selected_message_db = message_numbered_dbs[-2] # 倒数第二个 # 选择倒数第二个(按编号排序)
print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库") selected_message_db = message_numbered_dbs[-2] # 倒数第二个
print(f"选择倒数第二个: {selected_message_db.name}") print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库")
print(f"选择倒数第二个: {selected_message_db.name}")
# 从db_files中移除其他message_{数字}.db数据库但保留message_fts.db等
db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)] # 从db_files中移除其他message_{数字}.db数据库但保留message_fts.db等
db_files.append(selected_message_db) db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)]
db_files.append(selected_message_db)
print(f"实际分析 {len(db_files)} 个数据库文件") print(f"实际分析 {len(db_files)} 个数据库文件")
@@ -370,11 +457,24 @@ class ConfigTemplateGenerator:
def main(): def main():
"""主函数""" """主函数"""
parser = argparse.ArgumentParser(description="微信数据库字段配置模板生成器")
parser.add_argument("--databases-path", default="output/databases", help="解密后的数据库根目录(按账号分目录)")
parser.add_argument("--output", default="wechat_db_config_template.json", help="输出 JSON 模板路径")
parser.add_argument("--include-excluded", action="store_true", help="包含默认会被排除的数据库(如 bizchat/contact_fts/favorite_fts 等)")
parser.add_argument("--include-message-shards", action="store_true", help="包含所有 message_{n}.db否则仅保留倒数第二个作代表")
parser.add_argument("--exclude-db-stem", action="append", default=[], help="按 stem不含 .db排除数据库可重复例如: --exclude-db-stem digital_twin")
args = parser.parse_args()
print("微信数据库配置模板生成器") print("微信数据库配置模板生成器")
print("=" * 50) print("=" * 50)
generator = ConfigTemplateGenerator() generator = ConfigTemplateGenerator(databases_path=args.databases_path)
generator.generate_template() generator.generate_template(
output_file=args.output,
include_excluded=bool(args.include_excluded),
include_message_shards=bool(args.include_message_shards),
exclude_db_stems=set(args.exclude_db_stem or []),
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -0,0 +1,530 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
导出微信数据库字段配置为一份 Markdown 文档(单文件):
- 输入wechat_db_config.json由 tools/generate_wechat_db_config.py 生成)
- 输出Markdown包含数据库 → 表/表组 → 字段与含义)
说明:
- 本脚本只基于“配置文件中的结构与字段含义”生成文档,不会读取真实数据内容;
- 会对类似 Msg_<md5> 这类用户相关的哈希表名做脱敏显示。
- 会将“同结构但表名仅数字不同”的重复表自动折叠为一个表组(常见于 FTS 分片/内部表)。
用法示例:
python tools/export_database_schema_markdown.py \
--config wechat_db_config.json \
--output docs/wechat_database_schema.md
"""
from __future__ import annotations
import argparse
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any
# Directory one level above the folder that contains this script; used for
# the default --config / --output locations.
ROOT = Path(__file__).resolve().parents[1]
# Table names of the form "<alphanumeric prefix>_<16 or more hex digits>"
# (e.g. Msg_<md5>); group 1 captures the prefix, group 2 the hex suffix.
_HASH_TABLE_RE = re.compile(r"^([A-Za-z0-9]+)_([0-9a-fA-F]{16,})$")
def _md_escape_cell(v: Any) -> str:
"""Escape Markdown table cell content."""
if v is None:
return "-"
s = str(v)
# Keep it one-line for tables.
s = s.replace("\r", " ").replace("\n", " ").strip()
# Escape pipe
s = s.replace("|", r"\|")
return s if s else "-"
def _mask_hash_table_name(name: str) -> str:
    """Hide the hex suffix of hash-named tables.

    A name matching ``_HASH_TABLE_RE`` (e.g. ``Msg_00140f...``) is returned as
    ``<prefix>_<hash>``; any other name is returned unchanged.
    """
    hit = _HASH_TABLE_RE.match(name)
    return f"{hit.group(1)}_<hash>" if hit else name
def _db_sort_key(db_name: str) -> tuple[int, int, str]:
"""
Roughly sort DBs by importance for readers.
"""
# Core
if db_name == "contact":
return (10, 0, db_name)
if db_name == "session":
return (20, 0, db_name)
m = re.match(r"^message_(\d+)$", db_name)
if m:
return (30, int(m.group(1)), db_name)
if re.match(r"^biz_message_(\d+)$", db_name):
n = int(re.match(r"^biz_message_(\d+)$", db_name).group(1)) # type: ignore[union-attr]
return (31, n, db_name)
if db_name == "message_resource":
return (40, 0, db_name)
if db_name == "media_0":
return (41, 0, db_name)
if db_name == "hardlink":
return (42, 0, db_name)
if db_name == "head_image":
return (43, 0, db_name)
# Social / content
if db_name == "sns":
return (50, 0, db_name)
if db_name == "favorite":
return (60, 0, db_name)
if db_name == "emoticon":
return (70, 0, db_name)
# System / misc
if db_name in {"general", "unspportmsg"}:
return (80, 0, db_name)
# Search / index
if db_name in {"chat_search_index", "message_fts"} or db_name.endswith("_fts"):
return (90, 0, db_name)
# Others
return (100, 0, db_name)
def _render_message_type_map(message_types: dict[str, Any]) -> str:
# In Windows WeChat v4, `local_type` is commonly a 64-bit integer:
# raw = (sub_type << 32) | type
# Some configs may still store explicit (type, sub_type) pairs; handle both.
items: list[tuple[int, int, int, str]] = []
for k, v in message_types.items():
if k in {"_instructions", "examples"}:
continue
if not isinstance(k, str) or "," not in k:
continue
a, b = k.split(",", 1)
try:
a_i = int(a)
b_i = int(b)
except Exception:
continue
desc = str(v)
if b_i != 0:
msg_type = a_i
msg_sub = b_i
raw = (msg_sub << 32) | (msg_type & 0xFFFFFFFF)
else:
raw = a_i
msg_type = raw & 0xFFFFFFFF
msg_sub = (raw >> 32) & 0xFFFFFFFF
items.append((raw, msg_type, msg_sub, desc))
if not items:
return ""
# Sort by decoded (type, sub_type), then raw value.
items.sort(key=lambda x: (x[1], x[2], x[0]))
out = "## 消息类型local_type速查\n\n"
out += "说明Windows 微信 v4 的 `local_type` 常见为 64 位整型:`raw = (sub_type<<32) | type`。\n\n"
out += "| local_type(raw) | type(low32) | sub_type(high32) | 含义 |\n|---:|---:|---:|---|\n"
for raw, t, st, desc in items:
out += f"| {raw} | {t} | {st} | {_md_escape_cell(desc)} |\n"
return out + "\n"
def _table_schema_signature(table: dict[str, Any]) -> tuple[str, str, tuple[tuple[str, str, str, str], ...]]:
"""
Build a stable signature for a table schema in config.
Used to fold tables which are structurally identical but only differ in name
(e.g. message_fts_v4_aux_0..3).
"""
t_type = str(table.get("type", "table"))
desc = str(table.get("description", ""))
fields = table.get("fields") or {}
items: list[tuple[str, str, str, str]] = []
if isinstance(fields, dict):
for field_name, fm in fields.items():
if not isinstance(fm, dict):
fm = {}
items.append(
(
str(field_name),
str(fm.get("type", "")),
str(fm.get("meaning", "")),
str(fm.get("notes", "")),
)
)
items.sort(key=lambda x: x[0])
return (t_type, desc, tuple(items))
def _name_family_key(name: str) -> str:
"""Normalize a table name into a family key by replacing digit runs with {n}."""
return re.sub(r"\d+", "{n}", name)
def _make_group_pattern(table_names: list[str]) -> str:
"""
Make a readable pattern for a group of similar table names:
- Only varying numeric segments become `{n}`
- Constant numeric segments are kept as-is
Example:
message_fts_v4_0/message_fts_v4_1 -> message_fts_v4_{n}
ImgFts0V0/ImgFts1V0 -> ImgFts{n}V0
"""
if not table_names:
return ""
tokenized = [re.split(r"(\d+)", n) for n in table_names]
base = tokenized[0]
# Ensure token structures match; otherwise fall back to a simple normalization.
for t in tokenized[1:]:
if len(t) != len(base):
return _name_family_key(table_names[0])
for i in range(0, len(base), 2):
if t[i] != base[i]:
return _name_family_key(table_names[0])
out_parts: list[str] = []
for i, part in enumerate(base):
if i % 2 == 0:
out_parts.append(part)
continue
nums = {t[i] for t in tokenized if i < len(t)}
out_parts.append(part if len(nums) == 1 else "{n}")
return "".join(out_parts)
def _fold_same_schema_tables_for_display(
tables: dict[str, Any],
) -> list[tuple[str, dict[str, Any]]]:
"""
Fold duplicated tables that share the same schema/signature but only differ in name.
This is common in FTS shards, e.g.:
message_fts_v4_aux_0..3
message_fts_v4_0..3 and their internal *_content/*_data/*_idx tables
ImgFts0V0..3 and their internal tables
Returns a list of (display_name, table_dict) items sorted by the original table name order.
"""
if not tables:
return []
# (family_key, schema_sig) -> [table_name, ...]
groups: dict[tuple[str, tuple[str, str, tuple[tuple[str, str, str, str], ...]]], list[str]] = {}
for table_name, table in tables.items():
if not isinstance(table, dict):
continue
if str(table.get("type", "table")) == "similar_group":
continue
family = _name_family_key(str(table_name))
sig = _table_schema_signature(table)
groups.setdefault((family, sig), []).append(str(table_name))
consumed: set[str] = set()
items: list[tuple[str, str, dict[str, Any]]] = [] # (sort_key, display_name, table)
used_display_names: set[str] = set()
# Create auto "similar_group" entries for groups > 1.
for (_, _), names in sorted(groups.items(), key=lambda x: x[0][0]):
if len(names) <= 1:
continue
names_sorted = sorted(names)
rep = names_sorted[0]
rep_table = tables.get(rep)
if not isinstance(rep_table, dict):
continue
pattern = _make_group_pattern(names_sorted)
if not pattern:
pattern = _name_family_key(rep)
display_name = pattern
if display_name in used_display_names:
# Rare: same name pattern but different schema signatures. Disambiguate.
n = 2
while f"{pattern} (var{n})" in used_display_names:
n += 1
display_name = f"{pattern} (var{n})"
group_entry = dict(rep_table)
group_entry.update(
{
"type": "similar_group",
"pattern": pattern,
"table_count": len(names_sorted),
"representative_table": rep,
"table_names": names_sorted,
}
)
items.append((rep, display_name, group_entry))
used_display_names.add(display_name)
consumed.update(names_sorted)
# Keep non-grouped tables (and existing similar_group) as-is.
for table_name, table in tables.items():
if not isinstance(table, dict):
continue
if str(table_name) in consumed:
continue
items.append((str(table_name), str(table_name), table))
items.sort(key=lambda x: (x[0], x[1]))
return [(display_name, table) for _, display_name, table in items]
def export_markdown(config_path: Path, output_path: Path) -> None:
    """Render the database/table/field configuration JSON as one Markdown file.

    The document contains a header, an overview table of all databases, a
    per-database section listing (folded) tables with their fields, and
    appendices for ``message_types`` and ``friend_types`` when present.
    Only the configuration file is read.

    Args:
        config_path: Path of the JSON configuration to read (UTF-8).
        output_path: Path of the Markdown file to write; missing parent
            directories are created.

    Raises:
        OSError: If the configuration cannot be read or the output cannot be
            written.
        json.JSONDecodeError: If the configuration is not valid JSON.
    """
    cfg = json.loads(config_path.read_text(encoding="utf-8"))
    meta = cfg.get("_metadata") or {}
    databases: dict[str, Any] = cfg.get("databases") or {}

    # message_{n}.db are typically shards with identical schema. Keep only the
    # last shard for detailed sections.
    message_shards: list[tuple[int, str]] = []
    for name in databases.keys():
        m = re.match(r"^message_(\d+)$", str(name))
        if not m:
            continue
        try:
            message_shards.append((int(m.group(1)), str(name)))
        except ValueError:
            continue
    message_shards.sort(key=lambda x: x[0])
    rep_message_db: str | None = message_shards[-1][1] if message_shards else None
    all_message_db_names = [n for _, n in message_shards]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    gen_time = meta.get("generated_time") or now

    lines: list[str] = []
    lines.append("# Windows 微信数据库结构文档(自动生成)")
    lines.append("")
    lines.append(f"> 生成时间:{_md_escape_cell(gen_time)}")
    lines.append(f"> 本次导出:{now}")
    lines.append(f"> 配置来源:`{config_path.as_posix()}`(由 `tools/generate_wechat_db_config.py` 生成)")
    lines.append("")
    lines.append("参考资料:")
    lines.append("- `万字长文带你了解Windows微信.md`(目录结构/部分表结构与含义)")
    lines.append("- 本项目前端页面与后端解析逻辑(字段命名与用途)")
    lines.append("")
    lines.append("注意:")
    lines.append("- 本文档尽量覆盖“库/表/字段”,字段含义部分来自启发式与公开资料,可能存在不准确之处。")
    lines.append("- 为避免泄露个人数据,类似 `Msg_<md5>` 的哈希表名会脱敏显示。")
    lines.append("- 部分 FTS 虚表可能依赖微信自定义 tokenizer(如 `MMFtsTokenizer`),普通 sqlite 环境下查询会报错;本文档字段来自建表 SQL/模板解析。")
    lines.append("")

    sorted_db_names = sorted(databases.keys(), key=_db_sort_key)

    # Overview
    lines.append("## 数据库总览")
    lines.append("")
    lines.append("| 数据库 | 描述 | 表数量 |")
    lines.append("|---|---|---:|")
    for db_name in sorted_db_names:
        db = databases.get(db_name) or {}
        if not isinstance(db, dict):
            continue
        desc = db.get("description", "")
        tables = db.get("tables") or {}
        lines.append(
            f"| `{db_name}.db` | {_md_escape_cell(desc)} | {len(tables) if isinstance(tables, dict) else 0} |"
        )
    lines.append("")

    lines.append("## 本项目(前端)功能与数据库大致对应")
    lines.append("")
    lines.append("- 联系人/群聊:`contact.db`(contact/chat_room/chatroom_member/label 等)")
    lines.append("- 会话列表/未读:`session.db`(通常为 SessionTable/ChatInfo 等)")
    lines.append("- 聊天记录:`message_*.db`(`Msg_*` 表组 + `Name2Id` 映射等)")
    lines.append("- 消息资源/媒体:`message_resource.db` / `hardlink.db` / `media_0.db` / `head_image.db`")
    lines.append("- 朋友圈:`sns.db`")
    lines.append("- 收藏:`favorite.db`")
    lines.append("- 表情包:`emoticon.db`")
    lines.append("- 搜索:`chat_search_index.db` / `message_fts.db` / `*_fts.db`(不同版本/实现可能不同)")
    lines.append("")

    # Per DB
    for db_name in sorted_db_names:
        # Skip duplicated details for message shards; only keep the last shard
        # as representative.
        if rep_message_db and re.match(r"^message_\d+$", str(db_name)) and str(db_name) != rep_message_db:
            continue

        db = databases.get(db_name) or {}
        if not isinstance(db, dict):
            continue
        desc = db.get("description", "")
        tables = db.get("tables") or {}
        if not isinstance(tables, dict):
            tables = {}

        display_table_items = _fold_same_schema_tables_for_display(tables)
        display_table_count = len(display_table_items)

        lines.append(f"## {db_name}.db")
        lines.append("")
        lines.append(f"- 描述:{_md_escape_cell(desc)}")
        if display_table_count != len(tables):
            lines.append(f"- 表数量:{len(tables)}(同结构表折叠后展示 {display_table_count})")
        else:
            lines.append(f"- 表数量:{len(tables)}")
        lines.append("")

        # Extra note for message shards
        if re.match(r"^message_\d+$", db_name):
            if rep_message_db and db_name == rep_message_db and len(all_message_db_names) > 1:
                others = [n for n in all_message_db_names if n != rep_message_db]
                # Keep it short; avoid blowing up the doc with too many names
                # if there are lots of shards.
                if len(others) <= 10:
                    other_list = ", ".join(f"`{n}.db`" for n in others)
                    lines.append(f"本节仅展示最后一个分片 `{rep_message_db}.db` 的结构;其它分片结构通常一致:{other_list}")
                else:
                    lines.append(
                        f"本节仅展示最后一个分片 `{rep_message_db}.db` 的结构;其它分片({len(others)} 个)结构通常一致。"
                    )
            lines.append("说明:")
            lines.append("- `Msg_*` 表组通常对应“每个联系人/会话一个表”,常见命名为 `Msg_{md5(wxid)}`。")
            lines.append("- 可通过对 wxid 做 md5 计算定位具体会话表;或结合 `Name2Id`/`name2id` 映射表进行解析。")
            lines.append("")
            lines.append("示例Python")
            lines.append("")
            lines.append("```python")
            lines.append("import hashlib")
            lines.append("")
            lines.append("wxid = \"wxid_xxx\"")
            lines.append("md5_hex = hashlib.md5(wxid.encode(\"utf-8\")).hexdigest()")
            lines.append("table = f\"Msg_{md5_hex}\"")
            lines.append("print(table)")
            lines.append("```")
            lines.append("")

        # Tables
        for table_name, table in display_table_items:
            if not isinstance(table, dict):
                continue
            t_type = table.get("type", "table")
            t_desc = table.get("description", "")

            # Table header (hash-suffixed names are masked).
            display_table_name = _mask_hash_table_name(table_name)
            lines.append(f"### {display_table_name}")
            lines.append("")
            if t_desc:
                lines.append(f"- 描述:{_md_escape_cell(t_desc)}")
            if t_type == "similar_group":
                pat = table.get("pattern") or display_table_name
                rep = table.get("representative_table")
                table_count = table.get("table_count")
                lines.append(f"- 类型相似表组pattern: `{_md_escape_cell(pat)}`")
                if table_count is not None:
                    lines.append(f"- 表数量:{_md_escape_cell(table_count)}")
                if rep:
                    rep_s = str(rep)
                    rep_masked = _mask_hash_table_name(rep_s)
                    rep_note = "(已脱敏)" if rep_masked != rep_s else ""
                    lines.append(f"- 代表表:`{_md_escape_cell(rep_masked)}`{rep_note}")
                members = table.get("table_names") or table.get("tables")
                if isinstance(members, list) and members:
                    member_names = [_mask_hash_table_name(str(x)) for x in members]
                    if len(member_names) <= 20:
                        show = member_names
                        suffix = ""
                    else:
                        # Long lists are abbreviated: first 10, ellipsis, last 5.
                        show = member_names[:10] + ["..."] + member_names[-5:]
                        suffix = f"(共 {len(member_names)} 个)"
                    parts = [f"`{_md_escape_cell(n)}`" if n != "..." else "..." for n in show]
                    lines.append(f"- 包含表:{', '.join(parts)}{suffix}")
            lines.append("")

            fields = table.get("fields") or {}
            if not isinstance(fields, dict) or not fields:
                lines.append("_无字段信息_\n")
                continue

            lines.append("| 字段 | 类型 | 含义 | 备注 |")
            lines.append("|---|---|---|---|")
            for field_name in sorted(fields.keys()):
                fm = fields.get(field_name) or {}
                if not isinstance(fm, dict):
                    fm = {}
                f_type = fm.get("type", "")
                meaning = fm.get("meaning", "")
                notes = fm.get("notes", "")
                lines.append(
                    f"| `{_md_escape_cell(field_name)}` | `{_md_escape_cell(f_type)}` | {_md_escape_cell(meaning)} | {_md_escape_cell(notes)} |"
                )
            lines.append("")

    # Appendices
    message_types = cfg.get("message_types") or {}
    if isinstance(message_types, dict) and message_types:
        mt = _render_message_type_map(message_types)
        if mt:
            lines.append(mt)

    friend_types = cfg.get("friend_types") or {}
    if isinstance(friend_types, dict) and friend_types:
        # friend_types in config usually uses string keys
        items: list[tuple[int, str]] = []
        for k, v in friend_types.items():
            if k in {"_instructions", "examples"}:
                continue
            try:
                items.append((int(str(k)), str(v)))
            except ValueError:
                continue
        items.sort(key=lambda x: x[0])
        if items:
            lines.append("## 联系人类型friend_type速查\n")
            # No trailing newline here: after the final "\n".join a blank line
            # between the separator row and the data rows would end the table.
            lines.append("| 值 | 含义 |\n|---:|---|")
            for code, desc in items:
                lines.append(f"| {code} | {_md_escape_cell(desc)} |")
            lines.append("")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
def main() -> int:
    """Command-line entry point: parse arguments and write the Markdown file.

    Returns:
        0 on success.

    Raises:
        FileNotFoundError: If the file given by ``--config`` does not exist.
    """
    cli = argparse.ArgumentParser(description="导出微信数据库字段配置为 Markdown 文档(单文件)")
    default_config = str(ROOT / "wechat_db_config.json")
    default_output = str(ROOT / "docs" / "wechat_database_schema.md")
    cli.add_argument(
        "--config",
        default=default_config,
        help="wechat_db_config.json 路径(由 tools/generate_wechat_db_config.py 生成)",
    )
    cli.add_argument("--output", default=default_output, help="Markdown 输出路径")
    options = cli.parse_args()

    config_file = Path(options.config)
    if not config_file.exists():
        raise FileNotFoundError(f"未找到配置文件: {config_file},请先运行 tools/generate_wechat_db_config.py")

    target = Path(options.output)
    export_markdown(config_file, target)
    print(f"[OK] 写出 Markdown: {target}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -14,6 +14,7 @@ import json
import re import re
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import sys
ROOT = Path(__file__).resolve().parents[1] ROOT = Path(__file__).resolve().parents[1]
TEMPLATE_PATH = ROOT / "wechat_db_config_template.json" TEMPLATE_PATH = ROOT / "wechat_db_config_template.json"
@@ -21,6 +22,10 @@ OUTPUT_MAIN = ROOT / "wechat_db_config.json"
OUTPUT_DIR = ROOT / "output" / "configs" OUTPUT_DIR = ROOT / "output" / "configs"
OUTPUT_COPY = OUTPUT_DIR / "wechat_db_config.generated.json" OUTPUT_COPY = OUTPUT_DIR / "wechat_db_config.generated.json"
# 允许从 tools/ 目录运行时仍能 import 根目录模块
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
# 尝试导入分析器以复用其启发式 # 尝试导入分析器以复用其启发式
AnalyzerCls = None AnalyzerCls = None
try: try:
@@ -33,19 +38,24 @@ except Exception:
def build_db_descriptions() -> dict[str, str]: def build_db_descriptions() -> dict[str, str]:
return { return {
"message": "聊天记录核心数据库", "message": "聊天记录核心数据库",
"message_3": "聊天消息分表数据库(示例或分片)", # message_{n}.db 会在 fill_config 里按正则单独处理(分片/分表)
"message_fts": "聊天消息全文索引数据库FTS", "message_fts": "聊天消息全文索引数据库FTS",
"message_resource": "消息资源索引数据库(图片/文件/视频等)", "message_resource": "消息资源索引数据库(图片/文件/视频等)",
"contact": "联系人数据库(好友/群/公众号基础信息)", "contact": "联系人数据库(好友/群/公众号基础信息)",
"session": "会话数据库(会话列表与未读统计)", "session": "会话数据库(会话列表与未读统计)",
"sns": "朋友圈数据库(动态与互动)", "sns": "朋友圈数据库(动态与互动)",
"favorite": "收藏数据库", "favorite": "收藏数据库",
"favorite_fts": "收藏全文索引数据库FTS",
"emoticon": "表情包数据库", "emoticon": "表情包数据库",
"head_image": "头像数据数据库", "head_image": "头像数据数据库",
"hardlink": "硬链接索引数据库(资源去重/快速定位)", "hardlink": "硬链接索引数据库(资源去重/快速定位)",
"media_0": "媒体数据数据库含语音SILK等", "media_0": "媒体数据数据库含语音SILK等",
"unspportmsg": "不支持消息数据库(客户端不支持的消息类型)", "unspportmsg": "不支持消息数据库(客户端不支持的消息类型)",
"general": "通用/系统数据库(新消息通知/支付等)", "general": "通用/系统数据库(新消息通知/支付等)",
"contact_fts": "联系人全文索引数据库FTS",
"chat_search_index": "本项目生成聊天记录全文检索索引库FTS5用于搜索",
"bizchat": "公众号/企业微信相关数据库(会话/联系人等)",
"digital_twin": "(本项目生成)数字分身数据库(派生数据,非微信原始库)",
} }
@@ -172,6 +182,12 @@ KNOWN_FIELD_MEANINGS = {
"c4": "FTS列c4内部结构", "c4": "FTS列c4内部结构",
"c5": "FTS列c5内部结构", "c5": "FTS列c5内部结构",
"c6": "FTS列c6内部结构", "c6": "FTS列c6内部结构",
"c7": "FTS列c7内部结构",
"c8": "FTS列c8内部结构",
"c9": "FTS列c9内部结构",
"c10": "FTS列c10内部结构",
"c11": "FTS列c11内部结构",
"c12": "FTS列c12内部结构",
"sz": "FTS文档大小信息", "sz": "FTS文档大小信息",
"_rowid_": "SQLite内部行ID", "_rowid_": "SQLite内部行ID",
@@ -199,12 +215,483 @@ KNOWN_FIELD_MEANINGS = {
"last_sender_display_name": "最后一条消息发送者显示名", "last_sender_display_name": "最后一条消息发送者显示名",
"last_msg_ext_type": "最后一条消息扩展类型", "last_msg_ext_type": "最后一条消息扩展类型",
# 常见“Key-Value”配置表多库复用
"key": "Key-Value配置表",
"valueint64": "整数值int64",
"valuedouble": "浮点值double",
"valuestdstr": "字符串值std::string",
"valueblob": "二进制值blob",
"k": "配置键k",
"v": "配置值v",
# 常见保留字段
"reserved0": "保留字段reserved0",
"reserved1": "保留字段reserved1",
"reserved2": "保留字段reserved2",
"reserved3": "保留字段reserved3",
# 版本/位标志
"version": "版本号(记录/结构版本,具体含义依表而定)",
"bit_flag": "位标志/开关bit flags",
# 本项目索引/缓存库常见字段
"render_type": "渲染类型本项目定义text/image/system/...",
"db_stem": "来源数据库分片名(如 message_0",
"table_name": "来源表名(如 Msg_xxx",
"sender_username": "发送者username解码后",
"preview": "会话预览文本(用于会话列表展示)",
"built_at": "构建时间Unix时间戳",
"tablename": "表名tableName",
"value": "value",
"brand_user_name": "品牌/公众号usernamebrand_user_name",
# 常见业务字段(命名自解释)
"ticket": "票据/验证ticketticket",
"delete_table_name": "删除记录关联的消息表名delete_table_name",
"res_path": "资源路径res_path",
"biz_username": "公众号usernamebiz_username",
"search_key": "搜索键/索引字段search_key",
"click_type": "点击/热词类型click_type",
"a_group_remark": "群备注FTS检索字段a_group_remark",
"op_code": "操作码op_code",
"query": "查询关键词query",
"score": "评分/权重score",
"keyword": "关键词keyword",
"pay_load_": "payload/扩展数据pay_load_",
"bill_no": "账单号bill_no",
"session_title": "会话标题session_title",
"unread_stat": "未读统计字段unread_stat",
"ui_type": "UI类型/发布类型ui_type",
"error_type": "错误类型error_type",
"tips_content": "提示内容tips_content",
"record_content": "记录内容record_content",
"business_type": "业务类型business_type",
"access_content_key": "访问内容keyaccess_content_key",
"access_content_type": "访问内容类型access_content_type",
"range_type": "范围类型range_type",
"message_local_type": "消息类型message_local_type",
"message_origin_source": "消息来源标识message_origin_source",
# 朋友圈sns常见拆分字段
"tid_heigh_bit": "tid 高位拆分字段heigh_bit字段名原样保留",
"tid_low_bit": "tid 低位拆分字段low_bit",
"break_flag": "断点/分页标志0/1用于分页/增量拉取水位)",
# WCDB 压缩控制 # WCDB 压缩控制
"WCDB_CT_message_content": "WCDB压缩标记message_content列", "WCDB_CT_message_content": "WCDB压缩标记message_content列",
"WCDB_CT_source": "WCDB压缩标记source列", "WCDB_CT_source": "WCDB压缩标记source列",
} }
# Per-table field-meaning overrides; compute_field_meaning consults this
# mapping before falling back to KNOWN_FIELD_MEANINGS.
# key: table_name.lower() ; value: { field_name.lower(): meaning }
# NOTE(review): the lookup is an exact .get() on the lowercased table name, so
# the "msg_*" key only applies if callers pass that literal name — confirm.
KNOWN_FIELD_MEANINGS_BY_TABLE: dict[str, dict[str, str]] = {
# contact.db
"contact": {
"id": "序号(通常与 name2id.rowid 对应)",
"username": "联系人的 wxid / 群聊 username可唯一确定联系人",
"local_type": "联系人类型1=通讯录好友/公众号/已添加群聊2=未添加到通讯录的群聊3=群中的陌生人5=企业微信好友6=群聊中的陌生企业微信好友",
"alias": "微信号(微信里显示的微信号)",
"flag": "联系人标志位需转二进制常见第7位星标第12位置顶第17位屏蔽朋友圈第24位仅聊天",
"head_img_md5": "头像md5可通过 head_image.db 查询对应头像)",
"verify_flag": "认证标志(公众号/企业等非0常表示公众号",
"description": "描述字段(样本为空;用途待确认)",
"extra_buffer": "好友扩展信息protobuf包含性别/地区/签名等,本项目解析 gender/signature/country/province/city/source_scene",
"chat_room_notify": "群消息通知相关设置样本为0/1疑似免打扰/通知开关,待确认)",
"is_in_chat_room": "群聊状态标记样本为1/2具体含义待确认",
"chat_room_type": "群聊类型/标志样本为0/2具体含义待确认",
},
"stranger": {
"id": "序号(通常与 name2id.rowid 对应)",
"username": "联系人的 wxid / 群聊 username",
"local_type": "联系人类型1=通讯录好友/公众号/已添加群聊2=未添加到通讯录的群聊3=群中的陌生人5=企业微信好友6=群聊中的陌生企业微信好友",
"alias": "微信号(微信里显示的微信号)",
"flag": "联系人标志位需转二进制常见第7位星标第12位置顶第17位屏蔽朋友圈第24位仅聊天",
"head_img_md5": "头像md5可通过 head_image.db 查询对应头像)",
"verify_flag": "认证标志(公众号/企业等非0常表示公众号",
"description": "描述字段(样本为空;用途待确认)",
"extra_buffer": "好友扩展信息protobuf包含性别/地区/签名等,本项目解析 gender/signature/country/province/city/source_scene",
"chat_room_notify": "群消息通知相关设置样本为0/1疑似免打扰/通知开关,待确认)",
"is_in_chat_room": "群聊状态标记样本为1/2具体含义待确认",
"chat_room_type": "群聊类型/标志样本为0/2具体含义待确认",
},
"biz_info": {
"id": "序号(与 name2id.rowid 对应,可唯一确定一个公众号)",
"username": "公众号username原始 wxid/gh_xxx",
"type": "公众号类型1=公众号0=订阅号(资料来源:万字长文)",
"accept_type": "接收类型accept_type含义待确认样本常为0",
"child_type": "子类型child_type含义待确认样本常为0",
"version": "版本号含义待确认样本常为0",
"external_info": "公众号详细信息(常见 JSON含底部菜单/交互配置等)",
"brand_info": "公众号品牌/菜单信息(常见 JSONurls 等)",
"brand_list": "品牌列表/关联列表(格式待确认,可能为 JSON",
"brand_flag": "品牌/能力标志位(含义待确认)",
"belong": "归属字段(含义待确认)",
"home_url": "主页链接(含义待确认)",
},
"chat_room": {
"id": "序号(与 name2id.rowid 对应)",
"username": "群聊的usernamexxx@chatroom",
"owner": "群主username",
"ext_buffer": "群成员username与群昵称protobufChatRoomData.members 等)",
},
"chat_room_info_detail": {
"room_id_": "序号(与 name2id.rowid 对应)",
"username_": "群聊的usernamexxx@chatroom",
"announcement_": "群公告(文本)",
"announcement_editor_": "群公告编辑者username",
"announcement_publish_time_": "群公告发布时间(时间戳)",
"chat_room_status_": "群状态/标志位bitmask样本常见 0x80000 等,具体位含义待确认)",
"xml_announcement_": "群公告XML可解析更多信息图片/文件等)",
"ext_buffer_": "扩展信息protobuf-like样本长度较小具体结构待确认",
},
"chatroom_member": {
"room_id": "群聊ID对应 name2id.rowid",
"member_id": "群成员ID对应 name2id.rowid",
},
"contact_label": {
"label_id_": "标签ID",
"label_name_": "标签名称",
"sort_order_": "排序",
},
# message_*.db / biz_message_*.db
"msg_*": {
"local_id": "自增id本地",
"server_id": "服务端id每条消息唯一",
"local_type": "消息类型local_type低32位=type高32位=sub_type可用 (local_type & 0xFFFFFFFF) 与 (local_type >> 32) 拆分)",
"sort_seq": "排序字段单会话内消息排序样本≈create_time*1000",
"real_sender_id": "发送者id可通过 Name2Id.rowid 映射到 username",
"create_time": "秒级时间戳",
"server_seq": "服务端接收顺序idserver_seq",
"message_content": "消息内容local_type=1 时为文本,其它类型多为 Zstandard 压缩后的XML/二进制",
"compress_content": "压缩后的内容(多见 Zstandard",
"packed_info_data": "protobuf扩展信息图片文件名/语音转文字/合并转发文件夹名等)",
},
"name2id": {
"is_session": "是否会话名标记1=会话/聊天对象0=其它映射如群成员ID",
},
# session.db
"sessiontable": {
"type": "会话类型样本为0枚举待确认",
"status": "会话状态样本为0枚举待确认",
"unread_first_pat_msg_local_id": "未读拍一拍消息的本地ID样本为0含义待确认",
"unread_first_pat_msg_sort_seq": "未读拍一拍消息的排序序号样本为0含义待确认",
},
"session_last_message": {
"username": "会话username",
"sort_seq": "最后一条消息sort_seq",
"local_id": "最后一条消息local_id",
"create_time": "最后一条消息create_time秒级时间戳",
"local_type": "最后一条消息local_type",
"sender_username": "最后一条消息发送者username",
"preview": "最后一条消息预览文本(用于会话列表)",
"db_stem": "来源消息库分片名(如 message_0",
"table_name": "来源消息表名(如 Msg_xxx",
"built_at": "构建时间Unix时间戳",
},
# chat_search_index.db (generated by this project)
"message_fts": {
"text": "可检索文本(索引内容)",
"render_type": "渲染类型text/system/image/voice/video/emoji/...,本项目定义)",
"db_stem": "来源消息库分片名(如 message_0",
"table_name": "来源消息表名(如 Msg_xxx",
"sender_username": "发送者username解码后",
},
# emoticon.db
"knonstoreemoticontable": {
"type": "表情类型样本均为3枚举含义待确认",
"caption": "表情说明/标题caption",
"product_id": "表情包/产品IDproduct_id",
"aes_key": "AES密钥用于CDN下载解密",
"auth_key": "鉴权keyCDN下载",
"extern_md5": "外部资源md5extern_md5",
},
"kstoreemoticonpackagetable": {
"package_id_": "表情包IDpackage_id",
"package_name_": "表情包名称",
"payment_status_": "支付状态payment_status",
"download_status_": "下载状态download_status",
"install_time_": "安装时间(时间戳)",
"remove_time_": "移除时间(时间戳)",
"sort_order_": "排序",
"introduction_": "简介introduction",
"full_description_": "完整描述full_description",
"copyright_": "版权信息",
"author_": "作者信息",
"store_icon_url_": "商店图标URL",
"panel_url_": "面板/详情页URL",
},
"kstoreemoticonfilestable": {
"package_id_": "表情包IDpackage_id",
"md5_": "表情md5",
"type_": "表情类型type",
"sort_order_": "排序",
"emoticon_size_": "表情文件大小(字节)",
"emoticon_offset_": "表情文件偏移(用于包内定位)",
"thumb_size_": "缩略图大小(字节)",
"thumb_offset_": "缩略图偏移(用于包内定位)",
},
# favorite.db
"fav_db_item": {
"version": "版本号(收藏条目结构/内容版本样本为87",
"fromusr": "来源用户username收藏来源",
"realchatname": "来源群聊username若收藏来源于群聊",
"upload_error_code": "上传错误码",
"trans_res_error_code": "资源转换错误码trans_res_error_code",
},
# general.db
"ilink_voip": {
"wx_chatroom_": "群聊usernamexxx@chatroom",
"millsecond_": "毫秒时间戳/时间标记(字段名推断)",
"group_id_": "ILink group_id字段名推断",
"room_id_": "房间ID字段名推断",
"room_key_": "房间key字段名推断",
"route_id_": "路由ID字段名推断",
"voice_status_": "通话状态(字段名推断)",
"talker_create_user_": "发起者username字段名推断",
"not_friend_user_list_": "非好友成员列表(字段名推断)",
"members_": "成员列表(字段名推断)",
"is_ilink_": "是否ilink通话字段名推断",
"ever_quit_chatroom_": "是否曾退出群聊(字段名推断)",
},
"fmessagetable": {
"user_name_": "用户名(好友验证/陌生人会话用户名)",
"type_": "消息类型(好友验证/系统消息样本为37",
"timestamp_": "时间戳",
"encrypt_user_name_": "加密用户名",
"content_": "内容(验证消息/系统提示等)",
"is_sender_": "是否发送方is_sender",
"ticket_": "票据/验证ticket",
"scene_": "来源场景码scene",
"fmessage_detail_buf_": "详细信息protobuf-like包含验证文案/来源等信息)",
},
"handoff_remind_v0": {
"item_id": "条目IDitem_id",
"head_icon": "图标URL/资源标识)",
"title": "标题",
"desc_type": "描述类型desc_type",
"create_time": "创建时间(时间戳)",
"start_time": "开始时间(时间戳)",
"expire_time": "过期时间(时间戳)",
"biz_type": "业务类型biz_type",
"version": "版本号version",
"url": "跳转URL",
"extra_info": "扩展信息extra_info",
},
"transfertable": {
"transfer_id": "转账IDtransfer_id",
"transcation_id": "交易IDtransaction_id原字段拼写保留",
"message_server_id": "关联消息server_id",
"second_message_server_id": "关联第二条转账消息server_id可在 message_*.db::Msg_* 表的 server_id 对应到)",
"session_name": "会话username",
"pay_sub_type": "支付子类型pay_sub_type",
"pay_receiver": "收款方username",
"pay_payer": "付款方username",
"begin_transfer_time": "转账开始时间(时间戳)",
"last_modified_time": "最后修改时间(时间戳)",
"invalid_time": "失效时间(时间戳)",
"last_update_time": "最后更新时间(时间戳)",
"delay_confirm_flag": "延迟确认标志delay_confirm_flag",
"bubble_clicked_flag": "气泡点击标志bubble_clicked_flag",
},
# bizchat.db
"chat_group": {
"brand_user_name": "品牌/公众号usernamebrand_user_name",
"bit_flag": "位标志/开关bit_flag",
"chat_name": "群组名称chat_name",
"user_list": "成员列表(常见为 ; 分隔的 user_id/username 列表;待确认)",
"reserved0": "保留字段reserved0",
"reserved1": "保留字段reserved1",
"reserved2": "保留字段reserved2",
"reserved3": "保留字段reserved3",
},
"user_info": {
"brand_user_name": "品牌/公众号usernamebrand_user_name",
"bit_flag": "位标志/开关bit_flag",
"reserved0": "保留字段reserved0",
"reserved1": "保留字段reserved1",
"reserved2": "保留字段reserved2",
"reserved3": "保留字段reserved3",
},
# sns.db
"snsmessage_tmp3": {
"from_username": "来源用户username评论/点赞发起者)",
"from_nickname": "来源用户昵称(评论/点赞发起者)",
"to_username": "目标用户username被回复/被@的人)",
"to_nickname": "目标用户昵称(被回复/被@的人)",
"comment_flag": "评论标志位样本为0具体 bit 含义待确认)",
},
"snsadtimeline": {
"ad_content": "广告内容ad_content格式待确认",
"remind_source_info": "提醒来源信息remind_source_info格式待确认",
"remind_self_info": "提醒自身信息remind_self_info格式待确认",
"extra_data": "扩展数据extra_data格式待确认",
},
# unspportmsg.db
"unsupportmessage": {
"from_user": "发送者username",
"to_user": "接收者username",
"msg_source": "消息来源附加信息msg_source",
},
# contact.db
"openim_wording": {
"wording": "文案/提示语wording",
"pinyin": "拼音pinyin",
},
# message_*.db / biz_message_*.db (WCDB)
"wcdb_builtin_compression_record": {
"tablename": "表名tableName",
"columns": "被WCDB压缩的列列表columns",
},
# general.db
"revokemessage": {
"to_user_name": "会话username撤回消息所在会话",
"message_type": "消息类型local_type",
"at_user_list": "@用户列表(字段名推断)",
},
"wcfinderlivestatus": {
"finder_username": "视频号作者usernamefinder_username",
"charge_flag": "是否付费/收费标志charge_flag",
},
"new_tips": {
"disable": "禁用标志disable",
"new_tips_content": "提示内容new_tips_content",
},
"redenvelopetable": {
"sender_user_name": "红包发送者username",
"hb_type": "红包类型hb_type",
},
"wacontact": {
"external_info": "外部信息JSON常见包含 BindWxaInfo/RegisterSource/WxaAppDynamic 等)",
"contact_pack_data": "联系人打包数据protobuf-like常含昵称/品牌名等)",
"wx_app_opt": "小程序/应用选项wx_app_opt位标志/开关样本为0",
},
# emoticon.db
"kstoreemoticoncaptionstable": {
"package_id_": "表情包IDpackage_id",
"md5_": "表情md5",
"language_": "语言language",
"caption_": "文案/标题caption",
},
}
# Table name -> description. guess_table_desc looks tables up here first,
# using table_name.lower(), so every key must be lowercase.
# The "# xxx.db" comments group entries by the database they come from.
KNOWN_TABLE_DESCRIPTIONS: dict[str, str] = {
# contact.db
"biz_info": "公众号信息表(公众号类型/菜单/品牌信息等)",
"chat_room": "群聊基础信息表(群主/成员列表等扩展在 ext_buffer",
"chat_room_info_detail": "群聊详细信息表(群公告/群状态等)",
"chatroom_member": "群聊成员映射表room_id ↔ member_id",
"contact": "联系人核心表(好友/群/公众号等基础信息)",
"contact_label": "联系人标签表标签ID与名称",
"name2id": "用户名wxid/群id@chatroom 等到内部数值ID映射表",
"encrypt_name2id": "加密用户名到内部数值ID映射表",
"stranger": "陌生人/临时会话信息表",
"ticket_info": "票据/会话票据信息表(用途待进一步确认)",
"stranger_ticket_info": "陌生人票据信息表(用途待进一步确认)",
"oplog": "操作/同步日志表(增量同步相关)",
"openim_appid": "OpenIM 应用ID表企业微信/互通相关)",
"openim_acct_type": "OpenIM 账号类型表",
"openim_wording": "OpenIM 文案/提示语表",
# session.db
"sessiontable": "会话列表表(会话展示/未读/置顶/隐藏等)",
"sessiondeletetable": "会话删除记录表",
"sessionunreadlisttable_1": "未读会话列表表(分表)",
"sessionunreadstattable_1": "未读统计表(分表)",
"sessionnocontactinfotable": "会话表(无联系人信息的会话)",
"session_last_message": "会话最后一条消息缓存/索引表(版本/实现差异)",
# message_*.db / biz_message_*.db
"timestamp": "时间戳/增量同步辅助表",
"deleteinfo": "删除消息记录表(删除/撤回相关)",
"deleteresinfo": "删除资源记录表(资源删除相关)",
"sendinfo": "发送相关信息表(发送状态/队列等)",
"historysysmsginfo": "历史系统消息表",
"historyaddmsginfo": "历史新增消息表",
# message_resource.db
"chatname2id": "会话名 → 会话ID 映射表(资源库维度)",
"sendername2id": "发送者名 → 发送者ID 映射表(资源库维度)",
"messageresourceinfo": "消息资源索引表(按消息/会话定位资源)",
"messageresourcedetail": "消息资源明细表md5/路径/大小等)",
"ftsrange": "FTS 范围信息表(搜索/索引辅助)",
"ftsdeleteinfo": "FTS 删除记录表(索引维护)",
# media_0.db
"voiceinfo": "语音数据表voice_data 等)",
# hardlink.db
"db_info": "WCDB Key-Value 元信息表FTS构建状态/版本/扫描时间等)",
"dir2id": "目录 → ID 映射表(硬链接索引)",
"image_hardlink_info_v4": "图片硬链接索引表v4",
"file_hardlink_info_v4": "文件硬链接索引表v4",
"video_hardlink_info_v4": "视频硬链接索引表v4",
"file_checkpoint_v4": "文件索引检查点(增量)",
"video_checkpoint_v4": "视频索引检查点(增量)",
"talker_checkpoint_v4": "会话索引检查点(增量)",
# *_fts.db / message_fts.db
"table_info": "WCDB Key-Value 元信息表(索引范围/水位/时间戳等)",
# head_image.db
"head_image": "头像缓存表(头像 md5/二进制缩略图等)",
# favorite.db
"buff": "WCDB Key-Value 缓冲/配置表(收藏等模块的缓存)",
"fav_db_item": "收藏条目表",
"fav_tag_db_item": "收藏标签表",
"fav_bind_tag_db_item": "收藏条目与标签绑定表",
# emoticon.db
"kcustomemoticonordertable": "自定义表情排序表md5 列表)",
"kexpressrecentuseeemoticontable": "最近使用表情记录Key-Value",
"knonstoreemoticontable": "非商店表情表(用户收藏/外部表情资源含CDN下载信息",
"kstoreemoticonpackagetable": "商店表情包信息表package 元数据)",
"kstoreemoticoncaptionstable": "商店表情文案表(多语言 caption",
# unspportmsg.db
"unsupportmessage": "不支持消息表PC端无法直接展示的消息类型",
# bizchat.db
"chat_group": "BizChat 群组表(企业微信/公众号群组信息)",
"user_info": "BizChat 用户表(企业微信/公众号用户信息)",
"my_user_info": "BizChat 当前账号映射表brand_user_name ↔ user_id",
# general.db
"forwardrecent": "最近转发会话记录表username/时间)",
"transfertable": "转账记录表转账ID/关联消息/状态等)",
"redenvelopetable": "红包记录表(关联消息/状态等)",
"ilink_voip": "iLink/群通话相关表房间ID/成员/状态等)",
"fmessagetable": "好友验证/陌生人消息表FMessage",
"handoff_remind_v0": "跨设备接力/提醒项表handoff_remind_v0",
"biz_pay_status": "公众号文章付费状态表url_id/is_paid 等)",
"biz_subscribe_status": "公众号订阅模板状态表template_id/is_subscribe",
"new_tips": "新提示/新功能提示表",
"reddot": "小红点提示表",
"reddot_record": "小红点记录表",
"wcfinderlivestatus": "视频号直播状态表",
"teenager_apply_access_agree_info": "青少年模式访问同意记录表",
# chat_search_index.db (generated by this project)
"meta": "索引元数据表schema_version/构建时间等)",
"message_fts": "全文索引表fts5用于搜索",
}
def simple_heuristic(field_name: str, table_name: str) -> str: def simple_heuristic(field_name: str, table_name: str) -> str:
"""简易兜底启发式,避免完全空白""" """简易兜底启发式,避免完全空白"""
f = field_name.lower() f = field_name.lower()
@@ -243,10 +730,17 @@ def simple_heuristic(field_name: str, table_name: str) -> str:
def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str: def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str:
# 优先精确已知映射 lt = table_name.lower()
lf = field_name.lower()
# 1) 表级覆盖优先
tmap = KNOWN_FIELD_MEANINGS_BY_TABLE.get(lt)
if tmap and lf in tmap:
return tmap[lf]
# 2) 全局精确映射
if field_name in KNOWN_FIELD_MEANINGS: if field_name in KNOWN_FIELD_MEANINGS:
return KNOWN_FIELD_MEANINGS[field_name] return KNOWN_FIELD_MEANINGS[field_name]
lf = field_name.lower()
if lf in KNOWN_FIELD_MEANINGS: if lf in KNOWN_FIELD_MEANINGS:
return KNOWN_FIELD_MEANINGS[lf] return KNOWN_FIELD_MEANINGS[lf]
@@ -266,13 +760,44 @@ def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str:
def guess_table_desc(analyzer, table_name: str) -> str: def guess_table_desc(analyzer, table_name: str) -> str:
# 简易猜测(优先命中已知表名)
tl = table_name.lower()
# 已知表名(大小写不敏感)
if tl in KNOWN_TABLE_DESCRIPTIONS:
return KNOWN_TABLE_DESCRIPTIONS[tl]
# SQLite / WCDB 内置
if tl == "sqlite_sequence":
return "SQLite 自增序列表"
if tl.startswith("wcdb"):
return "WCDB 内置表(压缩/元数据等)"
# FTS 内部表(多为 *_data/_idx/_config/_content/_docsize/_aux
if "fts" in tl:
if tl.endswith("_data"):
return "全文检索FTS内部数据表"
if tl.endswith("_idx"):
return "全文检索FTS内部索引表"
if tl.endswith("_config"):
return "全文检索FTS内部配置表"
if tl.endswith("_content"):
return "全文检索FTS内部内容表"
if tl.endswith("_docsize"):
return "全文检索FTS内部文档长度表"
if tl.endswith("_aux") or "_aux_" in tl:
return "全文检索FTS辅助表"
return "全文检索FTS表/索引表"
# 借助分析器的启发式(如果可用,且不是“未知功能表”)
if analyzer is not None: if analyzer is not None:
try: try:
return analyzer.guess_table_function(table_name) guessed = analyzer.guess_table_function(table_name)
if isinstance(guessed, str) and guessed.strip() and guessed.strip() != "未知功能表":
return guessed.strip()
except Exception: except Exception:
pass pass
# 简易猜测
tl = table_name.lower()
if tl == "msg" or tl.startswith("msg_"): if tl == "msg" or tl.startswith("msg_"):
return "某会话的消息表(聊天消息数据)" return "某会话的消息表(聊天消息数据)"
if "name2id" in tl: if "name2id" in tl:
@@ -281,10 +806,18 @@ def guess_table_desc(analyzer, table_name: str) -> str:
return "联系人/群聊信息表" return "联系人/群聊信息表"
if "session" in tl: if "session" in tl:
return "会话信息/未读统计表" return "会话信息/未读统计表"
if "fts" in tl:
return "全文检索FTS内部表"
if "resource" in tl: if "resource" in tl:
return "消息资源/附件索引表" return "消息资源/附件索引表"
if "voice" in tl:
return "语音相关数据表"
if "image" in tl or "img" in tl:
return "图片相关数据表"
if "video" in tl:
return "视频相关数据表"
if "file" in tl:
return "文件相关数据表"
if "sns" in tl:
return "朋友圈相关数据表"
return "未知功能表" return "未知功能表"
@@ -301,13 +834,38 @@ def fill_config(template: dict) -> dict:
# 数据库描述补齐 # 数据库描述补齐
db_desc_map = build_db_descriptions() db_desc_map = build_db_descriptions()
def guess_db_desc(db_name: str, desc_map: "dict[str, str] | None" = None) -> str:
    """Return a human-readable description for a database stem name.

    Resolution order (first hit wins):
      1. exact key in the description map;
      2. ``message_{n}`` shard pattern;
      3. ``biz_message_{n}`` shard pattern;
      4. any ``*_fts`` name;
      5. longest ``_``-delimited prefix present in the map, so that
         ``message_resource_1`` resolves to ``message_resource`` rather than
         to the unrelated ``message`` entry, and ``head_image_2`` resolves to
         ``head_image`` instead of falling through;
      6. for numbered shards (``foo_{n}``), the sibling ``foo_0`` entry
         (e.g. ``media_1`` reuses the ``media_0`` description);
      7. the generic "unknown" description.

    Args:
        db_name: database file stem without the ``.db`` extension.
        desc_map: optional description mapping; defaults to the enclosing
            scope's ``db_desc_map``.

    Returns:
        The description string.
    """
    known = db_desc_map if desc_map is None else desc_map

    # 1) Exact mapping takes priority.
    if db_name in known:
        return known[db_name]

    # 2) Common shard variant: message_{n}.db
    m = re.match(r"^message_(\d+)$", db_name)
    if m:
        return f"聊天记录数据库分片message_{m.group(1)}.db"

    # 3) Official-account message DBs: biz_message_{n}.db
    m = re.match(r"^biz_message_(\d+)$", db_name)
    if m:
        return f"公众号消息记录数据库biz_message_{m.group(1)}.db结构通常同 message_{m.group(1)}.db"

    # 4) FTS / index databases: *_fts.db
    if db_name.endswith("_fts"):
        return "全文索引数据库FTS"

    # 5) Longest-prefix fallback: drop one trailing "_segment" at a time.
    #    The last candidate is the first segment, which is the only prefix the
    #    previous implementation tried.
    prefix = db_name
    while "_" in prefix:
        prefix = prefix.rsplit("_", 1)[0]
        if prefix in known:
            return known[prefix]

    # 6) Numbered shard whose only documented sibling is the "_0" shard.
    shard = re.match(r"^(.+)_[0-9]+$", db_name)
    if shard:
        sibling = f"{shard.group(1)}_0"
        if sibling in known:
            return known[sibling]

    return "未知用途数据库"
databases = template.get("databases", {}) databases = template.get("databases", {})
for db_name, db in databases.items(): for db_name, db in databases.items():
if isinstance(db, dict): if isinstance(db, dict):
# 数据库级描述 # 数据库级描述
if not db.get("description"): if not db.get("description"):
# 用已知映射或尝试推断 db["description"] = guess_db_desc(db_name)
db["description"] = db_desc_map.get(db_name, db.get("description", "")) or "未知用途数据库"
# 遍历表 # 遍历表
tables = db.get("tables", {}) tables = db.get("tables", {})
@@ -378,4 +936,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()