improvement(tools): 增强配置模板与字段含义生成

- generate_config_template: 增加 CLI 参数;FTS/PRAGMA 失败时从建表 SQL 兜底解析列
- generate_wechat_db_config: 扩充库描述/字段含义词典,并支持从 tools/ 目录运行
- 新增 export_database_schema_markdown:基于 wechat_db_config.json 导出 Markdown 文档
This commit is contained in:
2977094657
2026-02-15 14:34:15 +08:00
parent 68bdcf6369
commit 35a2266b1c
3 changed files with 1239 additions and 51 deletions

View File

@@ -6,6 +6,7 @@
import sqlite3
import json
import argparse
from pathlib import Path
from typing import Dict, List, Any
from collections import defaultdict
@@ -127,6 +128,82 @@ class ConfigTemplateGenerator:
try:
cursor = conn.cursor()
def parse_columns_from_create_sql(create_sql: str) -> list[tuple[str, str]]:
    """Best-effort extraction of column definitions from a CREATE statement.

    Used as a fallback when ``PRAGMA table_info`` cannot be used, e.g. for
    FTS5 virtual tables whose tokenizer extension is not available.

    Args:
        create_sql: The statement text (as stored in ``sqlite_master.sql``).
            Empty or non-string input yields an empty result.

    Returns:
        ``(name, type)`` tuples in declaration order. ``type`` is upper-cased;
        when no type is declared (including when the word after the name is a
        constraint keyword or an FTS5 column option) it defaults to ``"TEXT"``.
    """
    # First words that introduce a table-level constraint rather than a column.
    constraint_words = ("constraint", "primary", "unique", "foreign", "check")
    # FTS5 configuration options written as ``key = value``.
    fts_option_keys = ("tokenize", "prefix", "content", "content_rowid", "compress", "uncompress")
    # Words that may directly follow a column name without being a type.
    non_type_words = {
        "UNINDEXED", "NOT", "NULL", "PRIMARY", "UNIQUE", "DEFAULT", "CHECK",
        "COLLATE", "REFERENCES", "CONSTRAINT", "GENERATED", "AS",
    }

    out: list[tuple[str, str]] = []
    if not create_sql or not isinstance(create_sql, str):
        return out

    start = create_sql.find("(")
    end = create_sql.rfind(")")
    if start == -1 or end <= start:
        return out

    # Split the definition list on commas that are not nested in parentheses,
    # so that e.g. "PRIMARY KEY(a, b)" stays in one piece.
    parts: list[str] = []
    buf: list[str] = []
    depth = 0
    for ch in create_sql[start + 1:end]:
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        if ch == "," and depth == 0:
            parts.append("".join(buf).strip())
            buf = []
        else:
            buf.append(ch)
    tail = "".join(buf).strip()
    if tail:
        parts.append(tail)

    for token in parts:
        if not token:
            continue

        # Compare the whole first word: a plain prefix test would also drop
        # real columns such as "checksum" or "unique_id".
        head = token.lower().split("(", 1)[0].split()
        if head and head[0] in constraint_words:
            continue

        # Skip FTS5 options (tokenize/prefix/content/...).
        if "=" in token:
            key = token.split("=", 1)[0].strip().lower()
            if key in fts_option_keys:
                continue

        tokens = token.split()
        if not tokens:
            continue
        name = tokens[0].strip("`\"[]")

        typ = "TEXT"
        if len(tokens) > 1 and "=" not in tokens[1]:
            # Ignore a trailing "(...)" when checking for keywords, e.g. "DEFAULT(0)".
            if tokens[1].split("(", 1)[0].upper() not in non_type_words:
                typ = tokens[1].upper()
        out.append((name, typ))
    return out
def get_table_columns(table_name: str) -> list[tuple[str, str]]:
    """Return ``(column_name, column_type)`` pairs for ``table_name``.

    ``PRAGMA table_info`` is tried first. If it raises or returns no rows, the
    CREATE statement stored in ``sqlite_master`` is parsed instead via
    ``parse_columns_from_create_sql``. Any failure in the fallback yields an
    empty list, so this function never raises.

    Uses ``cursor`` from the enclosing scope.
    """
    # Quote the identifier (doubling embedded quotes) so that names containing
    # special characters or SQL keywords do not break the PRAGMA statement.
    quoted_name = '"' + table_name.replace('"', '""') + '"'

    # Preferred source: PRAGMA table_info.
    try:
        cursor.execute(f"PRAGMA table_info({quoted_name})")
        columns = cursor.fetchall()
        if columns:
            return [(col[1], col[2]) for col in columns]
    except Exception:
        # Deliberate best effort: fall through to the CREATE-statement parser.
        pass

    # Fallback: parse the CREATE statement recorded in sqlite_master.
    try:
        cursor.execute(
            "SELECT sql FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        row = cursor.fetchone()
        create_sql = row[0] if row and len(row) > 0 else ""
        return parse_columns_from_create_sql(create_sql or "")
    except Exception:
        return []
# 获取所有表名
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
@@ -152,13 +229,10 @@ class ConfigTemplateGenerator:
table_key = f"{prefix}_*" # 使用模式名
# 获取代表表的字段信息
cursor.execute(f"PRAGMA table_info({representative_table})")
columns = cursor.fetchall()
columns = get_table_columns(representative_table)
fields = {}
for col in columns:
field_name = col[1]
field_type = col[2]
for field_name, field_type in columns:
fields[field_name] = {
"type": field_type,
"meaning": "", # 留空供用户填写
@@ -188,13 +262,10 @@ class ConfigTemplateGenerator:
try:
# 获取表字段信息
cursor.execute(f"PRAGMA table_info({table_name})")
columns = cursor.fetchall()
columns = get_table_columns(table_name)
fields = {}
for col in columns:
field_name = col[1]
field_type = col[2]
for field_name, field_type in columns:
fields[field_name] = {
"type": field_type,
"meaning": "", # 留空供用户填写
@@ -219,16 +290,23 @@ class ConfigTemplateGenerator:
finally:
conn.close()
def generate_template(self, output_file: str = "wechat_db_config_template.json"):
def generate_template(
self,
output_file: str = "wechat_db_config_template.json",
*,
include_excluded: bool = False,
include_message_shards: bool = False,
exclude_db_stems: set[str] | None = None,
):
"""生成配置模板"""
print("开始生成微信数据库配置模板...")
# 定义要排除的数据库模式和描述
excluded_patterns = {
r'biz_message_\d+\.db$': '企业微信聊天记录数据库',
r'bizchat\.db$': '企业微信联系人数据库',
r'contact_fts\.db$': '搜索联系人数据库',
r'favorite_fts\.db$': '搜索收藏数据库'
excluded_patterns = {} if include_excluded else {
r'biz_message_\d+\.db$': '公众号/企业微信聊天记录数据库(通常不参与个人聊天分析)',
r'bizchat\.db$': '企业微信联系人/会话数据库(通常不参与个人聊天分析)',
r'contact_fts\.db$': '联系人搜索索引数据库FTS',
r'favorite_fts\.db$': '收藏搜索索引数据库FTS'
}
# 查找所有数据库文件
@@ -263,29 +341,38 @@ class ConfigTemplateGenerator:
for excluded_file, description in excluded_files:
print(f" - {excluded_file.name} ({description})")
# 显式排除指定 stem不含 .db
if exclude_db_stems:
before = len(db_files)
db_files = [p for p in db_files if p.stem not in exclude_db_stems]
after = len(db_files)
if before != after:
print(f"\n按 --exclude-db-stem 排除 {before - after} 个数据库: {sorted(exclude_db_stems)}")
print(f"\n实际处理 {len(db_files)} 个数据库文件")
# 过滤message数据库只保留倒数第二个与主脚本逻辑一致
message_numbered_dbs = []
message_other_dbs = []
for db in db_files:
if re.match(r'message_\d+$', db.stem): # message_{数字}.db
message_numbered_dbs.append(db)
elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等
message_other_dbs.append(db)
if len(message_numbered_dbs) > 1:
# 按数字编号排序(提取数字进行排序)
message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1)))
# 选择倒数第二个(按编号排序)
selected_message_db = message_numbered_dbs[-2] # 倒数第二个
print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库")
print(f"选择倒数第二个: {selected_message_db.name}")
# 从db_files中移除其他message_{数字}.db数据库但保留message_fts.db等
db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)]
db_files.append(selected_message_db)
if not include_message_shards:
message_numbered_dbs = []
message_other_dbs = []
for db in db_files:
if re.match(r'message_\d+$', db.stem): # message_{数字}.db
message_numbered_dbs.append(db)
elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等
message_other_dbs.append(db)
if len(message_numbered_dbs) > 1:
# 按数字编号排序(提取数字进行排序)
message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1)))
# 选择倒数第二个(按编号排序)
selected_message_db = message_numbered_dbs[-2] # 倒数第二个
print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库")
print(f"选择倒数第二个: {selected_message_db.name}")
# 从db_files中移除其他message_{数字}.db数据库但保留message_fts.db等
db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)]
db_files.append(selected_message_db)
print(f"实际分析 {len(db_files)} 个数据库文件")
@@ -370,11 +457,24 @@ class ConfigTemplateGenerator:
def main():
    """Command-line entry point: parse options and generate the config template.

    The template is generated exactly once, using the parsed command-line
    options.
    """
    parser = argparse.ArgumentParser(description="微信数据库字段配置模板生成器")
    parser.add_argument("--databases-path", default="output/databases", help="解密后的数据库根目录(按账号分目录)")
    parser.add_argument("--output", default="wechat_db_config_template.json", help="输出 JSON 模板路径")
    parser.add_argument("--include-excluded", action="store_true", help="包含默认会被排除的数据库(如 bizchat/contact_fts/favorite_fts 等)")
    parser.add_argument("--include-message-shards", action="store_true", help="包含所有 message_{n}.db否则仅保留倒数第二个作代表")
    parser.add_argument("--exclude-db-stem", action="append", default=[], help="按 stem不含 .db排除数据库可重复例如: --exclude-db-stem digital_twin")
    args = parser.parse_args()

    print("微信数据库配置模板生成器")
    print("=" * 50)

    # A leftover default-argument run (ConfigTemplateGenerator() followed by
    # generate_template()) used to precede this call; it ignored every CLI
    # option and produced the template a second time, so it was removed.
    generator = ConfigTemplateGenerator(databases_path=args.databases_path)
    generator.generate_template(
        output_file=args.output,
        include_excluded=bool(args.include_excluded),
        include_message_shards=bool(args.include_message_shards),
        exclude_db_stems=set(args.exclude_db_stem or []),
    )


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,530 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
导出微信数据库字段配置为一份 Markdown 文档(单文件):
- 输入:wechat_db_config.json(由 tools/generate_wechat_db_config.py 生成)
- 输出:Markdown(包含数据库 → 表/表组 → 字段与含义)
说明:
- 本脚本只基于“配置文件中的结构与字段含义”生成文档,不会读取真实数据内容;
- 会对类似 Msg_<md5> 这类用户相关的哈希表名做脱敏显示。
- 会将“同结构但表名仅数字不同”的重复表自动折叠为一个表组(常见于 FTS 分片/内部表)。
用法示例:
python tools/export_database_schema_markdown.py \
--config wechat_db_config.json \
--output docs/wechat_database_schema.md
"""
from __future__ import annotations
import argparse
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any
ROOT = Path(__file__).resolve().parents[1]
_HASH_TABLE_RE = re.compile(r"^([A-Za-z0-9]+)_([0-9a-fA-F]{16,})$")
def _md_escape_cell(v: Any) -> str:
"""Escape Markdown table cell content."""
if v is None:
return "-"
s = str(v)
# Keep it one-line for tables.
s = s.replace("\r", " ").replace("\n", " ").strip()
# Escape pipe
s = s.replace("|", r"\|")
return s if s else "-"
def _mask_hash_table_name(name: str) -> str:
    """Hide a hash-like suffix in a table name.

    Names matching ``_HASH_TABLE_RE`` (``<prefix>_<16 or more hex digits>``)
    are returned as ``<prefix>_<hash>``, e.g. ``Msg_00140f...`` becomes
    ``Msg_<hash>``. Any other name is returned unchanged.
    """
    hit = _HASH_TABLE_RE.match(name)
    return f"{hit.group(1)}_<hash>" if hit else name
def _db_sort_key(db_name: str) -> tuple[int, int, str]:
"""
Roughly sort DBs by importance for readers.
"""
# Core
if db_name == "contact":
return (10, 0, db_name)
if db_name == "session":
return (20, 0, db_name)
m = re.match(r"^message_(\d+)$", db_name)
if m:
return (30, int(m.group(1)), db_name)
if re.match(r"^biz_message_(\d+)$", db_name):
n = int(re.match(r"^biz_message_(\d+)$", db_name).group(1)) # type: ignore[union-attr]
return (31, n, db_name)
if db_name == "message_resource":
return (40, 0, db_name)
if db_name == "media_0":
return (41, 0, db_name)
if db_name == "hardlink":
return (42, 0, db_name)
if db_name == "head_image":
return (43, 0, db_name)
# Social / content
if db_name == "sns":
return (50, 0, db_name)
if db_name == "favorite":
return (60, 0, db_name)
if db_name == "emoticon":
return (70, 0, db_name)
# System / misc
if db_name in {"general", "unspportmsg"}:
return (80, 0, db_name)
# Search / index
if db_name in {"chat_search_index", "message_fts"} or db_name.endswith("_fts"):
return (90, 0, db_name)
# Others
return (100, 0, db_name)
def _render_message_type_map(message_types: dict[str, Any]) -> str:
# In Windows WeChat v4, `local_type` is commonly a 64-bit integer:
# raw = (sub_type << 32) | type
# Some configs may still store explicit (type, sub_type) pairs; handle both.
items: list[tuple[int, int, int, str]] = []
for k, v in message_types.items():
if k in {"_instructions", "examples"}:
continue
if not isinstance(k, str) or "," not in k:
continue
a, b = k.split(",", 1)
try:
a_i = int(a)
b_i = int(b)
except Exception:
continue
desc = str(v)
if b_i != 0:
msg_type = a_i
msg_sub = b_i
raw = (msg_sub << 32) | (msg_type & 0xFFFFFFFF)
else:
raw = a_i
msg_type = raw & 0xFFFFFFFF
msg_sub = (raw >> 32) & 0xFFFFFFFF
items.append((raw, msg_type, msg_sub, desc))
if not items:
return ""
# Sort by decoded (type, sub_type), then raw value.
items.sort(key=lambda x: (x[1], x[2], x[0]))
out = "## 消息类型local_type速查\n\n"
out += "说明Windows 微信 v4 的 `local_type` 常见为 64 位整型:`raw = (sub_type<<32) | type`。\n\n"
out += "| local_type(raw) | type(low32) | sub_type(high32) | 含义 |\n|---:|---:|---:|---|\n"
for raw, t, st, desc in items:
out += f"| {raw} | {t} | {st} | {_md_escape_cell(desc)} |\n"
return out + "\n"
def _table_schema_signature(table: dict[str, Any]) -> tuple[str, str, tuple[tuple[str, str, str, str], ...]]:
"""
Build a stable signature for a table schema in config.
Used to fold tables which are structurally identical but only differ in name
(e.g. message_fts_v4_aux_0..3).
"""
t_type = str(table.get("type", "table"))
desc = str(table.get("description", ""))
fields = table.get("fields") or {}
items: list[tuple[str, str, str, str]] = []
if isinstance(fields, dict):
for field_name, fm in fields.items():
if not isinstance(fm, dict):
fm = {}
items.append(
(
str(field_name),
str(fm.get("type", "")),
str(fm.get("meaning", "")),
str(fm.get("notes", "")),
)
)
items.sort(key=lambda x: x[0])
return (t_type, desc, tuple(items))
def _name_family_key(name: str) -> str:
"""Normalize a table name into a family key by replacing digit runs with {n}."""
return re.sub(r"\d+", "{n}", name)
def _make_group_pattern(table_names: list[str]) -> str:
"""
Make a readable pattern for a group of similar table names:
- Only varying numeric segments become `{n}`
- Constant numeric segments are kept as-is
Example:
message_fts_v4_0/message_fts_v4_1 -> message_fts_v4_{n}
ImgFts0V0/ImgFts1V0 -> ImgFts{n}V0
"""
if not table_names:
return ""
tokenized = [re.split(r"(\d+)", n) for n in table_names]
base = tokenized[0]
# Ensure token structures match; otherwise fall back to a simple normalization.
for t in tokenized[1:]:
if len(t) != len(base):
return _name_family_key(table_names[0])
for i in range(0, len(base), 2):
if t[i] != base[i]:
return _name_family_key(table_names[0])
out_parts: list[str] = []
for i, part in enumerate(base):
if i % 2 == 0:
out_parts.append(part)
continue
nums = {t[i] for t in tokenized if i < len(t)}
out_parts.append(part if len(nums) == 1 else "{n}")
return "".join(out_parts)
def _fold_same_schema_tables_for_display(
tables: dict[str, Any],
) -> list[tuple[str, dict[str, Any]]]:
"""
Fold duplicated tables that share the same schema/signature but only differ in name.
This is common in FTS shards, e.g.:
message_fts_v4_aux_0..3
message_fts_v4_0..3 and their internal *_content/*_data/*_idx tables
ImgFts0V0..3 and their internal tables
Returns a list of (display_name, table_dict) items sorted by the original table name order.
"""
if not tables:
return []
# (family_key, schema_sig) -> [table_name, ...]
groups: dict[tuple[str, tuple[str, str, tuple[tuple[str, str, str, str], ...]]], list[str]] = {}
for table_name, table in tables.items():
if not isinstance(table, dict):
continue
if str(table.get("type", "table")) == "similar_group":
continue
family = _name_family_key(str(table_name))
sig = _table_schema_signature(table)
groups.setdefault((family, sig), []).append(str(table_name))
consumed: set[str] = set()
items: list[tuple[str, str, dict[str, Any]]] = [] # (sort_key, display_name, table)
used_display_names: set[str] = set()
# Create auto "similar_group" entries for groups > 1.
for (_, _), names in sorted(groups.items(), key=lambda x: x[0][0]):
if len(names) <= 1:
continue
names_sorted = sorted(names)
rep = names_sorted[0]
rep_table = tables.get(rep)
if not isinstance(rep_table, dict):
continue
pattern = _make_group_pattern(names_sorted)
if not pattern:
pattern = _name_family_key(rep)
display_name = pattern
if display_name in used_display_names:
# Rare: same name pattern but different schema signatures. Disambiguate.
n = 2
while f"{pattern} (var{n})" in used_display_names:
n += 1
display_name = f"{pattern} (var{n})"
group_entry = dict(rep_table)
group_entry.update(
{
"type": "similar_group",
"pattern": pattern,
"table_count": len(names_sorted),
"representative_table": rep,
"table_names": names_sorted,
}
)
items.append((rep, display_name, group_entry))
used_display_names.add(display_name)
consumed.update(names_sorted)
# Keep non-grouped tables (and existing similar_group) as-is.
for table_name, table in tables.items():
if not isinstance(table, dict):
continue
if str(table_name) in consumed:
continue
items.append((str(table_name), str(table_name), table))
items.sort(key=lambda x: (x[0], x[1]))
return [(display_name, table) for _, display_name, table in items]
def export_markdown(config_path: Path, output_path: Path) -> None:
    """Render the database/table/field configuration as one Markdown document.

    Reads the JSON config at ``config_path`` and writes UTF-8 Markdown to
    ``output_path``, creating parent directories as needed. The document has a
    header, a database overview table, one section per database with one
    sub-section per table (or folded table group), and appendices built from
    the config's ``message_types`` and ``friend_types`` maps when present.

    Only the highest-numbered ``message_{n}`` database gets a detailed
    section; the other shards appear in the overview table only.

    Args:
        config_path: Path of the JSON config to read.
        output_path: Path of the Markdown file to write.
    """
    cfg = json.loads(config_path.read_text(encoding="utf-8"))
    meta = cfg.get("_metadata") or {}
    databases: dict[str, Any] = cfg.get("databases") or {}

    # message_{n} databases are treated as shards; keep the last one as representative.
    message_shards: list[tuple[int, str]] = []
    for name in databases:
        shard = re.match(r"^message_(\d+)$", str(name))
        if shard:
            message_shards.append((int(shard.group(1)), str(name)))
    message_shards.sort(key=lambda x: x[0])
    rep_message_db: str | None = message_shards[-1][1] if message_shards else None
    all_message_db_names = [n for _, n in message_shards]

    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    gen_time = meta.get("generated_time") or now

    lines: list[str] = [
        "# Windows 微信数据库结构文档(自动生成)",
        "",
        f"> 生成时间:{_md_escape_cell(gen_time)}",
        f"> 本次导出:{now}",
        f"> 配置来源:`{config_path.as_posix()}`(由 `tools/generate_wechat_db_config.py` 生成)",
        "",
        "参考资料:",
        "- `万字长文带你了解Windows微信.md`(目录结构/部分表结构与含义)",
        "- 本项目前端页面与后端解析逻辑(字段命名与用途)",
        "",
        "注意:",
        "- 本文档尽量覆盖“库/表/字段”,字段含义部分来自启发式与公开资料,可能存在不准确之处。",
        "- 为避免泄露个人数据,类似 `Msg_<md5>` 的哈希表名会脱敏显示。",
        "- 部分 FTS 虚表可能依赖微信自定义 tokenizer(如 `MMFtsTokenizer`),普通 sqlite 环境下查询会报错;本文档字段来自建表 SQL/模板解析。",
        "",
    ]

    # Overview
    lines.extend(
        [
            "## 数据库总览",
            "",
            "| 数据库 | 描述 | 表数量 |",
            "|---|---|---:|",
        ]
    )
    for db_name in sorted(databases.keys(), key=_db_sort_key):
        db = databases.get(db_name) or {}
        if not isinstance(db, dict):
            continue
        desc = db.get("description", "")
        tables = db.get("tables") or {}
        lines.append(
            f"| `{db_name}.db` | {_md_escape_cell(desc)} | {len(tables) if isinstance(tables, dict) else 0} |"
        )
    lines.append("")

    lines.extend(
        [
            "## 本项目(前端)功能与数据库大致对应",
            "",
            "- 联系人/群聊:`contact.db`(contact/chat_room/chatroom_member/label 等)",
            "- 会话列表/未读:`session.db`(通常为 SessionTable/ChatInfo 等)",
            "- 聊天记录:`message_*.db`(`Msg_*` 表组 + `Name2Id` 映射等)",
            "- 消息资源/媒体:`message_resource.db` / `hardlink.db` / `media_0.db` / `head_image.db`",
            "- 朋友圈:`sns.db`",
            "- 收藏:`favorite.db`",
            "- 表情包:`emoticon.db`",
            "- 搜索:`chat_search_index.db` / `message_fts.db` / `*_fts.db`(不同版本/实现可能不同)",
            "",
        ]
    )

    # Per DB
    for db_name in sorted(databases.keys(), key=_db_sort_key):
        # Skip duplicated details for message shards; only the representative shard is rendered.
        if rep_message_db and re.match(r"^message_\d+$", str(db_name)) and str(db_name) != rep_message_db:
            continue

        db = databases.get(db_name) or {}
        if not isinstance(db, dict):
            continue
        desc = db.get("description", "")
        tables = db.get("tables") or {}
        if not isinstance(tables, dict):
            tables = {}
        display_table_items = _fold_same_schema_tables_for_display(tables)
        display_table_count = len(display_table_items)

        lines.append(f"## {db_name}.db")
        lines.append("")
        lines.append(f"- 描述:{_md_escape_cell(desc)}")
        if display_table_count != len(tables):
            lines.append(f"- 表数量:{len(tables)}(同结构表折叠后展示 {display_table_count})")
        else:
            lines.append(f"- 表数量:{len(tables)}")
        lines.append("")

        # Extra note for message shards
        if re.match(r"^message_\d+$", db_name):
            if rep_message_db and db_name == rep_message_db and len(all_message_db_names) > 1:
                others = [n for n in all_message_db_names if n != rep_message_db]
                # Keep it short: list the other shards by name only when there are few of them.
                if len(others) <= 10:
                    other_list = ", ".join(f"`{n}.db`" for n in others)
                    lines.append(
                        f"本节仅展示最后一个分片 `{rep_message_db}.db` 的结构;其它分片结构通常一致:{other_list}"
                    )
                else:
                    lines.append(
                        f"本节仅展示最后一个分片 `{rep_message_db}.db` 的结构;其它分片({len(others)} 个)结构通常一致。"
                    )
            lines.extend(
                [
                    "说明:",
                    "- `Msg_*` 表组通常对应“每个联系人/会话一个表”,常见命名为 `Msg_{md5(wxid)}`。",
                    "- 可通过对 wxid 做 md5 计算定位具体会话表;或结合 `Name2Id`/`name2id` 映射表进行解析。",
                    "",
                    "示例Python",
                    "",
                    "```python",
                    "import hashlib",
                    "",
                    'wxid = "wxid_xxx"',
                    'md5_hex = hashlib.md5(wxid.encode("utf-8")).hexdigest()',
                    'table = f"Msg_{md5_hex}"',
                    "print(table)",
                    "```",
                    "",
                ]
            )

        # Tables
        for table_name, table in display_table_items:
            if not isinstance(table, dict):
                continue
            t_type = table.get("type", "table")
            t_desc = table.get("description", "")

            # Table header (hash-like suffixes are masked).
            display_table_name = _mask_hash_table_name(table_name)
            lines.append(f"### {display_table_name}")
            lines.append("")
            if t_desc:
                lines.append(f"- 描述:{_md_escape_cell(t_desc)}")

            if t_type == "similar_group":
                pat = table.get("pattern") or display_table_name
                rep = table.get("representative_table")
                table_count = table.get("table_count")
                lines.append(f"- 类型相似表组pattern: `{_md_escape_cell(pat)}`")
                if table_count is not None:
                    lines.append(f"- 表数量:{_md_escape_cell(table_count)}")
                if rep:
                    rep_s = str(rep)
                    rep_masked = _mask_hash_table_name(rep_s)
                    rep_note = "(已脱敏)" if rep_masked != rep_s else ""
                    lines.append(f"- 代表表:`{_md_escape_cell(rep_masked)}`{rep_note}")

                members = table.get("table_names") or table.get("tables")
                if isinstance(members, list) and members:
                    member_names = [_mask_hash_table_name(str(x)) for x in members]
                    if len(member_names) <= 20:
                        show = member_names
                        suffix = ""
                    else:
                        # Long groups: show the first ten and the last five only.
                        show = member_names[:10] + ["..."] + member_names[-5:]
                        suffix = f"(共 {len(member_names)} 个)"
                    parts = [f"`{_md_escape_cell(n)}`" if n != "..." else "..." for n in show]
                    lines.append(f"- 包含表:{', '.join(parts)}{suffix}")
            lines.append("")

            fields = table.get("fields") or {}
            if not isinstance(fields, dict) or not fields:
                lines.append("_无字段信息_")
                lines.append("")
                continue

            lines.append("| 字段 | 类型 | 含义 | 备注 |")
            lines.append("|---|---|---|---|")
            for field_name in sorted(fields.keys()):
                fm = fields.get(field_name) or {}
                if not isinstance(fm, dict):
                    fm = {}
                f_type = fm.get("type", "")
                meaning = fm.get("meaning", "")
                notes = fm.get("notes", "")
                lines.append(
                    f"| `{_md_escape_cell(field_name)}` | `{_md_escape_cell(f_type)}` | {_md_escape_cell(meaning)} | {_md_escape_cell(notes)} |"
                )
            lines.append("")

    # Appendices
    message_types = cfg.get("message_types") or {}
    if isinstance(message_types, dict) and message_types:
        mt = _render_message_type_map(message_types)
        if mt:
            lines.append(mt)

    friend_types = cfg.get("friend_types") or {}
    if isinstance(friend_types, dict) and friend_types:
        # Keys are parsed as integers; entries that do not parse are skipped.
        items: list[tuple[int, str]] = []
        for k, v in friend_types.items():
            if k in {"_instructions", "examples"}:
                continue
            try:
                items.append((int(str(k)), str(v)))
            except Exception:
                continue
        items.sort(key=lambda x: x[0])
        if items:
            lines.append("## 联系人类型friend_type速查")
            lines.append("")
            # The header, the delimiter row and the data rows must be
            # consecutive lines: a blank line in between ends the table.
            lines.append("| 值 | 含义 |")
            lines.append("|---:|---|")
            for code, desc in items:
                lines.append(f"| {code} | {_md_escape_cell(desc)} |")
            lines.append("")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
def main() -> int:
    """Parse the command line and export the Markdown document.

    Returns:
        ``0`` after the document has been written.

    Raises:
        FileNotFoundError: If the config file given by ``--config`` does not exist.
    """
    default_config = str(ROOT / "wechat_db_config.json")
    default_output = str(ROOT / "docs" / "wechat_database_schema.md")

    parser = argparse.ArgumentParser(description="导出微信数据库字段配置为 Markdown 文档(单文件)")
    parser.add_argument(
        "--config",
        default=default_config,
        help="wechat_db_config.json 路径(由 tools/generate_wechat_db_config.py 生成)",
    )
    parser.add_argument("--output", default=default_output, help="Markdown 输出路径")
    options = parser.parse_args()

    config_file = Path(options.config)
    if not config_file.exists():
        raise FileNotFoundError(f"未找到配置文件: {config_file},请先运行 tools/generate_wechat_db_config.py")

    markdown_file = Path(options.output)
    export_markdown(config_file, markdown_file)
    print(f"[OK] 写出 Markdown: {markdown_file}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -14,6 +14,7 @@ import json
import re
from pathlib import Path
from datetime import datetime
import sys
ROOT = Path(__file__).resolve().parents[1]
TEMPLATE_PATH = ROOT / "wechat_db_config_template.json"
@@ -21,6 +22,10 @@ OUTPUT_MAIN = ROOT / "wechat_db_config.json"
OUTPUT_DIR = ROOT / "output" / "configs"
OUTPUT_COPY = OUTPUT_DIR / "wechat_db_config.generated.json"
# 允许从 tools/ 目录运行时仍能 import 根目录模块
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
# 尝试导入分析器以复用其启发式
AnalyzerCls = None
try:
@@ -33,19 +38,24 @@ except Exception:
def build_db_descriptions() -> dict[str, str]:
    """Return the description text for each known database, keyed by file stem.

    A new dict is built on every call; keys are inserted in the order listed.
    """
    # NOTE(review): the original comment says message_{n}.db shards are handled
    # separately by a regex in fill_config; the explicit "message_3" entry is
    # kept here unchanged - confirm whether it is still needed.
    described = (
        ("message", "聊天记录核心数据库"),
        ("message_3", "聊天消息分表数据库(示例或分片)"),
        ("message_fts", "聊天消息全文索引数据库FTS"),
        ("message_resource", "消息资源索引数据库(图片/文件/视频等)"),
        ("contact", "联系人数据库(好友/群/公众号基础信息)"),
        ("session", "会话数据库(会话列表与未读统计)"),
        ("sns", "朋友圈数据库(动态与互动)"),
        ("favorite", "收藏数据库"),
        ("favorite_fts", "收藏全文索引数据库FTS"),
        ("emoticon", "表情包数据库"),
        ("head_image", "头像数据数据库"),
        ("hardlink", "硬链接索引数据库(资源去重/快速定位)"),
        ("media_0", "媒体数据数据库含语音SILK等"),
        ("unspportmsg", "不支持消息数据库(客户端不支持的消息类型)"),
        ("general", "通用/系统数据库(新消息通知/支付等)"),
        ("contact_fts", "联系人全文索引数据库FTS"),
        ("chat_search_index", "本项目生成聊天记录全文检索索引库FTS5用于搜索"),
        ("bizchat", "公众号/企业微信相关数据库(会话/联系人等)"),
        ("digital_twin", "(本项目生成)数字分身数据库(派生数据,非微信原始库)"),
    )
    return dict(described)
@@ -172,6 +182,12 @@ KNOWN_FIELD_MEANINGS = {
"c4": "FTS列c4内部结构",
"c5": "FTS列c5内部结构",
"c6": "FTS列c6内部结构",
"c7": "FTS列c7内部结构",
"c8": "FTS列c8内部结构",
"c9": "FTS列c9内部结构",
"c10": "FTS列c10内部结构",
"c11": "FTS列c11内部结构",
"c12": "FTS列c12内部结构",
"sz": "FTS文档大小信息",
"_rowid_": "SQLite内部行ID",
@@ -199,12 +215,483 @@ KNOWN_FIELD_MEANINGS = {
"last_sender_display_name": "最后一条消息发送者显示名",
"last_msg_ext_type": "最后一条消息扩展类型",
# 常见“Key-Value”配置表多库复用
"key": "Key-Value配置表",
"valueint64": "整数值int64",
"valuedouble": "浮点值double",
"valuestdstr": "字符串值std::string",
"valueblob": "二进制值blob",
"k": "配置键k",
"v": "配置值v",
# 常见保留字段
"reserved0": "保留字段reserved0",
"reserved1": "保留字段reserved1",
"reserved2": "保留字段reserved2",
"reserved3": "保留字段reserved3",
# 版本/位标志
"version": "版本号(记录/结构版本,具体含义依表而定)",
"bit_flag": "位标志/开关bit flags",
# 本项目索引/缓存库常见字段
"render_type": "渲染类型本项目定义text/image/system/...",
"db_stem": "来源数据库分片名(如 message_0",
"table_name": "来源表名(如 Msg_xxx",
"sender_username": "发送者username解码后",
"preview": "会话预览文本(用于会话列表展示)",
"built_at": "构建时间Unix时间戳",
"tablename": "表名tableName",
"value": "value",
"brand_user_name": "品牌/公众号usernamebrand_user_name",
# 常见业务字段(命名自解释)
"ticket": "票据/验证ticketticket",
"delete_table_name": "删除记录关联的消息表名delete_table_name",
"res_path": "资源路径res_path",
"biz_username": "公众号usernamebiz_username",
"search_key": "搜索键/索引字段search_key",
"click_type": "点击/热词类型click_type",
"a_group_remark": "群备注FTS检索字段a_group_remark",
"op_code": "操作码op_code",
"query": "查询关键词query",
"score": "评分/权重score",
"keyword": "关键词keyword",
"pay_load_": "payload/扩展数据pay_load_",
"bill_no": "账单号bill_no",
"session_title": "会话标题session_title",
"unread_stat": "未读统计字段unread_stat",
"ui_type": "UI类型/发布类型ui_type",
"error_type": "错误类型error_type",
"tips_content": "提示内容tips_content",
"record_content": "记录内容record_content",
"business_type": "业务类型business_type",
"access_content_key": "访问内容keyaccess_content_key",
"access_content_type": "访问内容类型access_content_type",
"range_type": "范围类型range_type",
"message_local_type": "消息类型message_local_type",
"message_origin_source": "消息来源标识message_origin_source",
# 朋友圈sns常见拆分字段
"tid_heigh_bit": "tid 高位拆分字段heigh_bit字段名原样保留",
"tid_low_bit": "tid 低位拆分字段low_bit",
"break_flag": "断点/分页标志0/1用于分页/增量拉取水位)",
# WCDB 压缩控制
"WCDB_CT_message_content": "WCDB压缩标记message_content列",
"WCDB_CT_source": "WCDB压缩标记source列",
}
# 表级字段含义覆盖(优先级高于 KNOWN_FIELD_MEANINGS
# key: table_name.lower() ; value: { field_name.lower(): meaning }
KNOWN_FIELD_MEANINGS_BY_TABLE: dict[str, dict[str, str]] = {
# contact.db
"contact": {
"id": "序号(通常与 name2id.rowid 对应)",
"username": "联系人的 wxid / 群聊 username可唯一确定联系人",
"local_type": "联系人类型1=通讯录好友/公众号/已添加群聊2=未添加到通讯录的群聊3=群中的陌生人5=企业微信好友6=群聊中的陌生企业微信好友",
"alias": "微信号(微信里显示的微信号)",
"flag": "联系人标志位需转二进制常见第7位星标第12位置顶第17位屏蔽朋友圈第24位仅聊天",
"head_img_md5": "头像md5可通过 head_image.db 查询对应头像)",
"verify_flag": "认证标志(公众号/企业等非0常表示公众号",
"description": "描述字段(样本为空;用途待确认)",
"extra_buffer": "好友扩展信息protobuf包含性别/地区/签名等,本项目解析 gender/signature/country/province/city/source_scene",
"chat_room_notify": "群消息通知相关设置样本为0/1疑似免打扰/通知开关,待确认)",
"is_in_chat_room": "群聊状态标记样本为1/2具体含义待确认",
"chat_room_type": "群聊类型/标志样本为0/2具体含义待确认",
},
"stranger": {
"id": "序号(通常与 name2id.rowid 对应)",
"username": "联系人的 wxid / 群聊 username",
"local_type": "联系人类型1=通讯录好友/公众号/已添加群聊2=未添加到通讯录的群聊3=群中的陌生人5=企业微信好友6=群聊中的陌生企业微信好友",
"alias": "微信号(微信里显示的微信号)",
"flag": "联系人标志位需转二进制常见第7位星标第12位置顶第17位屏蔽朋友圈第24位仅聊天",
"head_img_md5": "头像md5可通过 head_image.db 查询对应头像)",
"verify_flag": "认证标志(公众号/企业等非0常表示公众号",
"description": "描述字段(样本为空;用途待确认)",
"extra_buffer": "好友扩展信息protobuf包含性别/地区/签名等,本项目解析 gender/signature/country/province/city/source_scene",
"chat_room_notify": "群消息通知相关设置样本为0/1疑似免打扰/通知开关,待确认)",
"is_in_chat_room": "群聊状态标记样本为1/2具体含义待确认",
"chat_room_type": "群聊类型/标志样本为0/2具体含义待确认",
},
"biz_info": {
"id": "序号(与 name2id.rowid 对应,可唯一确定一个公众号)",
"username": "公众号username原始 wxid/gh_xxx",
"type": "公众号类型1=公众号0=订阅号(资料来源:万字长文)",
"accept_type": "接收类型accept_type含义待确认样本常为0",
"child_type": "子类型child_type含义待确认样本常为0",
"version": "版本号含义待确认样本常为0",
"external_info": "公众号详细信息(常见 JSON含底部菜单/交互配置等)",
"brand_info": "公众号品牌/菜单信息(常见 JSONurls 等)",
"brand_list": "品牌列表/关联列表(格式待确认,可能为 JSON",
"brand_flag": "品牌/能力标志位(含义待确认)",
"belong": "归属字段(含义待确认)",
"home_url": "主页链接(含义待确认)",
},
"chat_room": {
"id": "序号(与 name2id.rowid 对应)",
"username": "群聊的usernamexxx@chatroom",
"owner": "群主username",
"ext_buffer": "群成员username与群昵称protobufChatRoomData.members 等)",
},
"chat_room_info_detail": {
"room_id_": "序号(与 name2id.rowid 对应)",
"username_": "群聊的usernamexxx@chatroom",
"announcement_": "群公告(文本)",
"announcement_editor_": "群公告编辑者username",
"announcement_publish_time_": "群公告发布时间(时间戳)",
"chat_room_status_": "群状态/标志位bitmask样本常见 0x80000 等,具体位含义待确认)",
"xml_announcement_": "群公告XML可解析更多信息图片/文件等)",
"ext_buffer_": "扩展信息protobuf-like样本长度较小具体结构待确认",
},
"chatroom_member": {
"room_id": "群聊ID对应 name2id.rowid",
"member_id": "群成员ID对应 name2id.rowid",
},
"contact_label": {
"label_id_": "标签ID",
"label_name_": "标签名称",
"sort_order_": "排序",
},
# message_*.db / biz_message_*.db
"msg_*": {
"local_id": "自增id本地",
"server_id": "服务端id每条消息唯一",
"local_type": "消息类型local_type低32位=type高32位=sub_type可用 (local_type & 0xFFFFFFFF) 与 (local_type >> 32) 拆分)",
"sort_seq": "排序字段单会话内消息排序样本≈create_time*1000",
"real_sender_id": "发送者id可通过 Name2Id.rowid 映射到 username",
"create_time": "秒级时间戳",
"server_seq": "服务端接收顺序idserver_seq",
"message_content": "消息内容local_type=1 时为文本,其它类型多为 Zstandard 压缩后的XML/二进制",
"compress_content": "压缩后的内容(多见 Zstandard",
"packed_info_data": "protobuf扩展信息图片文件名/语音转文字/合并转发文件夹名等)",
},
"name2id": {
"is_session": "是否会话名标记1=会话/聊天对象0=其它映射如群成员ID",
},
# session.db
"sessiontable": {
"type": "会话类型样本为0枚举待确认",
"status": "会话状态样本为0枚举待确认",
"unread_first_pat_msg_local_id": "未读拍一拍消息的本地ID样本为0含义待确认",
"unread_first_pat_msg_sort_seq": "未读拍一拍消息的排序序号样本为0含义待确认",
},
"session_last_message": {
"username": "会话username",
"sort_seq": "最后一条消息sort_seq",
"local_id": "最后一条消息local_id",
"create_time": "最后一条消息create_time秒级时间戳",
"local_type": "最后一条消息local_type",
"sender_username": "最后一条消息发送者username",
"preview": "最后一条消息预览文本(用于会话列表)",
"db_stem": "来源消息库分片名(如 message_0",
"table_name": "来源消息表名(如 Msg_xxx",
"built_at": "构建时间Unix时间戳",
},
# 本项目 chat_search_index.db
"message_fts": {
"text": "可检索文本(索引内容)",
"render_type": "渲染类型text/system/image/voice/video/emoji/...,本项目定义)",
"db_stem": "来源消息库分片名(如 message_0",
"table_name": "来源消息表名(如 Msg_xxx",
"sender_username": "发送者username解码后",
},
# emoticon.db
"knonstoreemoticontable": {
"type": "表情类型样本均为3枚举含义待确认",
"caption": "表情说明/标题caption",
"product_id": "表情包/产品IDproduct_id",
"aes_key": "AES密钥用于CDN下载解密",
"auth_key": "鉴权keyCDN下载",
"extern_md5": "外部资源md5extern_md5",
},
"kstoreemoticonpackagetable": {
"package_id_": "表情包IDpackage_id",
"package_name_": "表情包名称",
"payment_status_": "支付状态payment_status",
"download_status_": "下载状态download_status",
"install_time_": "安装时间(时间戳)",
"remove_time_": "移除时间(时间戳)",
"sort_order_": "排序",
"introduction_": "简介introduction",
"full_description_": "完整描述full_description",
"copyright_": "版权信息",
"author_": "作者信息",
"store_icon_url_": "商店图标URL",
"panel_url_": "面板/详情页URL",
},
"kstoreemoticonfilestable": {
"package_id_": "表情包IDpackage_id",
"md5_": "表情md5",
"type_": "表情类型type",
"sort_order_": "排序",
"emoticon_size_": "表情文件大小(字节)",
"emoticon_offset_": "表情文件偏移(用于包内定位)",
"thumb_size_": "缩略图大小(字节)",
"thumb_offset_": "缩略图偏移(用于包内定位)",
},
# favorite.db
"fav_db_item": {
"version": "版本号(收藏条目结构/内容版本样本为87",
"fromusr": "来源用户username收藏来源",
"realchatname": "来源群聊username若收藏来源于群聊",
"upload_error_code": "上传错误码",
"trans_res_error_code": "资源转换错误码trans_res_error_code",
},
# general.db
"ilink_voip": {
"wx_chatroom_": "群聊usernamexxx@chatroom",
"millsecond_": "毫秒时间戳/时间标记(字段名推断)",
"group_id_": "ILink group_id字段名推断",
"room_id_": "房间ID字段名推断",
"room_key_": "房间key字段名推断",
"route_id_": "路由ID字段名推断",
"voice_status_": "通话状态(字段名推断)",
"talker_create_user_": "发起者username字段名推断",
"not_friend_user_list_": "非好友成员列表(字段名推断)",
"members_": "成员列表(字段名推断)",
"is_ilink_": "是否ilink通话字段名推断",
"ever_quit_chatroom_": "是否曾退出群聊(字段名推断)",
},
"fmessagetable": {
"user_name_": "用户名(好友验证/陌生人会话用户名)",
"type_": "消息类型(好友验证/系统消息样本为37",
"timestamp_": "时间戳",
"encrypt_user_name_": "加密用户名",
"content_": "内容(验证消息/系统提示等)",
"is_sender_": "是否发送方is_sender",
"ticket_": "票据/验证ticket",
"scene_": "来源场景码scene",
"fmessage_detail_buf_": "详细信息protobuf-like包含验证文案/来源等信息)",
},
"handoff_remind_v0": {
"item_id": "条目IDitem_id",
"head_icon": "图标URL/资源标识)",
"title": "标题",
"desc_type": "描述类型desc_type",
"create_time": "创建时间(时间戳)",
"start_time": "开始时间(时间戳)",
"expire_time": "过期时间(时间戳)",
"biz_type": "业务类型biz_type",
"version": "版本号version",
"url": "跳转URL",
"extra_info": "扩展信息extra_info",
},
"transfertable": {
"transfer_id": "转账IDtransfer_id",
"transcation_id": "交易IDtransaction_id原字段拼写保留",
"message_server_id": "关联消息server_id",
"second_message_server_id": "关联第二条转账消息server_id可在 message_*.db::Msg_* 表的 server_id 对应到)",
"session_name": "会话username",
"pay_sub_type": "支付子类型pay_sub_type",
"pay_receiver": "收款方username",
"pay_payer": "付款方username",
"begin_transfer_time": "转账开始时间(时间戳)",
"last_modified_time": "最后修改时间(时间戳)",
"invalid_time": "失效时间(时间戳)",
"last_update_time": "最后更新时间(时间戳)",
"delay_confirm_flag": "延迟确认标志delay_confirm_flag",
"bubble_clicked_flag": "气泡点击标志bubble_clicked_flag",
},
# bizchat.db
"chat_group": {
"brand_user_name": "品牌/公众号usernamebrand_user_name",
"bit_flag": "位标志/开关bit_flag",
"chat_name": "群组名称chat_name",
"user_list": "成员列表(常见为 ; 分隔的 user_id/username 列表;待确认)",
"reserved0": "保留字段reserved0",
"reserved1": "保留字段reserved1",
"reserved2": "保留字段reserved2",
"reserved3": "保留字段reserved3",
},
"user_info": {
"brand_user_name": "品牌/公众号usernamebrand_user_name",
"bit_flag": "位标志/开关bit_flag",
"reserved0": "保留字段reserved0",
"reserved1": "保留字段reserved1",
"reserved2": "保留字段reserved2",
"reserved3": "保留字段reserved3",
},
# sns.db
"snsmessage_tmp3": {
"from_username": "来源用户username评论/点赞发起者)",
"from_nickname": "来源用户昵称(评论/点赞发起者)",
"to_username": "目标用户username被回复/被@的人)",
"to_nickname": "目标用户昵称(被回复/被@的人)",
"comment_flag": "评论标志位样本为0具体 bit 含义待确认)",
},
"snsadtimeline": {
"ad_content": "广告内容ad_content格式待确认",
"remind_source_info": "提醒来源信息remind_source_info格式待确认",
"remind_self_info": "提醒自身信息remind_self_info格式待确认",
"extra_data": "扩展数据extra_data格式待确认",
},
# unspportmsg.db
"unsupportmessage": {
"from_user": "发送者username",
"to_user": "接收者username",
"msg_source": "消息来源附加信息msg_source",
},
# contact.db
"openim_wording": {
"wording": "文案/提示语wording",
"pinyin": "拼音pinyin",
},
# message_*.db / biz_message_*.db (WCDB)
"wcdb_builtin_compression_record": {
"tablename": "表名tableName",
"columns": "被WCDB压缩的列列表columns",
},
# general.db
"revokemessage": {
"to_user_name": "会话username撤回消息所在会话",
"message_type": "消息类型local_type",
"at_user_list": "@用户列表(字段名推断)",
},
"wcfinderlivestatus": {
"finder_username": "视频号作者usernamefinder_username",
"charge_flag": "是否付费/收费标志charge_flag",
},
"new_tips": {
"disable": "禁用标志disable",
"new_tips_content": "提示内容new_tips_content",
},
"redenvelopetable": {
"sender_user_name": "红包发送者username",
"hb_type": "红包类型hb_type",
},
"wacontact": {
"external_info": "外部信息JSON常见包含 BindWxaInfo/RegisterSource/WxaAppDynamic 等)",
"contact_pack_data": "联系人打包数据protobuf-like常含昵称/品牌名等)",
"wx_app_opt": "小程序/应用选项wx_app_opt位标志/开关样本为0",
},
# emoticon.db
"kstoreemoticoncaptionstable": {
"package_id_": "表情包IDpackage_id",
"md5_": "表情md5",
"language_": "语言language",
"caption_": "文案/标题caption",
},
}
# Lookup of known table names -> human-readable table descriptions.
# Keys are lowercase table names: guess_table_desc() lower-cases the incoming
# table name before looking it up here, so the match is case-insensitive.
# Values are the description strings returned verbatim for a matching table.
# Entries are grouped by source database; each "# <name>.db" comment below
# names the database the following tables belong to.
KNOWN_TABLE_DESCRIPTIONS: dict[str, str] = {
# contact.db
"biz_info": "公众号信息表(公众号类型/菜单/品牌信息等)",
"chat_room": "群聊基础信息表(群主/成员列表等扩展在 ext_buffer",
"chat_room_info_detail": "群聊详细信息表(群公告/群状态等)",
"chatroom_member": "群聊成员映射表room_id ↔ member_id",
"contact": "联系人核心表(好友/群/公众号等基础信息)",
"contact_label": "联系人标签表标签ID与名称",
"name2id": "用户名wxid/群id@chatroom 等到内部数值ID映射表",
"encrypt_name2id": "加密用户名到内部数值ID映射表",
"stranger": "陌生人/临时会话信息表",
"ticket_info": "票据/会话票据信息表(用途待进一步确认)",
"stranger_ticket_info": "陌生人票据信息表(用途待进一步确认)",
"oplog": "操作/同步日志表(增量同步相关)",
"openim_appid": "OpenIM 应用ID表企业微信/互通相关)",
"openim_acct_type": "OpenIM 账号类型表",
"openim_wording": "OpenIM 文案/提示语表",
# session.db
"sessiontable": "会话列表表(会话展示/未读/置顶/隐藏等)",
"sessiondeletetable": "会话删除记录表",
"sessionunreadlisttable_1": "未读会话列表表(分表)",
"sessionunreadstattable_1": "未读统计表(分表)",
"sessionnocontactinfotable": "会话表(无联系人信息的会话)",
"session_last_message": "会话最后一条消息缓存/索引表(版本/实现差异)",
# message_*.db / biz_message_*.db
"timestamp": "时间戳/增量同步辅助表",
"deleteinfo": "删除消息记录表(删除/撤回相关)",
"deleteresinfo": "删除资源记录表(资源删除相关)",
"sendinfo": "发送相关信息表(发送状态/队列等)",
"historysysmsginfo": "历史系统消息表",
"historyaddmsginfo": "历史新增消息表",
# message_resource.db
"chatname2id": "会话名 → 会话ID 映射表(资源库维度)",
"sendername2id": "发送者名 → 发送者ID 映射表(资源库维度)",
"messageresourceinfo": "消息资源索引表(按消息/会话定位资源)",
"messageresourcedetail": "消息资源明细表md5/路径/大小等)",
"ftsrange": "FTS 范围信息表(搜索/索引辅助)",
"ftsdeleteinfo": "FTS 删除记录表(索引维护)",
# media_0.db
"voiceinfo": "语音数据表voice_data 等)",
# hardlink.db
"db_info": "WCDB Key-Value 元信息表FTS构建状态/版本/扫描时间等)",
"dir2id": "目录 → ID 映射表(硬链接索引)",
"image_hardlink_info_v4": "图片硬链接索引表v4",
"file_hardlink_info_v4": "文件硬链接索引表v4",
"video_hardlink_info_v4": "视频硬链接索引表v4",
"file_checkpoint_v4": "文件索引检查点(增量)",
"video_checkpoint_v4": "视频索引检查点(增量)",
"talker_checkpoint_v4": "会话索引检查点(增量)",
# *_fts.db / message_fts.db
"table_info": "WCDB Key-Value 元信息表(索引范围/水位/时间戳等)",
# head_image.db
"head_image": "头像缓存表(头像 md5/二进制缩略图等)",
# favorite.db
"buff": "WCDB Key-Value 缓冲/配置表(收藏等模块的缓存)",
"fav_db_item": "收藏条目表",
"fav_tag_db_item": "收藏标签表",
"fav_bind_tag_db_item": "收藏条目与标签绑定表",
# emoticon.db
"kcustomemoticonordertable": "自定义表情排序表md5 列表)",
"kexpressrecentuseeemoticontable": "最近使用表情记录Key-Value",
"knonstoreemoticontable": "非商店表情表(用户收藏/外部表情资源含CDN下载信息",
"kstoreemoticonpackagetable": "商店表情包信息表package 元数据)",
"kstoreemoticoncaptionstable": "商店表情文案表(多语言 caption",
# unspportmsg.db
"unsupportmessage": "不支持消息表PC端无法直接展示的消息类型",
# bizchat.db
"chat_group": "BizChat 群组表(企业微信/公众号群组信息)",
"user_info": "BizChat 用户表(企业微信/公众号用户信息)",
"my_user_info": "BizChat 当前账号映射表brand_user_name ↔ user_id",
# general.db
"forwardrecent": "最近转发会话记录表username/时间)",
"transfertable": "转账记录表转账ID/关联消息/状态等)",
"redenvelopetable": "红包记录表(关联消息/状态等)",
"ilink_voip": "iLink/群通话相关表房间ID/成员/状态等)",
"fmessagetable": "好友验证/陌生人消息表FMessage",
"handoff_remind_v0": "跨设备接力/提醒项表handoff_remind_v0",
"biz_pay_status": "公众号文章付费状态表url_id/is_paid 等)",
"biz_subscribe_status": "公众号订阅模板状态表template_id/is_subscribe",
"new_tips": "新提示/新功能提示表",
"reddot": "小红点提示表",
"reddot_record": "小红点记录表",
"wcfinderlivestatus": "视频号直播状态表",
"teenager_apply_access_agree_info": "青少年模式访问同意记录表",
# chat_search_index.db (generated by this project)
"meta": "索引元数据表schema_version/构建时间等)",
"message_fts": "全文索引表fts5用于搜索",
}
def simple_heuristic(field_name: str, table_name: str) -> str:
"""简易兜底启发式,避免完全空白"""
f = field_name.lower()
@@ -243,10 +730,17 @@ def simple_heuristic(field_name: str, table_name: str) -> str:
def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str:
# 优先精确已知映射
lt = table_name.lower()
lf = field_name.lower()
# 1) 表级覆盖优先
tmap = KNOWN_FIELD_MEANINGS_BY_TABLE.get(lt)
if tmap and lf in tmap:
return tmap[lf]
# 2) 全局精确映射
if field_name in KNOWN_FIELD_MEANINGS:
return KNOWN_FIELD_MEANINGS[field_name]
lf = field_name.lower()
if lf in KNOWN_FIELD_MEANINGS:
return KNOWN_FIELD_MEANINGS[lf]
@@ -266,13 +760,44 @@ def compute_field_meaning(analyzer, table_name: str, field_name: str) -> str:
def guess_table_desc(analyzer, table_name: str) -> str:
# 简易猜测(优先命中已知表名)
tl = table_name.lower()
# 已知表名(大小写不敏感)
if tl in KNOWN_TABLE_DESCRIPTIONS:
return KNOWN_TABLE_DESCRIPTIONS[tl]
# SQLite / WCDB 内置
if tl == "sqlite_sequence":
return "SQLite 自增序列表"
if tl.startswith("wcdb"):
return "WCDB 内置表(压缩/元数据等)"
# FTS 内部表(多为 *_data/_idx/_config/_content/_docsize/_aux
if "fts" in tl:
if tl.endswith("_data"):
return "全文检索FTS内部数据表"
if tl.endswith("_idx"):
return "全文检索FTS内部索引表"
if tl.endswith("_config"):
return "全文检索FTS内部配置表"
if tl.endswith("_content"):
return "全文检索FTS内部内容表"
if tl.endswith("_docsize"):
return "全文检索FTS内部文档长度表"
if tl.endswith("_aux") or "_aux_" in tl:
return "全文检索FTS辅助表"
return "全文检索FTS表/索引表"
# 借助分析器的启发式(如果可用,且不是“未知功能表”)
if analyzer is not None:
try:
return analyzer.guess_table_function(table_name)
guessed = analyzer.guess_table_function(table_name)
if isinstance(guessed, str) and guessed.strip() and guessed.strip() != "未知功能表":
return guessed.strip()
except Exception:
pass
# 简易猜测
tl = table_name.lower()
if tl == "msg" or tl.startswith("msg_"):
return "某会话的消息表(聊天消息数据)"
if "name2id" in tl:
@@ -281,10 +806,18 @@ def guess_table_desc(analyzer, table_name: str) -> str:
return "联系人/群聊信息表"
if "session" in tl:
return "会话信息/未读统计表"
if "fts" in tl:
return "全文检索FTS内部表"
if "resource" in tl:
return "消息资源/附件索引表"
if "voice" in tl:
return "语音相关数据表"
if "image" in tl or "img" in tl:
return "图片相关数据表"
if "video" in tl:
return "视频相关数据表"
if "file" in tl:
return "文件相关数据表"
if "sns" in tl:
return "朋友圈相关数据表"
return "未知功能表"
@@ -301,13 +834,38 @@ def fill_config(template: dict) -> dict:
# 数据库描述补齐
db_desc_map = build_db_descriptions()
def guess_db_desc(db_name: str) -> str:
    """Return a non-empty description for the database called *db_name*.

    Resolution order (first hit wins):

    1. Exact entry in ``db_desc_map``.
    2. Message shard ``message_{n}``.
    3. Biz message shard ``biz_message_{n}``.
    4. Any name ending in ``_fts``.
    5. The part of the name before the first ``_`` looked up in
       ``db_desc_map``.
    6. A generic "unknown database" text.

    An empty mapped description is treated as missing, so the caller never
    receives a blank string. The inline code this helper replaces gave the
    same guarantee through its ``or`` fallback.

    Args:
        db_name: Database name. NOTE(review): presumably the file stem
            without the ``.db`` suffix, since the generated texts append
            ``.db`` themselves - confirm against the caller.

    Returns:
        The description string for the database.
    """
    # 1) Exact mapping first; skip it when the stored description is empty.
    desc = db_desc_map.get(db_name)
    if desc:
        return desc

    # 2) Common shard/variant: message_{n}.db
    m = re.match(r"^message_(\d+)$", db_name)
    if m:
        return f"聊天记录数据库分片message_{m.group(1)}.db"

    # 3) Biz message shard: biz_message_{n}.db
    #    (the emitted text states it usually shares message_{n}.db's layout).
    m = re.match(r"^biz_message_(\d+)$", db_name)
    if m:
        return f"公众号消息记录数据库biz_message_{m.group(1)}.db结构通常同 message_{m.group(1)}.db"

    # 4) FTS/index databases: *_fts.db
    if db_name.endswith("_fts"):
        return "全文索引数据库FTS"

    # 5) Fall back to the prefix before the first underscore; an empty
    #    description here is ignored as well.
    base = db_name.split("_", 1)[0]
    desc = db_desc_map.get(base)
    if desc:
        return desc

    return "未知用途数据库"
databases = template.get("databases", {})
for db_name, db in databases.items():
if isinstance(db, dict):
# 数据库级描述
if not db.get("description"):
# 用已知映射或尝试推断
db["description"] = db_desc_map.get(db_name, db.get("description", "")) or "未知用途数据库"
db["description"] = guess_db_desc(db_name)
# 遍历表
tables = db.get("tables", {})
@@ -378,4 +936,4 @@ def main():
if __name__ == "__main__":
main()
main()