WeChatDataAnalysis/generate_config_template.py
2977094657 35a2266b1c improvement(tools): enhance config template and field-meaning generation
- generate_config_template: add CLI arguments; fall back to parsing columns from the CREATE SQL when FTS/PRAGMA lookups fail
- generate_wechat_db_config: expand the database-description / field-meaning dictionaries and support running from the tools/ directory
- add export_database_schema_markdown: export Markdown documentation based on wechat_db_config.json
2026-02-15 14:34:15 +08:00

#!/usr/bin/env python3
"""
Generate the WeChat database field configuration template.

Builds a JSON template from the actual database structure so that the field
meanings can be filled in manually.
"""
import sqlite3
import json
import argparse
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any
from collections import defaultdict
import re

class ConfigTemplateGenerator:
    """Configuration template generator."""

    def __init__(self, databases_path: str = "output/databases"):
        """Initialize the generator.

        Args:
            databases_path: Path to the decrypted database files.
        """
        self.databases_path = Path(databases_path)
        self.template_structure = {}

    def connect_database(self, db_path: Path) -> sqlite3.Connection | None:
        """Connect to a SQLite database; returns None on failure."""
        try:
            conn = sqlite3.connect(str(db_path))
            return conn
        except Exception as e:
            print(f"Failed to connect to database {db_path}: {e}")
            return None

    def detect_similar_table_patterns(self, table_names: List[str]) -> Dict[str, List[str]]:
        """Detect similar table-name patterns (same logic as the main script)."""
        patterns = defaultdict(list)
        for table_name in table_names:
            # Detect the "prefix_suffix" pattern where the suffix looks like a hash string.
            if '_' in table_name:
                parts = table_name.split('_', 1)  # Split on the first underscore only
                if len(parts) == 2:
                    prefix, suffix = parts
                    # Check whether the suffix looks like a hash (hex string of length >= 16).
                    if len(suffix) >= 16 and all(c in '0123456789abcdefABCDEF' for c in suffix):
                        patterns[prefix].append(table_name)
        # Only return patterns that contain more than one table.
        return {prefix: tables for prefix, tables in patterns.items() if len(tables) > 1}
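
    # A minimal illustration of the grouping (hypothetical table names, not
    # taken from a real database): per-chat tables that share a prefix and a
    # >= 16-character hex suffix collapse into one group; other names are ignored.
    #
    #   names = ["Msg_0123456789abcdef0123456789abcdef",
    #            "Msg_fedcba9876543210fedcba9876543210",
    #            "Name2Id"]
    #   ConfigTemplateGenerator().detect_similar_table_patterns(names)
    #   # -> {"Msg": ["Msg_0123456789abcdef0123456789abcdef",
    #   #             "Msg_fedcba9876543210fedcba9876543210"]}
    #
    # "Name2Id" is left out because it has no hash-like suffix.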

    def compare_table_structures(self, conn: sqlite3.Connection, table_names: List[str]) -> Dict[str, Any]:
        """Compare whether several tables share the same structure (same logic as the main script)."""
        if not table_names:
            return {'are_identical': False, 'representative_table': None}
        try:
            cursor = conn.cursor()
            structures = {}
            # Fetch the structure of every table.
            for table_name in table_names:
                try:
                    cursor.execute(f"PRAGMA table_info({table_name})")
                    columns = cursor.fetchall()
                    # Normalize the column information for comparison.
                    structure = []
                    for col in columns:
                        structure.append({
                            'name': col[1],
                            'type': col[2].upper(),  # Normalize case
                            'notnull': col[3],
                            'pk': col[5]
                        })
                    structures[table_name] = structure
                except Exception as e:
                    print(f"Failed to fetch table structure for {table_name}: {e}")
                    continue
            if not structures:
                return {'are_identical': False, 'representative_table': None}
            # Compare every table structure against the first one.
            first_table = list(structures.keys())[0]
            first_structure = structures[first_table]
            are_identical = True
            for table_name, structure in structures.items():
                if table_name == first_table:
                    continue
                if len(structure) != len(first_structure):
                    are_identical = False
                    break
                for field1, field2 in zip(first_structure, structure):
                    if field1 != field2:
                        are_identical = False
                        break
                if not are_identical:
                    break
            return {
                'are_identical': are_identical,
                'representative_table': first_table,
                'structure': first_structure,
                'table_count': len(structures),
                'table_names': list(structures.keys())
            }
        except Exception as e:
            print(f"Failed to compare table structures: {e}")
            return {'are_identical': False, 'representative_table': None}
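
    # For reference, each PRAGMA table_info row is laid out as
    # (cid, name, type, notnull, dflt_value, pk), so only name/type/notnull/pk
    # survive the normalization above. A hypothetical column declared as
    # "localId INTEGER PRIMARY KEY" would become
    #   {'name': 'localId', 'type': 'INTEGER', 'notnull': 0, 'pk': 1}
    # and two tables count as identical only if these normalized lists match
    # element by element.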

    def analyze_database_structure(self, db_path: Path) -> Dict[str, Any]:
        """Analyze the structure of a single database."""
        db_name = db_path.stem
        print(f"Analyzing database structure: {db_name}")
        conn = self.connect_database(db_path)
        if not conn:
            return {}
        try:
            cursor = conn.cursor()

            def parse_columns_from_create_sql(create_sql: str) -> list[tuple[str, str]]:
                """
                Best-effort extraction of column names from a CREATE statement
                (used when PRAGMA fails, e.g. for FTS5 tables or a missing
                tokenizer extension).
                Returns (name, type) pairs; the type defaults to TEXT when missing.
                """
                out: list[tuple[str, str]] = []
                if not create_sql:
                    return out
                try:
                    start = create_sql.find("(")
                    end = create_sql.rfind(")")
                    if start == -1 or end == -1 or end <= start:
                        return out
                    inner = create_sql[start + 1:end]
                    # Split the column list on top-level commas only.
                    parts: list[str] = []
                    buf = ""
                    depth = 0
                    for ch in inner:
                        if ch == "(":
                            depth += 1
                        elif ch == ")":
                            depth -= 1
                        if ch == "," and depth == 0:
                            parts.append(buf.strip())
                            buf = ""
                        else:
                            buf += ch
                    if buf.strip():
                        parts.append(buf.strip())
                    for part in parts:
                        token = part.strip()
                        if not token:
                            continue
                        low = token.lower()
                        # Skip constraints, primary/unique/foreign keys, checks.
                        if low.startswith(("constraint", "primary", "unique", "foreign", "check")):
                            continue
                        # Skip FTS5 options (tokenize/prefix/content/content_rowid, ...).
                        if "=" in token:
                            key = token.split("=", 1)[0].strip().lower()
                            if key in ("tokenize", "prefix", "content", "content_rowid", "compress", "uncompress"):
                                continue
                        tokens = token.split()
                        if not tokens:
                            continue
                        name = tokens[0].strip("`\"[]")
                        typ = tokens[1].upper() if len(tokens) > 1 and "=" not in tokens[1] else "TEXT"
                        out.append((name, typ))
                except Exception:
                    return out
                return out
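
            # A sketch of what the fallback parser does with a hypothetical FTS5
            # statement (real schemas may differ):
            #
            #   parse_columns_from_create_sql(
            #       "CREATE VIRTUAL TABLE contact_fts USING fts5("
            #       "alias, nickname, tokenize = 'simple')"
            #   )
            #   # -> [('alias', 'TEXT'), ('nickname', 'TEXT')]
            #
            # FTS5 columns carry no declared type, so TEXT is assumed, and the
            # "tokenize = ..." option is recognized by its key and skipped.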

            def get_table_columns(table_name: str) -> list[tuple[str, str]]:
                # Try PRAGMA first.
                try:
                    cursor.execute(f"PRAGMA table_info({table_name})")
                    columns = cursor.fetchall()
                    if columns:
                        return [(col[1], col[2]) for col in columns]
                except Exception:
                    pass
                # Fallback: parse sqlite_master.sql.
                try:
                    cursor.execute(
                        "SELECT sql FROM sqlite_master WHERE type='table' AND name=?",
                        (table_name,),
                    )
                    row = cursor.fetchone()
                    create_sql = row[0] if row and len(row) > 0 else ""
                    return parse_columns_from_create_sql(create_sql or "")
                except Exception:
                    return []
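
            # Expected behaviour (table names are hypothetical): for a regular
            # table such as "contact", PRAGMA succeeds and its columns are
            # returned directly; for an FTS5 table built with a tokenizer that
            # the local SQLite cannot load, PRAGMA may raise an error, and the
            # columns are then recovered from the CREATE statement stored in
            # sqlite_master instead.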

            # Fetch all table names.
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = cursor.fetchall()
            table_names = [table[0] for table in tables]
            # Detect similar tables and group them.
            similar_patterns = self.detect_similar_table_patterns(table_names)
            processed_tables = set()
            db_structure = {}
            # Handle groups of similar tables.
            for prefix, pattern_tables in similar_patterns.items():
                print(f"  Detected similar-table pattern {prefix}_*: {len(pattern_tables)} tables")
                # Compare the table structures.
                comparison = self.compare_table_structures(conn, pattern_tables)
                if comparison['are_identical']:
                    print(f"    → Identical structure, using representative table: {comparison['representative_table']}")
                    # Use the pattern name as the key and record the representative table's fields.
                    representative_table = comparison['representative_table']
                    table_key = f"{prefix}_*"  # Pattern name as the key
                    # Fetch the representative table's column information.
                    columns = get_table_columns(representative_table)
                    fields = {}
                    for field_name, field_type in columns:
                        fields[field_name] = {
                            "type": field_type,
                            "meaning": "",  # Left blank for the user to fill in
                            "notes": f"字段类型: {field_type}"
                        }
                    db_structure[table_key] = {
                        "type": "similar_group",
                        "pattern": f"{prefix}_{{hash}}",
                        "table_count": comparison['table_count'],
                        "representative_table": representative_table,
                        "description": "",  # Left blank for the user to fill in
                        "fields": fields
                    }
                    # Mark these tables as processed.
                    processed_tables.update(pattern_tables)
                else:
                    print("    → Structures differ, keeping the tables separate")
            # Handle the remaining standalone tables.
            for table in tables:
                table_name = table[0]
                if table_name in processed_tables:
                    continue
                try:
                    # Fetch the table's column information.
                    columns = get_table_columns(table_name)
                    fields = {}
                    for field_name, field_type in columns:
                        fields[field_name] = {
                            "type": field_type,
                            "meaning": "",  # Left blank for the user to fill in
                            "notes": f"字段类型: {field_type}"
                        }
                    db_structure[table_name] = {
                        "type": "table",
                        "description": "",  # Left blank for the user to fill in
                        "fields": fields
                    }
                except Exception as e:
                    print(f"  Failed to process table {table_name}: {e}")
                    continue
            return db_structure
        except Exception as e:
            print(f"Failed to analyze database {db_name}: {e}")
            return {}
        finally:
            conn.close()
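
    # The per-database result is a plain dict keyed by table name, or by the
    # "prefix_*" pattern for grouped tables. A hypothetical excerpt (names and
    # counts invented for illustration):
    #
    #   {
    #       "Msg_*": {
    #           "type": "similar_group",
    #           "pattern": "Msg_{hash}",
    #           "table_count": 42,
    #           "representative_table": "Msg_0123456789abcdef0123456789abcdef",
    #           "description": "",
    #           "fields": {"localId": {"type": "INTEGER", "meaning": "",
    #                                  "notes": "字段类型: INTEGER"}},
    #       },
    #       "Name2Id": {"type": "table", "description": "", "fields": {...}},
    #   }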

    def generate_template(
        self,
        output_file: str = "wechat_db_config_template.json",
        *,
        include_excluded: bool = False,
        include_message_shards: bool = False,
        exclude_db_stems: set[str] | None = None,
    ):
        """Generate the configuration template."""
        print("Generating the WeChat database configuration template...")
        # Database patterns excluded by default, with a reason for each.
        excluded_patterns = {} if include_excluded else {
            r'biz_message_\d+\.db$': 'Official-account / WeCom chat-log databases (usually not part of personal chat analysis)',
            r'bizchat\.db$': 'WeCom contact/session database (usually not part of personal chat analysis)',
            r'contact_fts\.db$': 'Contact search index database (FTS)',
            r'favorite_fts\.db$': 'Favorites search index database (FTS)'
        }
        # Collect all database files.
        all_db_files = []
        for account_dir in self.databases_path.iterdir():
            if account_dir.is_dir():
                for db_file in account_dir.glob("*.db"):
                    all_db_files.append(db_file)
        print(f"Found {len(all_db_files)} database files")
        # Filter the database files.
        db_files = []
        excluded_files = []
        for db_file in all_db_files:
            db_filename = db_file.name
            excluded_info = None
            for pattern, description in excluded_patterns.items():
                if re.match(pattern, db_filename):
                    excluded_files.append((db_file, description))
                    excluded_info = description
                    break
            if excluded_info is None:
                db_files.append(db_file)
        # Report the excluded databases.
        if excluded_files:
            print(f"\nExcluding the following database files ({len(excluded_files)}):")
            for excluded_file, description in excluded_files:
                print(f"  - {excluded_file.name} ({description})")
        # Explicitly exclude databases by stem (file name without .db).
        if exclude_db_stems:
            before = len(db_files)
            db_files = [p for p in db_files if p.stem not in exclude_db_stems]
            after = len(db_files)
            if before != after:
                print(f"\nExcluded {before - after} databases via --exclude-db-stem: {sorted(exclude_db_stems)}")
        print(f"\nProcessing {len(db_files)} database files")
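
        # Illustrative effect of the default exclusions (hypothetical account
        # directory contents): "biz_message_0.db", "bizchat.db", "contact_fts.db"
        # and "favorite_fts.db" are filtered out, while "message_0.db",
        # "contact.db" or "session.db" pass through, because re.match() anchors
        # each pattern at the start of the file name.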
        # Filter the message databases: keep only the second-to-last shard (same logic as the main script).
        if not include_message_shards:
            message_numbered_dbs = []
            message_other_dbs = []
            for db in db_files:
                if re.match(r'message_\d+$', db.stem):  # message_{n}.db
                    message_numbered_dbs.append(db)
                elif db.stem.startswith('message_'):  # message_fts.db, message_resource.db, etc.
                    message_other_dbs.append(db)
            if len(message_numbered_dbs) > 1:
                # Sort by the numeric shard index.
                message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1)))
                # Pick the second-to-last shard (by index).
                selected_message_db = message_numbered_dbs[-2]
                print(f"Detected {len(message_numbered_dbs)} message_{{n}}.db databases")
                print(f"Selecting the second-to-last: {selected_message_db.name}")
                # Drop the other message_{n}.db shards from db_files (but keep message_fts.db and friends).
                db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)]
                db_files.append(selected_message_db)
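
        # Illustrative shard selection (hypothetical shard list): with
        # message_0.db .. message_4.db present, the shards sort by their numeric
        # index and message_3.db, the second-to-last, is kept as the single
        # representative; message_fts.db and message_resource.db are unaffected.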
        print(f"Analyzing {len(db_files)} database files")
        # Build the template structure.
        template = {
            "_metadata": {
                "description": "微信数据库字段配置模板",
                "version": "1.0",
                "instructions": {
                    "zh": "请为每个字段的 'meaning' 填入准确的中文含义,'description' 填入数据库/表的功能描述",
                    "en": "Please fill in accurate Chinese meanings for each field's 'meaning' and functional descriptions for 'description'"
                },
                "database_count": len(db_files),
                "generated_time": datetime.now().isoformat()
            },
            "databases": {}
        }
        # Analyze each database.
        for db_file in db_files:
            db_structure = self.analyze_database_structure(db_file)
            if db_structure:
                template["databases"][db_file.stem] = {
                    "description": "",  # Left blank for the user to fill in
                    "file_size": db_file.stat().st_size,
                    "tables": db_structure
                }
        # Add the extra configuration sections.
        template["message_types"] = {
            "_instructions": "消息类型映射 - 格式: 'Type,SubType': '含义描述'",
            "examples": {
                "1,0": "文本消息",
                "3,0": "图片消息",
                "34,0": "语音消息"
            }
        }
        template["friend_types"] = {
            "_instructions": "好友类型映射 - 格式: 'TypeCode': '类型描述'",
            "examples": {
                "1": "好友",
                "2": "微信群",
                "3": "好友"
            }
        }
        # Write the template file.
        output_path = Path(output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(template, f, ensure_ascii=False, indent=2)
        print(f"\nConfiguration template generated: {output_file}")
        print(f"  - Databases: {len(template['databases'])}")
        # Summary statistics.
        total_tables = 0
        total_fields = 0
        similar_groups = 0
        for db_name, db_info in template["databases"].items():
            db_tables = len(db_info["tables"])
            total_tables += db_tables
            for table_name, table_info in db_info["tables"].items():
                if table_info["type"] == "similar_group":
                    similar_groups += 1
                total_fields += len(table_info["fields"])
        print(f"  - Tables: {total_tables}")
        print(f"  - Similar table groups: {similar_groups}")
        print(f"  - Total fields: {total_fields}")
        # Final summary of the exclusions.
        if excluded_files:
            print("\nGeneration summary:")
            print(f"  - Processed successfully: {len(template['databases'])} databases")
            print(f"  - Excluded databases: {len(excluded_files)}")
            print("  - Reason: personal WeChat data analysis does not need WeCom or search-index data")
        print(f"\nPlease edit {output_file} and fill in accurate field meanings and descriptions")


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(description="WeChat database field configuration template generator")
    parser.add_argument("--databases-path", default="output/databases", help="Root directory of the decrypted databases (one sub-directory per account)")
    parser.add_argument("--output", default="wechat_db_config_template.json", help="Path of the output JSON template")
    parser.add_argument("--include-excluded", action="store_true", help="Include databases that are excluded by default (e.g. bizchat/contact_fts/favorite_fts)")
    parser.add_argument("--include-message-shards", action="store_true", help="Include every message_{n}.db (otherwise only the second-to-last shard is kept as the representative)")
    parser.add_argument("--exclude-db-stem", action="append", default=[], help="Exclude databases by stem (file name without .db); may be repeated, e.g.: --exclude-db-stem digital_twin")
    args = parser.parse_args()
    print("WeChat database configuration template generator")
    print("=" * 50)
    generator = ConfigTemplateGenerator(databases_path=args.databases_path)
    generator.generate_template(
        output_file=args.output,
        include_excluded=bool(args.include_excluded),
        include_message_shards=bool(args.include_message_shards),
        exclude_db_stems=set(args.exclude_db_stem or []),
    )


if __name__ == "__main__":
    main()
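
# Typical invocation (the paths shown are the defaults; adjust per setup):
#
#   python generate_config_template.py \
#       --databases-path output/databases \
#       --output wechat_db_config_template.json \
#       --exclude-db-stem digital_twin
#
# --exclude-db-stem reuses the example value from the argument's help text and
# may be repeated to exclude several databases.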