mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-19 14:20:51 +08:00
- generate_config_template: 增加 CLI 参数;FTS/PRAGMA 失败时从建表 SQL 兜底解析列\n- generate_wechat_db_config: 扩充库描述/字段含义词典,并支持从 tools/ 目录运行\n- 新增 export_database_schema_markdown:基于 wechat_db_config.json 导出 Markdown 文档
481 lines
20 KiB
Python
481 lines
20 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
生成微信数据库字段配置模板
|
||
基于实际数据库结构生成JSON模板,供人工填写字段含义
|
||
"""
|
||
|
||
import sqlite3
|
||
import json
|
||
import argparse
|
||
from pathlib import Path
|
||
from typing import Dict, List, Any
|
||
from collections import defaultdict
|
||
import re
|
||
|
||
class ConfigTemplateGenerator:
|
||
"""配置模板生成器"""
|
||
|
||
def __init__(self, databases_path: str = "output/databases"):
|
||
"""初始化生成器
|
||
|
||
Args:
|
||
databases_path: 数据库文件路径
|
||
"""
|
||
self.databases_path = Path(databases_path)
|
||
self.template_structure = {}
|
||
|
||
def connect_database(self, db_path: Path) -> sqlite3.Connection:
|
||
"""连接SQLite数据库"""
|
||
try:
|
||
conn = sqlite3.connect(str(db_path))
|
||
return conn
|
||
except Exception as e:
|
||
print(f"连接数据库失败 {db_path}: {e}")
|
||
return None
|
||
|
||
def detect_similar_table_patterns(self, table_names: List[str]) -> Dict[str, List[str]]:
|
||
"""检测相似的表名模式(与主脚本逻辑一致)"""
|
||
patterns = defaultdict(list)
|
||
|
||
for table_name in table_names:
|
||
# 检测 前缀_后缀 模式,其中后缀是32位或更长的哈希字符串
|
||
if '_' in table_name:
|
||
parts = table_name.split('_', 1) # 只分割第一个下划线
|
||
if len(parts) == 2:
|
||
prefix, suffix = parts
|
||
# 检查后缀是否像哈希值(长度>=16的十六进制字符串)
|
||
if len(suffix) >= 16 and all(c in '0123456789abcdefABCDEF' for c in suffix):
|
||
patterns[prefix].append(table_name)
|
||
|
||
# 只返回有多个表的模式
|
||
return {prefix: tables for prefix, tables in patterns.items() if len(tables) > 1}
|
||
|
||
def compare_table_structures(self, conn: sqlite3.Connection, table_names: List[str]) -> Dict[str, Any]:
|
||
"""比较多个表的结构是否相同(与主脚本逻辑一致)"""
|
||
if not table_names:
|
||
return {'are_identical': False, 'representative_table': None}
|
||
|
||
try:
|
||
cursor = conn.cursor()
|
||
structures = {}
|
||
|
||
# 获取每个表的结构
|
||
for table_name in table_names:
|
||
try:
|
||
cursor.execute(f"PRAGMA table_info({table_name})")
|
||
columns = cursor.fetchall()
|
||
|
||
# 标准化字段信息用于比较
|
||
structure = []
|
||
for col in columns:
|
||
structure.append({
|
||
'name': col[1],
|
||
'type': col[2].upper(), # 统一大小写
|
||
'notnull': col[3],
|
||
'pk': col[5]
|
||
})
|
||
|
||
structures[table_name] = structure
|
||
except Exception as e:
|
||
print(f"获取表结构失败 {table_name}: {e}")
|
||
continue
|
||
|
||
if not structures:
|
||
return {'are_identical': False, 'representative_table': None}
|
||
|
||
# 比较所有表结构
|
||
first_table = list(structures.keys())[0]
|
||
first_structure = structures[first_table]
|
||
|
||
are_identical = True
|
||
|
||
for table_name, structure in structures.items():
|
||
if table_name == first_table:
|
||
continue
|
||
|
||
if len(structure) != len(first_structure):
|
||
are_identical = False
|
||
break
|
||
|
||
for i, (field1, field2) in enumerate(zip(first_structure, structure)):
|
||
if field1 != field2:
|
||
are_identical = False
|
||
break
|
||
|
||
if not are_identical:
|
||
break
|
||
|
||
return {
|
||
'are_identical': are_identical,
|
||
'representative_table': first_table,
|
||
'structure': first_structure,
|
||
'table_count': len(structures),
|
||
'table_names': list(structures.keys())
|
||
}
|
||
|
||
except Exception as e:
|
||
print(f"比较表结构失败: {e}")
|
||
return {'are_identical': False, 'representative_table': None}
|
||
|
||
def analyze_database_structure(self, db_path: Path) -> Dict[str, Any]:
|
||
"""分析单个数据库结构"""
|
||
db_name = db_path.stem
|
||
print(f"分析数据库结构: {db_name}")
|
||
|
||
conn = self.connect_database(db_path)
|
||
if not conn:
|
||
return {}
|
||
|
||
try:
|
||
cursor = conn.cursor()
|
||
|
||
def parse_columns_from_create_sql(create_sql: str) -> list[tuple[str, str]]:
|
||
"""
|
||
从建表 SQL 中尽力解析列名(用于 FTS5/缺失 tokenizer 扩展导致 PRAGMA 失败的情况)。
|
||
返回 (name, type);类型缺失时默认 TEXT。
|
||
"""
|
||
out: list[tuple[str, str]] = []
|
||
if not create_sql:
|
||
return out
|
||
try:
|
||
start = create_sql.find("(")
|
||
end = create_sql.rfind(")")
|
||
if start == -1 or end == -1 or end <= start:
|
||
return out
|
||
inner = create_sql[start + 1:end]
|
||
|
||
parts: list[str] = []
|
||
buf = ""
|
||
depth = 0
|
||
for ch in inner:
|
||
if ch == "(":
|
||
depth += 1
|
||
elif ch == ")":
|
||
depth -= 1
|
||
if ch == "," and depth == 0:
|
||
parts.append(buf.strip())
|
||
buf = ""
|
||
else:
|
||
buf += ch
|
||
if buf.strip():
|
||
parts.append(buf.strip())
|
||
|
||
for part in parts:
|
||
token = part.strip()
|
||
if not token:
|
||
continue
|
||
low = token.lower()
|
||
# 跳过约束/外键等
|
||
if low.startswith(("constraint", "primary", "unique", "foreign", "check")):
|
||
continue
|
||
# fts5 选项(tokenize/prefix/content/content_rowid 等)
|
||
if "=" in token:
|
||
key = token.split("=", 1)[0].strip().lower()
|
||
if key in ("tokenize", "prefix", "content", "content_rowid", "compress", "uncompress"):
|
||
continue
|
||
tokens = token.split()
|
||
if not tokens:
|
||
continue
|
||
name = tokens[0].strip("`\"[]")
|
||
typ = tokens[1].upper() if len(tokens) > 1 and "=" not in tokens[1] else "TEXT"
|
||
out.append((name, typ))
|
||
except Exception:
|
||
return out
|
||
return out
|
||
|
||
def get_table_columns(table_name: str) -> list[tuple[str, str]]:
|
||
# 先尝试 PRAGMA
|
||
try:
|
||
cursor.execute(f"PRAGMA table_info({table_name})")
|
||
columns = cursor.fetchall()
|
||
if columns:
|
||
return [(col[1], col[2]) for col in columns]
|
||
except Exception:
|
||
pass
|
||
|
||
# 兜底:从 sqlite_master.sql 解析
|
||
try:
|
||
cursor.execute(
|
||
"SELECT sql FROM sqlite_master WHERE type='table' AND name=?",
|
||
(table_name,),
|
||
)
|
||
row = cursor.fetchone()
|
||
create_sql = row[0] if row and len(row) > 0 else ""
|
||
return parse_columns_from_create_sql(create_sql or "")
|
||
except Exception:
|
||
return []
|
||
|
||
# 获取所有表名
|
||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
|
||
tables = cursor.fetchall()
|
||
table_names = [table[0] for table in tables]
|
||
|
||
# 检测相似表并分组
|
||
similar_patterns = self.detect_similar_table_patterns(table_names)
|
||
processed_tables = set()
|
||
db_structure = {}
|
||
|
||
# 处理相似表组
|
||
for prefix, pattern_tables in similar_patterns.items():
|
||
print(f" 检测到相似表模式 {prefix}_*: {len(pattern_tables)} 个表")
|
||
|
||
# 比较表结构
|
||
comparison = self.compare_table_structures(conn, pattern_tables)
|
||
|
||
if comparison['are_identical']:
|
||
print(f" → 表结构完全相同,使用代表表: {comparison['representative_table']}")
|
||
# 使用模式名作为键,记录代表表的字段
|
||
representative_table = comparison['representative_table']
|
||
table_key = f"{prefix}_*" # 使用模式名
|
||
|
||
# 获取代表表的字段信息
|
||
columns = get_table_columns(representative_table)
|
||
|
||
fields = {}
|
||
for field_name, field_type in columns:
|
||
fields[field_name] = {
|
||
"type": field_type,
|
||
"meaning": "", # 留空供用户填写
|
||
"notes": f"字段类型: {field_type}"
|
||
}
|
||
|
||
db_structure[table_key] = {
|
||
"type": "similar_group",
|
||
"pattern": f"{prefix}_{{hash}}",
|
||
"table_count": comparison['table_count'],
|
||
"representative_table": representative_table,
|
||
"description": "", # 留空供用户填写
|
||
"fields": fields
|
||
}
|
||
|
||
# 标记这些表已被处理
|
||
processed_tables.update(pattern_tables)
|
||
else:
|
||
print(f" → 表结构不同,保持独立处理")
|
||
|
||
# 处理剩余的独立表
|
||
for table in tables:
|
||
table_name = table[0]
|
||
|
||
if table_name in processed_tables:
|
||
continue
|
||
|
||
try:
|
||
# 获取表字段信息
|
||
columns = get_table_columns(table_name)
|
||
|
||
fields = {}
|
||
for field_name, field_type in columns:
|
||
fields[field_name] = {
|
||
"type": field_type,
|
||
"meaning": "", # 留空供用户填写
|
||
"notes": f"字段类型: {field_type}"
|
||
}
|
||
|
||
db_structure[table_name] = {
|
||
"type": "table",
|
||
"description": "", # 留空供用户填写
|
||
"fields": fields
|
||
}
|
||
|
||
except Exception as e:
|
||
print(f" 处理表 {table_name} 失败: {e}")
|
||
continue
|
||
|
||
return db_structure
|
||
|
||
except Exception as e:
|
||
print(f"分析数据库失败 {db_name}: {e}")
|
||
return {}
|
||
finally:
|
||
conn.close()
|
||
|
||
def generate_template(
|
||
self,
|
||
output_file: str = "wechat_db_config_template.json",
|
||
*,
|
||
include_excluded: bool = False,
|
||
include_message_shards: bool = False,
|
||
exclude_db_stems: set[str] | None = None,
|
||
):
|
||
"""生成配置模板"""
|
||
print("开始生成微信数据库配置模板...")
|
||
|
||
# 定义要排除的数据库模式和描述
|
||
excluded_patterns = {} if include_excluded else {
|
||
r'biz_message_\d+\.db$': '公众号/企业微信聊天记录数据库(通常不参与个人聊天分析)',
|
||
r'bizchat\.db$': '企业微信联系人/会话数据库(通常不参与个人聊天分析)',
|
||
r'contact_fts\.db$': '联系人搜索索引数据库(FTS)',
|
||
r'favorite_fts\.db$': '收藏搜索索引数据库(FTS)'
|
||
}
|
||
|
||
# 查找所有数据库文件
|
||
all_db_files = []
|
||
for account_dir in self.databases_path.iterdir():
|
||
if account_dir.is_dir():
|
||
for db_file in account_dir.glob("*.db"):
|
||
all_db_files.append(db_file)
|
||
|
||
print(f"找到 {len(all_db_files)} 个数据库文件")
|
||
|
||
# 过滤数据库文件
|
||
db_files = []
|
||
excluded_files = []
|
||
|
||
for db_file in all_db_files:
|
||
db_filename = db_file.name
|
||
excluded_info = None
|
||
|
||
for pattern, description in excluded_patterns.items():
|
||
if re.match(pattern, db_filename):
|
||
excluded_files.append((db_file, description))
|
||
excluded_info = description
|
||
break
|
||
|
||
if excluded_info is None:
|
||
db_files.append(db_file)
|
||
|
||
# 显示排除的数据库
|
||
if excluded_files:
|
||
print(f"\n排除以下数据库文件({len(excluded_files)} 个):")
|
||
for excluded_file, description in excluded_files:
|
||
print(f" - {excluded_file.name} ({description})")
|
||
|
||
# 显式排除指定 stem(不含 .db)
|
||
if exclude_db_stems:
|
||
before = len(db_files)
|
||
db_files = [p for p in db_files if p.stem not in exclude_db_stems]
|
||
after = len(db_files)
|
||
if before != after:
|
||
print(f"\n按 --exclude-db-stem 排除 {before - after} 个数据库: {sorted(exclude_db_stems)}")
|
||
|
||
print(f"\n实际处理 {len(db_files)} 个数据库文件")
|
||
|
||
# 过滤message数据库,只保留倒数第二个(与主脚本逻辑一致)
|
||
if not include_message_shards:
|
||
message_numbered_dbs = []
|
||
message_other_dbs = []
|
||
|
||
for db in db_files:
|
||
if re.match(r'message_\d+$', db.stem): # message_{数字}.db
|
||
message_numbered_dbs.append(db)
|
||
elif db.stem.startswith('message_'): # message_fts.db, message_resource.db等
|
||
message_other_dbs.append(db)
|
||
|
||
if len(message_numbered_dbs) > 1:
|
||
# 按数字编号排序(提取数字进行排序)
|
||
message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1)))
|
||
# 选择倒数第二个(按编号排序)
|
||
selected_message_db = message_numbered_dbs[-2] # 倒数第二个
|
||
print(f"检测到 {len(message_numbered_dbs)} 个message_{{数字}}.db数据库")
|
||
print(f"选择倒数第二个: {selected_message_db.name}")
|
||
|
||
# 从db_files中移除其他message_{数字}.db数据库,但保留message_fts.db等
|
||
db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)]
|
||
db_files.append(selected_message_db)
|
||
|
||
print(f"实际分析 {len(db_files)} 个数据库文件")
|
||
|
||
# 生成模板结构
|
||
template = {
|
||
"_metadata": {
|
||
"description": "微信数据库字段配置模板",
|
||
"version": "1.0",
|
||
"instructions": {
|
||
"zh": "请为每个字段的 'meaning' 填入准确的中文含义,'description' 填入数据库/表的功能描述",
|
||
"en": "Please fill in accurate Chinese meanings for each field's 'meaning' and functional descriptions for 'description'"
|
||
},
|
||
"database_count": len(db_files),
|
||
"generated_time": __import__('datetime').datetime.now().isoformat()
|
||
},
|
||
"databases": {}
|
||
}
|
||
|
||
# 分析每个数据库
|
||
for db_file in db_files:
|
||
db_structure = self.analyze_database_structure(db_file)
|
||
if db_structure:
|
||
template["databases"][db_file.stem] = {
|
||
"description": "", # 留空供用户填写
|
||
"file_size": db_file.stat().st_size,
|
||
"tables": db_structure
|
||
}
|
||
|
||
# 添加额外的配置项
|
||
template["message_types"] = {
|
||
"_instructions": "消息类型映射 - 格式: 'Type,SubType': '含义描述'",
|
||
"examples": {
|
||
"1,0": "文本消息",
|
||
"3,0": "图片消息",
|
||
"34,0": "语音消息"
|
||
}
|
||
}
|
||
|
||
template["friend_types"] = {
|
||
"_instructions": "好友类型映射 - 格式: 'TypeCode': '类型描述'",
|
||
"examples": {
|
||
"1": "好友",
|
||
"2": "微信群",
|
||
"3": "好友"
|
||
}
|
||
}
|
||
|
||
# 写入模板文件
|
||
output_path = Path(output_file)
|
||
with open(output_path, 'w', encoding='utf-8') as f:
|
||
json.dump(template, f, ensure_ascii=False, indent=2)
|
||
|
||
print(f"\n配置模板生成完成: {output_file}")
|
||
print(f" - 数据库数量: {len(template['databases'])}")
|
||
|
||
# 统计信息
|
||
total_tables = 0
|
||
total_fields = 0
|
||
similar_groups = 0
|
||
|
||
for db_name, db_info in template["databases"].items():
|
||
db_tables = len(db_info["tables"])
|
||
total_tables += db_tables
|
||
|
||
for table_name, table_info in db_info["tables"].items():
|
||
if table_info["type"] == "similar_group":
|
||
similar_groups += 1
|
||
total_fields += len(table_info["fields"])
|
||
|
||
print(f" - 表数量: {total_tables}")
|
||
print(f" - 相似表组: {similar_groups}")
|
||
print(f" - 字段总数: {total_fields}")
|
||
|
||
# 显示完成统计信息
|
||
if excluded_files:
|
||
print(f"\n生成完成统计:")
|
||
print(f" - 成功处理: {len(template['databases'])} 个数据库")
|
||
print(f" - 排除数据库: {len(excluded_files)} 个")
|
||
print(f" - 排除原因: 个人微信数据分析不需要企业微信和搜索索引数据")
|
||
|
||
print(f"\n请编辑 {output_file} 文件,填入准确的字段含义和描述")
|
||
|
||
def main():
    """Command-line entry point: parse the options and generate the template."""
    parser = argparse.ArgumentParser(description="微信数据库字段配置模板生成器")

    # Declare the options as data, then register them in one loop.
    option_specs = (
        ("--databases-path", {"default": "output/databases", "help": "解密后的数据库根目录(按账号分目录)"}),
        ("--output", {"default": "wechat_db_config_template.json", "help": "输出 JSON 模板路径"}),
        ("--include-excluded", {"action": "store_true", "help": "包含默认会被排除的数据库(如 bizchat/contact_fts/favorite_fts 等)"}),
        ("--include-message-shards", {"action": "store_true", "help": "包含所有 message_{n}.db(否则仅保留倒数第二个作代表)"}),
        ("--exclude-db-stem", {"action": "append", "default": [], "help": "按 stem(不含 .db)排除数据库,可重复,例如: --exclude-db-stem digital_twin"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    print("微信数据库配置模板生成器")
    print("=" * 50)

    # Repeated --exclude-db-stem values are collapsed into a set.
    stems_to_skip = set(args.exclude_db_stem or [])
    generator = ConfigTemplateGenerator(databases_path=args.databases_path)
    generator.generate_template(
        output_file=args.output,
        include_excluded=bool(args.include_excluded),
        include_message_shards=bool(args.include_message_shards),
        exclude_db_stems=stems_to_skip,
    )
|
||
|
||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|