WeChatDataAnalysis/generate_config_template.py
2977094657 35a2266b1c improvement(tools): enhance config template and field-meaning generation
- generate_config_template: add CLI arguments; fall back to parsing columns from the CREATE SQL when FTS/PRAGMA lookups fail
- generate_wechat_db_config: expand the database-description / field-meaning dictionaries and support running from the tools/ directory
- add export_database_schema_markdown: export Markdown documentation based on wechat_db_config.json
2026-02-15 14:34:15 +08:00

#!/usr/bin/env python3
"""
Generate the WeChat database field configuration template.

Builds a JSON template from the actual database structure so that the field
meanings can be filled in manually.
"""
import sqlite3
import json
import argparse
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any
from collections import defaultdict
import re

class ConfigTemplateGenerator:
    """Configuration template generator."""

    def __init__(self, databases_path: str = "output/databases"):
        """Initialize the generator.

        Args:
            databases_path: Path to the decrypted database files.
        """
        self.databases_path = Path(databases_path)
        self.template_structure = {}

    def connect_database(self, db_path: Path) -> sqlite3.Connection | None:
        """Connect to a SQLite database; returns None on failure."""
        try:
            conn = sqlite3.connect(str(db_path))
            return conn
        except Exception as e:
            print(f"Failed to connect to database {db_path}: {e}")
            return None

    def detect_similar_table_patterns(self, table_names: List[str]) -> Dict[str, List[str]]:
        """Detect similar table-name patterns (same logic as the main script)."""
        patterns = defaultdict(list)
        for table_name in table_names:
            # Detect the "prefix_suffix" pattern where the suffix looks like a hash string.
            if '_' in table_name:
                parts = table_name.split('_', 1)  # Split on the first underscore only
                if len(parts) == 2:
                    prefix, suffix = parts
                    # Check whether the suffix looks like a hash (hex string of length >= 16).
                    if len(suffix) >= 16 and all(c in '0123456789abcdefABCDEF' for c in suffix):
                        patterns[prefix].append(table_name)
        # Only return patterns that contain more than one table.
        return {prefix: tables for prefix, tables in patterns.items() if len(tables) > 1}
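
    # A minimal illustration of the grouping (hypothetical table names, not
    # taken from a real database): per-chat tables that share a prefix and a
    # >= 16-character hex suffix collapse into one group; other names are ignored.
    #
    #   names = ["Msg_0123456789abcdef0123456789abcdef",
    #            "Msg_fedcba9876543210fedcba9876543210",
    #            "Name2Id"]
    #   ConfigTemplateGenerator().detect_similar_table_patterns(names)
    #   # -> {"Msg": ["Msg_0123456789abcdef0123456789abcdef",
    #   #             "Msg_fedcba9876543210fedcba9876543210"]}
    #
    # "Name2Id" is left out because it has no hash-like suffix.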

    def compare_table_structures(self, conn: sqlite3.Connection, table_names: List[str]) -> Dict[str, Any]:
        """Compare whether several tables share the same structure (same logic as the main script)."""
        if not table_names:
            return {'are_identical': False, 'representative_table': None}
        try:
            cursor = conn.cursor()
            structures = {}
            # Fetch the structure of every table.
            for table_name in table_names:
                try:
                    cursor.execute(f"PRAGMA table_info({table_name})")
                    columns = cursor.fetchall()
                    # Normalize the column information for comparison.
                    structure = []
                    for col in columns:
                        structure.append({
                            'name': col[1],
                            'type': col[2].upper(),  # Normalize case
                            'notnull': col[3],
                            'pk': col[5]
                        })
                    structures[table_name] = structure
                except Exception as e:
                    print(f"Failed to fetch table structure for {table_name}: {e}")
                    continue
            if not structures:
                return {'are_identical': False, 'representative_table': None}
            # Compare every table structure against the first one.
            first_table = list(structures.keys())[0]
            first_structure = structures[first_table]
            are_identical = True
            for table_name, structure in structures.items():
                if table_name == first_table:
                    continue
                if len(structure) != len(first_structure):
                    are_identical = False
                    break
                for field1, field2 in zip(first_structure, structure):
                    if field1 != field2:
                        are_identical = False
                        break
                if not are_identical:
                    break
            return {
                'are_identical': are_identical,
                'representative_table': first_table,
                'structure': first_structure,
                'table_count': len(structures),
                'table_names': list(structures.keys())
            }
        except Exception as e:
            print(f"Failed to compare table structures: {e}")
            return {'are_identical': False, 'representative_table': None}
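
    # For reference, each PRAGMA table_info row is laid out as
    # (cid, name, type, notnull, dflt_value, pk), so only name/type/notnull/pk
    # survive the normalization above. A hypothetical column declared as
    # "localId INTEGER PRIMARY KEY" would become
    #   {'name': 'localId', 'type': 'INTEGER', 'notnull': 0, 'pk': 1}
    # and two tables count as identical only if these normalized lists match
    # element by element.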

    def analyze_database_structure(self, db_path: Path) -> Dict[str, Any]:
        """Analyze the structure of a single database."""
        db_name = db_path.stem
        print(f"Analyzing database structure: {db_name}")
        conn = self.connect_database(db_path)
        if not conn:
            return {}
        try:
            cursor = conn.cursor()

            def parse_columns_from_create_sql(create_sql: str) -> list[tuple[str, str]]:
                """
                Best-effort extraction of column names from a CREATE statement
                (used when PRAGMA fails, e.g. for FTS5 tables or a missing
                tokenizer extension).
                Returns (name, type) pairs; the type defaults to TEXT when missing.
                """
                out: list[tuple[str, str]] = []
                if not create_sql:
                    return out
                try:
                    start = create_sql.find("(")
                    end = create_sql.rfind(")")
                    if start == -1 or end == -1 or end <= start:
                        return out
                    inner = create_sql[start + 1:end]
                    # Split the column list on top-level commas only.
                    parts: list[str] = []
                    buf = ""
                    depth = 0
                    for ch in inner:
                        if ch == "(":
                            depth += 1
                        elif ch == ")":
                            depth -= 1
                        if ch == "," and depth == 0:
                            parts.append(buf.strip())
                            buf = ""
                        else:
                            buf += ch
                    if buf.strip():
                        parts.append(buf.strip())
                    for part in parts:
                        token = part.strip()
                        if not token:
                            continue
                        low = token.lower()
                        # Skip constraints, primary/unique/foreign keys, checks.
                        if low.startswith(("constraint", "primary", "unique", "foreign", "check")):
                            continue
                        # Skip FTS5 options (tokenize/prefix/content/content_rowid, ...).
                        if "=" in token:
                            key = token.split("=", 1)[0].strip().lower()
                            if key in ("tokenize", "prefix", "content", "content_rowid", "compress", "uncompress"):
                                continue
                        tokens = token.split()
                        if not tokens:
                            continue
                        name = tokens[0].strip("`\"[]")
                        typ = tokens[1].upper() if len(tokens) > 1 and "=" not in tokens[1] else "TEXT"
                        out.append((name, typ))
                except Exception:
                    return out
                return out
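
            # A sketch of what the fallback parser does with a hypothetical FTS5
            # statement (real schemas may differ):
            #
            #   parse_columns_from_create_sql(
            #       "CREATE VIRTUAL TABLE contact_fts USING fts5("
            #       "alias, nickname, tokenize = 'simple')"
            #   )
            #   # -> [('alias', 'TEXT'), ('nickname', 'TEXT')]
            #
            # FTS5 columns carry no declared type, so TEXT is assumed, and the
            # "tokenize = ..." option is recognized by its key and skipped.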

            def get_table_columns(table_name: str) -> list[tuple[str, str]]:
                # Try PRAGMA first.
                try:
                    cursor.execute(f"PRAGMA table_info({table_name})")
                    columns = cursor.fetchall()
                    if columns:
                        return [(col[1], col[2]) for col in columns]
                except Exception:
                    pass
                # Fallback: parse sqlite_master.sql.
                try:
                    cursor.execute(
                        "SELECT sql FROM sqlite_master WHERE type='table' AND name=?",
                        (table_name,),
                    )
                    row = cursor.fetchone()
                    create_sql = row[0] if row and len(row) > 0 else ""
                    return parse_columns_from_create_sql(create_sql or "")
                except Exception:
                    return []
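
            # Expected behaviour (table names are hypothetical): for a regular
            # table such as "contact", PRAGMA succeeds and its columns are
            # returned directly; for an FTS5 table built with a tokenizer that
            # the local SQLite cannot load, PRAGMA may raise an error, and the
            # columns are then recovered from the CREATE statement stored in
            # sqlite_master instead.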

            # Fetch all table names.
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
            tables = cursor.fetchall()
            table_names = [table[0] for table in tables]
            # Detect similar tables and group them.
            similar_patterns = self.detect_similar_table_patterns(table_names)
            processed_tables = set()
            db_structure = {}
            # Handle groups of similar tables.
            for prefix, pattern_tables in similar_patterns.items():
                print(f"  Detected similar-table pattern {prefix}_*: {len(pattern_tables)} tables")
                # Compare the table structures.
                comparison = self.compare_table_structures(conn, pattern_tables)
                if comparison['are_identical']:
                    print(f"    → Identical structure, using representative table: {comparison['representative_table']}")
                    # Use the pattern name as the key and record the representative table's fields.
                    representative_table = comparison['representative_table']
                    table_key = f"{prefix}_*"  # Pattern name as the key
                    # Fetch the representative table's column information.
                    columns = get_table_columns(representative_table)
                    fields = {}
                    for field_name, field_type in columns:
                        fields[field_name] = {
                            "type": field_type,
                            "meaning": "",  # Left blank for the user to fill in
                            "notes": f"字段类型: {field_type}"
                        }
                    db_structure[table_key] = {
                        "type": "similar_group",
                        "pattern": f"{prefix}_{{hash}}",
                        "table_count": comparison['table_count'],
                        "representative_table": representative_table,
                        "description": "",  # Left blank for the user to fill in
                        "fields": fields
                    }
                    # Mark these tables as processed.
                    processed_tables.update(pattern_tables)
                else:
                    print("    → Structures differ, keeping the tables separate")
            # Handle the remaining standalone tables.
            for table in tables:
                table_name = table[0]
                if table_name in processed_tables:
                    continue
                try:
                    # Fetch the table's column information.
                    columns = get_table_columns(table_name)
                    fields = {}
                    for field_name, field_type in columns:
                        fields[field_name] = {
                            "type": field_type,
                            "meaning": "",  # Left blank for the user to fill in
                            "notes": f"字段类型: {field_type}"
                        }
                    db_structure[table_name] = {
                        "type": "table",
                        "description": "",  # Left blank for the user to fill in
                        "fields": fields
                    }
                except Exception as e:
                    print(f"  Failed to process table {table_name}: {e}")
                    continue
            return db_structure
        except Exception as e:
            print(f"Failed to analyze database {db_name}: {e}")
            return {}
        finally:
            conn.close()
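
    # The per-database result is a plain dict keyed by table name, or by the
    # "prefix_*" pattern for grouped tables. A hypothetical excerpt (names and
    # counts invented for illustration):
    #
    #   {
    #       "Msg_*": {
    #           "type": "similar_group",
    #           "pattern": "Msg_{hash}",
    #           "table_count": 42,
    #           "representative_table": "Msg_0123456789abcdef0123456789abcdef",
    #           "description": "",
    #           "fields": {"localId": {"type": "INTEGER", "meaning": "",
    #                                  "notes": "字段类型: INTEGER"}},
    #       },
    #       "Name2Id": {"type": "table", "description": "", "fields": {...}},
    #   }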

    def generate_template(
        self,
        output_file: str = "wechat_db_config_template.json",
        *,
        include_excluded: bool = False,
        include_message_shards: bool = False,
        exclude_db_stems: set[str] | None = None,
    ):
        """Generate the configuration template."""
        print("Generating the WeChat database configuration template...")
        # Database patterns excluded by default, with a reason for each.
        excluded_patterns = {} if include_excluded else {
            r'biz_message_\d+\.db$': 'Official-account / WeCom chat-log databases (usually not part of personal chat analysis)',
            r'bizchat\.db$': 'WeCom contact/session database (usually not part of personal chat analysis)',
            r'contact_fts\.db$': 'Contact search index database (FTS)',
            r'favorite_fts\.db$': 'Favorites search index database (FTS)'
        }
        # Collect all database files.
        all_db_files = []
        for account_dir in self.databases_path.iterdir():
            if account_dir.is_dir():
                for db_file in account_dir.glob("*.db"):
                    all_db_files.append(db_file)
        print(f"Found {len(all_db_files)} database files")
        # Filter the database files.
        db_files = []
        excluded_files = []
        for db_file in all_db_files:
            db_filename = db_file.name
            excluded_info = None
            for pattern, description in excluded_patterns.items():
                if re.match(pattern, db_filename):
                    excluded_files.append((db_file, description))
                    excluded_info = description
                    break
            if excluded_info is None:
                db_files.append(db_file)
        # Report the excluded databases.
        if excluded_files:
            print(f"\nExcluding the following database files ({len(excluded_files)}):")
            for excluded_file, description in excluded_files:
                print(f"  - {excluded_file.name} ({description})")
        # Explicitly exclude databases by stem (file name without .db).
        if exclude_db_stems:
            before = len(db_files)
            db_files = [p for p in db_files if p.stem not in exclude_db_stems]
            after = len(db_files)
            if before != after:
                print(f"\nExcluded {before - after} databases via --exclude-db-stem: {sorted(exclude_db_stems)}")
        print(f"\nProcessing {len(db_files)} database files")
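
        # Illustrative effect of the default exclusions (hypothetical account
        # directory contents): "biz_message_0.db", "bizchat.db", "contact_fts.db"
        # and "favorite_fts.db" are filtered out, while "message_0.db",
        # "contact.db" or "session.db" pass through, because re.match() anchors
        # each pattern at the start of the file name.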
        # Filter the message databases: keep only the second-to-last shard (same logic as the main script).
        if not include_message_shards:
            message_numbered_dbs = []
            message_other_dbs = []
            for db in db_files:
                if re.match(r'message_\d+$', db.stem):  # message_{n}.db
                    message_numbered_dbs.append(db)
                elif db.stem.startswith('message_'):  # message_fts.db, message_resource.db, etc.
                    message_other_dbs.append(db)
            if len(message_numbered_dbs) > 1:
                # Sort by the numeric shard index.
                message_numbered_dbs.sort(key=lambda x: int(re.search(r'message_(\d+)', x.stem).group(1)))
                # Pick the second-to-last shard (by index).
                selected_message_db = message_numbered_dbs[-2]
                print(f"Detected {len(message_numbered_dbs)} message_{{n}}.db databases")
                print(f"Selecting the second-to-last: {selected_message_db.name}")
                # Drop the other message_{n}.db shards from db_files (but keep message_fts.db and friends).
                db_files = [db for db in db_files if not re.match(r'message_\d+$', db.stem)]
                db_files.append(selected_message_db)
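
        # Illustrative shard selection (hypothetical shard list): with
        # message_0.db .. message_4.db present, the shards sort by their numeric
        # index and message_3.db, the second-to-last, is kept as the single
        # representative; message_fts.db and message_resource.db are unaffected.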
        print(f"Analyzing {len(db_files)} database files")
        # Build the template structure.
        template = {
            "_metadata": {
                "description": "微信数据库字段配置模板",
                "version": "1.0",
                "instructions": {
                    "zh": "请为每个字段的 'meaning' 填入准确的中文含义,'description' 填入数据库/表的功能描述",
                    "en": "Please fill in accurate Chinese meanings for each field's 'meaning' and functional descriptions for 'description'"
                },
                "database_count": len(db_files),
                "generated_time": datetime.now().isoformat()
            },
            "databases": {}
        }
        # Analyze each database.
        for db_file in db_files:
            db_structure = self.analyze_database_structure(db_file)
            if db_structure:
                template["databases"][db_file.stem] = {
                    "description": "",  # Left blank for the user to fill in
                    "file_size": db_file.stat().st_size,
                    "tables": db_structure
                }
        # Add the extra configuration sections.
        template["message_types"] = {
            "_instructions": "消息类型映射 - 格式: 'Type,SubType': '含义描述'",
            "examples": {
                "1,0": "文本消息",
                "3,0": "图片消息",
                "34,0": "语音消息"
            }
        }
        template["friend_types"] = {
            "_instructions": "好友类型映射 - 格式: 'TypeCode': '类型描述'",
            "examples": {
                "1": "好友",
                "2": "微信群",
                "3": "好友"
            }
        }
        # Write the template file.
        output_path = Path(output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(template, f, ensure_ascii=False, indent=2)
        print(f"\nConfiguration template generated: {output_file}")
        print(f"  - Databases: {len(template['databases'])}")
        # Summary statistics.
        total_tables = 0
        total_fields = 0
        similar_groups = 0
        for db_name, db_info in template["databases"].items():
            db_tables = len(db_info["tables"])
            total_tables += db_tables
            for table_name, table_info in db_info["tables"].items():
                if table_info["type"] == "similar_group":
                    similar_groups += 1
                total_fields += len(table_info["fields"])
        print(f"  - Tables: {total_tables}")
        print(f"  - Similar table groups: {similar_groups}")
        print(f"  - Total fields: {total_fields}")
        # Final summary of the exclusions.
        if excluded_files:
            print("\nGeneration summary:")
            print(f"  - Processed successfully: {len(template['databases'])} databases")
            print(f"  - Excluded databases: {len(excluded_files)}")
            print("  - Reason: personal WeChat data analysis does not need WeCom or search-index data")
        print(f"\nPlease edit {output_file} and fill in accurate field meanings and descriptions")


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(description="WeChat database field configuration template generator")
    parser.add_argument("--databases-path", default="output/databases", help="Root directory of the decrypted databases (one sub-directory per account)")
    parser.add_argument("--output", default="wechat_db_config_template.json", help="Path of the output JSON template")
    parser.add_argument("--include-excluded", action="store_true", help="Include databases that are excluded by default (e.g. bizchat/contact_fts/favorite_fts)")
    parser.add_argument("--include-message-shards", action="store_true", help="Include every message_{n}.db (otherwise only the second-to-last shard is kept as the representative)")
    parser.add_argument("--exclude-db-stem", action="append", default=[], help="Exclude databases by stem (file name without .db); may be repeated, e.g.: --exclude-db-stem digital_twin")
    args = parser.parse_args()
    print("WeChat database configuration template generator")
    print("=" * 50)
    generator = ConfigTemplateGenerator(databases_path=args.databases_path)
    generator.generate_template(
        output_file=args.output,
        include_excluded=bool(args.include_excluded),
        include_message_shards=bool(args.include_message_shards),
        exclude_db_stems=set(args.exclude_db_stem or []),
    )


if __name__ == "__main__":
    main()
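
# Typical invocation (the paths shown are the defaults; adjust per setup):
#
#   python generate_config_template.py \
#       --databases-path output/databases \
#       --output wechat_db_config_template.json \
#       --exclude-db-stem digital_twin
#
# --exclude-db-stem reuses the example value from the argument's help text and
# may be repeated to exclude several databases.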