mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-19 14:20:51 +08:00
feat(chat): 增加消息搜索索引与会话预览索引
- 新增 chat_search_index.db:提供索引状态/构建/发送者接口 - 新增 session_preview.db:会话最新消息预览索引,支持指纹校验与过期重建 - 解密完成后默认预构建会话预览索引(WECHAT_TOOL_BUILD_SESSION_PREVIEW=0 可关闭)
This commit is contained in:
@@ -669,8 +669,7 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
|||||||
des = _extract_xml_tag_text(text, "des")
|
des = _extract_xml_tag_text(text, "des")
|
||||||
url = _extract_xml_tag_text(text, "url")
|
url = _extract_xml_tag_text(text, "url")
|
||||||
|
|
||||||
if "<patmsg" in text.lower() or "<template>" in text.lower():
|
lower = text.lower()
|
||||||
return {"renderType": "system", "content": "[拍一拍]"}
|
|
||||||
|
|
||||||
if app_type in (5, 68) and url:
|
if app_type in (5, 68) and url:
|
||||||
thumb_url = _extract_xml_tag_text(text, "thumburl")
|
thumb_url = _extract_xml_tag_text(text, "thumburl")
|
||||||
@@ -698,7 +697,7 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
|||||||
"fileMd5": file_md5 or "",
|
"fileMd5": file_md5 or "",
|
||||||
}
|
}
|
||||||
|
|
||||||
if app_type == 57 or "<refermsg" in text:
|
if app_type == 57 or "<refermsg" in lower:
|
||||||
refer_block = _extract_refermsg_block(text)
|
refer_block = _extract_refermsg_block(text)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -713,6 +712,12 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
|||||||
|
|
||||||
reply_text = _extract_xml_tag_text(text_wo_refer, "title") or _extract_xml_tag_text(text, "title")
|
reply_text = _extract_xml_tag_text(text_wo_refer, "title") or _extract_xml_tag_text(text, "title")
|
||||||
refer_displayname = _extract_xml_tag_or_attr(refer_block, "displayname")
|
refer_displayname = _extract_xml_tag_or_attr(refer_block, "displayname")
|
||||||
|
refer_fromusr = (
|
||||||
|
_extract_xml_tag_or_attr(refer_block, "fromusr")
|
||||||
|
or _extract_xml_tag_or_attr(refer_block, "fromusername")
|
||||||
|
or ""
|
||||||
|
)
|
||||||
|
refer_svrid = _extract_xml_tag_or_attr(refer_block, "svrid")
|
||||||
refer_content = _extract_xml_tag_text(refer_block, "content")
|
refer_content = _extract_xml_tag_text(refer_block, "content")
|
||||||
refer_type = _extract_xml_tag_or_attr(refer_block, "type")
|
refer_type = _extract_xml_tag_or_attr(refer_block, "type")
|
||||||
|
|
||||||
@@ -730,6 +735,7 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
|||||||
refer_content = rest
|
refer_content = rest
|
||||||
|
|
||||||
t = str(refer_type or "").strip()
|
t = str(refer_type or "").strip()
|
||||||
|
quote_voice_length = ""
|
||||||
if t == "3":
|
if t == "3":
|
||||||
refer_content = "[图片]"
|
refer_content = "[图片]"
|
||||||
elif t == "47":
|
elif t == "47":
|
||||||
@@ -737,6 +743,17 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
|||||||
elif t == "43" or t == "62":
|
elif t == "43" or t == "62":
|
||||||
refer_content = "[视频]"
|
refer_content = "[视频]"
|
||||||
elif t == "34":
|
elif t == "34":
|
||||||
|
# Some versions embed voice length (ms) in refermsg.content, e.g.
|
||||||
|
# "wxid_xxx:15369:1:" -> 15s
|
||||||
|
try:
|
||||||
|
rc = str(refer_content or "").strip()
|
||||||
|
parts = rc.split(":")
|
||||||
|
if len(parts) >= 2:
|
||||||
|
dur_raw = (parts[1] or "").strip()
|
||||||
|
if dur_raw.isdigit():
|
||||||
|
quote_voice_length = str(int(dur_raw))
|
||||||
|
except Exception:
|
||||||
|
quote_voice_length = ""
|
||||||
refer_content = "[语音]"
|
refer_content = "[语音]"
|
||||||
elif t == "49" and refer_content:
|
elif t == "49" and refer_content:
|
||||||
refer_content = f"[链接] {refer_content}".strip()
|
refer_content = f"[链接] {refer_content}".strip()
|
||||||
@@ -744,10 +761,17 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
|||||||
return {
|
return {
|
||||||
"renderType": "quote",
|
"renderType": "quote",
|
||||||
"content": reply_text or "[引用消息]",
|
"content": reply_text or "[引用消息]",
|
||||||
|
"quoteUsername": str(refer_fromusr or "").strip(),
|
||||||
"quoteTitle": refer_displayname or "",
|
"quoteTitle": refer_displayname or "",
|
||||||
"quoteContent": refer_content or "",
|
"quoteContent": refer_content or "",
|
||||||
|
"quoteType": t,
|
||||||
|
"quoteServerId": str(refer_svrid or "").strip(),
|
||||||
|
"quoteVoiceLength": quote_voice_length,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if app_type == 62 or "<patmsg" in lower or 'type="patmsg"' in lower or "type='patmsg'" in lower:
|
||||||
|
return {"renderType": "system", "content": "[拍一拍]"}
|
||||||
|
|
||||||
if app_type == 2000 or (
|
if app_type == 2000 or (
|
||||||
"<wcpayinfo" in text and ("transfer" in text.lower() or "paysubtype" in text.lower())
|
"<wcpayinfo" in text and ("transfer" in text.lower() or "paysubtype" in text.lower())
|
||||||
):
|
):
|
||||||
@@ -976,7 +1000,7 @@ def _load_latest_message_previews(account_dir: Path, usernames: list[str]) -> di
|
|||||||
"n.user_name AS sender_username "
|
"n.user_name AS sender_username "
|
||||||
f"FROM {quoted} m "
|
f"FROM {quoted} m "
|
||||||
"LEFT JOIN Name2Id n ON m.real_sender_id = n.rowid "
|
"LEFT JOIN Name2Id n ON m.real_sender_id = n.rowid "
|
||||||
"ORDER BY m.create_time DESC, m.sort_seq DESC, m.local_id DESC "
|
"ORDER BY m.sort_seq DESC, m.local_id DESC "
|
||||||
"LIMIT 1"
|
"LIMIT 1"
|
||||||
).fetchone()
|
).fetchone()
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -984,7 +1008,7 @@ def _load_latest_message_previews(account_dir: Path, usernames: list[str]) -> di
|
|||||||
"SELECT "
|
"SELECT "
|
||||||
"local_type, message_content, compress_content, create_time, sort_seq, local_id, '' AS sender_username "
|
"local_type, message_content, compress_content, create_time, sort_seq, local_id, '' AS sender_username "
|
||||||
f"FROM {quoted} "
|
f"FROM {quoted} "
|
||||||
"ORDER BY create_time DESC, sort_seq DESC, local_id DESC "
|
"ORDER BY sort_seq DESC, local_id DESC "
|
||||||
"LIMIT 1"
|
"LIMIT 1"
|
||||||
).fetchone()
|
).fetchone()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -1000,7 +1024,7 @@ def _load_latest_message_previews(account_dir: Path, usernames: list[str]) -> di
|
|||||||
create_time = int(r["create_time"] or 0)
|
create_time = int(r["create_time"] or 0)
|
||||||
sort_seq = int(r["sort_seq"] or 0) if r["sort_seq"] is not None else 0
|
sort_seq = int(r["sort_seq"] or 0) if r["sort_seq"] is not None else 0
|
||||||
local_id = int(r["local_id"] or 0)
|
local_id = int(r["local_id"] or 0)
|
||||||
sort_key = (create_time, sort_seq, local_id)
|
sort_key = (sort_seq, local_id, create_time)
|
||||||
|
|
||||||
raw_text = _decode_message_content(r["compress_content"], r["message_content"]).strip()
|
raw_text = _decode_message_content(r["compress_content"], r["message_content"]).strip()
|
||||||
sender_username = _decode_sqlite_text(r["sender_username"]).strip()
|
sender_username = _decode_sqlite_text(r["sender_username"]).strip()
|
||||||
@@ -1087,3 +1111,263 @@ def _load_contact_rows(contact_db_path: Path, usernames: list[str]) -> dict[str,
|
|||||||
return result
|
return result
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _make_search_tokens(q: str) -> list[str]:
|
||||||
|
tokens = [t for t in re.split(r"\s+", str(q or "").strip()) if t]
|
||||||
|
if len(tokens) > 8:
|
||||||
|
tokens = tokens[:8]
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def _make_snippet(text: str, tokens: list[str], *, max_len: int = 90) -> str:
|
||||||
|
s = str(text or "").strip()
|
||||||
|
if not s:
|
||||||
|
return ""
|
||||||
|
if not tokens or max_len <= 0:
|
||||||
|
return s[:max_len]
|
||||||
|
|
||||||
|
lowered = s.lower()
|
||||||
|
best_idx = None
|
||||||
|
best_tok = ""
|
||||||
|
for t in tokens:
|
||||||
|
i = lowered.find(t.lower())
|
||||||
|
if i >= 0 and (best_idx is None or i < best_idx):
|
||||||
|
best_idx = i
|
||||||
|
best_tok = t
|
||||||
|
if best_idx is None:
|
||||||
|
return s[:max_len]
|
||||||
|
|
||||||
|
left = max(0, best_idx - max_len // 2)
|
||||||
|
right = min(len(s), left + max_len)
|
||||||
|
if right - left < max_len and left > 0:
|
||||||
|
left = max(0, right - max_len)
|
||||||
|
out = s[left:right].strip()
|
||||||
|
if left > 0:
|
||||||
|
out = "…" + out
|
||||||
|
if right < len(s):
|
||||||
|
out = out + "…"
|
||||||
|
if best_tok and best_tok not in out:
|
||||||
|
out = s[:max_len].strip()
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _match_tokens(haystack: str, tokens: list[str]) -> bool:
|
||||||
|
if not tokens:
|
||||||
|
return False
|
||||||
|
h = (haystack or "").lower()
|
||||||
|
return all(t.lower() in h for t in tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def _to_char_token_text(s: str) -> str:
|
||||||
|
t = str(s or "").strip()
|
||||||
|
if not t:
|
||||||
|
return ""
|
||||||
|
chars = [ch for ch in t.lower() if not ch.isspace()]
|
||||||
|
return " ".join(chars)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_fts_query(q: str) -> str:
    """Turn a raw query into a match expression of AND-ed, double-quoted phrases.

    Each token from ``_make_search_tokens`` has its double quotes removed and is
    then spelled out character by character (separated by spaces) inside a
    quoted phrase. Tokens that end up empty are dropped; an empty string is
    returned when nothing remains.
    """
    phrases: list[str] = []
    for raw in _make_search_tokens(q):
        # Double quotes delimit phrases in the generated expression, so they
        # are removed from the token itself.
        stripped = str(raw or "").replace('"', "").strip()
        if not stripped:
            continue
        spaced = " ".join(ch for ch in stripped if not ch.isspace()).strip()
        if spaced:
            phrases.append('"' + spaced + '"')
    return " AND ".join(phrases)
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_search_hit(
    r: sqlite3.Row,
    *,
    db_path: Path,
    table_name: str,
    username: str,
    account_dir: Path,
    is_group: bool,
    my_rowid: Optional[int],
) -> dict[str, Any]:
    """Convert one message row into the dictionary used for search results.

    Args:
        r: Message row; the columns ``local_id``, ``server_id``, ``local_type``,
            ``sort_seq``, ``real_sender_id``, ``create_time``,
            ``message_content``, ``compress_content`` and ``sender_username``
            are read.
        db_path: Message database the row came from (only its stem is used).
        table_name: Name of the message table the row came from.
        username: Conversation username the table belongs to.
        account_dir: Account directory; its name is used as the sender of
            messages sent by the account owner.
        is_group: Whether the conversation is a group chat.
        my_rowid: Sender id of the account owner, or ``None`` when unknown.

    Returns:
        A dictionary with identification fields (``id``, ``db``, ``table``,
        ``username``, ``localId``, ``serverId``), ordering fields
        (``createTime``, ``sortSeq``), sender fields and the rendered content
        fields (``renderType``, ``content``, ``title``, ``url``, ``quote*``,
        ``amount``, ``paySubType``, ``transferStatus``, ``voipType``).
    """
    xml_prefixes = ("<", '"<')

    local_id = int(r["local_id"] or 0)
    create_time = int(r["create_time"] or 0)
    sort_seq = int(r["sort_seq"] or 0) if r["sort_seq"] is not None else 0
    local_type = int(r["local_type"] or 0)
    sender_username = _decode_sqlite_text(r["sender_username"]).strip()

    is_sent = False
    if my_rowid is not None:
        try:
            is_sent = int(r["real_sender_id"] or 0) == int(my_rowid)
        except Exception:
            is_sent = False

    raw_text = _decode_message_content(r["compress_content"], r["message_content"]).strip()

    # Group messages: the sender is taken from a prefix on plain-text payloads
    # or from the XML payload itself.
    sender_prefix = ""
    if is_group and raw_text and not raw_text.startswith(xml_prefixes):
        sender_prefix, raw_text = _split_group_sender_prefix(raw_text)
    if is_group and sender_prefix:
        sender_username = sender_prefix
    if is_group and raw_text and raw_text.startswith(xml_prefixes):
        xml_sender = _extract_sender_from_group_xml(raw_text)
        if xml_sender:
            sender_username = xml_sender

    if is_sent:
        sender_username = account_dir.name
    elif (not is_group) and (not sender_username):
        sender_username = username

    render_type = "text"
    content_text = raw_text
    title = ""
    url = ""
    quote_username = ""
    quote_title = ""
    quote_content = ""
    amount = ""
    pay_sub_type = ""
    transfer_status = ""
    voip_type = ""

    if local_type == 10000:
        render_type = "system"
        if "revokemsg" in raw_text:
            content_text = "撤回了一条消息"
        else:
            # Strip markup tags and collapse whitespace.
            content_text = re.sub(r"</?[_a-zA-Z0-9]+[^>]*>", "", raw_text)
            content_text = re.sub(r"\s+", " ", content_text).strip() or "[系统消息]"
    elif local_type == 49:
        parsed = _parse_app_message(raw_text)
        render_type = str(parsed.get("renderType") or "text")
        content_text = str(parsed.get("content") or "")
        title = str(parsed.get("title") or "")
        url = str(parsed.get("url") or "")
        quote_title = str(parsed.get("quoteTitle") or "")
        quote_content = str(parsed.get("quoteContent") or "")
        quote_username = str(parsed.get("quoteUsername") or "")
        amount = str(parsed.get("amount") or "")
        pay_sub_type = str(parsed.get("paySubType") or "")
        if render_type == "transfer":
            transfer_status = _search_hit_transfer_status(
                parsed, is_sent=is_sent, pay_sub_type=pay_sub_type
            )
            if not content_text:
                content_text = transfer_status or "转账"
    elif local_type == 266287972401:
        render_type = "system"
        content_text = "[拍一拍]"
    elif local_type == 244813135921:
        render_type = "quote"
        parsed = _parse_app_message(raw_text)
        content_text = str(parsed.get("content") or "[引用消息]")
        quote_title = str(parsed.get("quoteTitle") or "")
        quote_content = str(parsed.get("quoteContent") or "")
        quote_username = str(parsed.get("quoteUsername") or "")
    elif local_type == 3:
        render_type = "image"
        content_text = "[图片]"
    elif local_type == 34:
        render_type = "voice"
        # NOTE(review): the attribute value is shown with a seconds suffix
        # as-is; confirm the unit of `voicelength`.
        duration = _extract_xml_attr(raw_text, "voicelength")
        content_text = f"[语音 {duration}秒]" if duration else "[语音]"
    elif local_type == 43 or local_type == 62:
        render_type = "video"
        content_text = "[视频]"
    elif local_type == 47:
        render_type = "emoji"
        content_text = "[表情]"
    elif local_type == 50:
        render_type = "voip"
        try:
            block = raw_text
            m_voip = re.search(
                r"(<VoIPBubbleMsg[^>]*>.*?</VoIPBubbleMsg>)",
                raw_text,
                flags=re.IGNORECASE | re.DOTALL,
            )
            if m_voip:
                block = m_voip.group(1) or raw_text
            room_type = str(_extract_xml_tag_text(block, "room_type") or "").strip()
            if room_type == "0":
                voip_type = "video"
            elif room_type == "1":
                voip_type = "audio"
            voip_msg = str(_extract_xml_tag_text(block, "msg") or "").strip()
            content_text = voip_msg or "通话"
        except Exception:
            content_text = "通话"
    elif local_type != 1:
        if not content_text:
            content_text = _infer_message_brief_by_local_type(local_type)
    else:
        # Plain-text type whose payload is nevertheless XML.
        if content_text.startswith(xml_prefixes):
            have_parsed_content = False
            if "<appmsg" in content_text.lower():
                parsed = _parse_app_message(content_text)
                rt = str(parsed.get("renderType") or "")
                if rt and rt != "text":
                    render_type = rt
                    parsed_content = str(parsed.get("content") or "")
                    if parsed_content:
                        content_text = parsed_content
                        have_parsed_content = True
                    title = str(parsed.get("title") or title)
                    url = str(parsed.get("url") or url)
                    quote_title = str(parsed.get("quoteTitle") or quote_title)
                    quote_content = str(parsed.get("quoteContent") or quote_content)
                    amount = str(parsed.get("amount") or amount)
                    pay_sub_type = str(parsed.get("paySubType") or pay_sub_type)
                    quote_username = str(parsed.get("quoteUsername") or quote_username)

                    if render_type == "transfer":
                        transfer_status = _search_hit_transfer_status(
                            parsed, is_sent=is_sent, pay_sub_type=pay_sub_type
                        )
                        if not content_text:
                            content_text = transfer_status or "转账"
            # Only fall back to the raw <title>/<des> tags when the app-message
            # parser did not already supply content. Running the fallback on
            # parsed (non-XML) content finds no tags and would replace the
            # parsed text with the generic brief.
            if not have_parsed_content:
                t = _extract_xml_tag_text(content_text, "title")
                d = _extract_xml_tag_text(content_text, "des")
                content_text = t or d or _infer_message_brief_by_local_type(local_type)

    if not content_text:
        content_text = _infer_message_brief_by_local_type(local_type)

    return {
        "id": f"{db_path.stem}:{table_name}:{local_id}",
        "db": str(db_path.stem),
        "table": str(table_name),
        "username": str(username),
        "localId": local_id,
        "serverId": int(r["server_id"] or 0),
        "type": local_type,
        "createTime": create_time,
        "sortSeq": sort_seq,
        "senderUsername": sender_username,
        "isSent": bool(is_sent),
        "renderType": render_type,
        "content": content_text,
        "title": title,
        "url": url,
        "quoteUsername": quote_username,
        "quoteTitle": quote_title,
        "quoteContent": quote_content,
        "amount": amount,
        "paySubType": pay_sub_type,
        "transferStatus": transfer_status,
        "voipType": voip_type,
    }


def _search_hit_transfer_status(parsed: dict[str, Any], *, is_sent: bool, pay_sub_type: str) -> str:
    """Derive the transfer status text from the fields of a parsed transfer message."""
    return _infer_transfer_status_text(
        is_sent=is_sent,
        paysubtype=pay_sub_type,
        receivestatus=str(parsed.get("receiveStatus") or ""),
        sendertitle=str(parsed.get("senderTitle") or ""),
        receivertitle=str(parsed.get("receiverTitle") or ""),
        senderdes=str(parsed.get("senderDes") or ""),
        receiverdes=str(parsed.get("receiverDes") or ""),
    )
|
||||||
|
|||||||
478
src/wechat_decrypt_tool/chat_search_index.py
Normal file
478
src/wechat_decrypt_tool/chat_search_index.py
Normal file
@@ -0,0 +1,478 @@
|
|||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from .chat_helpers import (
|
||||||
|
_decode_sqlite_text,
|
||||||
|
_quote_ident,
|
||||||
|
_resolve_msg_table_name_by_map,
|
||||||
|
_row_to_search_hit,
|
||||||
|
_should_keep_session,
|
||||||
|
_to_char_token_text,
|
||||||
|
_iter_message_db_paths,
|
||||||
|
)
|
||||||
|
from .logging_config import get_logger
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
_SCHEMA_VERSION = 1
|
||||||
|
_INDEX_DB_NAME = "chat_search_index.db"
|
||||||
|
_INDEX_DB_TMP_NAME = "chat_search_index.tmp.db"
|
||||||
|
_LEGACY_INDEX_DB_NAME = "message_fts.db"
|
||||||
|
|
||||||
|
_BUILD_LOCK = threading.Lock()
|
||||||
|
_BUILD_STATE: dict[str, dict[str, Any]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _account_key(account_dir: Path) -> str:
|
||||||
|
return str(account_dir.name)
|
||||||
|
|
||||||
|
|
||||||
|
def _index_db_path(account_dir: Path) -> Path:
    """Return the location of the finished search index inside ``account_dir``."""
    return account_dir.joinpath(_INDEX_DB_NAME)
|
||||||
|
|
||||||
|
|
||||||
|
def _index_db_tmp_path(account_dir: Path) -> Path:
    """Return the location of the temporary index file used while building."""
    return account_dir.joinpath(_INDEX_DB_TMP_NAME)
|
||||||
|
|
||||||
|
|
||||||
|
def get_chat_search_index_db_path(account_dir: Path) -> Path:
    """Pick the index file to use for an account.

    Preferred index file: {account}/chat_search_index.db
    Legacy (older builds): {account}/message_fts.db, used only when the
    preferred file is absent and the legacy file contains both the FTS table
    and the meta table. In every other case the preferred path is returned,
    whether or not it exists.
    """
    preferred = account_dir / _INDEX_DB_NAME
    legacy = account_dir / _LEGACY_INDEX_DB_NAME
    if preferred.exists() or not legacy.exists():
        return preferred

    info = _inspect_index(legacy)
    looks_like_ours = bool(info.get("hasFtsTable")) and bool(info.get("hasMetaTable"))
    return legacy if looks_like_ours else preferred
|
||||||
|
|
||||||
|
|
||||||
|
def _read_meta(index_path: Path) -> dict[str, str]:
|
||||||
|
if not index_path.exists():
|
||||||
|
return {}
|
||||||
|
conn = sqlite3.connect(str(index_path))
|
||||||
|
try:
|
||||||
|
rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='meta'").fetchall()
|
||||||
|
if not rows:
|
||||||
|
return {}
|
||||||
|
out: dict[str, str] = {}
|
||||||
|
for k, v in conn.execute("SELECT key, value FROM meta").fetchall():
|
||||||
|
if k is None:
|
||||||
|
continue
|
||||||
|
out[str(k)] = "" if v is None else str(v)
|
||||||
|
return out
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _inspect_index(index_path: Path) -> dict[str, Any]:
|
||||||
|
if not index_path.exists():
|
||||||
|
return {
|
||||||
|
"exists": False,
|
||||||
|
"ready": False,
|
||||||
|
"hasFtsTable": False,
|
||||||
|
"hasMetaTable": False,
|
||||||
|
"schemaVersion": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
conn = sqlite3.connect(str(index_path))
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
|
||||||
|
except Exception:
|
||||||
|
rows = []
|
||||||
|
names = {str(r[0]).lower() for r in rows if r and r[0]}
|
||||||
|
|
||||||
|
has_meta = "meta" in names
|
||||||
|
has_fts = "message_fts" in names
|
||||||
|
|
||||||
|
schema_version: Optional[int] = None
|
||||||
|
if has_meta:
|
||||||
|
try:
|
||||||
|
r = conn.execute("SELECT value FROM meta WHERE key='schema_version' LIMIT 1").fetchone()
|
||||||
|
if r and r[0] is not None:
|
||||||
|
schema_version = int(str(r[0]).strip() or "0")
|
||||||
|
except Exception:
|
||||||
|
schema_version = None
|
||||||
|
|
||||||
|
ready = bool(has_fts and (schema_version is None or schema_version >= _SCHEMA_VERSION))
|
||||||
|
|
||||||
|
return {
|
||||||
|
"exists": True,
|
||||||
|
"ready": ready,
|
||||||
|
"hasFtsTable": bool(has_fts),
|
||||||
|
"hasMetaTable": bool(has_meta),
|
||||||
|
"schemaVersion": schema_version,
|
||||||
|
}
|
||||||
|
except Exception:
|
||||||
|
return {
|
||||||
|
"exists": True,
|
||||||
|
"ready": False,
|
||||||
|
"hasFtsTable": False,
|
||||||
|
"hasMetaTable": False,
|
||||||
|
"schemaVersion": None,
|
||||||
|
}
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_chat_search_index_status(account_dir: Path) -> dict[str, Any]:
    """Report the on-disk state of an account's search index plus its build progress.

    The result combines the inspection of the index file, its ``meta`` table
    and a snapshot of the in-memory build state for the account.
    """
    key = _account_key(account_dir)
    index_path = get_chat_search_index_db_path(account_dir)
    inspection = _inspect_index(index_path)
    meta = _read_meta(index_path)

    # Copy the state while holding the lock so the caller never sees a dict
    # that the build worker is still mutating.
    with _BUILD_LOCK:
        build_state = dict(_BUILD_STATE.get(key) or {})

    index_info = {
        "path": str(index_path),
        "exists": bool(inspection.get("exists")),
        "ready": bool(inspection.get("ready")),
        "hasFtsTable": bool(inspection.get("hasFtsTable")),
        "hasMetaTable": bool(inspection.get("hasMetaTable")),
        "schemaVersion": inspection.get("schemaVersion"),
        "meta": meta,
        "build": build_state,
    }
    return {
        "status": "success",
        "account": account_dir.name,
        "index": index_info,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def start_chat_search_index_build(account_dir: Path, *, rebuild: bool = False) -> dict[str, Any]:
    """Start a background index build for an account unless one is already running.

    Args:
        account_dir: Account directory whose messages should be indexed.
        rebuild: Recorded in the build state and forwarded to the worker.

    Returns:
        The current index status (same shape as
        ``get_chat_search_index_status``), whether or not a new build was
        started.
    """
    key = _account_key(account_dir)
    now = int(time.time())

    with _BUILD_LOCK:
        current = _BUILD_STATE.get(key)
        already_building = bool(current and current.get("status") == "building")
        if not already_building:
            _BUILD_STATE[key] = {
                "status": "building",
                "rebuild": bool(rebuild),
                "startedAt": now,
                "finishedAt": None,
                "indexedMessages": 0,
                "currentDb": "",
                "currentConversation": "",
                "error": "",
            }

    # The status helper acquires _BUILD_LOCK itself and threading.Lock is not
    # reentrant, so it must only be called after the lock has been released;
    # calling it inside the `with` block would block forever.
    if not already_building:
        worker = threading.Thread(
            target=_build_worker,
            args=(account_dir, bool(rebuild)),
            daemon=True,
            name=f"chat-search-index:{key}",
        )
        worker.start()
    return get_chat_search_index_status(account_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def _update_build_state(account_key: str, **kwargs: Any) -> None:
    """Merge ``kwargs`` into the build state of ``account_key``.

    Does nothing when the account has no (or an empty) build state.
    """
    with _BUILD_LOCK:
        state = _BUILD_STATE.get(account_key)
        if state:
            state.update(kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_sessions_for_index(account_dir: Path) -> dict[str, dict[str, Any]]:
|
||||||
|
session_db_path = account_dir / "session.db"
|
||||||
|
if not session_db_path.exists():
|
||||||
|
return {}
|
||||||
|
|
||||||
|
conn = sqlite3.connect(str(session_db_path))
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
try:
|
||||||
|
rows = conn.execute("SELECT username, is_hidden FROM SessionTable").fetchall()
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
out: dict[str, dict[str, Any]] = {}
|
||||||
|
for r in rows:
|
||||||
|
u = str(r["username"] or "").strip()
|
||||||
|
if not u:
|
||||||
|
continue
|
||||||
|
if not _should_keep_session(u, include_official=True):
|
||||||
|
continue
|
||||||
|
out[u] = {
|
||||||
|
"is_hidden": 1 if int(r["is_hidden"] or 0) == 1 else 0,
|
||||||
|
"is_official": 1 if u.startswith("gh_") else 0,
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _init_index_db(conn: sqlite3.Connection) -> None:
    """Apply build-time pragmas, create the index tables and record the schema version."""
    # NOTE: This index DB is built as a temporary file and then atomically swapped in.
    # Using WAL here would create `-wal/-shm` side files that are *not* swapped together,
    # which can lead to a final DB missing schema/data (e.g. "no such table: message_fts").
    for pragma in (
        "PRAGMA journal_mode=DELETE",
        "PRAGMA synchronous=OFF",
        "PRAGMA temp_store=MEMORY",
    ):
        conn.execute(pragma)

    conn.execute("CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT NOT NULL)")
    # Only `text` is searchable; every other column is stored as-is so a hit
    # can be mapped back to its source row.
    conn.execute(
        """
        CREATE VIRTUAL TABLE IF NOT EXISTS message_fts USING fts5(
            text,
            username UNINDEXED,
            render_type UNINDEXED,
            create_time UNINDEXED,
            sort_seq UNINDEXED,
            local_id UNINDEXED,
            server_id UNINDEXED,
            local_type UNINDEXED,
            db_stem UNINDEXED,
            table_name UNINDEXED,
            sender_username UNINDEXED,
            is_hidden UNINDEXED,
            is_official UNINDEXED,
            tokenize='unicode61'
        )
        """
    )
    upsert_meta = (
        "INSERT INTO meta(key, value) VALUES(?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
    )
    conn.execute(upsert_meta, ("schema_version", str(_SCHEMA_VERSION)))
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_begin(conn: sqlite3.Connection) -> None:
|
||||||
|
try:
|
||||||
|
if not conn.in_transaction:
|
||||||
|
conn.execute("BEGIN")
|
||||||
|
except sqlite3.OperationalError as e:
|
||||||
|
# Some environments may report `in_transaction` inconsistently; avoid hard failing on nested BEGIN.
|
||||||
|
if "within a transaction" in str(e).lower():
|
||||||
|
return
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
def _build_worker(account_dir: Path, rebuild: bool) -> None:
    """Build the search index for an account into a temp file and swap it into place.

    Every message table that belongs to a known session, in every message
    database of the account, is converted row by row into search hits whose
    text fields are written to the FTS table. Progress and the final outcome
    ("ready" or "error") are published through ``_update_build_state``; the
    function itself never raises.

    Args:
        account_dir: Account directory containing ``session.db`` and the
            message databases.
        rebuild: Accepted for interface compatibility; the finished temp file
            always replaces the final index, whether or not one existed.
    """
    key = _account_key(account_dir)
    started = time.time()
    tmp_path = _index_db_tmp_path(account_dir)
    final_path = _index_db_path(account_dir)

    batch_size = 1000
    commit_every = 20000

    try:
        # A leftover temp DB from an interrupted build must be gone before we
        # start: the schema is created with IF NOT EXISTS, so building on top
        # of stale contents would duplicate rows. A failure to remove it is
        # therefore reported as a build error instead of being ignored.
        tmp_path.unlink(missing_ok=True)

        sessions = _load_sessions_for_index(account_dir)
        if not sessions:
            raise RuntimeError("No sessions found (session.db empty or missing).")

        db_paths = _iter_message_db_paths(account_dir)
        if not db_paths:
            raise RuntimeError("No message databases found for this account.")

        conn_fts = sqlite3.connect(str(tmp_path))
        conn_fts.isolation_level = None  # manual transaction control (prevents implicit BEGIN)
        try:
            _init_index_db(conn_fts)
            try:
                conn_fts.commit()
            except Exception:
                # Best effort, as in the original: nothing may be pending here.
                pass

            insert_sql = (
                "INSERT INTO message_fts("
                "text, username, render_type, create_time, sort_seq, local_id, server_id, local_type, "
                "db_stem, table_name, sender_username, is_hidden, is_official"
                ") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            )

            batch: list[tuple[Any, ...]] = []
            indexed = 0

            def flush() -> None:
                """Write the pending batch and publish the new message count."""
                nonlocal indexed
                conn_fts.executemany(insert_sql, batch)
                indexed += len(batch)
                batch.clear()
                _update_build_state(key, indexedMessages=int(indexed))

            _safe_begin(conn_fts)

            for db_path in db_paths:
                _update_build_state(key, currentDb=str(db_path.name))
                msg_conn = sqlite3.connect(str(db_path))
                msg_conn.row_factory = sqlite3.Row
                # Text columns are returned as raw bytes and decoded by the helpers.
                msg_conn.text_factory = bytes
                try:
                    lower_to_actual = _list_msg_tables(msg_conn)
                    my_rowid = _find_own_rowid(msg_conn, account_dir.name)

                    for conv_username, sess_info in sessions.items():
                        _update_build_state(key, currentConversation=str(conv_username))
                        table_name = _resolve_msg_table_name_by_map(lower_to_actual, conv_username)
                        if not table_name:
                            continue

                        is_group = bool(conv_username.endswith("@chatroom"))
                        quoted_table = _quote_ident(table_name)

                        sql_with_join = (
                            "SELECT "
                            "m.local_id, m.server_id, m.local_type, m.sort_seq, m.real_sender_id, m.create_time, "
                            "m.message_content, m.compress_content, n.user_name AS sender_username "
                            f"FROM {quoted_table} m "
                            "LEFT JOIN Name2Id n ON m.real_sender_id = n.rowid"
                        )
                        sql_no_join = (
                            "SELECT "
                            "m.local_id, m.server_id, m.local_type, m.sort_seq, m.real_sender_id, m.create_time, "
                            "m.message_content, m.compress_content, '' AS sender_username "
                            f"FROM {quoted_table} m "
                        )

                        try:
                            cursor = msg_conn.execute(sql_with_join)
                        except Exception:
                            # Retry without the sender-name join.
                            cursor = msg_conn.execute(sql_no_join)

                        for r in cursor:
                            try:
                                hit = _row_to_search_hit(
                                    r,
                                    db_path=db_path,
                                    table_name=table_name,
                                    username=conv_username,
                                    account_dir=account_dir,
                                    is_group=is_group,
                                    my_rowid=my_rowid,
                                )
                            except Exception:
                                # One unreadable row must not abort the whole build.
                                continue

                            fts_row = _search_hit_to_fts_row(
                                hit,
                                conv_username=conv_username,
                                db_path=db_path,
                                table_name=table_name,
                                sess_info=sess_info,
                            )
                            if fts_row is None:
                                continue
                            batch.append(fts_row)

                            if len(batch) >= batch_size:
                                flush()
                                # Commit periodically so a single transaction
                                # does not grow without bound.
                                if indexed % commit_every == 0:
                                    conn_fts.commit()
                                    _safe_begin(conn_fts)
                finally:
                    msg_conn.close()

            if batch:
                flush()
            conn_fts.commit()

            finished_at = int(time.time())
            upsert_meta = (
                "INSERT INTO meta(key, value) VALUES(?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
            )
            conn_fts.execute(upsert_meta, ("built_at", str(finished_at)))
            conn_fts.execute(upsert_meta, ("message_count", str(indexed)))
            conn_fts.commit()
        finally:
            conn_fts.close()

        # Swap the finished file into place. A failure here falls through to
        # the handler below, which removes the temp file and records the error.
        os.replace(str(tmp_path), str(final_path))

        duration = max(0.0, time.time() - started)
        _update_build_state(
            key,
            status="ready",
            finishedAt=int(time.time()),
            currentDb="",
            currentConversation="",
            error="",
            durationSec=round(duration, 3),
        )
    except Exception as e:
        logger.exception("Failed to build chat search index")
        try:
            if tmp_path.exists():
                tmp_path.unlink()
        except Exception:
            pass
        _update_build_state(
            key,
            status="error",
            finishedAt=int(time.time()),
            error=str(e),
        )


def _list_msg_tables(msg_conn: sqlite3.Connection) -> dict[str, str]:
    """Map lower-cased table names of a message database to their actual names."""
    try:
        rows = msg_conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
        lower_to_actual: dict[str, str] = {}
        for row in rows:
            if not row or row[0] is None:
                continue
            name = _decode_sqlite_text(row[0]).strip()
            if name:
                lower_to_actual[name.lower()] = name
        return lower_to_actual
    except Exception:
        return {}


def _find_own_rowid(msg_conn: sqlite3.Connection, account_name: str) -> Optional[int]:
    """Return the ``Name2Id`` rowid of ``account_name``, or ``None`` when it cannot be found."""
    try:
        row = msg_conn.execute(
            "SELECT rowid FROM Name2Id WHERE user_name = ? LIMIT 1",
            (account_name,),
        ).fetchone()
        if row is not None and row[0] is not None:
            return int(row[0])
    except Exception:
        return None
    return None


def _search_hit_to_fts_row(
    hit: dict[str, Any],
    *,
    conv_username: str,
    db_path: Path,
    table_name: str,
    sess_info: dict[str, Any],
) -> Optional[tuple[Any, ...]]:
    """Build the ``message_fts`` row for a search hit, or ``None`` when it has no text."""
    text_fields = ("content", "title", "url", "quoteTitle", "quoteContent", "amount")
    parts = [str(hit.get(name) or "") for name in text_fields]
    haystack = "\n".join(part for part in parts if part.strip())
    if not haystack.strip():
        return None

    token_text = _to_char_token_text(haystack)
    if not token_text:
        return None

    return (
        token_text,
        conv_username,
        str(hit.get("renderType") or ""),
        int(hit.get("createTime") or 0),
        int(hit.get("sortSeq") or 0),
        int(hit.get("localId") or 0),
        int(hit.get("serverId") or 0),
        int(hit.get("type") or 0),
        str(db_path.stem),
        str(table_name),
        str(hit.get("senderUsername") or ""),
        int(sess_info.get("is_hidden") or 0),
        int(sess_info.get("is_official") or 0),
    )
|
||||||
File diff suppressed because it is too large
Load Diff
574
src/wechat_decrypt_tool/session_preview_index.py
Normal file
574
src/wechat_decrypt_tool/session_preview_index.py
Normal file
@@ -0,0 +1,574 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from .chat_helpers import (
|
||||||
|
_build_latest_message_preview,
|
||||||
|
_decode_message_content,
|
||||||
|
_decode_sqlite_text,
|
||||||
|
_infer_last_message_brief,
|
||||||
|
_is_mostly_printable_text,
|
||||||
|
_iter_message_db_paths,
|
||||||
|
_quote_ident,
|
||||||
|
_should_keep_session,
|
||||||
|
)
|
||||||
|
from .logging_config import get_logger
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
# Layout version of the index database; written to the `meta` table as "schema_version".
_SCHEMA_VERSION = 1
# File name of the finished index inside an account directory.
_INDEX_DB_NAME = "session_preview.db"
# Scratch file the index is built into before it is moved over the final name.
_INDEX_DB_TMP_NAME = "session_preview.tmp.db"

# Message table names: a "msg_" or "chat_" prefix followed by 32 hex digits
# (the hex part is compared against the MD5 of a session username).
_TABLE_NAME_RE = re.compile(r"^(msg_|chat_)([0-9a-f]{32})", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_preview_index_db_path(account_dir: Path) -> Path:
    """Return the path of the session preview index database inside *account_dir*."""
    return account_dir / _INDEX_DB_NAME
|
||||||
|
|
||||||
|
|
||||||
|
def _index_db_tmp_path(account_dir: Path) -> Path:
    """Return the path of the temporary database used while the index is being built."""
    return account_dir / _INDEX_DB_TMP_NAME
|
||||||
|
|
||||||
|
|
||||||
|
def _file_sig(path: Path) -> tuple[str, int, int]:
    """Return ``(file name, size in bytes, mtime in nanoseconds)`` for *path*.

    Raises:
        OSError: if *path* cannot be stat'ed (for example, it does not exist).
    """
    st = path.stat()
    # Use the exact nanosecond field when present; derive it from the float
    # mtime only if the attribute is missing.
    mtime_ns = getattr(st, "st_mtime_ns", None)
    if mtime_ns is None:
        mtime_ns = int(st.st_mtime * 1_000_000_000)
    return (path.name, int(st.st_size), int(mtime_ns))
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_source_fingerprint(account_dir: Path) -> dict[str, Any]:
    """Compute a stable fingerprint for the current decrypted data set.

    The fingerprint is the SHA-256 of the sorted ``(name, size, mtime_ns)``
    signatures of ``session.db`` and every message database returned by
    ``_iter_message_db_paths``.

    Returns:
        A dict with ``fingerprint`` (hex digest), ``files`` (the sorted
        signatures) and ``dbCount`` (number of message database paths,
        whether or not each one could be stat'ed).
    """
    msg_paths = _iter_message_db_paths(account_dir)

    items: list[tuple[str, int, int]] = []
    for candidate in (account_dir / "session.db", *msg_paths):
        try:
            if candidate.exists():
                items.append(_file_sig(candidate))
        except (OSError, ValueError):
            # Best effort: a file that cannot be stat'ed simply does not contribute.
            continue

    # Sort so the digest does not depend on directory enumeration order.
    items.sort()
    payload = json.dumps(items, ensure_ascii=False, separators=(",", ":")).encode("utf-8", errors="ignore")
    return {
        "fingerprint": hashlib.sha256(payload).hexdigest(),
        "files": items,
        "dbCount": len(msg_paths),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _inspect_index(index_path: Path) -> dict[str, Any]:
    """Report whether the index database at *index_path* exists and is usable.

    The index counts as ready when it contains a ``session_preview`` table and
    its recorded schema version is either absent or at least ``_SCHEMA_VERSION``.

    Returns:
        A dict with ``exists``, ``ready``, ``schemaVersion``, ``hasMetaTable``
        and ``hasPreviewTable``. Any failure to open or read the file is
        reported as an existing but not-ready index; nothing is raised.
    """
    if not index_path.exists():
        return {
            "exists": False,
            "ready": False,
            "schemaVersion": None,
            "hasMetaTable": False,
            "hasPreviewTable": False,
        }

    conn = None
    try:
        # Opened inside the try: a path that exists but cannot be opened as a
        # database (e.g. a directory) must yield "not ready", not an exception.
        conn = sqlite3.connect(str(index_path))
        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
        names = {str(r[0]).lower() for r in rows if r and r[0]}
        has_meta = "meta" in names
        has_preview = "session_preview" in names

        schema_version: Optional[int] = None
        if has_meta:
            try:
                r = conn.execute("SELECT value FROM meta WHERE key='schema_version' LIMIT 1").fetchone()
                if r and r[0] is not None:
                    schema_version = int(str(r[0]).strip() or "0")
            except Exception:
                # An unreadable or non-numeric version is treated as "unknown".
                schema_version = None

        ready = bool(has_preview and (schema_version is None or schema_version >= _SCHEMA_VERSION))

        return {
            "exists": True,
            "ready": ready,
            "schemaVersion": schema_version,
            "hasMetaTable": bool(has_meta),
            "hasPreviewTable": bool(has_preview),
        }
    except Exception:
        return {
            "exists": True,
            "ready": False,
            "schemaVersion": None,
            "hasMetaTable": False,
            "hasPreviewTable": False,
        }
    finally:
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_session_preview_index_status(account_dir: Path) -> dict[str, Any]:
    """Describe the state of the session preview index for *account_dir*.

    When the index is ready, its ``meta`` table is loaded and the stored
    ``source_fingerprint`` is compared with a freshly computed one. The index
    is flagged stale only when both fingerprints are non-empty and differ.

    Returns:
        ``{"status": "success", "account": ..., "index": {...}}`` where the
        ``index`` dict carries ``path``, ``exists``, ``ready``, ``stale``,
        ``needsRebuild``, ``schemaVersion``, ``meta`` and ``current``.
    """
    index_path = get_session_preview_index_db_path(account_dir)
    inspected = _inspect_index(index_path)
    ready = bool(inspected.get("ready"))

    meta: dict[str, str] = {}
    current: dict[str, Any] = {}
    stale = False

    if ready:
        conn = None
        try:
            conn = sqlite3.connect(str(index_path))
            rows = conn.execute("SELECT key, value FROM meta").fetchall()
            meta = {str(k): ("" if v is None else str(v)) for k, v in rows if k is not None}
        except Exception:
            # A missing or unreadable meta table leaves meta empty, which
            # disables the staleness check below.
            meta = {}
        finally:
            if conn is not None:
                conn.close()

        current = _compute_source_fingerprint(account_dir)
        expected = str(meta.get("source_fingerprint") or "").strip()
        actual = str(current.get("fingerprint") or "").strip()
        stale = bool(expected and actual and expected != actual)

    return {
        "status": "success",
        "account": account_dir.name,
        "index": {
            "path": str(index_path),
            "exists": bool(inspected.get("exists")),
            "ready": ready,
            "stale": bool(stale),
            "needsRebuild": (not ready) or bool(stale),
            "schemaVersion": inspected.get("schemaVersion"),
            "meta": meta,
            "current": current,
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def load_session_previews(account_dir: Path, usernames: list[str]) -> dict[str, str]:
    """Look up stored previews for *usernames* in the session preview index.

    Args:
        account_dir: Account directory that holds the index database.
        usernames: Session usernames to look up; blanks and duplicates are ignored.

    Returns:
        A mapping of username to preview text for every username found in the
        index. Empty when no usable usernames were given, or when the index is
        not ready or is stale.
    """
    # Normalize first: with nothing to look up there is no reason to pay for
    # the status check (file stats plus a database open).
    stripped = (str(u or "").strip() for u in (usernames or []))
    uniq = list(dict.fromkeys(u for u in stripped if u))
    if not uniq:
        return {}

    status = get_session_preview_index_status(account_dir)
    index = dict(status.get("index") or {})
    if not bool(index.get("ready")) or bool(index.get("stale")):
        return {}

    index_path = get_session_preview_index_db_path(account_dir)

    out: dict[str, str] = {}
    conn = sqlite3.connect(str(index_path))
    conn.row_factory = sqlite3.Row
    try:
        chunk_size = 900  # SQLite's default bound-variable limit is commonly 999
        for start in range(0, len(uniq), chunk_size):
            chunk = uniq[start : start + chunk_size]
            placeholders = ",".join(["?"] * len(chunk))
            rows = conn.execute(
                f"SELECT username, preview FROM session_preview WHERE username IN ({placeholders})",
                chunk,
            ).fetchall()
            for r in rows:
                u = str(r["username"] or "").strip()
                if not u:
                    continue
                out[u] = str(r["preview"] or "")
        return out
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _init_index_db(conn: sqlite3.Connection) -> None:
    """Create the index schema on *conn* and record the schema version.

    Sets the journal/synchronous/temp-store pragmas, creates the ``meta`` and
    ``session_preview`` tables if they are missing, and upserts
    ``schema_version`` into ``meta``. The caller is responsible for committing.
    """
    conn.execute("PRAGMA journal_mode=DELETE")
    conn.execute("PRAGMA synchronous=OFF")
    conn.execute("PRAGMA temp_store=MEMORY")

    conn.execute("CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT NOT NULL)")
    conn.execute(
        """
        CREATE TABLE IF NOT EXISTS session_preview (
            username TEXT PRIMARY KEY,
            sort_seq INTEGER NOT NULL DEFAULT 0,
            local_id INTEGER NOT NULL DEFAULT 0,
            create_time INTEGER NOT NULL DEFAULT 0,
            local_type INTEGER NOT NULL DEFAULT 0,
            sender_username TEXT NOT NULL DEFAULT '',
            preview TEXT NOT NULL DEFAULT '',
            db_stem TEXT NOT NULL DEFAULT '',
            table_name TEXT NOT NULL DEFAULT ''
        )
        """
    )
    conn.execute(
        "INSERT INTO meta(key, value) VALUES(?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
        ("schema_version", str(_SCHEMA_VERSION)),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_session_preview_index(
    account_dir: Path,
    *,
    rebuild: bool = False,
    include_hidden: bool = True,
    include_official: bool = True,
) -> dict[str, Any]:
    """
    Build a per-account session preview index as `{account}/session_preview.db`.

    Why: `message_*.db` tables do NOT index `create_time`, so `ORDER BY create_time DESC LIMIT 1`
    is extremely slow when done per-session at runtime. This index shifts that work to a one-time build.

    Steps:
      1. Read the sessions from `session.db` and filter them.
      2. For each message database, find the table whose name carries the MD5 of a session
         username and take its newest row (by `sort_seq`, then `local_id`).
      3. Across databases keep, per username, the row with the greatest
         `(sort_seq, local_id, create_time)` that produced a non-empty preview.
      4. For sessions without any such row, fall back to draft / summary / inferred brief
         taken from `session.db`.
      5. Write everything into a temporary database and move it over the final path.

    Args:
        account_dir: Directory containing the decrypted databases of one account.
        rebuild: Accepted for interface compatibility; it is not read here, the index is
            always rebuilt from scratch.
        include_hidden: When False, sessions whose `is_hidden` is 1 are skipped.
        include_official: Forwarded to `_should_keep_session`.

    Returns:
        A dict with `status` ("success" or "error") and `account`. On success it also carries
        `indexed`, `path` and, when an index was written, `durationSec`; on error a `message`.
        Failures while writing the index are logged and returned as an error dict; failures
        while reading the source databases propagate to the caller.
    """

    account_dir = Path(account_dir)
    session_db_path = account_dir / "session.db"
    if not session_db_path.exists():
        return {
            "status": "error",
            "account": account_dir.name,
            "message": "session.db not found.",
        }

    db_paths = _iter_message_db_paths(account_dir)
    if not db_paths:
        return {
            "status": "error",
            "account": account_dir.name,
            "message": "No message databases found.",
        }

    started = time.time()
    logger.info(f"[session_preview] build start account={account_dir.name} dbs={len(db_paths)}")

    sconn = sqlite3.connect(str(session_db_path))
    sconn.row_factory = sqlite3.Row
    try:
        srows = sconn.execute(
            """
            SELECT username, is_hidden, summary, draft, last_msg_type, last_msg_sub_type, sort_timestamp, last_timestamp
            FROM SessionTable
            ORDER BY sort_timestamp DESC
            """
        ).fetchall()
    finally:
        sconn.close()

    sessions: list[sqlite3.Row] = []
    usernames: list[str] = []
    for r in srows:
        u = str(r["username"] or "").strip()
        if not u:
            continue
        if not include_hidden and int(r["is_hidden"] or 0) == 1:
            continue
        if not _should_keep_session(u, include_official=bool(include_official)):
            continue
        sessions.append(r)
        usernames.append(u)

    if not usernames:
        return {
            "status": "success",
            "account": account_dir.name,
            "message": "No sessions to index.",
            "indexed": 0,
            "path": str(get_session_preview_index_db_path(account_dir)),
        }

    # Message tables are named after the MD5 of the session username.
    md5_to_users: dict[str, list[str]] = {}
    for u in usernames:
        h = hashlib.md5(u.encode("utf-8")).hexdigest()
        md5_to_users.setdefault(h, []).append(u)

    # username -> (sort key, record) of the newest usable message seen so far.
    best: dict[str, tuple[tuple[int, int, int], dict[str, Any]]] = {}

    for db_path in db_paths:
        conn = sqlite3.connect(str(db_path))
        conn.row_factory = sqlite3.Row
        # Text columns come back as raw bytes and are decoded by the helpers below.
        conn.text_factory = bytes
        try:
            trows = conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
            md5_to_table: dict[str, str] = {}
            for tr in trows:
                if not tr or tr[0] is None:
                    continue
                name = _decode_sqlite_text(tr[0]).strip()
                if not name:
                    continue
                m = _TABLE_NAME_RE.match(name.lower())
                if not m:
                    continue
                md5_hex = str(m.group(2) or "").lower()
                if md5_hex not in md5_to_users:
                    continue
                prefix = str(m.group(1) or "").lower()
                # When both prefixes exist for one hash, the "msg_" table wins.
                if md5_hex not in md5_to_table or prefix == "msg_":
                    md5_to_table[md5_hex] = name

            if not md5_to_table:
                continue

            for md5_hex, table_name in md5_to_table.items():
                users = md5_to_users.get(md5_hex) or []
                if not users:
                    continue

                quoted = _quote_ident(table_name)

                row = None
                try:
                    row = conn.execute(
                        "SELECT "
                        "m.local_id, m.local_type, m.sort_seq, m.create_time, "
                        "m.message_content, m.compress_content, n.user_name AS sender_username "
                        f"FROM {quoted} m "
                        "LEFT JOIN Name2Id n ON m.real_sender_id = n.rowid "
                        "ORDER BY m.sort_seq DESC, m.local_id DESC "
                        "LIMIT 1"
                    ).fetchone()
                except Exception:
                    # Retry without the sender join when the joined query fails.
                    try:
                        row = conn.execute(
                            "SELECT "
                            "local_id, local_type, sort_seq, create_time, "
                            "message_content, compress_content, '' AS sender_username "
                            f"FROM {quoted} "
                            "ORDER BY sort_seq DESC, local_id DESC "
                            "LIMIT 1"
                        ).fetchone()
                    except Exception:
                        row = None

                if row is None:
                    continue

                try:
                    sort_seq = int(row["sort_seq"] or 0)
                except Exception:
                    sort_seq = 0
                try:
                    local_id = int(row["local_id"] or 0)
                except Exception:
                    local_id = 0
                try:
                    create_time = int(row["create_time"] or 0)
                except Exception:
                    create_time = 0
                # Parsed once, with the same 0 fallback as the fields above.
                try:
                    local_type = int(row["local_type"] or 0)
                except Exception:
                    local_type = 0
                sort_key = (sort_seq, local_id, create_time)

                raw_text = _decode_message_content(row["compress_content"], row["message_content"]).strip()
                if raw_text and not raw_text.startswith(("<", '"<')):
                    # Avoid leaking unreadable compressed/binary payloads into UI.
                    if not _is_mostly_printable_text(raw_text):
                        raw_text = ""
                sender_username = _decode_sqlite_text(row["sender_username"]).strip()

                for username in users:
                    prev = best.get(username)
                    if prev is not None and sort_key <= prev[0]:
                        continue

                    is_group = bool(username.endswith("@chatroom"))
                    try:
                        preview = _build_latest_message_preview(
                            username=username,
                            local_type=local_type,
                            raw_text=raw_text,
                            is_group=is_group,
                            sender_username=sender_username,
                        )
                    except Exception:
                        preview = ""
                    if preview and (not _is_mostly_printable_text(preview)):
                        # Retry without the raw text when the rendered preview is unreadable.
                        try:
                            preview = _build_latest_message_preview(
                                username=username,
                                local_type=local_type,
                                raw_text="",
                                is_group=is_group,
                                sender_username=sender_username,
                            )
                        except Exception:
                            preview = ""
                    if not preview:
                        continue

                    best[username] = (
                        sort_key,
                        {
                            "username": username,
                            "sort_seq": sort_seq,
                            "local_id": local_id,
                            "create_time": create_time,
                            "local_type": local_type,
                            "sender_username": sender_username,
                            "preview": preview,
                            "db_stem": str(db_path.stem),
                            "table_name": str(table_name),
                        },
                    )
        finally:
            try:
                conn.close()
            except Exception:
                pass

    # Fallback: ensure we always have a non-empty lastMessage for UI (even if message tables missing).
    for r in sessions:
        u = str(r["username"] or "").strip()
        if not u:
            continue
        if u in best:
            continue
        draft_text = _decode_sqlite_text(r["draft"]).strip()
        if draft_text:
            draft_text = re.sub(r"\s+", " ", draft_text).strip()
            preview = f"[草稿] {draft_text}" if draft_text else "[草稿]"
        else:
            summary_text = _decode_sqlite_text(r["summary"]).strip()
            summary_text = re.sub(r"\s+", " ", summary_text).strip()
            if summary_text:
                preview = summary_text
            else:
                preview = _infer_last_message_brief(r["last_msg_type"], r["last_msg_sub_type"])
        best[u] = (
            (0, 0, 0),
            {
                "username": u,
                "sort_seq": 0,
                "local_id": 0,
                "create_time": 0,
                "local_type": 0,
                "sender_username": "",
                "preview": str(preview or ""),
                "db_stem": "",
                "table_name": "",
            },
        )

    final_path = get_session_preview_index_db_path(account_dir)
    tmp_path = _index_db_tmp_path(account_dir)
    # Remove leftovers of an earlier, interrupted build (best effort).
    try:
        if tmp_path.exists():
            tmp_path.unlink()
    except Exception:
        pass

    try:
        conn_out = sqlite3.connect(str(tmp_path))
        try:
            _init_index_db(conn_out)
            try:
                conn_out.commit()
            except Exception:
                pass
            conn_out.execute("BEGIN")
            rows_to_insert: list[tuple[Any, ...]] = [
                (
                    rec["username"],
                    int(rec["sort_seq"] or 0),
                    int(rec["local_id"] or 0),
                    int(rec["create_time"] or 0),
                    int(rec["local_type"] or 0),
                    str(rec["sender_username"] or ""),
                    str(rec["preview"] or ""),
                    str(rec["db_stem"] or ""),
                    str(rec["table_name"] or ""),
                )
                for _, rec in best.values()
            ]
            conn_out.executemany(
                "INSERT OR REPLACE INTO session_preview("
                "username, sort_seq, local_id, create_time, local_type, sender_username, preview, db_stem, table_name"
                ") VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                rows_to_insert,
            )
            conn_out.commit()

            built_at = int(time.time())
            src = _compute_source_fingerprint(account_dir)
            meta_rows: list[tuple[str, str]] = [
                ("built_at", str(built_at)),
                ("session_count", str(len(best))),
                ("db_count", str(len(db_paths))),
                ("source_fingerprint", str(src.get("fingerprint") or "")),
                ("source_files", json.dumps(src.get("files") or [], ensure_ascii=False)),
                ("built_include_hidden", "1" if include_hidden else "0"),
                ("built_include_official", "1" if include_official else "0"),
            ]
            conn_out.executemany(
                "INSERT INTO meta(key, value) VALUES(?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
                meta_rows,
            )
            conn_out.commit()
        finally:
            conn_out.close()

        # Swap the finished file into place in one step so readers never see a partial index.
        os.replace(str(tmp_path), str(final_path))
    except Exception as e:
        logger.exception(f"[session_preview] build failed: {e}")
        try:
            if tmp_path.exists():
                tmp_path.unlink()
        except Exception:
            pass
        return {
            "status": "error",
            "account": account_dir.name,
            "message": str(e),
        }

    duration = max(0.0, time.time() - started)
    logger.info(
        f"[session_preview] build done account={account_dir.name} indexed={len(best)} "
        f"durationSec={round(duration, 3)} path={final_path}"
    )
    return {
        "status": "success",
        "account": account_dir.name,
        "indexed": len(best),
        "path": str(final_path),
        "durationSec": round(duration, 3),
    }
|
||||||
@@ -441,6 +441,24 @@ def decrypt_wechat_databases(db_storage_path: str = None, key: str = None) -> di
|
|||||||
"failed_files": account_failed
|
"failed_files": account_failed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# 构建“会话最后一条消息”索引:把耗时挪到解密阶段,一劳永逸
|
||||||
|
if os.environ.get("WECHAT_TOOL_BUILD_SESSION_PREVIEW", "1") != "0":
|
||||||
|
try:
|
||||||
|
from .session_preview_index import build_session_preview_index
|
||||||
|
|
||||||
|
account_results[account_name]["session_preview_index"] = build_session_preview_index(
|
||||||
|
account_output_dir,
|
||||||
|
rebuild=True,
|
||||||
|
include_hidden=True,
|
||||||
|
include_official=True,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"构建会话预览索引失败: {account_name}: {e}")
|
||||||
|
account_results[account_name]["session_preview_index"] = {
|
||||||
|
"status": "error",
|
||||||
|
"message": str(e),
|
||||||
|
}
|
||||||
|
|
||||||
logger.info(f"账号 {account_name} 解密完成: 成功 {account_success}/{len(databases)}")
|
logger.info(f"账号 {account_name} 解密完成: 成功 {account_success}/{len(databases)}")
|
||||||
|
|
||||||
# 返回结果
|
# 返回结果
|
||||||
|
|||||||
Reference in New Issue
Block a user