mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-02 05:50:50 +08:00
fix(chat): 群聊发送者解析支持 alias 并避免覆盖 sender_username
- `_split_group_sender_prefix` 支持传入已知 sender(username/alias),弱特征场景避免误拆前缀
- 群聊消息按需读取 contact.db 的 alias,辅助解析"昵称:`\n`"形式的前缀(带缓存)
- 仅在 sender 缺失时回填前缀,避免昵称覆盖 sender id
This commit is contained in:
@@ -833,15 +833,6 @@ def _iter_rows_for_conversation(
|
||||
raw_text = _decode_message_content(r["compress_content"], r["message_content"]).strip()
|
||||
|
||||
is_group = bool(conv_username.endswith("@chatroom"))
|
||||
sender_prefix = ""
|
||||
if is_group and raw_text and (not raw_text.startswith("<")) and (not raw_text.startswith('"<')):
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text)
|
||||
if is_group and sender_prefix:
|
||||
sender_username = sender_prefix
|
||||
if is_group and raw_text and (raw_text.startswith("<") or raw_text.startswith('"<')):
|
||||
xml_sender = _extract_sender_from_group_xml(raw_text)
|
||||
if xml_sender:
|
||||
sender_username = xml_sender
|
||||
|
||||
if is_sent:
|
||||
sender_username = account_wxid
|
||||
@@ -881,8 +872,21 @@ def _parse_message_for_export(
|
||||
is_group: bool,
|
||||
resource_conn: Optional[sqlite3.Connection],
|
||||
resource_chat_id: Optional[int],
|
||||
sender_alias: str = "",
|
||||
) -> dict[str, Any]:
|
||||
raw_text = row.raw_text or ""
|
||||
sender_username = str(row.sender_username or "").strip()
|
||||
|
||||
if is_group and raw_text and (not raw_text.startswith("<")) and (not raw_text.startswith('"<')):
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text, sender_username, sender_alias)
|
||||
if sender_prefix and (not sender_username):
|
||||
sender_username = sender_prefix
|
||||
|
||||
if is_group and raw_text and (raw_text.startswith("<") or raw_text.startswith('"<')):
|
||||
xml_sender = _extract_sender_from_group_xml(raw_text)
|
||||
if xml_sender:
|
||||
sender_username = xml_sender
|
||||
|
||||
local_type = int(row.local_type or 0)
|
||||
is_sent = bool(row.is_sent)
|
||||
|
||||
@@ -1152,7 +1156,7 @@ def _parse_message_for_export(
|
||||
"type": local_type,
|
||||
"renderType": render_type,
|
||||
"isSent": bool(is_sent),
|
||||
"senderUsername": row.sender_username,
|
||||
"senderUsername": sender_username,
|
||||
"conversationUsername": conv_username,
|
||||
"isGroup": bool(is_group),
|
||||
"content": content_text,
|
||||
@@ -1216,6 +1220,38 @@ def _write_conversation_json(
|
||||
arcname = f"{conv_dir}/messages.json"
|
||||
exported = 0
|
||||
|
||||
contact_conn: Optional[sqlite3.Connection] = None
|
||||
alias_cache: dict[str, str] = {}
|
||||
if conv_is_group:
|
||||
try:
|
||||
contact_db_path = account_dir / "contact.db"
|
||||
if contact_db_path.exists():
|
||||
contact_conn = sqlite3.connect(str(contact_db_path))
|
||||
except Exception:
|
||||
contact_conn = None
|
||||
|
||||
def lookup_alias(username: str) -> str:
|
||||
u = str(username or "").strip()
|
||||
if not u or contact_conn is None:
|
||||
return ""
|
||||
if u in alias_cache:
|
||||
return alias_cache[u]
|
||||
|
||||
alias = ""
|
||||
try:
|
||||
r = contact_conn.execute("SELECT alias FROM contact WHERE username = ? LIMIT 1", (u,)).fetchone()
|
||||
if r is not None and r[0] is not None:
|
||||
alias = str(r[0] or "").strip()
|
||||
if not alias:
|
||||
r = contact_conn.execute("SELECT alias FROM stranger WHERE username = ? LIMIT 1", (u,)).fetchone()
|
||||
if r is not None and r[0] is not None:
|
||||
alias = str(r[0] or "").strip()
|
||||
except Exception:
|
||||
alias = ""
|
||||
|
||||
alias_cache[u] = alias
|
||||
return alias
|
||||
|
||||
# NOTE: Do not keep an entry handle opened while also writing other entries (avatars/media).
|
||||
# zipfile forbids interleaving writes; stream to a temp file then add it to zip at the end.
|
||||
with tempfile.TemporaryDirectory(prefix="wechat_chat_export_") as tmp_dir:
|
||||
@@ -1263,12 +1299,28 @@ def _write_conversation_json(
|
||||
local_types=local_types,
|
||||
):
|
||||
scanned += 1
|
||||
|
||||
sender_alias = ""
|
||||
if conv_is_group and row.raw_text and (not row.raw_text.startswith("<")) and (not row.raw_text.startswith('"<')):
|
||||
sep = row.raw_text.find(":\n")
|
||||
if sep > 0:
|
||||
prefix = row.raw_text[:sep].strip()
|
||||
su = str(row.sender_username or "").strip()
|
||||
if prefix and su and prefix != su:
|
||||
strong_hint = prefix.startswith("wxid_") or prefix.endswith("@chatroom") or "@" in prefix
|
||||
if not strong_hint:
|
||||
body_probe = row.raw_text[sep + 2 :].lstrip("\n").lstrip()
|
||||
body_is_xml = body_probe.startswith("<") or body_probe.startswith('"<')
|
||||
if not body_is_xml:
|
||||
sender_alias = lookup_alias(su)
|
||||
|
||||
msg = _parse_message_for_export(
|
||||
row=row,
|
||||
conv_username=conv_username,
|
||||
is_group=conv_is_group,
|
||||
resource_conn=resource_conn,
|
||||
resource_chat_id=resource_chat_id,
|
||||
sender_alias=sender_alias,
|
||||
)
|
||||
if want_types:
|
||||
rt_key = _normalize_render_type_key(msg.get("renderType"))
|
||||
@@ -1326,6 +1378,12 @@ def _write_conversation_json(
|
||||
tw.flush()
|
||||
|
||||
zf.write(str(tmp_path), arcname)
|
||||
if contact_conn is not None:
|
||||
try:
|
||||
contact_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return exported
|
||||
|
||||
|
||||
@@ -1360,6 +1418,38 @@ def _write_conversation_txt(
|
||||
arcname = f"{conv_dir}/messages.txt"
|
||||
exported = 0
|
||||
|
||||
contact_conn: Optional[sqlite3.Connection] = None
|
||||
alias_cache: dict[str, str] = {}
|
||||
if conv_is_group:
|
||||
try:
|
||||
contact_db_path = account_dir / "contact.db"
|
||||
if contact_db_path.exists():
|
||||
contact_conn = sqlite3.connect(str(contact_db_path))
|
||||
except Exception:
|
||||
contact_conn = None
|
||||
|
||||
def lookup_alias(username: str) -> str:
|
||||
u = str(username or "").strip()
|
||||
if not u or contact_conn is None:
|
||||
return ""
|
||||
if u in alias_cache:
|
||||
return alias_cache[u]
|
||||
|
||||
alias = ""
|
||||
try:
|
||||
r = contact_conn.execute("SELECT alias FROM contact WHERE username = ? LIMIT 1", (u,)).fetchone()
|
||||
if r is not None and r[0] is not None:
|
||||
alias = str(r[0] or "").strip()
|
||||
if not alias:
|
||||
r = contact_conn.execute("SELECT alias FROM stranger WHERE username = ? LIMIT 1", (u,)).fetchone()
|
||||
if r is not None and r[0] is not None:
|
||||
alias = str(r[0] or "").strip()
|
||||
except Exception:
|
||||
alias = ""
|
||||
|
||||
alias_cache[u] = alias
|
||||
return alias
|
||||
|
||||
# Same as JSON: write to temp file first to avoid zip interleaving writes.
|
||||
with tempfile.TemporaryDirectory(prefix="wechat_chat_export_") as tmp_dir:
|
||||
tmp_path = Path(tmp_dir) / "messages.txt"
|
||||
@@ -1391,12 +1481,27 @@ def _write_conversation_txt(
|
||||
local_types=local_types,
|
||||
):
|
||||
scanned += 1
|
||||
sender_alias = ""
|
||||
if conv_is_group and row.raw_text and (not row.raw_text.startswith("<")) and (not row.raw_text.startswith('"<')):
|
||||
sep = row.raw_text.find(":\n")
|
||||
if sep > 0:
|
||||
prefix = row.raw_text[:sep].strip()
|
||||
su = str(row.sender_username or "").strip()
|
||||
if prefix and su and prefix != su:
|
||||
strong_hint = prefix.startswith("wxid_") or prefix.endswith("@chatroom") or "@" in prefix
|
||||
if not strong_hint:
|
||||
body_probe = row.raw_text[sep + 2 :].lstrip("\n").lstrip()
|
||||
body_is_xml = body_probe.startswith("<") or body_probe.startswith('"<')
|
||||
if not body_is_xml:
|
||||
sender_alias = lookup_alias(su)
|
||||
|
||||
msg = _parse_message_for_export(
|
||||
row=row,
|
||||
conv_username=conv_username,
|
||||
is_group=conv_is_group,
|
||||
resource_conn=resource_conn,
|
||||
resource_chat_id=resource_chat_id,
|
||||
sender_alias=sender_alias,
|
||||
)
|
||||
if want_types:
|
||||
rt_key = _normalize_render_type_key(msg.get("renderType"))
|
||||
@@ -1449,6 +1554,12 @@ def _write_conversation_txt(
|
||||
tw.flush()
|
||||
|
||||
zf.write(str(tmp_path), arcname)
|
||||
if contact_conn is not None:
|
||||
try:
|
||||
contact_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return exported
|
||||
|
||||
|
||||
|
||||
@@ -694,7 +694,11 @@ def _infer_transfer_status_text(
|
||||
return "转账"
|
||||
|
||||
|
||||
def _split_group_sender_prefix(text: str) -> tuple[str, str]:
|
||||
def _split_group_sender_prefix(
|
||||
text: str,
|
||||
known_sender_username: str = "",
|
||||
known_sender_alias: str = "",
|
||||
) -> tuple[str, str]:
|
||||
if not text:
|
||||
return "", text
|
||||
sep = text.find(":\n")
|
||||
@@ -706,7 +710,21 @@ def _split_group_sender_prefix(text: str) -> tuple[str, str]:
|
||||
return "", text
|
||||
if re.search(r"\s", prefix):
|
||||
return "", text
|
||||
if prefix.startswith("wxid_") or prefix.endswith("@chatroom") or "@" in prefix:
|
||||
|
||||
strong_hint = prefix.startswith("wxid_") or prefix.endswith("@chatroom") or "@" in prefix
|
||||
probe = body.lstrip()
|
||||
body_is_xml = probe.startswith("<") or probe.startswith('"<')
|
||||
|
||||
known_values = {str(known_sender_username or "").strip(), str(known_sender_alias or "").strip()}
|
||||
known_values.discard("")
|
||||
if known_values:
|
||||
if prefix in known_values:
|
||||
return prefix, body
|
||||
if strong_hint or body_is_xml:
|
||||
return prefix, body
|
||||
return "", text
|
||||
|
||||
if strong_hint or body_is_xml:
|
||||
return prefix, body
|
||||
return "", text
|
||||
|
||||
@@ -1013,7 +1031,7 @@ def _build_latest_message_preview(
|
||||
raw_text = (raw_text or "").strip()
|
||||
sender_prefix = ""
|
||||
if is_group and raw_text and (not raw_text.startswith("<")) and (not raw_text.startswith('"<')):
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text)
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text, sender_username)
|
||||
if is_group and (not sender_prefix) and sender_username:
|
||||
sender_prefix = str(sender_username).strip()
|
||||
|
||||
@@ -1400,7 +1418,7 @@ def _row_to_search_hit(
|
||||
|
||||
sender_prefix = ""
|
||||
if is_group and raw_text and (not raw_text.startswith("<")) and (not raw_text.startswith('"<')):
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text)
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text, sender_username)
|
||||
|
||||
if is_group and sender_prefix:
|
||||
sender_username = sender_prefix
|
||||
|
||||
@@ -111,6 +111,33 @@ def _normalize_chat_source(value: Optional[str]) -> str:
|
||||
raise HTTPException(status_code=400, detail="Invalid source, use 'decrypted' or 'realtime'.")
|
||||
|
||||
|
||||
def _lookup_contact_alias(
|
||||
conn: Optional[sqlite3.Connection],
|
||||
cache: dict[str, str],
|
||||
username: str,
|
||||
) -> str:
|
||||
u = str(username or "").strip()
|
||||
if not u or conn is None:
|
||||
return ""
|
||||
if u in cache:
|
||||
return cache[u]
|
||||
|
||||
alias = ""
|
||||
try:
|
||||
r = conn.execute("SELECT alias FROM contact WHERE username = ? LIMIT 1", (u,)).fetchone()
|
||||
if r is not None and r[0] is not None:
|
||||
alias = str(r[0] or "").strip()
|
||||
if not alias:
|
||||
r = conn.execute("SELECT alias FROM stranger WHERE username = ? LIMIT 1", (u,)).fetchone()
|
||||
if r is not None and r[0] is not None:
|
||||
alias = str(r[0] or "").strip()
|
||||
except Exception:
|
||||
alias = ""
|
||||
|
||||
cache[u] = alias
|
||||
return alias
|
||||
|
||||
|
||||
def _scan_db_storage_mtime_ns(db_storage_dir: Path) -> int:
|
||||
try:
|
||||
base = str(db_storage_dir)
|
||||
@@ -1386,6 +1413,16 @@ def _append_full_messages_from_rows(
|
||||
resource_conn: Optional[sqlite3.Connection],
|
||||
resource_chat_id: Optional[int],
|
||||
) -> None:
|
||||
contact_conn: Optional[sqlite3.Connection] = None
|
||||
alias_cache: dict[str, str] = {}
|
||||
if is_group:
|
||||
try:
|
||||
contact_db_path = account_dir / "contact.db"
|
||||
if contact_db_path.exists():
|
||||
contact_conn = sqlite3.connect(str(contact_db_path))
|
||||
except Exception:
|
||||
contact_conn = None
|
||||
|
||||
for r in rows:
|
||||
local_id = int(r["local_id"] or 0)
|
||||
create_time = int(r["create_time"] or 0)
|
||||
@@ -1448,10 +1485,21 @@ def _append_full_messages_from_rows(
|
||||
raw_text = raw_text.strip()
|
||||
|
||||
sender_prefix = ""
|
||||
if is_group and not raw_text.startswith("<") and not raw_text.startswith('"<'):
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text)
|
||||
if is_group and raw_text and (not raw_text.startswith("<")) and (not raw_text.startswith('"<')):
|
||||
sender_alias = ""
|
||||
sep = raw_text.find(":\n")
|
||||
if sep > 0:
|
||||
prefix = raw_text[:sep].strip()
|
||||
if prefix and sender_username and prefix != sender_username:
|
||||
strong_hint = prefix.startswith("wxid_") or prefix.endswith("@chatroom") or "@" in prefix
|
||||
if not strong_hint:
|
||||
body_probe = raw_text[sep + 2 :].lstrip("\n").lstrip()
|
||||
body_is_xml = body_probe.startswith("<") or body_probe.startswith('"<')
|
||||
if not body_is_xml:
|
||||
sender_alias = _lookup_contact_alias(contact_conn, alias_cache, sender_username)
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text, sender_username, sender_alias)
|
||||
|
||||
if is_group and sender_prefix:
|
||||
if is_group and sender_prefix and (not sender_username):
|
||||
sender_username = sender_prefix
|
||||
|
||||
if is_group and (raw_text.startswith("<") or raw_text.startswith('"<')):
|
||||
@@ -1812,6 +1860,12 @@ def _append_full_messages_from_rows(
|
||||
}
|
||||
)
|
||||
|
||||
if contact_conn is not None:
|
||||
try:
|
||||
contact_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _postprocess_full_messages(
|
||||
*,
|
||||
@@ -2301,6 +2355,16 @@ def _collect_chat_messages(
|
||||
pat_usernames: set[str] = set()
|
||||
has_more_any = False
|
||||
|
||||
contact_conn: Optional[sqlite3.Connection] = None
|
||||
alias_cache: dict[str, str] = {}
|
||||
if is_group:
|
||||
try:
|
||||
contact_db_path = account_dir / "contact.db"
|
||||
if contact_db_path.exists():
|
||||
contact_conn = sqlite3.connect(str(contact_db_path))
|
||||
except Exception:
|
||||
contact_conn = None
|
||||
|
||||
for db_path in db_paths:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
@@ -2384,10 +2448,21 @@ def _collect_chat_messages(
|
||||
raw_text = raw_text.strip()
|
||||
|
||||
sender_prefix = ""
|
||||
if is_group and not raw_text.startswith("<") and not raw_text.startswith('"<'):
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text)
|
||||
if is_group and raw_text and (not raw_text.startswith("<")) and (not raw_text.startswith('"<')):
|
||||
sender_alias = ""
|
||||
sep = raw_text.find(":\n")
|
||||
if sep > 0:
|
||||
prefix = raw_text[:sep].strip()
|
||||
if prefix and sender_username and prefix != sender_username:
|
||||
strong_hint = prefix.startswith("wxid_") or prefix.endswith("@chatroom") or "@" in prefix
|
||||
if not strong_hint:
|
||||
body_probe = raw_text[sep + 2 :].lstrip("\n").lstrip()
|
||||
body_is_xml = body_probe.startswith("<") or body_probe.startswith('"<')
|
||||
if not body_is_xml:
|
||||
sender_alias = _lookup_contact_alias(contact_conn, alias_cache, sender_username)
|
||||
sender_prefix, raw_text = _split_group_sender_prefix(raw_text, sender_username, sender_alias)
|
||||
|
||||
if is_group and sender_prefix:
|
||||
if is_group and sender_prefix and (not sender_username):
|
||||
sender_username = sender_prefix
|
||||
|
||||
if is_group and (raw_text.startswith("<") or raw_text.startswith('"<')):
|
||||
@@ -2744,6 +2819,12 @@ def _collect_chat_messages(
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if contact_conn is not None:
|
||||
try:
|
||||
contact_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return merged, has_more_any, sender_usernames, quote_usernames, pat_usernames
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user