mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-20 14:50:50 +08:00
feat(chat-export): 支持 HTML 导出(合并消息/远程缩略图可选下载)
- 导出格式新增 html:生成 index.html + 会话 messages.html,离线浏览
- 支持 chatHistory(合并消息)解析/渲染与弹窗查看
- 图片资源解析增强:MessageResourceInfo 优先 + md5/hdmd5 兜底
- HTML 导出可选下载远程缩略图(仅公网主机/图片类型/5MB 限制)
- 修复拍一拍误判、公众号封面样式识别;转账过期状态与前端展示
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,7 @@ from collections import Counter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import quote, urlparse
|
||||
from urllib.parse import parse_qs, quote, urlparse
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
@@ -634,6 +634,32 @@ def _is_mp_weixin_article_url(url: str) -> bool:
|
||||
return "mp.weixin.qq.com/" in lu
|
||||
|
||||
|
||||
def _is_mp_weixin_feed_article_url(url: str) -> bool:
    """Detect WeChat's PC feed/recommendation mp.weixin.qq.com share URLs.

    These links often carry an `exptype` query parameter like:
      masonry_feed_brief_content_elite_for_pcfeeds_u2i

    WeChat desktop tends to render them in a cover-card style (image + bottom title),
    so we use this as a hint to choose the 'cover' linkStyle.

    Args:
        url: Share URL to inspect. ``None`` and empty/blank values yield False.

    Returns:
        True when any `exptype` query value contains "masonry_feed"
        (case-insensitive), or when the raw URL text contains
        "exptype=masonry_feed" (case-insensitive); otherwise False.
    """
    u = str(url or "").strip()
    if not u:
        return False

    try:
        exptype_values = parse_qs(urlparse(u).query or "").get("exptype") or []
    except ValueError:
        # urlparse/parse_qs reject some malformed URLs (e.g. an unterminated
        # IPv6 host). Detection is best-effort, so fall through to the plain
        # substring check instead of failing.
        exptype_values = []

    if any("masonry_feed" in str(v or "").lower() for v in exptype_values):
        return True

    # Fallback for URLs whose query string could not be parsed structurally.
    return "exptype=masonry_feed" in u.lower()
|
||||
|
||||
|
||||
def _classify_link_share(*, app_type: int, url: str, source_username: str, desc: str) -> tuple[str, str]:
|
||||
src = str(source_username or "").strip().lower()
|
||||
is_official_article = bool(
|
||||
@@ -647,7 +673,15 @@ def _classify_link_share(*, app_type: int, url: str, source_username: str, desc:
|
||||
hashtag_count = len(re.findall(r"#[^#\s]+", d))
|
||||
|
||||
# 公众号文章中「封面图 + 底栏标题」卡片特征:摘要以 #话题# 风格为主。
|
||||
link_style = "cover" if (is_official_article and (d.startswith("#") or hashtag_count >= 2)) else "default"
|
||||
cover_like = bool(
|
||||
is_official_article
|
||||
and (
|
||||
d.startswith("#")
|
||||
or hashtag_count >= 2
|
||||
or _is_mp_weixin_feed_article_url(url)
|
||||
)
|
||||
)
|
||||
link_style = "cover" if cover_like else "default"
|
||||
return link_type, link_style
|
||||
|
||||
|
||||
@@ -948,8 +982,12 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
||||
"recordItem": record_item or "",
|
||||
}
|
||||
|
||||
if app_type in (5, 68) and url:
|
||||
thumb_url = _normalize_xml_url(_extract_xml_tag_text(text, "thumburl"))
|
||||
if app_type in (4, 5, 68) and url:
|
||||
# Many appmsg link cards (notably Bilibili shares with <type>4</type>) include a <patMsg> metadata block.
|
||||
# DO NOT treat "<patmsg" presence as a pat message: it would misclassify normal link cards as "[拍一拍]".
|
||||
thumb_url = _normalize_xml_url(
|
||||
_extract_xml_tag_text(text, "thumburl") or _extract_xml_tag_text(text, "cdnthumburl")
|
||||
)
|
||||
link_type, link_style = _classify_link_share(
|
||||
app_type=app_type,
|
||||
url=url,
|
||||
@@ -1093,7 +1131,10 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
||||
"quoteVoiceLength": quote_voice_length,
|
||||
}
|
||||
|
||||
if app_type == 62 or "<patmsg" in lower or 'type="patmsg"' in lower or "type='patmsg'" in lower:
|
||||
# Some versions may mark pat messages via sysmsg/appmsg tag attribute: <sysmsg type="patmsg">...</sysmsg>.
|
||||
# Be strict here: lots of non-pat appmsg payloads still carry a nested <patMsg>...</patMsg> metadata block.
|
||||
patmsg_attr = bool(re.search(r"<(sysmsg|appmsg)\b[^>]*\btype=['\"]patmsg['\"]", lower))
|
||||
if app_type == 62 or patmsg_attr:
|
||||
return {"renderType": "system", "content": "[拍一拍]"}
|
||||
|
||||
if app_type == 2000 or (
|
||||
|
||||
@@ -2742,6 +2742,90 @@ def _postprocess_transfer_messages(merged: list[dict[str, Any]]) -> None:
|
||||
# - 将原始转账消息(1/8)回填为“已被接收”
|
||||
# - 若同一 transferId 同时存在原始消息与 paysubtype=3 消息,则将 paysubtype=3 的那条校正为“已收款”
|
||||
|
||||
def _is_transfer_expired_system_message(text: Any) -> bool:
    """Return True if *text* reads like a "transfer expired" system notice.

    The text must mention both "转账" (transfer) and "过期" (expire). On top
    of that it must either:
      - mention "未接收" (not received) together with a 24-hour phrase
        ("24小时" or "二十四小时"), or
      - contain "已过期" (already expired).

    Args:
        text: Message content; any value is stringified, falsy values count
            as empty text.

    Returns:
        True when the content matches one of the patterns above.
    """
    content = str(text or "").strip()

    # Both keywords are mandatory; this also rejects empty content.
    if "转账" not in content or "过期" not in content:
        return False

    if "未接收" in content and ("24小时" in content or "二十四小时" in content):
        return True

    # "转账" is already guaranteed by the guard above, so the former
    # `("收款方" in content or "转账" in content)` clause was always true.
    return "已过期" in content
|
||||
|
||||
def _mark_pending_transfers_expired_by_system_messages() -> set[str]:
    """Flag pending transfers as expired based on "transfer expired" system notices.

    Reads and mutates ``merged`` from the surrounding scope. Each expiry
    notice (a ``system`` message accepted by
    ``_is_transfer_expired_system_message``) is paired with at most one
    pending transfer (``renderType == "transfer"`` with ``paySubType`` "1"
    or "8"). The paired transfer gets ``paySubType = "10"`` and
    ``transferStatus = "已过期"``.

    Returns:
        The non-empty ``transferId`` values of the transfers that were marked.
    """
    # Expiry notices usually show up about 24 hours after the transfer was
    # sent. To avoid false matches, the gap must fall within [22h, 26h], and
    # the pending transfer whose gap is closest to 24h is chosen.
    min_gap = 22 * 3600
    max_gap = 26 * 3600
    target_gap = 24 * 3600

    def _create_time(message: dict[str, Any]) -> int:
        # Unparseable timestamps are treated as 0, which callers discard.
        try:
            return int(message.get("createTime") or 0)
        except Exception:
            return 0

    notice_times: list[int] = []
    pending: list[tuple[int, int]] = []  # (index in merged, createTime)

    for position, message in enumerate(merged):
        render_type = str(message.get("renderType") or "").strip()

        if render_type == "system":
            if _is_transfer_expired_system_message(message.get("content")):
                stamp = _create_time(message)
                if stamp > 0:
                    notice_times.append(stamp)
            continue

        if render_type != "transfer":
            continue
        if str(message.get("paySubType") or "").strip() not in ("1", "8"):
            continue

        stamp = _create_time(message)
        if stamp > 0:
            pending.append((position, stamp))

    if not (notice_times and pending):
        return set()

    claimed: set[int] = set()  # indexes already paired with a notice
    expired_ids: set[str] = set()

    for notice_ts in sorted(notice_times):
        eligible = [
            (abs(notice_ts - sent_ts - target_gap), position)
            for position, sent_ts in pending
            if position not in claimed and min_gap <= notice_ts - sent_ts <= max_gap
        ]
        if not eligible:
            continue

        # min() keeps the first of equally distant candidates, i.e. the
        # earliest entry in `pending`.
        _, winner = min(eligible, key=lambda pair: pair[0])

        claimed.add(winner)
        transfer = merged[winner]
        transfer["paySubType"] = "10"
        transfer["transferStatus"] = "已过期"

        transfer_id = str(transfer.get("transferId") or "").strip()
        if transfer_id:
            expired_ids.add(transfer_id)

    return expired_ids
|
||||
|
||||
expired_transfer_ids = _mark_pending_transfers_expired_by_system_messages()
|
||||
|
||||
returned_transfer_ids: set[str] = set() # 退还状态的 transferId
|
||||
received_transfer_ids: set[str] = set() # 已收款状态的 transferId
|
||||
returned_amounts_with_time: list[tuple[str, int]] = [] # (金额, 时间戳) 用于退还回退匹配
|
||||
@@ -2828,6 +2912,8 @@ def _postprocess_transfer_messages(merged: list[dict[str, Any]]) -> None:
|
||||
tid = str(m.get("transferId") or "").strip()
|
||||
if not tid or tid not in pending_transfer_ids:
|
||||
continue
|
||||
if tid in expired_transfer_ids:
|
||||
continue
|
||||
mid = str(m.get("id") or "").strip()
|
||||
if mid and mid in backfilled_message_ids:
|
||||
continue
|
||||
|
||||
@@ -12,17 +12,31 @@ from ..path_fix import PathFixRoute
|
||||
|
||||
router = APIRouter(route_class=PathFixRoute)
|
||||
|
||||
ExportFormat = Literal["json", "txt"]
|
||||
ExportFormat = Literal["json", "txt", "html"]
|
||||
ExportScope = Literal["selected", "all", "groups", "singles"]
|
||||
MediaKind = Literal["image", "emoji", "video", "video_thumb", "voice", "file"]
|
||||
MessageType = Literal["text", "image", "emoji", "video", "voice", "file", "link", "transfer", "redPacket", "system", "quote", "voip"]
|
||||
MessageType = Literal[
|
||||
"text",
|
||||
"image",
|
||||
"emoji",
|
||||
"video",
|
||||
"voice",
|
||||
"chatHistory",
|
||||
"file",
|
||||
"link",
|
||||
"transfer",
|
||||
"redPacket",
|
||||
"system",
|
||||
"quote",
|
||||
"voip",
|
||||
]
|
||||
|
||||
|
||||
class ChatExportCreateRequest(BaseModel):
|
||||
account: Optional[str] = Field(None, description="账号目录名(可选,默认使用第一个)")
|
||||
scope: ExportScope = Field("selected", description="导出范围:selected=指定会话;all=全部;groups=仅群聊;singles=仅单聊")
|
||||
usernames: list[str] = Field(default_factory=list, description="会话 username 列表(scope=selected 时使用)")
|
||||
format: ExportFormat = Field("json", description="导出格式:json 或 txt(zip 内每个会话一个文件)")
|
||||
format: ExportFormat = Field("json", description="导出格式:json/txt/html(zip 内每个会话一个文件;html 可离线打开 index.html 查看)")
|
||||
start_time: Optional[int] = Field(None, description="起始时间(Unix 秒,含)")
|
||||
end_time: Optional[int] = Field(None, description="结束时间(Unix 秒,含)")
|
||||
include_hidden: bool = Field(False, description="是否包含隐藏会话(scope!=selected 时)")
|
||||
@@ -41,6 +55,10 @@ class ChatExportCreateRequest(BaseModel):
|
||||
False,
|
||||
description="预留字段:本项目不从微信进程提取媒体密钥,请使用 wx_key 获取并保存/批量解密",
|
||||
)
|
||||
download_remote_media: bool = Field(
|
||||
False,
|
||||
description="HTML 导出时允许联网下载链接/引用缩略图等远程媒体(提高离线完整性)",
|
||||
)
|
||||
privacy_mode: bool = Field(
|
||||
False,
|
||||
description="隐私模式导出:隐藏会话/用户名/内容,不打包头像与媒体",
|
||||
@@ -64,6 +82,7 @@ async def create_chat_export(req: ChatExportCreateRequest):
|
||||
message_types=req.message_types,
|
||||
output_dir=req.output_dir,
|
||||
allow_process_key_extract=req.allow_process_key_extract,
|
||||
download_remote_media=req.download_remote_media,
|
||||
privacy_mode=req.privacy_mode,
|
||||
file_name=req.file_name,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user