mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-03 06:20:51 +08:00
fix(chat): 合并转发记录的图片/表情解析与媒体兜底
- 前端:recordItem XML 清理非法控制字符并转义裸 &,补齐 fromnewmsgid/cdnurlstring/aeskey 等字段 - 前端:图片接口追加 server_id;表情接口透传 emoji_url/aes_key - 后端:/api/chat/media/image 支持 server_id,从 message_resource.db 反查 packed_info 资源 md5 并优先定位 - 后端:/api/chat/media/emoji 支持 emoji_url/aes_key,本地缺失时安全下载并尝试 AES-CBC 解密识别
This commit is contained in:
@@ -3865,7 +3865,12 @@ const parseChatHistoryRecord = (recordItemXml) => {
|
||||
const xml = String(recordItemXml || '').trim()
|
||||
if (!xml) return { info: null, items: [] }
|
||||
|
||||
const normalized = xml.replace(/ /g, ' ')
|
||||
const normalized = xml
|
||||
.replace(/ /g, ' ')
|
||||
// Strip control characters that are illegal in XML 1.0 (common in some recordItem payloads)
|
||||
.replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g, '')
|
||||
// Escape stray ampersands (URLs sometimes contain raw '&' instead of '&')
|
||||
.replace(/&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[\da-fA-F]+;)/g, '&')
|
||||
let doc
|
||||
try {
|
||||
doc = new DOMParser().parseFromString(normalized, 'text/xml')
|
||||
@@ -3906,6 +3911,13 @@ const parseChatHistoryRecord = (recordItemXml) => {
|
||||
const fullmd5 = getText(node, 'fullmd5')
|
||||
const thumbfullmd5 = getText(node, 'thumbfullmd5')
|
||||
const md5 = getText(node, 'md5') || getText(node, 'emoticonmd5') || getText(node, 'emojiMd5')
|
||||
const fromnewmsgid = getText(node, 'fromnewmsgid')
|
||||
const srcMsgLocalid = getText(node, 'srcMsgLocalid')
|
||||
const srcMsgCreateTime = getText(node, 'srcMsgCreateTime')
|
||||
const cdnurlstring = normalizeChatHistoryUrl(getText(node, 'cdnurlstring'))
|
||||
const encrypturlstring = normalizeChatHistoryUrl(getText(node, 'encrypturlstring'))
|
||||
const externurl = normalizeChatHistoryUrl(getText(node, 'externurl'))
|
||||
const aeskey = getText(node, 'aeskey')
|
||||
|
||||
let content = datatitle || datadesc
|
||||
if (!content) {
|
||||
@@ -3948,6 +3960,13 @@ const parseChatHistoryRecord = (recordItemXml) => {
|
||||
fullmd5,
|
||||
thumbfullmd5,
|
||||
md5,
|
||||
fromnewmsgid,
|
||||
srcMsgLocalid,
|
||||
srcMsgCreateTime,
|
||||
cdnurlstring,
|
||||
encrypturlstring,
|
||||
externurl,
|
||||
aeskey,
|
||||
renderType,
|
||||
content
|
||||
}
|
||||
@@ -3998,15 +4017,23 @@ const normalizeChatHistoryRecordItem = (rec) => {
|
||||
if (!out.content || /^\[.+\]$/.test(String(out.content || '').trim())) out.content = '[视频]'
|
||||
} else if (out.renderType === 'emoji') {
|
||||
out.emojiMd5 = pickFirstMd5(out.md5, out.fullmd5, out.thumbfullmd5)
|
||||
const remoteEmojiUrl = String(out.cdnurlstring || out.externurl || out.encrypturlstring || '').trim()
|
||||
const remoteAesKey = String(out.aeskey || '').trim()
|
||||
out.emojiRemoteUrl = remoteEmojiUrl
|
||||
out.emojiUrl = out.emojiMd5
|
||||
? `${mediaBase}/api/chat/media/emoji?account=${account}&md5=${encodeURIComponent(out.emojiMd5)}&username=${username}`
|
||||
? `${mediaBase}/api/chat/media/emoji?account=${account}&md5=${encodeURIComponent(out.emojiMd5)}&username=${username}${remoteEmojiUrl ? `&emoji_url=${encodeURIComponent(remoteEmojiUrl)}` : ''}${remoteAesKey ? `&aes_key=${encodeURIComponent(remoteAesKey)}` : ''}`
|
||||
: ''
|
||||
if (!out.content || /^\[.+\]$/.test(String(out.content || '').trim())) out.content = '[表情]'
|
||||
} else if (out.renderType === 'image') {
|
||||
out.imageMd5 = pickFirstMd5(out.fullmd5, out.thumbfullmd5, out.md5)
|
||||
out.imageUrl = out.imageMd5
|
||||
? `${mediaBase}/api/chat/media/image?account=${account}&md5=${encodeURIComponent(out.imageMd5)}&username=${username}`
|
||||
: ''
|
||||
const srcServerId = String(out.fromnewmsgid || '').trim()
|
||||
const imgParts = [
|
||||
`account=${account}`,
|
||||
out.imageMd5 ? `md5=${encodeURIComponent(out.imageMd5)}` : '',
|
||||
srcServerId ? `server_id=${encodeURIComponent(srcServerId)}` : '',
|
||||
`username=${username}`
|
||||
].filter(Boolean)
|
||||
out.imageUrl = imgParts.length ? `${mediaBase}/api/chat/media/image?${imgParts.join('&')}` : ''
|
||||
if (!out.content || /^\[.+\]$/.test(String(out.content || '').trim())) out.content = '[图片]'
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import asyncio
|
||||
from functools import lru_cache
|
||||
import hashlib
|
||||
import html
|
||||
import ipaddress
|
||||
import mimetypes
|
||||
import os
|
||||
@@ -18,18 +19,20 @@ from pydantic import BaseModel, Field
|
||||
from ..logging_config import get_logger
|
||||
from ..media_helpers import (
|
||||
_convert_silk_to_wav,
|
||||
_decrypt_emoticon_aes_cbc,
|
||||
_detect_image_extension,
|
||||
_detect_image_media_type,
|
||||
_is_probably_valid_image,
|
||||
_iter_media_source_candidates,
|
||||
_order_media_candidates,
|
||||
_download_http_bytes,
|
||||
_ensure_decrypted_resource_for_md5,
|
||||
_fallback_search_media_by_file_id,
|
||||
_fallback_search_media_by_md5,
|
||||
_get_decrypted_resource_path,
|
||||
_get_resource_dir,
|
||||
_guess_media_type_by_path,
|
||||
_is_probably_valid_image,
|
||||
_iter_emoji_source_candidates,
|
||||
_iter_media_source_candidates,
|
||||
_order_media_candidates,
|
||||
_read_and_maybe_decrypt_media,
|
||||
_resolve_account_db_storage_dir,
|
||||
_resolve_account_dir,
|
||||
@@ -40,6 +43,7 @@ from ..media_helpers import (
|
||||
_try_find_decrypted_resource,
|
||||
_try_strip_media_prefix,
|
||||
)
|
||||
from ..chat_helpers import _extract_md5_from_packed_info
|
||||
from ..path_fix import PathFixRoute
|
||||
|
||||
logger = get_logger(__name__)
|
||||
@@ -300,6 +304,51 @@ def _is_valid_md5(s: str) -> bool:
|
||||
return bool(re.fullmatch(r"[0-9a-f]{32}", v))
|
||||
|
||||
|
||||
@lru_cache(maxsize=4096)
|
||||
def _lookup_resource_md5_by_server_id(account_dir_str: str, server_id: int, want_local_type: int = 0) -> str:
|
||||
"""Resolve on-disk resource md5 from message_resource.db by message_svr_id.
|
||||
|
||||
WeChat 4.x often stores media on disk using an md5 derived from `packed_info` rather than
|
||||
the `fullmd5/thumbfullmd5` values found in message XML (including merged-forward records).
|
||||
"""
|
||||
account_dir_str = str(account_dir_str or "").strip()
|
||||
if not account_dir_str:
|
||||
return ""
|
||||
try:
|
||||
sid = int(server_id or 0)
|
||||
except Exception:
|
||||
sid = 0
|
||||
if not sid:
|
||||
return ""
|
||||
|
||||
account_dir = Path(account_dir_str)
|
||||
db_path = account_dir / "message_resource.db"
|
||||
if not db_path.exists():
|
||||
return ""
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
try:
|
||||
row = conn.execute(
|
||||
"SELECT message_local_type, packed_info FROM MessageResourceInfo "
|
||||
"WHERE message_svr_id = ? ORDER BY message_create_time DESC LIMIT 1",
|
||||
(sid,),
|
||||
).fetchone()
|
||||
if not row:
|
||||
return ""
|
||||
if want_local_type and int(row[0] or 0) != int(want_local_type):
|
||||
return ""
|
||||
md5 = _extract_md5_from_packed_info(row[1])
|
||||
md5 = str(md5 or "").strip().lower()
|
||||
return md5 if _is_valid_md5(md5) else ""
|
||||
except Exception:
|
||||
return ""
|
||||
finally:
|
||||
try:
|
||||
conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _is_safe_http_url(url: str) -> bool:
|
||||
u = str(url or "").strip()
|
||||
if not u:
|
||||
@@ -459,12 +508,13 @@ async def download_chat_emoji(req: EmojiDownloadRequest):
|
||||
async def get_chat_image(
|
||||
md5: Optional[str] = None,
|
||||
file_id: Optional[str] = None,
|
||||
server_id: Optional[int] = None,
|
||||
account: Optional[str] = None,
|
||||
username: Optional[str] = None,
|
||||
deep_scan: bool = False,
|
||||
):
|
||||
if (not md5) and (not file_id):
|
||||
raise HTTPException(status_code=400, detail="Missing md5/file_id.")
|
||||
if (not md5) and (not file_id) and (not server_id):
|
||||
raise HTTPException(status_code=400, detail="Missing md5/file_id/server_id.")
|
||||
|
||||
# Some WeChat versions put non-MD5 identifiers in the "md5" field; treat them as file_id.
|
||||
if md5 and (not file_id) and (not _is_valid_md5(str(md5))):
|
||||
@@ -472,6 +522,13 @@ async def get_chat_image(
|
||||
md5 = None
|
||||
account_dir = _resolve_account_dir(account)
|
||||
|
||||
# Prefer resource md5 derived from message_resource.db for chat history / app messages.
|
||||
# This matches how regular image messages are resolved elsewhere in the codebase.
|
||||
if server_id:
|
||||
resource_md5 = _lookup_resource_md5_by_server_id(str(account_dir), int(server_id), want_local_type=3)
|
||||
if resource_md5:
|
||||
md5 = resource_md5
|
||||
|
||||
# md5 模式:优先从解密资源目录读取(更快)
|
||||
if md5:
|
||||
decrypted_path = _try_find_decrypted_resource(account_dir, str(md5).lower())
|
||||
@@ -620,7 +677,13 @@ async def get_chat_image(
|
||||
|
||||
|
||||
@router.get("/api/chat/media/emoji", summary="获取表情消息资源")
|
||||
async def get_chat_emoji(md5: str, account: Optional[str] = None, username: Optional[str] = None):
|
||||
async def get_chat_emoji(
|
||||
md5: str,
|
||||
account: Optional[str] = None,
|
||||
username: Optional[str] = None,
|
||||
emoji_url: Optional[str] = None,
|
||||
aes_key: Optional[str] = None,
|
||||
):
|
||||
if not md5:
|
||||
raise HTTPException(status_code=400, detail="Missing md5.")
|
||||
account_dir = _resolve_account_dir(account)
|
||||
@@ -652,6 +715,44 @@ async def get_chat_emoji(md5: str, account: Optional[str] = None, username: Opti
|
||||
if data2 is not None and mt2:
|
||||
data, media_type = data2, mt2
|
||||
|
||||
if media_type == "application/octet-stream" and emoji_url:
|
||||
# Some merged-forward records include CDN URLs and AES keys inside recordItem, but the md5
|
||||
# is missing from emoticon.db; allow the client to provide a safe remote URL as fallback.
|
||||
url = html.unescape(str(emoji_url or "")).strip()
|
||||
if url:
|
||||
try:
|
||||
payload = _download_http_bytes(url)
|
||||
except Exception:
|
||||
payload = b""
|
||||
|
||||
candidates: list[bytes] = [payload] if payload else []
|
||||
dec = _decrypt_emoticon_aes_cbc(payload, str(aes_key or "").strip()) if payload and aes_key else None
|
||||
if dec is not None:
|
||||
candidates.insert(0, dec)
|
||||
|
||||
for blob in candidates:
|
||||
if not blob:
|
||||
continue
|
||||
try:
|
||||
data2, mt = _try_strip_media_prefix(blob)
|
||||
except Exception:
|
||||
data2, mt = blob, "application/octet-stream"
|
||||
|
||||
if mt == "application/octet-stream":
|
||||
mt = _detect_image_media_type(data2[:32])
|
||||
if mt == "application/octet-stream":
|
||||
try:
|
||||
if len(data2) >= 8 and data2[4:8] == b"ftyp":
|
||||
mt = "video/mp4"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if mt.startswith("image/") and (not _is_probably_valid_image(data2, mt)):
|
||||
continue
|
||||
if mt != "application/octet-stream":
|
||||
data, media_type = data2, mt
|
||||
break
|
||||
|
||||
if (not p) and media_type == "application/octet-stream":
|
||||
raise HTTPException(status_code=404, detail="Emoji not found.")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user