mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-19 14:20:51 +08:00
fix(chat): 解析 XML 的 cdn 链接时进行反转义
- 新增 _normalize_xml_url 统一处理 &amp; 等转义字符 - 图片/视频/表情消息的 cdn URL 解析统一规范化,避免误判为 file_id
This commit is contained in:
@@ -530,6 +530,17 @@ def _strip_cdata(s: str) -> str:
|
||||
return out.strip()
|
||||
|
||||
|
||||
def _normalize_xml_url(url: str) -> str:
|
||||
"""Normalize URLs extracted from XML attributes/tags (e.g. decode '&')."""
|
||||
u = str(url or "").strip()
|
||||
if not u:
|
||||
return ""
|
||||
try:
|
||||
return html.unescape(u).strip()
|
||||
except Exception:
|
||||
return u.replace("&", "&").strip()
|
||||
|
||||
|
||||
def _extract_xml_tag_text(xml_text: str, tag: str) -> str:
|
||||
if not xml_text or not tag:
|
||||
return ""
|
||||
|
||||
@@ -33,6 +33,7 @@ from ..chat_helpers import (
|
||||
_load_contact_rows,
|
||||
_load_latest_message_previews,
|
||||
_lookup_resource_md5,
|
||||
_normalize_xml_url,
|
||||
_parse_app_message,
|
||||
_parse_pat_message,
|
||||
_pick_avatar_url,
|
||||
@@ -481,8 +482,10 @@ def _append_full_messages_from_rows(
|
||||
or _extract_xml_tag_text(raw_text, "cdnmidimgurl")
|
||||
or _extract_xml_tag_text(raw_text, "cdnbigimgurl")
|
||||
)
|
||||
_cdn_url_or_id = str(_cdn_url_or_id or "").strip()
|
||||
image_url = _cdn_url_or_id if _cdn_url_or_id.startswith(("http://", "https://")) else ""
|
||||
_cdn_url_or_id = _normalize_xml_url(_cdn_url_or_id)
|
||||
image_url = (
|
||||
_cdn_url_or_id if str(_cdn_url_or_id).lower().startswith(("http://", "https://")) else ""
|
||||
)
|
||||
if (not image_url) and _cdn_url_or_id:
|
||||
image_file_id = _cdn_url_or_id
|
||||
|
||||
@@ -512,6 +515,9 @@ def _append_full_messages_from_rows(
|
||||
raw_text, "cdnvideourl"
|
||||
)
|
||||
|
||||
video_thumb_url_or_id = _normalize_xml_url(video_thumb_url_or_id)
|
||||
video_url_or_id = _normalize_xml_url(video_url_or_id)
|
||||
|
||||
video_thumb_url = (
|
||||
video_thumb_url_or_id
|
||||
if str(video_thumb_url_or_id or "").strip().lower().startswith(("http://", "https://"))
|
||||
@@ -542,6 +548,7 @@ def _append_full_messages_from_rows(
|
||||
emoji_url = _extract_xml_attr(raw_text, "cdnurl")
|
||||
if not emoji_url:
|
||||
emoji_url = _extract_xml_tag_text(raw_text, "cdn_url")
|
||||
emoji_url = _normalize_xml_url(emoji_url)
|
||||
if (not emoji_md5) and resource_conn is not None:
|
||||
emoji_md5 = _lookup_resource_md5(
|
||||
resource_conn,
|
||||
|
||||
Reference in New Issue
Block a user