mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-19 14:20:51 +08:00
fix(chat): 解析 XML 的 cdn 链接时进行反转义
- 新增 _normalize_xml_url,统一处理 &amp; 等 XML 转义字符
- 图片/视频/表情消息的 cdn URL 解析统一规范化,避免误判为 file_id
This commit is contained in:
@@ -530,6 +530,17 @@ def _strip_cdata(s: str) -> str:
|
|||||||
return out.strip()
|
return out.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_xml_url(url: str) -> str:
|
||||||
|
"""Normalize URLs extracted from XML attributes/tags (e.g. decode '&')."""
|
||||||
|
u = str(url or "").strip()
|
||||||
|
if not u:
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
return html.unescape(u).strip()
|
||||||
|
except Exception:
|
||||||
|
return u.replace("&", "&").strip()
|
||||||
|
|
||||||
|
|
||||||
def _extract_xml_tag_text(xml_text: str, tag: str) -> str:
|
def _extract_xml_tag_text(xml_text: str, tag: str) -> str:
|
||||||
if not xml_text or not tag:
|
if not xml_text or not tag:
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ from ..chat_helpers import (
|
|||||||
_load_contact_rows,
|
_load_contact_rows,
|
||||||
_load_latest_message_previews,
|
_load_latest_message_previews,
|
||||||
_lookup_resource_md5,
|
_lookup_resource_md5,
|
||||||
|
_normalize_xml_url,
|
||||||
_parse_app_message,
|
_parse_app_message,
|
||||||
_parse_pat_message,
|
_parse_pat_message,
|
||||||
_pick_avatar_url,
|
_pick_avatar_url,
|
||||||
@@ -481,8 +482,10 @@ def _append_full_messages_from_rows(
|
|||||||
or _extract_xml_tag_text(raw_text, "cdnmidimgurl")
|
or _extract_xml_tag_text(raw_text, "cdnmidimgurl")
|
||||||
or _extract_xml_tag_text(raw_text, "cdnbigimgurl")
|
or _extract_xml_tag_text(raw_text, "cdnbigimgurl")
|
||||||
)
|
)
|
||||||
_cdn_url_or_id = str(_cdn_url_or_id or "").strip()
|
_cdn_url_or_id = _normalize_xml_url(_cdn_url_or_id)
|
||||||
image_url = _cdn_url_or_id if _cdn_url_or_id.startswith(("http://", "https://")) else ""
|
image_url = (
|
||||||
|
_cdn_url_or_id if str(_cdn_url_or_id).lower().startswith(("http://", "https://")) else ""
|
||||||
|
)
|
||||||
if (not image_url) and _cdn_url_or_id:
|
if (not image_url) and _cdn_url_or_id:
|
||||||
image_file_id = _cdn_url_or_id
|
image_file_id = _cdn_url_or_id
|
||||||
|
|
||||||
@@ -512,6 +515,9 @@ def _append_full_messages_from_rows(
|
|||||||
raw_text, "cdnvideourl"
|
raw_text, "cdnvideourl"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
video_thumb_url_or_id = _normalize_xml_url(video_thumb_url_or_id)
|
||||||
|
video_url_or_id = _normalize_xml_url(video_url_or_id)
|
||||||
|
|
||||||
video_thumb_url = (
|
video_thumb_url = (
|
||||||
video_thumb_url_or_id
|
video_thumb_url_or_id
|
||||||
if str(video_thumb_url_or_id or "").strip().lower().startswith(("http://", "https://"))
|
if str(video_thumb_url_or_id or "").strip().lower().startswith(("http://", "https://"))
|
||||||
@@ -542,6 +548,7 @@ def _append_full_messages_from_rows(
|
|||||||
emoji_url = _extract_xml_attr(raw_text, "cdnurl")
|
emoji_url = _extract_xml_attr(raw_text, "cdnurl")
|
||||||
if not emoji_url:
|
if not emoji_url:
|
||||||
emoji_url = _extract_xml_tag_text(raw_text, "cdn_url")
|
emoji_url = _extract_xml_tag_text(raw_text, "cdn_url")
|
||||||
|
emoji_url = _normalize_xml_url(emoji_url)
|
||||||
if (not emoji_md5) and resource_conn is not None:
|
if (not emoji_md5) and resource_conn is not None:
|
||||||
emoji_md5 = _lookup_resource_md5(
|
emoji_md5 = _lookup_resource_md5(
|
||||||
resource_conn,
|
resource_conn,
|
||||||
|
|||||||
Reference in New Issue
Block a user