mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-19 14:20:51 +08:00
fix(chat): 链接卡片补全公众号来源并解决缩略图防盗链
- appmsg 解析补全 from/fromUsername,并规范化 url/thumbUrl
- contact.db 兜底反查 fromUsername(仅有 sourcedisplayname 时)
- 新增 /api/chat/media/proxy_image,仅允许 qpic/qlogo,带 mp.weixin.qq.com Referer(10MB 限制)
- 前端 LinkCard 增加来源头像/host 兜底,qpic/qlogo 预览图走代理;头像加载失败回退
- 导出消息补充 from 字段
This commit is contained in:
@@ -894,6 +894,7 @@ def _parse_message_for_export(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
image_file_id = ""
|
||||
@@ -934,6 +935,7 @@ def _parse_message_for_export(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -1162,6 +1164,7 @@ def _parse_message_for_export(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"recordItem": record_item,
|
||||
"thumbUrl": thumb_url,
|
||||
"imageMd5": image_md5,
|
||||
|
||||
@@ -773,7 +773,21 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
||||
app_type = 0
|
||||
title = _extract_xml_tag_text(text, "title")
|
||||
des = _extract_xml_tag_text(text, "des")
|
||||
url = _extract_xml_tag_text(text, "url")
|
||||
url = _normalize_xml_url(_extract_xml_tag_text(text, "url"))
|
||||
|
||||
# Some appmsg payloads (notably mp.weixin.qq.com link shares) include a "source" block:
|
||||
# <sourceusername>gh_xxx</sourceusername>
|
||||
# <sourcedisplayname>公众号名</sourcedisplayname>
|
||||
# We'll surface that as `from` so the frontend can render the publisher line like WeChat.
|
||||
source_display_name = (
|
||||
_extract_xml_tag_text(text, "sourcedisplayname")
|
||||
or _extract_xml_tag_text(text, "sourceDisplayName")
|
||||
or _extract_xml_tag_text(text, "appname")
|
||||
)
|
||||
source_username = (
|
||||
_extract_xml_tag_text(text, "sourceusername")
|
||||
or _extract_xml_tag_text(text, "sourceUsername")
|
||||
)
|
||||
|
||||
lower = text.lower()
|
||||
|
||||
@@ -794,13 +808,15 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
if app_type in (5, 68) and url:
|
||||
thumb_url = _extract_xml_tag_text(text, "thumburl")
|
||||
thumb_url = _normalize_xml_url(_extract_xml_tag_text(text, "thumburl"))
|
||||
return {
|
||||
"renderType": "link",
|
||||
"content": des or title or "[链接]",
|
||||
"title": title or des or "",
|
||||
"url": url,
|
||||
"thumbUrl": thumb_url or "",
|
||||
"from": str(source_display_name or "").strip(),
|
||||
"fromUsername": str(source_username or "").strip(),
|
||||
}
|
||||
|
||||
if app_type in (6, 74):
|
||||
@@ -1322,6 +1338,58 @@ def _load_contact_rows(contact_db_path: Path, usernames: list[str]) -> dict[str,
|
||||
conn.close()
|
||||
|
||||
|
||||
def _load_usernames_by_display_names(contact_db_path: Path, names: list[str]) -> dict[str, str]:
|
||||
"""Best-effort mapping from display name -> username using contact.db.
|
||||
|
||||
Some appmsg/link payloads only provide `sourcedisplayname` (surfaced as `from`) but not
|
||||
`sourceusername` (`fromUsername`). We use this mapping to recover `fromUsername` so the
|
||||
frontend can render the publisher avatar via `/api/chat/avatar`.
|
||||
"""
|
||||
|
||||
uniq = list(dict.fromkeys([str(n or "").strip() for n in names if str(n or "").strip()]))
|
||||
if not uniq:
|
||||
return {}
|
||||
|
||||
placeholders = ",".join(["?"] * len(uniq))
|
||||
hits: dict[str, set[str]] = {}
|
||||
|
||||
conn = sqlite3.connect(str(contact_db_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
try:
|
||||
def query_table(table: str) -> None:
|
||||
for col in ("remark", "nick_name", "alias"):
|
||||
sql = f"""
|
||||
SELECT username, {col} AS display_name
|
||||
FROM {table}
|
||||
WHERE {col} IN ({placeholders})
|
||||
"""
|
||||
try:
|
||||
rows = conn.execute(sql, uniq).fetchall()
|
||||
except Exception:
|
||||
rows = []
|
||||
for r in rows:
|
||||
try:
|
||||
dn = str(r["display_name"] or "").strip()
|
||||
u = str(r["username"] or "").strip()
|
||||
except Exception:
|
||||
continue
|
||||
if not dn or not u:
|
||||
continue
|
||||
hits.setdefault(dn, set()).add(u)
|
||||
|
||||
query_table("contact")
|
||||
query_table("stranger")
|
||||
|
||||
# Only return unambiguous mappings (display name -> exactly 1 username).
|
||||
out: dict[str, str] = {}
|
||||
for dn, users in hits.items():
|
||||
if len(users) == 1:
|
||||
out[dn] = next(iter(users))
|
||||
return out
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _make_search_tokens(q: str) -> list[str]:
|
||||
tokens = [t for t in re.split(r"\s+", str(q or "").strip()) if t]
|
||||
if len(tokens) > 8:
|
||||
|
||||
@@ -39,6 +39,7 @@ from ..chat_helpers import (
|
||||
_make_snippet,
|
||||
_match_tokens,
|
||||
_load_contact_rows,
|
||||
_load_usernames_by_display_names,
|
||||
_load_latest_message_previews,
|
||||
_lookup_resource_md5,
|
||||
_normalize_xml_url,
|
||||
@@ -1519,6 +1520,8 @@ def _append_full_messages_from_rows(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
from_username = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
emoji_md5 = ""
|
||||
@@ -1561,6 +1564,8 @@ def _append_full_messages_from_rows(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
from_username = str(parsed.get("fromUsername") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -1781,6 +1786,7 @@ def _append_full_messages_from_rows(
|
||||
amount = str(parsed.get("amount") or amount)
|
||||
cover_url = str(parsed.get("coverUrl") or cover_url)
|
||||
thumb_url = str(parsed.get("thumbUrl") or thumb_url)
|
||||
from_name = str(parsed.get("from") or from_name)
|
||||
file_size = str(parsed.get("size") or file_size)
|
||||
pay_sub_type = str(parsed.get("paySubType") or pay_sub_type)
|
||||
file_md5 = str(parsed.get("fileMd5") or file_md5)
|
||||
@@ -1828,6 +1834,8 @@ def _append_full_messages_from_rows(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"fromUsername": from_username,
|
||||
"recordItem": record_item,
|
||||
"imageMd5": image_md5,
|
||||
"imageFileId": image_file_id,
|
||||
@@ -1949,13 +1957,42 @@ def _postprocess_full_messages(
|
||||
is_sent = m.get("isSent", False)
|
||||
m["transferStatus"] = "已收款" if is_sent else "已被接收"
|
||||
|
||||
# Some appmsg payloads provide only `from` (sourcedisplayname) but not `fromUsername` (sourceusername).
|
||||
# Recover `fromUsername` via contact.db so the frontend can render the publisher avatar.
|
||||
missing_from_names = [
|
||||
str(m.get("from") or "").strip()
|
||||
for m in merged
|
||||
if str(m.get("renderType") or "").strip() == "link"
|
||||
and str(m.get("from") or "").strip()
|
||||
and not str(m.get("fromUsername") or "").strip()
|
||||
]
|
||||
if missing_from_names:
|
||||
name_to_username = _load_usernames_by_display_names(contact_db_path, missing_from_names)
|
||||
if name_to_username:
|
||||
for m in merged:
|
||||
if str(m.get("fromUsername") or "").strip():
|
||||
continue
|
||||
if str(m.get("renderType") or "").strip() != "link":
|
||||
continue
|
||||
fn = str(m.get("from") or "").strip()
|
||||
if fn and fn in name_to_username:
|
||||
m["fromUsername"] = name_to_username[fn]
|
||||
|
||||
from_usernames = [str(m.get("fromUsername") or "").strip() for m in merged]
|
||||
uniq_senders = list(
|
||||
dict.fromkeys([u for u in (sender_usernames + list(pat_usernames) + quote_usernames) if u])
|
||||
dict.fromkeys([u for u in (sender_usernames + list(pat_usernames) + quote_usernames + from_usernames) if u])
|
||||
)
|
||||
sender_contact_rows = _load_contact_rows(contact_db_path, uniq_senders)
|
||||
local_sender_avatars = _query_head_image_usernames(head_image_db_path, uniq_senders)
|
||||
|
||||
for m in merged:
|
||||
# If appmsg doesn't provide sourcedisplayname, try mapping sourceusername to display name.
|
||||
if (not str(m.get("from") or "").strip()) and str(m.get("fromUsername") or "").strip():
|
||||
fu = str(m.get("fromUsername") or "").strip()
|
||||
frow = sender_contact_rows.get(fu)
|
||||
if frow is not None:
|
||||
m["from"] = _pick_display_name(frow, fu)
|
||||
|
||||
su = str(m.get("senderUsername") or "")
|
||||
if not su:
|
||||
continue
|
||||
@@ -2479,6 +2516,8 @@ def _collect_chat_messages(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
from_username = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
emoji_md5 = ""
|
||||
@@ -2523,6 +2562,8 @@ def _collect_chat_messages(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
from_username = str(parsed.get("fromUsername") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -2725,6 +2766,7 @@ def _collect_chat_messages(
|
||||
content_text = str(parsed.get("content") or content_text)
|
||||
title = str(parsed.get("title") or title)
|
||||
url = str(parsed.get("url") or url)
|
||||
from_name = str(parsed.get("from") or from_name)
|
||||
record_item = str(parsed.get("recordItem") or record_item)
|
||||
quote_title = str(parsed.get("quoteTitle") or quote_title)
|
||||
quote_content = str(parsed.get("quoteContent") or quote_content)
|
||||
@@ -2785,6 +2827,8 @@ def _collect_chat_messages(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"fromUsername": from_username,
|
||||
"recordItem": record_item,
|
||||
"imageMd5": image_md5,
|
||||
"imageFileId": image_file_id,
|
||||
@@ -3124,6 +3168,8 @@ async def list_chat_messages(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
from_username = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
emoji_md5 = ""
|
||||
@@ -3168,6 +3214,8 @@ async def list_chat_messages(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
from_username = str(parsed.get("fromUsername") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -3366,6 +3414,7 @@ async def list_chat_messages(
|
||||
content_text = str(parsed.get("content") or content_text)
|
||||
title = str(parsed.get("title") or title)
|
||||
url = str(parsed.get("url") or url)
|
||||
from_name = str(parsed.get("from") or from_name)
|
||||
record_item = str(parsed.get("recordItem") or record_item)
|
||||
quote_title = str(parsed.get("quoteTitle") or quote_title)
|
||||
quote_content = str(parsed.get("quoteContent") or quote_content)
|
||||
@@ -3419,6 +3468,8 @@ async def list_chat_messages(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"fromUsername": from_username,
|
||||
"recordItem": record_item,
|
||||
"imageMd5": image_md5,
|
||||
"imageFileId": image_file_id,
|
||||
@@ -3546,15 +3597,44 @@ async def list_chat_messages(
|
||||
is_sent = m.get("isSent", False)
|
||||
m["transferStatus"] = "已收款" if is_sent else "已被接收"
|
||||
|
||||
# Some appmsg payloads provide only `from` (sourcedisplayname) but not `fromUsername` (sourceusername).
|
||||
# Recover `fromUsername` via contact.db so the frontend can render the publisher avatar.
|
||||
missing_from_names = [
|
||||
str(m.get("from") or "").strip()
|
||||
for m in merged
|
||||
if str(m.get("renderType") or "").strip() == "link"
|
||||
and str(m.get("from") or "").strip()
|
||||
and not str(m.get("fromUsername") or "").strip()
|
||||
]
|
||||
if missing_from_names:
|
||||
name_to_username = _load_usernames_by_display_names(contact_db_path, missing_from_names)
|
||||
if name_to_username:
|
||||
for m in merged:
|
||||
if str(m.get("fromUsername") or "").strip():
|
||||
continue
|
||||
if str(m.get("renderType") or "").strip() != "link":
|
||||
continue
|
||||
fn = str(m.get("from") or "").strip()
|
||||
if fn and fn in name_to_username:
|
||||
m["fromUsername"] = name_to_username[fn]
|
||||
|
||||
from_usernames = [str(m.get("fromUsername") or "").strip() for m in merged]
|
||||
uniq_senders = list(
|
||||
dict.fromkeys(
|
||||
[u for u in (sender_usernames + list(pat_usernames) + quote_usernames) if u]
|
||||
[u for u in (sender_usernames + list(pat_usernames) + quote_usernames + from_usernames) if u]
|
||||
)
|
||||
)
|
||||
sender_contact_rows = _load_contact_rows(contact_db_path, uniq_senders)
|
||||
local_sender_avatars = _query_head_image_usernames(head_image_db_path, uniq_senders)
|
||||
|
||||
for m in merged:
|
||||
# If appmsg doesn't provide sourcedisplayname, try mapping sourceusername to display name.
|
||||
if (not str(m.get("from") or "").strip()) and str(m.get("fromUsername") or "").strip():
|
||||
fu = str(m.get("fromUsername") or "").strip()
|
||||
frow = sender_contact_rows.get(fu)
|
||||
if frow is not None:
|
||||
m["from"] = _pick_display_name(frow, fu)
|
||||
|
||||
su = str(m.get("senderUsername") or "")
|
||||
if not su:
|
||||
continue
|
||||
|
||||
@@ -408,6 +408,91 @@ def _detect_media_type_and_ext(data: bytes) -> tuple[bytes, str, str]:
|
||||
return payload, media_type, ext
|
||||
|
||||
|
||||
def _is_allowed_proxy_image_host(host: str) -> bool:
|
||||
"""Allowlist hosts for proxying images to avoid turning this into a general SSRF gadget."""
|
||||
h = str(host or "").strip().lower()
|
||||
if not h:
|
||||
return False
|
||||
# WeChat public account/article thumbnails and avatars commonly live on these CDNs.
|
||||
return h.endswith(".qpic.cn") or h.endswith(".qlogo.cn")
|
||||
|
||||
|
||||
@router.get("/api/chat/media/proxy_image", summary="代理获取远程图片(解决微信公众号图片防盗链)")
async def proxy_image(url: str):
    """Fetch a remote image server-side and relay it to the browser.

    WeChat's CDNs (qpic.cn / qlogo.cn) enforce anti-hotlink checks, so the
    frontend cannot load article thumbnails directly.  This endpoint downloads
    the image while sending an mp.weixin.qq.com Referer and returns the bytes
    as a cacheable image response.

    Safeguards:
    - URL must be public http/https and its host on the qpic/qlogo allowlist.
    - The post-redirect host is re-checked against the same allowlist, so an
      allowed host cannot redirect the proxy to an arbitrary/internal address
      (SSRF-via-redirect hardening; requests follows redirects by default).
    - Download is capped at 10MB and the payload must look like an image.
    """
    # Thumb URLs extracted from XML are often HTML-escaped (&amp;) -- undo that.
    u = html.unescape(str(url or "")).strip()
    if not u:
        raise HTTPException(status_code=400, detail="Missing url.")
    if not _is_safe_http_url(u):
        raise HTTPException(status_code=400, detail="Invalid url (only public http/https allowed).")

    try:
        p = urlparse(u)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid url.")

    host = (p.hostname or "").strip().lower()
    if not _is_allowed_proxy_image_host(host):
        raise HTTPException(status_code=400, detail="Unsupported url host for proxy_image.")

    def _download_bytes() -> tuple[bytes, str]:
        """Blocking download with a size cap; returns (body, upstream Content-Type)."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
            "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
            # qpic/qlogo often require a mp.weixin.qq.com referer (anti-hotlink)
            "Referer": "https://mp.weixin.qq.com/",
            "Origin": "https://mp.weixin.qq.com",
        }
        r = requests.get(u, headers=headers, timeout=20, stream=True)
        try:
            r.raise_for_status()
            # Redirects were followed before we read the body; make sure the
            # final URL is still on the allowlist (prevents SSRF via redirect).
            final_host = (urlparse(str(r.url)).hostname or "").strip().lower()
            if not _is_allowed_proxy_image_host(final_host):
                raise HTTPException(status_code=400, detail="Unsupported redirect host for proxy_image.")
            content_type = str(r.headers.get("Content-Type") or "").strip()
            max_bytes = 10 * 1024 * 1024
            chunks: list[bytes] = []
            total = 0
            for ch in r.iter_content(chunk_size=64 * 1024):
                if not ch:
                    continue
                chunks.append(ch)
                total += len(ch)
                if total > max_bytes:
                    raise HTTPException(status_code=400, detail="Proxy image too large (>10MB).")
            return b"".join(chunks), content_type
        finally:
            try:
                r.close()
            except Exception:
                pass

    try:
        # Run the blocking requests call off the event loop.
        data, ct = await asyncio.to_thread(_download_bytes)
    except HTTPException:
        raise
    except Exception as e:
        logger.warning(f"proxy_image failed: url={u} err={e}")
        raise HTTPException(status_code=502, detail=f"Proxy image failed: {e}")

    if not data:
        raise HTTPException(status_code=502, detail="Proxy returned empty body.")

    payload, media_type, _ext = _detect_media_type_and_ext(data)

    # Prefer upstream Content-Type when it looks like an image (sniffing may fail for some formats).
    if media_type == "application/octet-stream" and ct:
        try:
            mt = ct.split(";")[0].strip()
            if mt.startswith("image/"):
                media_type = mt
        except Exception:
            pass

    if not str(media_type or "").startswith("image/"):
        raise HTTPException(status_code=502, detail="Proxy did not return an image.")

    resp = Response(content=payload, media_type=media_type)
    # Thumbnails are effectively immutable; let the browser cache for a day.
    resp.headers["Cache-Control"] = "public, max-age=86400"
    return resp
|
||||
|
||||
|
||||
@router.post("/api/chat/media/emoji/download", summary="下载表情消息资源到本地 resource")
|
||||
async def download_chat_emoji(req: EmojiDownloadRequest):
|
||||
md5 = str(req.md5 or "").strip().lower()
|
||||
|
||||
Reference in New Issue
Block a user