mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-19 14:20:51 +08:00
fix(chat): 链接卡片补全公众号来源并解决缩略图防盗链
- appmsg 解析补全 from/fromUsername,并规范化 url/thumbUrl
- contact.db 兜底反查 fromUsername(仅有 sourcedisplayname 时)
- 新增 /api/chat/media/proxy_image,仅允许 qpic/qlogo,带 mp.weixin.qq.com Referer(10MB 限制)
- 前端 LinkCard 增加来源头像/host 兜底,qpic/qlogo 预览图走代理;头像加载失败回退
- 导出消息补充 from 字段
This commit is contained in:
@@ -894,6 +894,7 @@ def _parse_message_for_export(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
image_file_id = ""
|
||||
@@ -934,6 +935,7 @@ def _parse_message_for_export(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -1162,6 +1164,7 @@ def _parse_message_for_export(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"recordItem": record_item,
|
||||
"thumbUrl": thumb_url,
|
||||
"imageMd5": image_md5,
|
||||
|
||||
@@ -773,7 +773,21 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
||||
app_type = 0
|
||||
title = _extract_xml_tag_text(text, "title")
|
||||
des = _extract_xml_tag_text(text, "des")
|
||||
url = _extract_xml_tag_text(text, "url")
|
||||
url = _normalize_xml_url(_extract_xml_tag_text(text, "url"))
|
||||
|
||||
# Some appmsg payloads (notably mp.weixin.qq.com link shares) include a "source" block:
|
||||
# <sourceusername>gh_xxx</sourceusername>
|
||||
# <sourcedisplayname>公众号名</sourcedisplayname>
|
||||
# We'll surface that as `from` so the frontend can render the publisher line like WeChat.
|
||||
source_display_name = (
|
||||
_extract_xml_tag_text(text, "sourcedisplayname")
|
||||
or _extract_xml_tag_text(text, "sourceDisplayName")
|
||||
or _extract_xml_tag_text(text, "appname")
|
||||
)
|
||||
source_username = (
|
||||
_extract_xml_tag_text(text, "sourceusername")
|
||||
or _extract_xml_tag_text(text, "sourceUsername")
|
||||
)
|
||||
|
||||
lower = text.lower()
|
||||
|
||||
@@ -794,13 +808,15 @@ def _parse_app_message(text: str) -> dict[str, Any]:
|
||||
}
|
||||
|
||||
if app_type in (5, 68) and url:
|
||||
thumb_url = _extract_xml_tag_text(text, "thumburl")
|
||||
thumb_url = _normalize_xml_url(_extract_xml_tag_text(text, "thumburl"))
|
||||
return {
|
||||
"renderType": "link",
|
||||
"content": des or title or "[链接]",
|
||||
"title": title or des or "",
|
||||
"url": url,
|
||||
"thumbUrl": thumb_url or "",
|
||||
"from": str(source_display_name or "").strip(),
|
||||
"fromUsername": str(source_username or "").strip(),
|
||||
}
|
||||
|
||||
if app_type in (6, 74):
|
||||
@@ -1322,6 +1338,58 @@ def _load_contact_rows(contact_db_path: Path, usernames: list[str]) -> dict[str,
|
||||
conn.close()
|
||||
|
||||
|
||||
def _load_usernames_by_display_names(contact_db_path: Path, names: list[str]) -> dict[str, str]:
|
||||
"""Best-effort mapping from display name -> username using contact.db.
|
||||
|
||||
Some appmsg/link payloads only provide `sourcedisplayname` (surfaced as `from`) but not
|
||||
`sourceusername` (`fromUsername`). We use this mapping to recover `fromUsername` so the
|
||||
frontend can render the publisher avatar via `/api/chat/avatar`.
|
||||
"""
|
||||
|
||||
uniq = list(dict.fromkeys([str(n or "").strip() for n in names if str(n or "").strip()]))
|
||||
if not uniq:
|
||||
return {}
|
||||
|
||||
placeholders = ",".join(["?"] * len(uniq))
|
||||
hits: dict[str, set[str]] = {}
|
||||
|
||||
conn = sqlite3.connect(str(contact_db_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
try:
|
||||
def query_table(table: str) -> None:
|
||||
for col in ("remark", "nick_name", "alias"):
|
||||
sql = f"""
|
||||
SELECT username, {col} AS display_name
|
||||
FROM {table}
|
||||
WHERE {col} IN ({placeholders})
|
||||
"""
|
||||
try:
|
||||
rows = conn.execute(sql, uniq).fetchall()
|
||||
except Exception:
|
||||
rows = []
|
||||
for r in rows:
|
||||
try:
|
||||
dn = str(r["display_name"] or "").strip()
|
||||
u = str(r["username"] or "").strip()
|
||||
except Exception:
|
||||
continue
|
||||
if not dn or not u:
|
||||
continue
|
||||
hits.setdefault(dn, set()).add(u)
|
||||
|
||||
query_table("contact")
|
||||
query_table("stranger")
|
||||
|
||||
# Only return unambiguous mappings (display name -> exactly 1 username).
|
||||
out: dict[str, str] = {}
|
||||
for dn, users in hits.items():
|
||||
if len(users) == 1:
|
||||
out[dn] = next(iter(users))
|
||||
return out
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def _make_search_tokens(q: str) -> list[str]:
|
||||
tokens = [t for t in re.split(r"\s+", str(q or "").strip()) if t]
|
||||
if len(tokens) > 8:
|
||||
|
||||
@@ -39,6 +39,7 @@ from ..chat_helpers import (
|
||||
_make_snippet,
|
||||
_match_tokens,
|
||||
_load_contact_rows,
|
||||
_load_usernames_by_display_names,
|
||||
_load_latest_message_previews,
|
||||
_lookup_resource_md5,
|
||||
_normalize_xml_url,
|
||||
@@ -1519,6 +1520,8 @@ def _append_full_messages_from_rows(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
from_username = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
emoji_md5 = ""
|
||||
@@ -1561,6 +1564,8 @@ def _append_full_messages_from_rows(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
from_username = str(parsed.get("fromUsername") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -1781,6 +1786,7 @@ def _append_full_messages_from_rows(
|
||||
amount = str(parsed.get("amount") or amount)
|
||||
cover_url = str(parsed.get("coverUrl") or cover_url)
|
||||
thumb_url = str(parsed.get("thumbUrl") or thumb_url)
|
||||
from_name = str(parsed.get("from") or from_name)
|
||||
file_size = str(parsed.get("size") or file_size)
|
||||
pay_sub_type = str(parsed.get("paySubType") or pay_sub_type)
|
||||
file_md5 = str(parsed.get("fileMd5") or file_md5)
|
||||
@@ -1828,6 +1834,8 @@ def _append_full_messages_from_rows(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"fromUsername": from_username,
|
||||
"recordItem": record_item,
|
||||
"imageMd5": image_md5,
|
||||
"imageFileId": image_file_id,
|
||||
@@ -1949,13 +1957,42 @@ def _postprocess_full_messages(
|
||||
is_sent = m.get("isSent", False)
|
||||
m["transferStatus"] = "已收款" if is_sent else "已被接收"
|
||||
|
||||
# Some appmsg payloads provide only `from` (sourcedisplayname) but not `fromUsername` (sourceusername).
|
||||
# Recover `fromUsername` via contact.db so the frontend can render the publisher avatar.
|
||||
missing_from_names = [
|
||||
str(m.get("from") or "").strip()
|
||||
for m in merged
|
||||
if str(m.get("renderType") or "").strip() == "link"
|
||||
and str(m.get("from") or "").strip()
|
||||
and not str(m.get("fromUsername") or "").strip()
|
||||
]
|
||||
if missing_from_names:
|
||||
name_to_username = _load_usernames_by_display_names(contact_db_path, missing_from_names)
|
||||
if name_to_username:
|
||||
for m in merged:
|
||||
if str(m.get("fromUsername") or "").strip():
|
||||
continue
|
||||
if str(m.get("renderType") or "").strip() != "link":
|
||||
continue
|
||||
fn = str(m.get("from") or "").strip()
|
||||
if fn and fn in name_to_username:
|
||||
m["fromUsername"] = name_to_username[fn]
|
||||
|
||||
from_usernames = [str(m.get("fromUsername") or "").strip() for m in merged]
|
||||
uniq_senders = list(
|
||||
dict.fromkeys([u for u in (sender_usernames + list(pat_usernames) + quote_usernames) if u])
|
||||
dict.fromkeys([u for u in (sender_usernames + list(pat_usernames) + quote_usernames + from_usernames) if u])
|
||||
)
|
||||
sender_contact_rows = _load_contact_rows(contact_db_path, uniq_senders)
|
||||
local_sender_avatars = _query_head_image_usernames(head_image_db_path, uniq_senders)
|
||||
|
||||
for m in merged:
|
||||
# If appmsg doesn't provide sourcedisplayname, try mapping sourceusername to display name.
|
||||
if (not str(m.get("from") or "").strip()) and str(m.get("fromUsername") or "").strip():
|
||||
fu = str(m.get("fromUsername") or "").strip()
|
||||
frow = sender_contact_rows.get(fu)
|
||||
if frow is not None:
|
||||
m["from"] = _pick_display_name(frow, fu)
|
||||
|
||||
su = str(m.get("senderUsername") or "")
|
||||
if not su:
|
||||
continue
|
||||
@@ -2479,6 +2516,8 @@ def _collect_chat_messages(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
from_username = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
emoji_md5 = ""
|
||||
@@ -2523,6 +2562,8 @@ def _collect_chat_messages(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
from_username = str(parsed.get("fromUsername") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -2725,6 +2766,7 @@ def _collect_chat_messages(
|
||||
content_text = str(parsed.get("content") or content_text)
|
||||
title = str(parsed.get("title") or title)
|
||||
url = str(parsed.get("url") or url)
|
||||
from_name = str(parsed.get("from") or from_name)
|
||||
record_item = str(parsed.get("recordItem") or record_item)
|
||||
quote_title = str(parsed.get("quoteTitle") or quote_title)
|
||||
quote_content = str(parsed.get("quoteContent") or quote_content)
|
||||
@@ -2785,6 +2827,8 @@ def _collect_chat_messages(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"fromUsername": from_username,
|
||||
"recordItem": record_item,
|
||||
"imageMd5": image_md5,
|
||||
"imageFileId": image_file_id,
|
||||
@@ -3124,6 +3168,8 @@ async def list_chat_messages(
|
||||
content_text = raw_text
|
||||
title = ""
|
||||
url = ""
|
||||
from_name = ""
|
||||
from_username = ""
|
||||
record_item = ""
|
||||
image_md5 = ""
|
||||
emoji_md5 = ""
|
||||
@@ -3168,6 +3214,8 @@ async def list_chat_messages(
|
||||
content_text = str(parsed.get("content") or "")
|
||||
title = str(parsed.get("title") or "")
|
||||
url = str(parsed.get("url") or "")
|
||||
from_name = str(parsed.get("from") or "")
|
||||
from_username = str(parsed.get("fromUsername") or "")
|
||||
record_item = str(parsed.get("recordItem") or "")
|
||||
quote_title = str(parsed.get("quoteTitle") or "")
|
||||
quote_content = str(parsed.get("quoteContent") or "")
|
||||
@@ -3366,6 +3414,7 @@ async def list_chat_messages(
|
||||
content_text = str(parsed.get("content") or content_text)
|
||||
title = str(parsed.get("title") or title)
|
||||
url = str(parsed.get("url") or url)
|
||||
from_name = str(parsed.get("from") or from_name)
|
||||
record_item = str(parsed.get("recordItem") or record_item)
|
||||
quote_title = str(parsed.get("quoteTitle") or quote_title)
|
||||
quote_content = str(parsed.get("quoteContent") or quote_content)
|
||||
@@ -3419,6 +3468,8 @@ async def list_chat_messages(
|
||||
"content": content_text,
|
||||
"title": title,
|
||||
"url": url,
|
||||
"from": from_name,
|
||||
"fromUsername": from_username,
|
||||
"recordItem": record_item,
|
||||
"imageMd5": image_md5,
|
||||
"imageFileId": image_file_id,
|
||||
@@ -3546,15 +3597,44 @@ async def list_chat_messages(
|
||||
is_sent = m.get("isSent", False)
|
||||
m["transferStatus"] = "已收款" if is_sent else "已被接收"
|
||||
|
||||
# Some appmsg payloads provide only `from` (sourcedisplayname) but not `fromUsername` (sourceusername).
|
||||
# Recover `fromUsername` via contact.db so the frontend can render the publisher avatar.
|
||||
missing_from_names = [
|
||||
str(m.get("from") or "").strip()
|
||||
for m in merged
|
||||
if str(m.get("renderType") or "").strip() == "link"
|
||||
and str(m.get("from") or "").strip()
|
||||
and not str(m.get("fromUsername") or "").strip()
|
||||
]
|
||||
if missing_from_names:
|
||||
name_to_username = _load_usernames_by_display_names(contact_db_path, missing_from_names)
|
||||
if name_to_username:
|
||||
for m in merged:
|
||||
if str(m.get("fromUsername") or "").strip():
|
||||
continue
|
||||
if str(m.get("renderType") or "").strip() != "link":
|
||||
continue
|
||||
fn = str(m.get("from") or "").strip()
|
||||
if fn and fn in name_to_username:
|
||||
m["fromUsername"] = name_to_username[fn]
|
||||
|
||||
from_usernames = [str(m.get("fromUsername") or "").strip() for m in merged]
|
||||
uniq_senders = list(
|
||||
dict.fromkeys(
|
||||
[u for u in (sender_usernames + list(pat_usernames) + quote_usernames) if u]
|
||||
[u for u in (sender_usernames + list(pat_usernames) + quote_usernames + from_usernames) if u]
|
||||
)
|
||||
)
|
||||
sender_contact_rows = _load_contact_rows(contact_db_path, uniq_senders)
|
||||
local_sender_avatars = _query_head_image_usernames(head_image_db_path, uniq_senders)
|
||||
|
||||
for m in merged:
|
||||
# If appmsg doesn't provide sourcedisplayname, try mapping sourceusername to display name.
|
||||
if (not str(m.get("from") or "").strip()) and str(m.get("fromUsername") or "").strip():
|
||||
fu = str(m.get("fromUsername") or "").strip()
|
||||
frow = sender_contact_rows.get(fu)
|
||||
if frow is not None:
|
||||
m["from"] = _pick_display_name(frow, fu)
|
||||
|
||||
su = str(m.get("senderUsername") or "")
|
||||
if not su:
|
||||
continue
|
||||
|
||||
@@ -408,6 +408,91 @@ def _detect_media_type_and_ext(data: bytes) -> tuple[bytes, str, str]:
|
||||
return payload, media_type, ext
|
||||
|
||||
|
||||
def _is_allowed_proxy_image_host(host: str) -> bool:
|
||||
"""Allowlist hosts for proxying images to avoid turning this into a general SSRF gadget."""
|
||||
h = str(host or "").strip().lower()
|
||||
if not h:
|
||||
return False
|
||||
# WeChat public account/article thumbnails and avatars commonly live on these CDNs.
|
||||
return h.endswith(".qpic.cn") or h.endswith(".qlogo.cn")
|
||||
|
||||
|
||||
@router.get("/api/chat/media/proxy_image", summary="代理获取远程图片(解决微信公众号图片防盗链)")
async def proxy_image(url: str):
    """Fetch a remote image server-side and relay it to the browser.

    WeChat's CDNs (qpic.cn / qlogo.cn) enforce anti-hotlink checks, so the
    frontend cannot load article thumbnails directly.  This endpoint downloads
    the image while sending an mp.weixin.qq.com Referer and returns the bytes
    as a cacheable image response.

    Safeguards:
    - URL must be public http/https and its host on the qpic/qlogo allowlist.
    - The post-redirect host is re-checked against the same allowlist, so an
      allowed host cannot redirect the proxy to an arbitrary/internal address
      (SSRF-via-redirect hardening; requests follows redirects by default).
    - Download is capped at 10MB and the payload must look like an image.
    """
    # Thumb URLs extracted from XML are often HTML-escaped (&amp;) -- undo that.
    u = html.unescape(str(url or "")).strip()
    if not u:
        raise HTTPException(status_code=400, detail="Missing url.")
    if not _is_safe_http_url(u):
        raise HTTPException(status_code=400, detail="Invalid url (only public http/https allowed).")

    try:
        p = urlparse(u)
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid url.")

    host = (p.hostname or "").strip().lower()
    if not _is_allowed_proxy_image_host(host):
        raise HTTPException(status_code=400, detail="Unsupported url host for proxy_image.")

    def _download_bytes() -> tuple[bytes, str]:
        """Blocking download with a size cap; returns (body, upstream Content-Type)."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
            "Accept": "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
            # qpic/qlogo often require a mp.weixin.qq.com referer (anti-hotlink)
            "Referer": "https://mp.weixin.qq.com/",
            "Origin": "https://mp.weixin.qq.com",
        }
        r = requests.get(u, headers=headers, timeout=20, stream=True)
        try:
            r.raise_for_status()
            # Redirects were followed before we read the body; make sure the
            # final URL is still on the allowlist (prevents SSRF via redirect).
            final_host = (urlparse(str(r.url)).hostname or "").strip().lower()
            if not _is_allowed_proxy_image_host(final_host):
                raise HTTPException(status_code=400, detail="Unsupported redirect host for proxy_image.")
            content_type = str(r.headers.get("Content-Type") or "").strip()
            max_bytes = 10 * 1024 * 1024
            chunks: list[bytes] = []
            total = 0
            for ch in r.iter_content(chunk_size=64 * 1024):
                if not ch:
                    continue
                chunks.append(ch)
                total += len(ch)
                if total > max_bytes:
                    raise HTTPException(status_code=400, detail="Proxy image too large (>10MB).")
            return b"".join(chunks), content_type
        finally:
            try:
                r.close()
            except Exception:
                pass

    try:
        # Run the blocking requests call off the event loop.
        data, ct = await asyncio.to_thread(_download_bytes)
    except HTTPException:
        raise
    except Exception as e:
        logger.warning(f"proxy_image failed: url={u} err={e}")
        raise HTTPException(status_code=502, detail=f"Proxy image failed: {e}")

    if not data:
        raise HTTPException(status_code=502, detail="Proxy returned empty body.")

    payload, media_type, _ext = _detect_media_type_and_ext(data)

    # Prefer upstream Content-Type when it looks like an image (sniffing may fail for some formats).
    if media_type == "application/octet-stream" and ct:
        try:
            mt = ct.split(";")[0].strip()
            if mt.startswith("image/"):
                media_type = mt
        except Exception:
            pass

    if not str(media_type or "").startswith("image/"):
        raise HTTPException(status_code=502, detail="Proxy did not return an image.")

    resp = Response(content=payload, media_type=media_type)
    # Thumbnails are effectively immutable; let the browser cache for a day.
    resp.headers["Cache-Control"] = "public, max-age=86400"
    return resp
|
||||
|
||||
|
||||
@router.post("/api/chat/media/emoji/download", summary="下载表情消息资源到本地 resource")
|
||||
async def download_chat_emoji(req: EmojiDownloadRequest):
|
||||
md5 = str(req.md5 or "").strip().lower()
|
||||
|
||||
Reference in New Issue
Block a user