Mirror of https://github.com/LifeArchiveProject/WeChatDataAnalysis.git (synced 2026-02-02 22:10:50 +08:00)
improvement(media): improve image/emoji decryption stability and self-heal the resource cache
- Add image validity checks and sharpen JPEG/WebP detection to reduce false positives during XOR decryption
- Try multiple image resource candidates ordered by variant (b/h/c/t) and size to improve the hit rate
- Automatically delete and rebuild corrupted cached files (applies to both single-image fetch and batch decryption)
- When local emoji decryption fails, fall back to a safe download via emoticon.db, with AES-CBC decryption support (a minimal sketch of that scheme follows below)
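For context on the last point: the fallback downloads the encrypted payload referenced by emoticon.db and decrypts it with AES-128-CBC, where the 16-byte key comes from the table's aes_key hex string, the IV equals the key, and the padding is PKCS7 (see _decrypt_emoticon_aes_cbc in the diff below). A minimal standalone sketch of that scheme, assuming pycryptodome is installed; the helper name here is illustrative and not part of the codebase:

    # Illustrative sketch only: mirrors the AES-CBC scheme used by
    # _decrypt_emoticon_aes_cbc below (key = bytes.fromhex(aes_key_hex),
    # IV = key, PKCS7 padding). Requires pycryptodome.
    from Crypto.Cipher import AES
    from Crypto.Util import Padding

    def decrypt_emoticon_payload(payload: bytes, aes_key_hex: str) -> bytes | None:
        key = bytes.fromhex(aes_key_hex)
        if len(key) != 16 or not payload or len(payload) % 16 != 0:
            return None  # not a 16-byte key, or ciphertext not block-aligned
        try:
            padded = AES.new(key, AES.MODE_CBC, iv=key).decrypt(payload)
            return Padding.unpad(padded, AES.block_size)
        except Exception:
            return None

If decryption fails, the raw payload is kept and tried directly, matching the candidate order used by _try_fetch_emoticon_from_remote in the diff.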
@@ -2,6 +2,7 @@ import ctypes
 import datetime
 import glob
 import hashlib
+import ipaddress
 import json
 import mimetypes
 import os
@@ -11,6 +12,7 @@ import struct
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Optional
+from urllib.parse import urlparse
 
 from fastapi import HTTPException
 
@@ -74,15 +76,391 @@ def _detect_image_media_type(data: bytes) -> str:
 
     if data.startswith(b"\x89PNG\r\n\x1a\n"):
         return "image/png"
-    if data.startswith(b"\xff\xd8\xff"):
-        return "image/jpeg"
+    if data.startswith(b"\xff\xd8\xff") and len(data) >= 4:
+        marker = data[3]
+        # Most JPEG marker types are in 0xC0..0xFE (APP, SOF, DQT, DHT, SOS, COM, etc.).
+        # This avoids false positives where random bytes start with 0xFFD8FF.
+        if marker not in (0x00, 0xFF) and marker >= 0xC0:
+            return "image/jpeg"
     if data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
         return "image/gif"
-    if data.startswith(b"RIFF") and data[8:12] == b"WEBP":
+    if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WEBP":
         return "image/webp"
     return "application/octet-stream"
 
 
+def _is_probably_valid_image(data: bytes, media_type: str) -> bool:
+    """Heuristic validation to reduce false positives when guessing XOR keys.
+
+    We keep it lightweight (no full parsing), only checking common trailers.
+    """
+    if not data:
+        return False
+
+    mt = str(media_type or "").strip().lower()
+    if not mt.startswith("image/"):
+        return False
+
+    if mt == "image/jpeg":
+        if _detect_image_media_type(data[:32]) != "image/jpeg":
+            return False
+        trimmed = data.rstrip(b"\x00")
+        if len(trimmed) < 4 or not trimmed.startswith(b"\xff\xd8\xff"):
+            return False
+        if trimmed.endswith(b"\xff\xd9"):
+            return True
+        tail = trimmed[-4096:] if len(trimmed) > 4096 else trimmed
+        i = tail.rfind(b"\xff\xd9")
+        return i >= 0 and i >= len(tail) - 64 - 2
+
+    if mt == "image/png":
+        if not data.startswith(b"\x89PNG\r\n\x1a\n"):
+            return False
+        trailer = b"\x00\x00\x00\x00IEND\xaeB`\x82"
+        trimmed = data.rstrip(b"\x00")
+        if trimmed.endswith(trailer):
+            return True
+        tail = trimmed[-256:] if len(trimmed) > 256 else trimmed
+        i = tail.rfind(trailer)
+        return i >= 0 and i >= len(tail) - 64 - len(trailer)
+
+    if mt == "image/gif":
+        if not (data.startswith(b"GIF87a") or data.startswith(b"GIF89a")):
+            return False
+        trimmed = data.rstrip(b"\x00")
+        if trimmed.endswith(b"\x3B"):
+            return True
+        tail = trimmed[-256:] if len(trimmed) > 256 else trimmed
+        i = tail.rfind(b"\x3B")
+        return i >= 0 and i >= len(tail) - 16 - 1
+
+    if mt == "image/webp":
+        if len(data) < 12:
+            return False
+        return bool(data.startswith(b"RIFF") and data[8:12] == b"WEBP")
+
+    # Unknown image types: fall back to header-only check.
+    return _detect_image_media_type(data[:32]) != "application/octet-stream"
+
+
+def _normalize_variant_basename(name: str) -> str:
+    """Normalize a media filename stem by stripping common variant suffixes.
+
+    Mirrors echotrace's idea of normalizing `.t/.h/.b/.c` and `_t/_h/_b/_c`.
+    """
+    v = str(name or "").strip()
+    if not v:
+        return ""
+    lower = v.lower()
+    for suf in ("_b", "_h", "_c", "_t", ".b", ".h", ".c", ".t"):
+        if lower.endswith(suf) and len(lower) > len(suf):
+            return lower[: -len(suf)]
+    return lower
+
+
+def _variant_rank(name: str) -> int:
+    """Ordering used when trying multiple candidate resources.
+
+    Prefer: big > high > original > cache > thumb.
+    """
+    n = str(name or "").lower()
+    if n.endswith(("_b", ".b")):
+        return 0
+    if n.endswith(("_h", ".h")):
+        return 1
+    if n.endswith(("_c", ".c")):
+        return 3
+    if n.endswith(("_t", ".t")):
+        return 4
+    return 2
+
+
+def _iter_media_source_candidates(source: Path, *, limit: int = 30) -> list[Path]:
+    """Yield sibling variant files around a resolved source path.
+
+    This is a lightweight approximation of echotrace's \"search many .dat variants then try them\".
+    """
+    if not source:
+        return []
+
+    try:
+        if not source.exists():
+            return []
+    except Exception:
+        return []
+
+    try:
+        if source.is_dir():
+            return []
+    except Exception:
+        return []
+
+    out: list[Path] = []
+    try:
+        out.append(source.resolve())
+    except Exception:
+        out.append(source)
+
+    parent = source.parent
+    stem = str(source.stem or "")
+    base = _normalize_variant_basename(stem)
+    if not base:
+        return out
+
+    preferred_names = [
+        f"{base}_b.dat",
+        f"{base}_h.dat",
+        f"{base}.dat",
+        f"{base}_c.dat",
+        f"{base}_t.dat",
+        f"{base}.b.dat",
+        f"{base}.h.dat",
+        f"{base}.c.dat",
+        f"{base}.t.dat",
+        f"{base}.gif",
+        f"{base}.webp",
+        f"{base}.png",
+        f"{base}.jpg",
+        f"{base}.jpeg",
+    ]
+
+    for name in preferred_names:
+        p = parent / name
+        try:
+            if p.exists() and p.is_file():
+                out.append(p.resolve())
+        except Exception:
+            continue
+
+    # Add any other local .dat siblings with the same normalized base (limit to avoid explosion).
+    try:
+        for p in parent.glob(f"{base}*.dat"):
+            try:
+                if p.exists() and p.is_file():
+                    out.append(p.resolve())
+            except Exception:
+                continue
+            if len(out) >= int(limit):
+                break
+    except Exception:
+        pass
+
+    # De-dup while keeping order.
+    seen: set[str] = set()
+    uniq: list[Path] = []
+    for p in out:
+        try:
+            k = str(p.resolve())
+        except Exception:
+            k = str(p)
+        if k in seen:
+            continue
+        seen.add(k)
+        uniq.append(p)
+    return uniq
+
+
+def _order_media_candidates(paths: list[Path]) -> list[Path]:
+    """Sort candidate files similar to echotrace's variant preference + size heuristic."""
+    def _stat(p: Path) -> tuple[int, float]:
+        try:
+            st = p.stat()
+            return int(st.st_size), float(st.st_mtime)
+        except Exception:
+            return 0, 0.0
+
+    def key(p: Path) -> tuple[int, int, int, float, str]:
+        name = str(p.stem or "").lower()
+        rank = _variant_rank(name)
+        ext = str(p.suffix or "").lower()
+        # Prefer already-decoded formats (non-.dat) within the same variant rank.
+        ext_penalty = 1 if ext == ".dat" else 0
+        size, mtime = _stat(p)
+        return (rank, ext_penalty, -size, -mtime, str(p))
+
+    try:
+        return sorted(list(paths or []), key=key)
+    except Exception:
+        return list(paths or [])
+
+
+def _is_safe_http_url(url: str) -> bool:
+    u = str(url or "").strip()
+    if not u:
+        return False
+    try:
+        p = urlparse(u)
+    except Exception:
+        return False
+    if p.scheme not in ("http", "https"):
+        return False
+    host = (p.hostname or "").strip()
+    if not host:
+        return False
+    if host in {"localhost"}:
+        return False
+    try:
+        ip = ipaddress.ip_address(host)
+        if ip.is_private or ip.is_loopback or ip.is_link_local:
+            return False
+    except Exception:
+        pass
+    return True
+
+
+def _download_http_bytes(url: str, *, timeout: int = 20, max_bytes: int = 30 * 1024 * 1024) -> bytes:
+    if not _is_safe_http_url(url):
+        raise HTTPException(status_code=400, detail="Unsafe URL.")
+
+    try:
+        import requests
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"requests not available: {e}")
+
+    try:
+        with requests.get(url, stream=True, timeout=timeout) as r:
+            r.raise_for_status()
+            try:
+                cl = int(r.headers.get("content-length") or 0)
+                if cl and cl > int(max_bytes):
+                    raise HTTPException(status_code=413, detail="Remote file too large.")
+            except HTTPException:
+                raise
+            except Exception:
+                pass
+
+            chunks: list[bytes] = []
+            total = 0
+            for chunk in r.iter_content(chunk_size=256 * 1024):
+                if not chunk:
+                    continue
+                chunks.append(chunk)
+                total += len(chunk)
+                if total > int(max_bytes):
+                    raise HTTPException(status_code=413, detail="Remote file too large.")
+            return b"".join(chunks)
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Download failed: {e}")
+
+
+def _decrypt_emoticon_aes_cbc(data: bytes, aes_key_hex: str) -> Optional[bytes]:
+    """Decrypt WeChat emoticon payload from kNonStoreEmoticonTable.encrypt_url.
+
+    Observed scheme (WeChat 4.x):
+    - key = bytes.fromhex(aes_key_hex) (16 bytes)
+    - iv = key
+    - cipher = AES-128-CBC
+    - padding = PKCS7
+    """
+    if not data:
+        return None
+    if len(data) % 16 != 0:
+        return None
+
+    khex = str(aes_key_hex or "").strip().lower()
+    if not re.fullmatch(r"[0-9a-f]{32}", khex):
+        return None
+
+    try:
+        key = bytes.fromhex(khex)
+        if len(key) != 16:
+            return None
+    except Exception:
+        return None
+
+    try:
+        from Crypto.Cipher import AES
+        from Crypto.Util import Padding
+
+        pt_padded = AES.new(key, AES.MODE_CBC, iv=key).decrypt(data)
+        pt = Padding.unpad(pt_padded, AES.block_size)
+        return pt
+    except Exception:
+        return None
+
+
+@lru_cache(maxsize=2048)
+def _lookup_emoticon_info(account_dir_str: str, md5: str) -> dict[str, str]:
+    account_dir = Path(account_dir_str)
+    md5s = str(md5 or "").strip().lower()
+    if not md5s:
+        return {}
+
+    db_path = account_dir / "emoticon.db"
+    if not db_path.exists():
+        return {}
+
+    conn = sqlite3.connect(str(db_path))
+    conn.row_factory = sqlite3.Row
+    try:
+        row = conn.execute(
+            "SELECT md5, aes_key, cdn_url, encrypt_url, extern_url, thumb_url, tp_url "
+            "FROM kNonStoreEmoticonTable WHERE lower(md5) = lower(?) LIMIT 1",
+            (md5s,),
+        ).fetchone()
+        if not row:
+            return {}
+        return {k: str(row[k] or "") for k in row.keys()}
+    except Exception:
+        return {}
+    finally:
+        try:
+            conn.close()
+        except Exception:
+            pass
+
+
+def _try_fetch_emoticon_from_remote(account_dir: Path, md5: str) -> tuple[Optional[bytes], Optional[str]]:
+    info = _lookup_emoticon_info(str(account_dir), str(md5 or "").lower())
+    if not info:
+        return None, None
+
+    aes_key_hex = str(info.get("aes_key") or "").strip()
+    urls: list[str] = []
+    # Prefer plain CDN URL first; fall back to encrypt_url (needs AES-CBC decrypt).
+    for k in ("cdn_url", "extern_url", "thumb_url", "tp_url", "encrypt_url"):
+        u = str(info.get(k) or "").strip()
+        if u and _is_safe_http_url(u):
+            urls.append(u)
+
+    for url in urls:
+        try:
+            payload = _download_http_bytes(url)
+        except Exception:
+            continue
+
+        candidates: list[bytes] = [payload]
+        dec = _decrypt_emoticon_aes_cbc(payload, aes_key_hex)
+        if dec is not None:
+            candidates.insert(0, dec)
+
+        for data in candidates:
+            if not data:
+                continue
+            try:
+                data2, mt = _try_strip_media_prefix(data)
+            except Exception:
+                data2, mt = data, "application/octet-stream"
+
+            if mt == "application/octet-stream":
+                mt = _detect_image_media_type(data2[:32])
+            if mt == "application/octet-stream":
+                try:
+                    if len(data2) >= 8 and data2[4:8] == b"ftyp":
+                        mt = "video/mp4"
+                except Exception:
+                    pass
+
+            if mt.startswith("image/") and (not _is_probably_valid_image(data2, mt)):
+                continue
+            if mt != "application/octet-stream":
+                return data2, mt
+
+    return None, None
+
+
 class _WxAMConfig(ctypes.Structure):
     _fields_ = [
         ("mode", ctypes.c_int),
@@ -191,7 +569,7 @@ def _try_strip_media_prefix(data: bytes) -> tuple[bytes, str]:
     if j >= 0 and j <= 128 * 1024:
         sliced = data[j:]
         mt2 = _detect_image_media_type(sliced[:32])
-        if mt2 != "application/octet-stream":
+        if mt2 != "application/octet-stream" and _is_probably_valid_image(sliced, mt2):
             return sliced, mt2
 
     try:
@@ -363,7 +741,7 @@ def _resolve_media_path_from_hardlink(
     quoted = _quote_ident(table_name)
     try:
         row = conn.execute(
-            f"SELECT dir1, dir2, file_name, modify_time FROM {quoted} WHERE md5 = ? ORDER BY modify_time DESC LIMIT 1",
+            f"SELECT dir1, dir2, file_name, modify_time FROM {quoted} WHERE md5 = ? ORDER BY modify_time DESC, dir1 DESC, rowid DESC LIMIT 1",
             (md5,),
         ).fetchone()
     except Exception:
@@ -917,9 +1295,10 @@ def _try_xor_decrypt_by_magic(data: bytes) -> tuple[Optional[bytes], Optional[st
     # (offset, magic, media_type)
     candidates: list[tuple[int, bytes, str]] = [
         (0, b"\x89PNG\r\n\x1a\n", "image/png"),
-        (0, b"\xff\xd8\xff", "image/jpeg"),
         (0, b"GIF87a", "image/gif"),
         (0, b"GIF89a", "image/gif"),
+        (0, b"RIFF", "application/octet-stream"),
+        (4, b"ftyp", "video/mp4"),
         (0, b"wxgf", "application/octet-stream"),
         (1, b"wxgf", "application/octet-stream"),
         (2, b"wxgf", "application/octet-stream"),
@@ -936,8 +1315,8 @@ def _try_xor_decrypt_by_magic(data: bytes) -> tuple[Optional[bytes], Optional[st
         (13, b"wxgf", "application/octet-stream"),
         (14, b"wxgf", "application/octet-stream"),
         (15, b"wxgf", "application/octet-stream"),
-        (0, b"RIFF", "application/octet-stream"),
-        (4, b"ftyp", "video/mp4"),
+        # JPEG magic is short (3 bytes), keep it last to reduce false positives.
+        (0, b"\xff\xd8\xff", "image/jpeg"),
     ]
 
     for offset, magic, mt in candidates:
@@ -968,16 +1347,24 @@ def _try_xor_decrypt_by_magic(data: bytes) -> tuple[Optional[bytes], Optional[st
 
         if offset == 0 and magic == b"RIFF":
             if len(decoded) >= 12 and decoded[8:12] == b"WEBP":
-                return decoded, "image/webp"
+                if _is_probably_valid_image(decoded, "image/webp"):
+                    return decoded, "image/webp"
             continue
 
-        if mt == "application/octet-stream":
-            mt2 = _detect_image_media_type(decoded[:32])
-            if mt2 != "application/octet-stream":
-                return decoded, mt2
+        if mt == "video/mp4":
+            try:
+                if len(decoded) >= 8 and decoded[4:8] == b"ftyp":
+                    return decoded, "video/mp4"
+            except Exception:
+                pass
             continue
 
-        return decoded, mt
+        mt2 = _detect_image_media_type(decoded[:32])
+        if mt2 != mt:
+            continue
+        if not _is_probably_valid_image(decoded, mt2):
+            continue
+        return decoded, mt2
 
     preview_len = 8192
     try:
@@ -1005,6 +1392,8 @@ def _try_xor_decrypt_by_magic(data: bytes) -> tuple[Optional[bytes], Optional[st
             decoded = bytes(b ^ key for b in data)
             dec2, mt2 = _try_strip_media_prefix(decoded)
             if mt2 != "application/octet-stream":
+                if mt2.startswith("image/") and (not _is_probably_valid_image(dec2, mt2)):
+                    continue
                 return dec2, mt2
         except Exception:
             continue
@@ -1193,13 +1582,15 @@ def _detect_image_extension(data: bytes) -> str:
     """Detect the image file extension from raw image data."""
     if not data:
         return "dat"
-    if data.startswith(b"\x89PNG\r\n\x1a\n"):
+    head = data[:32] if len(data) > 32 else data
+    mt = _detect_image_media_type(head)
+    if mt == "image/png":
         return "png"
-    if data.startswith(b"\xff\xd8\xff"):
+    if mt == "image/jpeg":
         return "jpg"
-    if data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
+    if mt == "image/gif":
         return "gif"
-    if data.startswith(b"RIFF") and len(data) >= 12 and data[8:12] == b"WEBP":
+    if mt == "image/webp":
         return "webp"
     return "dat"
 
@@ -1262,18 +1653,19 @@ def _read_and_maybe_decrypt_media(
 
     try:
         data_pref = path.read_bytes()
+        # Only accept prefix stripping when it looks like a real image/video,
+        # otherwise encrypted/random bytes may trigger false positives.
        stripped, mtp = _try_strip_media_prefix(data_pref)
        if mtp != "application/octet-stream":
+            if mtp.startswith("image/") and (not _is_probably_valid_image(stripped, mtp)):
+                pass
+            else:
+                return stripped, mtp
-            return stripped, mtp
    except Exception:
        pass
 
     data = path.read_bytes()
 
-    dec, mt2 = _try_xor_decrypt_by_magic(data)
-    if dec is not None and mt2:
-        return dec, mt2
-
     # Try WeChat .dat v1/v2 decrypt.
     version = _detect_wechat_dat_version(data)
     if version in (0, 1, 2):
@@ -1356,8 +1748,23 @@ def _read_and_maybe_decrypt_media(
         except Exception:
             pass
 
+    # Fallback: try guessing XOR key by magic (only after key-based decrypt attempts).
+    # For V4 signature files, XOR guessing is not applicable and may be expensive.
+    if version in (0, -1):
+        dec, mt2 = _try_xor_decrypt_by_magic(data)
+        if dec is not None and mt2:
+            return dec, mt2
+
     # Fallback: return as-is.
     mt3 = _guess_media_type_by_path(path, fallback="application/octet-stream")
+    if mt3.startswith("image/") and (not _is_probably_valid_image(data, mt3)):
+        mt3 = "application/octet-stream"
+    if mt3 == "video/mp4":
+        try:
+            if not (len(data) >= 8 and data[4:8] == b"ftyp"):
+                mt3 = "application/octet-stream"
+        except Exception:
+            mt3 = "application/octet-stream"
     return data, mt3
 
 
@@ -18,6 +18,9 @@ from ..media_helpers import (
     _convert_silk_to_wav,
     _detect_image_extension,
     _detect_image_media_type,
+    _is_probably_valid_image,
+    _iter_media_source_candidates,
+    _order_media_candidates,
     _ensure_decrypted_resource_for_md5,
     _fallback_search_media_by_file_id,
     _fallback_search_media_by_md5,
@@ -31,6 +34,7 @@ from ..media_helpers import (
     _resolve_account_wxid_dir,
     _resolve_media_path_for_kind,
     _resolve_media_path_from_hardlink,
+    _try_fetch_emoticon_from_remote,
     _try_find_decrypted_resource,
     _try_strip_media_prefix,
 )
@@ -255,11 +259,14 @@ async def get_chat_image(
     if decrypted_path:
         data = decrypted_path.read_bytes()
         media_type = _detect_image_media_type(data[:32])
-        if media_type == "application/octet-stream":
-            guessed = mimetypes.guess_type(str(decrypted_path))[0]
-            if guessed:
-                media_type = guessed
-        return Response(content=data, media_type=media_type)
+        if media_type != "application/octet-stream" and _is_probably_valid_image(data, media_type):
+            return Response(content=data, media_type=media_type)
+        # Corrupted cached file (e.g. wrong ext / partial data): remove and regenerate from source.
+        try:
+            if decrypted_path.suffix.lower() in {".jpg", ".jpeg", ".png", ".gif", ".webp"}:
+                decrypted_path.unlink()
+        except Exception:
+            pass
 
     # Fallback: locate and decrypt the image on the fly from the WeChat data directory
     wxid_dir = _resolve_account_wxid_dir(account_dir)
@@ -283,6 +290,7 @@ async def get_chat_image(
     )
 
     p: Optional[Path] = None
+    candidates: list[Path] = []
 
     if md5:
         p = _resolve_media_path_from_hardlink(
@@ -297,12 +305,25 @@ async def get_chat_image(
         hit = _fallback_search_media_by_md5(str(wxid_dir), str(md5), kind="image")
         if hit:
             p = Path(hit)
+        # Also add scan-based candidates to improve the chance of finding a usable variant.
+        if wxid_dir:
+            try:
+                hit2 = _fallback_search_media_by_md5(str(wxid_dir), str(md5), kind="image")
+                if hit2:
+                    candidates.extend(_iter_media_source_candidates(Path(hit2)))
+            except Exception:
+                pass
     elif file_id:
         # Some builds' image messages carry no MD5 and only provide a file identifier such as cdnthumburl
         for r in [wxid_dir, db_storage_dir]:
             if not r:
                 continue
-            hit = _fallback_search_media_by_file_id(str(r), str(file_id), kind="image", username=str(username or ""))
+            hit = _fallback_search_media_by_file_id(
+                str(r),
+                str(file_id),
+                kind="image",
+                username=str(username or ""),
+            )
             if hit:
                 p = Path(hit)
                 break
@@ -310,9 +331,29 @@ async def get_chat_image(
     if not p:
         raise HTTPException(status_code=404, detail="Image not found.")
 
-    logger.info(f"chat_image: md5={md5} file_id={file_id} resolved_source={p}")
+    candidates.extend(_iter_media_source_candidates(p))
+    candidates = _order_media_candidates(candidates)
 
-    data, media_type = _read_and_maybe_decrypt_media(p, account_dir=account_dir, weixin_root=wxid_dir)
+    logger.info(f"chat_image: md5={md5} file_id={file_id} candidates={len(candidates)} first={p}")
+
+    data = b""
+    media_type = "application/octet-stream"
+    chosen: Optional[Path] = None
+    for src_path in candidates:
+        try:
+            data, media_type = _read_and_maybe_decrypt_media(src_path, account_dir=account_dir, weixin_root=wxid_dir)
+        except Exception:
+            continue
+
+        if media_type.startswith("image/") and (not _is_probably_valid_image(data, media_type)):
+            continue
+
+        if media_type != "application/octet-stream":
+            chosen = src_path
+            break
+
+    if not chosen:
+        raise HTTPException(status_code=422, detail="Image found but failed to decode/decrypt.")
 
     # Cache into the resource directory only when md5 is valid; file_id can be very long, so avoid writing over-long filenames
     if md5 and media_type.startswith("image/"):
@@ -326,7 +367,9 @@ async def get_chat_image(
         except Exception:
             pass
 
-    logger.info(f"chat_image: md5={md5} file_id={file_id} media_type={media_type} bytes={len(data)}")
+    logger.info(
+        f"chat_image: md5={md5} file_id={file_id} chosen={chosen} media_type={media_type} bytes={len(data)}"
+    )
     return Response(content=data, media_type=media_type)
 
 
@@ -341,18 +384,31 @@ async def get_chat_emoji(md5: str, account: Optional[str] = None, username: Opti
     if decrypted_path:
         data = decrypted_path.read_bytes()
         media_type = _detect_image_media_type(data[:32])
-        if media_type == "application/octet-stream":
-            guessed = mimetypes.guess_type(str(decrypted_path))[0]
-            if guessed:
-                media_type = guessed
-        return Response(content=data, media_type=media_type)
+        if media_type != "application/octet-stream" and _is_probably_valid_image(data, media_type):
+            return Response(content=data, media_type=media_type)
+        try:
+            if decrypted_path.suffix.lower() in {".jpg", ".jpeg", ".png", ".gif", ".webp"}:
+                decrypted_path.unlink()
+        except Exception:
+            pass
 
     wxid_dir = _resolve_account_wxid_dir(account_dir)
     p = _resolve_media_path_for_kind(account_dir, kind="emoji", md5=str(md5), username=username)
-    if not p:
+
+    data = b""
+    media_type = "application/octet-stream"
+    if p:
+        data, media_type = _read_and_maybe_decrypt_media(p, account_dir=account_dir, weixin_root=wxid_dir)
+
+    if media_type == "application/octet-stream":
+        # Some emojis are stored encrypted (see emoticon.db); try remote fetch as fallback.
+        data2, mt2 = _try_fetch_emoticon_from_remote(account_dir, str(md5).lower())
+        if data2 is not None and mt2:
+            data, media_type = data2, mt2
+
+    if (not p) and media_type == "application/octet-stream":
         raise HTTPException(status_code=404, detail="Emoji not found.")
 
-    data, media_type = _read_and_maybe_decrypt_media(p, account_dir=account_dir, weixin_root=wxid_dir)
     if media_type.startswith("image/"):
         try:
             out_md5 = str(md5).lower()
@@ -11,6 +11,7 @@ from ..media_helpers import (
     _collect_all_dat_files,
     _decrypt_and_save_resource,
     _detect_image_media_type,
+    _is_probably_valid_image,
     _get_resource_dir,
     _load_media_keys,
     _resolve_account_dir,
@@ -315,6 +316,10 @@ async def decrypt_all_media_stream(
         # Check whether this file has already been decrypted
         existing = _try_find_decrypted_resource(account_dir, md5)
         if existing:
+            try:
+                cached = existing.read_bytes()
+                cached_mt = _detect_image_media_type(cached[:32])
+                if cached_mt != "application/octet-stream" and _is_probably_valid_image(cached, cached_mt):
                     skip_count += 1
                     # Send progress only every 100 skipped files to reduce message volume
                     if skip_count % 100 == 0 or current == total_files:
@@ -323,6 +328,14 @@ async def decrypt_all_media_stream(
                         )
                         await asyncio.sleep(0)
                     continue
+                # Cache exists but looks corrupted: remove and regenerate.
+                existing.unlink(missing_ok=True)
+            except Exception:
+                # If we can't read/validate it, try regenerating.
+                try:
+                    existing.unlink(missing_ok=True)
+                except Exception:
+                    pass
 
         # Decrypt and save
         success, msg = _decrypt_and_save_resource(