mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-02 05:50:50 +08:00
improvement(chat): packed_info md5 提取优先匹配 .dat 文件名
- packed_info 同时包含多个 32hex 时,优先从形如 xxx_t.dat/.t.dat 的文件名中提取 md5;无匹配再回退到原 32hex 众数策略
- 对齐 echotrace 的选择思路,降低取错 md5 概率
This commit is contained in:
@@ -426,6 +426,7 @@ def _decode_message_content(compress_value: Any, message_value: Any) -> str:
|
||||
|
||||
|
||||
# Bytes pattern matching any run of exactly 32 hexadecimal digits, case-insensitively.
# It carries no filename context, so every md5-shaped token in the input matches.
_MD5_HEX_RE = re.compile(rb"(?i)[0-9a-f]{32}")
|
||||
# Bytes pattern for a 32-hex-digit token that is directly followed by a `.dat`
# extension, optionally with a one-letter variant suffix in between
# (`_t`, `.t`, `_h`, `.h`, `_b`, `.b`, `_c`, `.c`). Group 1 captures only the
# 32 hex digits, so `findall` returns the bare hex token without suffix or
# extension. The leading `(?i)` makes the hex digits, the suffix letter and the
# extension all case-insensitive.
_DAT_MD5_RE = re.compile(rb"(?i)([0-9a-f]{32})(?:[._][thbc])?\.dat")
|
||||
|
||||
|
||||
def _extract_md5_from_blob(blob: Any) -> str:
|
||||
@@ -443,6 +444,21 @@ def _extract_md5_from_blob(blob: Any) -> str:
|
||||
|
||||
if not data:
|
||||
return ""
|
||||
|
||||
# Prefer md5 that appears as an actual `.dat` filename (incl. _t.dat/.t.dat variants).
|
||||
# This matches echotrace's idea: packed_info often contains multiple 32-hex tokens, but only
|
||||
# the one referenced by a file path is the correct on-disk basename.
|
||||
try:
|
||||
m2 = _DAT_MD5_RE.findall(data)
|
||||
except Exception:
|
||||
m2 = []
|
||||
if m2:
|
||||
best2 = Counter([x.lower() for x in m2]).most_common(1)[0][0]
|
||||
try:
|
||||
return best2.decode("ascii", errors="ignore")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
m = _MD5_HEX_RE.findall(data)
|
||||
if not m:
|
||||
return ""
|
||||
|
||||
Reference in New Issue
Block a user