mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-02-02 22:10:50 +08:00
improvement(chat): packed_info md5 提取优先匹配 .dat 文件名
- packed_info 同时包含多个 32hex 时,优先从形如 xxx_t.dat/.t.dat 的文件名中提取 md5;无匹配再回退到原 32hex 众数策略
- 对齐 echotrace 的选择思路,降低取错 md5 概率
This commit is contained in:
@@ -426,6 +426,7 @@ def _decode_message_content(compress_value: Any, message_value: Any) -> str:
|
|||||||
|
|
||||||
|
|
||||||
_MD5_HEX_RE = re.compile(rb"(?i)[0-9a-f]{32}")
|
_MD5_HEX_RE = re.compile(rb"(?i)[0-9a-f]{32}")
|
||||||
|
_DAT_MD5_RE = re.compile(rb"(?i)([0-9a-f]{32})(?:[._][thbc])?\.dat")
|
||||||
|
|
||||||
|
|
||||||
def _extract_md5_from_blob(blob: Any) -> str:
|
def _extract_md5_from_blob(blob: Any) -> str:
|
||||||
@@ -443,6 +444,21 @@ def _extract_md5_from_blob(blob: Any) -> str:
|
|||||||
|
|
||||||
if not data:
|
if not data:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
# Prefer md5 that appears as an actual `.dat` filename (incl. _t.dat/.t.dat variants).
|
||||||
|
# This matches echotrace's idea: packed_info often contains multiple 32-hex tokens, but only
|
||||||
|
# the one referenced by a file path is the correct on-disk basename.
|
||||||
|
try:
|
||||||
|
m2 = _DAT_MD5_RE.findall(data)
|
||||||
|
except Exception:
|
||||||
|
m2 = []
|
||||||
|
if m2:
|
||||||
|
best2 = Counter([x.lower() for x in m2]).most_common(1)[0][0]
|
||||||
|
try:
|
||||||
|
return best2.decode("ascii", errors="ignore")
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
m = _MD5_HEX_RE.findall(data)
|
m = _MD5_HEX_RE.findall(data)
|
||||||
if not m:
|
if not m:
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
Reference in New Issue
Block a user