improvement(chat): packed_info md5 提取优先匹配 .dat 文件名

- packed_info 同时包含多个 32hex 时,优先从 .dat 文件名(如 xxx.dat、xxx_t.dat、xxx.t.dat,后缀字母可为 t/h/b/c)中提取 md5,多个匹配时取出现次数最多者;无匹配再回退到原 32hex 众数策略

- 对齐 echotrace 的选择思路,降低取错 md5 概率
This commit is contained in:
2977094657
2025-12-31 11:35:51 +08:00
parent 0349d89def
commit 67358deeef

View File

@@ -426,6 +426,7 @@ def _decode_message_content(compress_value: Any, message_value: Any) -> str:
# Any run of 32 hexadecimal characters (case-insensitive) inside a bytes blob.
# The pattern is not anchored, so it can also match inside a longer hex run.
_MD5_HEX_RE = re.compile(rb"(?i)[0-9a-f]{32}")
# 32 hexadecimal characters used as the basename of a `.dat` file; group 1
# captures only the hex part. An optional `_x` / `.x` suffix (x being one of
# t/h/b/c) may sit between the hex and `.dat`, so `<md5>.dat`, `<md5>_t.dat`
# and `<md5>.t.dat` all match. Case-insensitive; operates on bytes.
_DAT_MD5_RE = re.compile(rb"(?i)([0-9a-f]{32})(?:[._][thbc])?\.dat")
def _extract_md5_from_blob(blob: Any) -> str:
@@ -443,6 +444,21 @@ def _extract_md5_from_blob(blob: Any) -> str:
if not data:
return ""
# Prefer md5 that appears as an actual `.dat` filename (incl. _t.dat/.t.dat variants).
# This matches echotrace's idea: packed_info often contains multiple 32-hex tokens, but only
# the one referenced by a file path is the correct on-disk basename.
try:
m2 = _DAT_MD5_RE.findall(data)
except Exception:
m2 = []
if m2:
best2 = Counter([x.lower() for x in m2]).most_common(1)[0][0]
try:
return best2.decode("ascii", errors="ignore")
except Exception:
return ""
m = _MD5_HEX_RE.findall(data)
if not m:
return ""