feat(wrapped): 年度关键词卡升级为年度常用语词云

- 新增常用语短句过滤与统计,按重复短句(非分词)生成词云数据。

- 引入常用语扫描元数据(scannedCandidates/matchedCandidates/uniquePhrases 等)。

- 示例语句抽样改为唯一优先,不足补齐重复,提升短句命中率。

- wrapped cache version 升级到 24,并补充常用语相关单元测试。
This commit is contained in:
2977094657
2026-02-22 22:18:43 +08:00
Unverified
parent e537c524f3
commit 236d0ae703
3 changed files with 384 additions and 32 deletions
+75
View File
@@ -8,6 +8,63 @@ sys.path.insert(0, str(ROOT / "src"))
class TestWrappedKeywordsWordCloud(unittest.TestCase):
def test_weflow_common_phrase_filter(self):
    """Check which raw messages the common-phrase filter keeps or rejects.

    Each case pairs a raw message with the value expected back from
    ``_weflow_common_phrase_or_empty``; an empty string means "rejected".
    """
    from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import (
        _weflow_common_phrase_or_empty as normalize,
    )

    # (raw input, expected result) -- evaluated in order, first mismatch fails.
    cases = (
        (" 在吗 ", "在吗"),  # surrounding whitespace is trimmed
        ("ok", "ok"),
        ("a", ""),  # too short
        ("x" * 21, ""),  # too long
        ("看看 http://x.com", ""),  # contains http
        ("<msg>xml</msg>", ""),  # contains "<"
        ("[捂脸]", ""),  # bracketed payload
        ("<?xml version='1.0'?>", ""),  # xml payload
    )
    for raw, expected in cases:
        self.assertEqual(normalize(raw), expected)
def test_build_common_phrases_payload_structure(self):
    """Validate the shape of the payload built from common-phrase counts.

    Verifies that the expected top-level keys exist, that the top keyword
    is the most frequent phrase, that every listed keyword has a count of
    at least 2 and a non-empty string word, and that each example entry
    carries between 1 and 3 messages, at least one of which contains the
    word.
    """
    from collections import Counter

    from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import (
        build_common_phrases_payload,
    )

    payload = build_common_phrases_payload(
        phrase_counts=Counter({"好的": 5, "在吗": 2, "movie": 2, "单次": 1}),
        seed=123456,
        top_n=32,
        bubble_limit=50,
        example_texts=[
            "好的收到",
            "好的好的,明白了",
            "你好的呀",
            "在吗宝贝",
            "movie night is fun",
            "MOVIE time now",
        ],
        examples_per_word=3,
    )

    for required_key in ("keywords", "bubbleMessages", "examples", "topKeyword"):
        self.assertIn(required_key, payload)

    top = payload["topKeyword"]
    self.assertEqual(top["word"], "好的")
    self.assertEqual(int(top["count"]), 5)

    entries = payload["keywords"]
    counts_ok = all(int(entry.get("count") or 0) >= 2 for entry in entries)
    self.assertTrue(counts_ok)
    words_ok = all(
        isinstance(entry.get("word"), str) and entry.get("word")
        for entry in entries
    )
    self.assertTrue(words_ok)

    # Every example must hold 1..3 real samples, one of which has the word.
    for example in payload["examples"]:
        samples = example.get("messages") or []
        self.assertGreaterEqual(len(samples), 1)
        self.assertLessEqual(len(samples), 3)

        phrase = str(example.get("word") or "")
        has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in phrase)
        if has_cjk:
            # Words with CJK characters are matched verbatim.
            found = any(phrase in str(sample) for sample in samples)
        else:
            # Other words are matched case-insensitively.
            found = any(phrase.lower() in str(sample).lower() for sample in samples)
        self.assertTrue(found)
def test_extract_keywords_jieba_basic(self):
from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import extract_keywords_jieba
@@ -96,6 +153,24 @@ class TestWrappedKeywordsWordCloud(unittest.TestCase):
m_movie = next(x for x in out if x["word"] == "movie")
self.assertTrue(all("movie" in m.lower() for m in m_movie["messages"]))
def test_pick_examples_short_phrase_can_fill_three(self):
    """Expect three example messages for a short phrase with a sparse pool.

    Only two distinct pool entries contain the phrase (the bare phrase and
    a variant with a trailing question mark), yet three messages are
    expected back, so the result has to include a repeated sample.
    """
    from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import (
        pick_examples,
    )

    keywords = [{"word": "在吗", "count": 9, "weight": 1.0}]
    # Three identical hits, one near-duplicate, and one unrelated message.
    pool = ["在吗"] * 3 + ["在吗?", "ok"]

    result = pick_examples(keywords, pool, per_word=3)

    self.assertEqual(len(result), 1)
    picked = result[0]["messages"]
    self.assertEqual(len(picked), 3)
    self.assertTrue(all("在吗" in message for message in picked))
def test_build_keywords_payload_structure(self):
from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import build_keywords_payload