mirror of
https://github.com/LifeArchiveProject/WeChatDataAnalysis.git
synced 2026-06-18 15:54:08 +08:00
feat(wrapped): 年度关键词卡升级为年度常用语词云
- 新增常用语短句过滤与统计,按重复短句(非分词)生成词云数据。
- 引入常用语扫描元数据(scannedCandidates/matchedCandidates/uniquePhrases 等)。
- 示例语句抽样改为唯一优先,不足补齐重复,提升短句命中率。
- wrapped cache version 升级到 24,并补充常用语相关单元测试。
This commit is contained in:
@@ -8,6 +8,63 @@ sys.path.insert(0, str(ROOT / "src"))
|
||||
|
||||
|
||||
class TestWrappedKeywordsWordCloud(unittest.TestCase):
|
||||
def test_weflow_common_phrase_filter(self):
    """Check the common-phrase filter against accepted and rejected inputs.

    Each case pairs a raw message with the exact string the filter is
    expected to return; rejected messages are expected to come back as
    the empty string.
    """
    from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import (
        _weflow_common_phrase_or_empty as phrase_or_empty,
    )

    # (raw input, expected output), evaluated in this order.
    cases = [
        (" 在吗 ", "在吗"),  # surrounding whitespace is dropped
        ("ok", "ok"),
        ("a", ""),  # too short
        ("x" * 21, ""),  # too long
        ("看看 http://x.com", ""),  # contains http
        ("<msg>xml</msg>", ""),  # contains "<"
        ("[捂脸]", ""),  # bracketed payload
        ("<?xml version='1.0'?>", ""),  # xml payload
    ]
    for raw, expected in cases:
        self.assertEqual(phrase_or_empty(raw), expected)
|
||||
|
||||
def test_build_common_phrases_payload_structure(self):
    """Verify the shape of the payload built from repeated-phrase counts.

    Checks that the expected top-level keys exist, that the most frequent
    phrase is reported as the top keyword, that every listed keyword has a
    count of at least 2 and a non-empty string word, and that every
    example entry carries between one and three messages, at least one of
    which contains the phrase.
    """
    from collections import Counter

    from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import (
        build_common_phrases_payload,
    )

    counts_by_phrase = {"好的": 5, "在吗": 2, "movie": 2, "单次": 1}
    sample_messages = [
        "好的收到",
        "好的好的,明白了",
        "你好的呀",
        "在吗宝贝",
        "movie night is fun",
        "MOVIE time now",
    ]
    payload = build_common_phrases_payload(
        phrase_counts=Counter(counts_by_phrase),
        seed=123456,
        top_n=32,
        bubble_limit=50,
        example_texts=sample_messages,
        examples_per_word=3,
    )

    # Top-level sections that must be present.
    for section in ("keywords", "bubbleMessages", "examples", "topKeyword"):
        self.assertIn(section, payload)

    top = payload["topKeyword"]
    self.assertEqual(top["word"], "好的")
    self.assertEqual(int(top["count"]), 5)

    # Every keyword must have been seen at least twice ...
    for entry in payload["keywords"]:
        self.assertTrue(int(entry.get("count") or 0) >= 2)
    # ... and must carry a non-empty string as its word.
    for entry in payload["keywords"]:
        label = entry.get("word")
        self.assertTrue(isinstance(label, str) and label)

    # Examples should contain real message samples with an upper bound.
    for example in payload["examples"]:
        samples = example.get("messages") or []
        self.assertGreaterEqual(len(samples), 1)
        self.assertLessEqual(len(samples), 3)

        phrase = str(example.get("word") or "")
        has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in phrase)
        if has_cjk:
            # CJK phrases are matched verbatim.
            found = any(phrase in str(s) for s in samples)
        else:
            # Other phrases are matched case-insensitively.
            found = any(phrase.lower() in str(s).lower() for s in samples)
        self.assertTrue(found)
|
||||
|
||||
def test_extract_keywords_jieba_basic(self):
|
||||
from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import extract_keywords_jieba
|
||||
|
||||
@@ -96,6 +153,24 @@ class TestWrappedKeywordsWordCloud(unittest.TestCase):
|
||||
m_movie = next(x for x in out if x["word"] == "movie")
|
||||
self.assertTrue(all("movie" in m.lower() for m in m_movie["messages"]))
|
||||
|
||||
def test_pick_examples_short_phrase_can_fill_three(self):
    """Check that a short phrase still gets three example messages.

    The pool holds the bare phrase three times, one variant with a
    trailing question mark and one unrelated message; the single keyword
    is expected to receive exactly three examples, each containing the
    phrase.
    """
    from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import (
        pick_examples,
    )

    phrase = "在吗"
    keywords = [{"word": phrase, "count": 9, "weight": 1.0}]
    # Mostly duplicates of the phrase itself, plus one variant and one miss.
    pool = [phrase] * 3 + ["在吗?", "ok"]

    picked = pick_examples(keywords, pool, per_word=3)
    self.assertEqual(len(picked), 1)

    samples = picked[0]["messages"]
    self.assertEqual(len(samples), 3)
    for sample in samples:
        self.assertIn("在吗", sample)
|
||||
|
||||
def test_build_keywords_payload_structure(self):
|
||||
from wechat_decrypt_tool.wrapped.cards.card_05_keywords_wordcloud import build_keywords_payload
|
||||
|
||||
|
||||
Reference in New Issue
Block a user