From f982ba466e26f61ebdbf4cea824d8a2536af0686 Mon Sep 17 00:00:00 2001 From: Drxan Date: Thu, 20 Dec 2018 20:35:44 +0800 Subject: [PATCH 1/2] Store the _keyword entry as a set(), because some keywords have multiple different clean_names For example: keyword_processor = KeywordProcessor() keyword_dict = {"news_channel": ["CNN","CCTV","BBC"],"neural_network": ["CNN", "RNN"]} keyword_processor.add_keywords_from_dict(keyword_dict) keyword_processor.extract_keywords('I like CNN') We hope to get the following result: ("news_channel", "neural_network") --- flashtext/keyword.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flashtext/keyword.py b/flashtext/keyword.py index f358c77..6f64441 100644 --- a/flashtext/keyword.py +++ b/flashtext/keyword.py @@ -149,7 +149,8 @@ def __setitem__(self, keyword, clean_name=None): if self._keyword not in current_dict: status = True self._terms_in_trie += 1 - current_dict[self._keyword] = clean_name + current_dict[self._keyword] = set() + current_dict[self._keyword].add(clean_name) return status def __delitem__(self, keyword): From 5b4d8cd6632e405945ad4e9d71dc8f2dcf8b4b81 Mon Sep 17 00:00:00 2001 From: Drxan Date: Thu, 20 Dec 2018 21:12:48 +0800 Subject: [PATCH 2/2] Some keywords have multiple different clean_names For example: keyword_processor = KeywordProcessor() keyword_dict = {"news_channel": ["CNN","CCTV","BBC"],"neural_network": ["CNN", "RNN"]} keyword_processor.add_keywords_from_dict(keyword_dict) keyword_processor.extract_keywords('I like CNN') We hope to get the following result: "news_channel_|_neural_network" We can then use str.split() to recover the individual clean names as follows: "news_channel_|_neural_network".split('_|_') ==> ["news_channel", "neural_network"] --- flashtext/keyword.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/flashtext/keyword.py b/flashtext/keyword.py index 6f64441..3fcb4e4 100644 --- a/flashtext/keyword.py +++ b/flashtext/keyword.py @@ -149,8 +149,12 @@ def __setitem__(self, keyword, 
clean_name=None): if self._keyword not in current_dict: status = True self._terms_in_trie += 1 - current_dict[self._keyword] = set() - current_dict[self._keyword].add(clean_name) + current_dict[self._keyword] = clean_name + else: + status = True + clean_names = set(current_dict[self._keyword].split('_|_')) + clean_names.add(clean_name) + current_dict[self._keyword] = '_|_'.join(clean_names) return status def __delitem__(self, keyword):