知识图谱:【数据清洗工具flashtext(五)】——flashtext使用示例
作者:互联网
文章目录
关键字提取
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
>>> keywords_found
>>> # ['New York', 'Bay Area']
## 区分大小写
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor(case_sensitive=True)
keyword_processor.add_keyword('Big Apple', 'New York')
keyword_processor.add_keyword('Bay Area')
keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
>>> keywords_found
>>> # ['Bay Area']
同时添加多个关键词
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_dict = {
"java": ["java_2e", "java programing"],
"product management": ["PM", "product manager"]
}
# {'clean_name': ['list of unclean names']}
keyword_processor.add_keywords_from_dict(keyword_dict)
# Or add keywords from a list:
keyword_processor.add_keywords_from_list(["java", "python"])
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management', 'java']
删除关键字
from flashtext import KeywordProcessor
keyword_processor = KeywordProcessor()
keyword_dict = {
"java": ["java_2e", "java programing"],
"product management": ["PM", "product manager"]
}
keyword_processor.add_keywords_from_dict(keyword_dict)
print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
# output ['product management', 'java']
keyword_processor.remove_keyword('java_2e')
# you can also remove keywords from a list/ dictionary
keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
keyword_processor.remove_keywords_from_list(["java programing"])
keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
# output ['product management']
函数封装示例
from flashtext import KeywordProcessor
def build_actree(wordlist):
'''
AC自动机进行关键词匹配
构造AC trie
'''
actree = KeywordProcessor()
for index, word in enumerate(wordlist):
actree.add_keyword(word) # 向trie树中添加单词
#self.actree = actree
return actree
def ac_detect(actree,text,span_info = True):
'''
AC自动机进行关键词匹配
文本匹配
'''
region_wds = []
for w1 in actree.extract_keywords(text,span_info = span_info):
if len(w1) > 0:
region_wds.append(w1[0])
return region_wds
wordlist = ['健康','减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'
actree = build_actree(wordlist)
ac_detect(actree,text)
>>> CPU times: user 41 µs, sys: 0 ns, total: 41 µs
>>> Wall time: 47.2 µs
>>> ['减肥', '健康', '减肥', '健康']
pyahocorasick版
import ahocorasick
def build_actree(wordlist):
'''
AC自动机进行关键词匹配
构造AC trie
'''
actree = ahocorasick.Automaton() # 初始化trie树
for index, word in enumerate(wordlist):
actree.add_word(word, (index, word)) # 向trie树中添加单词
actree.make_automaton() # 将trie树转化为Aho-Corasick自动机
#self.actree = actree
return actree
def ac_detect(actree,text):
'''
AC自动机进行关键词匹配
文本匹配
'''
region_wds = []
for w1 in actree.iter(text):
if len(w1) > 0:
region_wds.append(w1[1][1])
return region_wds
wordlist = ['健康','减肥']
text = '今天你减肥了吗,今天你健康了吗,减肥 = 健康!'
actree = build_actree(wordlist)
ac_detect(actree,text)
>>> CPU times: user 10 µs, sys: 3 µs, total: 13 µs
>>> Wall time: 17.4 µs
>>> ['减肥', '健康', '减肥', '健康']
标签:product,java,keyword,示例,图谱,actree,flashtext,keywords,processor 来源: https://blog.csdn.net/u013010473/article/details/122038230