TFIDF
作者:互联网
TF-IDF是什么:
文本向量化工具
TF-IDF基本概念:
TF:某个词在文章中出现的次数除以该文章的总词数
IDF:ln(文档总数/(包含该词的文档数+1)),分母加1用于平滑,避免除以零(下文示例代码未加1,直接使用 ln(文档总数/包含该词的文档数))
TF-IDF = TF * IDF
def forward_segment(text, dic):
    """Segment *text* by forward longest matching against *dic*.

    Scanning left to right, the longest substring starting at the current
    position that is found in ``dic`` is emitted as a word. When no
    substring matches, the single character at that position is emitted.

    Args:
        text: The string to segment.
        dic: Container of known words; only ``in`` membership tests are used.

    Returns:
        The words in order of appearance; joining them reproduces ``text``.
    """
    word_list = []
    i = 0
    while i < len(text):
        # Fall back to the single character when nothing in dic matches.
        longest_word = text[i]
        # Candidates grow with j, so the last hit is always the longest one.
        for j in range(i + 1, len(text) + 1):
            word = text[i:j]
            if word in dic:
                longest_word = word
        word_list.append(longest_word)
        i += len(longest_word)
    return word_list
# Caption: forward longest matching (正向最长匹配)
def backward_segment(text, dic):
    """Segment *text* by backward longest matching against *dic*.

    Scanning right to left, the longest substring ending at the current
    position that is found in ``dic`` is emitted as a word. When no
    substring matches, the single character at that position is emitted.

    Args:
        text: The string to segment.
        dic: Container of known words; only ``in`` membership tests are used.

    Returns:
        The words in reading order; joining them reproduces ``text``.
    """
    word_list = []
    i = len(text) - 1
    while i >= 0:
        # Fall back to the single character when nothing in dic matches.
        longest_word = text[i]
        # j starts at 0, so candidates shrink: the first hit is the longest.
        for j in range(i):
            word = text[j:i + 1]
            if word in dic:
                longest_word = word
                break
        # Words are found last-to-first; append here and reverse once at the
        # end instead of paying for an O(n) insert at the front every time.
        word_list.append(longest_word)
        i -= len(longest_word)
    word_list.reverse()
    return word_list
# Caption: backward longest matching (逆向最长匹配)
def bidirectional_segment(text, dic):
    """Segment *text* by choosing between forward and backward matching.

    Both ``forward_segment`` and ``backward_segment`` are run. The result
    is chosen as follows:

    1. the segmentation with fewer words wins;
    2. on equal word counts, the forward result wins if ``count_single``
       returns a smaller value for it;
    3. otherwise the backward result is returned (the default).

    Args:
        text: The string to segment.
        dic: Word container passed through to both segmenters.

    Returns:
        The selected list of words.
    """
    forward_result = forward_segment(text, dic)
    backward_result = backward_segment(text, dic)

    forward_len = len(forward_result)
    backward_len = len(backward_result)

    if forward_len < backward_len:
        return forward_result

    # The single-word tie-break must only apply when both results have the
    # same number of words; otherwise a longer forward segmentation could
    # override a shorter backward one.
    # NOTE(review): count_single is not defined in the visible source;
    # presumably it counts single-character words - confirm.
    if (forward_len == backward_len
            and count_single(forward_result) < count_single(backward_result)):
        return forward_result

    # Default: backward segmentation.
    return backward_result
# Caption: bidirectional longest matching (双向最长匹配)
文本向量化步骤
1.遍历每个文档,使用jieba等分词工具进行分词,形成文档库,通常将数词(m)、非语言词(x)、中文停用词等排除在语料库之外。
2.剔除重复字段形成字典库
3.计算TF 、IDF
4.输出结果TF-IDF(即TF*IDF)
from collections import Counter

import jieba
import jieba.posseg as pseg
import numpy as np


def process(documents):
    """Tokenize every document and collect the vocabulary.

    Each document is cut with ``jieba.posseg``; tokens whose part-of-speech
    flag is ``'x'`` or ``'m'`` are dropped.

    Args:
        documents: Iterable of raw text strings.

    Returns:
        A ``(file_list, vocab)`` tuple: ``file_list`` holds one token list
        per document (input order), ``vocab`` is the set of all kept tokens.
    """
    excluded_flags = ('x', 'm')
    file_list = []
    vocab = set()
    for content in documents:
        document = [
            token.word
            for token in pseg.cut(content)
            if token.flag not in excluded_flags
        ]
        file_list.append(document)
        vocab.update(document)
    return file_list, vocab


class TFIDFVector:
    """Turn a list of raw documents into a TF-IDF matrix.

    Attributes set by ``__init__``: ``documents`` (the raw input),
    ``file_list`` (token list per document) and ``vocab`` (set of tokens).
    ``vector()`` additionally sets ``vocab_list``, the column order of the
    returned matrix.
    """

    def __init__(self, documents):
        self.documents = documents
        self.file_list = []
        self.vocab = set()
        self.prosses()

    def prosses(self):
        """Preprocess the documents into ``file_list`` and ``vocab``.

        The misspelled name is kept so existing callers keep working.
        """
        self.file_list, self.vocab = process(self.documents)

    def vector(self):
        """Compute the TF-IDF matrix for the preprocessed documents.

        TF is a token's count divided by the number of tokens in its
        document. IDF is ``ln(N / df)``, with ``N`` the number of documents
        and ``df`` the number of documents containing the token.

        Returns:
            A float ``numpy`` array with one row per document and one
            column per entry of ``self.vocab_list``. A document with no
            tokens yields an all-zero row.
        """
        # Sort so the column order is reproducible; iterating a set of
        # strings directly can give a different order on every run.
        self.vocab_list = sorted(self.vocab)
        column_of = {word: col for col, word in enumerate(self.vocab_list)}

        doc_total = len(self.file_list)
        vocab_size = len(self.vocab_list)
        tf_arrays = np.zeros((doc_total, vocab_size))
        doc_freq = np.zeros(vocab_size)

        for row, sent in enumerate(self.file_list):
            if not sent:
                # Nothing survived filtering: keep the zero row rather than
                # dividing by a zero document length.
                continue
            for word, count in Counter(sent).items():
                col = column_of.get(word)
                if col is None:
                    # Token outside the vocabulary: it has no column.
                    continue
                tf_arrays[row, col] = count / len(sent)
                doc_freq[col] += 1

        # Leave IDF at 0 for vocabulary entries found in no document, so a
        # zero document frequency is never used as a divisor.
        idf_arrays = np.zeros(vocab_size)
        seen = doc_freq > 0
        idf_arrays[seen] = np.log(doc_total / doc_freq[seen])

        # TF-IDF = TF * IDF, with IDF broadcast across the document rows.
        return tf_arrays * idf_arrays


if __name__ == '__main__':
    # NOTE: the original sample also had a commented-out variant that loaded
    # the documents from a local directory through a load_data helper that
    # is not shown here.
    documents = [
        '行政机关强行解除行政协议造成损失,如何索取赔偿?',
        '借钱给朋友到期不还得什么时候可以起诉?怎么起诉?',
        '我在微信上被骗了,请问被骗多少钱才可以立案?',
        '公民对于选举委对选民的资格申诉的处理决定不服,能不能去法院起诉吗?',
        '有人走私两万元,怎么处置他?',
        '法律上餐具、饮具集中消毒服务单位的责任是不是对消毒餐具、饮具进行检验?',
        '走私是如何量刑的?',
        '如果微信上被骗怎么办?',
    ]
    vector = TFIDFVector(documents)
    print(vector.vector())
# Caption: Python sample code for text vectorization (文本向量化python代码示例)
标签:word,text,self,list,TFIDF,result,IDF 来源: https://www.cnblogs.com/joancaster/p/16171548.html