朴素贝叶斯用于垃圾邮件分类(优化算法)
作者:互联网
# Naive Bayes spam classifier for Chinese e-mail, computed in log space.
#
# Worked example of the per-word statistics the model is built from:
#   900 e-mails in total, 500 of them spam and 400 of them ham (normal).
#   If the word "公司" occurs in 200 spam e-mails, its raw spam frequency is
#   200 / 500 = 0.4; if it occurs in 10 ham e-mails, its raw ham frequency is
#   10 / 400 = 0.025.
import math
import re

try:
    import jieba
except ImportError:
    # jieba is only needed to tokenise raw e-mail files (readEmail). Keeping
    # the import optional lets the probability code be used on pre-tokenised
    # input when jieba is not installed.
    jieba = None
else:
    jieba.setLogLevel(jieba.logging.INFO)
import numpy

# Number of spam / ham e-mails seen by the most recent calWordsFreqTable()
# call; checkOneEmail() derives the class priors from them.
num_spam = 0
num_ham = 0
EmailsIndex = "index"

# Matches runs of ASCII letters/digits/underscore and any single non-word
# character; whatever survives removal is (in practice) the Chinese text.
_NON_CHINESE_RE = re.compile(r"[a-zA-Z0-9_]+|\W")


def filterEmail(email):
    """Return *email* with ASCII alphanumerics and non-word characters removed."""
    return _NON_CHINESE_RE.sub("", email)


def readEmail(filename):
    """Read one e-mail file and return its content as a list of jieba tokens.

    The file is decoded as GB2312 with undecodable bytes ignored, filtered
    with filterEmail(), then segmented with ``jieba.cut``.

    Raises:
        ImportError: if jieba is not installed.
    """
    if jieba is None:
        raise ImportError("jieba is required to tokenise e-mails")
    with open(filename, "r", encoding='GB2312', errors='ignore') as fp:
        content = fp.read()
    content = filterEmail(content)
    return list(jieba.cut(content))


def loadAllEmails(IndexFile):
    """Load every e-mail listed in *IndexFile*.

    Each non-blank line of the index must hold two whitespace-separated
    fields: the label (``spam`` or anything else for ham) and the path of
    the e-mail file.

    Returns:
        A list of ``(label, words)`` tuples, ``words`` being the token list
        produced by readEmail().

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    Emails = []
    with open(IndexFile, "r") as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            spam, filename = fields
            Emails.append((spam, readEmail(filename)))
    return Emails


def calWordsFreqTable(Emails):
    """Build the per-word log-probability table from labelled e-mails.

    Each word is counted at most once per e-mail (presence, not frequency).
    Probabilities use add-one (Laplace) smoothing,
    ``(count + 1) / (class_size + 2)``, so they always lie strictly between
    0 and 1 and an empty class cannot cause a division by zero.

    Side effect: resets and then updates the module-level ``num_spam`` and
    ``num_ham`` counters, so repeated calls do not accumulate.

    Args:
        Emails: iterable of ``(label, words)``; label ``'spam'`` marks spam,
            any other label is treated as ham.

    Returns:
        dict mapping word -> ``[log P(word | spam), log P(word | ham)]``.
    """
    global num_spam
    global num_ham
    num_spam = 0
    num_ham = 0

    counts = {}
    for flag, words in Emails:
        if flag == 'spam':
            num_spam += 1
            column = 0
        else:
            num_ham += 1
            column = 1
        for word in set(words):
            counts.setdefault(word, [0, 0])[column] += 1

    table = {}
    for word, (spam_count, ham_count) in counts.items():
        table[word] = [
            numpy.log((spam_count + 1) / (num_spam + 2)),
            numpy.log((ham_count + 1) / (num_ham + 2)),
        ]
    return table


def saveWordsTable(table, path=r"E:\PyCharm_文档文件\table_last.txt"):
    """Write the word table to *path*, one ``word spam ham`` line per word.

    Lines are ordered by the ``[spam, ham]`` value pair, largest first.
    The file is written as UTF-8 so that any word can be encoded.

    Args:
        table: mapping produced by calWordsFreqTable().
        path: output file; defaults to the location used originally.
    """
    rows = sorted(table.items(), key=lambda item: item[1], reverse=True)
    with open(path, 'w', encoding='utf-8') as fp:
        for word, (log_spam, log_ham) in rows:
            fp.write(f"{word} {log_spam} {log_ham}\n")


def checkOneEmail(label, table):
    """Classify one tokenised e-mail and print the verdict.

    With H1 = spam, H2 = ham and E = the words of the e-mail, Bayes' rule
    under the naive independence assumption gives

        P(H1 | E) = P(H1) * prod P(w | H1) / P(E)
        P(H2 | E) = P(H2) * prod P(w | H2) / P(E)
        P(E)      = P(H1) * prod P(w | H1) + P(H2) * prod P(w | H2)

    Everything is evaluated in log space (the table already stores logs) to
    avoid underflow. Words absent from the table are skipped, and repeated
    words count once, matching the presence-based training.

    Args:
        label: list of words of the e-mail to classify.
        table: mapping produced by calWordsFreqTable().

    Returns:
        ``(verdict, log_posterior)`` where verdict is ``'spam'`` or ``'ham'``
        and log_posterior is the log-probability of that verdict. The same
        two values are printed, one per line.

    Raises:
        ValueError: if calWordsFreqTable() has not seen any e-mail yet.
    """
    total = num_spam + num_ham
    if total == 0:
        raise ValueError("no training data: call calWordsFreqTable() first")

    # A class with no training e-mails has prior 0, i.e. log prior -inf;
    # that is a valid value here, so silence numpy's divide warning.
    with numpy.errstate(divide='ignore'):
        log_spam = numpy.log(num_spam / total)
        log_ham = numpy.log(num_ham / total)

    # dict.fromkeys() de-duplicates while keeping a deterministic order.
    for word in dict.fromkeys(label):
        entry = table.get(word)
        if entry is None:
            continue
        log_spam = log_spam + entry[0]
        log_ham = log_ham + entry[1]

    log_evidence = numpy.logaddexp(log_spam, log_ham)
    p1 = log_spam - log_evidence
    p2 = log_ham - log_evidence

    if p1 > p2:
        verdict, score = 'spam', p1
    else:
        verdict, score = 'ham', p2
    print(verdict)
    print(score)
    return verdict, score


if __name__ == "__main__":
    # Load every e-mail together with its label.
    Emails = loadAllEmails(EmailsIndex)
    table = calWordsFreqTable(Emails)
    # Classify a sample e-mail.
    label = ['你好', '的', '爸爸']
    checkOneEmail(label, table)
标签:__,spam,拓扑,贝叶斯,垃圾邮件,print,filename,Emails,邮件 来源: https://blog.csdn.net/vimpirespider/article/details/120145245