编程语言
首页 > 编程语言 > 朴素贝叶斯用于垃圾邮件分类(优化算法)

朴素贝叶斯用于垃圾邮件分类(优化算法)

作者:互联网

#一共900封邮件
#垃圾邮件500封,正常邮件有400封
#找到词在垃圾和正常中出现的概率值
#比如:公司这个词,在垃圾邮件中有200封出现了公司这个词,相应的概率就是200/500=0.4
#                在正常邮件中有10封出现了公司这个词,所以概率为10/400=0.025
import math
import re
import jieba
import numpy
# Per-class email counters: calWordsFreqTable updates them through ``global``
# and checkOneEmail reads them to compute the class priors.
num_spam = num_ham = 0
# Name of the index file handed to loadAllEmails by the script entry point.
EmailsIndex = "index"
# Set jieba's logger to INFO level (presumably to hide its debug-level
# start-up output -- confirm against jieba's documentation).
jieba.setLogLevel(jieba.logging.INFO)

#正则表达式过滤所有非中文词语
def filterEmail(email):
    """Delete ASCII word characters and all non-word characters from ``email``.

    Runs of ASCII letters, digits and underscores are removed, as is every
    character the regex engine classifies as a non-word character
    (whitespace, punctuation, symbols). For Chinese mail this leaves only the
    Chinese characters; other non-ASCII word characters are kept as well.

    Args:
        email: Raw text of one email.

    Returns:
        The text with every matched character removed.
    """
    # Raw string: "\W" inside a normal literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
    return re.sub(r"[a-zA-Z0-9_]+|\W", "", email)

#读取邮件内容,读取文件内容,过滤出了中文词语,生成了一个中文词语的列表
def readEmail(filename):
    """Read one email file and return its segmented words.

    The file is decoded as GB2312 with undecodable bytes ignored, cleaned by
    ``filterEmail`` and then segmented with ``jieba.cut``.

    Args:
        filename: Path of the email file to read.

    Returns:
        A list of the tokens produced by ``jieba.cut`` for the cleaned text.
    """
    with open(filename, "r", encoding='GB2312', errors='ignore') as handle:
        raw_text = handle.read()
    # Segmentation works on the filtered text only, so the file can be closed
    # before the (comparatively slow) tokenisation starts.
    cleaned_text = filterEmail(raw_text)
    return list(jieba.cut(cleaned_text))

#加载所有邮件,生成一个邮件列表,列表中的每个元素是一封邮件的信息:1.标志(spam表示垃圾邮件,否则为正常邮件),2.这封邮件对应的所有词的列表
# def loadAllEmails(IndexFile):
#     Emails=[]
#     k = 10
#     with  open(IndexFile, "r") as fp:
#         lines=fp.readlines()
#         for line in lines:
#             spam,filename = line.split()
#             Emails.append((spam,readEmail(filename)))
#             if k<0:
#                 break
#             k = k-1
#     return Emails
def loadAllEmails(IndexFile):
    """Load every email listed in an index file.

    Each non-blank line of the index must hold a label followed by the path
    of the email file, separated by whitespace. The path is everything after
    the first run of whitespace, so paths that contain spaces are accepted.
    Blank lines are skipped.

    Args:
        IndexFile: Path of the index file.

    Returns:
        A list of ``(label, words)`` tuples, where ``words`` is the list
        returned by ``readEmail`` for that line's path.

    Raises:
        ValueError: If a non-blank line does not contain both a label and a
            path.
    """
    Emails = []
    with open(IndexFile, "r") as fp:
        # Iterate the file lazily instead of materialising every line first.
        for line_no, line in enumerate(fp, start=1):
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank or whitespace-only line: nothing to load.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{IndexFile}:{line_no}: expected '<label> <path>', "
                    f"got {line!r}"
                )
            spam, filename = fields
            # maxsplit keeps the trailing newline on the path; strip it.
            Emails.append((spam, readEmail(filename.strip())))
    return Emails
#计算词语的概率表
# 用什么数据结构类型存储这个概率表
# python常用的数据结构类型:元组,列表,集合和字典,双端队列deque
# 公司  正常邮件概率0.025  垃圾邮件中的概率  0.4
# 哈哈
#spam 垃圾邮件
#ham 正常邮件
def calWordsFreqTable(Emails):
    """Build the per-word log-probability table from labelled emails.

    For every word, counts the number of spam emails and the number of
    non-spam (ham) emails that contain it at least once, then converts the
    counts to add-one smoothed log-probabilities::

        log((emails_with_word + 1) / (emails_in_class + 2))

    The smoothing keeps every probability strictly between 0 and 1, so the
    logarithm is always finite, even when a class has no emails at all.

    Side effect: the module-level counters ``num_spam`` and ``num_ham`` are
    overwritten with the class totals of this call (they are reset rather
    than accumulated, so repeated calls stay consistent with the table).

    Args:
        Emails: Iterable of ``(label, words)`` pairs. A label equal to
            ``'spam'`` marks a spam email; any other label counts as ham.

    Returns:
        A dict mapping each word to ``[log_p_spam, log_p_ham]``.
    """
    global num_spam
    global num_ham
    spam_total = 0
    ham_total = 0
    counts = {}
    for flag, words in Emails:
        if flag == 'spam':
            spam_total += 1
            column = 0
        else:
            ham_total += 1
            column = 1
        # A word counts once per email, however often it occurs in it.
        for word in set(words):
            counts.setdefault(word, [0, 0])[column] += 1

    num_spam = spam_total
    num_ham = ham_total

    table = {}
    for word, (in_spam, in_ham) in counts.items():
        table[word] = [
            numpy.log((in_spam + 1) / (spam_total + 2)),
            numpy.log((in_ham + 1) / (ham_total + 2)),
        ]
    return table
#保存词语概率表
#  公司 0.025 0.4
#  哈哈 0.4 0.01
#  优惠 0.01 0.5
def saveWordsTable(table, path=r"E:\PyCharm_文档文件\table_last.txt", encoding=None):
    """Write the word table to a text file, one word per line.

    Entries are sorted by their ``[spam, ham]`` value pair in descending
    order and written as ``<word> <spam value> <ham value>``.

    Args:
        table: Dict mapping each word to a two-item sequence
            ``[spam value, ham value]``.
        path: Destination file. Defaults to the location the function has
            always written to.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.
    """
    ranked = sorted(table.items(), key=lambda x: x[1], reverse=True)
    # Plain "w" is enough: the file is only written, never read back.
    with open(path, 'w', encoding=encoding) as fp:
        fp.writelines(
            " ".join((str(word), str(values[0]), str(values[1]))) + "\n"
            for word, values in ranked
        )
'''计算一封新邮件属于垃圾邮件的概率(先考虑邮件中只有一个词语的情况)
假设E表示单词,H1表示来自垃圾邮件,H2表示来自正常邮件
计算公式为 P(H1|E) = P(H1)*P(E|H1)/P(E)
其中 P(E) = P(E|H1)*P(H1) + P(E|H2)*P(H2)
垃圾邮件的后验概率 P(H1|E) = P(E|H1)*P(H1)/P(E)
同理得正常邮件的后验概率 P(H2|E) = P(E|H2)*P(H2)/P(E)
对于多个单词,假设各单词相互独立,把各单词的 P(E|H) 相乘后同理可得
'''
def checkOneEmail(label,table):
    """Classify one email from its words and print the verdict.

    Works entirely in log space. The table already stores log-probabilities,
    so the per-word values are summed directly (naive Bayes independence
    assumption), the log class priors are added, and the result is
    normalised with ``numpy.logaddexp`` to obtain the log posterior of each
    class.

    Prints ``spam`` or ``ham`` followed by the log posterior probability of
    the chosen class. Ties are reported as ham.

    Args:
        label: Iterable of the words of the email to classify.
        table: Dict mapping each word to ``[log_p_spam, log_p_ham]``, as
            produced by ``calWordsFreqTable``.

    Raises:
        ValueError: If the module-level counters ``num_spam`` and ``num_ham``
            are both zero, i.e. no training emails have been counted.
    """
    total = num_spam + num_ham
    if total == 0:
        raise ValueError(
            "no training emails counted; run calWordsFreqTable first"
        )

    log_like_spam = 0.0
    log_like_ham = 0.0
    for word in label:
        entry = table.get(word)
        if entry is None:
            # A word never seen in training carries no evidence either way.
            continue
        log_like_spam += entry[0]
        log_like_ham += entry[1]

    # True division for both priors; a class without emails gets log(0) =
    # -inf, which logaddexp handles, so the divide warning is silenced.
    with numpy.errstate(divide='ignore'):
        log_joint_spam = numpy.log(num_spam / total) + log_like_spam
        log_joint_ham = numpy.log(num_ham / total) + log_like_ham

    log_evidence = numpy.logaddexp(log_joint_spam, log_joint_ham)
    p1 = log_joint_spam - log_evidence
    p2 = log_joint_ham - log_evidence
    if p1 > p2:
        print('spam')
        print(p1)
    else:
        print('ham')
        print(p2)
if __name__ == "__main__":
    # Load the label and word list of every email named in the index file.
    Emails = loadAllEmails(EmailsIndex)
    # Turn the loaded emails into the per-word table used for prediction.
    table = calWordsFreqTable(Emails)
    # Classify a small hand-written sample of words and print the verdict.
    label = ["你好", "的", "爸爸"]
    checkOneEmail(label, table)

标签:__,spam,拓扑,贝叶斯,垃圾邮件,print,filename,Emails,邮件
来源: https://blog.csdn.net/vimpirespider/article/details/120145245