(3) Machine Learning in Action Notes: Naive Bayes
Pros: works even when there is relatively little data; can handle multi-class problems
Cons: sensitive to how the input data is prepared
Suitable data type: nominal data
Convert a set of words into a set of numbers
Use those numbers to compute probabilities
Well-known application: filtering spam email with naive Bayes
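Concretely, the classifier picks the class $c_i$ with the larger posterior for a word vector $\mathbf{w}$; the "naive" part is the conditional-independence assumption that factorizes the likelihood. This is the standard formulation, not specific to this post:

$$p(c_i \mid \mathbf{w}) = \frac{p(\mathbf{w} \mid c_i)\,p(c_i)}{p(\mathbf{w})}, \qquad p(\mathbf{w} \mid c_i) = \prod_k p(w_k \mid c_i)$$

In practice the log form $\log p(c_i) + \sum_k \log p(w_k \mid c_i)$ is compared instead, to avoid numerical underflow; that is exactly what classifyNB() computes in the code below.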
Classification workflow:
(1) Collect data: text files are provided
(2) Prepare data: parse the text files into token vectors
(3) Analyze data: inspect the tokens to make sure parsing is correct
(4) Train the algorithm: use the trainNB0() function built earlier
(5) Test the algorithm: use classifyNB() and build a new test function that computes the document error rate
(6) Use the algorithm: build a complete program that classifies a set of documents and prints the misclassified ones to the screen
Tokenizing text: split with Python's str.split() method; re.split() handles punctuation better.
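For example, plain str.split() leaves punctuation attached to the tokens, which is why the code below switches to re.split(); a minimal sketch, reusing the sample sentence from the commented-out test in the code:

import re

mySent = 'This book is the best book on python or M.L. I have laid eyes upon.'
print(mySent.split())              # 'upon.' keeps its trailing period
tokens = re.split(r'\W+', mySent)  # split on runs of non-word characters
print([tok.lower() for tok in tokens if len(tok) > 2])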
To estimate the classifier's error rate more precisely, repeat the random train/test split several times and average the error rates (a short sketch of this follows the code at the end).
------------------------------------------------------------
A simple example: spam filtering with naive Bayes classification.
Each email in the text dataset is processed (converted into a word vector),
and the naive Bayes classifier then decides whether it is spam.
The code implements a simple naive Bayes classifier and a text-to-vector converter.
Detailed notes are in the code comments; click here to download the dataset.
import numpy as np

# Prepare the data: build word vectors from text
def loadDataSet():
    # tokenized posts
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # class label vector: 1 = abusive, 0 = not abusive
    classVec = [0, 1, 0, 1, 0, 1]
    # return the tokenized posts and their class labels
    return postingList, classVec

def createVocabList(dataSet):
    # collect the unique words across all documents
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)

# Set-of-words model: mark which vocabulary words appear in inputSet
def setOfWords2Vec(vocabList, inputSet):
    # start from a vector of zeros
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1  # presence only, not a count
        # else: print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)

# Train: estimate the probabilities from the word vectors
# inputs: document matrix trainMatrix, class-label vector trainCategory
def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])  # size of the vocabulary
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # P(class = 1)
    # initialize counts to 1 and denominators to 2 (Laplace smoothing),
    # so a single zero count cannot zero out the whole product
    p0Num = np.ones(numWords)  # per-word counts for the normal class
    p1Num = np.ones(numWords)  # per-word counts for the abusive class
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        # a class-1 document contributes to the abusive counts;
        # for a binary problem P(0) = 1 - P(1)
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]          # word occurrences in abusive docs
            p1Denom += sum(trainMatrix[i])   # total words seen in abusive docs
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # take logs to avoid numerical underflow; classifyNB() below adds
    # log probabilities, so trainNB0 must return them in log form
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

trainMat = []
for postinDoc in listOPosts:
    # one row per document: 1 where the vocabulary word occurs in it
    trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))

# Naive Bayes classifier: takes the vector to classify plus the three
# quantities computed by trainNB0()
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# A simple classification test
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))

testingNB()

# Bag-of-words model: increment the count for every occurrence of a word,
# instead of just setting the entry to 1
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
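To see how the set-of-words and bag-of-words models differ on the same input, a quick check (the repeated 'stupid' is just an illustrative input; both words occur in the sample vocabulary):

vec_set = setOfWords2Vec(myVocabList, ['stupid', 'stupid', 'garbage'])
vec_bag = bagOfWords2VecMN(myVocabList, ['stupid', 'stupid', 'garbage'])
print(vec_set[myVocabList.index('stupid')])  # 1: presence only
print(vec_bag[myVocabList.index('stupid')])  # 2: occurrence count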
# Application: spam filtering
# Tokenizing text
# test!
# mySent = 'This book is the best book on python or M.L. I have laid eyes upon.'
# import re
# regEx = re.compile('\\W*')
# listOfTokens = regEx.split(mySent)

# Test: hold-out validation for the naive Bayes spam filter
def textParse(bigString):
    import re
    # \W+ splits on runs of non-word characters; the original \W* pattern
    # matches the empty string and misbehaves on Python 3.7+
    listOfTokens = re.split(r'\W+', bigString)
    # keep only tokens longer than 2 characters, lowercased
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Automates the spam-filter test: loads the text files under email/spam and
# email/ham and parses them into word lists. (*1)
# The classifier's probabilities are computed from the training documents only.
# trainingSet is a list of integers ranging from 0 to 49. (*2)
def spamTest():
    docList = []
    classList = []
    fullText = []
    main_email = []
    for i in range(1, 26):  # (*1)
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        main_email.append(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        main_email.append(open('email/ham/%d.txt' % i).read())  # fixed: was re-reading the spam file
        fullText.extend(wordList)
        classList.append(0)
    # build the vocabulary
    vocabList = createVocabList(docList)
    # split off the test set (*2)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly pick 10 documents for testing
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])  # move that index into the test set
        del(trainingSet[randIndex])             # and remove it from the candidates
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # build a word vector for every training document
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])  # attach the matching label
    # train on the training set only
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # vectorize the held-out document against the vocabulary
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # show the misclassified email, the prediction, and the true label
            print(main_email[docIndex])
            print(classifyNB(np.array(wordVector), p0V, p1V, pSpam))
            print(classList[docIndex])
    print('the error rate is:', float(errorCount) / len(testSet))

# spamTest()

# Search for the best parameters: same as spamTest(), but returns the
# trained probabilities together with the error rate of the split
def findthebest_Data_test():
    docList = []
    classList = []
    fullText = []
    main_email = []
    for i in range(1, 26):  # (*1)
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        main_email.append(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        main_email.append(open('email/ham/%d.txt' % i).read())  # fixed: was re-reading the spam file
        fullText.extend(wordList)
        classList.append(0)
    # build the vocabulary
    vocabList = createVocabList(docList)
    # split off the test set (*2)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly pick 10 documents for testing
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # train on the training set
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    error_rate = float(errorCount) / len(testSet)
    return p0V, p1V, pSpam, error_rate

# Run 50 random splits and keep the parameters with the lowest error rate
def find_the_data():
    p0Num = np.ones(10)  # placeholders, overwritten by the first better run
    p1Num = np.ones(10)
    PA = 0.0
    err = 1
    for i in range(50):
        a, b, c, d = findthebest_Data_test()
        if d < err:
            err = d
            p0Num = a
            p1Num = b
            PA = c
    return p0Num, p1Num, PA

# Evaluate the kept parameters on all 50 documents
# (note: this includes the documents they were trained on)
def final_test():
    p0, p1, pA = find_the_data()
    docList = []
    classList = []
    fullText = []
    main_email = []
    for i in range(1, 26):  # (*1)
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        main_email.append(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        main_email.append(open('email/ham/%d.txt' % i).read())  # fixed: was re-reading the spam file
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    errorCount = 0
    for i in range(len(docList)):
        wordVector = setOfWords2Vec(vocabList, docList[i])
        if classifyNB(np.array(wordVector), p0, p1, pA) != classList[i]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(docList))

final_test()
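As noted near the top of these notes, a single random split gives a noisy error estimate. Since findthebest_Data_test() returns its error rate as the fourth value, averaging over repeated splits is a two-liner (10 trials is an arbitrary choice):

rates = [findthebest_Data_test()[3] for _ in range(10)]
print('average error rate over 10 trials:', sum(rates) / len(rates))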
Source: https://www.cnblogs.com/AKsnoopy/p/14085074.html