其他分享
首页 > 其他分享> > 机器学习 周志华 课后习题 7.3

机器学习 周志华 课后习题 7.3

作者:互联网

7.3 试编程实现拉普拉斯修正的朴素贝叶斯分类器,并以西瓜数据集3.0为训练集,对“测1”样本进行判别。

以下为 Python 代码实现。

import pandas as pd
import math
import numpy as np
class LaplacianNB():
    """
    Naive Bayes classifier with Laplacian correction for a binary
    classification problem.

    Each instance is expected to carry ``n_discrete`` leading categorical
    attributes followed by ``n_continuous`` real-valued attributes; the
    continuous attributes are modelled with a per-class Gaussian density.
    """

    def __init__(self, n_discrete=6, n_continuous=2):
        """
        Args:
            n_discrete: number of leading categorical attributes
                (default 6, matching watermelon dataset 3.0).
            n_continuous: number of trailing real-valued attributes
                (default 2, matching watermelon dataset 3.0).
        """
        self.n_discrete = n_discrete
        self.n_continuous = n_continuous

    def train(self, X, y):
        """
        Train the Laplace-corrected naive Bayes classifier on (X, y).

        Input:
            X: list of instances; each instance is a sequence of attribute
               values (categorical first, then continuous).
            y: list of labels. 0 represents bad, 1 represents good.
        """
        N = len(y)
        self.classes = self.count_list(y)
        self.class_num = len(self.classes)
        # Laplace-corrected class priors: P(c) = (|D_c| + 1) / (|D| + #classes).
        self.classes_p = {}
        for c, n in self.classes.items():
            self.classes_p[c] = float(n + 1) / (N + self.class_num)

        # Discrete attributes: class-conditional probabilities with Laplace
        # correction.  The correction denominator must be N_i, the number of
        # distinct values attribute i takes over the WHOLE training set
        # (eq. 7.20 in the textbook), not just the values observed within one
        # class; otherwise a value unseen in one class gets no probability at
        # all and predict() would fail on it.
        self.attr_value_nums = []
        self.discrete_attris_with_good_p = []
        self.discrete_attris_with_bad_p = []
        for i in range(self.n_discrete):
            attr_with_good = [X[j][i] for j in range(N) if y[j] == 1]
            attr_with_bad = [X[j][i] for j in range(N) if y[j] != 1]
            n_values = len(set(attr_with_good) | set(attr_with_bad))
            self.attr_value_nums.append(n_values)
            self.discrete_attris_with_good_p.append(
                self.discrete_p(self.count_list(attr_with_good),
                                self.classes[1], n_values))
            self.discrete_attris_with_bad_p.append(
                self.discrete_p(self.count_list(attr_with_bad),
                                self.classes[0], n_values))

        # Continuous attributes: per-class Gaussian parameters (mean, variance).
        self.good_mus = []
        self.good_vars = []
        self.bad_mus = []
        self.bad_vars = []
        for i in range(self.n_continuous):
            col = self.n_discrete + i
            attr_with_good = [X[j][col] for j in range(N) if y[j] == 1]
            attr_with_bad = [X[j][col] for j in range(N) if y[j] != 1]
            good_mu, good_var = self.mu_var_of_list(attr_with_good)
            bad_mu, bad_var = self.mu_var_of_list(attr_with_bad)
            self.good_mus.append(good_mu)
            self.good_vars.append(good_var)
            self.bad_mus.append(bad_mu)
            self.bad_vars.append(bad_var)

    def predict(self, x):
        """
        Classify a single instance x.

        Returns:
            (p_good, p_bad, label): the two unnormalized posterior scores and
            the predicted label, "是" (good) or "否" (bad).
        """
        p_good = self.classes_p[1]
        p_bad = self.classes_p[0]
        for i in range(self.n_discrete):
            # An attribute value never seen in a class still gets the Laplace
            # floor (0 + 1) / (|D_c| + N_i) instead of raising KeyError.
            good_floor = 1.0 / (self.classes[1] + self.attr_value_nums[i])
            bad_floor = 1.0 / (self.classes[0] + self.attr_value_nums[i])
            p_good *= self.discrete_attris_with_good_p[i].get(x[i], good_floor)
            p_bad *= self.discrete_attris_with_bad_p[i].get(x[i], bad_floor)
        for i in range(self.n_continuous):
            col = self.n_discrete + i
            p_good *= self.continuous_p(x[col], self.good_mus[i], self.good_vars[i])
            p_bad *= self.continuous_p(x[col], self.bad_mus[i], self.bad_vars[i])
        if p_good >= p_bad:
            return p_good, p_bad, "是"
        else:
            return p_good, p_bad, "否"

    def count_list(self, l):
        """Return a dict mapping each element of l to its occurrence count."""
        unique_dict = {}
        for e in l:
            unique_dict[e] = unique_dict.get(e, 0) + 1
        return unique_dict

    def discrete_p(self, d, N_class, n_values=None):
        """
        Laplace-corrected conditional probabilities for one discrete attribute:
        P(value | class) = (count + 1) / (N_class + n_values).

        Args:
            d: dict mapping attribute value -> count within the class.
            N_class: number of training samples belonging to the class.
            n_values: number of distinct values the attribute can take (N_i).
                Defaults to len(d) for backward compatibility, but the
                dataset-wide count must be passed for a correct correction.
        """
        if n_values is None:
            n_values = len(d)
        return {a: float(n + 1) / (N_class + n_values) for a, n in d.items()}

    def continuous_p(self, x, mu, var):
        """Gaussian probability density N(mu, var) evaluated at x."""
        if var == 0:
            # Degenerate distribution (all training values identical): avoid
            # the division by zero the plain formula would hit.
            return 1.0 if x == mu else 0.0
        return (1.0 / math.sqrt(2 * math.pi * var)
                * math.exp(-(x - mu) ** 2 / (2 * var)))

    def mu_var_of_list(self, l):
        """Return (mean, population variance) of the numeric list l."""
        mu = sum(l) / float(len(l))
        var = sum((v - mu) ** 2 for v in l) / float(len(l))
        return mu, var

if __name__ == "__main__":
    lnb = LaplacianNB()
    # Watermelon dataset 3.0: first column is the sample id, the last column
    # is the label ("是"/"否"), and the 8 columns in between are the
    # attributes (6 discrete + 2 continuous).
    data = np.array(pd.read_csv("data/西瓜数据集3.0.csv"))
    X = data[:, 1:-1]
    # Derive labels from the CSV's label column instead of hard-coding them,
    # so the code stays correct if the file's row order changes.
    # NOTE(review): assumes the last column holds "是"/"否" — confirm against
    # the actual CSV header.
    y = [1 if label == "是" else 0 for label in data[:, -1]]
    lnb.train(X, y)
    # The "测1" test sample from the textbook (exercise 7.3).
    test1 = ["青绿", "蜷缩", "浊响", "清晰", "凹陷", "硬滑", 0.697, 0.460]
    result = lnb.predict(test1)
    print("predict result: ", result)

标签:周志华,good,attr,self,bad,classes,课后,var,习题
来源: https://blog.csdn.net/JJPOMELO/article/details/121467151