其他分享
首页 > 其他分享 > 《机器学习实战》第十三章 利用PCA来简化数据

《机器学习实战》第十三章 利用PCA来简化数据

作者:互联网

from numpy import *
from numpy.linalg import linalg
from numpy.ma import mean, argsort, shape
import pandas as pd
import numpy as np

def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a matrix.

    Args:
        fileName: path of the text file to read.
        delim: field separator used on each line (tab by default).

    Returns:
        A numpy matrix with one row per non-blank line and one column per
        field, every field parsed with float().

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails.
    with open(fileName) as fr:
        datArr = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # ignore blank lines (e.g. a trailing newline)
        ]
    # `mat` was only an alias of `asmatrix` and was removed in NumPy 2.0.
    return asmatrix(datArr)

def pca(dataMat, topNfeat=9999999):
    """Project dataMat onto its leading principal components (failing variant).

    Args:
        dataMat: data to reduce; columns are treated as variables because
            the covariance is computed with rowvar=0.
        topNfeat: number of leading components to keep; the default is a
            very large number, intended to keep every component.

    Returns:
        A (lowDDataMat, reconMat) tuple: the centered data multiplied by the
        kept eigenvectors, and that product mapped back with the mean
        re-added.

    NOTE(review): in this listing `mean` and `argsort` are imported from
    numpy.ma, and the article reports that this version raises
    "ValueError: operands could not be broadcast together with shapes
    (1000,2) (2,1)". Presumably the masked-array result of `mean` turns the
    `*` below into an element-wise product instead of a matrix product --
    confirm.
    """
    meanVals = mean(list(dataMat), axis=0)  # mean over axis 0 (`mean` is numpy.ma's here)
    meanRemoved = dataMat - meanVals  # center the data by subtracting the mean
    covMat = cov(meanRemoved, rowvar=0)  # covariance matrix, columns as variables
    eigVals,eigVects = linalg.eig(mat(covMat))  # eigenvalues and eigenvectors of the covariance
    eigValInd = argsort(eigVals)  # indices that order the eigenvalues from smallest to largest
    eigValInd = eigValInd[:-(topNfeat+1):-1]  # walk backwards: indices of the topNfeat largest
    redEigVects = eigVects[:,eigValInd]  # eigenvector columns for the kept eigenvalues
    lowDDataMat = meanRemoved * redEigVects  # transform the data into the new space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals  # map back and restore the mean
    return lowDDataMat, reconMat

# Load the data file and ask pca for a single principal component.
dataMat = loadDataSet('testSet.txt')
lowDMat, reconMAT = pca(dataMat, 1)
# The article reports a ValueError for this version instead of a printed shape.
print(shape(lowDMat))

报错:ValueError: operands could not be broadcast together with shapes (1000,2) (2,1)

含义:形状为 (1000,2) 和 (2,1) 的操作数无法一起广播。

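与书里面的对照看了下,原因是函数导入错误:`mean` 是从 `numpy.ma` 导入的,返回的是掩码数组(MaskedArray),于是 `meanRemoved` 不再是矩阵,`meanRemoved * redEigVects` 变成了需要广播的逐元素相乘,而不是矩阵乘法,所以报出上面的形状错误。改为统一 `import numpy as np` 并调用 `np.mean`、`np.argsort` 等函数后,顺利通过。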


import numpy as np

def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a matrix.

    Args:
        fileName: path of the text file to read.
        delim: field separator used on each line (tab by default).

    Returns:
        A numpy matrix with one row per non-blank line and one column per
        field, every field parsed with float().

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails.
    with open(fileName) as fr:
        datArr = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # ignore blank lines (e.g. a trailing newline)
        ]
    # `np.mat` was only an alias of `np.asmatrix` and was removed in NumPy 2.0.
    return np.asmatrix(datArr)

def pca(dataMat, topNfeat=9999999):
    """Reduce data to its leading principal components.

    Args:
        dataMat: 2-D data (numpy matrix or array-like) with one observation
            per row and one feature per column.
        topNfeat: number of principal components to keep. Values larger
            than the number of features keep every component, which is
            what the default does.

    Returns:
        A (lowDDataMat, reconMat) tuple of numpy matrices:
        lowDDataMat is the centered data projected onto the kept
        components (one column per component, largest variance first);
        reconMat is that projection mapped back to the original feature
        space with the mean re-added.
    """
    dataMat = np.asmatrix(dataMat)
    meanVals = dataMat.mean(axis=0)  # per-feature mean, shape (1, n_features)
    meanRemoved = dataMat - meanVals  # center the data

    # np.cov returns a 0-d array for a single feature; asmatrix lifts it to 1x1.
    # (np.mat was removed in NumPy 2.0; np.asmatrix is the same function.)
    covMat = np.asmatrix(np.cov(meanRemoved, rowvar=0))

    # A covariance matrix is symmetric, so use the symmetric solver: unlike
    # eig it always yields real eigenvalues and orthonormal eigenvectors.
    eigVals, eigVects = np.linalg.eigh(covMat)

    # Sort ascending, then walk backwards to take the topNfeat largest.
    eigValInd = np.argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]

    lowDDataMat = meanRemoved @ redEigVects  # project into the new space
    reconMat = (lowDDataMat @ redEigVects.T) + meanVals  # map back, restore the mean
    return lowDDataMat, reconMat

# Load the data file, keep one principal component, and print the reduced shape.
dataMat = loadDataSet('testSet.txt')
lowDMat, reconMAT = pca(dataMat, 1)
print(np.shape(lowDMat))  # the article reports (1000, 1) for this run

输出:(1000, 1)

标签:实战,eigValInd,meanVals,第十三章,import,np,dataMat,line,PCA
来源: https://blog.csdn.net/qq_40016005/article/details/115458343