
K-Means Clustering (Python 3)


1. The K-Means Algorithm

1.1 K-Means Support Functions in Python 3

from numpy import *
import matplotlib
import matplotlib.pyplot as plt

# Load a tab-separated data file into a list of lists of floats
def loadDataSet(filename):
    dataMat = []
    fr = open(filename)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # convert every field to a float
        dataMat.append(fltLine)
    fr.close()
    return dataMat
# Euclidean distance between two vectors
def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))

# Generate k random centroids that lie within the bounds of the data set
def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)  # uniform random values in [minJ, minJ + rangeJ]
    return centroids
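
# Optional sanity check for the three support functions above (a minimal
# sketch, not part of the original listing; it assumes the tab-separated
# file testSet.txt used later in this post is in the working directory,
# and datMatCheck is just a throwaway name for this check).
datMatCheck = mat(loadDataSet('testSet.txt'))
print(min(datMatCheck[:, 0]), max(datMatCheck[:, 0]))  # per-column range that bounds the random centroids
print(randCent(datMatCheck, 2))                        # two random centroids inside those bounds
print(distEclud(datMatCheck[0], datMatCheck[1]))       # Euclidean distance between the first two points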

# K-means clustering algorithm
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  # assignment matrix: column 0 = cluster index, column 1 = squared error
    centroids = createCent(dataSet, k)   # use the centroid initializer passed in as a parameter
    clusterChanged = True
    while clusterChanged:  # iterate until no assignment changes
        clusterChanged = False
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):  # find the nearest centroid
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:  # if any assignment changed, keep iterating
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2  # record the new assignment and its squared error
        print(centroids)
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]  # all points assigned to this cluster
            centroids[cent, :] = mean(ptsInClust, axis=0)  # recompute the centroid as the column-wise mean
    return centroids, clusterAssment
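
# Because kMeans() takes the distance measure and the centroid initializer as
# parameters, other metrics can be swapped in without touching the algorithm.
# A minimal sketch (distManhattan is a hypothetical helper, not part of the
# original post):
def distManhattan(vecA, vecB):
    return sum(abs(vecA - vecB))  # L1 (city-block) distance between two vectors
# Example call: kMeans(dataSet, 4, distMeas=distManhattan)
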
# Plot each cluster with a distinct color and mark the centroids with '+'
def plotCentroids(datMat, centroids, clusterAssment, k):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i in range(k):
        ptsInClust = datMat[nonzero(clusterAssment[:, 0].A == i)[0]]  # points assigned to cluster i
        ax.scatter(ptsInClust[:, 0].flatten().A[0], ptsInClust[:, 1].flatten().A[0], marker="^", s=90)
    ax.scatter(centroids[:, 0].flatten().A[0], centroids[:, 1].flatten().A[0], marker="+", s=300, c="black")
    plt.show()
datMat = mat(loadDataSet('testSet.txt'))
myCentroids, clustAssing = kMeans(datMat,4)
[[ 2.05223983 -3.0746459 ]
 [ 2.07512432  3.50918187]
 [-2.18388394 -1.47117211]
 [ 0.27725078  5.1426455 ]]
[[ 2.65077367 -2.79019029]
 [ 2.66534547  2.99911595]
 [-3.4859745  -2.31300105]
 [-2.10585717  3.15782844]]
[[ 2.65077367 -2.79019029]
 [ 2.6265299   3.10868015]
 [-3.53973889 -2.89384326]
 [-2.46154315  2.78737555]]
plotCentroids(datMat, myCentroids, clustAssing, 4)

(Figure: the four clusters found by kMeans on testSet.txt, plotted as triangles with the cluster centroids marked by '+')

1.2 Improving Cluster Performance with Post-Processing

Because K-means starts from randomly chosen centroids, it converges to a local minimum that can differ from run to run. The squared errors stored in the second column of the assignment matrix give a way to judge a run: summing them yields the total SSE (sum of squared error), which post-processing steps such as splitting a high-SSE cluster or merging the two closest centroids try to reduce. Running kMeans a second time on the same data shows how the result depends on the initialization:

datMat = mat(loadDataSet('testSet.txt'))
myCentroids, clustAssing = kMeans(datMat,4)
plotCentroids(datMat, myCentroids, clustAssing, 4)
[[ 1.96818879  3.10418929]
 [-2.66894365  2.35308276]
 [ 3.56299862  0.14036498]
 [-4.42485328 -4.1647096 ]]
[[ 2.52792822  3.30405044]
 [-2.46154315  2.78737555]
 [ 2.8675685  -2.36043623]
 [-3.38237045 -2.9473363 ]]
[[ 2.6265299   3.10868015]
 [-2.46154315  2.78737555]
 [ 2.80293085 -2.7315146 ]
 [-3.38237045 -2.9473363 ]]

(Figure: the clusters found by a second run of kMeans on testSet.txt with a different random initialization)
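
To compare runs quantitatively, the total SSE can be read straight off the assignment matrix (a minimal sketch, not part of the original listing; clustAssing is the matrix returned by kMeans above):

totalSSE = sum(clustAssing[:, 1])  # sum of squared distances from each point to its assigned centroid
print(totalSSE)                    # lower means tighter clusters; compare this value across runs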

1.3 The Bisecting K-Means Algorithm

# Bisecting K-means clustering algorithm
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  # cluster assignments and squared errors
    centroid0 = mean(dataSet, axis=0).tolist()[0]  # centroid of the entire data set
    centList = [centroid0]  # list that holds all centroids
    for j in range(m):
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :])**2  # squared error of every point to the single centroid
    while (len(centList) < k):  # keep splitting clusters until we have k of them
        lowestSSE = inf  # initialize the best SSE to infinity
        for i in range(len(centList)):  # try splitting every cluster in the list
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]  # treat the points of this cluster as a small data set
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)  # split it into two clusters and get their errors
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and NotSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:  # keep the split that gives the lowest total SSE
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)  # relabel the two new clusters
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print("the bestCentToSplit is: ", bestCentToSplit)
        print("the len of bestClustAss is: ", len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]  # replace the split centroid with one of the new ones
        centList.append(bestNewCents[1, :].tolist()[0])  # and append the other
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss  # update the assignments of the split cluster
    return mat(centList), clusterAssment
datMat3 = mat(loadDataSet('testSet2.txt'))
centList, myNewAssments = biKmeans(datMat3, 3)
centList
[[ 3.37991172  3.38570226]
 [-3.10486239 -2.37443875]]
[[ 1.86138027  3.22269712]
 [-1.70174271 -0.30206818]]
[[ 2.35797261  3.21160974]
 [-1.72153338 -0.00938424]]
[[ 2.76275171  3.12704005]
 [-1.73028592  0.20133246]]
[[ 2.93386365  3.12782785]
 [-1.70351595  0.27408125]]
sseSplit, and NotSplit:  541.2976292649145 0.0
the bestCentToSplit is:  0
the len of bestClustAss is:  60
[[3.38922822 0.8721925 ]
 [3.91147439 0.73983691]]
[[2.75314728 3.06695644]
 [4.560311   3.6756705 ]]
[[2.48449707 2.95091147]
 [4.2819634  3.658577  ]]
sseSplit, and NotSplit:  25.535514707587865 501.7683305828214
[[-2.71396475  1.10408322]
 [-1.82642235 -3.1436269 ]]
[[-2.94737575  3.3263781 ]
 [-0.45965615 -2.7782156 ]]
sseSplit, and NotSplit:  67.2202000797829 39.52929868209309
the bestCentToSplit is:  1
the len of bestClustAss is:  40





matrix([[ 2.93386365,  3.12782785],
        [-2.94737575,  3.3263781 ],
        [-0.45965615, -2.7782156 ]])
plotCentroids(datMat3, centList, myNewAssments, 3)

(Figure: the three clusters found by biKmeans on testSet2.txt, with the centroids marked by '+')
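
The same quality check applies here (again a minimal sketch; myNewAssments is the assignment matrix returned by biKmeans above):

print(sum(myNewAssments[:, 1]))  # total SSE of the 3-cluster bisecting K-means solution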

Summary

K-means partitions a data set into k clusters by alternating between assigning every point to its nearest centroid and moving each centroid to the mean of the points assigned to it. Because the result depends on the randomly chosen initial centroids, a run can converge to a poor local minimum; the total SSE measures the quality of a clustering, and bisecting K-means addresses the problem by starting from a single cluster and repeatedly splitting whichever cluster yields the largest reduction in total SSE.

Source: https://blog.csdn.net/sungod2/article/details/100054536