Python implementation of kMeans (k-means clustering)
Author: Internet
Reference: https://www.cnblogs.com/wsine/p/5180769.html
Runtime environment
Python 3
numpy (scientific computing package)
matplotlib (needed for plotting; optional if you do not plot)
Computation flow
st=>start: Start
e=>end: End
op1=>operation: Read in the data
op2=>operation: Randomly initialize the cluster centroids
cond=>condition: Did any cluster assignment change?
op3=>operation: Assign each point to its nearest cluster
op4=>operation: Update the cluster centroids
op5=>operation: Output the result
st->op1->op2->op3->op4->cond
cond(yes)->op3
cond(no)->op5->e
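To make the flow above concrete, here is a minimal, self-contained sketch of the same loop in plain numpy. It is independent of the full implementation given further below: the function tiny_kmeans and the sample points are made up for illustration, and for brevity it runs a fixed number of iterations instead of stopping when the assignments no longer change.
import numpy as np

def tiny_kmeans(points, k, iters=10):
    # pick k distinct rows as the initial centroids
    rng = np.random.default_rng(0)
    centroids = points[rng.choice(len(points), k, replace=False)]
    for _ in range(iters):
        # assignment step: index of the nearest centroid for every point
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # update step: each centroid moves to the mean of its assigned points
        for j in range(k):
            if np.any(labels == j):
                centroids[j] = points[labels == j].mean(axis=0)
    return centroids, labels

pts = np.array([[0.0, 2.0], [0.0, 0.0], [1.5, 0.0], [5.0, 0.0], [5.0, 2.0]])
print(tiny_kmeans(pts, 2))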
Sample input
/* 788points.txt */
15.55,28.65
14.9,27.55
14.45,28.35
14.15,28.8
13.75,28.05
13.35,28.45
13,29.15
13.45,27.5
13.6,26.5
12.8,27.35
12.4,27.85
12.3,28.4
12.2,28.65
13.4,25.1
12.95,25.95
Complete 788points.txt file: download
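As a quick sanity check of the input format, the first few lines above can be parsed with the same split-on-comma logic that loadDataSet uses below (the sample string here is copied from the listing rather than read from the real file):
sample = """15.55,28.65
14.9,27.55
14.45,28.35"""
points = [list(map(float, line.split(','))) for line in sample.splitlines()]
print(points)  # [[15.55, 28.65], [14.9, 27.55], [14.45, 28.35]]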
Code implementation
# -*- coding: utf-8 -*-
__author__ = 'Wsine'
from numpy import *
import matplotlib.pyplot as plt
import operator
import time
INF = 9999999.0
def loadDataSet(fileName, splitChar='\t'):
    """
    Input:  file name
    Output: data set (a list of [float, ...] rows)
    Reads the data set from a file, one point per line.
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(splitChar)
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return dataSet
# def createDataSet():
#     """
#     Output: data set
#     Generates a small hard-coded data set (kept here for quick testing).
#     """
#     dataSet = [[0.0, 2.0],
#                [0.0, 0.0],
#                [1.5, 0.0],
#                [5.0, 0.0],
#                [5.0, 2.0]]
#     return dataSet
def distEclud(vecA, vecB):
    """
    Input:  vector A, vector B
    Output: the Euclidean distance between the two vectors
    """
    return sqrt(sum(power(vecA - vecB, 2)))
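# For example (values chosen purely for illustration):
# distEclud(mat([0.0, 0.0]), mat([3.0, 4.0])) returns 5.0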
def randCent(dataSet, k):
    """
    Input:  data set, number of clusters
    Output: a k x n matrix of k random centroids
    Each coordinate is drawn uniformly from the range of the corresponding column.
    """
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
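# Note: because the random centroids are drawn from the bounding box of the data, a centroid
# can end up with no points assigned to it; kMeans below then takes the mean of an empty set
# and that centroid becomes NaN. Re-running with a different random draw usually avoids this.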
def kMeans(dataSet, k, distMeans=distEclud, createCent=randCent):
    """
    Input:  data set, number of clusters, distance function, centroid-initialization function
    Output: centroid matrix, cluster-assignment matrix
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # assign each point to its nearest centroid
            minDist = INF  # large sentinel value; any real distance will be smaller
            minIndex = -1  # index of the nearest centroid found so far
            for j in range(k):  # compare point i against each of the k centroids
                distJI = distMeans(centroids[j, :], dataSet[i, :])  # Euclidean distance from point i to centroid j
                if distJI < minDist:  # keep the closest centroid seen so far
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:  # the assignment of point i changed on this pass
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2  # column 0: cluster index (0..k-1), column 1: squared distance
        for cent in range(k):  # move each centroid to the mean of its assigned points
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]  # rows assigned to cluster `cent` (e.g. if column 0 is [1, 2, 3, 1, 1, 2], cluster 1 gets rows 0, 3, 4)
            centroids[cent, :] = mean(ptsInClust, axis=0)  # axis=0: column-wise mean, a 1 x n matrix
    return centroids, clusterAssment
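# What kMeans returns (the values sketched here are hypothetical, for illustration only):
#   centroids      -- a k x n matrix of cluster centers, e.g. matrix([[13.2, 27.9], ...])
#   clusterAssment -- an m x 2 matrix: column 0 holds the assigned cluster index (0..k-1),
#                     column 1 holds the squared distance from the point to that centroid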
def plotFeature(dataSet, centroids, clusterAssment):  # clusterAssment has one row per row of dataSet and must stay aligned with it
    m = shape(centroids)[0]  # number of centroids (rows); shape(...)[1] would give the number of columns
    fig = plt.figure()  # 2-D scatter plot
    scatterMarkers = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']  # marker shapes, one per cluster
    scatterColors = ['blue', 'green', 'yellow', 'purple', 'orange', 'black', 'brown']  # colors, one per cluster
    ax = fig.add_subplot(111)  # a single subplot
    for i in range(m):  # i ranges over the cluster indices 0..k-1
        ptsInCurCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]  # all points currently assigned to cluster i
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        colorStyle = scatterColors[i % len(scatterColors)]
        ax.scatter(ptsInCurCluster[:, 0].flatten().A[0], ptsInCurCluster[:, 1].flatten().A[0], marker=markerStyle, c=colorStyle, s=90)  # see https://www.jianshu.com/p/53e49c02c469
    ax.scatter(centroids[:, 0].flatten().A[0], centroids[:, 1].flatten().A[0], marker='+', c='red', s=300)  # draw the centroids as large red '+' markers
def main():
    # dataSet = loadDataSet('testSet2.txt')
    dataSet = loadDataSet('788points.txt', splitChar=',')
    # dataSet = createDataSet()
    dataSet = mat(dataSet)
    resultCentroids, clustAssing = kMeans(dataSet, 6)
    print('*******************')
    print(resultCentroids)
    print('*******************')
    plotFeature(dataSet, resultCentroids, clustAssing)
if __name__ == '__main__':
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    main()
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
    plt.show()
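For an optional cross-check of the centroids, assuming scikit-learn is installed (it is not part of the original post and is not required by the code above), the same file can be clustered with sklearn.cluster.KMeans:
import numpy as np
from sklearn.cluster import KMeans

data = np.loadtxt('788points.txt', delimiter=',')
km = KMeans(n_clusters=6, n_init=10, random_state=0).fit(data)
print(km.cluster_centers_)  # compare against resultCentroids printed by main()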
Source: https://blog.csdn.net/RNG_uzi_/article/details/88286147