Apriori算法 源码
作者:互联网
具体原理暂未整理,后续补充。
下面是代码
view code
#coding:utf-8
# Sample data: ten market-basket transactions with named goods.
def genData():
    """Return the built-in sample transactions.

    Each transaction is a list of item-name strings. A fresh list is built
    on every call, so callers are free to mutate the result.
    """
    return [
        ['牛奶', '啤酒', '尿布'],
        ['牛奶', '面包', '黄油'],
        ['牛奶', '尿布', '饼干'],
        ['面包', '黄油', '饼干'],
        ['啤酒', '尿布', '饼干'],
        ['牛奶', '尿布', '面包', '黄油'],
        ['尿布', '面包', '黄油'],
        ['啤酒', '尿布'],
        ['牛奶', '尿布', '面包', '黄油'],
        ['啤酒', '饼干'],
    ]
def loadDataSet():
    """Return a small integer-item sample: four transactions over items 1-5.

    A fresh list is built on every call.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def genC1(datalist) -> list:
    """Build the size-1 candidate itemsets C1.

    Args:
        datalist: iterable of transactions, each an iterable of hashable items.

    Returns:
        A list with one single-item ``frozenset`` per distinct item, in the
        order the items are first encountered (deterministic across runs,
        unlike iteration over a plain ``set`` of strings).
    """
    # dict keys de-duplicate while preserving insertion order.
    distinct = dict.fromkeys(goods for items in datalist for goods in items)
    return [frozenset([goods]) for goods in distinct]
def mergeToNext(preL):
    """Join size-k itemsets into size-(k+1) candidate itemsets.

    Two itemsets are joined when their first k-1 items (in sorted order)
    are identical, so every candidate is produced by exactly one pair.

    Args:
        preL: list of ``frozenset`` itemsets, all of the same size k, whose
            items are mutually orderable (they are sorted internally).

    Returns:
        A list of ``frozenset`` candidates of size k+1; empty when ``preL``
        is empty or no pair shares a prefix.
    """
    # Guard: the original indexed preL[0] unconditionally and raised
    # IndexError when there was nothing to merge.
    if not preL:
        return []
    k = len(preL[0])
    # Sort each itemset once instead of once per pair comparison.
    prefixes = [sorted(itemset)[:k - 1] for itemset in preL]
    Ck = []
    for i, left in enumerate(preL):
        for j in range(i + 1, len(preL)):
            if prefixes[i] == prefixes[j]:
                Ck.append(left | preL[j])
    return Ck
def genfreq(dataset, preC, minsupport):
    """Keep the candidates whose support reaches ``minsupport``.

    Support of a candidate is the fraction of transactions that contain
    every item of the candidate.

    Args:
        dataset: sized collection of transactions as ``set``/``frozenset``.
        preC: iterable of candidate itemsets (``frozenset``).
        minsupport: minimum support, as a fraction of ``len(dataset)``.

    Returns:
        ``(L, objfreq)`` where ``L`` lists the frequent candidates in input
        order and ``objfreq`` maps each of them to its support. Both are
        empty when ``dataset`` is empty.
    """
    L = []
    objfreq = {}
    total = len(dataset)
    # Guard: support is undefined without transactions (the original
    # divided by zero here).
    if total == 0:
        return L, objfreq
    for item in preC:
        hits = sum(1 for data in dataset if item <= data)
        support = hits / total
        if support >= minsupport:
            L.append(item)
            objfreq[item] = support
    return L, objfreq
def GetRules(freqset, R, suppotdata, rulelist, minconf):
    """Append every rule ``antecedent -> consequent`` of ``freqset`` that
    reaches ``minconf`` to ``rulelist``.

    Starting from the consequents in ``R``, each level tests
    ``conf = support(freqset) / support(freqset - consequent)``. Consequents
    that pass are combined into consequents one item larger for the next
    level; enlarging a consequent can only lower the confidence, so failed
    consequents are never extended.

    Args:
        freqset: the frequent itemset (``frozenset``) to split into rules.
        R: list of initial consequents, all of the same size (typically the
            single-item subsets of ``freqset``).
        suppotdata: mapping from itemset to support. The misspelled name is
            kept so existing keyword callers keep working.
        rulelist: output list; ``[antecedent, consequent, conf]`` entries are
            appended in place.
        minconf: minimum confidence for a rule to be kept.

    Returns:
        None. Results are delivered through ``rulelist``.
    """
    from itertools import combinations

    consequents = list(R)
    # A consequent as large as freqset would leave an empty antecedent.
    while consequents and len(consequents[0]) < len(freqset):
        passing = []
        for consequent in consequents:
            antecedent = freqset - consequent
            # P(consequent | antecedent) = P(freqset) / P(antecedent)
            conf = suppotdata[freqset] / suppotdata[antecedent]
            if conf >= minconf:
                rulelist.append([antecedent, consequent, conf])
                passing.append(consequent)
        # Next level: unions of two passing consequents that differ in
        # exactly one item, de-duplicated while keeping discovery order.
        next_size = len(consequents[0]) + 1
        seen = set()
        merged = []
        for left, right in combinations(passing, 2):
            candidate = left | right
            if len(candidate) == next_size and candidate not in seen:
                seen.add(candidate)
                merged.append(candidate)
        consequents = merged
def genRules(Llist, supportdata, minconf = .5):
    """Generate association rules from the frequent itemsets.

    Args:
        Llist: frequent itemsets grouped by size; ``Llist[0]`` holds the
            single-item sets and is skipped because a rule needs two items.
        supportdata: mapping from itemset to support.
        minconf: minimum confidence for a rule to be kept.

    Returns:
        The list filled in by ``GetRules``, one entry per accepted rule.
    """
    rulelist = []
    for L in Llist[1:]:
        # An empty level means no larger frequent itemsets follow.
        if not L:
            break
        for freqset in L:
            # Seed the search with every single-item consequent.
            R = [frozenset([x]) for x in freqset]
            GetRules(freqset, R, supportdata, rulelist, minconf)
    return rulelist
def apriori(datalist, minsupport = .5):
    """Find all frequent itemsets level by level (C1 -> L1 -> C2 -> ...).

    Args:
        datalist: iterable of transactions, each an iterable of hashable
            items.
        minsupport: minimum support, as a fraction of the transaction count.

    Returns:
        ``(Llist, supportdata)`` where ``Llist[i]`` is the list of frequent
        itemsets (``frozenset``) found at level ``i`` (size ``i + 1``) and
        ``supportdata`` maps every frequent itemset to its support. Only
        non-empty levels are stored.
    """
    dataset = [frozenset(transaction) for transaction in datalist]
    supportdata = {}
    Llist = []
    C = genC1(dataset)
    while C:
        L, tmpfreq = genfreq(dataset, C, minsupport)
        # Stop as soon as a level has no frequent itemset: there is nothing
        # to merge (the original called mergeToNext on the empty list and
        # crashed with IndexError).
        if not L:
            break
        Llist.append(L)
        supportdata.update(tmpfreq)
        C = mergeToNext(L)
    return Llist, supportdata
if __name__ == "__main__":
    # Demo run on the integer sample; use genData() instead of
    # loadDataSet() to mine the named-goods sample.
    datalist = loadDataSet()
    Llist, supportdata = apriori(datalist)
    rulelist = genRules(Llist, supportdata)
    # Each rule is [antecedent, consequent, confidence].
    for rule in rulelist:
        print(rule[0], '->', rule[1], 'conf = ', rule[2])
标签:return,Apriori,list,supportdata,项集,Llist,算法,源码,len 来源: https://www.cnblogs.com/kikokiko/p/13996120.html