编程语言
首页 > 编程语言 > Apriori算法 源码

Apriori算法 源码

作者:互联网

Apriori算法 源码

具体原理暂且略过,后续再补充。

下面是代码

view code


#coding:utf-8


# generate data
def genData():
    """Return a sample market-basket dataset.

    The result is a fresh list of ten transactions; each transaction is a
    list of item-name strings.
    """
    transactions = [
        ['牛奶', '啤酒', '尿布'],
        ['牛奶', '面包', '黄油'],
        ['牛奶', '尿布', '饼干'],
        ['面包', '黄油', '饼干'],
        ['啤酒', '尿布', '饼干'],
        ['牛奶', '尿布', '面包', '黄油'],
        ['尿布', '面包', '黄油'],
        ['啤酒', '尿布'],
        ['牛奶', '尿布', '面包', '黄油'],
        ['啤酒', '饼干'],
    ]
    return transactions

def loadDataSet():
    """Return a small toy dataset: four transactions of integer items."""
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions


# 传入参数:数据集
# 返回值:候选项集C1
def genC1(datalist)->[frozenset]:
    """Build the size-1 candidate itemsets (C1).

    Every distinct item that appears in any transaction of ``datalist`` is
    wrapped in its own single-element frozenset.  The result order follows
    the iteration order of an intermediate ``set``, so callers should not
    rely on it.
    """
    distinct_items = {goods for items in datalist for goods in items}
    return [frozenset([goods]) for goods in distinct_items]



# 传入参数:频繁项集list
# 返回:下一个候选项集->list(frozenset)
def mergeToNext(preL):
    """Join same-size itemsets into the next, one-item-larger candidates.

    Two itemsets of size k are merged (set union) when their first k-1
    items, taken in sorted order, are identical.

    Args:
        preL: list of frozensets, all expected to have the same size
            (the size is read from the first element).

    Returns:
        list of frozensets, one per joinable pair, in pair order (i < j).
        An empty list is returned when ``preL`` holds fewer than two
        itemsets, since no pair can be formed.
    """
    # Fewer than two itemsets: nothing to pair up.  This also avoids the
    # IndexError that reading preL[0] would raise on an empty list.
    if len(preL) < 2:
        return []
    k = len(preL[0])
    # Sort each itemset once instead of once per compared pair.
    prefixes = [sorted(itemset)[:k - 1] for itemset in preL]
    Ck = []
    for i in range(len(preL) - 1):
        for j in range(i + 1, len(preL)):
            if prefixes[i] == prefixes[j]:
                Ck.append(preL[i] | preL[j])
    return Ck

# 传入参数:数据集,候选项集,最小支持度
# 返回值:频繁项集->list(frozenset),频繁项集支持度->dict
def genfreq(dataset, preC, minsupport):
    """Filter candidate itemsets down to the frequent ones.

    Args:
        dataset: transactions, each a set/frozenset of items.
        preC: candidate itemsets (frozensets) to evaluate.
        minsupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(L, objfreq)``: ``L`` is the list of candidates whose
        support is at least ``minsupport`` (in candidate order), and
        ``objfreq`` maps each of those candidates to its support.
    """
    frequent = []
    support_of = {}
    for candidate in preC:
        # A transaction contains the candidate when the candidate is a subset of it.
        hits = sum(1 for transaction in dataset if candidate <= transaction)
        support = hits / len(dataset)
        if support >= minsupport:
            frequent.append(candidate)
            support_of[candidate] = support
    return frequent, support_of



# 传入参数:频繁项,规则后集,支持度集合,规则集合,最小置信度
# 无返回值
def GetRules(freqset, R, suppotdata, rulelist, minconf):
    """Collect association rules derived from one frequent itemset.

    For every consequent ``ret`` in ``R`` the rule ``(freqset - ret) -> ret``
    is scored with

        confidence = support(freqset) / support(freqset - ret)

    and appended to ``rulelist`` as ``[antecedent, consequent, confidence]``
    when the confidence is at least ``minconf``.  The consequents that
    passed are then joined into consequents one item larger and the process
    repeats until no larger consequent can be formed.

    Args:
        freqset: the frequent itemset (frozenset) the rules are built from.
        R: list of same-size consequent frozensets to start from.
        suppotdata: mapping from itemset to support.  (The misspelled name
            is kept so existing callers keep working.)
        rulelist: output list, extended in place.
        minconf: minimum confidence for a rule to be kept.

    Returns:
        None; results are written into ``rulelist``.
    """
    # Use the argument rather than a same-named module-level global.
    supportdata = suppotdata
    consequents = R
    # A consequent must leave a non-empty antecedent, so stop once the
    # consequents are as large as the itemset itself.
    while consequents and len(consequents[0]) < len(freqset):
        passing = []
        for ret in consequents:
            antecedent = freqset - ret
            conf = supportdata[freqset] / supportdata[antecedent]
            if conf >= minconf:
                rulelist.append([antecedent, ret, conf])
                passing.append(ret)
        # Growing a consequent can only shrink the antecedent and therefore
        # never raises confidence, so only passing consequents are extended.
        # At least two are needed to form a join, and the joined size must
        # stay below the itemset size.
        if len(passing) < 2 or len(passing[0]) + 1 >= len(freqset):
            return
        consequents = mergeToNext(passing)

# 传入参数:各长度频繁项集,频繁项集支持度,最小置信度
# 返回值:规则列表以及置信度
def genRules(Llist, supportdata, minconf = .5):
    """Generate association rules from the frequent itemsets of every level.

    Args:
        Llist: list of levels; each level is a list of frequent frozensets.
            The first level is skipped, and processing stops at the first
            empty level.
        supportdata: mapping from itemset to support, handed to GetRules.
        minconf: minimum confidence, handed to GetRules.

    Returns:
        The list that GetRules fills with rule entries.
    """
    rulelist = []
    for level in Llist[1:]:
        if not level:
            break
        for freqset in level:
            # Seed rule generation with every single-item consequent.
            singletons = [frozenset([x]) for x in freqset]
            GetRules(freqset, singletons, supportdata, rulelist, minconf)
    return rulelist

# 传入参数:数据集,最小支持度
# 返回值:各长度频繁项集->list(list(frozenset)),频繁项集支持度->dict
def apriori(datalist, minsupport = .5):
    """Run the level-wise Apriori search for frequent itemsets.

    Starting from the single-item candidates, each round filters the
    candidates by support (C_k -> L_k) and joins the survivors into the
    next, larger candidates (L_k -> C_{k+1}).

    Args:
        datalist: iterable of transactions, each an iterable of hashable items.
        minsupport: minimum support for an itemset to count as frequent.

    Returns:
        A tuple ``(Llist, supportdata)``: ``Llist[i]`` is the list of
        frequent itemsets found in round ``i`` (itemsets of size ``i + 1``),
        and ``supportdata`` maps every frequent itemset to its support.
        If a round yields no frequent itemsets, that empty list is the last
        entry of ``Llist``.
    """
    dataset = [frozenset(transaction) for transaction in datalist]
    supportdata = {}
    Llist = []
    C = genC1(dataset)
    while C:
        L, level_support = genfreq(dataset, C, minsupport)
        Llist.append(L)
        supportdata.update(level_support)
        # Nothing frequent at this size means nothing larger can be built;
        # stop here instead of trying to join an empty level.
        if not L:
            break
        C = mergeToNext(L)
    return Llist, supportdata


if __name__ == "__main__":
    # Demo run on the integer toy dataset; genData() provides an
    # alternative dataset of item names.
    datalist = loadDataSet()
    Llist, supportdata = apriori(datalist)
    rulelist = genRules(Llist,supportdata)
    # Each rule entry holds [left-hand itemset, right-hand itemset, confidence].
    for lhs, rhs, confidence in rulelist:
        print(lhs,'->',rhs,'conf = ',confidence)

标签:return,Apriori,list,supportdata,项集,Llist,算法,源码,len
来源: https://www.cnblogs.com/kikokiko/p/13996120.html