编程语言
首页 > 编程语言 > Apriori算法 python实现

Apriori算法 python实现

作者:互联网

 

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re

def read_data():
    """Return the built-in sample transaction database.

    The data is hard-coded: a list of transactions, each of which is a
    list of item-name strings.
    """
    transactions = [
        ['bread', 'cream', 'milk', 'tea'],
        ['bread', 'cream', 'milk'],
        ['cake', 'milk'],
        ['milk', 'tea'],
        ['bread', 'cake', 'milk'],
        ['bread', 'tea'],
        ['beer', 'milk', 'tea'],
        ['bread', 'tea'],
        ['bread', 'cream', 'milk', 'tea'],
    ]
    return transactions

def find_all_frequent_1_itemsets(data_set,minsup,support_count):
    """Find every frequent 1-itemset and record single-item support counts.

    Args:
        data_set: sequence of transactions, each an iterable of items.
        minsup: minimum support, as a fraction of len(data_set).
        support_count: dict mapping frozenset -> occurrence count. It is
            updated in place with the count of every single item seen.

    Returns:
        The set of keys of support_count whose count divided by
        len(data_set) is at least minsup.
    """
    # Tally each item under a one-element frozenset key.
    for transaction in data_set:
        for item in transaction:
            key = frozenset([item])
            support_count[key] = support_count.get(key, 0) + 1

    # Keep the keys whose relative support reaches the threshold.
    return {
        itemset
        for itemset in support_count
        if float(support_count[itemset]) / len(data_set) >= minsup
    }

def apriori_gen(frequent_k_sub_1_itemset):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Two (k-1)-itemsets are joined when, after sorting their items, all
    items except the last one are identical.

    Args:
        frequent_k_sub_1_itemset: iterable of frequent (k-1)-itemsets.
            Elements are normally frozensets; any other element is treated
            as a single bare item.

    Returns:
        A set of frozensets, one per joined pair.
    """
    from itertools import combinations

    # Sets are unordered, so sort each itemset once up front to get a
    # canonical item order for the prefix comparison below.
    ordered = [
        sorted(itemset) if isinstance(itemset, frozenset) else [itemset]
        for itemset in frequent_k_sub_1_itemset
    ]

    candidate_k_itemsets = set()
    # The join is symmetric, so visiting each unordered pair once is enough.
    for left, right in combinations(ordered, 2):
        # Join only distinct itemsets that share their first k-2 items.
        if left != right and left[:-1] == right[:-1]:
            # frozenset so the candidate can itself be stored in a set.
            candidate_k_itemsets.add(frozenset(left + right))
    return candidate_k_itemsets

def subset(candidate_k_itemsets,transaction):
    """Return the candidates that are fully contained in a transaction.

    Args:
        candidate_k_itemsets: iterable of hashable candidate itemsets.
        transaction: iterable of the items in one transaction.

    Returns:
        The set of candidates whose items all appear in transaction.
    """
    # Build the transaction's item set once instead of once per candidate;
    # this also keeps the result correct for one-shot iterables.
    transaction_items = set(transaction)
    return {
        candidate
        for candidate in candidate_k_itemsets
        if transaction_items.issuperset(candidate)
    }

def extract_the_frequent_K_itemsets(data_set,candidate_k_itemsets,minsup,support_count):
    """Select the frequent k-itemsets among the candidate k-itemsets.

    Args:
        data_set: the transaction database; only its length is used.
        candidate_k_itemsets: iterable of candidate itemsets.
        minsup: minimum support, as a fraction of len(data_set).
        support_count: dict mapping each candidate to its occurrence count.

    Returns:
        The set of candidates whose count divided by len(data_set) is at
        least minsup.
    """
    return {
        candidate
        for candidate in candidate_k_itemsets
        if float(support_count[candidate]) / len(data_set) >= minsup
    }



def gen_frequent_itemsets(data_set,minsup):
    """Run the Apriori main loop and collect all frequent itemsets.

    Args:
        data_set: sequence of transactions, each an iterable of items.
        minsup: minimum support, as a fraction of len(data_set).

    Returns:
        A tuple (result, support):
        result is a list of sets of frozensets. The first entry holds the
        frequent 1-itemsets (possibly empty); each later entry holds the
        non-empty set of frequent itemsets one size larger.
        support maps every frequent itemset to its support as a float
        fraction of len(data_set).
    """
    # key: itemset (frozenset), value: number of transactions containing it.
    support_count = {}
    # Fills support_count with the single-item counts as a side effect.
    Fk = find_all_frequent_1_itemsets(data_set, minsup, support_count)

    result = [Fk]

    while Fk:
        # Build the candidates from the previous level's frequent itemsets.
        Ck = apriori_gen(Fk)

        for candidate in Ck:
            support_count[candidate] = 0

        # Count, per transaction, every candidate it contains.
        for transaction in data_set:
            for candidate in subset(Ck, transaction):
                support_count[candidate] += 1

        Fk = extract_the_frequent_K_itemsets(data_set, Ck, minsup, support_count)

        if Fk:
            result.append(Fk)

    # Keep only frequent itemsets and convert counts to relative support.
    # The threshold is applied to the ratio: minsup is a fraction, so
    # comparing it to the raw count would let infrequent itemsets through.
    total = len(data_set)
    support = {}
    for itemset, count in support_count.items():
        ratio = float(count) / total
        if ratio >= minsup:
            support[itemset] = ratio

    return result, support


def gen_related_rule(freq_set,support_data,min_conf):
    """Derive association rules from the frequent itemsets.

    Args:
        freq_set: iterable of collections of frozensets (frequent itemsets
            grouped by level), visited in the order given.
        support_data: dict mapping itemset -> support value.
        min_conf: minimum confidence for a rule to be kept.

    Returns:
        A list of (antecedent, consequent, confidence) tuples without
        duplicates, in the order the rules were discovered.
    """
    rules = []
    # Every itemset visited so far; each is tried as a rule consequent.
    seen_itemsets = []

    for level in freq_set:
        for itemset in level:
            seen_itemsets.append(itemset)
            for consequent in seen_itemsets:
                # Only proper subsets of the current itemset form a rule.
                if consequent == itemset or not consequent.issubset(itemset):
                    continue

                antecedent = itemset - consequent
                # confidence = support(itemset) / support(antecedent)
                confidence = support_data[itemset] / support_data[antecedent]
                rule = (antecedent, consequent, confidence)
                if confidence >= min_conf and rule not in rules:
                    rules.append(rule)

    return rules



# Demo driver: mine the sample data and print the frequent itemsets and
# the association rules derived from them.
data_set=read_data()
minsup=0.2
minconf=0.7

freq_set,support=gen_frequent_itemsets(data_set,minsup)
related_rule=gen_related_rule(freq_set,support,minconf)

# Every print below takes one pre-formatted string, so the output is the
# same whether "print" is the Python 2 statement or the Python 3 function.
separator="="*90

for k,frequent_K_itemsets in enumerate(freq_set,1):
    print("%s %s" % (("frequent "+str(k)).ljust(70),"support"))
    print(separator)
    for frequent_K_item in frequent_K_itemsets:
        print("%s %s" % (str(frequent_K_item).ljust(70),support[frequent_K_item]))
    print(separator)
    print("")

print("")
print("%s %s" % ("related_rule".ljust(70),"conf"))
print(separator)
for i in related_rule:
    print("%s %s" % ((str(i[0])+"=>"+str(i[1])).ljust(70),str(i[2])))
print(separator)

 

标签:count,set,python,support,Apriori,item,算法,frequent,itemsets
来源: https://www.cnblogs.com/leixiao-/p/10840310.html