Apriori算法 python实现
作者:互联网
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Apriori frequent-itemset mining and association-rule generation.

Itemsets are represented as ``frozenset`` objects so they can be used as
dictionary keys and as members of other sets.
"""
import re


def read_data():
    """Return the built-in sample data set: a list of transactions."""
    return [
        ['bread', 'cream', 'milk', 'tea'],
        ['bread', 'cream', 'milk'],
        ['cake', 'milk'],
        ['milk', 'tea'],
        ['bread', 'cake', 'milk'],
        ['bread', 'tea'],
        ['beer', 'milk', 'tea'],
        ['bread', 'tea'],
        ['bread', 'cream', 'milk', 'tea'],
    ]


def find_all_frequent_1_itemsets(data_set, minsup, support_count):
    """Find every frequent 1-itemset.

    ``support_count`` is a dictionary owned by the caller; it is filled in
    place with the support count of every 1-itemset seen in ``data_set``
    (frequent or not), keyed by ``frozenset([item])``.

    Returns the set of 1-itemsets whose support (count divided by the
    number of transactions) is at least ``minsup``.
    """
    for transaction in data_set:
        # Support is the number of transactions containing the item, so an
        # item repeated inside one transaction must be counted only once.
        for item in set(transaction):
            key = frozenset([item])
            support_count[key] = support_count.get(key, 0) + 1

    total = len(data_set)
    frequent_1_itemsets = set()
    for itemset in support_count:
        if float(support_count[itemset]) / total >= minsup:
            frequent_1_itemsets.add(itemset)
    return frequent_1_itemsets


def _as_sorted_list(itemset):
    """Return the items of ``itemset`` as a sorted list (bare items are wrapped)."""
    if isinstance(itemset, frozenset):
        return sorted(itemset)
    return [itemset]


def apriori_gen(frequent_k_sub_1_itemset):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Two (k-1)-itemsets are joined when, in sorted order, their first k-2
    items are identical. Returns a set of ``frozenset`` candidates.
    """
    # Sets are unordered, so work on sorted copies; sort each itemset once
    # instead of once per pair.
    ordered = [_as_sorted_list(itemset) for itemset in frequent_k_sub_1_itemset]

    candidate_k_itemsets = set()
    for index, first in enumerate(ordered):
        # The join is symmetric, so each unordered pair is visited once.
        for second in ordered[index + 1:]:
            if first != second and first[:-1] == second[:-1]:
                candidate_k_itemsets.add(frozenset(first + second))
    return candidate_k_itemsets


def subset(candidate_k_itemsets, transaction):
    """Return the candidates that are contained in ``transaction``."""
    items = set(transaction)
    return set(
        candidate for candidate in candidate_k_itemsets
        if set(candidate).issubset(items)
    )


def extract_the_frequent_K_itemsets(data_set, candidate_k_itemsets, minsup,
                                    support_count):
    """Keep the candidate k-itemsets whose support is at least ``minsup``.

    ``support_count`` maps each candidate to its support count; the support
    is that count divided by the number of transactions in ``data_set``.
    """
    total = len(data_set)
    if not total:
        # Nothing can be frequent without transactions; also avoids a
        # division by zero.
        return set()
    return set(
        candidate for candidate in candidate_k_itemsets
        if float(support_count[candidate]) / total >= minsup
    )


def gen_frequent_itemsets(data_set, minsup):
    """Run Apriori and return ``(result, support)``.

    ``result`` is a list whose element at index k-1 is the set of frequent
    k-itemsets. ``support`` maps every frequent itemset to its support as
    a fraction of the number of transactions.
    """
    # key: itemset (frozenset), value: support count
    support_count = {}
    # Frequent 1-itemsets; support_count is filled inside the call.
    Fk = find_all_frequent_1_itemsets(data_set, minsup, support_count)
    result = [Fk]

    while Fk:
        Ck = apriori_gen(Fk)  # candidate k-itemsets from frequent (k-1)-itemsets
        for candidate in Ck:
            support_count[frozenset(candidate)] = 0
        for transaction in data_set:
            # Count every candidate contained in this transaction.
            for candidate in subset(Ck, transaction):
                support_count[frozenset(candidate)] += 1
        Fk = extract_the_frequent_K_itemsets(data_set, Ck, minsup, support_count)
        if Fk:
            result.append(Fk)

    # Keep only the frequent itemsets and turn counts into supports. The
    # threshold is a fraction, so it is compared with the support ratio and
    # not with the raw count.
    total = len(data_set)
    support = {}
    for itemset, count in support_count.items():
        ratio = float(count) / total
        if ratio >= minsup:
            support[itemset] = ratio
    return result, support


def gen_related_rule(freq_set, support_data, min_conf):
    """Generate association rules from the frequent itemsets.

    ``freq_set`` is the list of frequent-itemset sets ordered by itemset
    size, ``support_data`` maps each frequent itemset to its support, and
    ``min_conf`` is the minimum confidence.

    Returns a list of ``(antecedent, consequent, confidence)`` tuples.
    """
    related_rule_list = []
    # Every frequent itemset seen so far. Levels are visited in increasing
    # size, so all proper subsets of an itemset are already in this list.
    sub_set_list = []
    for frequent_K_itemsets in freq_set:
        for freq_k_item in frequent_K_itemsets:
            sub_set_list.append(freq_k_item)
            for sub_set in sub_set_list:
                if freq_k_item == sub_set:
                    continue
                if not sub_set.issubset(freq_k_item):
                    continue
                antecedent = freq_k_item - sub_set
                # confidence(antecedent => sub_set)
                conf = support_data[freq_k_item] / support_data[antecedent]
                related_rule = (antecedent, sub_set, conf)
                if conf >= min_conf and related_rule not in related_rule_list:
                    related_rule_list.append(related_rule)
    return related_rule_list


data_set = read_data()
minsup = 0.2
minconf = 0.7
freq_set, support = gen_frequent_itemsets(data_set, minsup)
related_rule = gen_related_rule(freq_set, support, minconf)

# Each print call receives one pre-built string so the statements behave the
# same under Python 2 and Python 3.
k = 1
for frequent_K_itemsets in freq_set:
    print(("frequent " + str(k)).ljust(70) + " " + "support")
    print("=" * 90)
    k += 1
    for frequent_K_item in frequent_K_itemsets:
        print(str(frequent_K_item).ljust(70) + " " + str(support[frequent_K_item]))
    print("=" * 90)
print("")
print("")
print("related_rule".ljust(70) + " " + "conf")
print("=" * 90)
for i in related_rule:
    print((str(i[0]) + "=>" + str(i[1])).ljust(70) + " " + str(i[2]))
print("=" * 90)
标签:count,set,python,support,Apriori,item,算法,frequent,itemsets 来源: https://www.cnblogs.com/leixiao-/p/10840310.html