关于TP,TN,FP,FN相关验证
作者:互联网
先贴出基础代码:
以我前面博客所述例子进行举例:
得到数据:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
# Root directory of the input data and the evaluation cut-off.
basepath = 'D:/data/'
topK = 50  # keep only the top-K recommendations per user

# Select the SimHei font; the plots below use Chinese legend labels.
plt.rcParams.update({'font.sans-serif': ['simhei']})
获取商品信息
# Lightweight record describing one item.
class ItemsInfo:
    """Metadata record for a single item.

    Attributes:
        itemclass: class label of the item (empty string when unknown).
        iteminfo: '/'-separated descriptor of the item; the part before the
            first '/' is used elsewhere in this script as the item's
            top-level category (empty string when unknown).
    """

    # Class-level defaults, kept so that ``ItemsInfo.itemclass`` and
    # ``ItemsInfo.iteminfo`` still resolve for existing code.
    itemclass = ''
    iteminfo = ''

    def __init__(self, itemclass='', iteminfo=''):
        """Create a record; both fields default to the empty string."""
        self.itemclass = itemclass
        self.iteminfo = iteminfo

    def __repr__(self):
        return (f"{type(self).__name__}(itemclass={self.itemclass!r}, "
                f"iteminfo={self.iteminfo!r})")
# Load the item metadata file into infoDict: itemID -> ItemsInfo.
# Each line is tab-separated: field 0 is the item ID, field 1 the item class,
# field 2 the item info string.
infoPath = 'D:/data/doc_info.txt'
infoDict = dict()
print('正在加载商品数据...')
# The context manager guarantees the file handle is closed after loading.
with open(infoPath, 'r', encoding='utf-8') as infofile:
    for line in tqdm(infofile.readlines()):
        # Drop the line terminator so it does not end up inside the last field.
        line = line.rstrip('\r\n')
        if not line:
            continue  # blank line: nothing to record
        fields = line.split('\t')  # split once instead of once per access
        itemID = fields[0]
        if itemID in infoDict:
            continue  # the first occurrence of an item ID wins
        tmp = ItemsInfo()
        # Index 1 needs at least 2 fields and index 2 at least 3; the previous
        # ">= 1" / ">= 2" guards let short lines through to an IndexError.
        if len(fields) >= 2:
            tmp.itemclass = fields[1]
        if len(fields) >= 3:
            tmp.iteminfo = fields[2]
        infoDict[itemID] = tmp
# Load the recommendation output into user_recommendDict:
# userID -> list of at most topK item IDs, ordered by descending score.
# Each line is "<userID>\t<item>_<score>,<item>_<score>,...".
filepath = basepath+'step4/part-r-00000'
user_recommendDict = {}
# The context manager guarantees the file handle is closed after loading.
with open(filepath, 'r') as recommandFile:
    for line in tqdm(recommandFile.readlines()):
        line = line.rstrip('\r\n')
        if not line:
            continue  # blank line: no user record to parse
        fields = line.split('\t')
        userID = fields[0]
        itemList = fields[1].split(',')
        # Column 0 holds the item ID, column 1 the score (still a string here).
        userdf = pd.DataFrame(itemList)[0].str.split('_',expand=True)
        # Builtin float replaces the np.float alias, which NumPy 1.24 removed.
        userdf[1] = userdf[1].astype(float)
        userdf = userdf.sort_values(by=[1],ascending=False).iloc[:topK,:]
        recommendList = userdf[0].tolist()
        user_recommendDict[userID] = recommendList
# Load the per-user click history into user_clickDict:
# userID -> list of clicked item IDs.
# Each line is "<userID>\t<item>_<value>,<item>_<value>,..."; only the part of
# each entry before the first '_' (the item ID) is kept.
filepath = basepath+'train_data_output.转置评分矩阵/part-r-00000_'
user_clickDict = {}
# The context manager guarantees the file handle is closed after loading.
with open(filepath, 'r') as itemFile:
    for line in tqdm(itemFile.readlines()):
        line = line.rstrip('\r\n')
        if not line:
            continue  # blank line: no user record to parse
        fields = line.split('\t')
        userID = fields[0]
        itemList = fields[1].split(',')
        # A plain comprehension yields the same prefixes as the former
        # per-line DataFrame + str.split, without building a DataFrame.
        clickItemList = [entry.split('_')[0] for entry in itemList]
        user_clickDict[userID] = clickItemList
进行计算(先按商品粒度、再按内容大类别粒度,统计不同 k 下的平均精确率、召回率和 F1 值)
# Item-level evaluation: for each cut-off k, average precision, recall and F1
# over all users, comparing the top-k recommended items with the clicked items.
PPlist = []
RRlist = []
FFlist = []

# The set of clicked items per user does not depend on k, so build it once
# instead of once per (k, user) pair.
clickSetDict = {userID: set(user_clickDict[userID]) for userID in user_recommendDict}

# NOTE(review): range(1, topK) stops at k = topK - 1, so the full top-K list
# is never scored — confirm this is intended.
for topk in range(1,topK):
    plist = []
    rlist = []
    flist = []
    for userID in user_recommendDict:
        recommendList = user_recommendDict[userID][:topk]
        clickItemList = user_clickDict[userID]
        # Number of recommended items the user actually clicked.
        hitCount = len(clickSetDict[userID] & set(recommendList))
        p = hitCount/len(recommendList)
        r = hitCount/len(clickItemList)
        plist.append(p)
        rlist.append(r)
        # F1 is defined as 0 when both precision and recall are 0.
        flist.append(0 if p+r == 0 else 2*p*r/(p+r))
    PPlist.append(np.mean(plist))
    RRlist.append(np.mean(rlist))
    FFlist.append(np.mean(flist))

# One curve per metric, plotted against k.
plt.plot(range(1,topK),PPlist,range(1,topK),RRlist,range(1,topK),FFlist)
plt.legend(['精确率','召回率','F1值'])
plt.show()
# Category-level evaluation: same metrics as the item-level run, but items are
# first mapped to their top-level category (the part of iteminfo before the
# first '/'), so a recommendation counts as a hit when its category was clicked.
PPlist = []
RRlist = []
FFlist = []

# Categories of each user's clicked items do not depend on k, so look them up
# once instead of once per (k, user) pair.
clickInfoDict = {}
for userID in user_recommendDict:
    clickInfoDict[userID] = [infoDict[clickItem].iteminfo.split('/')[0]
                             for clickItem in user_clickDict[userID]]

# NOTE(review): range(1, topK) stops at k = topK - 1, so the full top-K list
# is never scored — confirm this is intended.
for topk in range(1,topK):
    plist = []
    rlist = []
    flist = []
    for userID in user_recommendDict:
        # Top-k recommendations and the user's clicked-item categories.
        recommendList = user_recommendDict[userID][:topk]
        clickInfoList = clickInfoDict[userID]
        # Top-level category of every recommended item.
        recommendInfoList = [infoDict[recommandItem].iteminfo.split('/')[0]
                             for recommandItem in recommendList]
        interSet = set(clickInfoList) & set(recommendInfoList)
        # NOTE(review): the numerator counts distinct shared categories while
        # the denominators count list entries (duplicates included); kept as
        # in the original — confirm this is the intended metric.
        p = len(interSet)/len(recommendInfoList)
        r = len(interSet)/len(clickInfoList)
        plist.append(p)
        rlist.append(r)
        # F1 is defined as 0 when both precision and recall are 0.
        flist.append(0 if p+r == 0 else 2*p*r/(p+r))
    PPlist.append(np.mean(plist))
    RRlist.append(np.mean(rlist))
    FFlist.append(np.mean(flist))

# One curve per metric, plotted against k.
plt.plot(range(1,topK),PPlist,range(1,topK),RRlist,range(1,topK),FFlist)
plt.legend(['精确率','召回率','F1值'])
plt.show()
统计占比及数量
# Load the per-item statistics table (tab-separated, no header row).
itemdf = pd.read_csv(basepath+'itemStat2/part-r-00000',sep='\t',header=None)
# Column 1 packs "<click count>_<watch duration>"; split it into two columns.
newcol = itemdf[1].str.split('_',expand=True)
# Column 3 packs "<top-level category>/<sub-category>"; split it likewise.
newcol2 = itemdf[3].str.split('/',expand=True)
# Bare expressions such as the one below only display a result in an
# interactive/notebook session; they have no effect when run as a script.
itemdf.head()
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
newcol.info()
# Builtin int/float replace the np.int/np.float aliases removed in NumPy 1.24.
newcol[0] = newcol[0].astype(int)
newcol[1] = newcol[1].astype(float)
newcol.info()
itemdf['点击量'] = newcol[0]
itemdf['观看时长'] = newcol[1]
itemdf['大类别'] = newcol2[0]
itemdf['小类别'] = newcol2[1]
# The packed source columns are no longer needed once they have been split.
itemdf.drop(columns=[1,3],inplace=True)
itemdf.head(5)
itemdf.sort_values(by=['点击量'],ascending=False,inplace=True)
itemdf.head(10)
# Select the numeric columns before aggregating so that string columns are
# never summed (pandas >= 2.0 no longer drops them silently).
iteminfodf = itemdf.groupby(by=['大类别'])[['点击量','观看时长']].sum()
iteminfodf.sort_values(by=['点击量','观看时长'],ascending=False)
# numeric_only=True restricts the sum to numeric columns, matching the result
# older pandas versions produced by dropping non-numeric columns.
itemclassdf = itemdf.groupby(by=[2]).sum(numeric_only=True)
itemclassdf
......
标签:FP,userID,TP,TN,itemdf,split,len,line,append 来源: https://blog.csdn.net/lyt_520/article/details/122377258