其他分享
首页 > 其他分享 > 关于TP,TN,FP,FN相关验证

关于TP,TN,FP,FN相关验证

作者:互联网

先贴基础

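(原文此处为一张图片,未能保留;按上下文应为混淆矩阵的基本定义,整理如下:)

- TP(真正例):预测为正,实际为正
- FP(假正例):预测为正,实际为负
- FN(假负例):预测为负,实际为正
- TN(真负例):预测为负,实际为负

$$\text{Precision} = \frac{TP}{TP + FP},\qquad \text{Recall} = \frac{TP}{TP + FN},\qquad F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$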

 以我前面博客所述例子进行举例:

得到数据:

import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
# Select the SimHei font for sans-serif text so the Chinese legend labels used
# in the plots can be rendered.
plt.rcParams['font.sans-serif'] = ['simhei']
# Directory prefix that the data file paths are built from.
basepath = 'D:/data/'
# Keep only the top-K recommendations per user.
topK = 50

 获取商品信息
# Custom record type for item metadata.
class ItemsInfo:
    """Plain holder for the metadata fields of one item.

    Both fields are class-level defaults set to the empty string; assigning
    to them on an instance shadows the default for that instance only.
    """

    # Item class label; empty string until assigned.
    itemclass: str = ''
    # Item info string; empty string until assigned.
    iteminfo: str = ''
# Load item metadata from a tab-separated file into infoDict.
# Column 0 is the item ID; columns 1 and 2, when present, are stored as the
# item's class and info string. Only the first record per item ID is kept.
infoPath = 'D:/data/doc_info.txt'
infoDict = dict()
# The context manager closes the file handle once loading finishes or fails.
with open(infoPath,'r',encoding='utf-8') as infofile:
    print('正在加载商品数据...')
    for line in tqdm(infofile.readlines()):
        # Blank lines carry no record.
        if not line.strip():
            continue
        # Split once, after dropping the line terminator so it does not end
        # up inside the last field.
        fields = line.rstrip('\n').split('\t')
        itemID = fields[0]
        if itemID not in infoDict:
            tmp = ItemsInfo()
            # Index n is only valid when the line has at least n+1 fields.
            if len(fields) >= 2:
                tmp.itemclass = fields[1]
            if len(fields) >= 3:
                tmp.iteminfo = fields[2]
            infoDict[itemID] = tmp


# Load every user's recommendation list into user_recommendDict.
# Each line is split on a tab into a user ID and a comma-separated list of
# entries; each entry is split on '_' into an item ID and a value that is
# parsed as a float and used as the ranking key.
filepath = basepath+'step4/part-r-00000'
user_recommendDict = {}
# The context manager closes the file handle once loading finishes or fails.
with open(filepath,'r') as recommandFile:
    for line in tqdm(recommandFile.readlines()):
        fields = line.split('\t')
        userID = fields[0]
        itemList = fields[1].split(',')
        userdf = pd.DataFrame(itemList)[0].str.split('_',expand=True)
        # Use the builtin float: the np.float alias was removed in NumPy 1.24.
        userdf[1] = userdf[1].astype(float)
        # Highest-ranked entries first, truncated to the top-K.
        userdf = userdf.sort_values(by=[1],ascending=False).iloc[:topK,:]
        recommendList = userdf[0].tolist()
        user_recommendDict[userID] = recommendList

# Load every user's clicked-item list into user_clickDict.
# Each line is split on a tab into a user ID and a comma-separated list of
# entries; only the part of each entry before the first '_' is kept.
filepath = basepath+'train_data_output.转置评分矩阵/part-r-00000_'
user_clickDict = {}
# The context manager closes the file handle once loading finishes or fails.
with open(filepath,'r') as itemFile:
    for line in tqdm(itemFile.readlines()):
        fields = line.split('\t')
        userID = fields[0]
        itemList = fields[1].split(',')
        # A plain string split yields the same prefixes as building a
        # DataFrame per line, without the per-line allocation cost.
        clickItemList = [item.split('_')[0] for item in itemList]
        user_clickDict[userID] = clickItemList

进行计算

# Item-level evaluation: for each cutoff, average precision, recall and F1
# over all users by comparing the truncated recommendation list with the
# user's clicked items, then plot the three curves against the cutoff.
PPlist, RRlist, FFlist = [], [], []
# NOTE(review): range(1,topK) stops at topK-1, so the full top-K list is
# never scored — confirm this is intended.
cutoffs = range(1,topK)
for topk in cutoffs:
    precisions, recalls, f1_scores = [], [], []
    for userID, rankedItems in user_recommendDict.items():
        recommendList = rankedItems[:topk]
        clickItemList = user_clickDict[userID]
        # Number of distinct items present in both lists.
        hits = len(set(clickItemList).intersection(recommendList))
        p = hits/len(recommendList)
        r = hits/len(clickItemList)
        precisions.append(p)
        recalls.append(r)
        # F1 is defined as 0 when both precision and recall are 0.
        f1_scores.append(0 if p+r == 0 else 2*p*r/(p+r))
    PPlist.append(np.mean(precisions))
    RRlist.append(np.mean(recalls))
    FFlist.append(np.mean(f1_scores))
plt.plot(cutoffs,PPlist,cutoffs,RRlist,cutoffs,FFlist)
plt.legend(['精确率','召回率','F1值'])
plt.show()

# Category-level evaluation: the same precision / recall / F1 sweep, but each
# item is first mapped to the leading '/'-separated segment of its info string.
PPlist = []
RRlist = []
FFlist = []
# A user's clicked-category set does not depend on the cutoff, so build it
# once per user instead of once per (cutoff, user) pair.
clickInfoSets = {
    userID: {infoDict[clickItem].iteminfo.split('/')[0]
             for clickItem in user_clickDict[userID]}
    for userID in user_recommendDict
}
# NOTE(review): range(1,topK) stops at topK-1, so the full top-K list is
# never scored — confirm this is intended.
for topk in range(1,topK):
    plist = []
    rlist = []
    flist = []
    for userID in user_recommendDict:
        # Truncated recommendation list and the categories it covers.
        recommendList = user_recommendDict[userID][:topk]
        recommendInfoSet = {infoDict[recommandItem].iteminfo.split('/')[0]
                            for recommandItem in recommendList}
        clickInfoSet = clickInfoSets[userID]

        interSet = clickInfoSet & recommendInfoSet
        # The overlap counts distinct categories, so the denominators must
        # count distinct categories too. Dividing by the item counts (with
        # repeated categories) deflates both precision and recall.
        p = len(interSet)/len(recommendInfoSet)
        r = len(interSet)/len(clickInfoSet)
        plist.append(p)
        rlist.append(r)
        # F1 is defined as 0 when both precision and recall are 0.
        flist.append(2*p*r/(p+r) if p+r else 0)
    PPlist.append(np.mean(plist))
    RRlist.append(np.mean(rlist))
    FFlist.append(np.mean(flist))
plt.plot(range(1,topK),PPlist,range(1,topK),RRlist,range(1,topK),FFlist)
plt.legend(['精确率','召回率','F1值'])
plt.show()

统计占比及数量

# Read the item statistics table (tab-separated, no header row).
itemdf = pd.read_csv(basepath+'itemStat2/part-r-00000',sep='\t',header=None)
# Column 1 packs the play count and the watch duration joined by '_'.
newcol = itemdf[1].str.split('_',expand=True)
# Column 3 packs the major and minor category joined by '/'.
newcol2 = itemdf[3].str.split('/',expand=True)
itemdf.head()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than wrapped in print().
newcol.info()
# Use the builtin int / float: the np.int and np.float aliases were removed
# in NumPy 1.24.
newcol[0] = newcol[0].astype(int)
newcol[1] = newcol[1].astype(float)
newcol.info()

# Attach the parsed values as named columns, then drop the two packed source
# columns they were split from.
derivedColumns = (
    ('点击量', newcol[0]),
    ('观看时长', newcol[1]),
    ('大类别', newcol2[0]),
    ('小类别', newcol2[1]),
)
for columnName, columnValues in derivedColumns:
    itemdf[columnName] = columnValues
itemdf.drop(columns=[1,3],inplace=True)
itemdf.head(5)

# Order items by click count, highest first.
itemdf.sort_values(by=['点击量'],ascending=False,inplace=True)
itemdf.head(10)

# Total clicks and watch duration per major category. The two columns are
# selected before aggregating so the string columns are never summed; summing
# them is dropped, concatenated or rejected depending on the pandas version.
iteminfodf = itemdf.groupby(by=['大类别'])[['点击量','观看时长']].sum()

# Display-only: the sorted frame is not stored, iteminfodf keeps group order.
iteminfodf.sort_values(by=['点击量','观看时长'],ascending=False)

# Per-group totals keyed by column 2, restricted to numeric columns for the
# same reason as above.
itemclassdf = itemdf.groupby(by=[2]).sum(numeric_only=True)
itemclassdf

......

标签:FP,userID,TP,TN,itemdf,split,len,line,append
来源: https://blog.csdn.net/lyt_520/article/details/122377258