pandas数据抽样
作者:互联网
# -*- coding: utf-8 -*- import pandas as pd import os import re import xlsxwriter import xlrd from random import randint, sample ####线上作业明细 input1 = r"D:\4.AOI类型\AOI作业监控\exportAoiInfoLost_20210805142956.csv" ####输出 out = r"D:\4.AOI类型\AOI作业监控\作业准确率\线上标记异常分析(0804审核)_20210804.xlsx" data1 = pd.read_csv(input1,dtype=str,error_bad_lines=False) data1.fillna('', inplace=True) # 全空的替代 data1.replace('nan', '', inplace=True) # 空字符的替代 data1.replace('NaN', '', inplace=True) # 空字符的替代 data1['flag_rw']=((data1['任务状态']=="未完成")|(data1['补码审核状态']=="驳回")) data1=data1[(data1['地区']!='852Y')] data1=data1[data1['flag_rw']==False] ###线上标记异常抽查 # data1=data1[(data1['任务状态']=='完成-标记异常未审核')|(data1['任务状态']=='完成-标记异常审核通过')] # ####线上外包质量 # data1=data1[(data1['补码审核状态']=='审核通过')|(data1['补码审核状态']=='补码修改通过')] # ######线上标记异常 # data1=data1[(data1['任务状态']=='完成-标记异常未审核')|(data1['任务状态']=='完成-标记异常审核通过')] #####外包名单 input5 = r"D:\4.AOI类型\线上类型核实平台作业人员明细.csv" data5 = pd.read_csv(input5, dtype=str, error_bad_lines=False) data1=pd.merge(data1,data5,how='left',left_on='补码工号',right_on='工号') data1.fillna('', inplace=True) # 全空的替代 data1.replace('nan', '', inplace=True) # 空字符的替代 data1.replace('NaN', '', inplace=True) # 空字符的替代 ####是否过滤外包审核结果条件 #data1=data1[data1['姓名']==""] print(len(data1)) #####选择补码作业日期 data1['日期']=data1['任务包提交时间'].str[:10] data1=data1[(data1['日期']=='2021-08-03')|(data1['日期']=='2021-08-04')|(data1['日期']=='2021-08-02')] print('地区作业量,',len(data1)) def aoi_lx(b,a): if a!="" and b!="": return a elif a=="" and b!="": return b else: return "" data1['dl_before']=data1['AOI大类(修改前)'] data1['dl_after']=data1['AOI大类(修改后)'] data1['xl_before']=data1['AOI小类(修改前)'] data1['xl_after']=data1['AOI小类(修改后)'] data1['name_after']= data1['AOI名称(修改后)'] data1['name_before']= data1['AOI名称(修改前)'] data1['大类']=data1.apply(lambda x:aoi_lx(x.dl_before,x.dl_after),axis=1) data1['fa_type']=data1.apply(lambda x:aoi_lx(x.xl_before,x.xl_after),axis=1) data1['aoi_name']=data1['name_before'] data1['city_code']=data1['城市'] data1['aoi_id']=data1['AOIID'] data1.fillna('', inplace=True) # 全空的替代 data1.replace('nan', '', inplace=True) # 空字符的替代 data1.replace('NaN', '', inplace=True) # 空字符的替代 n0=len(data1) ####仅抽查住宅小区 # data1=data1[data1['大类']=='住宅小区'] # n0=len(data1) #####核实总量,计算比例:核实量/任务总量 n_hs=5000 bl=n_hs*1.0/n0 cityfile = r"D:\4.AOI类型\city_file.csv" city_file = pd.read_csv(cityfile, dtype=str) city_dict = dict(zip(city_file.city_code, city_file.city_name)) area_dict = dict(zip(city_file.city_code, city_file.region)) ad_dict= dict(zip(city_file.city_code, city_file.adcode)) data1['大区'] = data1['city_code'].map(area_dict) data1['城市'] = data1['city_code'].map(city_dict) sp=data1[['大区']].copy() sp.drop_duplicates(subset=['大区'], keep='first',inplace=True) sp['比例']=bl cc_dict = dict(zip(sp.大区, sp.比例)) ####按数量抽样,可调整 # sp['数量']=100 # cc_dict = dict(zip(sp.大区, sp.数量)) lxf = r"D:\4.AOI类型\AOI类型配置.csv" lx = pd.read_csv(lxf, dtype=str) dl_dict = dict(zip(lx.类型编码, lx.大类)) xl_dict = dict(zip(lx.类型编码, lx.子类)) #data1['大类'] = data1['fa_type'].map(dl_dict) data1['小类'] = data1['fa_type'].map(xl_dict) ###按比例 def typicalsampling_bl(group, typicalFracDict): name = group.name frac = typicalFracDict[name] return group.sample(frac=frac) ####按数量 def typicalsamling_num(group, typicalNDict): name = group.name n = typicalNDict[name] return group.sample(n=n) ###按数量 # result = data1.groupby('大区', group_keys=False).apply(typicalsamling_num, cc_dict) #####按比例 # result = data1.groupby('大区', group_keys=False).apply(typicalsampling_bl, cc_dict ) result = data1 result.to_excel(out,index=0,columns=['大区','aoi_id','AOI名称(修改前)','city_code','大类','小类','AOI大类(修改前)', 'AOI小类(修改前)','AOI名称(修改后)','AOI大类(修改后)','AOI小类(修改后)','AOI名称(审核修改前)','AOI大类(审核修改前)', 'AOI小类(审核修改前)','AOI大类对比','任务状态','标记异常原因','仓管工号','标记异常原因','仓管工号','任务包领取时间', '任务包提交时间','补码审核状态','补码工号','补码审核时间','姓名']) # result.to_excel(out,index=0)
标签:city,抽样,name,数据,inplace,AOI,dict,data1,pandas 来源: https://www.cnblogs.com/bahe/p/15210745.html