数据库
首页 > 数据库> > python连接Mysql共享汽车行业案例分析

python连接Mysql共享汽车行业案例分析

作者:互联网

学习记录——共享汽车分析

前言

  1. 本文仅记录个人学习过程写的代码供自己复盘使用,如果对你有帮助和启发那就更好了,
  2. 新人作品,欢迎讨论和斧正,大神轻喷。
  3. 纯代码实现,无结论。
  4. 一些相似的维度举一反三就行。
  5. 需要数据集练习的可以留言

目标

  1. python, mysql, matplotlib 代码练习
  2. 常用数据指标的实现

数据导入

python 连接数据库 Mysql

#获取数据方法
import pymysql 
import pandas as pd 
from matplotlib import pyplot as plt
from matplotlib import font_manager

def get_mysql_data(DB, sql):
    conn = pymysql.connect(host=DB['host'], port=DB['port'], user=DB['user'], password=DB['password'], database=DB['dbname'])
 
    # 创建游标
    cursor = conn.cursor()
    # 执行sql语句
    cursor.execute(sql)
    # 调出数据
    data = cursor.fetchall()
    # cols为字段信息 
    cols = cursor.description
    # 将数据转换为DataFrame
    col = []
    for i in cols:
        col.append(i[0])
    # data转成list形式 
    data = list(map(list, data))
    data = pd.DataFrame(data,columns=col)
    # 关闭游标以及连接
    cursor.close()
    conn.close()
    return data

日期时间处理——获取年月日时维度

def time_data(data):
        
    data['下单年'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[0]) 
    data['下单年月'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[0]+"/"+x.split('-')[1])
    data['下单月'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[1])
    data['下单日'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[2]) 
    data['下单时'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[0]) 
    data['付款年'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[0]) 
    data['付款年月'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[0]+"/"+x.split('-')[1])
    data['付款月'] =data["付款日期"].astype("str").apply(lambda x: x.split('-')[1])
    data['付款日'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[2]) 
    data['付款时'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[0]) 
    return data



DB = { 'host':"127.0.0.1",'port':3306, 'user':'root','password':'password','dbname':'datebase'}

留存分析

# 留存分析
SQL='''select *,
concat(ROUND(100*次日留存用户数/日活跃用户数),"%") 次日留存率,
concat(ROUND(100*三日留存用户数/日活跃用户数),"%") 三日留存率,
concat(ROUND(100*七日留存用户数/日活跃用户数),"%") 七日留存率
from(
select date(a.`付款时间`) 日期 ,DATE_FORMAT(a.`付款时间`,"%H") 时间,
COUNT(DISTINCT a.`用户ID`) 日活跃用户数,
COUNT(DISTINCT b.`用户ID`) 次日留存用户数,
COUNT(DISTINCT c.`用户ID`) 三日留存用户数,
COUNT(DISTINCT d.`用户ID`) 七日留存用户数
from paper_data a 
left join paper_data b 
on  a.`用户ID`=b.`用户ID`
and date(b.`付款时间`)=date(a.`付款时间`)+1
left join paper_data c
on  a.`用户ID`=c.`用户ID`
and date(c.`付款时间`)=date(a.`付款时间`)+3
left join paper_data d
on  a.`用户ID`=d.`用户ID`
and date(d.`付款时间`)=date(a.`付款时间`)+7
where a.`付款时间` between "2020-01-01" and "2020-01-31"
GROUP BY date(a.`付款时间`)
)f;'''
liuchun = get_mysql_data(DB, SQL)
liuchun
x=liuchun['日期']
y1=liuchun['次日留存率']
y2=liuchun['三日留存率']
y3=liuchun['七日留存率']
plt.figure(figsize=(25,10),dpi=80)
my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
rects1=plt.plot(x,y1,color="red",alpha=0.5,linestyle="-",linewidth=3,label="次日留存率")
rects2=plt.plot(x,y2,color="g",alpha=0.5,linestyle="-",linewidth=3,label="三日留存率")
rects3=plt.plot(x,y3,color="b",alpha=0.5,linestyle="-",linewidth=3,label="七日留存率")
rects4=plt.legend(prop=my_font,loc="best")
for i in range(len(x)):
        plt.text(x[i],y1[i],y1[i],fontsize=15,ha="center")

for i in range(len(x)):
        plt.text(x[i],y2[i],y2[i],fontsize=15,ha="center")
for i in range(len(x)):
        plt.text(x[i],y3[i],y3[i],fontsize=15,ha="center")
        
plt.show()


留存分析

订单分析

订单分析(可以把订单换成其他的,比如用户数,销售量等数据)

# 1.根据下单的时间来统计订单量
# (1)年维度
SQL='''select date_format(付款时间,"%Y")下单年 ,count(订单ID) as 订单量 from paper_data group by date_format(付款时间,"%Y")'''
order_num_year=get_mysql_data(DB, SQL)

my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
x=order_num_year["下单年"]
y=order_num_year["订单量"]
plt.figure(figsize=(20,8),dpi=80)
rects = plt.bar(x,y,width=0.3,color="b",label="2019")
# 标注
for rect in rects:
    height=rect.get_height()
    plt.text(rect.get_x()+rect.get_width()/2,height+0.3,str(height),ha="center")
plt.title("年度订单量",FontProperties=my_font)
plt.xlabel("年份",FontProperties=my_font)
plt.ylabel("订单量",FontProperties=my_font)
plt.legend(prop=my_font,loc="best")
plt.show()

在这里插入图片描述

# 订单分析
# 1.根据下单的时间来统计订单量
# (2)年月维度
SQL='''select date_format(付款时间,"%Y/%m")下单年月 ,count(订单ID) as 订单量 from paper_data group by date_format(付款时间,"%Y/%m")'''
order_num_month=get_mysql_data(DB, SQL)

my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
x=order_num_month["下单年月"]
y=order_num_month["订单量"]
plt.figure(figsize=(20,8),dpi=80)
rects = plt.bar(x,y,width=0.3,color=["r","g","b"])
# 标注
for rect in rects:
    height=rect.get_height()
    plt.text(rect.get_x()+rect.get_width()/2,height+0.3,str(height),ha="center")
plt.show()

在这里插入图片描述

# (3)年月、城市维度(折线图)
SQL='''select date_format(下单时间,"%Y/%m")下单年月 ,城市,count(订单ID) as 订单量 
from paper_data
group by date_format(下单时间,"%Y/%m"),城市
order by date_format(下单时间,"%Y/%m");
'''
order_num_month1=get_mysql_data(DB, SQL)



#把每个城市的数据取出来
order_month_city=order_num_month1.groupby(["城市"])
order_shanghai=order_month_city.get_group("上海").reset_index(drop="true")
order_beijing=order_month_city.get_group("北京").reset_index(drop="true")
order_hangzhou=order_month_city.get_group("杭州").reset_index(drop="true")
order_xian=order_month_city.get_group("西安").reset_index(drop="true")


# matplotlib画图部分
my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
plt.figure(figsize=(25,10),dpi=80)

plt.plot(order_shanghai["下单年月"],order_shanghai["订单量"],marker='o',color="r",label="上海订单量")
plt.plot(order_beijing["下单年月"],order_beijing["订单量"],marker='v',color="g",label="北京订单量")
plt.plot(order_hangzhou["下单年月"],order_hangzhou["订单量"],marker='^',color="b",label="杭州订单量")
plt.plot(order_xian["下单年月"],order_xian["订单量"],marker='.',color="y",label="西安订单量")
plt.xlabel("时间",FontProperties=my_font)
plt.ylabel("订单量",FontProperties=my_font)
plt.title("各城市每月订单量",FontProperties=my_font)
plt.legend(prop=my_font,loc="best")



#  df.idxmax()默认值是0,求列最大值的行索引,1就是反过来
def city_max(df):
    return df.idxmax()



# 最大值标记
plt.text(order_shanghai["下单年月"].iloc[city_max(order_shanghai["订单量"])],order_shanghai.max()["订单量"]+5,"上海最大订单:{}".format(order_shanghai.max()["订单量"]),fontsize=12,ha="center",color="r",FontProperties=my_font)
plt.text(order_beijing["下单年月"].iloc[city_max(order_beijing["订单量"])],order_beijing.max()["订单量"]+5,"北京最大订单:{}".format(order_beijing.max()["订单量"]),fontsize=12,ha="center",color="g",FontProperties=my_font)
plt.text(order_hangzhou["下单年月"].iloc[city_max(order_hangzhou["订单量"])],order_hangzhou.max()["订单量"]+5,"杭州最大订单:{}".format(order_hangzhou.max()["订单量"]),fontsize=12,ha="center",color="b",FontProperties=my_font)
plt.text(order_xian["下单年月"].iloc[city_max(order_xian["订单量"])],order_xian.max()["订单量"]+5,"西安最大订单:{}".format(order_xian.max()["订单量"]),fontsize=12,ha="center",color="y",FontProperties=my_font)
plt.show()


# -----------------------------

# 每年、每月的一样的做法,改下日期字段就好


在这里插入图片描述

# (3)年城市维度(柱状图)
SQL='''select date_format(付款时间,"%Y")下单年,城市,count(订单ID) as 订单量 
from paper_data
group by date_format(付款时间,"%Y"),城市
order by date_format(付款时间,"%Y");
'''
order_num_year=get_mysql_data(DB, SQL)
# order_num_month.describe()

# 按城市拆分数据
order_year_city=order_num_year.groupby(["城市"])
order_shanghai=order_year_city.get_group("上海").reset_index(drop="true")
order_beijing=order_year_city.get_group("北京").reset_index(drop="true")
order_hangzhou=order_year_city.get_group("杭州").reset_index(drop="true")
order_xian=order_year_city.get_group("西安").reset_index(drop="true")

# -----matplotlib画图部分------
my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
plt.figure(figsize=(25,10),dpi=80)
# 设置柱子位置,通过偏移量a来实现,
# 由于时间是datetime类型或者object(str)类型的不能直接加偏移量所以这里用长度长度实现柱子的位置,然后把刻度值重新设置 
a=0.2 
x_01=list(range(len(order_shanghai["下单年"])))
x_02=[i+a for i in x_01]
x_03=[i-2*a for i in x_01]
x_04=[i-a for i in x_01]


# 画图
rects1=plt.bar(x_01,order_shanghai["订单量"],a,color="r",align='edge',label="上海订单量")
rects2=plt.bar(x_02,order_beijing["订单量"],a,color="g",align='edge',label="北京订单量")
rects3=plt.bar(x_03,order_hangzhou["订单量"],a,color="b",align='edge',label="杭州订单量")
rects4=plt.bar(x_04,order_xian["订单量"],a,color="y",align='edge',label="西安订单量")

# x轴刻度
plt.xticks(x_01,labels=order_shanghai["下单年"])

# 轴名称
plt.xlabel("年份",FontProperties=my_font)
plt.ylabel("订单量",FontProperties=my_font)
# 标题
plt.title("各城市每月订单量",FontProperties=my_font)
# 图例
plt.legend(prop=my_font,loc="best")
# 标记
def mark_bar(rects):
    for rect in rects:
        height=rect.get_height()
        plt.text(rect.get_x()+rect.get_width()/2,height+0.3,str(height),fontsize=15,ha="center")
mark_bar(rects1)
mark_bar(rects2)
mark_bar(rects3)
mark_bar(rects4)
plt.show()

在这里插入图片描述

# 3、小时订单量分布情况
SQL= '''SELECT date_format(下单时间,"%H")  下单时, count(`订单ID`) 订单量
from paper_data
where  date_format(下单时间,"%Y-%m")="2020-01"
GROUP BY date_format(下单时间,"%H") 
ORDER BY date_format(下单时间,"%H") 
;'''

h_dingdan=get_mysql_data(DB, SQL)


my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
plt.figure(figsize=(25,10),dpi=80)
x=h_dingdan["下单时"]
y=h_dingdan["订单量"]
plt.plot(x,y,color="r",label="每小时订单量")
# 设置标记
for a,b  in zip(x,y):
    #a x坐标, b坐标, (a,b) 一个点 
    # ha 水平方向  va 垂直方向  fontsize 大小 
    plt.text(a,b,b ,ha='center',va='bottom', fontsize=12)
#设置刻度
xtick_labels = ['{}:00'.format(i) for i in x]
plt.xticks(x,xtick_labels)
#设置坐标名称
plt.xlabel("时间",FontProperties=my_font)
plt.ylabel("订单量",FontProperties=my_font)
#设置表名
plt.title("1月每小时订单量",FontProperties=my_font)
#设置图例
plt.legend(prop=my_font,loc="best")
plt.show()

RFM模型

#RMF分层
SQL='''SELECT 用户ID,COUNT(订单ID) AS F,round(sum(实收金额),2) AS M,date_format(max(付款时间),"%Y/%m/%d") as R 
            FROM paper_data GROUP BY 用户ID ORDER BY 用户ID
            ;'''
User_layer=get_mysql_data(DB, SQL)
# RMF评分公式

# User_layer["R1"]=((pd.to_datetime("2020/03-01")-pd.to_datetime(User_layer["R"])) 后面的处理方式是为了把timedelta类型转换成Int类型
User_layer["R1"]=((pd.to_datetime("2020/03-01")-pd.to_datetime(User_layer["R"]))/pd.Timedelta(1,"D")).fillna(0).astype(int)
User_layer["RR"]=(pd.to_datetime(User_layer["R"])-pd.to_datetime(min(User_layer["R"])))/((pd.to_datetime(max(User_layer["R"]))-pd.to_datetime(min(User_layer["R"]))))
User_layer["FF"]=(User_layer["F"]-min(User_layer["F"]))/(max(User_layer["F"])-min(User_layer["F"]))
User_layer["MM"]=(User_layer["M"]-min(User_layer["M"]))/(max(User_layer["M"])-min(User_layer["M"]))
User_layer["SCORE"]=100*(0.25*User_layer["FF"]+0.6*User_layer["MM"]+0.15*User_layer["RR"])
User_layer
用户IDFMRR1RRFFMMSCORE
01323517.02021/01/25-3300.9805190.6739130.66395371.392801
110292600.02021/01/23-3280.9779220.6086960.49034559.306896
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847
410009171.02019/04/093270.1272730.0000000.0115492.602010
..............................
59249977119.02020/10/27-2400.8636360.0000000.00170413.056779
5925998343634.02021/02/06-3420.9961040.7173910.68610474.042566
5926999333356.02021/01/27-3320.9831170.6956520.63347270.146388
592799931125.02020/11/04-2480.8740260.0000000.02177214.416713
592899941135.02020/08/22-1740.7779220.0000000.02366513.088748

5929 rows × 9 columns

# 根据权重SCORE四分位分组的依据
User_describe=User_layer.describe()
User_describe
FMR1RRFFMMSCORE
count5929.0000005929.0000005929.0000005929.0000005929.0000005929.0000005929.000000
mean10.9790861148.957160-117.5140830.7045640.2169370.21563028.929667
std13.6568801440.872815228.5950260.2968770.2968890.27278926.743116
min1.00000010.000000-345.0000000.0000000.0000000.0000000.185039
25%1.000000112.000000-318.0000000.4896100.0000000.0193118.927758
50%2.000000198.000000-208.0000000.8220780.0217390.03559314.496745
75%26.0000002679.00000048.0000000.9649350.5434780.50530158.371452
max47.0000005292.000000425.0000001.0000001.0000001.00000097.960474
# df遍历后的结果最好用list/dict存放起来。df本身一个一个添加数据遍历效率不高。而且难取数,最好用List/dict整体取赋值
list_level=[]
for index ,rows in User_layer.iterrows():
#     print("索引是:",index,"数据是:",rows)
    if  rows["SCORE"]<=User_describe.loc["25%"]["SCORE"]:
        rows["level"]="四等"
        
    elif  rows["SCORE"]<=User_describe.loc["50%"]["SCORE"]:
        rows["level"]="三等"
        
    elif  rows["SCORE"]<=User_describe.loc["75%"]["SCORE"]:
        rows["level"]="二等"
        
    else:
        rows["level"]="一等"  
    list_level.append(rows["level"])
    
User_layer['level']=list_level
User_layer



用户IDFMRR1RRFFMMSCORElevel
01323517.02021/01/25-3300.9805190.6739130.66395371.392801一等
110292600.02021/01/23-3280.9779220.6086960.49034559.306896一等
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376一等
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847二等
410009171.02019/04/093270.1272730.0000000.0115492.602010四等
.................................
59249977119.02020/10/27-2400.8636360.0000000.00170413.056779三等
5925998343634.02021/02/06-3420.9961040.7173910.68610474.042566一等
5926999333356.02021/01/27-3320.9831170.6956520.63347270.146388一等
592799931125.02020/11/04-2480.8740260.0000000.02177214.416713三等
592899941135.02020/08/22-1740.7779220.0000000.02366513.088748三等

5929 rows × 10 columns

用户标签


# 用户打标签
SQL='''select 用户ID,count(*) as order_num,count(date_format(付款时间,"%M")) 月数,
                  sum(case when 取车类型='自取' then 1 else 0 end) as 自取次数,
                  sum(case when 优惠金额>1 then 1 else 0 end) as 优惠券使用次数,
                  sum(case when 非网点还车费>0 then 1 else 0 end) as  还车次数
                  from paper_data 
                  group by 用户ID
                  ;
'''

bq=get_mysql_data(DB, SQL)

data_total=pd.merge(User_layer,bq,on="用户ID")

data_total
# print(data_total["FF"].dtypes,data_total["自取次数"].dtypes)
for index,row in data_total.iterrows():
# 取车
    if row["FF"]==0:
        row["FF"]= row["FF"]+0.0001
# Int/浮点数会报错,注意转换类型
        if float(row["自取次数"])/row["FF"]>0.5:
            data_total["取车偏好"]="时租居多"
        else:
            data_total["取车偏好"]="日租居多"
    #       网点还车  
        if float(row["还车次数"])/row["FF"]>0.5:
            data_total["还车偏好"]="非网点还车"
        else:
            data_total["还车偏好"]="网点还车"
    #     月均订单
        if row["FF"]/row["月数"]>6:
            data_total["月均单量"]="月均单量大于6"
        else:
            data_total["月均单量"]="月均单量小于6"

data_total

   

用户IDFMRR1RRFFMMSCORElevelorder_num月数自取次数优惠券使用次数还车次数取车偏好还车偏好月均单量
01323517.02021/01/25-3300.9805190.6739130.66395371.392801一等3232112313时租居多网点还车月均单量小于6
110292600.02021/01/23-3280.9779220.6086960.49034559.306896一等2929162412时租居多网点还车月均单量小于6
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376一等313112217时租居多网点还车月均单量小于6
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847二等2424131511时租居多网点还车月均单量小于6
410009171.02019/04/093270.1272730.0000000.0115492.602010四等11101时租居多网点还车月均单量小于6
.........................................................
59249977119.02020/10/27-2400.8636360.0000000.00170413.056779三等11010时租居多网点还车月均单量小于6
5925998343634.02021/02/06-3420.9961040.7173910.68610474.042566一等3434182413时租居多网点还车月均单量小于6
5926999333356.02021/01/27-3320.9831170.6956520.63347270.146388一等3333121910时租居多网点还车月均单量小于6
592799931125.02020/11/04-2480.8740260.0000000.02177214.416713三等11011时租居多网点还车月均单量小于6
592899941135.02020/08/22-1740.7779220.0000000.02366513.088748三等11100时租居多网点还车月均单量小于6

5929 rows × 18 columns

用户生命周期

def user_life(data,a,b,c):
    list_1=[]
    for index,row in data.iterrows():
#         print(row[a])
#         print(index)
        if row[a]==1:
            row[c]="引入期"
        elif row[a]>1 and row[a]<3 and row[b]<=200:
            row[c]="成长期"
        elif row[a]>=3 and row[b]<=300:
            row[c]="休眠期"
        elif row[b]>300:
            row[c]="流失期"
        else:
            row[c]="其他"
#         print(row[c])
        list_1.append(row)
    data_1=pd.DataFrame(list_1)
#     print(data_1)
    return data_1


Userlife=pd.DataFrame()
Userlife=user_life(data_total,"F","R1","生命周期")
Userlife.head(10)


用户IDFMRR1RRFFMMSCORElevelorder_num月数自取次数优惠券使用次数还车次数取车偏好还车偏好月均单量生命周期
01323517.02021/01/25-3300.9805190.6739130.66395371.392801一等3232112313时租居多网点还车月均单量小于6休眠期
110292600.02021/01/23-3280.9779220.6086960.49034559.306896一等2929162412时租居多网点还车月均单量小于6休眠期
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376一等313112217时租居多网点还车月均单量小于6休眠期
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847二等2424131511时租居多网点还车月均单量小于6休眠期
410009171.02019/04/093270.1272730.0000000.0115492.602010四等11101时租居多网点还车月均单量小于6引入期
51001283074.02021/01/24-3290.9792210.5869570.58008364.167223一等2828121812时租居多网点还车月均单量小于6休眠期
610014149.02020/08/25-1770.7818180.0000000.00738412.170287三等11110时租居多网点还车月均单量小于6引入期
710016167.02020/07/19-1400.7337660.0000000.01079111.653976三等11010时租居多网点还车月均单量小于6引入期
81002293316.02021/01/03-3080.9519480.6086960.62589967.050569一等2929121915时租居多网点还车月均单量小于6休眠期
910023181.02020/09/09-1920.8012990.0000000.01344212.825993三等11110时租居多网点还车月均单量小于6引入期
"休眠期" in Userlife["生命周期"].values
True
zq=Userlife.groupby("生命周期")
yr=zq.get_group("引入期")["生命周期"].count()
cz=zq.get_group("成长期")["生命周期"].count()
xm=zq.get_group("休眠期")["生命周期"].count()
ls=zq.get_group("流失期")["生命周期"].count()

x=["引入期","成长期","休眠期","流失期"]
y=[yr,cz,xm,ls]
for i in range(len(x)):
    print(x[i],y[i])

my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 18)
plt.figure(figsize=(25,10),dpi=80)
plt.rcParams['font.sans-serif']=['STSONG'] #中文标签设置
# plt.rcParams['axes.unicode_minus']=False  #用来正常显示坐标负号
rects=plt.bar(x,y,0.3,color=["r","g","b","y"])
plt.xticks(x,x,fontproperties=my_font)

for rect in rects:
    height=rect.get_height()
    plt.text(rect.get_x()+rect.get_width()/2,height+1,str(height),fontsize=18,ha="center")


plt.show()
引入期 2528
成长期 895
休眠期 2414
流失期 32

在这里插入图片描述

用户评分统计

# 评分 
#0分代表未参与评分,可以统计用户的参与度

SQL='''select date_format(付款时间,"%Y/%m") 付款年月,评分,count(distinct 用户ID) as user_num,count(*) as order_num 
                   from paper_data group by date_format(付款时间,"%Y/%m"),评分;'''


pinfen_num=get_mysql_data(DB,SQL)
pinfen_num["评分"]=pinfen_num["评分"].astype(int)
pinfen_num["参与度"]=pinfen_num["user_num"]/pinfen_num["order_num"]
pinfen_num.dtypes

付款年月          object
评分             int32
user_num       int64
order_num      int64
参与度          float64
dtype: object
# 用户评分参与度=有评分的用户/下单用户量
import numpy as np
# 按评分分组
pf_group=pinfen_num.groupby(["评分"])

# id_name
# 取出分组的值
# 方法一
list_pf=[]
for key,value in pf_group:
    list_pf.append(value.reset_index(drop="true"))
# 方法二格式化成列表或者字典,详见后面一个代码块

x=list_pf[0]["付款年月"]
y=[]
for i in range(len(list_pf)):
    y.append(list_pf[i]["参与度"])
# print("x:",x.values)
# print("y:",y[1])


my_font = font_manager.FontProperties(fname="C:/Users/wty_pc/Anaconda3/Lib/site-packages/matplotlib/mpl-data/fonts/ttf/STSONG.TTF",size = 15)
fig=plt.figure(figsize=(20,40),dpi=80)

colors=["r","g","b"]
for i in range(1,12):
    ax=fig.add_subplot(11,1,i)
    ax.plot(x,y[i-1],color=colors[np.random.randint(3)],label="{}分".format(i-1))
    x_label="评分为{}的趋势".format(i-1)
    ax.set_xlabel(x_label,fontproperties=my_font)
    ax.set_ylabel("参与度",fontproperties=my_font)
    ax.legend()
 #调整子图间距    
plt.tight_layout()
plt.show()

在这里插入图片描述

groupby取数的技巧

# groupby 格式化成列表 为转换成字典做铺垫

a=list(pinfen_num.groupby(["评分"]))
# 列表的每个元素是个元组,每个元组的值为分组键和值(datafarme)
a[0][1].reset_index(drop = True)
# print(type(a[1]))
付款年月评分user_numorder_num参与度
02019/0102302410.954357
12019/0202062130.967136
22019/0302392580.926357
32019/0402092160.967593
42019/0502132230.955157
52019/0602172300.943478
62019/0702292390.958159
72019/0802162220.972973
82019/0902142290.934498
92019/1002322440.950820
102019/1102252350.957447
112019/1202392470.967611
122020/0102712870.944251
132020/0201872000.935000
142020/0302212300.960870
152020/0402162300.939130
162020/0502192270.964758
172020/0602072180.949541
182020/0702242320.965517
192020/0802392500.956000
202020/0902242330.961373
212020/1002402500.960000
222020/1102202270.969163
232020/1202182300.947826
242021/0102382510.948207
252021/02056570.982456
# groupby 格式化成字典
a=dict(list(pinfen_num.groupby(["评分"])))
# type(a[0])
a[0].reset_index(drop=True)
付款年月评分user_numorder_num参与度
02019/0102302410.954357
12019/0202062130.967136
22019/0302392580.926357
32019/0402092160.967593
42019/0502132230.955157
52019/0602172300.943478
62019/0702292390.958159
72019/0802162220.972973
82019/0902142290.934498
92019/1002322440.950820
102019/1102252350.957447
112019/1202392470.967611
122020/0102712870.944251
132020/0201872000.935000
142020/0302212300.960870
152020/0402162300.939130
162020/0502192270.964758
172020/0602072180.949541
182020/0702242320.965517
192020/0802392500.956000
202020/0902242330.961373
212020/1002402500.960000
222020/1102202270.969163
232020/1202182300.947826
242021/0102382510.948207
252021/02056570.982456
pinfen_num.groupby(["评分"]).size()
评分
0     26
1     26
2     26
3     26
4     26
5     26
6     26
7     26
8     26
9     26
10    26
dtype: int64
pinfen_num.groupby(["评分"]).count()
付款年月user_numorder_num参与度
评分
026262626
126262626
226262626
326262626
426262626
526262626
626262626
726262626
826262626
926262626
1026262626
# 每个评分最小参与度

pinfen_num["参与度"].groupby(pinfen_num["评分"]).min()
pinfen_num.groupby(pinfen_num["评分"]).min()["参与度"]
pinfen_num.groupby(pinfen_num["评分"])["参与度"].min()
评分
0     0.926357
1     0.923695
2     0.917749
3     0.925620
4     0.912000
5     0.934694
6     0.916318
7     0.929961
8     0.929577
9     0.940092
10    0.928571
Name: 参与度, dtype: float64
# 按评分选取多列
# pinfen_num.groupby(pinfen_num["评分"])[["参与度","order_num"]].min()
pinfen_num[["参与度","order_num"]].groupby(pinfen_num["评分"]).min()
参与度order_num
评分
00.92635757
10.92369547
20.91774967
30.92562061
40.91200062
50.93469468
60.91631852
70.92996162
80.92957765
90.94009248
100.92857166
# 多维分组用unstack可以解成二维表(不堆叠)
a=pinfen_num["参与度"].groupby([pinfen_num["付款年月"],pinfen_num["评分"]]).min().unstack()
a
# a.shape
评分012345678910
付款年月
2019/010.9543570.9500000.9710740.9668050.9678710.9763780.9375000.9615380.9357430.9506170.957806
2019/020.9671360.9389670.9724770.9520000.9463410.9439250.9313730.9444440.9345790.9545450.944954
2019/030.9263570.9451480.9280000.9587160.9397590.9409280.9668050.9306120.9563490.9653680.937799
2019/040.9675930.9668250.9704430.9469030.9255810.9786320.9907410.9495800.9776790.9699570.944700
2019/050.9551570.9539750.9469030.9471540.9745760.9521740.9576270.9491530.9626170.9442060.970588
2019/060.9434780.9236950.9485980.9369370.9488190.9581590.9570310.9652170.9533900.9513270.945378
2019/070.9581590.9504130.9688890.9422220.9500000.9541670.9578060.9358490.9483390.9579830.977876
2019/080.9729730.9375000.9451480.9568970.9736840.9680370.9215690.9707110.9617020.9489360.958506
2019/090.9344980.9778760.9488370.9592760.9728510.9739130.9707320.9508930.9547510.9465650.970588
2019/100.9508200.9699570.9824560.9519650.9274190.9448820.9541280.9605260.9464290.9502070.967611
2019/110.9574470.9495800.9827590.9608700.9423080.9655170.9655170.9601590.9346940.9644670.965217
2019/120.9676110.9406390.9489360.9372380.9670780.9405940.9586780.9299610.9466670.9617020.930736
2020/010.9442510.9456070.9618640.9778760.9434780.9769590.9419090.9655170.9510200.9482070.928571
2020/020.9350000.9678900.9631340.9631150.9733330.9444440.9729730.9347830.9694320.9484980.967136
2020/030.9608700.9859150.9414230.9434780.9414410.9775780.9739130.9636360.9468600.9648440.955157
2020/040.9391300.9272030.9382240.9314520.9521740.9459460.9617020.9591840.9455250.9400920.942387
2020/050.9647580.9354840.9177490.9446640.9780700.9592760.9333330.9638550.9547170.9635630.974684
2020/060.9495410.9590910.9527900.9256200.9515420.9638550.9530520.9641260.9581590.9626170.941909
2020/070.9655170.9662450.9492190.9517540.9120000.9457010.9245280.9556450.9674420.9427480.942652
2020/080.9560000.9397590.9260870.9906100.9310340.9442060.9163180.9547330.9763780.9613730.966038
2020/090.9613730.9600000.9589040.9563320.9449150.9809520.9353450.9372380.9600000.9553570.945607
2020/100.9600000.9353450.9432310.9523810.9267400.9417040.9649120.9360000.9314520.9547510.956140
2020/110.9691630.9688720.9523810.9466670.9189190.9601590.9422220.9647580.9295770.9551570.947581
2020/120.9478260.9515420.9579830.9466670.9675930.9517540.9481130.9581750.9517540.9487180.959016
2021/010.9482070.9363640.9612070.9317270.9501920.9346940.9670780.9566930.9610890.9574470.944444
2021/020.9824560.9787230.9701490.9836071.0000000.9705881.0000001.0000000.9692311.0000000.984848
#多维分组堆叠成多索引 
b=pinfen_num["参与度"].groupby([pinfen_num["付款年月"],pinfen_num["评分"]]).min()
b
# b.shape
付款年月     评分
2019/01  0     0.954357
         1     0.950000
         2     0.971074
         3     0.966805
         4     0.967871
                 ...   
2021/02  6     1.000000
         7     1.000000
         8     0.969231
         9     1.000000
         10    0.984848
Name: 参与度, Length: 286, dtype: float64
pinfen_num["参与度"].groupby(pinfen_num["评分"]).describe().unstack()
       评分
count  0     26.000000
       1     26.000000
       2     26.000000
       3     26.000000
       4     26.000000
               ...    
max    6      1.000000
       7      1.000000
       8      0.977679
       9      1.000000
       10     0.984848
Length: 88, dtype: float64
pinfen_num["参与度"].groupby(pinfen_num["评分"]).describe()
countmeanstdmin25%50%75%max
评分
026.00.9553720.0131880.9263570.9479210.9567230.9653270.982456
126.00.9524080.0166140.9236950.9391650.9502070.9666800.985915
226.00.9541870.0164780.9177490.9455860.9525850.9674500.982759
326.00.9524200.0155170.9256200.9437750.9518600.9591360.990610
426.00.9510660.0211230.9120000.9401800.9494090.9678021.000000
526.00.9575050.0143770.9346940.9445540.9561630.9699500.980952
626.00.9540350.0204230.9163180.9386020.9573290.9664831.000000
726.00.9547300.0152480.9299610.9456210.9574340.9638011.000000
826.00.9532910.0132830.9295770.9464880.9540530.9615490.977679
926.00.9565100.0118250.9400920.9487730.9549540.9623881.000000
1026.00.9549200.0148030.9285710.9445080.9556490.9668620.984848

标签:汽车行业,num,python,订单,plt,Mysql,font,data,order
来源: https://blog.csdn.net/weixin_51749229/article/details/114380177