【数据分析&数据挖掘】非数值型数据的哑变量转化、连续型数据离散化——等宽分组&等频分组
作者:互联网
"""Dummy-variable encoding and discretization of the meal order details.

Walks through two preprocessing steps with pandas:

* turning the non-numeric ``dishes_name`` column into a dummy-variable
  (one-hot) matrix, and
* discretizing the continuous ``amounts`` column into equal-width groups
  (an equal-frequency alternative is sketched in the comments of ``main``).
"""

import numpy as np
import pandas as pd


def equal_width_edges(values, group_num):
    """Return the edges of ``group_num`` equal-width bins spanning *values*.

    The edges run from the smallest to the largest value, so every value falls
    inside a bin once ``pd.cut`` is called with ``include_lowest=True``.
    ``np.linspace`` is used instead of an integer step with ``np.arange``:
    rounding the step up to an integer can produce fewer bins than requested
    (a range of 0..11 with 5 groups gives only 4 bins) and breaks down for
    ranges smaller than 1.

    Args:
        values: Numeric array-like (list, ndarray or Series); NaN entries are
            ignored when locating the minimum and maximum.
        group_num: Number of bins to create; must be at least 1.

    Returns:
        A NumPy array of ``group_num + 1`` increasing edges whose first and
        last entries are the minimum and maximum of *values*.

    Raises:
        ValueError: If ``group_num`` is smaller than 1, or if *values* does
            not contain at least two distinct numbers (no range to split).
    """
    if group_num < 1:
        raise ValueError("group_num must be a positive integer")

    series = pd.Series(values, dtype="float64")
    lowest, highest = series.min(), series.max()
    # ``not lowest < highest`` is also true when both are NaN (empty or
    # all-NaN input), so one check covers every degenerate case.
    if not lowest < highest:
        raise ValueError("values must contain at least two distinct numbers")

    return np.linspace(lowest, highest, group_num + 1)


def main():
    """Run the dummy-variable and equal-width binning walkthrough."""
    # Load the data.
    detail = pd.read_excel("../day05/meal_order_detail.xlsx")
    print("detail 的列索引:", detail.columns)

    # Convert dishes_name to numeric data via a dummy-variable matrix.
    res = pd.get_dummies(
        data=detail["dishes_name"],
        prefix_sep="_",
        prefix="菜品",
    )
    print("转化之后的结果res:\n", res)
    # To persist the matrix: res.to_csv("./hh.csv")

    # Think of heights between 150 and 190: every student has one specific,
    # continuous value. Turning such continuous data into categories
    # (groups) is called discretization.
    amounts = detail["amounts"]
    print("菜品单价的最大值与最小值:", amounts.max(), amounts.min())

    # Equal-width grouping with explicit, self-computed edges.
    # Shortcut: pd.cut(amounts, bins=group_num) lets pandas pick the edges.
    group_num = 5
    bins = equal_width_edges(amounts, group_num)
    print(bins)

    # include_lowest=True makes the first interval closed on the left so the
    # minimum value is not left out of every bin.
    # The column is replaced via detail["amounts"] = ... rather than
    # detail.loc[:, "amounts"] = ...: a full-slice .loc assignment tries to
    # write into the existing numeric column, which does not suit interval
    # categories.
    detail["amounts"] = pd.cut(amounts, bins=bins, include_lowest=True)

    # Equal-frequency grouping (alternative to the equal-width step above):
    # use quantiles as the edges so that each group holds roughly the same
    # number of rows.
    #     bins = amounts.quantile(q=np.linspace(0, 1, group_num + 1))
    #     print(bins)
    #     detail["amounts"] = pd.cut(amounts, bins=bins, include_lowest=True)
    #     print(detail["amounts"])
    #     # Count the rows in each group.
    #     res_counts = detail["amounts"].value_counts()
    #     print("res_counts:\n", res_counts)


if __name__ == "__main__":
    main()
标签:loc,等频,print,detail,amounts,分组,数据挖掘,bins 来源: https://www.cnblogs.com/Tree0108/p/12116149.html