# Kaggle wine recommendation, winemag-data-130k-v2.csv
# Author: from the internet
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.stats import kurtosis,skew
from scipy import stats
# Function definitions
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary.

    The returned DataFrame has these columns, in order:

    * ``Name``         - column label
    * ``dtypes``       - dtype of the column
    * ``Missing``      - number of null entries
    * ``Uniques``      - number of distinct values (``DataFrame.nunique``)
    * ``First Value`` / ``Second Value`` / ``Third Value`` - the values of
      the first three rows *by position*; NaN when the frame is shorter
    * ``Entropy``      - Shannon entropy (base 2) of the column's value
      frequencies, rounded to 2 decimals

    Args:
        df: the pandas DataFrame to summarise.

    Returns:
        A new DataFrame with one row per column of ``df``.
    """
    print(f"Dataset Shape: {df.shape}")

    # Build the table straight from the column labels instead of going
    # through reset_index(), which depends on the columns index being unnamed.
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
    })
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Sample rows are taken positionally (iloc) so the function also works
    # when the index is not 0..n-1 (e.g. after filtering or set_index), and
    # rows that do not exist are reported as NaN instead of raising.
    row_count = len(df)
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        if position < row_count:
            summary[label] = df.iloc[position].values
        else:
            summary[label] = float('nan')

    # One entropy value per column, in column order.
    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in df.columns
    ]
    return summary
def CalcOutliers(df_num):
    """Print outlier statistics for a numeric sample using a 3-sigma rule.

    Original routine credited to Leonardo Ferreira (20/10/2018).

    A value is a lower outlier when it is below ``mean - 3 * std`` and an
    upper outlier when it is above ``mean + 3 * std``. Values on or between
    the two cut lines are non-outliers. Values that compare false against
    both bounds (e.g. NaN) are counted in neither group.

    The function prints the number of lower, upper and total outliers, the
    number of non-outlier observations, and the percentage of outliers among
    all counted observations.

    Args:
        df_num: one-dimensional numeric data (list, NumPy array or pandas
            Series).

    Returns:
        None. Results are only printed.
    """
    # Mean and std are computed on the input object as given (not on a
    # converted array) so the input type's own NaN handling is left untouched.
    data_mean, data_std = np.mean(df_num), np.std(df_num)

    # Width of the accepted band around the mean; change the factor to make
    # the rule stricter or looser.
    cut = data_std * 3
    lower, upper = data_mean - cut, data_mean + cut

    # Vectorised counts instead of several Python-level passes over the data.
    values = np.asarray(df_num)
    lower_count = int(np.count_nonzero(values < lower))
    upper_count = int(np.count_nonzero(values > upper))
    outlier_count = lower_count + upper_count
    # Inclusive bounds: a value sitting exactly on a cut line is a regular
    # observation. With strict bounds a constant sample (std == 0) would have
    # no non-outliers at all.
    inlier_count = int(np.count_nonzero((values >= lower) & (values <= upper)))

    print('Identified lowest outliers: %d' % lower_count)
    print('Identified upper outliers: %d' % upper_count)
    print('Identified outliers: %d' % outlier_count)
    print('Non-outlier observations: %d' % inlier_count)

    # Share of outliers among every counted observation; guarded so an empty
    # or all-NaN input reports 0.0 instead of dividing by zero.
    counted = outlier_count + inlier_count
    percent = round(100 * outlier_count / counted, 4) if counted else 0.0
    print("Total percentual of Outliers: ", percent)
# Tags: lower, csv, df, Kaggle, summary, v2, values, total, outliers
# Source: https://blog.csdn.net/sinat_37574187/article/details/120219162