Keras的泰坦尼克号的生存率的数据分析
作者:互联网
# coding: utf-8

# # 1. Import Library

# In[1]:


import numpy
import pandas as pd
from sklearn import preprocessing
numpy.random.seed(10)  # fixed seed so the random train/test mask below is reproducible


# # 2. Data preparation

# In[2]:


all_df = pd.read_excel("data/titanic3.xls")


# In[3]:


# Keep only the columns used below. 'survived' must stay first:
# PreprocessData() takes column 0 (after dropping 'name') as the label.
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp',
        'parch', 'fare', 'embarked']
all_df = all_df[cols]


# In[4]:


# Random ~80/20 split driven by a boolean mask.
msk = numpy.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]


# In[5]:


print('total:', len(all_df),
      'train:', len(train_df),
      'test:', len(test_df))


# In[6]:


def PreprocessData(raw_df):
    """Turn a raw passenger frame into scaled features and a label vector.

    Steps performed on a copy of ``raw_df``:
      * drop the 'name' column;
      * fill missing 'age' and 'fare' values with the column mean of the
        frame that was passed in;
      * map 'sex' to 0 (female) / 1 (male);
      * one-hot encode 'embarked';
      * take column 0 as the label and the remaining columns as features;
      * min-max scale the features to [0, 1].

    The scaler is fitted on the frame passed in, so separate calls (e.g. for
    train and test) are scaled independently of each other.

    Args:
        raw_df: DataFrame whose first column is the label ('survived') and
            which contains 'name', 'sex', 'age', 'fare' and 'embarked'.

    Returns:
        Tuple ``(scaledFeatures, Label)`` of float NumPy arrays.
    """
    df = raw_df.drop(['name'], axis=1)
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
    x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"])

    # Cast explicitly: on pandas >= 2.0 get_dummies yields bool columns, and
    # mixing them with numeric columns makes `.values` an object array.
    ndarray = x_OneHot_df.values.astype(float)
    Features = ndarray[:, 1:]
    Label = ndarray[:, 0]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)

    return scaledFeatures, Label


# In[7]:


train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)


# # 3. Create Model

# In[8]:


from keras.models import Sequential
from keras.layers import Dense, Dropout


# In[9]:


model = Sequential()


# In[10]:


# input_dim=9 presumes 6 numeric columns plus 3 one-hot 'embarked' columns.
model.add(Dense(units=40, input_dim=9,
                kernel_initializer='uniform',
                activation='relu'))


# In[11]:


model.add(Dense(units=30,
                kernel_initializer='uniform',
                activation='relu'))


# In[12]:


model.add(Dense(units=1,
                kernel_initializer='uniform',
                activation='sigmoid'))


# # 4. Train model

# In[13]:


model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])


# In[14]:


train_history = model.fit(x=train_Features,
                          y=train_Label,
                          validation_split=0.1,
                          epochs=30,
                          batch_size=30, verbose=2)


# # 6. Print History

# In[15]:


import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    """Plot one training metric against its validation counterpart per epoch.

    Args:
        train_history: object returned by ``model.fit`` (its ``history``
            dict is indexed with the two keys below).
        train: key of the training-metric series in ``train_history.history``.
        validation: key of the validation-metric series.
    """
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


# In[16]:


# Older Keras releases log the accuracy metric as 'acc'; newer ones use the
# name given to compile() ('accuracy'). Pick whichever key is present.
acc_key = 'acc' if 'acc' in train_history.history else 'accuracy'
show_train_history(train_history, acc_key, 'val_' + acc_key)


# In[17]:


show_train_history(train_history, 'loss', 'val_loss')


# # Evaluate model accuracy

# In[18]:


scores = model.evaluate(x=test_Features,
                        y=test_Label)


# In[19]:


scores[1]


# # Predict

# # Add Jack & Rose data

# In[20]:


Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.0000, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.0000, 'S'])


# In[21]:


JR_df = pd.DataFrame([list(Jack), list(Rose)],
                     columns=['survived', 'name', 'pclass', 'sex',
                              'age', 'sibsp', 'parch', 'fare', 'embarked'])


# In[22]:


all_df = pd.concat([all_df, JR_df])


# In[23]:


all_df[-2:]


# # Run prediction

# In[24]:


all_Features, Label = PreprocessData(all_df)


# In[25]:


all_probability = model.predict(all_Features)


# In[26]:


all_probability[:10]


# In[27]:


# Alias of all_df (not a copy): the insert below adds the column to all_df
# as well. Named result_df so the pandas alias `pd` is not overwritten.
result_df = all_df
result_df.insert(len(all_df.columns),
                 'probability', all_probability)


# # Predicted survival probability for Jack & Rose

# In[28]:


result_df[-2:]


# # Passengers with a high predicted survival probability who did not survive

# In[29]:


result_df[(result_df['survived'] == 0) & (result_df['probability'] > 0.9)]


# In[30]:


result_df[:5]
excel资源如下:
链接:https://pan.baidu.com/s/1PvonynplLKC6ZepSlL9DqQ
提取码:w7z3
这个是书上的例程,我没有多少时间细看。它采用多层感知器的方案,其中数据的预处理过程相当重要,也很经典。
也是对excel表格的处理吧。
不知道还有没有下次机会,能好好梳理一下。
标签:泰坦尼克号,Features,Keras,df,生存率,train,pd,model,history 来源: https://www.cnblogs.com/bai2018/p/10462530.html