首页 > 其他分享> > Keras的泰坦尼克号的生存率的数据分析

Keras的泰坦尼克号的生存率的数据分析

2019-03-02 19:40:45 作者：互联网

  1 # coding: utf-8
  2 
  3 # # 1. Import Library
  4 
  5 # In[1]:
  6 
  7 
  8 import numpy
  9 import pandas as pd
 10 from sklearn import preprocessing
 11 numpy.random.seed(10)
 12 
 13 
 14 # # 数据准备
 15 
 16 # In[2]:
 17 
 18 
 19 all_df = pd.read_excel("data/titanic3.xls")
 20 
 21 
 22 # In[3]:
 23 
 24 
 25 cols=['survived','name','pclass' ,'sex', 'age', 'sibsp',
 26       'parch', 'fare', 'embarked']
 27 all_df=all_df[cols]
 28 
 29 
 30 # In[4]:
 31 
 32 
 33 msk = numpy.random.rand(len(all_df)) < 0.8
 34 train_df = all_df[msk]
 35 test_df = all_df[~msk]
 36 
 37 
 38 # In[5]:
 39 
 40 
 41 print('total:',len(all_df),
 42       'train:',len(train_df),
 43       'test:',len(test_df))
 44 
 45 
 46 # In[6]:
 47 
 48 
 49 def PreprocessData(raw_df):
 50     df=raw_df.drop(['name'], axis=1)
 51     age_mean = df['age'].mean()
 52     df['age'] = df['age'].fillna(age_mean)
 53     fare_mean = df['fare'].mean()
 54     df['fare'] = df['fare'].fillna(fare_mean)
 55     df['sex']= df['sex'].map({'female':0, 'male': 1}).astype(int)
 56     x_One Hot_df = pd.get_dummies(data=df,columns=["embarked" ])
 57 
 58     ndarray = x_One Hot_df.values
 59     Features = ndarray[:,1:]
 60     Label = ndarray[:,0]
 61 
 62     minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
 63     scaledFeatures=minmax_scale.fit_transform(Features)    
 64     
 65     return scaledFeatures,Label
 66 
 67 
 68 # In[7]:
 69 
 70 
 71 train_Features,train_Label=PreprocessData(train_df)
 72 test_Features,test_Label=PreprocessData(test_df)
 73 
 74 
 75 # # 3. Create Model 
 76 
 77 # In[8]:
 78 
 79 
 80 from keras.models import Sequential
 81 from keras.layers import Dense,Dropout
 82 
 83 
 84 # In[9]:
 85 
 86 
 87 model = Sequential()
 88 
 89 
 90 # In[10]:
 91 
 92 
 93 model.add(Dense(units=40, input_dim=9, 
 94                 kernel_initializer='uniform', 
 95                 activation='relu'))
 96 
 97 
 98 # In[11]:
 99 
100 
101 model.add(Dense(units=30, 
102                 kernel_initializer='uniform', 
103                 activation='relu'))
104 
105 
106 # In[12]:
107 
108 
109 model.add(Dense(units=1, 
110                 kernel_initializer='uniform',
111                 activation='sigmoid'))
112 
113 
114 # # 4. Train model
115 
116 # In[13]:
117 
118 
# Configure training: binary cross-entropy loss, Adam optimizer, and
# accuracy reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 30 epochs in batches of 30 samples, asking Keras to set aside 10%
# of the training data for validation. verbose=2 selects Keras' per-epoch
# logging mode. The returned history object is kept for plotting.
train_history = model.fit(
    x=train_Features,
    y=train_Label,
    batch_size=30,
    epochs=30,
    validation_split=0.1,
    verbose=2,
)
131 
132 
133 # # 6. Print History
134 
135 # In[15]:
136 
137 
138 import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    """Plot one training metric together with its validation counterpart.

    Args:
        train_history: object whose ``history`` attribute maps metric names
            to per-epoch value sequences (as returned by ``model.fit``).
        train: key of the training metric; also used as the y-axis label.
        validation: key of the matching validation metric.

    Raises:
        KeyError: if either key is missing from ``train_history.history``.
    """
    curves = train_history.history

    # Draw the training curve first, then the validation curve, so the
    # legend entries below line up with the plotted lines.
    for key in (train, validation):
        plt.plot(curves[key])

    plt.title('Train History')
    plt.xlabel('Epoch')
    plt.ylabel(train)
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
147 
148 
149 # In[16]:
150 
151 
# The accuracy metric is recorded under 'acc' by older Keras releases and
# under 'accuracy' by newer ones. Use whichever key this run produced instead
# of hard-coding one, which would raise KeyError on the other.
acc_key = 'acc' if 'acc' in train_history.history else 'accuracy'
show_train_history(train_history, acc_key, 'val_' + acc_key)

# Loss curves for the training and validation data.
show_train_history(train_history, 'loss', 'val_loss')


# Evaluate the model's accuracy

# Evaluate on the held-out test split.
scores = model.evaluate(x=test_Features,
                        y=test_Label)

# Notebook-style display of the second entry of `scores` (the metric that
# follows the loss). As a plain script this expression has no visible effect.
scores[1]
174 
175 
176 # # 预测数据
177 
178 # # 加入Jack & Rose数据
179 
180 # In[20]:
181 
182 
# Two extra passengers, with values listed in the same order as the column
# names given to JR_df below.
Jack = pd.Series([0, 'Jack', 3, 'male', 23, 1, 0, 5.0000, 'S'])
Rose = pd.Series([1, 'Rose', 1, 'female', 20, 1, 0, 100.0000, 'S'])

# Build a two-row frame from the series values.
JR_df = pd.DataFrame(
    [list(passenger) for passenger in (Jack, Rose)],
    columns=[
        'survived', 'name', 'pclass', 'sex',
        'age', 'sibsp', 'parch', 'fare', 'embarked',
    ],
)

# Append the two rows to the end of the full passenger table.
all_df = pd.concat([all_df, JR_df])

# Notebook-style display of the last two rows (the ones just appended).
all_df.tail(2)
205 
206 
207 # # 进行预测
208 
209 # In[24]:
210 
211 
# Preprocess the whole table (including the two appended rows) and predict.
all_Features, Label = PreprocessData(all_df)

all_probability = model.predict(all_Features)

# Notebook-style display of the first ten predictions.
all_probability[:10]

# Attach the predictions as a new last column. The frame is referenced under
# its own name rather than rebinding `pd`, which would shadow the pandas
# module alias for all later code. As before, `all_df` itself receives the
# column, because `result_df` is the same object.
result_df = all_df
result_df.insert(len(result_df.columns),
                 'probability', all_probability)


# Predicted survival probability for Jack & Rose (the last two rows)

result_df[-2:]


# Passengers with a high predicted survival probability who did not survive

result_df[(result_df['survived'] == 0) & (result_df['probability'] > 0.9)]

# First five rows, now including the 'probability' column.
result_df[:5]

View Code

excel资源如下：

链接：https://pan.baidu.com/s/1PvonynplLKC6ZepSlL9DqQ
提取码：w7z3

这是书上的例程，我没有太多时间细看。它采用多层感知器（MLP）的方案，其中数据预处理的过程相当重要，也很经典。

也是对excel表格的处理吧。

希望下次有机会时，能把这些内容好好梳理一下。

标签：泰坦尼克号,Features,Keras,df,生存率,train,pd,model,history
来源： https://www.cnblogs.com/bai2018/p/10462530.html