MNL(使用自己的数据集)
作者:互联网
1. 导入包
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
2. 导入自己的数据
# Load the wide-format mode-choice data. index_col=0 uses the first CSV column
# as the row index; the column names come from the header row (pandas default).
data_wide = pd.read_csv("./data/mode_wide.csv", index_col=0)
data_wide
choice | cost.car | cost.carpool | cost.bus | cost.rail | time.car | time.carpool | time.bus | time.rail | |
---|---|---|---|---|---|---|---|---|---|
1 | car | 1.507010 | 2.335612 | 1.800512 | 2.358920 | 18.503200 | 26.338233 | 20.867794 | 30.033469 |
2 | rail | 6.056998 | 2.896919 | 2.237128 | 1.855450 | 31.311107 | 34.256956 | 67.181889 | 60.293126 |
3 | car | 5.794677 | 2.137454 | 2.576385 | 2.747479 | 22.547429 | 23.255171 | 63.309057 | 49.171643 |
4 | car | 1.869144 | 2.572427 | 1.903518 | 2.268276 | 26.090282 | 29.896023 | 19.752704 | 13.472675 |
5 | car | 2.498952 | 1.722010 | 2.686000 | 2.973866 | 4.699140 | 12.414084 | 43.092039 | 39.743252 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
449 | rail | 6.990901 | 0.515137 | 2.066044 | 2.171174 | 48.022792 | 44.501577 | 27.271918 | 18.966319 |
450 | car | 4.591647 | 2.891148 | 1.900379 | 1.794407 | 29.444192 | 33.727087 | 66.117345 | 39.842459 |
451 | car | 3.236237 | 1.206815 | 1.754674 | 2.023671 | 16.349017 | 18.975074 | 23.387729 | 43.298276 |
452 | bus | 6.932740 | 1.171861 | 2.461495 | 2.612489 | 65.420641 | 60.481668 | 52.404315 | 48.370662 |
453 | carpool | 6.531509 | 1.408171 | 2.214791 | 1.856338 | 59.566073 | 55.141406 | 67.815635 | 73.447286 |
453 rows × 9 columns
2. 处理数据
y= 1(选car);
y = 2 (carpool);
y = 3 (rail);
y = 4 (bus);
def choice_to_y(choice):
    """Encode a travel-mode label as an integer class code.

    Mapping: 'car' -> 1, 'carpool' -> 2, 'rail' -> 3. Every other value
    (in this dataset, 'bus') -> 4.

    Args:
        choice: The travel-mode label to encode.

    Returns:
        The integer class code (1, 2, 3 or 4).
    """
    if choice == 'car':
        return 1
    if choice == 'carpool':
        return 2
    if choice == 'rail':
        return 3
    # Fallback: 'bus' and any unrecognized label share code 4.
    return 4
# Add the integer-coded target column 'y' by applying choice_to_y to each
# value of the 'choice' column.
data_wide['y'] = data_wide['choice'].map(choice_to_y)
data_wide
choice | cost.car | cost.carpool | cost.bus | cost.rail | time.car | time.carpool | time.bus | time.rail | y | |
---|---|---|---|---|---|---|---|---|---|---|
1 | car | 1.507010 | 2.335612 | 1.800512 | 2.358920 | 18.503200 | 26.338233 | 20.867794 | 30.033469 | 1 |
2 | rail | 6.056998 | 2.896919 | 2.237128 | 1.855450 | 31.311107 | 34.256956 | 67.181889 | 60.293126 | 3 |
3 | car | 5.794677 | 2.137454 | 2.576385 | 2.747479 | 22.547429 | 23.255171 | 63.309057 | 49.171643 | 1 |
4 | car | 1.869144 | 2.572427 | 1.903518 | 2.268276 | 26.090282 | 29.896023 | 19.752704 | 13.472675 | 1 |
5 | car | 2.498952 | 1.722010 | 2.686000 | 2.973866 | 4.699140 | 12.414084 | 43.092039 | 39.743252 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
449 | rail | 6.990901 | 0.515137 | 2.066044 | 2.171174 | 48.022792 | 44.501577 | 27.271918 | 18.966319 | 3 |
450 | car | 4.591647 | 2.891148 | 1.900379 | 1.794407 | 29.444192 | 33.727087 | 66.117345 | 39.842459 | 1 |
451 | car | 3.236237 | 1.206815 | 1.754674 | 2.023671 | 16.349017 | 18.975074 | 23.387729 | 43.298276 | 1 |
452 | bus | 6.932740 | 1.171861 | 2.461495 | 2.612489 | 65.420641 | 60.481668 | 52.404315 | 48.370662 | 4 |
453 | carpool | 6.531509 | 1.408171 | 2.214791 | 1.856338 | 59.566073 | 55.141406 | 67.815635 | 73.447286 | 2 |
453 rows × 10 columns
3. 确定自变量X和因变量y
data_wide.columns
Index(['choice', 'cost.car', 'cost.carpool', 'cost.bus', 'cost.rail',
'time.car', 'time.carpool', 'time.bus', 'time.rail', 'y'],
dtype='object')
# Predictors: the cost and time columns of all four modes.
# Target: the integer-coded choice column 'y'.
feature_columns = [
    'cost.car', 'cost.carpool', 'cost.bus', 'cost.rail',
    'time.car', 'time.carpool', 'time.bus', 'time.rail',
]
X = data_wide[feature_columns]
y = data_wide['y']
4. 配置Logit模型并评估
# Multinomial logistic regression fitted with the lbfgs solver.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# Define the evaluation procedure: n_splits is the K of K-fold;
# n_repeats is how many times the whole K-fold procedure is repeated.
# random_state=1 fixes the fold assignment.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate the model and collect one accuracy score per fold
# (n_splits * n_repeats scores in total).
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# Report the mean and standard deviation of the accuracy scores.
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Mean Accuracy: 0.665 (0.061)
5. 拟合
# Fit the model on the full dataset.
# NOTE(review): the recorded output shows lbfgs stopping at its iteration limit
# (ConvergenceWarning); consider raising max_iter or scaling the features.
model.fit(X, y)
D:\ANACONDA\lib\site-packages\sklearn\linear_model\_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
LogisticRegression(multi_class='multinomial')
6. 设置一个新的数据,预测结果
# Generate one new observation: 8 random values in [0, 1), one per feature column.
new_data = np.random.rand(8)
new_data
array([0.11880174, 0.16505872, 0.14297278, 0.50355392, 0.87629855,
0.91189688, 0.57073101, 0.19178997])
# Prediction
# Predict the class-probability distribution for the new observation.
# NOTE(review): the input is a bare array wrapped in a list, with no column names,
# while the model was fitted on a DataFrame; this appears to be what triggers the
# "X does not have valid feature names" UserWarning in the recorded output.
yhat = model.predict_proba([new_data])
# Print the predicted probabilities for the single input row.
print('Predicted Probabilities: %s' % yhat[0])
Predicted Probabilities: [0.3749058 0.20228137 0.20380141 0.21901142]
D:\ANACONDA\lib\site-packages\sklearn\base.py:451: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
"X does not have valid feature names, but"
至此,已经可以了解使用自己的数据进行多项 Logit(MNL)回归的基本思路;
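上面的警告并不是列名格式有误,而是因为模型是用带列名的 DataFrame 训练的,而预测时传入的是不带列名的 NumPy 数组;把新数据包装成列名与 X 相同的 DataFrame(例如 pd.DataFrame([new_data], columns=X.columns))即可消除该警告。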
标签:wide,data,choice,cost,MNL,使用,import,model,数据 来源: https://blog.csdn.net/sheyueyu/article/details/123610338