逻辑回归0.5阈值修改问题
作者:互联网
逻辑回归阈值修改
# Validate the behaviour on scikit-learn's built-in breast cancer dataset.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression as LR
import numpy as np
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
data = load_breast_cancer()
# Fit a logistic regression with default settings on the full dataset.
lr = LR().fit(data.data,data.target)
# Predict on the training data: first the hard class labels, then the
# per-class probabilities.
print(lr.predict(data.data))
print(lr.predict_proba(data.data))
打印结果如下:
- predict(x):直接输出0-1二分类结果
- predict_proba(x): 分别输出0-1的概率可能性
#预测源码
def predict(self, X):
    """
    Predict class labels for samples in X.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Samples.

    Returns
    -------
    C : array, shape [n_samples]
        Predicted class label per sample.
    """
    scores = self.decision_function(X)
    if len(scores.shape) == 1:
        # Binary case: one signed score per sample. A strictly positive
        # score selects classes_[1]; zero or negative selects classes_[0].
        # No sigmoid is applied here; the cut is made on the raw score.
        indices = (scores > 0).astype(int)
    else:
        # One score column per class: pick the highest-scoring column.
        indices = scores.argmax(axis=1)
    return self.classes_[indices]
# 分类概率源码
def predict_proba(self, X):
    """
    Probability estimates.

    The returned estimates for all classes are ordered by the
    label of classes.

    For a multi_class problem, if multi_class is set to be "multinomial"
    the softmax function is used to find the predicted probability of
    each class.
    Else use a one-vs-rest approach, i.e. calculate the probability
    of each class assuming it to be positive using the logistic function,
    and normalize these values across all the classes.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Vector to be scored, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    Returns
    -------
    T : array-like of shape (n_samples, n_classes)
        Returns the probability of the sample for each class in the model,
        where classes are ordered as they are in ``self.classes_``.
    """
    check_is_fitted(self)

    # One-vs-rest is used when requested explicitly ("ovr"/"warn"), or when
    # multi_class is 'auto' and either there are at most two classes or the
    # solver is 'liblinear'.
    ovr = (self.multi_class in ["ovr", "warn"] or
           (self.multi_class == 'auto' and (self.classes_.size <= 2 or
                                            self.solver == 'liblinear')))
    if ovr:
        # Delegates to the parent class implementation.
        return super()._predict_proba_lr(X)
    else:
        decision = self.decision_function(X)
        if decision.ndim == 1:
            # Workaround for multi_class="multinomial" and binary outcomes
            # which requires softmax prediction with only a 1D decision.
            decision_2d = np.c_[-decision, decision]
        else:
            decision_2d = decision
        return softmax(decision_2d, copy=False)
# 输出线性结果
def decision_function(self, X):
    """
    Predict confidence scores for samples.

    The confidence score for a sample is proportional to the signed
    distance of that sample to the hyperplane.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Samples.

    Returns
    -------
    array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
        Confidence scores per (sample, class) combination. In the binary
        case, confidence score for self.classes_[1] where >0 means this
        class would be predicted.

    Raises
    ------
    ValueError
        If the number of columns in X differs from the number of
        coefficients per class in ``self.coef_``.
    """
    check_is_fitted(self)

    X = check_array(X, accept_sparse='csr')

    # Reject inputs whose feature count does not match the fitted model.
    n_features = self.coef_.shape[1]
    if X.shape[1] != n_features:
        raise ValueError("X has %d features per sample; expecting %d"
                         % (X.shape[1], n_features))

    # Purely linear score: X @ coef_.T + intercept_. No sigmoid or softmax
    # is applied at this stage.
    scores = safe_sparse_dot(X, self.coef_.T,
                             dense_output=True) + self.intercept_
    # A single score column is flattened to a 1-D array.
    return scores.ravel() if scores.shape[1] == 1 else scores
"""
从源码可以看出:predict()和predict_proba()
都调用了 self.decision_function(X),然后对其结果进一步处理,
而decision_function从源码看返回的正是 "scores = safe_sparse_dot(X, self.coef_.T,
dense_output=True) + self.intercept_"(intercept_ 为截距),即不经过sigmoid的线性结果:
print(lr.decision_function(data.data))
# 这里直接打印decision结果,打印结果如下:
"""
结论:
从结果可以看出,这是线性拟合函数decision_function的输出;我们可以人为地对它套用sigmoid:
print(1/(1+np.exp(-lr.decision_function(data.data))))
求取转化后[0,1]之间的连续值,这时就可以自定义threshold阈值,对结果进行切分,或直接当作评分结果使用。
备注:
'''LR.predict()默认的阈值相当于概率0.5:线性拟合函数结果为负值(或0)就归为0,为正值就归为1;从源码可以看出,它并没有经过sigmoid函数'''
scores = self.decision_function(X)
if len(scores.shape) == 1:
indices = (scores > 0).astype(int)
'''LR.predict_proba()从源码可以看出,在非ovr(multinomial)分支中使用的是softmax()函数得出的结果
如 线性拟合函数值decision为-33,然后就使用 (np.exp(-decision)/(np.exp(decision)+np.exp(-decision))) 结果为 1,代表属于0类的概率为1,从上面的第一个样本结果可以看出
'''
decision_2d = np.c_[-decision, decision]
return softmax(decision_2d, copy=False)
标签:逻辑,阈值,predict,self,0.5,shape,scores,data,decision 来源: https://www.cnblogs.com/childheart/p/15266201.html