首页 > 其他分享> > 逻辑回归0.5阈值修改问题

逻辑回归0.5阈值修改问题

2021-09-14 10:02:00 作者：互联网

逻辑回归阈值修改

# Validate the behaviour using sklearn's breast cancer dataset.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression as LR
import numpy as np
# Print floats in fixed-point form instead of scientific notation so the
# probabilities below are easy to read.
np.set_printoptions(suppress=True)

data = load_breast_cancer()

# Fit a logistic regression with default settings on the full dataset.
lr = LR().fit(data.data,data.target)

# Predict on the same data the model was trained on:
# first the class labels, then the per-class probabilities.
print(lr.predict(data.data))
print(lr.predict_proba(data.data))

打印结果如下：

predict(x)：直接输出0-1二分类结果
predict_proba(x): 分别输出0-1的概率可能性

   #预测源码
   def predict(self, X):
        """
        Predict class labels for samples in X.

        The label is chosen directly from the raw scores returned by
        ``decision_function``; no probability is computed in this method.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape [n_samples]
            Predicted class label per sample.
        """
        scores = self.decision_function(X)
        if len(scores.shape) == 1:
            # 1-D scores (one score per sample): a strictly positive score
            # selects index 1 of ``classes_``, anything else selects index 0.
            indices = (scores > 0).astype(int)
        else:
            # 2-D scores: pick, per sample, the column with the highest score.
            indices = scores.argmax(axis=1)
        return self.classes_[indices]
 # 分类概率源码       
 def predict_proba(self, X):
        """
        Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        For a multi_class problem, if multi_class is set to be "multinomial"
        the softmax function is used to find the predicted probability of
        each class.
        Else use a one-vs-rest approach, i.e. calculate the probability
        of each class assuming it to be positive using the logistic function,
        and normalize these values across all the classes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes_``.
        """
        check_is_fitted(self)

        # The one-vs-rest path is taken when it is requested explicitly
        # ("ovr" / "warn"), or when multi_class is 'auto' and either there
        # are at most two classes or the solver is 'liblinear'.
        ovr = (self.multi_class in ["ovr", "warn"] or
               (self.multi_class == 'auto' and (self.classes_.size <= 2 or
                                                self.solver == 'liblinear')))
        if ovr:
            # Delegate to the parent class helper (the one-vs-rest approach
            # described in the docstring above).
            return super()._predict_proba_lr(X)
        else:
            decision = self.decision_function(X)
            if decision.ndim == 1:
                # Workaround for multi_class="multinomial" and binary outcomes
                # which requires softmax prediction with only a 1D decision.
                decision_2d = np.c_[-decision, decision]
            else:
                decision_2d = decision
            return softmax(decision_2d, copy=False)
 # 输出线性结果
     def decision_function(self, X):
        """
        Predict confidence scores for samples.

        The confidence score for a sample is proportional to the signed
        distance of that sample to the hyperplane.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
            Confidence scores per (sample, class) combination. In the binary
            case, confidence score for self.classes_[1] where >0 means this
            class would be predicted.

        Raises
        ------
        ValueError
            If the number of columns of X differs from the number of
            features in ``self.coef_``.
        """
        check_is_fitted(self)

        X = check_array(X, accept_sparse='csr')

        # The input must have as many features as the fitted coefficients.
        n_features = self.coef_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError("X has %d features per sample; expecting %d"
                             % (X.shape[1], n_features))

        # Purely linear score: X . coef_^T + intercept_. No sigmoid or
        # softmax is applied here.
        scores = safe_sparse_dot(X, self.coef_.T,
                                 dense_output=True) + self.intercept_
        # A single score column is flattened to 1-D; otherwise the 2-D
        # score matrix is returned unchanged.
        return scores.ravel() if scores.shape[1] == 1 else scores

"""
从源码可以看出：predict()和predict_proba()

都使用了"decision = self.decision_function(X)" 然后进一步处理，
而decision看源码刚好是 "scores = safe_sparse_dot(X, self.coef_.T,
 dense_output=True) + self.intercept_"（intercept_ 为截距），即不经过 sigmoid 的结果
 

print(lr.decision_function(data.data))
# 这里直接打印decision结果，打印结果如下：
"""

结论：

从结果可以看出，这是线性拟合函数decision_function的结果，我们可以人为加入sigmoid：

print(1/(1+np.exp(-lr.decision_function(data.data))))

求取转化后[0,1]之间的连续值，这时就可以自定义threshold阈值，对结果进行切分或直接当作评分结果直接使用。

备注：

'''LR.predict()默认的阈值相当于0.5：线性拟合函数结果大于0就归为1，否则归为0（score > 0 等价于 sigmoid(score) > 0.5）。从源码可以看出，predict()本身并没有经过sigmoid函数'''
scores = self.decision_function(X)
if len(scores.shape) == 1:
            indices = (scores > 0).astype(int)
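'''LR.predict_proba()从源码可以看出，在非ovr分支中使用的是softmax()函数得出的结果
如 线性拟合函数值decision为-33，然后就使用 (np.exp(-decision)/(np.exp(decision)+np.exp(-decision))) 结果为 1 代表就是0类的概率为1，从上面的第一个样本结果可以看出
注意：按上面的源码，multi_class='auto' 且类别数不超过2时 ovr 为 True，此时走的是 _predict_proba_lr 分支，只有 ovr 为 False 时才会执行下面的softmax代码
'''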
decision_2d = np.c_[-decision, decision]
return softmax(decision_2d, copy=False)

标签：逻辑,阈值,predict,self,0.5,shape,scores,data,decision
来源： https://www.cnblogs.com/childheart/p/15266201.html