"""Metrics to assess performance on classification task given classe prediction Functions named as ``*_score`` return a scalar value to maximize: the higher the better Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: the lower the better """ # Authors: Alexandre Gramfort # Mathieu Blondel # Olivier Grisel # Arnaud Joly # Jochen Wersdorfer # Lars Buitinck # Joel Nothman # Noel Dawe # Jatin Shah # Saurabh Jha # License: BSD 3 clause from __future__ import division import warnings import numpy as np from scipy.sparse import coo_matrix from scipy.sparse import csr_matrix from scipy.spatial.distance import hamming as sp_hamming from ..preprocessing import LabelBinarizer, label_binarize from ..preprocessing import LabelEncoder from ..utils import check_array from ..utils import check_consistent_length from ..utils import column_or_1d from ..utils.multiclass import unique_labels from ..utils.multiclass import type_of_target from ..utils.validation import _num_samples from ..utils.sparsefuncs import count_nonzero from ..utils.fixes import bincount from .base import UndefinedMetricWarning def _check_targets(y_true, y_pred): """Check that y_true and y_pred belong to the same classification task This converts multiclass or binary types to a common shape, and raises a ValueError for a mix of multilabel and multiclass targets, a mix of multilabel formats, for the presence of continuous-valued or multioutput targets, or for targets of different lengths. Column vectors are squeezed to 1d, while multilabel formats are returned as CSR sparse label indicators. Parameters ---------- y_true : array-like y_pred : array-like Returns ------- type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} The type of the true target data, as output by ``utils.multiclass.type_of_target`` y_true : array or indicator matrix y_pred : array or indicator matrix """ check_consistent_length(y_true, y_pred) type_true = type_of_target(y_true) type_pred = type_of_target(y_pred) y_type = set([type_true, type_pred]) if y_type == set(["binary", "multiclass"]): y_type = set(["multiclass"]) if len(y_type) > 1: raise ValueError("Can't handle mix of {0} and {1}" "".format(type_true, type_pred)) # We can't have more than one value on y_type => The set is no more needed y_type = y_type.pop() # No metrics support "multiclass-multioutput" format if (y_type not in ["binary", "multiclass", "multilabel-indicator"]): raise ValueError("{0} is not supported".format(y_type)) if y_type in ["binary", "multiclass"]: y_true = column_or_1d(y_true) y_pred = column_or_1d(y_pred) if y_type.startswith('multilabel'): y_true = csr_matrix(y_true) y_pred = csr_matrix(y_pred) y_type = 'multilabel-indicator' return y_type, y_true, y_pred def _weighted_sum(sample_score, sample_weight, normalize=False): if normalize: return np.average(sample_score, weights=sample_weight) elif sample_weight is not None: return np.dot(sample_score, sample_weight) else: return sample_score.sum() def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None): """Accuracy classification score. In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must *exactly* match the corresponding set of labels in y_true. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) labels. y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. normalize : bool, optional (default=True) If ``False``, return the number of correctly classified samples. Otherwise, return the fraction of correctly classified samples. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- score : float If ``normalize == True``, return the correctly classified samples (float), else it returns the number of correctly classified samples (int). The best performance is 1 with ``normalize == True`` and the number of samples with ``normalize == False``. See also -------- jaccard_similarity_score, hamming_loss, zero_one_loss Notes ----- In binary and multiclass classification, this function is equal to the ``jaccard_similarity_score`` function. Examples -------- >>> import numpy as np >>> from sklearn.metrics import accuracy_score >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] >>> accuracy_score(y_true, y_pred) 0.5 >>> accuracy_score(y_true, y_pred, normalize=False) 2 In the multilabel case with binary label indicators: >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) 0.5 """ # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type.startswith('multilabel'): differing_labels = count_nonzero(y_true - y_pred, axis=1) score = differing_labels == 0 else: score = y_true == y_pred return _weighted_sum(score, sample_weight, normalize) def confusion_matrix(y_true, y_pred, labels=None): """Compute confusion matrix to evaluate the accuracy of a classification By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}` is equal to the number of observations known to be in group :math:`i` but predicted to be in group :math:`j`. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] Ground truth (correct) target values. y_pred : array, shape = [n_samples] Estimated targets as returned by a classifier. labels : array, shape = [n_classes], optional List of labels to index the matrix. This may be used to reorder or select a subset of labels. If none is given, those that appear at least once in ``y_true`` or ``y_pred`` are used in sorted order. Returns ------- C : array, shape = [n_classes, n_classes] Confusion matrix References ---------- .. [1] `Wikipedia entry for the Confusion matrix `_ Examples -------- >>> from sklearn.metrics import confusion_matrix >>> y_true = [2, 0, 2, 2, 0, 1] >>> y_pred = [0, 0, 2, 2, 0, 2] >>> confusion_matrix(y_true, y_pred) array([[2, 0, 0], [0, 0, 1], [1, 0, 2]]) >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"] >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"] >>> confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"]) array([[2, 0, 0], [0, 0, 1], [1, 0, 2]]) """ y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type not in ("binary", "multiclass"): raise ValueError("%s is not supported" % y_type) if labels is None: labels = unique_labels(y_true, y_pred) else: labels = np.asarray(labels) n_labels = labels.size label_to_ind = dict((y, x) for x, y in enumerate(labels)) # convert yt, yp into index y_pred = np.array([label_to_ind.get(x, n_labels + 1) for x in y_pred]) y_true = np.array([label_to_ind.get(x, n_labels + 1) for x in y_true]) # intersect y_pred, y_true with labels, eliminate items not in labels ind = np.logical_and(y_pred < n_labels, y_true < n_labels) y_pred = y_pred[ind] y_true = y_true[ind] CM = coo_matrix((np.ones(y_true.shape[0], dtype=np.int), (y_true, y_pred)), shape=(n_labels, n_labels) ).toarray() return CM def cohen_kappa_score(y1, y2, labels=None): """Cohen's kappa: a statistic that measures inter-annotator agreement. This function computes Cohen's kappa [1], a score that expresses the level of agreement between two annotators on a classification problem. It is defined as .. math:: \kappa = (p_o - p_e) / (1 - p_e) where :math:`p_o` is the empirical probability of agreement on the label assigned to any sample (the observed agreement ratio), and :math:`p_e` is the expected agreement when both annotators assign labels randomly. :math:`p_e` is estimated using a per-annotator empirical prior over the class labels [2]. Parameters ---------- y1 : array, shape = [n_samples] Labels assigned by the first annotator. y2 : array, shape = [n_samples] Labels assigned by the second annotator. The kappa statistic is symmetric, so swapping ``y1`` and ``y2`` doesn't change the value. labels : array, shape = [n_classes], optional List of labels to index the matrix. This may be used to select a subset of labels. If None, all labels that appear at least once in ``y1`` or ``y2`` are used. Returns ------- kappa : float The kappa statistic, which is a number between -1 and 1. The maximum value means complete agreement; zero or lower means chance agreement. References ---------- .. [1] J. Cohen (1960). "A coefficient of agreement for nominal scales". Educational and Psychological Measurement 20(1):37-46. doi:10.1177/001316446002000104. .. [2] R. Artstein and M. Poesio (2008). "Inter-coder agreement for computational linguistics". Computational Linguistic 34(4):555-596. """ confusion = confusion_matrix(y1, y2, labels=labels) P = confusion / float(confusion.sum()) p_observed = np.trace(P) p_expected = np.dot(P.sum(axis=0), P.sum(axis=1)) return (p_observed - p_expected) / (1 - p_expected) def jaccard_similarity_score(y_true, y_pred, normalize=True, sample_weight=None): """Jaccard similarity coefficient score The Jaccard index [1], or Jaccard similarity coefficient, defined as the size of the intersection divided by the size of the union of two label sets, is used to compare set of predicted labels for a sample to the corresponding set of labels in ``y_true``. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) labels. y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. normalize : bool, optional (default=True) If ``False``, return the sum of the Jaccard similarity coefficient over the sample set. Otherwise, return the average of Jaccard similarity coefficient. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- score : float If ``normalize == True``, return the average Jaccard similarity coefficient, else it returns the sum of the Jaccard similarity coefficient over the sample set. The best performance is 1 with ``normalize == True`` and the number of samples with ``normalize == False``. See also -------- accuracy_score, hamming_loss, zero_one_loss Notes ----- In binary and multiclass classification, this function is equivalent to the ``accuracy_score``. It differs in the multilabel classification problem. References ---------- .. [1] `Wikipedia entry for the Jaccard index `_ Examples -------- >>> import numpy as np >>> from sklearn.metrics import jaccard_similarity_score >>> y_pred = [0, 2, 1, 3] >>> y_true = [0, 1, 2, 3] >>> jaccard_similarity_score(y_true, y_pred) 0.5 >>> jaccard_similarity_score(y_true, y_pred, normalize=False) 2 In the multilabel case with binary label indicators: >>> jaccard_similarity_score(np.array([[0, 1], [1, 1]]),\ np.ones((2, 2))) 0.75 """ # Compute accuracy for each possible representation y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type.startswith('multilabel'): with np.errstate(divide='ignore', invalid='ignore'): # oddly, we may get an "invalid" rather than a "divide" error here pred_or_true = count_nonzero(y_true + y_pred, axis=1) pred_and_true = count_nonzero(y_true.multiply(y_pred), axis=1) score = pred_and_true / pred_or_true # If there is no label, it results in a Nan instead, we set # the jaccard to 1: lim_{x->0} x/x = 1 # Note with py2.6 and np 1.3: we can't check safely for nan. score[pred_or_true == 0.0] = 1.0 else: score = y_true == y_pred return _weighted_sum(score, sample_weight, normalize) def matthews_corrcoef(y_true, y_pred): """Compute the Matthews correlation coefficient (MCC) for binary classes The Matthews correlation coefficient is used in machine learning as a measure of the quality of binary (two-class) classifications. It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. The MCC is in essence a correlation coefficient value between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 an average random prediction and -1 an inverse prediction. The statistic is also known as the phi coefficient. [source: Wikipedia] Only in the binary case does this relate to information about true and false positives and negatives. See references below. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] Ground truth (correct) target values. y_pred : array, shape = [n_samples] Estimated targets as returned by a classifier. Returns ------- mcc : float The Matthews correlation coefficient (+1 represents a perfect prediction, 0 an average random prediction and -1 and inverse prediction). References ---------- .. [1] `Baldi, Brunak, Chauvin, Andersen and Nielsen, (2000). Assessing the accuracy of prediction algorithms for classification: an overview `_ .. [2] `Wikipedia entry for the Matthews Correlation Coefficient `_ Examples -------- >>> from sklearn.metrics import matthews_corrcoef >>> y_true = [+1, +1, +1, -1] >>> y_pred = [+1, -1, +1, +1] >>> matthews_corrcoef(y_true, y_pred) # doctest: +ELLIPSIS -0.33... """ y_type, y_true, y_pred = _check_targets(y_true, y_pred) if y_type != "binary": raise ValueError("%s is not supported" % y_type) lb = LabelEncoder() lb.fit(np.hstack([y_true, y_pred])) y_true = lb.transform(y_true) y_pred = lb.transform(y_pred) with np.errstate(invalid='ignore'): mcc = np.corrcoef(y_true, y_pred)[0, 1] if np.isnan(mcc): return 0. else: return mcc def zero_one_loss(y_true, y_pred, normalize=True, sample_weight=None): """Zero-one classification loss. If normalize is ``True``, return the fraction of misclassifications (float), else it returns the number of misclassifications (int). The best performance is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) labels. y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. normalize : bool, optional (default=True) If ``False``, return the number of misclassifications. Otherwise, return the fraction of misclassifications. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- loss : float or int, If ``normalize == True``, return the fraction of misclassifications (float), else it returns the number of misclassifications (int). Notes ----- In multilabel classification, the zero_one_loss function corresponds to the subset zero-one loss: for each sample, the entire set of labels must be correctly predicted, otherwise the loss for that sample is equal to one. See also -------- accuracy_score, hamming_loss, jaccard_similarity_score Examples -------- >>> from sklearn.metrics import zero_one_loss >>> y_pred = [1, 2, 3, 4] >>> y_true = [2, 2, 3, 4] >>> zero_one_loss(y_true, y_pred) 0.25 >>> zero_one_loss(y_true, y_pred, normalize=False) 1 In the multilabel case with binary label indicators: >>> zero_one_loss(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) 0.5 """ score = accuracy_score(y_true, y_pred, normalize=normalize, sample_weight=sample_weight) if normalize: return 1 - score else: if sample_weight is not None: n_samples = np.sum(sample_weight) else: n_samples = _num_samples(y_true) return n_samples - score def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None): """Compute the F1 score, also known as balanced F-score or F-measure The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:: F1 = 2 * (precision * recall) / (precision + recall) In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. .. versionchanged:: 0.17 parameter *labels* improved for multiclass problem. pos_label : str or int, 1 by default The class to report if ``average='binary'``. Until version 0.18 it is necessary to set ``pos_label=None`` if seeking to use another averaging method over binary targets. average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ 'weighted'] This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- f1_score : float or array of float, shape = [n_unique_labels] F1 score of the positive class in binary classification or weighted average of the F1 scores of each class for the multiclass task. References ---------- .. [1] `Wikipedia entry for the F1-score `_ Examples -------- >>> from sklearn.metrics import f1_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> f1_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS 0.26... >>> f1_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.33... >>> f1_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 0.26... >>> f1_score(y_true, y_pred, average=None) array([ 0.8, 0. , 0. ]) """ return fbeta_score(y_true, y_pred, 1, labels=labels, pos_label=pos_label, average=average, sample_weight=sample_weight) def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1, average='binary', sample_weight=None): """Compute the F-beta score The F-beta score is the weighted harmonic mean of precision and recall, reaching its optimal value at 1 and its worst value at 0. The `beta` parameter determines the weight of precision in the combined score. ``beta < 1`` lends more weight to precision, while ``beta > 1`` favors recall (``beta -> 0`` considers only precision, ``beta -> inf`` only recall). Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. beta: float Weight of precision in harmonic mean. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. .. versionchanged:: 0.17 parameter *labels* improved for multiclass problem. pos_label : str or int, 1 by default The class to report if ``average='binary'``. Until version 0.18 it is necessary to set ``pos_label=None`` if seeking to use another averaging method over binary targets. average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ 'weighted'] This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- fbeta_score : float (if average is not None) or array of float, shape =\ [n_unique_labels] F-beta score of the positive class in binary classification or weighted average of the F-beta score of each class for the multiclass task. References ---------- .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern Information Retrieval. Addison Wesley, pp. 327-328. .. [2] `Wikipedia entry for the F1-score `_ Examples -------- >>> from sklearn.metrics import fbeta_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5) ... # doctest: +ELLIPSIS 0.23... >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5) ... # doctest: +ELLIPSIS 0.33... >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5) ... # doctest: +ELLIPSIS 0.23... >>> fbeta_score(y_true, y_pred, average=None, beta=0.5) ... # doctest: +ELLIPSIS array([ 0.71..., 0. , 0. ]) """ _, _, f, _ = precision_recall_fscore_support(y_true, y_pred, beta=beta, labels=labels, pos_label=pos_label, average=average, warn_for=('f-score',), sample_weight=sample_weight) return f def _prf_divide(numerator, denominator, metric, modifier, average, warn_for): """Performs division and handles divide-by-zero. On zero-division, sets the corresponding result elements to zero and raises a warning. The metric, modifier and average arguments are used only for determining an appropriate warning. """ result = numerator / denominator mask = denominator == 0.0 if not np.any(mask): return result # remove infs result[mask] = 0.0 # build appropriate warning # E.g. "Precision and F-score are ill-defined and being set to 0.0 in # labels with no predicted samples" axis0 = 'sample' axis1 = 'label' if average == 'samples': axis0, axis1 = axis1, axis0 if metric in warn_for and 'f-score' in warn_for: msg_start = '{0} and F-score are'.format(metric.title()) elif metric in warn_for: msg_start = '{0} is'.format(metric.title()) elif 'f-score' in warn_for: msg_start = 'F-score is' else: return result msg = ('{0} ill-defined and being set to 0.0 {{0}} ' 'no {1} {2}s.'.format(msg_start, modifier, axis0)) if len(mask) == 1: msg = msg.format('due to') else: msg = msg.format('in {0}s with'.format(axis1)) warnings.warn(msg, UndefinedMetricWarning, stacklevel=2) return result def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None, pos_label=1, average=None, warn_for=('precision', 'recall', 'f-score'), sample_weight=None): """Compute precision, recall, F-measure and support for each class The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0. The F-beta score weights recall more than precision by a factor of ``beta``. ``beta == 1.0`` means recall and precision are equally important. The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function returns the average precision, recall and F-measure if ``average`` is one of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. beta : float, 1.0 by default The strength of recall versus precision in the F-score. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. pos_label : str or int, 1 by default The class to report if ``average='binary'``. Until version 0.18 it is necessary to set ``pos_label=None`` if seeking to use another averaging method over binary targets. average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \ 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- precision: float (if average is not None) or array of float, shape =\ [n_unique_labels] recall: float (if average is not None) or array of float, , shape =\ [n_unique_labels] fbeta_score: float (if average is not None) or array of float, shape =\ [n_unique_labels] support: int (if average is not None) or array of int, shape =\ [n_unique_labels] The number of occurrences of each label in ``y_true``. References ---------- .. [1] `Wikipedia entry for the Precision and recall `_ .. [2] `Wikipedia entry for the F1-score `_ .. [3] `Discriminative Methods for Multi-labeled Classification Advances in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu Godbole, Sunita Sarawagi ` Examples -------- >>> from sklearn.metrics import precision_recall_fscore_support >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) >>> precision_recall_fscore_support(y_true, y_pred, average='macro') ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) >>> precision_recall_fscore_support(y_true, y_pred, average='micro') ... # doctest: +ELLIPSIS (0.33..., 0.33..., 0.33..., None) >>> precision_recall_fscore_support(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS (0.22..., 0.33..., 0.26..., None) It is possible to compute per-label precisions, recalls, F1-scores and supports instead of averaging: >>> precision_recall_fscore_support(y_true, y_pred, average=None, ... labels=['pig', 'dog', 'cat']) ... # doctest: +ELLIPSIS,+NORMALIZE_WHITESPACE (array([ 0. , 0. , 0.66...]), array([ 0., 0., 1.]), array([ 0. , 0. , 0.8]), array([2, 2, 2])) """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) if beta <= 0: raise ValueError("beta should be >0 in the F-beta score") y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) if average == 'binary' and (y_type != 'binary' or pos_label is None): warnings.warn('The default `weighted` averaging is deprecated, ' 'and from version 0.18, use of precision, recall or ' 'F-score with multiclass or multilabel data or ' 'pos_label=None will result in an exception. ' 'Please set an explicit value for `average`, one of ' '%s. In cross validation use, for instance, ' 'scoring="f1_weighted" instead of scoring="f1".' % str(average_options), DeprecationWarning, stacklevel=2) average = 'weighted' if y_type == 'binary' and pos_label is not None and average is not None: if average != 'binary': warnings.warn('From version 0.18, binary input will not be ' 'handled specially when using averaged ' 'precision/recall/F-score. ' 'Please use average=\'binary\' to report only the ' 'positive class performance.', DeprecationWarning) if labels is None or len(labels) <= 2: if pos_label not in present_labels: if len(present_labels) < 2: # Only negative labels return (0., 0., 0., 0) else: raise ValueError("pos_label=%r is not a valid label: %r" % (pos_label, present_labels)) labels = [pos_label] if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)]) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith('multilabel'): sum_axis = 1 if average == 'samples' else 0 # All labels are index integers for multilabel. # Select labels: if not np.all(labels == present_labels): if np.max(labels) > np.max(present_labels): raise ValueError('All labels must be in [0, n labels). ' 'Got %d > %d' % (np.max(labels), np.max(present_labels))) if np.min(labels) < 0: raise ValueError('All labels must be in [0, n labels). ' 'Got %d < 0' % np.min(labels)) y_true = y_true[:, labels[:n_labels]] y_pred = y_pred[:, labels[:n_labels]] # calculate weighted counts true_and_pred = y_true.multiply(y_pred) tp_sum = count_nonzero(true_and_pred, axis=sum_axis, sample_weight=sample_weight) pred_sum = count_nonzero(y_pred, axis=sum_axis, sample_weight=sample_weight) true_sum = count_nonzero(y_true, axis=sum_axis, sample_weight=sample_weight) elif average == 'samples': raise ValueError("Sample-based precision, recall, fscore is " "not meaningful outside multilabel " "classification. See the accuracy_score instead.") else: le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = bincount(y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): true_sum = bincount(y_true, weights=sample_weight, minlength=len(labels)) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] pred_sum = pred_sum[indices] if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) # Finally, we have all our sufficient statistics. Divide! # beta2 = beta ** 2 with np.errstate(divide='ignore', invalid='ignore'): # Divide, and on zero-division, set scores to 0 and warn: # Oddly, we may get an "invalid" rather than a "divide" error # here. precision = _prf_divide(tp_sum, pred_sum, 'precision', 'predicted', average, warn_for) recall = _prf_divide(tp_sum, true_sum, 'recall', 'true', average, warn_for) # Don't need to warn for F: either P or R warned, or tp == 0 where pos # and true are nonzero, in which case, F is well-defined and zero f_score = ((1 + beta2) * precision * recall / (beta2 * precision + recall)) f_score[tp_sum == 0] = 0.0 # Average the results if average == 'weighted': weights = true_sum if weights.sum() == 0: return 0, 0, 0, None elif average == 'samples': weights = sample_weight else: weights = None if average is not None: assert average != 'binary' or len(precision) == 1 precision = np.average(precision, weights=weights) recall = np.average(recall, weights=weights) f_score = np.average(f_score, weights=weights) true_sum = None # return no support return precision, recall, f_score, true_sum def precision_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None): """Compute the precision The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of true positives and ``fp`` the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. .. versionchanged:: 0.17 parameter *labels* improved for multiclass problem. pos_label : str or int, 1 by default The class to report if ``average='binary'``. Until version 0.18 it is necessary to set ``pos_label=None`` if seeking to use another averaging method over binary targets. average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ 'weighted'] This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- precision : float (if average is not None) or array of float, shape =\ [n_unique_labels] Precision of the positive class in binary classification or weighted average of the precision of each class for the multiclass task. Examples -------- >>> from sklearn.metrics import precision_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> precision_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS 0.22... >>> precision_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.33... >>> precision_score(y_true, y_pred, average='weighted') ... # doctest: +ELLIPSIS 0.22... >>> precision_score(y_true, y_pred, average=None) # doctest: +ELLIPSIS array([ 0.66..., 0. , 0. ]) """ p, _, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('precision',), sample_weight=sample_weight) return p def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary', sample_weight=None): """Compute the recall The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples. The best value is 1 and the worst value is 0. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. .. versionchanged:: 0.17 parameter *labels* improved for multiclass problem. pos_label : str or int, 1 by default The class to report if ``average='binary'``. Until version 0.18 it is necessary to set ``pos_label=None`` if seeking to use another averaging method over binary targets. average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \ 'weighted'] This parameter is required for multiclass/multilabel targets. If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). Note that if ``pos_label`` is given in binary classification with `average != 'binary'`, only that positive class is reported. This behavior is deprecated and will change in version 0.18. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- recall : float (if average is not None) or array of float, shape =\ [n_unique_labels] Recall of the positive class in binary classification or weighted average of the recall of each class for the multiclass task. Examples -------- >>> from sklearn.metrics import recall_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> recall_score(y_true, y_pred, average='macro') # doctest: +ELLIPSIS 0.33... >>> recall_score(y_true, y_pred, average='micro') # doctest: +ELLIPSIS 0.33... >>> recall_score(y_true, y_pred, average='weighted') # doctest: +ELLIPSIS 0.33... >>> recall_score(y_true, y_pred, average=None) array([ 1., 0., 0.]) """ _, r, _, _ = precision_recall_fscore_support(y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('recall',), sample_weight=sample_weight) return r def classification_report(y_true, y_pred, labels=None, target_names=None, sample_weight=None, digits=2): """Build a text report showing the main classification metrics Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) target values. y_pred : 1d array-like, or label indicator array / sparse matrix Estimated targets as returned by a classifier. labels : array, shape = [n_labels] Optional list of label indices to include in the report. target_names : list of strings Optional display names matching the labels (same order). sample_weight : array-like of shape = [n_samples], optional Sample weights. digits : int Number of digits for formatting output floating point values Returns ------- report : string Text summary of the precision, recall, F1 score for each class. Examples -------- >>> from sklearn.metrics import classification_report >>> y_true = [0, 1, 2, 2, 2] >>> y_pred = [0, 0, 2, 2, 1] >>> target_names = ['class 0', 'class 1', 'class 2'] >>> print(classification_report(y_true, y_pred, target_names=target_names)) precision recall f1-score support class 0 0.50 1.00 0.67 1 class 1 0.00 0.00 0.00 1 class 2 1.00 0.67 0.80 3 avg / total 0.70 0.60 0.61 5 """ if labels is None: labels = unique_labels(y_true, y_pred) else: labels = np.asarray(labels) last_line_heading = 'avg / total' if target_names is None: width = len(last_line_heading) target_names = ['%s' % l for l in labels] else: width = max(len(cn) for cn in target_names) width = max(width, len(last_line_heading), digits) headers = ["precision", "recall", "f1-score", "support"] fmt = '%% %ds' % width # first column: class name fmt += ' ' fmt += ' '.join(['% 9s' for _ in headers]) fmt += '\n' headers = [""] + headers report = fmt % tuple(headers) report += '\n' p, r, f1, s = precision_recall_fscore_support(y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight) for i, label in enumerate(labels): values = [target_names[i]] for v in (p[i], r[i], f1[i]): values += ["{0:0.{1}f}".format(v, digits)] values += ["{0}".format(s[i])] report += fmt % tuple(values) report += '\n' # compute averages values = [last_line_heading] for v in (np.average(p, weights=s), np.average(r, weights=s), np.average(f1, weights=s)): values += ["{0:0.{1}f}".format(v, digits)] values += ['{0}'.format(np.sum(s))] report += fmt % tuple(values) return report def hamming_loss(y_true, y_pred, classes=None): """Compute the average Hamming loss. The Hamming loss is the fraction of labels that are incorrectly predicted. Read more in the :ref:`User Guide `. Parameters ---------- y_true : 1d array-like, or label indicator array / sparse matrix Ground truth (correct) labels. y_pred : 1d array-like, or label indicator array / sparse matrix Predicted labels, as returned by a classifier. classes : array, shape = [n_labels], optional Integer array of labels. Returns ------- loss : float or int, Return the average Hamming loss between element of ``y_true`` and ``y_pred``. See Also -------- accuracy_score, jaccard_similarity_score, zero_one_loss Notes ----- In multiclass classification, the Hamming loss correspond to the Hamming distance between ``y_true`` and ``y_pred`` which is equivalent to the subset ``zero_one_loss`` function. In multilabel classification, the Hamming loss is different from the subset zero-one loss. The zero-one loss considers the entire set of labels for a given sample incorrect if it does entirely match the true set of labels. Hamming loss is more forgiving in that it penalizes the individual labels. The Hamming loss is upperbounded by the subset zero-one loss. When normalized over samples, the Hamming loss is always between 0 and 1. References ---------- .. [1] Grigorios Tsoumakas, Ioannis Katakis. Multi-Label Classification: An Overview. International Journal of Data Warehousing & Mining, 3(3), 1-13, July-September 2007. .. [2] `Wikipedia entry on the Hamming distance `_ Examples -------- >>> from sklearn.metrics import hamming_loss >>> y_pred = [1, 2, 3, 4] >>> y_true = [2, 2, 3, 4] >>> hamming_loss(y_true, y_pred) 0.25 In the multilabel case with binary label indicators: >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))) 0.75 """ y_type, y_true, y_pred = _check_targets(y_true, y_pred) if classes is None: classes = unique_labels(y_true, y_pred) else: classes = np.asarray(classes) if y_type.startswith('multilabel'): n_differences = count_nonzero(y_true - y_pred) return (n_differences / (y_true.shape[0] * len(classes))) elif y_type in ["binary", "multiclass"]: return sp_hamming(y_true, y_pred) else: raise ValueError("{0} is not supported".format(y_type)) def log_loss(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None): """Log loss, aka logistic loss or cross-entropy loss. This is the loss function used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative log-likelihood of the true labels given a probabilistic classifier's predictions. For a single sample with true label yt in {0,1} and estimated probability yp that yt = 1, the log loss is -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp)) Read more in the :ref:`User Guide `. Parameters ---------- y_true : array-like or label indicator matrix Ground truth (correct) labels for n_samples samples. y_pred : array-like of float, shape = (n_samples, n_classes) Predicted probabilities, as returned by a classifier's predict_proba method. eps : float Log loss is undefined for p=0 or p=1, so probabilities are clipped to max(eps, min(1 - eps, p)). normalize : bool, optional (default=True) If true, return the mean loss per sample. Otherwise, return the sum of the per-sample losses. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- loss : float Examples -------- >>> log_loss(["spam", "ham", "ham", "spam"], # doctest: +ELLIPSIS ... [[.1, .9], [.9, .1], [.8, .2], [.35, .65]]) 0.21616... References ---------- C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer, p. 209. Notes ----- The logarithm used is the natural logarithm (base-e). """ lb = LabelBinarizer() T = lb.fit_transform(y_true) if T.shape[1] == 1: T = np.append(1 - T, T, axis=1) # Clipping Y = np.clip(y_pred, eps, 1 - eps) # This happens in cases when elements in y_pred have type "str". if not isinstance(Y, np.ndarray): raise ValueError("y_pred should be an array of floats.") # If y_pred is of single dimension, assume y_true to be binary # and then check. if Y.ndim == 1: Y = Y[:, np.newaxis] if Y.shape[1] == 1: Y = np.append(1 - Y, Y, axis=1) # Check if dimensions are consistent. check_consistent_length(T, Y) T = check_array(T) Y = check_array(Y) if T.shape[1] != Y.shape[1]: raise ValueError("y_true and y_pred have different number of classes " "%d, %d" % (T.shape[1], Y.shape[1])) # Renormalize Y /= Y.sum(axis=1)[:, np.newaxis] loss = -(T * np.log(Y)).sum(axis=1) return _weighted_sum(loss, sample_weight, normalize) def hinge_loss(y_true, pred_decision, labels=None, sample_weight=None): """Average hinge loss (non-regularized) In binary class case, assuming labels in y_true are encoded with +1 and -1, when a prediction mistake is made, ``margin = y_true * pred_decision`` is always negative (since the signs disagree), implying ``1 - margin`` is always greater than 1. The cumulated hinge loss is therefore an upper bound of the number of mistakes made by the classifier. In multiclass case, the function expects that either all the labels are included in y_true or an optional labels argument is provided which contains all the labels. The multilabel margin is calculated according to Crammer-Singer's method. As in the binary case, the cumulated hinge loss is an upper bound of the number of mistakes made by the classifier. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] True target, consisting of integers of two values. The positive label must be greater than the negative label. pred_decision : array, shape = [n_samples] or [n_samples, n_classes] Predicted decisions, as output by decision_function (floats). labels : array, optional, default None Contains all the labels for the problem. Used in multiclass hinge loss. sample_weight : array-like of shape = [n_samples], optional Sample weights. Returns ------- loss : float References ---------- .. [1] `Wikipedia entry on the Hinge loss `_ .. [2] Koby Crammer, Yoram Singer. On the Algorithmic Implementation of Multiclass Kernel-based Vector Machines. Journal of Machine Learning Research 2, (2001), 265-292 .. [3] `L1 AND L2 Regularization for Multiclass Hinge Loss Models by Robert C. Moore, John DeNero. `_ Examples -------- >>> from sklearn import svm >>> from sklearn.metrics import hinge_loss >>> X = [[0], [1]] >>> y = [-1, 1] >>> est = svm.LinearSVC(random_state=0) >>> est.fit(X, y) LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=0, tol=0.0001, verbose=0) >>> pred_decision = est.decision_function([[-2], [3], [0.5]]) >>> pred_decision # doctest: +ELLIPSIS array([-2.18..., 2.36..., 0.09...]) >>> hinge_loss([-1, 1, 1], pred_decision) # doctest: +ELLIPSIS 0.30... In the multiclass case: >>> X = np.array([[0], [1], [2], [3]]) >>> Y = np.array([0, 1, 2, 3]) >>> labels = np.array([0, 1, 2, 3]) >>> est = svm.LinearSVC() >>> est.fit(X, Y) LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='squared_hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0) >>> pred_decision = est.decision_function([[-1], [2], [3]]) >>> y_true = [0, 2, 3] >>> hinge_loss(y_true, pred_decision, labels) #doctest: +ELLIPSIS 0.56... """ check_consistent_length(y_true, pred_decision, sample_weight) pred_decision = check_array(pred_decision, ensure_2d=False) y_true = column_or_1d(y_true) y_true_unique = np.unique(y_true) if y_true_unique.size > 2: if (labels is None and pred_decision.ndim > 1 and (np.size(y_true_unique) != pred_decision.shape[1])): raise ValueError("Please include all labels in y_true " "or pass labels as third argument") if labels is None: labels = y_true_unique le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) mask = np.ones_like(pred_decision, dtype=bool) mask[np.arange(y_true.shape[0]), y_true] = False margin = pred_decision[~mask] margin -= np.max(pred_decision[mask].reshape(y_true.shape[0], -1), axis=1) else: # Handles binary class case # this code assumes that positive and negative labels # are encoded as +1 and -1 respectively pred_decision = column_or_1d(pred_decision) pred_decision = np.ravel(pred_decision) lbin = LabelBinarizer(neg_label=-1) y_true = lbin.fit_transform(y_true)[:, 0] try: margin = y_true * pred_decision except TypeError: raise TypeError("pred_decision should be an array of floats.") losses = 1 - margin # The hinge_loss doesn't penalize good enough predictions. losses[losses <= 0] = 0 return np.average(losses, weights=sample_weight) def _check_binary_probabilistic_predictions(y_true, y_prob): """Check that y_true is binary and y_prob contains valid probabilities""" check_consistent_length(y_true, y_prob) labels = np.unique(y_true) if len(labels) != 2: raise ValueError("Only binary classification is supported. " "Provided labels %s." % labels) if y_prob.max() > 1: raise ValueError("y_prob contains values greater than 1.") if y_prob.min() < 0: raise ValueError("y_prob contains values less than 0.") return label_binarize(y_true, labels)[:, 0] def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None): """Compute the Brier score. The smaller the Brier score, the better, hence the naming with "loss". Across all items in a set N predictions, the Brier score measures the mean squared difference between (1) the predicted probability assigned to the possible outcomes for item i, and (2) the actual outcome. Therefore, the lower the Brier score is for a set of predictions, the better the predictions are calibrated. Note that the Brier score always takes on a value between zero and one, since this is the largest possible difference between a predicted probability (which must be between zero and one) and the actual outcome (which can take on values of only 0 and 1). The Brier score is appropriate for binary and categorical outcomes that can be structured as true or false, but is inappropriate for ordinal variables which can take on three or more values (this is because the Brier score assumes that all possible outcomes are equivalently "distant" from one another). Which label is considered to be the positive label is controlled via the parameter pos_label, which defaults to 1. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape (n_samples,) True targets. y_prob : array, shape (n_samples,) Probabilities of the positive class. sample_weight : array-like of shape = [n_samples], optional Sample weights. pos_label : int (default: None) Label of the positive class. If None, the maximum label is used as positive class Returns ------- score : float Brier score Examples -------- >>> import numpy as np >>> from sklearn.metrics import brier_score_loss >>> y_true = np.array([0, 1, 1, 0]) >>> y_true_categorical = np.array(["spam", "ham", "ham", "spam"]) >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3]) >>> brier_score_loss(y_true, y_prob) # doctest: +ELLIPSIS 0.037... >>> brier_score_loss(y_true, 1-y_prob, pos_label=0) # doctest: +ELLIPSIS 0.037... >>> brier_score_loss(y_true_categorical, y_prob, \ pos_label="ham") # doctest: +ELLIPSIS 0.037... >>> brier_score_loss(y_true, np.array(y_prob) > 0.5) 0.0 References ---------- http://en.wikipedia.org/wiki/Brier_score """ y_true = column_or_1d(y_true) y_prob = column_or_1d(y_prob) if pos_label is None: pos_label = y_true.max() y_true = np.array(y_true == pos_label, int) y_true = _check_binary_probabilistic_predictions(y_true, y_prob) return np.average((y_true - y_prob) ** 2, weights=sample_weight)