Source code for rankereval.metrics

import numpy as np
from .data import BinaryLabels, Labels, Rankings


[docs]class Metric(object):
[docs] def score(self, y_true, y_pred): r""" Individual scores for each ranking. Parameters ---------- y_true : :class:`~rankereval.data.Labels` Ground truth labels. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. ValueError if `n_bootstrap_samples`, `confidence` or `nan_handling` contain invalid values. """ if not isinstance(y_true, Labels): raise TypeError("y_true must be of type Labels") if not isinstance(y_pred, Rankings): raise TypeError("y_pred must be of type Rankings") y_pred_labels = y_true.get_labels_for(y_pred, self._k) return self._score(y_true, y_pred_labels)
@classmethod def _bootstrap_ci(cls, scores, n_bootstrap_samples, confidence): if not isinstance(n_bootstrap_samples, int) or n_bootstrap_samples <= 1: raise ValueError("n_bootstrap_samples must be int > 1") elif not isinstance(confidence, float) or confidence <= 0.0 or confidence >= 1.0: raise ValueError("Confidence must be float and 0 < confidence < 1") if len(scores): resamples = np.random.choice( scores, (len(scores), n_bootstrap_samples), replace=True) bootstrap_means = resamples.mean(axis=0) # Compute "percentile bootstrap" alpha_2 = (1 - confidence) / 2.0 lower_ci = np.quantile(bootstrap_means, alpha_2) upper_ci = np.quantile(bootstrap_means, 1.0 - alpha_2) return (lower_ci, upper_ci) else: return (float('nan'), float('nan'))
[docs] def mean(self, y_true, y_pred, nan_handling='propagate', conf_interval=False, n_bootstrap_samples=1000, confidence=0.95): r""" Mean score over all ranking after handling NaN values. Parameters ---------- y_true : :class:`~rankereval.data.Labels` Ground truth labels, see also above. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. nan_handling : {'propagate', 'drop', 'zerofill'}, optional `'propagate'` (default): Return NaN if any value is NaN `'drop'` : Ignore NaN values `'zerofill'` : Replace NaN values with zero conv_interval : bool, optional If True, then return bootstrapped confidence intervals of mean, otherwise interval is None. Defaults to False. n_bootstrap_samples : int, optional Number of bootstrap samples to draw. confidence : float, optional Indicates width of confidence interval. Default is 0.95 (95%). Returns ------- mean: float `mean` if `conv_interval` is `false` otherwise Dictionary with ``mean["score"]`` and ``mean["conf_interval"]`` for the confidence interval tuple `(lower CI, upper CI)`. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. ValueError if `n_bootstrap_samples`, `confidence` or `nan_handling` contain invalid values. """ scores = self.score(y_true, y_pred) if nan_handling == 'drop': scores = scores[~np.isnan(scores)] elif nan_handling == 'zerofill': scores = np.nan_to_num(scores) elif nan_handling == "propagate": if np.isnan(scores).sum(): scores = [] else: raise ValueError( 'nan_handling must be "propagate", "drop" or "zerofill"') if conf_interval: ci = self._bootstrap_ci(scores, n_bootstrap_samples, confidence) else: ci = None if len(scores): mean = scores.mean() else: mean = float('nan') if conf_interval: return {"score": mean, "conf_interval": ci} else: return mean
def name(self): if self._k is None: k = "" else: k = f"@{self._k}" return self.__class__.__name__ + k
class BinaryMetric(Metric): def __init__(self, k): if not isinstance(k, int) or k <= 0: raise ValueError("Cutoff k needs to be integer > 0") self._k = k def score(self, y_true, y_pred): if not isinstance(y_true, BinaryLabels): raise TypeError(f"y_true must be of type BinaryLabels but is of instance {type(y_true)}") return super().score(y_true, y_pred)
[docs]class Precision(BinaryMetric): """ Parameters ---------- k : int specifies number of top results `k` of each ranking to be evaluated. truncated : bool if `true`, `k` gets clipped at length of input ranking. Raises ------ ValueError if `k` is not integer > 0. """ def __init__(self, k, truncated=False): super().__init__(k) self._truncated = truncated def _precision(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) n_relevant = np.sum(y_pred_labels[:, :self._k] == 1, axis=-1).filled(0) if self._truncated: items = np.broadcast_to(~y_pred_labels.mask, y_pred_labels.shape) n_items_in_y_pred = items.sum(axis=1).flatten() # not defined if there are no relevant labels scores = np.NaN * np.zeros_like(n_relevant, dtype=float) valid = (n_items_in_y_pred > 0) & (n_pos > 0) scores[valid] = n_relevant[valid].astype(float) / np.minimum(n_items_in_y_pred[valid], self._k) else: scores = n_relevant.astype(float) / self._k # not defined if there are no relevant labels scores[n_pos == 0] = np.NaN return scores def _score(self, y_true, y_pred_labels): return self._precision(y_true, y_pred_labels)
[docs] def score(self, y_true, y_pred): r""" Computes Precision@k [MN]_ of each ranking *y* in `y_pred` as .. math:: \mathrm{Precision}@k(y) = \frac{\sum_{i=1}^{k'} \mathrm{rel}(y_i)}{k'}, where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. If `truncated` then :math:`k' = \min(k,|y|)`, i.e., `k'` can never exceed the length of the input ranking; otherwise :math:`k' = k` (default). Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{Precision}@k(y) = 0`. 2. There are no relevant items in `y_true`: :math:`\mathrm{Precision}@k(y) =` NaN. This marks invalid instances explicitly and is consistent with Recall. Examples -------- >>> from rankereval import BinaryLabels, Rankings, Precision >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1,2,3]]) >>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]]) >>> Precision(3).score(y_true, y_pred) array([0. , 0.66666667]) """ return super().score(y_true, y_pred)
[docs]class Recall(BinaryMetric): """ Parameters ---------- k : int specifies number of top results `k` of each ranking to be evaluated. truncated : bool if `true`, number of relevant results gets clipped at `k`. Raises ------ ValueError if `k` is not integer > 0. """ def __init__(self, k, truncated=False): super().__init__(k) self._truncated = truncated def _recall(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) n_relevant = np.sum(y_pred_labels[:, :self._k] == 1, axis=-1).filled(0) scores = np.NaN * np.zeros_like(n_relevant, dtype=float) if self._truncated: denominator = np.clip(n_pos[n_pos > 0], None, self._k) else: denominator = n_pos[n_pos > 0] scores[n_pos > 0] = n_relevant[n_pos > 0].astype(float) / denominator return scores def _score(self, y_true, y_pred_labels): return self._recall(y_true, y_pred_labels)
[docs] def score(self, y_true, y_pred): r""" Computes Recall@k [MN]_ as the fraction of relevant results in `y_true` that were in the top *k* results of `y_pred`. More formally, the recall of each ranking *y* in `y_pred` with labels `y_true` is defined as .. math:: \mathrm{Recall}@k(y) = \frac{\sum_{i=1}^{k'} \mathrm{rel}(y_i)}{n_{rel}}, where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y* and :math:`n_{rel} = \left\| y_{true} \right\| _1` if `truncated` is false (default) and :math:`n_{rel} = \max(\left\| y_{true} \right\| _1, k)` otherwise. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{Recall}@k(y) = 0`. 2. There are no relevant items in `y_true`: :math:`\mathrm{Recall}@k(y) =` NaN. This marks invalid instances explicitly. Examples -------- >>> from rankereval import BinaryLabels, Rankings, Recall >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]]) >>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]]) >>> Recall(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([0., 1.]) """ return super().score(y_true, y_pred)
[docs]class F1(Precision, Recall):
[docs] def score(self, y_true, y_pred): r""" Computes F1 [MN]_ as harmonic mean of Precision@k and Recall@k. More formally, the F1 score of each ranking *y* in `y_pred` is defined as .. math:: \mathrm{F1}@k(y) = \frac{2*\big(\mathrm{Precision}@k(y) * \mathrm{Recall}@k(y)\big)}{\mathrm{Precision}@k(y) + \mathrm{Recall}@k(y)}. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. In this case, :math:`\mathrm{Recall}@k(y) = \mathrm{Precision}@k(y) = 0` . 2. If :math:`\mathrm{Recall}@k(y) = \mathrm{Precision}@k(y) = 0`, we define :math:`\mathrm{F1}@k(y) = 0`. 3. There are no relevant items in `y_true`: :math:`\mathrm{F1}@k(y) =` NaN. Examples -------- >>> from rankereval import BinaryLabels, Rankings, F1 >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]]) >>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]]) >>> F1(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([0. , 0.5]) """ return super().score(y_true, y_pred)
def _score(self, y_true, y_pred_labels): recall = self._recall(y_true, y_pred_labels) precision = self._precision(y_true, y_pred_labels) product = 2 * recall * precision sm = recall + precision # return 0 for geometric mean if both are zero scores = np.zeros_like(product, dtype=float) valid = np.nan_to_num(product) > 0 invalid = np.isnan(product) scores[valid] = product[valid] / sm[valid] scores[invalid] = np.NaN return scores
[docs]class HitRate(Recall): """ Parameters ---------- k : int specifies number of top results `k` of each ranking to be evaluated. Raises ------ ValueError if `k` is not integer > 0. """
[docs] def score(self, y_true, y_pred): r""" Computes HitRate@k [MN]_ as the whether a relevant item occurs in *k* results of `y_pred`. Differs from Recall@k in that `y_true` has to contain exactly one element per row. More formally, the HitRate of each ranking *y* in `y_pred` with labels `y_true` is defined as .. math:: \mathrm{HitRate}@k(y) &= \sum_{i=1}^{k'} \mathrm{rel}(y_i), k' &= \min(k,|y|); where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary and exactly one relevant item per row. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{HitRate}@k(y) = 0`. 2. There is not exactly one relevant item in `y_true`: :math:`\mathrm{HitRate}@k(y) =` NaN. Examples -------- >>> from rankereval import BinaryLabels, Rankings, HitRate >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0],[1]]) >>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]]) >>> HitRate(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([0., 1.]) """ return super().score(y_true, y_pred)
def _score(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) scores = self._recall(y_true, y_pred_labels) scores[n_pos != 1] = np.NaN # Not defined for no or multiple positives return scores
[docs]class ReciprocalRank(BinaryMetric): """ Parameters ---------- k : int specifies number of top results `k` of each ranking to be evaluated. Raises ------ ValueError if `k` is not integer > 0. """ def _score(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) labels = y_pred_labels[:, :self._k].filled(0) ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1) # It is 1/rank if document appears in top k, 0 otherwise scores = np.max(labels / ranks, axis=-1, initial=0.0) scores[n_pos == 0] = np.NaN # Not defined for no multiple positives return scores
[docs] def score(self, y_true, y_pred): r""" Computes ReciprocalRank@k [NC]_ as the rank where the first relevant item occurs in the top *k* results of `y_pred`. More formally, the ReciprocalRank of each ranking *y* in `y_pred` is defined as .. math:: \mathrm{ReciprocalRank}@k(y) &= \max_{i=1,\ldots,k'} \frac{\mathrm{rel}(y_i)}{i}, k' &= \min(k,|y|); where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty or first relevant item appears beyond rank *k*. Per definition above, :math:`\mathrm{ReciprocalRank}@k(y) = 0`. 2. There is no relevant item in `y_true`: :math:`\mathrm{ReciprocalRank}@k(y) =` NaN. Examples -------- >>> from rankereval import BinaryLabels, Rankings, ReciprocalRank >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]]) >>> y_pred = Rankings.from_ranked_indices([[3,0,1], [1,2]]) >>> ReciprocalRank(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([0.5, 1. ]) """ return super().score(y_true, y_pred)
[docs]class MeanRanks(BinaryMetric): """ Used for evaluating permutations of `y_true`. Does not accept *k* as it scores permutations. """ def __init__(self): self._k = None def _score(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) labels = y_pred_labels.filled(0) ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1) scores = np.sum(ranks * labels, axis=-1) scores[n_pos > 0] = scores[n_pos > 0] / n_pos[n_pos > 0] scores[n_pos == 0] = np.NaN return scores
[docs] def score(self, y_true, y_pred): r""" Computes MeanRanks as the mean of ranks of relevant items `y_pred`. More formally, it is defined for each ranking *y* in `y_pred` as .. math:: \mathrm{MeanRanks}(y) = \frac{\sum_{i=1}^{|y|} i\cdot\mathrm{rel}(y_i)}{\sum_{i=1}^{|y|} \mathrm{rel}(y_i)}, where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. There is no relevant item in `y_true`: :math:`\mathrm{MeanRanks}(y) =` NaN. Examples -------- >>> from rankereval import BinaryLabels, Rankings, MeanRanks >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]]) >>> y_pred = Rankings.from_ranked_indices([[3,0,5], [1,2]]) >>> MeanRanks().score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([2.5, 1. ]) """ return super().score(y_true, y_pred)
[docs]class FirstRelevantRank(BinaryMetric): """ Used for evaluating permutations of `y_true`. Does not accept *k* as it scores permutations. """ def __init__(self): self._k = None def _score(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) labels = y_pred_labels.filled(0) ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1) ranks = ranks * labels ranks[ranks == 0] = np.inf scores = np.min(ranks, axis=-1) scores[n_pos == 0] = np.NaN return scores
[docs] def score(self, y_true, y_pred): r""" Computes FirstRelevantRank as the mean of ranks of relevant items `y_pred`. More formally, it is defined for each ranking *y* in `y_pred` as .. math:: \mathrm{FirstRelevantRank}(y) = \min \{i \mid \mathrm{rel}(y_i) = 1\}, where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. There is no relevant item in `y_true`: :math:`\mathrm{FirstRelevantRank}(y) =` NaN. Examples -------- >>> from rankereval import BinaryLabels, Rankings, FirstRelevantRank >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]]) >>> y_pred = Rankings.from_ranked_indices([[3,0,5], [1,2]]) >>> FirstRelevantRank().score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([2., 1. ]) """ return super().score(y_true, y_pred)
[docs]class AP(BinaryMetric): """ Parameters ---------- k : int specifies number of top results `k` of each ranking to be evaluated. Raises ------ ValueError if `k` is not integer > 0. """ def _score(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) labels = y_pred_labels[:, :self._k].filled(0) ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1) precision = np.cumsum(labels, axis=-1) / ranks scores = np.zeros_like(n_pos, dtype=float) scores[n_pos > 0] = np.sum( precision * labels, axis=-1)[n_pos > 0] / np.clip(n_pos[n_pos > 0], None, self._k) scores[n_pos == 0] = np.NaN return scores
[docs] def score(self, y_true, y_pred): r""" Computes AveragePrecision@k [MN]_, an approximation to the the area under the precision-recall curve, of each ranking *y* in `y_pred` as .. math:: \mathrm{AveragePrecision}@k(y) &= \frac{1}{Z} \sum_{i=1}^{k'} \mathrm{rel}(y_i)\cdot \mathrm{Precision}@i(y), k' &= \min(k,|y|), Z &= \min \big(k, \left\| y_{true} \right\| _1\big); where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. .. note:: Sometimes the denominator *Z* is defined with respect to only the retrieved or recommended set of items `y_pred`. This is not desirable as AP could be artificially inflated, e.g., by returning only one relevant item at the top and then filling up the ranking with and irrelevant items. Parameters ---------- y_true : :class:`~rankereval.data.BinaryLabels` Ground truth labels, must be binary. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{AveragePrecision}@k(y) = 0`. 2. There are no relevant items in `y_true`: :math:`\mathrm{AveragePrecision}@k(y) =` NaN to make it consistent with other metrics. 3. There are no relevant items in `y_pred` up to *k*: Per definition above, :math:`\mathrm{AveragePrecision}@k(y) = 0`. Examples -------- >>> from rankereval import BinaryLabels, Rankings, AP >>> # use separate labels for each ranking >>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1,2,3]]) >>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]]) >>> AP(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([0. , 0.66666667]) """ return super().score(y_true, y_pred)
[docs]class DCG(Metric): r""" Parameters ---------- k : int specifies number of top results `k` of each ranking to be evaluated. relevance_scaling : str, ['binary' (default), 'power'] Determines are relevance labels are transformed: `'identity'`: (default) :math:`f(\mathrm{rel}(y_i)) = \mathrm{rel}(y_i)` `'power'`: :math:`f(\mathrm{rel}(y_i)) = 2^{\mathrm{rel}(y_i)} - 1` log_base : str, ['e' (default), '2'] Determines what log base is used in denominator. The smaller this value, the heavier emphasis on top-ranked documents. `'e'` (default): Natural logarithm :math:`\ln` `'2'`: :math:`\log_2` Notes ----- The original definition of (n)DCG [KJ]_ uses 'identity' for `relevance_scaling`, but leaves the choice of `log_base` open. Raises ------ ValueError if `k` is not integer > 0 or `relevance_scaling` or `log_base` are invalid. """ SCALERS = {'identity': lambda x: x, 'power': lambda x: np.power(x, 2) - 1} LOGS = {'2': lambda x: np.log2(x), 'e': lambda x: np.log(x)} def __init__(self, k=None, relevance_scaling='identity', log_base='e'): self._k = k if relevance_scaling not in self.SCALERS: raise ValueError( "Relevance scaling must be 'identity' or 'power'.") if log_base not in self.LOGS: raise ValueError("Log base needs to be 'e' or '2'.") self._rel_scale = self.SCALERS[relevance_scaling] self._log_fct = self.LOGS[log_base] def _dcg(self, y_true, y_pred_labels): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) labels = y_pred_labels[:, :self._k].filled(0) ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1) scores = np.sum( self._rel_scale(labels) / self._log_fct( ranks + 1), axis=- 1) scores[n_pos == 0] = np.NaN return scores def _score(self, y_true, y_pred_labels): return self._dcg(y_true, y_pred_labels)
[docs] def score(self, y_true, y_pred): r""" Computes Discounted Cumulative Gain at k (DCG@k) [KJ]_ as a weighted sum of relevance labels of top *k* results of `y_pred`. More formally, the recall of each ranking *y* in `y_pred` with labels `y_true` is defined as .. math:: \mathrm{DCG}@k(y) &= \sum_{i=1}^{k'} \frac{f(\mathrm{rel}(y_i))}{\log_b(i + 1)}, k' &= \min(k,|y|); where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i* in the ranking *y*. *f* is the `relevance_scaling` function and *b* the `log_base` parameters defined earlier. Parameters ---------- y_true : :class:`~rankereval.data.Labels` Ground truth labels, binary or numeric. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{DCG}@k(y) = 0`. 2. There are no items with relevance > 0 in `y_true`: :math:`\mathrm{DCG}@k(y) =` NaN to make it consistent with other metrics. Examples -------- >>> from rankereval import NumericLabels, Rankings, DCG >>> # use separate labels for each ranking >>> y_true = NumericLabels.from_matrix([[1, 2, 3], [4, 5]]) >>> y_pred = Rankings.from_ranked_indices([[0,2,1], [1,0]]) >>> DCG(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([ 5.61610776, 10.85443211]) """ return super().score(y_true, y_pred)
[docs]class NDCG(DCG): """ For a description of the arguments, see :class:`DCG`. """ def _score(self, y_true, y_pred_labels): dcg = self._dcg(y_true, y_pred_labels) ideal_labels = y_true.get_labels_for(y_true.as_rankings(), self._k) idcg = self._dcg(y_true, ideal_labels) return dcg / idcg
[docs] def score(self, y_true, y_pred): r""" Computes the *normalized* Discounted Cumulative Gain at k (nDCG@k) [KJ]_ as a weighted sum of relevance labels of top *k* results of `y_pred`, normalized to the range [0, 1]. More formally, the nDCG of each ranking *y* in `y_pred` with labels `y_true` is defined as .. math:: \mathrm{nDCG}@k(y) = \begin{cases} \frac{\mathrm{DCG}@k(y)} {\mathrm{IDCG}@k(y)} &\mbox{if } \mathrm{IDCG}@k(y) > 0 \\ 0 & \mbox{otherwise } \end{cases}, where :math:`\mathrm{IDCG}@k(y)` is the maximum DCG@k value that can be achieved on *all* relevance labels (i.e., DCG@k of the sorted relevance labels). .. note:: Sometimes IDCG is defined with respect to only the retrieved or recommended set of items *y*. This is not desirable as nDCG could be artificially inflated, e.g., by returning only one relevant item at the top and then filling up the ranking with and irrelevant items. Parameters ---------- y_true : :class:`~rankereval.data.Labels` Ground truth labels, binary or numeric. y_pred : :class:`~rankereval.data.Rankings` Rankings to be evaluated. If `y_true` only contains one row, the labels in this row will be used for every ranking in `y_pred`. Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`. Returns ------- computed_metric: ndarray, shape (n_rankings, ) Computed metric for each ranking. Raises ------ TypeError if `y_true` or `y_pred` are of incorrect type. Notes ----- Edge cases: 1. `y` is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{DCG}@k(y) = 0`. 2. There are no items with relevance > 0 in `y_true`: :math:`\mathrm{DCG}@k(y) =` NaN to make it consistent with other metrics. 3. There are no items with relevance > 0 in `y` up to *k*: :math:`\mathrm{nDCG}@k(y) = 0`. Examples -------- >>> from rankereval import NumericLabels, Rankings, NDCG >>> # use separate labels for each ranking >>> y_true = NumericLabels.from_matrix([[1, 2, 3], [4, 5]]) >>> y_pred = Rankings.from_ranked_indices([[0,2,1], [1,0]]) >>> NDCG(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE array([0.81749351, 1. ]) """ return super().score(y_true, y_pred)