import numpy as np
from .data import BinaryLabels, Labels, Rankings
[docs]class Metric(object):
[docs] def score(self, y_true, y_pred):
r"""
Individual scores for each ranking.
Parameters
----------
y_true : :class:`~rankereval.data.Labels`
Ground truth labels.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
ValueError
if `n_bootstrap_samples`, `confidence` or `nan_handling` contain invalid values.
"""
if not isinstance(y_true, Labels):
raise TypeError("y_true must be of type Labels")
if not isinstance(y_pred, Rankings):
raise TypeError("y_pred must be of type Rankings")
y_pred_labels = y_true.get_labels_for(y_pred, self._k)
return self._score(y_true, y_pred_labels)
@classmethod
def _bootstrap_ci(cls, scores, n_bootstrap_samples, confidence):
if not isinstance(n_bootstrap_samples,
int) or n_bootstrap_samples <= 1:
raise ValueError("n_bootstrap_samples must be int > 1")
elif not isinstance(confidence, float) or confidence <= 0.0 or confidence >= 1.0:
raise ValueError("Confidence must be float and 0 < confidence < 1")
if len(scores):
resamples = np.random.choice(
scores, (len(scores), n_bootstrap_samples), replace=True)
bootstrap_means = resamples.mean(axis=0)
# Compute "percentile bootstrap"
alpha_2 = (1 - confidence) / 2.0
lower_ci = np.quantile(bootstrap_means, alpha_2)
upper_ci = np.quantile(bootstrap_means, 1.0 - alpha_2)
return (lower_ci, upper_ci)
else:
return (float('nan'), float('nan'))
[docs] def mean(self, y_true, y_pred, nan_handling='propagate',
conf_interval=False, n_bootstrap_samples=1000, confidence=0.95):
r"""
Mean score over all ranking after handling NaN values.
Parameters
----------
y_true : :class:`~rankereval.data.Labels`
Ground truth labels, see also above.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated.
nan_handling : {'propagate', 'drop', 'zerofill'}, optional
`'propagate'` (default):
Return NaN if any value is NaN
`'drop'` :
Ignore NaN values
`'zerofill'` :
Replace NaN values with zero
conv_interval : bool, optional
If True, then return bootstrapped confidence intervals of mean,
otherwise interval is None.
Defaults to False.
n_bootstrap_samples : int, optional
Number of bootstrap samples to draw.
confidence : float, optional
Indicates width of confidence interval. Default is 0.95 (95%).
Returns
-------
mean: float `mean` if `conv_interval` is `false` otherwise
Dictionary with ``mean["score"]`` and ``mean["conf_interval"]``
for the confidence interval tuple `(lower CI, upper CI)`.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
ValueError
if `n_bootstrap_samples`, `confidence` or `nan_handling` contain invalid
values.
"""
scores = self.score(y_true, y_pred)
if nan_handling == 'drop':
scores = scores[~np.isnan(scores)]
elif nan_handling == 'zerofill':
scores = np.nan_to_num(scores)
elif nan_handling == "propagate":
if np.isnan(scores).sum():
scores = []
else:
raise ValueError(
'nan_handling must be "propagate", "drop" or "zerofill"')
if conf_interval:
ci = self._bootstrap_ci(scores, n_bootstrap_samples, confidence)
else:
ci = None
if len(scores):
mean = scores.mean()
else:
mean = float('nan')
if conf_interval:
return {"score": mean, "conf_interval": ci}
else:
return mean
def name(self):
if self._k is None:
k = ""
else:
k = f"@{self._k}"
return self.__class__.__name__ + k
class BinaryMetric(Metric):
def __init__(self, k):
if not isinstance(k, int) or k <= 0:
raise ValueError("Cutoff k needs to be integer > 0")
self._k = k
def score(self, y_true, y_pred):
if not isinstance(y_true, BinaryLabels):
raise TypeError(f"y_true must be of type BinaryLabels but is of instance {type(y_true)}")
return super().score(y_true, y_pred)
[docs]class Precision(BinaryMetric):
"""
Parameters
----------
k : int
specifies number of top results `k` of each ranking to be evaluated.
truncated : bool
if `true`, `k` gets clipped at length of input ranking.
Raises
------
ValueError
if `k` is not integer > 0.
"""
def __init__(self, k, truncated=False):
super().__init__(k)
self._truncated = truncated
def _precision(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
n_relevant = np.sum(y_pred_labels[:, :self._k] == 1, axis=-1).filled(0)
if self._truncated:
items = np.broadcast_to(~y_pred_labels.mask, y_pred_labels.shape)
n_items_in_y_pred = items.sum(axis=1).flatten()
# not defined if there are no relevant labels
scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
valid = (n_items_in_y_pred > 0) & (n_pos > 0)
scores[valid] = n_relevant[valid].astype(float) / np.minimum(n_items_in_y_pred[valid], self._k)
else:
scores = n_relevant.astype(float) / self._k
# not defined if there are no relevant labels
scores[n_pos == 0] = np.NaN
return scores
def _score(self, y_true, y_pred_labels):
return self._precision(y_true, y_pred_labels)
[docs] def score(self, y_true, y_pred):
r"""
Computes Precision@k [MN]_ of each ranking *y* in `y_pred` as
.. math::
\mathrm{Precision}@k(y) = \frac{\sum_{i=1}^{k'} \mathrm{rel}(y_i)}{k'},
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i*
in the ranking *y*. If `truncated` then :math:`k' = \min(k,|y|)`, i.e., `k'`
can never exceed the length of the input ranking; otherwise :math:`k' = k` (default).
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above,
:math:`\mathrm{Precision}@k(y) = 0`.
2. There are no relevant items in `y_true`: :math:`\mathrm{Precision}@k(y) =` NaN.
This marks invalid instances explicitly and is consistent with Recall.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, Precision
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1,2,3]])
>>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]])
>>> Precision(3).score(y_true, y_pred)
array([0. , 0.66666667])
"""
return super().score(y_true, y_pred)
[docs]class Recall(BinaryMetric):
"""
Parameters
----------
k : int
specifies number of top results `k` of each ranking to be evaluated.
truncated : bool
if `true`, number of relevant results gets clipped at `k`.
Raises
------
ValueError
if `k` is not integer > 0.
"""
def __init__(self, k, truncated=False):
super().__init__(k)
self._truncated = truncated
def _recall(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
n_relevant = np.sum(y_pred_labels[:, :self._k] == 1, axis=-1).filled(0)
scores = np.NaN * np.zeros_like(n_relevant, dtype=float)
if self._truncated:
denominator = np.clip(n_pos[n_pos > 0], None, self._k)
else:
denominator = n_pos[n_pos > 0]
scores[n_pos > 0] = n_relevant[n_pos > 0].astype(float) / denominator
return scores
def _score(self, y_true, y_pred_labels):
return self._recall(y_true, y_pred_labels)
[docs] def score(self, y_true, y_pred):
r"""
Computes Recall@k [MN]_ as the fraction of relevant results in `y_true` that were
in the top *k* results of `y_pred`.
More formally, the recall of each ranking *y* in `y_pred` with
labels `y_true` is defined as
.. math::
\mathrm{Recall}@k(y) =
\frac{\sum_{i=1}^{k'} \mathrm{rel}(y_i)}{n_{rel}},
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i*
in the ranking *y* and :math:`n_{rel} = \left\| y_{true} \right\| _1` if `truncated`
is false (default) and :math:`n_{rel} = \max(\left\| y_{true} \right\| _1, k)` otherwise.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. Per definition above,
:math:`\mathrm{Recall}@k(y) = 0`.
2. There are no relevant items in `y_true`: :math:`\mathrm{Recall}@k(y) =` NaN. This
marks invalid instances explicitly.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, Recall
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]])
>>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]])
>>> Recall(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([0., 1.])
"""
return super().score(y_true, y_pred)
[docs]class F1(Precision, Recall):
[docs] def score(self, y_true, y_pred):
r"""
Computes F1 [MN]_ as harmonic mean of Precision@k and Recall@k.
More formally, the F1 score of each ranking *y* in `y_pred` is defined as
.. math::
\mathrm{F1}@k(y) = \frac{2*\big(\mathrm{Precision}@k(y) *
\mathrm{Recall}@k(y)\big)}{\mathrm{Precision}@k(y) + \mathrm{Recall}@k(y)}.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`. In this case,
:math:`\mathrm{Recall}@k(y) = \mathrm{Precision}@k(y) = 0` .
2. If :math:`\mathrm{Recall}@k(y) = \mathrm{Precision}@k(y) = 0`,
we define :math:`\mathrm{F1}@k(y) = 0`.
3. There are no relevant items in `y_true`: :math:`\mathrm{F1}@k(y) =` NaN.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, F1
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]])
>>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]])
>>> F1(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([0. , 0.5])
"""
return super().score(y_true, y_pred)
def _score(self, y_true, y_pred_labels):
recall = self._recall(y_true, y_pred_labels)
precision = self._precision(y_true, y_pred_labels)
product = 2 * recall * precision
sm = recall + precision
# return 0 for geometric mean if both are zero
scores = np.zeros_like(product, dtype=float)
valid = np.nan_to_num(product) > 0
invalid = np.isnan(product)
scores[valid] = product[valid] / sm[valid]
scores[invalid] = np.NaN
return scores
[docs]class HitRate(Recall):
"""
Parameters
----------
k : int
specifies number of top results `k` of each ranking to be evaluated.
Raises
------
ValueError
if `k` is not integer > 0.
"""
[docs] def score(self, y_true, y_pred):
r"""
Computes HitRate@k [MN]_ as the whether a relevant item occurs
in *k* results of `y_pred`.
Differs from Recall@k in that `y_true` has to contain exactly
one element per row.
More formally, the HitRate of each ranking *y* in `y_pred` with labels
`y_true` is defined as
.. math::
\mathrm{HitRate}@k(y) &= \sum_{i=1}^{k'} \mathrm{rel}(y_i),
k' &= \min(k,|y|);
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i*
in the ranking *y*.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary and exactly one relevant item per row.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`.
Per definition above, :math:`\mathrm{HitRate}@k(y) = 0`.
2. There is not exactly one relevant item in
`y_true`: :math:`\mathrm{HitRate}@k(y) =` NaN.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, HitRate
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0],[1]])
>>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]])
>>> HitRate(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([0., 1.])
"""
return super().score(y_true, y_pred)
def _score(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
scores = self._recall(y_true, y_pred_labels)
scores[n_pos != 1] = np.NaN # Not defined for no or multiple positives
return scores
[docs]class ReciprocalRank(BinaryMetric):
"""
Parameters
----------
k : int
specifies number of top results `k` of each ranking to be evaluated.
Raises
------
ValueError
if `k` is not integer > 0.
"""
def _score(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
labels = y_pred_labels[:, :self._k].filled(0)
ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1)
# It is 1/rank if document appears in top k, 0 otherwise
scores = np.max(labels / ranks, axis=-1, initial=0.0)
scores[n_pos == 0] = np.NaN # Not defined for no multiple positives
return scores
[docs] def score(self, y_true, y_pred):
r"""
Computes ReciprocalRank@k [NC]_ as the rank where the first relevant item
occurs in the top *k* results of `y_pred`.
More formally, the ReciprocalRank of each ranking *y* in `y_pred` is defined as
.. math::
\mathrm{ReciprocalRank}@k(y) &= \max_{i=1,\ldots,k'} \frac{\mathrm{rel}(y_i)}{i},
k' &= \min(k,|y|);
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i*
in the ranking *y*.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty or first relevant item appears beyond rank *k*.
Per definition above, :math:`\mathrm{ReciprocalRank}@k(y) = 0`.
2. There is no relevant item in `y_true`: :math:`\mathrm{ReciprocalRank}@k(y) =` NaN.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, ReciprocalRank
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]])
>>> y_pred = Rankings.from_ranked_indices([[3,0,1], [1,2]])
>>> ReciprocalRank(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([0.5, 1. ])
"""
return super().score(y_true, y_pred)
[docs]class MeanRanks(BinaryMetric):
"""
Used for evaluating permutations of `y_true`. Does not accept *k* as it
scores permutations.
"""
def __init__(self):
self._k = None
def _score(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
labels = y_pred_labels.filled(0)
ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1)
scores = np.sum(ranks * labels, axis=-1)
scores[n_pos > 0] = scores[n_pos > 0] / n_pos[n_pos > 0]
scores[n_pos == 0] = np.NaN
return scores
[docs] def score(self, y_true, y_pred):
r"""
Computes MeanRanks as the mean of ranks of relevant items `y_pred`.
More formally, it is defined for each ranking *y* in `y_pred` as
.. math::
\mathrm{MeanRanks}(y) =
\frac{\sum_{i=1}^{|y|} i\cdot\mathrm{rel}(y_i)}{\sum_{i=1}^{|y|} \mathrm{rel}(y_i)},
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i*
in the ranking *y*.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. There is no relevant item in `y_true`: :math:`\mathrm{MeanRanks}(y) =` NaN.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, MeanRanks
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]])
>>> y_pred = Rankings.from_ranked_indices([[3,0,5], [1,2]])
>>> MeanRanks().score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([2.5, 1. ])
"""
return super().score(y_true, y_pred)
[docs]class FirstRelevantRank(BinaryMetric):
"""
Used for evaluating permutations of `y_true`. Does not accept *k* as it
scores permutations.
"""
def __init__(self):
self._k = None
def _score(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
labels = y_pred_labels.filled(0)
ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1)
ranks = ranks * labels
ranks[ranks == 0] = np.inf
scores = np.min(ranks, axis=-1)
scores[n_pos == 0] = np.NaN
return scores
[docs] def score(self, y_true, y_pred):
r"""
Computes FirstRelevantRank as the mean of ranks of relevant items `y_pred`.
More formally, it is defined for each ranking *y* in `y_pred` as
.. math::
\mathrm{FirstRelevantRank}(y) = \min \{i \mid \mathrm{rel}(y_i) = 1\},
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at rank *i*
in the ranking *y*.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. There is no relevant item in `y_true`: :math:`\mathrm{FirstRelevantRank}(y) =` NaN.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, FirstRelevantRank
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1]])
>>> y_pred = Rankings.from_ranked_indices([[3,0,5], [1,2]])
>>> FirstRelevantRank().score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([2., 1. ])
"""
return super().score(y_true, y_pred)
[docs]class AP(BinaryMetric):
"""
Parameters
----------
k : int
specifies number of top results `k` of each ranking to be evaluated.
Raises
------
ValueError
if `k` is not integer > 0.
"""
def _score(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
labels = y_pred_labels[:, :self._k].filled(0)
ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1)
precision = np.cumsum(labels, axis=-1) / ranks
scores = np.zeros_like(n_pos, dtype=float)
scores[n_pos > 0] = np.sum(
precision * labels, axis=-1)[n_pos > 0] / np.clip(n_pos[n_pos > 0], None, self._k)
scores[n_pos == 0] = np.NaN
return scores
[docs] def score(self, y_true, y_pred):
r"""
Computes AveragePrecision@k [MN]_, an approximation to the the area
under the precision-recall curve, of each ranking *y* in `y_pred` as
.. math::
\mathrm{AveragePrecision}@k(y) &= \frac{1}{Z} \sum_{i=1}^{k'}
\mathrm{rel}(y_i)\cdot \mathrm{Precision}@i(y),
k' &= \min(k,|y|),
Z &= \min \big(k, \left\| y_{true} \right\| _1\big);
where :math:`\mathrm{rel}(y_i)` is the relevance label of the item at
rank *i* in the ranking *y*.
.. note::
Sometimes the denominator *Z* is defined with respect to only the
retrieved or recommended set of items `y_pred`. This is not desirable
as AP could be artificially inflated, e.g., by returning only one
relevant item at the top
and then filling up the ranking with and irrelevant items.
Parameters
----------
y_true : :class:`~rankereval.data.BinaryLabels`
Ground truth labels, must be binary.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`.
Per definition above, :math:`\mathrm{AveragePrecision}@k(y) = 0`.
2. There are no relevant items in `y_true`:
:math:`\mathrm{AveragePrecision}@k(y) =` NaN to make it consistent with other metrics.
3. There are no relevant items in `y_pred` up to *k*:
Per definition above, :math:`\mathrm{AveragePrecision}@k(y) = 0`.
Examples
--------
>>> from rankereval import BinaryLabels, Rankings, AP
>>> # use separate labels for each ranking
>>> y_true = BinaryLabels.from_positive_indices([[0, 5],[1,2,3]])
>>> y_pred = Rankings.from_ranked_indices([[3,2,1], [1,2]])
>>> AP(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([0. , 0.66666667])
"""
return super().score(y_true, y_pred)
[docs]class DCG(Metric):
r"""
Parameters
----------
k : int
specifies number of top results `k` of each ranking to be evaluated.
relevance_scaling : str, ['binary' (default), 'power']
Determines are relevance labels are transformed:
`'identity'`: (default)
:math:`f(\mathrm{rel}(y_i)) = \mathrm{rel}(y_i)`
`'power'`:
:math:`f(\mathrm{rel}(y_i)) = 2^{\mathrm{rel}(y_i)} - 1`
log_base : str, ['e' (default), '2']
Determines what log base is used in denominator.
The smaller this value, the heavier emphasis on top-ranked documents.
`'e'` (default):
Natural logarithm :math:`\ln`
`'2'`:
:math:`\log_2`
Notes
-----
The original definition of (n)DCG [KJ]_ uses 'identity' for `relevance_scaling`,
but leaves the choice of `log_base` open.
Raises
------
ValueError
if `k` is not integer > 0 or `relevance_scaling` or `log_base` are invalid.
"""
SCALERS = {'identity': lambda x: x,
'power': lambda x: np.power(x, 2) - 1}
LOGS = {'2': lambda x: np.log2(x), 'e': lambda x: np.log(x)}
def __init__(self, k=None, relevance_scaling='identity', log_base='e'):
self._k = k
if relevance_scaling not in self.SCALERS:
raise ValueError(
"Relevance scaling must be 'identity' or 'power'.")
if log_base not in self.LOGS:
raise ValueError("Log base needs to be 'e' or '2'.")
self._rel_scale = self.SCALERS[relevance_scaling]
self._log_fct = self.LOGS[log_base]
def _dcg(self, y_true, y_pred_labels):
n_pos = y_true.get_n_positives(y_pred_labels.shape[0])
labels = y_pred_labels[:, :self._k].filled(0)
ranks = np.arange(1, labels.shape[1] + 1, dtype=float).reshape(1, -1)
scores = np.sum(
self._rel_scale(labels) /
self._log_fct(
ranks +
1),
axis=-
1)
scores[n_pos == 0] = np.NaN
return scores
def _score(self, y_true, y_pred_labels):
return self._dcg(y_true, y_pred_labels)
[docs] def score(self, y_true, y_pred):
r"""
Computes Discounted Cumulative Gain at k (DCG@k) [KJ]_ as a
weighted sum of relevance labels of top *k* results of `y_pred`.
More formally, the recall of each ranking *y* in `y_pred`
with labels `y_true` is defined as
.. math::
\mathrm{DCG}@k(y) &= \sum_{i=1}^{k'} \frac{f(\mathrm{rel}(y_i))}{\log_b(i + 1)},
k' &= \min(k,|y|);
where :math:`\mathrm{rel}(y_i)` is the relevance label of
the item at rank *i* in the ranking *y*.
*f* is the `relevance_scaling` function and *b* the `log_base`
parameters defined earlier.
Parameters
----------
y_true : :class:`~rankereval.data.Labels`
Ground truth labels, binary or numeric.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. Ranking to be evaluated is empty, i.e., :math:`|y|=0`.
Per definition above, :math:`\mathrm{DCG}@k(y) = 0`.
2. There are no items with relevance > 0 in `y_true`:
:math:`\mathrm{DCG}@k(y) =` NaN to make it consistent with other metrics.
Examples
--------
>>> from rankereval import NumericLabels, Rankings, DCG
>>> # use separate labels for each ranking
>>> y_true = NumericLabels.from_matrix([[1, 2, 3], [4, 5]])
>>> y_pred = Rankings.from_ranked_indices([[0,2,1], [1,0]])
>>> DCG(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([ 5.61610776, 10.85443211])
"""
return super().score(y_true, y_pred)
[docs]class NDCG(DCG):
"""
For a description of the arguments, see :class:`DCG`.
"""
def _score(self, y_true, y_pred_labels):
dcg = self._dcg(y_true, y_pred_labels)
ideal_labels = y_true.get_labels_for(y_true.as_rankings(), self._k)
idcg = self._dcg(y_true, ideal_labels)
return dcg / idcg
[docs] def score(self, y_true, y_pred):
r"""
Computes the *normalized* Discounted Cumulative Gain at k (nDCG@k) [KJ]_
as a weighted sum of relevance labels of top *k* results of `y_pred`,
normalized to the range [0, 1].
More formally, the nDCG of each ranking *y* in `y_pred` with
labels `y_true` is defined as
.. math::
\mathrm{nDCG}@k(y) = \begin{cases} \frac{\mathrm{DCG}@k(y)}
{\mathrm{IDCG}@k(y)} &\mbox{if } \mathrm{IDCG}@k(y) > 0 \\
0 & \mbox{otherwise } \end{cases},
where :math:`\mathrm{IDCG}@k(y)` is the maximum DCG@k value that can
be achieved on *all* relevance labels (i.e., DCG@k of the sorted relevance labels).
.. note::
Sometimes IDCG is defined with respect to only
the retrieved or recommended set of items *y*. This is not desirable
as nDCG could be artificially inflated, e.g., by
returning only one relevant item at the top
and then filling up the ranking with and irrelevant items.
Parameters
----------
y_true : :class:`~rankereval.data.Labels`
Ground truth labels, binary or numeric.
y_pred : :class:`~rankereval.data.Rankings`
Rankings to be evaluated. If `y_true` only contains one row,
the labels in this row will be used for every ranking in `y_pred`.
Otherwise, each row *i* in `y_pred` uses label row *i* in `y_true`.
Returns
-------
computed_metric: ndarray, shape (n_rankings, )
Computed metric for each ranking.
Raises
------
TypeError
if `y_true` or `y_pred` are of incorrect type.
Notes
-----
Edge cases:
1. `y` is empty, i.e., :math:`|y|=0`. Per definition above, :math:`\mathrm{DCG}@k(y) = 0`.
2. There are no items with relevance > 0 in `y_true`: :math:`\mathrm{DCG}@k(y) =` NaN
to make it consistent with other metrics.
3. There are no items with relevance > 0 in `y` up to *k*: :math:`\mathrm{nDCG}@k(y) = 0`.
Examples
--------
>>> from rankereval import NumericLabels, Rankings, NDCG
>>> # use separate labels for each ranking
>>> y_true = NumericLabels.from_matrix([[1, 2, 3], [4, 5]])
>>> y_pred = Rankings.from_ranked_indices([[0,2,1], [1,0]])
>>> NDCG(3).score(y_true, y_pred) # doctest: +NORMALIZE_WHITESPACE
array([0.81749351, 1. ])
"""
return super().score(y_true, y_pred)