Skip to content

Stats

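Apply the Friedman chi-square test [1]. Note: the implementation shown below returns the statistic and p-value of SciPy's `friedmanchisquare` directly; the Iman-Davenport correction is not computed.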

Parameters:

Name Type Description Default
data EvaluationData

EvaluationData instance containing the calculated ranks.

required

Returns:

Type Description
tuple[float, float]

A tuple containing (chi-squared statistic, p-value).

References: [1] Demšar, Janez. "Statistical comparisons of classifiers over multiple data sets." Journal of Machine Learning Research 7 (Jan 2006): 1-30.

Source code in labicompare/stats/friedman.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def friedman_test(data: EvaluationData) -> tuple[float, float]:
  """
  Apply the Friedman chi-square test across all models [1].

  Each model's column is read from ``data._df`` and all columns are passed
  together to ``scipy.stats.friedmanchisquare``; the resulting statistic and
  p-value are returned as plain floats.

  NOTE(review): no Iman-Davenport correction is computed in this function;
  the statistic returned by SciPy is passed through unchanged. Confirm
  whether callers expect the corrected statistic described in [1].

  Args:
    data: EvaluationData instance. ``data.model_names`` lists the models to
      compare and ``data._df[name]`` provides the values of each model.

  Returns:
    A tuple containing (chi-squared statistic, p-value).

  Raises:
    ValueError: If fewer than 3 models are available.

  References:
  .. [1] Demšar, Janez. "Statistical comparisons of classifiers over
      multiple data sets." Journal of Machine learning research
      7.Jan (2006): 1-30.
  """
  model_names = data.model_names
  if len(model_names) < 3:
    # The trailing space keeps the two sentences apart once the adjacent
    # string literals are concatenated.
    raise ValueError(
      "Friedman test requires at least 3 models for comparison. "
      "For 2 models, you should use Wilcoxon test."
    )

  # One array per model, in the order given by model_names.
  model_arrays = [data._df[model].values for model in model_names]
  res = st.friedmanchisquare(*model_arrays)

  return float(res.statistic), float(res.pvalue)

Runs the Paired T-Test (Parametric) between two models.

Source code in labicompare/stats/pairwise.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def paired_ttest(
  data: EvaluationData,
  model_a: str,
  model_b: str,
  alpha: float = 0.05,
  check_normality: bool = True
) -> PairwiseResult:
  """
  Runs the Paired T-Test (Parametric) between two models.

  Args:
    data: EvaluationData instance; ``data._df[name]`` supplies the paired
      observations of each model.
    model_a: Name of the first model; must be in ``data.model_names``.
    model_b: Name of the second model; must be in ``data.model_names``.
    alpha: Threshold used both for the Shapiro-Wilk normality check and for
      declaring the t-test significant (``p_value <= alpha``).
    check_normality: When True and at least 3 pairs are available, run
      Shapiro-Wilk on the paired differences and emit a ``UserWarning`` if
      its p-value is below ``alpha``.

  Returns:
    PairwiseResult holding the p-value, the significance flag, the winner
    (``None`` when the result is not significant) and the mean of
    ``model_a - model_b``.

  Raises:
    ValueError: If either model name is not in ``data.model_names``.
  """
  if model_a not in data.model_names or model_b not in data.model_names:
    raise ValueError("One or both models not found in data.")

  perf_a = data._df[model_a].values
  perf_b = data._df[model_b].values

  diffs = perf_a - perf_b

  # When every pair is tied the t statistic is 0/0 and the p-value is NaN.
  # Report "no evidence of a difference" (p = 1.0) instead, matching what
  # sign_test and wilcoxon_signed_rank return in the same situation.
  all_tied = diffs.size > 0 and not np.any(diffs)

  # A normality check carries no information for identically-zero differences.
  if check_normality and len(diffs) >= 3 and not all_tied:
    _, shapiro_p = st.shapiro(diffs)

    if shapiro_p < alpha:
      warnings.warn(
        f"\n[labicompare] WARNING:\n"
        f"Differences between '{model_a}' and '{model_b}' NOT follow a normal "
        f"distribution (Shapiro-Wilk p-value = {shapiro_p:.4f} < {alpha}).\n"
        f"The result of this paired T-Test  has high risk of false positive. "
        f"We strongly suggest using the Wilcoxon Signed-Rank instead.",
        UserWarning,
        stacklevel=2
      )

  if all_tied:
    p_value = 1.0
  else:
    res = st.ttest_rel(perf_a, perf_b)
    p_value = float(res.pvalue)

  mean_diff = float(np.mean(diffs))
  is_significant = p_value <= alpha
  winner = _determine_winner(mean_diff, model_a, model_b, data.higher_is_better)

  return PairwiseResult(
    model_a=model_a,
    model_b=model_b,
    p_value=p_value,
    is_significant=is_significant,
    # A winner is only reported when the difference is significant.
    winner=winner if is_significant else None,
    mean_diff=mean_diff
  )

Runs the sign test (non-parametric). It is based only on which model wins or loses each round, ignoring the magnitude of the differences.

Source code in labicompare/stats/pairwise.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def sign_test(
  data: EvaluationData,
  model_a: str,
  model_b: str,
  alpha: float = 0.05
) -> PairwiseResult:
  """
  Runs the sign test (non-parametric) between two models.

  Only the direction of each paired difference is used for the p-value:
  ties are discarded and the number of positive differences is checked
  against a two-sided binomial test with p = 0.5. If every pair is tied the
  p-value is set to 1.0.

  NOTE(review): the winner is derived from the mean difference through
  ``_determine_winner``, not from the majority of signs; confirm that this
  is the intended behaviour.

  Args:
    data: EvaluationData instance; ``data._df[name]`` supplies the paired
      observations of each model.
    model_a: Name of the first model; must be in ``data.model_names``.
    model_b: Name of the second model; must be in ``data.model_names``.
    alpha: Significance threshold (``p_value <= alpha`` is significant).

  Returns:
    PairwiseResult holding the p-value, the significance flag, the winner
    (``None`` when the result is not significant) and the mean of
    ``model_a - model_b``.

  Raises:
    ValueError: If either model name is not in ``data.model_names``.
  """
  if not (model_a in data.model_names and model_b in data.model_names):
    raise ValueError("One or both models not found in results.")

  deltas = data._df[model_a].values - data._df[model_b].values

  # Ties carry no directional information, so they are left out of the trials.
  decisive = deltas[deltas != 0]
  n_decisive = len(decisive)

  average_delta = float(np.mean(deltas))

  # Default used when every pair is tied.
  p_value = 1.0
  if n_decisive != 0:
    wins_for_a = np.sum(decisive > 0)
    outcome = st.binomtest(
      k=wins_for_a, n=n_decisive, p=0.5, alternative='two-sided'
    )
    p_value = float(outcome.pvalue)

  significant = p_value <= alpha
  candidate = _determine_winner(
    average_delta, model_a, model_b, data.higher_is_better
  )
  # A winner is only reported when the difference is significant.
  reported_winner = candidate if significant else None

  return PairwiseResult(
    model_a=model_a,
    model_b=model_b,
    p_value=p_value,
    is_significant=significant,
    winner=reported_winner,
    mean_diff=average_delta
  )

Executes the Wilcoxon signed-rank test (non-parametric) between two models. This is the recommended alternative to the paired t-test when the differences do not follow a normal distribution. It takes into account both the direction of each difference (who wins) and the rank of its magnitude.

Source code in labicompare/stats/pairwise.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def wilcoxon_signed_rank(
  data: EvaluationData,
  model_a: str,
  model_b: str,
  alpha: float = 0.05
) -> PairwiseResult:
  """
  Executes the Wilcoxon signed-rank test (non-parametric) between two models.

  This is the recommended alternative to the paired t-test when the
  differences do not follow a normal distribution. SciPy's ``wilcoxon`` is
  called with ``zero_method='pratt'``. When there is no non-zero difference
  at all, the test is skipped and the p-value is set to 1.0.

  Args:
    data: EvaluationData instance; ``data._df[name]`` supplies the paired
      observations of each model.
    model_a: Name of the first model; must be in ``data.model_names``.
    model_b: Name of the second model; must be in ``data.model_names``.
    alpha: Significance threshold (``p_value <= alpha`` is significant).

  Returns:
    PairwiseResult holding the p-value, the significance flag, the winner
    (``None`` when the result is not significant) and the mean of
    ``model_a - model_b``.

  Raises:
    ValueError: If either model name is not in ``data.model_names``, or if
      SciPy rejects the input.
  """
  if model_a not in data.model_names or model_b not in data.model_names:
    raise ValueError("One or both models not found in results.")

  perf_a = data._df[model_a].values
  perf_b = data._df[model_b].values
  diffs = perf_a - perf_b

  # Detect the "no non-zero difference" case up front instead of matching on
  # the text of the exception SciPy raises for it: message matching is
  # fragile and could silently hide unrelated ValueErrors.
  if not np.any(diffs):
    p_value = 1.0
  else:
    res = st.wilcoxon(perf_a, perf_b, zero_method='pratt')
    p_value = float(res.pvalue)

  mean_diff = float(np.mean(diffs))
  is_significant = p_value <= alpha
  winner = _determine_winner(mean_diff, model_a, model_b, data.higher_is_better)

  return PairwiseResult(
    model_a=model_a,
    model_b=model_b,
    p_value=p_value,
    is_significant=is_significant,
    # A winner is only reported when the difference is significant.
    winner=winner if is_significant else None,
    mean_diff=mean_diff
  )