Plots

Source code in labicompare/plots/ranking.py

def plot_cd_diagram(
  data: EvaluationData,
  summary: ComparisonSummary,
  title: str = "Critical Difference Diagram (Wilcoxon-Holm)",
  figsize: tuple[float, float] = (12, 5),
  highlight_models: list[str] | None = None,
  highlight_color: str = "#d97706"
) -> plt.Figure:
  """
  Draws a critical difference diagram based on the average rank of each model.

  Every row of the results table is ranked across models (rank 1 is the best
  according to ``data.higher_is_better``) and the ranks are averaged per
  model. Models are placed on a 1..n_models axis at their average rank, and
  thick horizontal bars join runs of rank-adjacent models in which no pair is
  flagged as significant in ``summary.pairwise_results``.

  Args:
    data: EvaluationData containing all results.
    summary: ComparisonSummary whose ``pairwise_results`` entries provide
      ``model_a``, ``model_b`` and ``is_significant``. Pairs that are not
      listed are treated as not significant; pairs naming a model that is
      absent from ``data`` are ignored.
    title: Title of the figure.
    figsize: Size of the figure (width, height).
    highlight_models: Names of the models drawn with ``highlight_color``.
    highlight_color: Color used for highlighted points and labels.

  Returns:
    Matplotlib figure.
  """
  ranks = data._df.rank(axis=1, ascending=not data.higher_is_better).mean()
  ranks = ranks.sort_values()
  model_names = ranks.index.tolist()
  avg_ranks = ranks.values
  n_models = len(model_names)

  fig, ax = plt.subplots(figsize=figsize)

  ax.set_xlim(0, n_models + 1)
  ax.set_ylim(-0.2, 1.2)
  ax.set_yticks([])
  ax.set_xticks([])

  for spine in ('top', 'right', 'left', 'bottom'):
    ax.spines[spine].set_visible(False)

  # Main rank axis with one labelled tick per integer rank.
  y_axis = 1.0
  ax.plot([1, n_models], [y_axis, y_axis], color='#000000', linewidth=2.5, zorder=1)

  for tick in range(1, n_models + 1):
    ax.plot([tick, tick], [y_axis, y_axis + 0.03], color='#000000', linewidth=2.6)
    ax.text(tick, y_axis + 0.05, str(tick), ha='center', va='bottom',
            fontsize=16, fontweight='bold', color='#1f2937')

  # Symmetric significance matrix indexed by position in rank order.
  position = {name: idx for idx, name in enumerate(model_names)}
  is_sig_matrix = np.zeros((n_models, n_models), dtype=bool)
  for res in summary.pairwise_results:
    idx1 = position.get(res.model_a)
    idx2 = position.get(res.model_b)
    if idx1 is None or idx2 is None:
      continue
    is_sig_matrix[idx1, idx2] = res.is_significant
    is_sig_matrix[idx2, idx1] = res.is_significant

  # A clique is a contiguous run [i, j] of rank-ordered models in which no
  # pair (a < b) is significantly different.
  cliques = [
    (i, j)
    for i in range(n_models)
    for j in range(i + 1, n_models)
    if not np.triu(is_sig_matrix[i:j + 1, i:j + 1], k=1).any()
  ]

  # Keep only the runs that are not contained in a longer run.
  maximal_cliques = [
    c1 for c1 in cliques
    if not any(
      c1 != c2 and c1[0] >= c2[0] and c1[1] <= c2[1] for c2 in cliques
    )
  ]

  y_ns_base = 0.92
  ns_line_step = 0.06

  # Each maximal clique gets its own bar, stacked downwards below the axis.
  for idx, (first, last) in enumerate(maximal_cliques):
    y_ns = y_ns_base - (idx * ns_line_step)
    ax.plot([avg_ranks[first], avg_ranks[last]], [y_ns, y_ns],
            color='#000000', linewidth=5, zorder=10)

  # Reserve vertical space for the names and draw the "L"-shaped connectors.
  num_bars = len(maximal_cliques)
  lowest_ns_y = (
    y_ns_base - (max(0, num_bars - 1) * ns_line_step) if num_bars > 0 else y_axis
  )
  y_name_max = min(0.65, lowest_ns_y - 0.1)

  # Better-ranked half is labelled on the left, the rest on the right.
  split_idx = (n_models + 1) // 2
  y_space_left = np.linspace(y_name_max, 0.0, max(1, split_idx))
  y_space_right = np.linspace(y_name_max, 0.0, max(1, n_models - split_idx))

  color_line = '#000000'
  highlighted = set(highlight_models) if highlight_models else set()
  offset = 0.01

  for i, (name, rank) in enumerate(zip(model_names, avg_ranks)):
    is_highlighted = name in highlighted

    point_color = highlight_color if is_highlighted else '#1f77b4'
    text_color = highlight_color if is_highlighted else '#111827'

    ax.scatter(rank, y_axis, color=point_color,
               s=80, zorder=15, edgecolor='white', linewidth=1.5)

    if i < split_idx:
      x_name = 0.2
      ha = 'right'
      y_name = y_space_left[i]
    else:
      x_name = n_models + 0.8
      ha = 'left'
      # On the right side the outermost (worst-ranked) model takes the top
      # slot; otherwise its vertical connector would cross the horizontal
      # connectors of every model to its left.
      y_name = y_space_right[n_models - 1 - i]

    ax.plot([rank, rank], [y_axis, y_name], color=color_line, linewidth=1.5, zorder=2)
    ax.plot([rank, x_name], [y_name, y_name], color=color_line, linewidth=1.5, zorder=2)

    ax.text(x_name, y_name + offset, name, ha=ha, va='bottom',
            fontweight='bold', fontsize=11, color=text_color)
    ax.text(x_name, y_name - offset, f"({rank:.2f})", ha=ha, va='top',
            fontsize=10, color='#000000')

  ax.set_title(title, pad=30, fontsize=14, fontweight='bold', color='#111827')
  # Lay out this figure explicitly instead of pyplot's "current" figure.
  fig.tight_layout()
  return fig

Generates a horizontal boxplot combined with a jittered scatter plot of the per-sample differences between two models. The central line (zero) visually separates the samples won by each model, showing which one wins in the majority of them.

Source code in labicompare/plots/differences.py

def plot_difference_distribution(
  data: EvaluationData,
  model_a: str,
  model_b: str,
  figsize: tuple[float, float] = (10, 4)
) -> plt.Figure:
  """
  Generates a horizontal boxplot combined with a jittered scatter plot of the
  per-sample differences ``model_a - model_b``. The vertical line at zero
  visually separates the samples won by each model.

  Args:
    data: EvaluationData containing all results.
    model_a: Name of the model used as minuend of the difference.
    model_b: Name of the model used as subtrahend of the difference.
    figsize: Size of the figure (width, height).

  Returns:
    Matplotlib figure.

  Raises:
    ValueError: If ``model_a`` or ``model_b`` is not in ``data.model_names``.
  """
  if model_a not in data.model_names or model_b not in data.model_names:
    raise ValueError(f"The Models '{model_a}' or '{model_b}' not found in results.")

  diffs = data._df[model_a] - data._df[model_b]
  mean_diff = diffs.mean()
  median_diff = diffs.median()

  fig, ax = plt.subplots(figsize=figsize)

  # Symmetric x-range around zero with a 15% margin.
  limit = diffs.abs().max() * 1.15
  if not np.isfinite(limit) or limit == 0:
    # All differences are zero (or there is no finite difference): fall back
    # to a fixed half-width so the x-axis is not degenerate.
    limit = 1.0

  color_win_a = "#10b981" # Emerald Green
  color_win_b = "#ef4444" # Coral Red
  color_tie = "#9ca3af"   # Cool Gray

  # A positive difference favors model_a only when higher values are better.
  if data.higher_is_better:
    point_colors = np.where(diffs > 0,
                            color_win_a,
                            np.where(diffs < 0, color_win_b, color_tie))
  else:
    point_colors = np.where(diffs < 0,
                            color_win_a,
                            np.where(diffs > 0, color_win_b, color_tie))

  ax.axvline(0, color='#6b7280', linestyle='-', linewidth=1.5, zorder=1)

  boxprops = dict(facecolor="#f3f4f6", color="#9ca3af", linewidth=1.5, alpha=0.5)
  medianprops = dict(color="#374151", linewidth=2.5)
  whiskerprops = dict(color="#9ca3af", linewidth=1.5, linestyle="--")
  capprops = dict(color="#9ca3af", linewidth=1.5)

  # Outliers are hidden here because every sample is drawn by the scatter below.
  ax.boxplot(
    diffs,
    vert=False,
    patch_artist=True,
    widths=0.3,
    boxprops=boxprops,
    medianprops=medianprops,
    whiskerprops=whiskerprops,
    capprops=capprops,
    showfliers=False,
    zorder=2
  )

  # Private, fixed-seed generator: the jitter stays reproducible (same values
  # as seeding the global generator with 42) without resetting the caller's
  # global NumPy random state.
  rng = np.random.RandomState(42)
  y_jitter = rng.normal(1, 0.06, size=len(diffs))

  ax.scatter(
    diffs,
    y_jitter,
    c=point_colors,
    linewidth=0.8,
    s=30,
    alpha=0.4,
    zorder=3
  )

  ax.plot(mean_diff, 1, marker='D', color='black', markersize=4, zorder=4, label='Mean')

  # Direction labels on each side of the zero line.
  label_right = (f"Favors {model_a} →" if data.higher_is_better
                 else f"Favors {model_b} →")

  ax.text(limit*0.05, 1.4, label_right,
          color=color_win_a if data.higher_is_better else color_win_b,
          fontsize=10, fontweight='bold', va='center', ha='left')

  label_left = f"← Favors {model_b}" if data.higher_is_better else f"← Favors {model_a}"
  ax.text(-limit*0.05, 1.4, label_left,
          color=color_win_b if data.higher_is_better else color_win_a,
          fontsize=10, fontweight='bold', va='center', ha='right')

  stats_text = f"Mean: {mean_diff:+.4f}\nMedian: {median_diff:+.4f}"
  ax.text(0.98, 0.05, stats_text, transform=ax.transAxes, fontsize=10,
          ha='right', va='bottom',
          bbox=dict(boxstyle='round,pad=0.5',
                    facecolor='white', alpha=0.9,
                    edgecolor='#d1d5db'))

  ax.set_xlim(-limit, limit)
  ax.set_ylim(0.5, 1.5)
  ax.set_yticks([])

  ax.set_xlabel(f"Absolute Difference ({model_a} - {model_b})",
                fontweight='bold', labelpad=10, color="#374151")
  ax.set_title("Distribution of Differences", pad=20,
               fontsize=14, fontweight='bold', color="#111827")

  ax.grid(axis='x', color="#e5e7eb", linestyle='--', linewidth=1, zorder=0)

  for spine in ['top', 'right', 'left']:
    ax.spines[spine].set_visible(False)
  ax.spines['bottom'].set_color('#9ca3af')

  fig.tight_layout()

  return fig

Generate a styled heatmap for p-values matrix from a post-hoc test.

Parameters:

Name	Type	Description	Default
`summary`	`ComparisonSummary`	Comparison summary providing the model means, the significance level (`alpha`, used as the center of the color scale) and the pairwise results (p-value, significance flag and winner).	required
`figsize`	`tuple[float, float]`	Size of the figure (width, height).	`(8, 6)`
`fontsize`	`int`	Size of the font used in this plot.	`12`
`grid_linewidth`	`float`	Width for the grid line used between cells.	`2.0`
`aspect`	`Literal['equal', 'auto'] | float | None`	Aspect ratio passed to `imshow` when drawing the heatmap.	`'equal'`

Returns:

Type	Description
`Figure`	Matplotlib figure instance, which can be saved or shown.

Source code in labicompare/plots/heatmap.py

def plot_pvalue_matrix(
    summary: ComparisonSummary,
    figsize: tuple[float, float] = (8, 6),
    fontsize: int = 12,
    grid_linewidth: float = 2.0,
    aspect: Literal["equal", "auto"] | float | None = "equal"
) -> plt.Figure:
    """
    Generate a styled heatmap for p-values matrix from a post-hoc test.

    Models are ordered by ``summary.model_means`` (best first, according to
    ``summary.higher_is_better``). Each off-diagonal cell shows the p-value of
    the corresponding pair; significant pairs are written in bold with an
    arrow telling whether the row model won (↑) or lost (↓).

    Args:
        summary: ComparisonSummary providing ``model_means``,
            ``higher_is_better``, ``alpha`` (used as the center of the color
            scale) and ``pairwise_results`` (entries with ``model_a``,
            ``model_b``, ``p_value``, ``is_significant`` and ``winner``).
        figsize: Size of the figure (width, height).
        fontsize: Size of the font used in this plot.
        grid_linewidth: Width for the grid line used between cells. Values
            <= 0 disable the grid.
        aspect: Aspect ratio passed to ``imshow`` when drawing the heatmap.

    Returns:
        Figure instance from matplotlib, can be saved or shown.

    Raises:
        ValueError: If a pairwise result names a model that is missing from
            ``summary.model_means``.
    """
    fig, ax = plt.subplots(figsize=figsize)

    model_means = summary.model_means
    higher_is_better = summary.higher_is_better
    base_alpha = summary.alpha

    # Best model first, so the matrix reads from best to worst.
    models = sorted(
        model_means.keys(),
        key=lambda m: model_means[m],
        reverse=higher_is_better
    )
    n_models = len(models)
    position = {name: idx for idx, name in enumerate(models)}

    # NaN marks cells without a reported p-value (diagonal and missing pairs).
    data = np.full((n_models, n_models), np.nan)
    sig_matrix = np.zeros((n_models, n_models), dtype=bool)
    winner_matrix = np.full((n_models, n_models), None, dtype=object)

    for res in summary.pairwise_results:
        try:
            i = position[res.model_a]
            j = position[res.model_b]
        except KeyError as err:
            raise ValueError(
                f"Pairwise result refers to a model missing from "
                f"summary.model_means: {err.args[0]!r}"
            ) from None

        # The comparison is symmetric: mirror it across the diagonal.
        data[i, j] = data[j, i] = res.p_value
        sig_matrix[i, j] = sig_matrix[j, i] = res.is_significant
        winner_matrix[i, j] = winner_matrix[j, i] = res.winner

    cmap = plt.get_cmap("RdYlGn_r").copy()
    cmap.set_bad(color="#f0f0f0")

    # Color scale pivots at the significance level.
    norm = TwoSlopeNorm(vmin=0.0, vcenter=base_alpha, vmax=1.0)
    cax = ax.imshow(data, cmap=cmap, norm=norm, aspect=aspect)

    cbar = fig.colorbar(cax, ax=ax)
    cbar.ax.set_ylabel("Adjusted P-value", rotation=-90, va="bottom", labelpad=15)

    ax.set_xticks(np.arange(n_models))
    ax.set_yticks(np.arange(n_models))
    ax.set_xticklabels(models)
    ax.set_yticklabels(models)

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    for i in range(n_models):
        for j in range(n_models):
            val = data[i, j]
            if i == j or np.isnan(val):
                # Diagonal cells and pairs without a reported p-value get a
                # neutral placeholder instead of a formatted "nan".
                ax.text(j, i, "-",
                        ha="center", va="center",
                        color="gray", fontsize=fontsize)
                continue

            if sig_matrix[i, j]:
                indicator = " (↑)" if winner_matrix[i, j] == models[i] else " (↓)"
                weight = "bold"
                text_str = f"{val:.3f}{indicator}"
            else:
                weight = "normal"
                text_str = f"{val:.3f}"

            # White text on the saturated ends of the color scale.
            text_color = "white" if (val < (base_alpha/2) or val > 0.7) else "black"
            ax.text(j, i, text_str,
                    ha="center", va="center", color=text_color,
                    fontweight=weight, fontsize=fontsize)

    if grid_linewidth > 0:
        # Minor ticks sit on the cell borders, so the minor grid separates cells.
        ax.set_xticks(np.arange(n_models + 1) - 0.5, minor=True)
        ax.set_yticks(np.arange(n_models + 1) - 0.5, minor=True)
        ax.grid(which="minor", color="white", linestyle="-", linewidth=grid_linewidth)
        ax.tick_params(which="minor", bottom=False, left=False)

    for spine in ax.spines.values():
        spine.set_visible(False)

    ax.set_title("Pairwise Comparison Matrix\n(Post-Hoc P-values)")

    fig.tight_layout()

    return fig

Generates a scatter plot comparing the results of two specific models, dataset by dataset. The diagonal line (y = x) represents draws.

Parameters:

Name	Type	Description	Default
`data`	`EvaluationData`	EvaluationData containing all results.	required
`model_x`	`str`	Name of the model that will be represented in the X-axis.	required
`model_y`	`str`	Name of the model that will be represented in the Y-axis.	required
`figsize`	`tuple[float, float]`	Figure size.	`(6, 6)`
`point_size`	`int`	Point size used in scatter plot.	`50`
`alpha_points`	`float`	Point transparency (useful for overlays).	`0.7`

Returns:

Type	Description
`Figure`	Matplotlib figure.

Source code in labicompare/plots/scatter.py

def plot_one_vs_one(
  data: EvaluationData,
  model_x: str,
  model_y: str,
  figsize: tuple[float, float] = (6, 6),
  point_size: int = 50,
  alpha_points: float = 0.7
) -> plt.Figure:
  """
  Draws a head-to-head scatter plot of two models, one point per dataset.
  Points lying on the dashed diagonal (y = x) are draws.

  Args:
    data: EvaluationData containing all results.
    model_x: Name of the model plotted along the X-axis.
    model_y: Name of the model plotted along the Y-axis.
    figsize: Figure size.
    point_size: Marker size used in the scatter plot.
    alpha_points: Marker transparency (useful when points overlap).

  Returns:
    Matplotlib figure.

  Raises:
    ValueError: If either model name is not in ``data.model_names``.
  """
  known_models = data.model_names
  if not (model_x in known_models and model_y in known_models):
    raise ValueError(f"Models '{model_x}' or '{model_y}' not found in data.")

  scores_x = data._df[model_x].values
  scores_y = data._df[model_y].values

  fig, ax = plt.subplots(figsize=figsize)

  # Shared square range for both axes, with a 5% margin (0.1 if flat).
  lowest = min(scores_x.min(), scores_y.min())
  highest = max(scores_x.max(), scores_y.max())
  margin = (highest - lowest) * 0.05
  if margin == 0:
    margin = 0.1
  axis_lo = lowest - margin
  axis_hi = highest + margin
  axis_range = [axis_lo, axis_hi]

  ax.plot(
    axis_range, axis_range,
    color="gray", linestyle="--", zorder=1, label="Draw-line (y = x)"
  )

  # Which sign of (x - y) counts as a win depends on the metric direction.
  delta = scores_x - scores_y
  x_wins, y_wins = delta > 0, delta < 0
  if not data.higher_is_better:
    x_wins, y_wins = y_wins, x_wins
  draws = delta == 0

  # (mask, legend prefix, extra scatter options), drawn in this order.
  groups = (
    (x_wins, f"{model_x} wins",
     dict(color="#2ca02c", alpha=alpha_points, zorder=2)),
    (y_wins, f"{model_y} wins",
     dict(color="#d62728", alpha=alpha_points, zorder=2)),
    (draws, "Exact draws",
     dict(color="black", marker="x", zorder=3)),
  )
  for mask, prefix, options in groups:
    if not mask.any():
      continue
    count = int(np.count_nonzero(mask))
    ax.scatter(
      scores_x[mask], scores_y[mask],
      s=point_size, label=f"{prefix} ({count} datasets)", **options
    )

  ax.set_xlim((axis_lo, axis_hi))
  ax.set_ylim((axis_lo, axis_hi))
  ax.set_aspect("equal", adjustable="box")

  ax.set_xlabel(f"Performance: {model_x}", fontweight="bold", labelpad=10)
  ax.set_ylabel(f"Performance: {model_y}", fontweight="bold", labelpad=10)
  ax.set_title(f"One versus One Plot between \n{model_x} vs {model_y}", pad=15)

  ax.grid(True, linestyle=":", alpha=0.6)
  ax.legend(loc="best", framealpha=0.9)

  for side in ("top", "right"):
    ax.spines[side].set_visible(False)

  fig.tight_layout()

  return fig