compare.py: Add --statistics to show statistical significance with a t-test. (#288)

aemerson · web-flow · commit b8f4873f74db · 2025-10-28T15:26:43.000-07:00
* compare.py: Add --statistics to show statistical significance with a t-test.

The alpha level can also be adjusted with --alpha
diff --git a/utils/compare.py b/utils/compare.py
@@ -127,17 +127,29 @@ def merge_values(values, merge_function):
     return values.groupby(level=1).apply(merge_function)
 
 
-def get_values(values):
-    # Create data view without diff column.
-    if "diff" in values.columns:
-        values = values[[c for c in values.columns if c != "diff"]]
+def get_values(values, lhs_name, rhs_name):
+    exclude_cols = ["diff", "t-value", "p-value", "significant"]
+    exclude_cols.extend([f'std_{lhs_name}', f'std_{rhs_name}'])
+    values = values[[c for c in values.columns if c not in exclude_cols]]
     has_two_runs = len(values.columns) == 2
     if has_two_runs:
         return (values.iloc[:, 0], values.iloc[:, 1])
     else:
         return (values.min(axis=1), values.max(axis=1))
 
 
+def get_default_metric(data, second_data=None):
+    """Find a default metric to use if none specified.
+       data: Primary dataframe to check
+       second_data: Optional secondary dataframe (for 'vs' mode with lhs/rhs)
+    """
+    defaults = ["Exec_Time", "exec_time", "Value", "Runtime"]
+    for defkey in defaults:
+        if defkey in data.columns or (second_data is not None and defkey in second_data.columns):
+            return [defkey]
+    return []
+
+
 def add_diff_column(metric, values, absolute_diff=False):
     values0, values1 = get_values(values[metric])
     values0.fillna(0.0, inplace=True)
@@ -150,14 +162,68 @@ def add_diff_column(metric, values, absolute_diff=False):
     return values
 
 
-def add_geomean_row(metrics, data, dataout):
+def compute_statistics(lhs_d, rhs_d, metrics, alpha, lhs_name, rhs_name):
+    stats_dict = {}
+
+    for metric in metrics:
+        if metric not in lhs_d.columns or metric not in rhs_d.columns:
+            continue
+
+        stats_dict[metric] = {}
+
+        # Group by program (more efficient than unique+loc)
+        for program, lhs_group in lhs_d.groupby(level=1):
+            lhs_values = lhs_group[metric].dropna()
+            rhs_values = rhs_d.loc[(slice(None), program), metric].dropna()
+
+            # Compute t-test if we have enough samples
+            if len(lhs_values) >= 2 and len(rhs_values) >= 2:
+                stats_dict[metric][program] = {
+                    f'std_{lhs_name}': lhs_values.std(ddof=1),
+                    f'std_{rhs_name}': rhs_values.std(ddof=1),
+                }
+                t_stat, p_val = stats.ttest_ind(lhs_values, rhs_values)
+                stats_dict[metric][program]['t-value'] = t_stat
+                stats_dict[metric][program]['p-value'] = p_val
+                stats_dict[metric][program]['significant'] = "Y" if p_val < alpha else "N"
+            else:
+                stats_dict[metric][program] = {
+                    f'std_{lhs_name}': float('nan'),
+                    f'std_{rhs_name}': float('nan'),
+                    't-value': float('nan'),
+                    'p-value': float('nan'),
+                    'significant': ""
+                }
+
+    return stats_dict
+
+
+def add_precomputed_statistics(data, stats_dict, stat_col_names):
+    """Add precomputed statistics to the unstacked dataframe."""
+    for metric in data.columns.levels[0]:
+        if metric not in stats_dict:
+            continue
+
+        for stat_name in stat_col_names:
+            values = []
+            for program in data.index:
+                if program in stats_dict[metric]:
+                    values.append(stats_dict[metric][program][stat_name])
+                else:
+                    values.append(float('nan') if stat_name != 'significant' else "")
+            data[(metric, stat_name)] = values
+
+    return data
+
+
+def add_geomean_row(metrics, data, dataout, lhs_name, rhs_name):
     """
     Normalize values1 over values0, compute geomean difference and add a
     summary row to dataout.
     """
     gm = pd.DataFrame(index=[GEOMEAN_ROW], columns=dataout.columns, dtype="float64")
     for metric in metrics:
-        values0, values1 = get_values(data[metric])
+        values0, values1 = get_values(data[metric], lhs_name, rhs_name)
         # Avoid infinite values in the diff and instead use NaN, as otherwise
         # the computation of the geometric mean will fail.
         values0 = values0.replace({0: float("NaN")})
@@ -249,6 +315,8 @@ def print_result(
     sortkey="diff",
     sort_by_abs=True,
     absolute_diff=False,
+    lhs_name="lhs",
+    rhs_name="rhs"
 ):
     metrics = d.columns.levels[0]
     if sort_by_abs:
@@ -272,6 +340,16 @@ def print_result(
     if not absolute_diff:
         for m in metrics:
             formatters[(m, "diff")] = format_relative_diff
+    # Add formatters for statistical columns
+    for m in metrics:
+        if (m, "p-value") in dataout.columns:
+            formatters[(m, "p-value")] = lambda x: "%.4f" % x if not pd.isna(x) else ""
+        if (m, "t-value") in dataout.columns:
+            formatters[(m, "t-value")] = lambda x: "%.3f" % x if not pd.isna(x) else ""
+        if (m, f'std_{lhs_name}') in dataout.columns:
+            formatters[(m, f'std_{lhs_name}')] = lambda x: "%.3f" % x if not pd.isna(x) else ""
+        if (m, f'std_{rhs_name}') in dataout.columns:
+            formatters[(m, f'std_{rhs_name}')] = lambda x: "%.3f" % x if not pd.isna(x) else ""
     # Turn index into a column so we can format it...
     formatted_program = dataout.index.to_series()
     if shorten_names:
@@ -302,7 +380,7 @@ def strip_name_fully(name):
     # as it will otherwise interfere with common prefix/suffix computation.
     if show_diff_column and not absolute_diff:
         # geometric mean only makes sense for relative differences.
-        dataout = add_geomean_row(metrics, d, dataout)
+        dataout = add_geomean_row(metrics, d, dataout, lhs_name, rhs_name)
 
     def float_format(x):
         if x == "":
@@ -320,7 +398,10 @@ def float_format(x):
         formatters=formatters,
     )
     print(out)
-    print(d.describe())
+    exclude_from_summary = ["t-value", "p-value", "significant"]
+    exclude_from_summary.extend([f'std_{lhs_name}', f'std_{rhs_name}'])
+    d_summary = d.drop(columns=exclude_from_summary, level=1, errors='ignore')
+    print(d_summary.describe())
 
 
 def main():
@@ -400,6 +481,19 @@ def main():
         default=False,
         help="Don't use abs() when sorting results",
     )
+    parser.add_argument(
+        "--statistics",
+        action="store_true",
+        dest="statistics",
+        default=False,
+        help="Add statistical analysis columns (std, t-value, p-value, significance)",
+    )
+    parser.add_argument(
+        "--alpha",
+        type=float,
+        default=0.05,
+        help="Significance level for statistical tests (default: 0.05)",
+    )
     config = parser.parse_args()
 
     if config.show_diff is None:
@@ -425,15 +519,30 @@ def main():
 
     # Read inputs
     files = config.files
+    stats_dict = None
+    stat_col_names = None
     if "vs" in files:
         split = files.index("vs")
         lhs = files[0:split]
         rhs = files[split + 1 :]
 
         # Combine the multiple left and right hand sides.
         lhs_d = readmulti(lhs)
-        lhs_merged = merge_values(lhs_d, config.merge_function)
         rhs_d = readmulti(rhs)
+
+        # Compute statistics on raw data before merging (if requested)
+        if config.statistics:
+            metrics_for_stats = config.metrics if len(config.metrics) > 0 else get_default_metric(lhs_d, rhs_d)
+            stats_dict = compute_statistics(
+                lhs_d, rhs_d, metrics_for_stats,
+                alpha=config.alpha,
+                lhs_name=config.lhs_name,
+                rhs_name=config.rhs_name
+            )
+            stat_col_names = [f'std_{config.lhs_name}', f'std_{config.rhs_name}', 't-value', 'p-value', 'significant']
+
+        # Merge data
+        lhs_merged = merge_values(lhs_d, config.merge_function)
         rhs_merged = merge_values(rhs_d, config.merge_function)
 
         # Combine to new dataframe
@@ -448,11 +557,7 @@ def main():
     # Decide which metric to display / what is our "main" metric
     metrics = config.metrics
     if len(metrics) == 0:
-        defaults = ["Exec_Time", "exec_time", "Value", "Runtime"]
-        for defkey in defaults:
-            if defkey in data.columns:
-                metrics = [defkey]
-                break
+        metrics = get_default_metric(data)
     if len(metrics) == 0:
         sys.stderr.write("No default metric found and none specified\n")
         sys.stderr.write("Available metrics:\n")
@@ -508,6 +613,9 @@ def main():
     for metric in data.columns.levels[0]:
         data = add_diff_column(metric, data, absolute_diff=config.absolute_diff)
 
+    if config.statistics and stats_dict is not None:
+        data = add_precomputed_statistics(data, stats_dict, stat_col_names)
+
     sortkey = "diff"
     # TODO: should we still be sorting by diff even if the diff is hidden?
     if len(config.files) == 1:
@@ -526,6 +634,8 @@ def main():
         sortkey,
         config.no_abs_sort,
         config.absolute_diff,
+        config.lhs_name,
+        config.rhs_name,
     )