Skip to content

Commit 3fbb3ac

Browse files
authored
[compare.py] Add confidence interval (#377)
This patch adds a `--diff-confidence-interval=relative|absolute` option to `compare.py` to report 95% (or 1-alpha) confidence intervals for the relative difference between `lhs` and `rhs` runs. The current p-value and significance markers only tell the user whether a difference is statistically significant against the null hypothesis, but do not show how large the true difference might be. Example output from `compare.py ... --statistics --diff-confidence-interval` ``` Program exec_time lhs rhs diff std_lhs std_rhs t-value p-value significant diff_ci_rel C 2.95 3.40 15.3% 0.076 0.100 -6.653 0.0027 Y [ 9.3%, 22.7%] A 1.00 1.15 15.0% 0.050 0.050 -3.674 0.0213 Y [ 3.5%, 25.1%] B 1.95 2.20 12.8% 0.076 0.050 -4.427 0.0114 Y [ 4.3%, 18.8%] Geomean difference 14.4% ```
1 parent 91b9dd1 commit 3fbb3ac

1 file changed

Lines changed: 61 additions & 16 deletions

File tree

utils/compare.py

Lines changed: 61 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,12 @@ def merge_values(values, merge_function):
128128

129129

130130
def get_values(values, lhs_name=None, rhs_name=None):
131-
exclude_cols = ["diff", "t-value", "p-value", "significant"]
132-
exclude_cols.extend([f'std_{lhs_name}', f'std_{rhs_name}'])
133-
exclude_cols.extend([f'cv_{lhs_name}', f'cv_{rhs_name}'])
131+
exclude_cols = [
132+
"diff", "t-value", "p-value", "significant",
133+
f'std_{lhs_name}', f'std_{rhs_name}',
134+
f'cv_{lhs_name}', f'cv_{rhs_name}',
135+
"diff_ci_rel", "diff_ci_abs",
136+
]
134137
values = values[[c for c in values.columns if c not in exclude_cols]]
135138
has_two_runs = len(values.columns) == 2
136139
if has_two_runs:
@@ -163,7 +166,7 @@ def add_diff_column(metric, values, absolute_diff=False):
163166
return values
164167

165168

166-
def compute_statistics(lhs_d, rhs_d, metrics, alpha, coeff_var, lhs_name, rhs_name):
169+
def compute_statistics(lhs_d, rhs_d, lhs_name, rhs_name, metrics, alpha, coeff_var, diff_conf_int):
167170
stats_dict = {}
168171

169172
for metric in metrics:
@@ -181,9 +184,9 @@ def compute_statistics(lhs_d, rhs_d, metrics, alpha, coeff_var, lhs_name, rhs_na
181184
if len(lhs_values) >= 2 and len(rhs_values) >= 2:
182185
lhs_std = lhs_values.std(ddof=1)
183186
rhs_std = rhs_values.std(ddof=1)
187+
lhs_mean = lhs_values.mean()
188+
rhs_mean = rhs_values.mean()
184189
if coeff_var:
185-
lhs_mean = lhs_values.mean()
186-
rhs_mean = rhs_values.mean()
187190
stats_dict[metric][program] = {
188191
f'cv_{lhs_name}': lhs_std / lhs_mean if lhs_mean != 0 else float('nan'),
189192
f'cv_{rhs_name}': rhs_std / rhs_mean if rhs_mean != 0 else float('nan'),
@@ -193,10 +196,26 @@ def compute_statistics(lhs_d, rhs_d, metrics, alpha, coeff_var, lhs_name, rhs_na
193196
f'std_{lhs_name}': lhs_std,
194197
f'std_{rhs_name}': rhs_std,
195198
}
196-
t_stat, p_val = stats.ttest_ind(lhs_values, rhs_values)
197-
stats_dict[metric][program]['t-value'] = t_stat
198-
stats_dict[metric][program]['p-value'] = p_val
199-
stats_dict[metric][program]['significant'] = "Y" if p_val < alpha else "N"
199+
ttest = stats.ttest_ind(lhs_values, rhs_values)
200+
stats_dict[metric][program]['t-value'] = ttest.statistic
201+
stats_dict[metric][program]['p-value'] = ttest.pvalue
202+
stats_dict[metric][program]['significant'] = "Y" if ttest.pvalue < alpha else "N"
203+
204+
if diff_conf_int:
205+
ci = ttest.confidence_interval(1 - alpha)
206+
# CI is for mean(lhs)-mean(rhs); negate for rhs-lhs
207+
abs_lo = -ci.high
208+
abs_hi = -ci.low
209+
if diff_conf_int == "relative":
210+
if lhs_mean != 0:
211+
ci_lo = abs_lo / lhs_mean
212+
ci_hi = abs_hi / lhs_mean
213+
else:
214+
ci_lo = float('nan')
215+
ci_hi = float('nan')
216+
stats_dict[metric][program]['diff_ci_rel'] = (ci_lo, ci_hi)
217+
else:
218+
stats_dict[metric][program]['diff_ci_abs'] = (abs_lo, abs_hi)
200219
else:
201220
if coeff_var:
202221
stats_dict[metric][program] = {
@@ -218,6 +237,10 @@ def compute_statistics(lhs_d, rhs_d, metrics, alpha, coeff_var, lhs_name, rhs_na
218237
else:
219238
stat_col_names += [f'std_{lhs_name}', f'std_{rhs_name}']
220239
stat_col_names += ['t-value', 'p-value', 'significant']
240+
if diff_conf_int == "relative":
241+
stat_col_names += ['diff_ci_rel']
242+
elif diff_conf_int == "absolute":
243+
stat_col_names += ['diff_ci_abs']
221244

222245
return stats_dict, stat_col_names
223246

@@ -397,6 +420,14 @@ def print_result(
397420
formatters[(m, f'cv_{lhs_name}')] = lambda x: "%4.1f%%" % (x * 100) if not pd.isna(x) else ""
398421
if (m, f'cv_{rhs_name}') in dataout.columns:
399422
formatters[(m, f'cv_{rhs_name}')] = lambda x: "%4.1f%%" % (x * 100) if not pd.isna(x) else ""
423+
if (m, "diff_ci_rel") in dataout.columns:
424+
formatters[(m, "diff_ci_rel")] = lambda x: \
425+
"[%4.1f%%, %4.1f%%]" % (x[0] * 100, x[1] * 100) \
426+
if isinstance(x, tuple) and not (pd.isna(x[0]) or pd.isna(x[1])) else ""
427+
if (m, "diff_ci_abs") in dataout.columns:
428+
formatters[(m, "diff_ci_abs")] = lambda x: \
429+
"[%4.3f, %4.3f]" % (x[0], x[1]) \
430+
if isinstance(x, tuple) and not (pd.isna(x[0]) or pd.isna(x[1])) else ""
400431
# Turn index into a column so we can format it...
401432
formatted_program = dataout.index.to_series()
402433
if shorten_names:
@@ -445,9 +476,12 @@ def float_format(x):
445476
formatters=formatters,
446477
)
447478
print(out)
448-
exclude_from_summary = ["t-value", "p-value", "significant"]
449-
exclude_from_summary.extend([f'std_{lhs_name}', f'std_{rhs_name}'])
450-
exclude_from_summary.extend([f'cv_{lhs_name}', f'cv_{rhs_name}'])
479+
exclude_from_summary = [
480+
"t-value", "p-value", "significant",
481+
f'std_{lhs_name}', f'std_{rhs_name}',
482+
f'cv_{lhs_name}', f'cv_{rhs_name}',
483+
'diff_ci_rel', 'diff_ci_abs',
484+
]
451485
d_summary = d.drop(columns=exclude_from_summary, level=1, errors='ignore')
452486
print(d_summary.describe())
453487

@@ -564,6 +598,15 @@ def main():
564598
default=False,
565599
help="Compute relative coefficient of variation (%%) rather than absolute stddev",
566600
)
601+
parser.add_argument(
602+
"--diff-confidence-interval",
603+
choices=["relative", "absolute"],
604+
nargs="?",
605+
const="relative",
606+
default=None,
607+
dest="diff_confidence_interval",
608+
help="Show confidence interval for the difference (default: relative)",
609+
)
567610
config = parser.parse_args()
568611

569612
if config.show_diff is None:
@@ -604,11 +647,13 @@ def main():
604647
if config.statistics:
605648
metrics_for_stats = config.metrics if len(config.metrics) > 0 else get_default_metric(lhs_d, rhs_d)
606649
stats_dict, stat_col_names = compute_statistics(
607-
lhs_d, rhs_d, metrics_for_stats,
650+
lhs_d, rhs_d,
651+
lhs_name=config.lhs_name,
652+
rhs_name=config.rhs_name,
653+
metrics=metrics_for_stats,
608654
alpha=config.alpha,
609655
coeff_var=config.coefficient_variation,
610-
lhs_name=config.lhs_name,
611-
rhs_name=config.rhs_name
656+
diff_conf_int=config.diff_confidence_interval,
612657
)
613658

614659
# Merge data

0 commit comments

Comments
 (0)