Skip to content

Commit ec7b3b4

Browse files
committed
handle warnings for no standard deviation
1 parent 61f873f commit ec7b3b4

File tree

1 file changed

+123
-56
lines changed

1 file changed

+123
-56
lines changed

treeherder/perf/stats.py

Lines changed: 123 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import warnings
2+
13
import numpy as np
24
from KDEpy import FFTKDE
35
from scipy import stats
@@ -31,26 +33,29 @@ def count_modes(x, y, min_prom_frac=0.02, max_prom_frac=0.2):
3133
min_prom_frac: relative minimum of the prominence as a fraction of the maximum of y (smoothed)
3234
max_prom_frac: relative maximum of the prominence as a fraction of the maximum of y (smoothed)
3335
"""
34-
# Smooth outliers out
35-
y_smooth = gaussian_filter1d(y, sigma=2)
36-
37-
# Compute derivative, keep high percentile, to find "real" peaks
38-
dy = np.gradient(y_smooth, x)
39-
noise_est = np.percentile(np.abs(dy), 90)
40-
41-
# Set prominence proportional to that noise, capped to fraction of max
42-
max_y = np.max(y_smooth)
43-
prom_est = np.clip(noise_est, min_prom_frac * max_y, max_prom_frac * max_y)
44-
45-
# grid has been fitted by KDEpy already. Consider that peaks should be at
46-
# least 5% of the data range appart. This helps keeping out "horns" at the
47-
# top of a mode.
48-
dx = x[1] - x[0]
49-
x_range = x[-1] - x[0]
50-
min_distance = max(1, int((x_range * 0.05) / dx))
51-
52-
peaks, _ = find_peaks(y_smooth, prominence=prom_est, distance=min_distance)
53-
return len(peaks), x[peaks], prom_est
36+
try:
37+
# Smooth outliers out
38+
y_smooth = gaussian_filter1d(y, sigma=2)
39+
40+
# Compute derivative, keep high percentile, to find "real" peaks
41+
dy = np.gradient(y_smooth, x)
42+
noise_est = np.percentile(np.abs(dy), 90)
43+
44+
# Set prominence proportional to that noise, capped to fraction of max
45+
max_y = np.max(y_smooth)
46+
prom_est = np.clip(noise_est, min_prom_frac * max_y, max_prom_frac * max_y)
47+
48+
# grid has been fitted by KDEpy already. Consider that peaks should be at
49+
# least 5% of the data range appart. This helps keeping out "horns" at the
50+
# top of a mode.
51+
dx = x[1] - x[0]
52+
x_range = x[-1] - x[0]
53+
min_distance = max(1, int((x_range * 0.05) / dx))
54+
55+
peaks, _ = find_peaks(y_smooth, prominence=prom_est, distance=min_distance)
56+
return len(peaks), x[peaks], prom_est
57+
except Exception:
58+
return 0, None, None
5459

5560

5661
# Return a list of interval correspondings to the modes in the data series,
@@ -109,18 +114,22 @@ def split_per_mode(data, intervals):
109114
# the data is unimodal (or even normally distributed). Symmetry of the
110115
# distribution isn't required
111116
def bootstrap_median_diff_ci(a, b, n_iter=1000, alpha=0.05):
112-
data = (a, b)
113-
res = bootstrap(
114-
data,
115-
statistic=lambda *args: np.median(args[1]) - np.median(args[0]),
116-
n_resamples=n_iter,
117-
confidence_level=1 - alpha,
118-
method="percentile", # percentile doesn't require symmetry
119-
vectorized=False,
120-
paired=False,
121-
random_state=None,
122-
)
123-
return np.median(b) - np.median(a), res.confidence_interval
117+
try:
118+
data = (a, b)
119+
120+
res = bootstrap(
121+
data,
122+
statistic=lambda *args: np.median(args[1]) - np.median(args[0]),
123+
n_resamples=n_iter,
124+
confidence_level=1 - alpha,
125+
method="percentile", # percentile doesn't require symmetry
126+
vectorized=False,
127+
paired=False,
128+
random_state=None,
129+
)
130+
return np.median(b) - np.median(a), res.confidence_interval
131+
except Exception:
132+
return None, (None, None)
124133

125134

126135
# Normality test. using Shapiro-Wilk based on this:
@@ -380,15 +389,40 @@ def interpret_cles(
380389

381390
def interpret_silverman_kde(base_data, new_data, lower_is_better):
382391
try:
392+
warning_msgs = []
393+
base_mode_count, base_peak_locs, base_prom = 0, None, None
394+
new_mode_count, new_peak_locs, new_prom = 0, None, None
395+
x_base, y_base = [], []
396+
x_new, y_new = [], []
383397
# Kernel Density Estimator (KDE) - estimate the probability density function (PDF) of a continuous random variable in a smooth, non-parametric way, based on a finite sample of data.
384-
x_base, y_base = FFTKDE(kernel="gaussian", bw="silverman").fit(base_data).evaluate()
385-
x_new, y_new = FFTKDE(kernel="gaussian", bw="silverman").fit(new_data).evaluate()
398+
# 1 datapoint will result in a Bandwidth = 0 → divide-by-zero warning
399+
if len(base_data) > 0:
400+
try:
401+
with warnings.catch_warnings():
402+
warnings.simplefilter("ignore", category=UserWarning)
403+
x_base, y_base = (
404+
FFTKDE(kernel="gaussian", bw="silverman").fit(base_data).evaluate()
405+
)
406+
base_mode_count, base_peak_locs, base_prom = count_modes(x_base, y_base)
407+
except Exception:
408+
warning_msgs.append("Cannot Silverman KDE for base. Likely not enough data.")
409+
else:
410+
warning_msgs.append("Base revision has less than 2 data points to run Silverman KDE.")
411+
if len(new_data) > 0:
412+
try:
413+
with warnings.catch_warnings():
414+
warnings.simplefilter("ignore", category=UserWarning)
415+
x_new, y_new = (
416+
FFTKDE(kernel="gaussian", bw="silverman").fit(new_data).evaluate()
417+
)
418+
new_mode_count, new_peak_locs, new_prom = count_modes(x_new, y_new)
419+
except Exception:
420+
warning_msgs.append("Cannot Silverman KDE for new. Likely not enough data.")
421+
else:
422+
warning_msgs.append("New revision has less than 2 data points to run Silverman KDE.")
386423

387424
# Estimate modes, and warn if multimodal. We estimate the modes w/ a bandwidth computed via silverman's method because it smoothes a bit more
388-
base_mode_count, base_peak_locs, base_prom = count_modes(x_base, y_base)
389-
new_mode_count, new_peak_locs, new_prom = count_modes(x_new, y_new)
390425

391-
warning_msgs = []
392426
if base_mode_count > 1:
393427
warning_msgs.append("Base revision distribution appears multimodal.")
394428
if new_mode_count > 1:
@@ -506,8 +540,8 @@ def plot_kde_with_isj_bandwidth(base, new):
506540
padding = 0.05 * (x_max - x_min) if x_max > x_min else 1
507541
x_min, x_max = x_min - padding, x_max + padding
508542

509-
# Generate grid points defaults to 512 as commonly used
510-
x_grid = np.linspace(x_min, x_max, 512)
543+
# Generate grid points
544+
x_grid = np.linspace(x_min, x_max, 256)
511545

512546
kde_x_base = []
513547
kde_y_base = []
@@ -530,29 +564,62 @@ def plot_kde_with_isj_bandwidth(base, new):
530564

531565
try:
532566
# min 2 data point. FFTKDE(bw="ISJ").fit measure the data's spread or variance. At less than 2, variance is undefined, cannot compute a bandwidth.
533-
if len(base) > 1:
534-
kde_base = FFTKDE(bw="ISJ").fit(base)
535-
y_base = kde_base.evaluate(x_grid)
536-
kde_x_base = x_grid.tolist()
537-
kde_y_base = y_base.tolist()
538-
kde_plot_base["kde_x"] = kde_x_base
539-
kde_plot_base["kde_y"] = kde_y_base
567+
if len(base) > 1 and np.std(base) != 0:
568+
try:
569+
with warnings.catch_warnings():
570+
warnings.simplefilter("ignore")
571+
kde_base = FFTKDE(bw="ISJ").fit(base)
572+
y_base = kde_base.evaluate(x_grid)
573+
kde_x_base = x_grid.tolist()
574+
kde_y_base = y_base.tolist()
575+
kde_plot_base["kde_x"] = kde_x_base
576+
kde_plot_base["kde_y"] = kde_y_base
577+
except Exception:
578+
kde_warnings.append(
579+
"KDE with ISJ bandwidth fitting to Base data failed. Likely not enough data or no standard deviation in data."
580+
)
581+
kde_plot_base["kde_x"] = base
540582
else:
541583
kde_warnings.append(
542-
"Less than 2 datapoints, cannot fit Kernel Density Estimator (KDE) with an ISJ to Base."
584+
"Less than 2 datapoints or no standard variance for a meaningful fit Kernel Density Estimator (KDE) with an ISJ bandwidth to Base."
543585
)
544-
545-
if len(new) > 1:
546-
kde_new = FFTKDE(bw="ISJ").fit(new)
547-
y_new = kde_new.evaluate(x_grid)
548-
kde_x_new = x_grid.tolist()
549-
kde_y_new = y_new.tolist()
550-
kde_plot_new["kde_x"] = kde_x_new
551-
kde_plot_new["kde_y"] = kde_y_new
586+
# produce flat line for 0 data points
587+
if len(base) == 0:
588+
x = np.linspace(-1, 1, 100)
589+
y = np.zeros_like(x)
590+
kde_plot_base["kde_x"] = x.tolist()
591+
kde_plot_base["kde_y"] = y.tolist()
592+
else:
593+
kde_plot_base["kde_x"] = base
594+
kde_plot_base["kde_y"] = []
595+
596+
if len(new) > 1 and np.std(new) != 0:
597+
try:
598+
with warnings.catch_warnings():
599+
warnings.simplefilter("ignore")
600+
kde_new = FFTKDE(bw="ISJ").fit(new)
601+
y_new = kde_new.evaluate(x_grid)
602+
kde_x_new = x_grid.tolist()
603+
kde_y_new = y_new.tolist()
604+
kde_plot_new["kde_x"] = kde_x_new
605+
kde_plot_new["kde_y"] = kde_y_new
606+
except Exception:
607+
kde_warnings.append(
608+
"KDE with ISJ bandwidth fitting to New data failed. Likely not enough data or no standard deviation in data."
609+
)
610+
kde_plot_new["kde_x"] = new
552611
else:
553612
kde_warnings.append(
554-
"Less than 2 datapoints, cannot fit Kernel Density Estimator (KDE) with an ISJ to New."
613+
"Less than 2 datapoints or no standard variance for a meaningful fit Kernel Density Estimator (KDE) with an ISJ bandwidth to New."
555614
)
615+
# produce flat line for 0 data points
616+
if len(new) == 0:
617+
x = np.linspace(-1, 1, 100)
618+
y = np.zeros_like(x)
619+
kde_plot_new["kde_x"] = x.tolist()
620+
kde_plot_new["kde_y"] = y.tolist()
621+
else:
622+
kde_plot_new["kde_x"] = new
556623

557624
except Exception:
558625
# KDE failed, charts will show just medians

0 commit comments

Comments
 (0)