Description
This function can compute the mode for arbitrary iterables, including vectors of floats:
StatsBase.jl/src/scalarstats.jl, lines 110 to 135 at commit dfe8945:

# compute mode over arbitrary iterable
function mode(a)
    isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
    cnts = Dict{eltype(a),Int}()
    # first element
    mc = 1
    mv, st = iterate(a)
    cnts[mv] = 1
    # find the mode along with table construction
    y = iterate(a, st)
    while y !== nothing
        x, st = y
        if haskey(cnts, x)
            c = (cnts[x] += 1)
            if c > mc
                mc = c
                mv = x
            end
        else
            cnts[x] = 1
            # in this case: c = 1, and thus c > mc won't happen
        end
        y = iterate(a, st)
    end
    return mv
end
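Note that when all elements are distinct, which is almost surely the case for draws from a continuous distribution, every count stays at 1, so `mv` is never updated and the function simply returns the first element of the collection. A quick self-contained check (my own snippet, not part of StatsBase):

using StatsBase
x = randn(5)                       # continuous draws are almost surely distinct
@assert StatsBase.mode(x) == x[1]  # every count equals 1, so the first element wins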
Indeed, this is the method that gets called to compute the mode of a vector of floats:

julia> @which StatsBase.mode([1., 3., 3.141])
mode(a)
     @ StatsBase ~/.julia/packages/StatsBase/s2YJR/src/scalarstats.jl:111

However, this estimator has very high bias and variance compared to, for example, the half-sample mode (HSM, see [1]). This is easily verified with simulations:
Row │ nobs dist method bias variance MAE
│ Int64 Distribu… String Float64 Float64 Float64
─────┼───────────────────────────────────────────────────────────────────────────────────────────
1 │ 1000 Normal{Float64}(μ=0.0, σ=1.0) StatsBase -0.0105188 1.03791 0.818895
2 │ 1000 Normal{Float64}(μ=0.0, σ=1.0) HSM -0.00256086 0.0550354 0.189431
3 │ 1000000 Normal{Float64}(μ=0.0, σ=1.0) StatsBase -0.0242424 1.03698 0.820519
4 │ 1000000 Normal{Float64}(μ=0.0, σ=1.0) HSM -0.00500615 0.00342947 0.0479765
5 │ 1000 Normal{Float64}(μ=-1.0, σ=1.0) StatsBase 0.00682327 0.956165 0.788644
6 │ 1000 Normal{Float64}(μ=-1.0, σ=1.0) HSM -0.00291935 0.0560346 0.189825
7 │ 1000000 Normal{Float64}(μ=-1.0, σ=1.0) StatsBase 0.0167116 0.955203 0.778444
8 │ 1000000 Normal{Float64}(μ=-1.0, σ=1.0) HSM -0.00111594 0.00361161 0.0485056
9 │ 1000 TDist{Float64}(ν=0.1) StatsBase 1.68401e33 2.77662e69 1.68401e33
10 │ 1000 TDist{Float64}(ν=0.1) HSM 0.00730426 0.0190662 0.110146
11 │ 1000000 TDist{Float64}(ν=0.1) StatsBase -1.96736e28 3.87052e59 1.96738e28
12 │ 1000000 TDist{Float64}(ν=0.1) HSM 0.00123069 0.00077329 0.0222707
13 │ 1000 TDist{Float64}(ν=1.5) StatsBase -0.0189519 10.7845 1.62119
14 │ 1000 TDist{Float64}(ν=1.5) HSM -0.000349072 0.0394342 0.158469
15 │ 1000000 TDist{Float64}(ν=1.5) StatsBase 0.0527691 27.5577 1.96788
16 │ 1000000 TDist{Float64}(ν=1.5) HSM 0.00175729 0.00257369 0.0406603
17 │ 1000 Beta{Float64}(α=9.2, β=5.6) StatsBase -0.0223237 0.0146286 0.0988868
18 │ 1000 Beta{Float64}(α=9.2, β=5.6) HSM -0.0015705 0.00102678 0.0260994
19 │ 1000000 Beta{Float64}(α=9.2, β=5.6) StatsBase -0.0175676 0.0144346 0.0976854
20 │ 1000000 Beta{Float64}(α=9.2, β=5.6) HSM -0.000445367 6.02226e-5 0.00628862
- In all cases, the bias of `StatsBase.mode` is at least 4 times higher than that of HSM. It is arbitrarily high when the true data-generating process (DGP) has no mean, as for the Student distribution with less than 1 degree of freedom. Note that the mode of any Student distribution is 0 (or equal to the location parameter), independent of the number of degrees of freedom (see the note after this list).
- The variance of the `StatsBase.mode` estimator can be arbitrarily high as well. In the case of `TDist(0.1)`, it is on the order of $10^{59}$. For `TDist(1.5)` it is around 30. Meanwhile, the variance of HSM is less than 0.005 in both cases.
- The MAE of `StatsBase.mode` is at least 15 times (!) higher than that of HSM and can be made arbitrarily high ($10^{28}$ for `TDist(0.1)`, compared to 0.02 for HSM).
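For reference, the standard Student density with $\nu$ degrees of freedom is

$$f_\nu(x) = \frac{\Gamma\left(\frac{\nu+1}{2}\right)}{\sqrt{\nu\pi}\,\Gamma\left(\frac{\nu}{2}\right)} \left(1 + \frac{x^2}{\nu}\right)^{-\frac{\nu+1}{2}},$$

which is strictly decreasing in $|x|$ for every $\nu > 0$, so its unique mode is at $0$ even when the mean ($\nu \le 1$) or the variance ($\nu \le 2$) does not exist.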
Is `StatsBase.mode` actually meant for computing the mode of arbitrary iterables, including samples from a continuous distribution? Probably not: it simply returns the most common element of the collection, but samples from continuous distributions are almost surely all distinct. Perhaps a separate method is needed to estimate the mode from a sample of floats, like the HSM (a sketch of one possible API is below). If so, I could contribute a PR.
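A minimal sketch of what such an API could look like (the name and dispatch here are purely hypothetical, not an existing StatsBase API; `mode_HSM` is the helper defined in the code at the end of this issue):

# Hypothetical API: a dedicated estimator for continuous samples, leaving
# the exact-counting `mode` untouched for discrete collections.
modeestimate(x::AbstractVector{<:AbstractFloat}; sorted::Bool=false) = mode_HSM(x; sorted)
modeestimate(x) = StatsBase.mode(x)  # fall back to exact counting otherwise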
References
- David R. Bickel, Rudolf Frühwirth, "On a fast, robust estimator of the mode: Comparisons to other robust estimators with applications", Computational Statistics & Data Analysis, Volume 50, Issue 12, 2006, Pages 3500-3530, ISSN 0167-9473, https://doi.org/10.1016/j.csda.2005.07.011.
Code to generate the table
import Pkg
Pkg.activate(temp=true)
Pkg.add(name="StatsBase", version="0.34.5")
Pkg.add(["Distributions", "ThreadsX", "DataFrames"])
Pkg.status()
import ThreadsX, StatsBase
import DataFrames: DataFrame
using Distributions
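# Optionally seed the RNG for reproducible numbers. On Julia >= 1.7 spawned
# tasks fork their RNG state deterministically from the parent, so a single
# seed here should also make the ThreadsX.map draws reproducible.
import Random
Random.seed!(1234)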
# Half-sample mode (HSM) of Bickel & Frühwirth [1]: repeatedly shrink to the
# shortest interval that contains half of the remaining (sorted) observations
# until at most three points remain. Sorts `x` first unless `sorted=true`.
function mode_HSM(x::AbstractVector{T}; sorted::Bool=false)::T where T<:Real
    if !sorted
        x = sort(x) # Allocates a vector of `length(x)` elements
    end
    # The code below does NOT allocate
    start_idx = 1
    end_idx = length(x)
    n = end_idx - start_idx + 1
    while n > 3
        # Find the narrowest window covering half of the remaining points
        N = ceil(Int, n / 2)
        min_width = T(Inf)
        min_start = start_idx
        for i in start_idx:(end_idx - N + 1)
            width = x[i + N - 1] - x[i]
            if width < min_width
                min_width = width
                min_start = i
            end
        end
        start_idx = min_start
        end_idx = min_start + N - 1
        n = end_idx - start_idx + 1
    end
    # Return the estimated mode
    if n == 1
        x[start_idx]
    elseif n == 2
        (x[start_idx] + x[end_idx]) / 2
    else
        d1 = x[start_idx + 1] - x[start_idx]
        d2 = x[start_idx + 2] - x[start_idx + 1]
        if d1 < d2
            (x[start_idx] + x[start_idx + 1]) / 2
        elseif d2 < d1
            (x[start_idx + 1] + x[start_idx + 2]) / 2
        else
            x[start_idx + 1]
        end
    end
end
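# Quick sanity check of the estimator (illustrative; the output varies with
# the RNG): the true mode of Beta(9.2, 5.6) is (α - 1)/(α + β - 2) ≈ 0.6406,
# and the HSM estimate should land close to it.
@show mode_HSM(rand(Beta(9.2, 5.6), 10^5))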
# Monte Carlo comparison: draw `nsamples` independent datasets of size `nobs`
# from `dist` and summarize the bias, variance, and MAE of both estimators.
function compare(nobs::Integer, dist::ContinuousUnivariateDistribution, nsamples::Integer=1000)
    mode_true = Distributions.mode(dist)
    the_modes = ThreadsX.map(
        _ -> begin
            samples = rand(dist, nobs)
            StatsBase.mode(samples), mode_HSM(samples)
        end, 1:nsamples
    )
    DataFrame([
        (;
            nobs, dist,
            method="StatsBase",
            bias=mean(m1 for (m1, m2) in the_modes) - mode_true,
            variance=var(m1 for (m1, m2) in the_modes),
            MAE=mean(abs(m1 - mode_true) for (m1, m2) in the_modes)
        ),
        (;
            nobs, dist,
            method="HSM",
            bias=mean(m2 for (m1, m2) in the_modes) - mode_true,
            variance=var(m2 for (m1, m2) in the_modes),
            MAE=mean(abs(m2 - mode_true) for (m1, m2) in the_modes)
        )
    ])
end
# Run one comparison and append it to the running results table.
function compare_append(df_full::Union{Nothing, DataFrame}, nobs, dist, nsamples=1000; quiet::Bool=false)
    df = compare(nobs, dist, nsamples)
    quiet || display(df)
    isnothing(df_full) ? df : vcat(df_full, df)
end
df_full = let
    @info "Starting benchmark..."
    df_full = compare_append(nothing, 10^3, Normal(0, 1))
    df_full = compare_append(df_full, 10^6, Normal(0, 1))
    df_full = compare_append(df_full, 10^3, Normal(-1, 1))
    df_full = compare_append(df_full, 10^6, Normal(-1, 1))
    # Mean and variance may not exist, but the mode is always zero
    df_full = compare_append(df_full, 10^3, TDist(0.1)) # Mean doesn't exist
    df_full = compare_append(df_full, 10^6, TDist(0.1))
    df_full = compare_append(df_full, 10^3, TDist(1.5)) # Variance doesn't exist
    df_full = compare_append(df_full, 10^6, TDist(1.5))
    # The mode of Beta(α, β) only exists when α, β > 1
    df_full = compare_append(df_full, 10^3, Beta(9.2, 5.6))
    df_full = compare_append(df_full, 10^6, Beta(9.2, 5.6))
    println("\n\n")
    display(df_full)
    df_full
end