
StatsBase.mode can have arbitrarily high bias and variance for continuous distributions #957

Description

@ForceBru

This function can compute the mode for arbitrary iterables, including vectors of floats:

# compute mode over arbitrary iterable
function mode(a)
    isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
    cnts = Dict{eltype(a),Int}()
    # first element
    mc = 1
    mv, st = iterate(a)
    cnts[mv] = 1
    # find the mode along with table construction
    y = iterate(a, st)
    while y !== nothing
        x, st = y
        if haskey(cnts, x)
            c = (cnts[x] += 1)
            if c > mc
                mc = c
                mv = x
            end
        else
            cnts[x] = 1
            # in this case: c = 1, and thus c > mc won't happen
        end
        y = iterate(a, st)
    end
    return mv
end

Indeed, it's called to compute the mode of a vector of floats:

julia> @which StatsBase.mode([1.,3.,3.141])
mode(a)
     @ StatsBase ~/.julia/packages/StatsBase/s2YJR/src/scalarstats.jl:111

However, it has much higher bias and variance than, for example, the half-sample mode (HSM; see [1]). This is easily verified by simulation.
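
In the table below, each row is based on $K = 1000$ Monte Carlo replications; with $\hat m_k$ the estimate on replication $k$ and $m$ the true mode, $\mathrm{bias} = \frac{1}{K}\sum_k \hat m_k - m$, $\mathrm{variance}$ is the sample variance of the $\hat m_k$, and $\mathrm{MAE} = \frac{1}{K}\sum_k \lvert\hat m_k - m\rvert$, exactly as computed in the code at the end of this issue: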

 Row │ nobs     dist                            method     bias          variance     MAE        
     │ Int64    Distribu…                       String     Float64       Float64      Float64    
─────┼───────────────────────────────────────────────────────────────────────────────────────────
   1 │    1000  Normal{Float64}(μ=0.0, σ=1.0)   StatsBase  -0.0105188     1.03791     0.818895
   2 │    1000  Normal{Float64}(μ=0.0, σ=1.0)   HSM        -0.00256086    0.0550354   0.189431
   3 │ 1000000  Normal{Float64}(μ=0.0, σ=1.0)   StatsBase  -0.0242424     1.03698     0.820519
   4 │ 1000000  Normal{Float64}(μ=0.0, σ=1.0)   HSM        -0.00500615    0.00342947  0.0479765
   5 │    1000  Normal{Float64}(μ=-1.0, σ=1.0)  StatsBase   0.00682327    0.956165    0.788644
   6 │    1000  Normal{Float64}(μ=-1.0, σ=1.0)  HSM        -0.00291935    0.0560346   0.189825
   7 │ 1000000  Normal{Float64}(μ=-1.0, σ=1.0)  StatsBase   0.0167116     0.955203    0.778444
   8 │ 1000000  Normal{Float64}(μ=-1.0, σ=1.0)  HSM        -0.00111594    0.00361161  0.0485056
   9 │    1000  TDist{Float64}(ν=0.1)           StatsBase   1.68401e33    2.77662e69  1.68401e33
  10 │    1000  TDist{Float64}(ν=0.1)           HSM         0.00730426    0.0190662   0.110146
  11 │ 1000000  TDist{Float64}(ν=0.1)           StatsBase  -1.96736e28    3.87052e59  1.96738e28
  12 │ 1000000  TDist{Float64}(ν=0.1)           HSM         0.00123069    0.00077329  0.0222707
  13 │    1000  TDist{Float64}(ν=1.5)           StatsBase  -0.0189519    10.7845      1.62119
  14 │    1000  TDist{Float64}(ν=1.5)           HSM        -0.000349072   0.0394342   0.158469
  15 │ 1000000  TDist{Float64}(ν=1.5)           StatsBase   0.0527691    27.5577      1.96788
  16 │ 1000000  TDist{Float64}(ν=1.5)           HSM         0.00175729    0.00257369  0.0406603
  17 │    1000  Beta{Float64}(α=9.2, β=5.6)     StatsBase  -0.0223237     0.0146286   0.0988868
  18 │    1000  Beta{Float64}(α=9.2, β=5.6)     HSM        -0.0015705     0.00102678  0.0260994
  19 │ 1000000  Beta{Float64}(α=9.2, β=5.6)     StatsBase  -0.0175676     0.0144346   0.0976854
  20 │ 1000000  Beta{Float64}(α=9.2, β=5.6)     HSM        -0.000445367   6.02226e-5  0.00628862

  • In all cases, the absolute bias of StatsBase.mode is at least twice that of HSM, and usually many times higher. It becomes arbitrarily large when the true data-generating process (DGP) has no mean, as for the Student distribution with fewer than one degree of freedom. Note that the mode of any Student distribution is 0 (or the location parameter), regardless of the degrees of freedom.
  • The variance of the StatsBase.mode estimator can be arbitrarily high as well: for TDist(0.1) it's on the order of $10^{59}$, and for TDist(1.5) it ranges from about 11 to 28. Meanwhile, the variance of HSM stays below 0.04 in all four Student-t rows.
  • The MAE of StatsBase.mode is at least ~4 times (!) higher than that of HSM (over 15 times higher on the million-observation samples) and can likewise be made arbitrarily high ($10^{28}$ for TDist(0.1), compared to 0.02 for HSM).

Is StatsBase.mode actually meant for computing the mode of arbitrary iterables, including samples from a continuous distribution? Probably not: it simply returns the most common element of the collection, but samples from continuous distributions are almost surely unique. Perhaps a separate method is needed to estimate the mode from a sample of floats, like the HSM. If so, I could contribute a PR.
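
To make the failure concrete: when all sample values are distinct, the c > mc branch in the loop above never fires, so mode simply returns the first element it iterates over. A quick check (hypothetical REPL session; the equality holds whenever all elements are distinct):

julia> using StatsBase

julia> x = randn(10^6);

julia> StatsBase.mode(x) == first(x)  # every count stays at 1, so `mv` is never updated
true

This also explains the numbers in the table: the estimator's sampling distribution is the DGP itself, so its bias and variance inherit the distribution's mean and variance, which for TDist(ν) don't exist when ν ≤ 1 and ν ≤ 2, respectively.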

References

  1. David R. Bickel, Rudolf Frühwirth, "On a fast, robust estimator of the mode: Comparisons to other robust estimators with applications", Computational Statistics & Data Analysis, Volume 50, Issue 12, 2006, Pages 3500-3530, ISSN 0167-9473, https://doi.org/10.1016/j.csda.2005.07.011.

Code to generate the table

import Pkg
Pkg.activate(temp=true)
Pkg.add(name="StatsBase", version="0.34.5")
Pkg.add(["Distributions", "ThreadsX", "DataFrames"])
Pkg.status()

import ThreadsX, StatsBase
import DataFrames: DataFrame
using Distributions

# Half-sample mode (HSM) of Bickel & Frühwirth [1].
# Pass `sorted=true` if `x` is already sorted to skip the sorting copy.
function mode_HSM(x::AbstractVector{T}; sorted::Bool=false)::T where T<:Real
    if !sorted
        x = sort(x) # Allocates a new vector of `length(x)` elements
    end

    # The search below does not allocate
    start_idx = 1
    end_idx = length(x)
    n = end_idx - start_idx + 1

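    # Repeatedly shrink to the narrowest window of ceil(n/2) consecutive sorted points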
    while n > 3
        N = ceil(Int, n / 2)
        min_width = T(Inf)
        min_start = start_idx
        for i in start_idx:(end_idx - N + 1)
            width = x[i + N - 1] - x[i]
            if width < min_width
                min_width = width
                min_start = i
            end
        end
        start_idx = min_start
        end_idx = min_start + N - 1
        n = end_idx - start_idx + 1
    end

    # Return the estimated mode
    if n == 1
        x[start_idx]
    elseif n == 2
        (x[start_idx] + x[end_idx]) / 2
    else
        d1 = x[start_idx + 1] - x[start_idx]
        d2 = x[start_idx + 2] - x[start_idx + 1]
        if d1 < d2
            (x[start_idx] + x[start_idx + 1]) / 2
        elseif d2 < d1
            (x[start_idx + 1] + x[start_idx + 2]) / 2
        else
            x[start_idx + 1]
        end
    end
end
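
As a quick sanity check of mode_HSM (hypothetical snippet; outputs vary with the RNG seed):

import StatsBase
using Distributions

x = rand(TDist(0.1), 10^5)
@show mode_HSM(x)       # true mode is 0; typically lands within ~0.1 of it (cf. table rows 10 and 12)
@show StatsBase.mode(x) # an arbitrary sample point; can be astronomically far from 0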

function compare(nobs::Integer, dist::ContinuousUnivariateDistribution, nsamples::Integer=1000)
	mode_true = Distributions.mode(dist)
	the_modes = ThreadsX.map(
		_->begin
			samples = rand(dist, nobs)
			StatsBase.mode(samples), mode_HSM(samples)
		end, 1:nsamples
	)

	DataFrame([
		(;
			nobs, dist,
			method="StatsBase",
			bias=mean(m1 for (m1, m2) in the_modes) - mode_true,
			variance=var(m1 for (m1, m2) in the_modes),
			MAE=mean(abs(m1 - mode_true) for (m1, m2) in the_modes)
		),
		(;
			nobs, dist,
			method="HSM",
			bias=mean(m2 for (m1, m2) in the_modes) - mode_true,
			variance=var(m2 for (m1, m2) in the_modes),
			MAE=mean(abs(m2 - mode_true) for (m1, m2) in the_modes)
		)
	])
end

function compare_append(df_full::Union{Nothing, DataFrame}, nobs, dist, nsamples=1000; quiet::Bool=false)
	df = compare(nobs, dist, nsamples)
	quiet || display(df)
	isnothing(df_full) ? df : vcat(df_full, df)
end

df_full = let
	@info "Starting benchmark..."
	df_full = compare_append(nothing, 10^3, Normal(0, 1))
	df_full = compare_append(df_full, 10^6, Normal(0, 1))

	df_full = compare_append(df_full, 10^3, Normal(-1, 1))
	df_full = compare_append(df_full, 10^6, Normal(-1, 1))

	# Mean and variance may not exist, but the mode is always zero
	df_full = compare_append(df_full, 10^3, TDist(0.1)) # Mean doesn't exist
	df_full = compare_append(df_full, 10^6, TDist(0.1))
	df_full = compare_append(df_full, 10^3, TDist(1.5)) # Variance doesn't exist
	df_full = compare_append(df_full, 10^6, TDist(1.5))

	# Mode only exists when α, β > 1
	df_full = compare_append(df_full, 10^3, Beta(9.2, 5.6))
	df_full = compare_append(df_full, 10^6, Beta(9.2, 5.6))

	println("\n\n")
	display(df_full)
	df_full
end
