Merge pull request #4 from devmotion/updates

devmotion · web-flow · commit 465fabce0aa0 · 2019-09-20T17:02:26.000+02:00
Update CI and docs
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -2,6 +2,7 @@
 environment:
   matrix:
   - julia_version: 1.1
+  - julia_version: 1.2
   - julia_version: nightly
 platform:
   - x86
@@ -26,6 +27,3 @@ build_script:
 test_script:
   - echo "%JL_TEST_SCRIPT%"
   - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%"
-on_success:
-  - echo "%JL_CODECOV_SCRIPT%"
-  - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
diff --git a/.travis.yml b/.travis.yml
@@ -5,6 +5,7 @@ os:
   - osx
 julia:
   - 1.1
+  - 1.2
   - nightly
 matrix:
   allow_failures:
@@ -13,18 +14,18 @@ matrix:
 notifications:
   email: false
 after_success:
-  - if [[ $TRAVIS_JULIA_VERSION = 1.1 ]] && [[ $TRAVIS_OS_NAME = linux ]]; then
+  - if [[ $TRAVIS_JULIA_VERSION = 1.2 ]] && [[ $TRAVIS_OS_NAME = linux ]]; then
       julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())';
       julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Coveralls.submit(process_folder())';
     fi
 jobs:
   include:
     - stage: Documentation
-      julia: 1.1
+      julia: 1.2
       os: linux
       env:
         - GKSwstype=nul
       script:
         - julia --project=docs -e 'using Pkg; Pkg.instantiate()'
         - julia --project=docs --color=yes docs/make.jl
-      after_success: skip
+      after_success: skip
diff --git a/Project.toml b/Project.toml
@@ -12,6 +12,10 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
+DataStructures = "0.17"
+Distances = "0.8.2"
+Parameters = "0.12"
+StatsBase = "0.32"
 julia = "1.1"
 
 [extras]
diff --git a/docs/Manifest.toml b/docs/Manifest.toml
@@ -34,10 +34,10 @@ uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
 version = "0.8.0"
 
 [[Colors]]
-deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport", "Test"]
-git-tree-sha1 = "9f0a0210450acb91c730b730a994f8eef1d3d543"
+deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
+git-tree-sha1 = "c9c1845d6bf22e34738bee65c357a69f416ed5d1"
 uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
-version = "0.9.5"
+version = "0.9.6"
 
 [[Compat]]
 deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
@@ -72,9 +72,9 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 
 [[Distances]]
 deps = ["LinearAlgebra", "Statistics"]
-git-tree-sha1 = "44bd29b50552dfd0a0b674b925de2719f3b9bb0b"
+git-tree-sha1 = "23717536c81b63e250f682b0e0933769eecd1411"
 uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
-version = "0.8.1"
+version = "0.8.2"
 
 [[Distributed]]
 deps = ["Random", "Serialization", "Sockets"]
@@ -94,15 +94,15 @@ version = "0.8.0"
 
 [[Documenter]]
 deps = ["Base64", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"]
-git-tree-sha1 = "c61d6eedbc3c4323c08b64af12d29c8ee0fcbb5f"
+git-tree-sha1 = "1b6ae3796f60311e39cd1770566140d2c056e87f"
 uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-version = "0.23.2"
+version = "0.23.3"
 
 [[FFMPEG]]
 deps = ["BinaryProvider", "Libdl"]
-git-tree-sha1 = "1dd2128ff10894081f30931b355dc892d1352de9"
+git-tree-sha1 = "f65cf703281fb7917beca5ead1c67e6d60ef9597"
 uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a"
-version = "0.2.2"
+version = "0.2.3"
 
 [[FixedPointNumbers]]
 git-tree-sha1 = "d14a6fa5890ea3a7e5dcab6811114f132fec2b4b"
@@ -117,9 +117,9 @@ version = "0.41.0"
 
 [[GeometryTypes]]
 deps = ["ColorTypes", "FixedPointNumbers", "IterTools", "LinearAlgebra", "StaticArrays"]
-git-tree-sha1 = "2b0bfb379a54bdfcd2942f388f7d045f8952373d"
+git-tree-sha1 = "4bf5706f3b9a2c5adbbc473c8c91582c1fa816a3"
 uuid = "4d00f742-c7ba-57c2-abde-4428a4b178cb"
-version = "0.7.5"
+version = "0.7.6"
 
 [[InteractiveUtils]]
 deps = ["Markdown"]
@@ -148,9 +148,9 @@ uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
 [[Literate]]
 deps = ["Base64", "JSON", "REPL"]
-git-tree-sha1 = "04913ce466978fad4eb666c9f5fafc718fcc4366"
+git-tree-sha1 = "707c58359f2de555ace074313baea957c3187f2b"
 uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
-version = "2.0.3"
+version = "2.0.4"
 
 [[Logging]]
 uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
@@ -166,10 +166,9 @@ uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e"
 version = "0.3.0"
 
 [[Missings]]
-deps = ["SparseArrays", "Test"]
-git-tree-sha1 = "f0719736664b4358aa9ec173077d4285775f8007"
+git-tree-sha1 = "29858ce6c8ae629cf2d733bffa329619a1c843d0"
 uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
-version = "0.4.1"
+version = "0.4.2"
 
 [[Mmap]]
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"
@@ -188,21 +187,21 @@ version = "1.1.0"
 
 [[PDMats]]
 deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"]
-git-tree-sha1 = "9d6a9b3e19634612fb1edcafc4b1d75242b24bde"
+git-tree-sha1 = "035f8d60ba2a22cb1d2580b1e0e5ce0cb05e4563"
 uuid = "90014a1f-27ba-587c-ab20-58faa44d9150"
-version = "0.9.9"
+version = "0.9.10"
 
 [[Parameters]]
 deps = ["OrderedCollections"]
-git-tree-sha1 = "1dfd7cd50a8eb06ef693a4c2bbe945943cd000c5"
+git-tree-sha1 = "b62b2558efb1eef1fa44e4be5ff58a515c287e38"
 uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a"
-version = "0.11.0"
+version = "0.12.0"
 
 [[Parsers]]
 deps = ["Dates", "Test"]
-git-tree-sha1 = "db2b35dedab3c0e46dc15996d170af07a5ab91c9"
+git-tree-sha1 = "ef0af6c8601db18c282d092ccbd2f01f3f0cd70b"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
-version = "0.3.6"
+version = "0.3.7"
 
 [[Pkg]]
 deps = ["Dates", "LibGit2", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"]
@@ -222,9 +221,9 @@ version = "0.5.8"
 
 [[Plots]]
 deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryTypes", "JSON", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "Reexport", "Requires", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"]
-git-tree-sha1 = "f2aa8a7b5bc0ccec57a1237a97b6f59fc8d9ef57"
+git-tree-sha1 = "59bcea95a16912abb229209c9f6e9e218df44b7c"
 uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
-version = "0.26.2"
+version = "0.26.3"
 
 [[Printf]]
 deps = ["Unicode"]
@@ -325,7 +324,7 @@ uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 version = "0.8.0"
 
 [[SuiteSparse]]
-deps = ["Libdl", "LinearAlgebra", "SparseArrays"]
+deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"]
 uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
 
 [[Test]]
diff --git a/docs/src/background.md b/docs/src/background.md
@@ -5,7 +5,7 @@
 A probabilistic model predicts a probability distribution of possible outputs
 for a given input.
 
-A very simple probabilistic model is a model that predicts a uniform 
+A very simple probabilistic model is a model that predicts a uniform
 distribution for a dice roll; there is no input and the possible outputs are
 the numbers $1,2,3,4,5,6$. A probably more complicated probabilistic model
 would be a model that predicts the distribution of stock price changes from
@@ -34,5 +34,5 @@ and [integral probability metrics](https://arxiv.org/pdf/0901.2698.pdf).
 Here we restrict ourselves to classification models, i.e., models for which output
 $Y$ takes only values from a finite set.
 
-The dice roll model above is a classification model, whereas the model that predicts
-stock price changes is not.
+The dice roll model above is a classification model, whereas the model that
+predicts stock price changes is not.
diff --git a/docs/src/calibration.md b/docs/src/calibration.md
@@ -34,8 +34,9 @@ is $x$, the long-run relative frequency of rain is also $x$".
 
 Commonly (see, e.g,
 [Guo et al. (2017)](http://proceedings.mlr.press/v70/guo17a/guo17a.pdf)), only
-calibration of the largest predictions $\max_y g_y(x)$ of a model $g$ is considered.
-According to this common notion a model is calibrated if almost always
+calibration of the most-confident predictions $\max_y g_y(x)$ of a model $g$ is
+considered. According to this common notion a model is calibrated if almost
+always
 ```math
     \mathbb{P}[Y = \textrm{arg} \, \max_y g_y(X) \,|\, \max_y g_y(X)] = \max_y g_y(X).
 ```
@@ -44,9 +45,102 @@ According to this common notion a model is calibrated if almost always
 
 According to the more general definition by
 [Bröcker (2009)](https://rmets.onlinelibrary.wiley.com/doi/pdf/10.1002/qj.456)
-and [Vaicenavicius et al.](http://proceedings.mlr.press/v89/vaicenavicius19a/vaicenavicius19a.pdf),
+and [Vaicenavicius et al. (2019)](http://proceedings.mlr.press/v89/vaicenavicius19a/vaicenavicius19a.pdf),
 a probabilistic model $g$ is calibrated if almost always
 ```math
     \mathbb{P}[Y = y \,|\, g(X)] = g_y(X)
 ```
-for all classes $y$.
+for all classes $y$.
+
+For classification problems with more than two classes, this definition of
+calibration is stronger than the more common one above. By reducing the model
+and applying the strong notion to the simplified model, however, this definition
+still allows one to investigate the calibration of the model with respect to only
+certain aspects of interest such as the calibration of the most-confident
+predictions.
+
+Thus in this Julia package and its documentation, we always refer to the strong
+notion of calibration.
+
+Let $y_1, \ldots, y_m$ be the possible outputs. Then we can also define
+calibration in a vectorized form. Equivalently to the definition above, a model
+$g$ is calibrated if and only if
+```math
+    r(g(X)) - g(X) = 0
+```
+holds almost always, where
+```math
+    r(\xi) := (\mathbb{P}[Y = y_1 \,|\, g(X) = \xi], \ldots, \mathbb{P}[Y = y_m \,|\, g(X) = \xi])
+```
+denotes the so-called calibration function.
+
+## Measures
+
+Calibration measures allow a more fine-tuned analysis of calibration and enable
+comparisons of calibration of different models. Intuitively, calibration
+measures quantify the deviation between the left- and right-hand sides of the
+definitions above.
+
+### Expected calibration error (ECE)
+
+The most common calibration measure is the so-called expected calibration error
+(ECE) (see, e.g.,
+[Guo et al. (2017)](http://proceedings.mlr.press/v70/guo17a/guo17a.pdf)).
+Informally, it is defined as the average distance between the left and right
+hand side of the definition above with respect to some metric. Mathematically,
+the expected calibration error of model $g$ with respect to distance measure
+$d$ is defined as
+```math
+    \mathrm{ECE}[d, g] := \mathbb{E}[d(r(g(X)), g(X))].
+```
+Here $d$ could be, e.g., the cityblock distance, the total variation distance,
+or the squared Euclidean distance.
+
+If $d(p, q) = 0$ if and only if $p = q$, then the ECE of model $g$ with respect
+to distance measure $d$ is zero if and only if $g$ is calibrated.
+
+### Calibration error (CE)
+
+More generally, Widmann et al. (2019) define the calibration error (CE) of
+a model $g$ with respect to a function class $\mathcal{F} \subset \{f \colon
+\Delta^m \to \mathbb{R}^m\}$ as
+```math
+    \mathrm{CE}[\mathcal{F}, g] := \sup_{f \in \mathcal{F}} \mathbb{E}[(r(g(X)) - g(X))^\intercal f(g(X))].
+```
+
+If model $g$ is calibrated, then the CE is zero, regardless of the choice of
+$\mathcal{F}$. However, for some function spaces (e.g., for
+$\mathcal{F} = \{0\}$) the CE is zero even if $g$ is not calibrated.
+
+Interestingly, the ECE with respect to the cityblock distance, the total
+variation distance, and the squared Euclidean distance are all special cases
+of the CE (Widmann et al. (2019)).
+
+### Kernel calibration error (KCE)
+
+The kernel calibration error (KCE) is another special case of the CE, in which
+the unit ball of a reproducing kernel Hilbert space (RKHS) of vector-valued
+functions is chosen as function space $\mathcal{F}$.
+
+An RKHS of vector-valued functions $f \colon \Delta^m \to \mathbb{R}^m$ can be
+identified with a unique matrix-valued kernel $k \colon \Delta^m \times
+\Delta^m \to \mathbb{R}^{m \times m}$. Then the KCE of a model $g$ with respect
+to kernel $k$ is defined as
+```math
+    \mathrm{KCE}[k, g] := \mathrm{CE}[\mathcal{F}, g],
+```
+where $\mathcal{F}$ is the unit ball of the RKHS corresponding to kernel $k$.
+
+As Widmann et al. (2019) show, for a large class of kernels (so-called universal
+kernels) the KCE is zero if and only if the model $g$ is calibrated. Moreover,
+the KCE can be formulated in terms of the kernel $k$ as
+```math
+    \mathrm{KCE}[k, g] = {\left(\mathbb{E}[(e_Y - g(X))^{\intercal} k(g(X), g(X')) (e_{Y'} - g(X'))]\right)}^{1/2},
+```
+where $(X',Y')$ is an independent copy of $(X,Y)$ and $e_i$ denotes the $i$th
+unit vector.
+
+The so-called maximum mean calibration error (MMCE), proposed by
+[Kumar et al. (2018)](http://proceedings.mlr.press/v80/kumar18a/kumar18a.pdf),
+can be viewed as a special case of the KCE, in which only the most-confident
+predictions are considered (Widmann et al. (2019)).
diff --git a/examples/distribution.jl b/examples/distribution.jl
@@ -5,10 +5,10 @@
 
 # ## Introduction
 #
-# This example is taken from the forthcoming publication
-# "Calibration tests in multi-class classification: A unifying framework" by Widmann, Lindsten,
-# and Zachariah.
-# 
+# This example is taken from the publication
+# "Calibration tests in multi-class classification: A unifying framework" by Widmann,
+# Lindsten, and Zachariah.
+#
 # We estimate calibration errors of the model
 # ```math
 # \begin{aligned}
@@ -27,7 +27,7 @@
 # probability simplex, and $\beta = (1, 0, \ldots, 0)$.
 #
 # In our experiments we sample 250 predictions from the Dirichlet distribution
-# $\textrm{Dir}(\alpha)$, and then we generate corresponding labels according to 
+# $\textrm{Dir}(\alpha)$, and then we generate corresponding labels according to
 # the model stated above, for different choices of $\pi$ and number of classes $m$.
 #
 # We evaluate the standard estimators of expected calibration error (ECE) based on a
@@ -60,7 +60,7 @@ gr(fmt = :png, dpi = 600)
 
 # ## Estimates
 #
-# 
+#
 
 function estimates(rng::AbstractRNG, estimator, π::Real, m::Int)
     ## check arguments
@@ -243,4 +243,4 @@ plot(data)
 
 Random.seed!(1234)
 data = estimates(x -> LinearUnbiasedSKCE(kernel(x)))
-plot(data)
+plot(data)