* Bump Turing to 0.41
* Update initial_params stuff for 0.41
* Update more initial_params stuff
* istrans -> is_transformed
* Minor text fixes
* Remove SamplingContext
* Fix loss of concreteness in MCMCChains
* Fix flaky test in linear regression doc
* Try StableRNG
* Make GMM tutorial a bit less demanding
* Fix typo
developers/compiler/design-overview/index.qmd (2 additions & 0 deletions)
@@ -36,6 +36,7 @@ The following are the main jobs of the `@model` macro:
## The model

+<!-- Very outdated
A `model::Model` is a callable struct that one can sample from by calling
```{julia}
@@ -49,6 +50,7 @@ and `context` is a sampling context that can, e.g., modify how the log probabili
Sampling resets the log joint probability of `varinfo` and increases the evaluation counter of `sampler`. If `context` is a `LikelihoodContext`,
only the log likelihood of `D` will be accumulated, whereas with `PriorContext` only the log prior probability of `P` is. With the `DefaultContext` the log joint probability of both `P` and `D` is accumulated.
+-->

The `Model` struct contains the four internal fields `f`, `args`, `defaults`, and `context`.
When `model::Model` is called, then the internal function `model.f` is called as `model.f(rng, varinfo, sampler, context, model.args...)`
@@ -104,7 +104,7 @@ Note that `vi_linked[vn_x]` can also be used as shorthand for `getindex(vi_linked, vn_x)`.
We can see (for this linked varinfo) that there are _two_ differences between these outputs:

1. _The internal representation has been transformed using the bijector (in this case, the log function)._
-   This means that the `istrans()` flag which we used above doesn't modify the model representation: it only tells us whether the internal representation has been transformed or not.
+   This means that the `is_transformed()` flag which we used above doesn't modify the model representation: it only tells us whether the internal representation has been transformed or not.

2. _The internal representation is a vector, whereas the model representation is a scalar._
   This is because in DynamicPPL, _all_ internal values are vectorised (i.e. converted into some vector), regardless of distribution. On the other hand, since the model specifies a univariate distribution, the model representation is a scalar.
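For point 1 above, here is a small editorial sketch (not part of this diff) of the transformation in question, assuming Bijectors.jl's `bijector` and `inverse`:

```julia
using Bijectors, Distributions

dist = LogNormal()
b = bijector(dist)     # for a positive-support distribution such as LogNormal, this is the log function
x = 2.0                # model representation (constrained scalar)
y = b(x)               # internal / linked representation (unconstrained); DynamicPPL additionally stores it vectorised
x ≈ inverse(b)(y)      # the inverse bijector recovers the model representation
```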
@@ -131,7 +131,7 @@ Before that, though, we'll take a quick high-level look at how the HMC sampler i
While DynamicPPL provides the _functionality_ for transforming variables, the transformation itself happens at an even higher level, i.e. in the sampler itself.
The HMC sampler in Turing.jl is in [this file](https://github.com/TuringLang/Turing.jl/blob/5b24cebe773922e0f3d5c4cb7f53162eb758b04d/src/mcmc/hmc.jl).
In the first step of sampling, it calls `link` on the sampler.
-This transformation is preserved throughout the sampling process, meaning that `istrans()` always returns true.
+This transformation is preserved throughout the sampling process, meaning that `is_transformed()` always returns true.

We can observe this by inserting print statements into the model.
Here, `__varinfo__` is the internal symbol for the `VarInfo` object used in model evaluation:
(Here, the check on `if x isa AbstractFloat` prevents the printing from occurring during computation of the derivative.)
-You can see that during the three sampling steps, `istrans` is always kept as `true`.
+You can see that during the three sampling steps, `is_transformed` is always kept as `true`.
::: {.callout-note}
-The first two model evaluations where `istrans` is `false` occur prior to the actual sampling.
+The first two model evaluations where `is_transformed` is `false` occur prior to the actual sampling.
One occurs when the model is checked for correctness (using [`DynamicPPL.check_model_and_trace`](https://github.com/TuringLang/DynamicPPL.jl/blob/ba490bf362653e1aaefe298364fe3379b60660d3/src/debug_utils.jl#L582-L612)).
The second occurs because the model is evaluated once to generate a set of initial parameters inside [DynamicPPL's implementation of `AbstractMCMC.step`](https://github.com/TuringLang/DynamicPPL.jl/blob/ba490bf362653e1aaefe298364fe3379b60660d3/src/sampler.jl#L98-L117).
Both of these steps occur with all samplers in Turing.jl, so are not specific to the HMC example shown here.
@@ -169,7 +169,7 @@ The biggest prerequisite for this to work correctly is that the potential energy
This is exactly the same as how we had to make sure to define `logq(y)` correctly in the toy HMC example above.

Within Turing.jl, this is correctly handled because a statement like `x ~ LogNormal()` in the model definition above is translated into `assume(LogNormal(), @varname(x), __varinfo__)`, defined [here](https://github.com/TuringLang/DynamicPPL.jl/blob/ba490bf362653e1aaefe298364fe3379b60660d3/src/context_implementations.jl#L225-L229).
-If you follow the trail of function calls, you can verify that the `assume` function does indeed check for the presence of the `istrans` flag and adds the Jacobian term accordingly.
+If you follow the trail of function calls, you can verify that the `assume` function does indeed check for the presence of the `is_transformed` flag and adds the Jacobian term accordingly.
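As an editorial aside (not part of this diff), the Jacobian term can be written out explicitly for the LogNormal case used here: if `x ~ LogNormal()` has density $p(x)$ and the linked value is $y = \log x$, then

$$
q(y) = p(e^{y}) \left|\frac{\mathrm{d}}{\mathrm{d}y} e^{y}\right| = p(e^{y})\, e^{y},
\qquad
\log q(y) = \log p(e^{y}) + y,
$$

and the extra $+\,y$ is exactly the log-Jacobian term that `assume` adds when the variable is linked.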
## A deeper dive into DynamicPPL's internal machinery
The purpose of having all of this machinery is to allow other parts of DynamicPPL, such as the tilde pipeline, to handle transformed variables correctly.
-The following diagram shows how `assume` first checks whether the variable is transformed (using `istrans`), and then applies the appropriate transformation function.
+The following diagram shows how `assume` first checks whether the variable is transformed (using `is_transformed`), and then applies the appropriate transformation function.

<!-- 'wrappingWidth' setting required because of https://github.com/mermaid-js/mermaid-cli/issues/112#issuecomment-2352670995 -->
```{mermaid}
@@ -246,7 +246,7 @@ graph TD
A["x ~ LogNormal()"]:::boxStyle
B["vn = <span style='color:#3B6EA8 !important;'>@varname</span>(x)<br>dist = LogNormal()<br>x, vi = ..."]:::boxStyle

Here, `with_logabsdet_jacobian` is defined [in the ChangesOfVariables.jl package](https://juliamath.github.io/ChangesOfVariables.jl/stable/api/#ChangesOfVariables.with_logabsdet_jacobian), and returns both the effect of the transformation `f` as well as the log Jacobian term.
-Because we chose `f` appropriately, we find here that `x` is always the model representation; furthermore, if the variable was _not_ linked (i.e. `istrans` was false), the log Jacobian term will be zero.
+Because we chose `f` appropriately, we find here that `x` is always the model representation; furthermore, if the variable was _not_ linked (i.e. `is_transformed` was false), the log Jacobian term will be zero.
However, if it was linked, then the Jacobian term would be appropriately included, making sure that sampling proceeds correctly.
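As an editorial sketch (not part of this diff), and assuming that the inverse bijector returned by Bijectors.jl supports the `with_logabsdet_jacobian` API described above, the LogNormal case behaves roughly as follows:

```julia
using Bijectors, Distributions
using ChangesOfVariables: with_logabsdet_jacobian

dist = LogNormal()
f = inverse(bijector(dist))                # maps the internal (log-space) value back to model space
y = 0.5                                    # a linked / internal value
x, logjac = with_logabsdet_jacobian(f, y)  # x should equal exp(0.5), i.e. the model representation
logjac ≈ y                                 # log |d exp(y)/dy| = y, the log Jacobian term
```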
@assert bayes_train_loss < bayes_test_loss "Bayesian training loss ($bayes_train_loss) >= Bayesian test loss ($bayes_test_loss)"
@assert ols_train_loss < ols_test_loss "OLS training loss ($ols_train_loss) >= OLS test loss ($ols_test_loss)"
-@assert isapprox(bayes_train_loss, ols_train_loss; rtol=0.01) "Difference between Bayesian training loss ($bayes_train_loss) and OLS training loss ($ols_train_loss) unexpectedly large!"
-@assert isapprox(bayes_test_loss, ols_test_loss; rtol=0.05) "Difference between Bayesian test loss ($bayes_test_loss) and OLS test loss ($ols_test_loss) unexpectedly large!"
+@assert bayes_train_loss > ols_train_loss "Bayesian training loss ($bayes_train_loss) <= OLS training loss ($ols_train_loss)"
+@assert bayes_test_loss < ols_test_loss "Bayesian test loss ($bayes_test_loss) >= OLS test loss ($ols_test_loss)"
end
```

-As we can see above, OLS and our Bayesian model fit our training and test data set about the same.
+We can see from this that both linear regression techniques perform fairly similarly.
+The Bayesian linear regression approach performs worse on the training set, but better on the test set.
+This indicates that the Bayesian approach is more able to generalise to unseen data, i.e., it is not overfitting the training data as much.
tutorials/bayesian-poisson-regression/index.qmd (4 additions & 1 deletion)
@@ -182,7 +182,10 @@ We use the Gelman, Rubin, and Brooks Diagnostic to check whether our chains have converged.
We expect the chains to have converged. This is because we have taken a sufficient number of iterations (1500) for the NUTS sampler. However, in case the test fails, we will have to take a larger number of iterations, resulting in longer computation time.

```{julia}
-gelmandiag(chain)
+# Because some of the sampler statistics are `missing`, we need to extract only
+# the parameters and then concretize the array so that `gelmandiag` can be computed.
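# (Editorial sketch — the added code itself is not shown in this extract. Assuming
# MCMCChains' `names(chain, :parameters)`, the `Chains(values, names)` constructor,
# and the exported `gelmandiag` are in scope, something along these lines would do it:)
params_only = chain[:, names(chain, :parameters), :]   # drop the sampler statistics containing `missing`
vals = map(identity, Array(params_only.value))         # narrow Union{Missing, Float64} down to Float64
gelmandiag(Chains(vals, names(params_only)))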
usage/mode-estimation/index.qmd (9 additions & 1 deletion)
@@ -37,7 +37,7 @@ data = [1.5, 2.0]
model = gdemo(data)
```

-Finding the maximum aposteriori or maximum likelihood parameters is as simple as
+Finding the maximum a posteriori or maximum likelihood parameters is as simple as

```{julia}
# Generate a MLE estimate.
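# (Editorial note — the rest of this code block is not shown in this extract.
# Assuming Turing's exported mode-estimation interface, the call would look
# something like:)
# mle_estimate = maximum_likelihood(model)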
@@ -80,6 +80,14 @@ maximum_likelihood(
When providing values to arguments like `initial_params`, the parameters are typically specified in the order in which they appear in the code of the model, so in this case first `s²` then `m`. More precisely it's the order returned by `Turing.Inference.getparams(model, DynamicPPL.VarInfo(model))`.

+::: {.callout-note}
+## Initialisation strategies and consistency with MCMC sampling
+
+Since Turing v0.41, for MCMC sampling, the `initial_params` argument must be a `DynamicPPL.AbstractInitStrategy` as described in [the sampling options page]({{< meta usage-sampling-options >}}#specifying-initial-parameters).
+The optimisation interface has not yet been updated to use this; thus, initial parameters are still specified as Vectors.
+We expect that this will be changed in the near future.
+:::
+
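As an editorial illustration (not part of this diff), supplying initial parameters to the optimisation interface in that order might look like this, assuming the `initial_params` keyword accepts a plain Vector as described above:

```julia
# Hypothetical initial values: s² first, then m, matching the order in the model.
maximum_likelihood(model; initial_params=[0.5, 0.0])
```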
We can also do constrained optimisation, by providing either intervals within which the parameters must stay, or constraint functions that they need to respect. For instance, here's how one can find the MLE with the constraint that the variance must be less than 0.01 and the mean must be between -1 and 1:
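A hypothetical sketch of such a call (the page's actual example is not shown in this extract), assuming the mode-estimation interface accepts box constraints via `lb`/`ub` in the same parameter order as above:

```julia
# Box constraints: 0 ≤ s² ≤ 0.01 and -1 ≤ m ≤ 1.
maximum_likelihood(model; lb=[0.0, -1.0], ub=[0.01, 1.0])
```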