@@ -26,6 +26,7 @@ class Einsum(nn.Module):
   weight_name: str = 'w'
   initializer: nn.initializers.Initializer = nn.initializers.normal()
   dtype: jnp.dtype | None = None
+  w_scale: float | None = None

   @nn.compact
   def __call__(self, eqn: str, x: jax.Array) -> jax.Array:
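The new `w_scale` field lets a constant multiplier be folded into the kernel before the contraction (applied in `__call__` in the hunk below). Note the guard is a truthiness test, so `w_scale=0.0` behaves like `None` and leaves the weight unscaled. A minimal usage sketch, assuming the module's existing `shape` field visible in the next hunk; the equation, shapes, and scale value are illustrative, not from this change:

    import jax
    import jax.numpy as jnp

    # Hypothetical: fold a 1/sqrt(8) factor into an 8x16 projection.
    layer = Einsum(shape=(8, 16), w_scale=8 ** -0.5)
    x = jnp.ones((2, 8))
    params = layer.init(jax.random.PRNGKey(0), 'bd,df->bf', x)
    y = layer.apply(params, 'bd,df->bf', x)  # einsum of x with (w * 8**-0.5)
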
@@ -35,24 +36,42 @@ def __call__(self, eqn: str, x: jax.Array) -> jax.Array:
         self.shape,
         self.dtype if self.dtype is not None else None,
     )
+    if self.w_scale:
+      w *= self.w_scale
     return jnp.einsum(eqn, x, w)


+def reduce_precision(x: jax.Array) -> jax.Array:
+  """Round x to the nominal precision of its dtype."""
+  finfo = jnp.finfo(x.dtype)  # jnp.finfo, not np.finfo, so bfloat16 etc. work.
+  return jax.lax.reduce_precision(x, finfo.nexp, finfo.nmant)
+
+
 class RMSNorm(nn.Module):
   """RMSNorm layer."""

+  with_scale: bool = True
+  scale_init: nn.initializers.Initializer = nn.initializers.zeros_init()
+  scale_plus_one: bool = True
+  guard_against_excess_precision: bool = False
+
   @nn.compact
   def __call__(self, x):
-    scale = self.param('scale', nn.initializers.zeros_init(), (x.shape[-1]))
+    if self.guard_against_excess_precision:
+      x = reduce_precision(x)
+
     var = jnp.mean(jnp.square(x), axis=-1, keepdims=True)

     # jax.lax.rsqrt is used because it returns different floats than
     # jnp.reciprocal(jnp.sqrt(var + 1e-06)).
     normed_inputs = x * jax.lax.rsqrt(var + 1e-06)

-    # normed_inputs is a rank-K tensor, K > 1 (K is typically 2 or 3). scale is
-    # a rank-1 tensor. To avoid implicit rank-promotion, reshape scale to
-    # a (1, ..., 1, D) tensor, so the rank of scale matches normed_inputs.
-    scale = jnp.expand_dims(scale, axis=range(len(x.shape) - 1))
-    normed_inputs = normed_inputs * (1 + scale)
+    if self.with_scale:
+      scale = self.param('scale', self.scale_init, (x.shape[-1]))
+      # normed_inputs is a rank-K tensor, K > 1 (K is typically 2 or 3). scale
+      # is a rank-1 tensor. To avoid implicit rank-promotion, reshape scale to
+      # a (1, ..., 1, D) tensor, so the rank of scale matches normed_inputs.
+      scale = jnp.expand_dims(scale, axis=range(len(x.shape) - 1))
+      normed_inputs = normed_inputs * (
+          1. + scale if self.scale_plus_one else scale)
     return normed_inputs
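`reduce_precision` rounds a tensor to its own dtype's exponent and mantissa budget. Numerically that is a no-op for values already at nominal precision, but it inserts an operation that stops XLA from carrying excess precision in intermediates, which is presumably what `guard_against_excess_precision` guards against. A small sketch of the underlying primitive, with illustrative values:

    import jax
    import jax.numpy as jnp

    x = jnp.linspace(0.0, 1.0, 4, dtype=jnp.bfloat16)
    finfo = jnp.finfo(x.dtype)  # jnp.finfo handles bfloat16; np.finfo does not
    y = jax.lax.reduce_precision(x, finfo.nexp, finfo.nmant)
    print((x == y).all())  # True: same exponent/mantissa bits, values unchanged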
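
The new flags let one layer cover both common RMSNorm parameterisations: the default (zeros `scale_init` with `scale_plus_one=True`) learns a residual around 1, a ones initializer with `scale_plus_one=False` gives a conventional gamma, and `with_scale=False` drops the learned scale entirely. A sketch, with illustrative shapes, showing the two parameterisations agree at initialization:

    import jax
    import jax.numpy as jnp
    import flax.linen as nn

    x = jax.random.normal(jax.random.PRNGKey(1), (2, 16))

    norm_a = RMSNorm()  # scale starts at 0, output multiplied by (1 + scale)
    norm_b = RMSNorm(scale_init=nn.initializers.ones_init(), scale_plus_one=False)

    pa = norm_a.init(jax.random.PRNGKey(0), x)
    pb = norm_b.init(jax.random.PRNGKey(0), x)
    print(jnp.allclose(norm_a.apply(pa, x), norm_b.apply(pb, x)))  # True at init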