20 | 20 | from flax import linen as nn |
21 | 21 | from flax.typing import Array # pylint: disable=g-importing-member |
22 | 22 | from gemma.peft import _einsum_utils |
| 23 | +import jax |
23 | 24 | import jax.numpy as jnp |
| 25 | +import numpy as np |
24 | 26 |
25 | 27 |
26 | 28 | class LoRADenseAdapter(nn.Module): |
@@ -170,3 +172,109 @@ def __call__(self, inputs: Array, einsum_str: str | None = None) -> Array: |
170 | 172 | b_init=self.b_init, |
171 | 173 | ) |
172 | 174 | return self.wrapped(inputs) + adapter(inputs) |
| 175 | + |
| 176 | + |
| 177 | +class LoRADenseGeneralAdapter(nn.Module): |
| 178 | + """LoRA general dense module. |
| 179 | +
| 180 | + This module only does the x @ A @ B computation. |
| 181 | + Use `LoRADenseGeneral` to wrap a `nn.DenseGeneral` layer. |
| 182 | +
| 183 | + Attributes: |
| 184 | + rank: The rank of the LoRA decomposition. |
| 185 | + features: The number of output features. |
| 186 | + axis: int or tuple with axes to apply the transformation on. |
| 187 | + batch_dims: tuple with batch axes. |
| 188 | + dtype: The dtype to use for the LoRA weights. |
| 189 | + a_init: The initializer for the A matrix. |
| 190 | + b_init: The initializer for the B matrix. |
| 191 | + """ |
| 192 | + |
| 193 | + _: dataclasses.KW_ONLY |
| 194 | + |
| 195 | + rank: int |
| 196 | + features: int | Sequence[int] |
| 197 | + axis: int | Sequence[int] |
| 198 | + batch_dims: Sequence[int] |
| 199 | + |
| 200 | + dtype: jnp.dtype = jnp.float_ |
| 201 | + a_init: nn.initializers.Initializer = nn.initializers.kaiming_uniform() |
| 202 | + b_init: nn.initializers.Initializer = nn.initializers.zeros_init() |
| 203 | + |
| 204 | + @nn.compact |
| 205 | + def __call__(self, inputs: Array) -> Array: |
| 206 | + """Mostly copied from `flax.nn.DenseGeneral`.""" |
| 207 | + |
| 208 | + # Normalize inputs |
| 209 | + batch_dims = nn.linear._canonicalize_tuple(self.batch_dims) |
| 210 | + if batch_dims: |
| 211 | + max_dim = np.max(batch_dims) |
| 212 | + if set(batch_dims) != set(range(max_dim + 1)): |
| 213 | + raise ValueError( |
| 214 | + 'batch_dims %s must be consecutive leading ' |
| 215 | + 'dimensions starting from 0.' |
| 216 | + % str(batch_dims) |
| 217 | + ) |
| 218 | + |
| 219 | + n_dim = inputs.ndim |
| 220 | + batch_dims = nn.linear._normalize_axes(batch_dims, n_dim) |
| 221 | + features = nn.linear._canonicalize_tuple(self.features) |
| 222 | + axis = nn.linear._normalize_axes( |
| 223 | + nn.linear._canonicalize_tuple(self.axis), |
| 224 | + n_dim, |
| 225 | + ) |
| 226 | + |
| 227 | + # Create LoRA params |
| 228 | + batch_shape = tuple(inputs.shape[i] for i in batch_dims) |
| 229 | + a_shape = batch_shape + tuple(inputs.shape[i] for i in axis) + (self.rank,) |
| 230 | + a = self.param('a', self.a_init, a_shape, self.dtype) |
| 231 | + b_shape = (*batch_shape, self.rank, *features) |
| 232 | + b = self.param('b', self.b_init, b_shape, self.dtype) |
| 233 | + |
| 234 | + # Contract across given axes. |
| 235 | + n_batch_dims, n_axis = len(batch_dims), len(axis) |
| 236 | + batch_ind = tuple(range(n_batch_dims)) |
| 237 | + contract_ind = tuple(range(n_batch_dims, n_batch_dims + n_axis)) |
| 238 | + inputs = nn.dtypes.promote_dtype(inputs, dtype=self.dtype)[0] |
| 239 | + # low_rank = x @ A |
| 240 | + low_rank_dot = ((axis, contract_ind), (batch_dims, batch_ind)) |
| 241 | + low_rank = jax.lax.dot_general(inputs, a, low_rank_dot) |
| 242 | + # out = low_rank @ B |
| 243 | + low_rank_cind = [n_dim - n_axis] |
| 244 | + b_cind = [n_batch_dims] |
| 245 | + out_dot = ((low_rank_cind, b_cind), (batch_dims, batch_ind)) |
| 246 | + return jax.lax.dot_general(low_rank, b, out_dot) |
| 247 | + |
| 248 | + |
| 249 | +class LoRADenseGeneral(nn.Module): |
| 250 | + """Wrapper around `nn.DenseGeneral` which adds a LoRA adapter.""" |
| 251 | + |
| 252 | + _: dataclasses.KW_ONLY |
| 253 | + |
| 254 | + rank: int |
| 255 | + wrapped: nn.DenseGeneral |
| 256 | + |
| 257 | + dtype: jnp.dtype = jnp.float_ |
| 258 | + a_init: nn.initializers.Initializer = nn.initializers.kaiming_uniform() |
| 259 | + b_init: nn.initializers.Initializer = nn.initializers.zeros_init() |
| 260 | + |
| 261 | + def __post_init__(self): |
| 262 | + super().__post_init__() |
| 263 | + # Share scope, to make the wrapper module transparent with respect to the |
| 264 | + # parameters (instead of nesting `{'params': {'wrapped': params}}`). |
| 265 | + if self.scope is not None: |
| 266 | + nn.share_scope(self, self.wrapped) |
| 267 | + |
| 268 | + @nn.compact |
| 269 | + def __call__(self, inputs: Array) -> Array: |
| 270 | + adapter = LoRADenseGeneralAdapter( |
| 271 | + name='lora', |
| 272 | + rank=self.rank, |
| 273 | + features=self.wrapped.features, |
| 274 | + axis=self.wrapped.axis, |
| 275 | + batch_dims=self.wrapped.batch_dims, |
| 276 | + dtype=self.dtype, |
| 277 | + a_init=self.a_init, |
| 278 | + b_init=self.b_init, |
| 279 | + ) |
| 280 | + return self.wrapped(inputs) + adapter(inputs) |
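
Standalone, `LoRADenseGeneralAdapter` computes only the low-rank update: the input is contracted over `axis` (with `batch_dims` carried through as batched LoRA weights) into a `rank`-sized bottleneck via `a`, then expanded to `features` via `b`. A minimal usage sketch, assuming the classes above are importable from `gemma.peft._lora` (the import path is an assumption, not confirmed by this diff):

import jax
import jax.numpy as jnp

from gemma.peft import _lora  # assumed import path for the classes in this diff

# Contract the last two axes of a (batch, heads, head_dim) input down to
# 16 features through a rank-4 bottleneck; no batched LoRA weights here.
adapter = _lora.LoRADenseGeneralAdapter(
    rank=4,
    features=16,
    axis=(-2, -1),
    batch_dims=(),
)
x = jnp.ones((2, 8, 32))
params = adapter.init(jax.random.PRNGKey(0), x)
y = adapter.apply(params, x)
assert y.shape == (2, 16)  # 'a' has shape (8, 32, 4), 'b' has shape (4, 16).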
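For the wrapper itself, the `__post_init__` comment notes that `nn.share_scope` is there to keep the wrapped layer's parameters at the wrapper's level instead of nesting them under `wrapped`. A sketch of wrapping an `nn.DenseGeneral` inside a parent module, again assuming the `gemma.peft._lora` import path; `MyBlock` is a made-up module for illustration:

import jax
import jax.numpy as jnp
from flax import linen as nn

from gemma.peft import _lora  # assumed import path, as above


class MyBlock(nn.Module):  # hypothetical parent module, not part of the diff
  @nn.compact
  def __call__(self, x):
    dense = nn.DenseGeneral(features=(4, 16), axis=-1)
    # Wrap the dense layer; since `b_init` is zeros, the adapter contributes
    # nothing at initialization and the output matches the plain DenseGeneral.
    return _lora.LoRADenseGeneral(rank=2, wrapped=dense)(x)


x = jnp.ones((3, 32))
params = MyBlock().init(jax.random.PRNGKey(0), x)
y = MyBlock().apply(params, x)
assert y.shape == (3, 4, 16)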