Commit 7765cd4

Merge pull request #116 from microsoft/wesselb/bf16-mode
Add `bf16_mode` to enable gradient computation
2 parents 03152ad + 007048a commit 7765cd4

File tree

3 files changed: +78 -20 lines changed


aurora/model/aurora.py

Lines changed: 63 additions & 14 deletions
@@ -23,7 +23,7 @@
 from aurora.model.decoder import Perceiver3DDecoder
 from aurora.model.encoder import Perceiver3DEncoder
 from aurora.model.lora import LoRAMode
-from aurora.model.swin3d import BasicLayer3D, Swin3DTransformerBackbone
+from aurora.model.swin3d import Swin3DTransformerBackbone
 
 __all__ = [
     "Aurora",
@@ -79,6 +79,7 @@ def __init__(
         lora_mode: LoRAMode = "single",
         surf_stats: Optional[dict[str, tuple[float, float]]] = None,
         autocast: bool = False,
+        bf16_mode: bool = False,
         level_condition: Optional[tuple[int | float, ...]] = None,
         dynamic_vars: bool = False,
         atmos_static_vars: bool = False,
@@ -141,8 +142,10 @@ def __init__(
             surf_stats (dict[str, tuple[float, float]], optional): For these surface-level
                 variables, adjust the normalisation to the given tuple consisting of a new location
                 and scale.
-            autocast (bool, optional): Use `torch.autocast` to reduce memory usage. Defaults to
-                `False`.
+            bf16_mode (bool, optional): To reduce memory usage, convert the tokens to BF16, run
+                the backbone in pure BF16, and run the decoder in FP16 AMP. This should enable
+                gradient computation. USE AT YOUR OWN RISK. THIS WAS NOT USED DURING THE
+                DEVELOPMENT OF AURORA AND IS PURELY PROVIDED AS A STARTING POINT FOR FINE-TUNING.
             level_condition (tuple[int | float, ...], optional): Make the patch embeddings dependent
                 on pressure level. If you want to enable this feature, provide a tuple of all
                 possible pressure levels.
@@ -176,7 +179,6 @@ def __init__(
         self.atmos_vars = atmos_vars
         self.patch_size = patch_size
         self.surf_stats = surf_stats or dict()
-        self.autocast = autocast
         self.max_history_size = max_history_size
         self.timestep = timestep
         self.use_lora = use_lora
@@ -246,6 +248,19 @@ def __init__(
             modulation_head=modulation_head,
         )
 
+        if autocast and not bf16_mode:
+            warnings.warn(
+                "The argument `autocast` no longer does anything due to limited utility. "
+                "Consider instead using `bf16_mode`.",
+                stacklevel=2,
+            )
+
+        self.bf16_mode = bf16_mode
+
+        if self.bf16_mode:
+            # We run the backbone in pure BF16.
+            self.backbone.to(torch.bfloat16)
+
     def forward(self, batch: Batch) -> Batch:
         """Forward pass.
 
@@ -302,24 +317,44 @@ def forward(self, batch: Batch) -> Batch:
 
         transformed_batch = self._pre_encoder_hook(transformed_batch)
 
+        # The encoder is always just run.
         x = self.encoder(
             transformed_batch,
             lead_time=self.timestep,
         )
-        with torch.autocast(device_type="cuda") if self.autocast else contextlib.nullcontext():
-            x = self.backbone(
-                x,
-                lead_time=self.timestep,
-                patch_res=patch_res,
-                rollout_step=batch.metadata.rollout_step,
-            )
-        pred = self.decoder(
+
+        # In BF16 mode, the backbone is run in pure BF16.
+        if self.bf16_mode:
+            x = x.to(torch.bfloat16)
+        x = self.backbone(
             x,
-            batch,
             lead_time=self.timestep,
             patch_res=patch_res,
+            rollout_step=batch.metadata.rollout_step,
         )
 
+        # In BF16 mode, the decoder is run in FP16 AMP, and the output is converted back to FP32.
+        # We run in FP16 as opposed to BF16 for improved relative precision.
+        if self.bf16_mode:
+            context = torch.autocast(device_type="cuda", dtype=torch.float16)
+            x = x.to(torch.float16)
+        else:
+            context = contextlib.nullcontext()
+        with context:
+            pred = self.decoder(
+                x,
+                batch,
+                lead_time=self.timestep,
+                patch_res=patch_res,
+            )
+        if self.bf16_mode:
+            pred = dataclasses.replace(
+                pred,
+                surf_vars={k: v.float() for k, v in pred.surf_vars.items()},
+                static_vars={k: v.float() for k, v in pred.static_vars.items()},
+                atmos_vars={k: v.float() for k, v in pred.atmos_vars.items()},
+            )
+
         # Remove batch and history dimension from static variables.
         pred = dataclasses.replace(
             pred,
@@ -476,7 +511,21 @@ def configure_activation_checkpointing(self):
 
         This is required in order to compute gradients without running out of memory.
         """
-        apply_activation_checkpointing(self, check_fn=lambda x: isinstance(x, BasicLayer3D))
+        # Checkpoint these modules:
+        module_names = (
+            "Perceiver3DEncoder",
+            "Swin3DTransformerBackbone",
+            "Basic3DEncoderLayer",
+            "Basic3DDecoderLayer",
+            "Perceiver3DDecoder",
+            "LinearPatchReconstruction",
+        )
+
+        def check(x: torch.nn.Module) -> bool:
+            name = x.__class__.__name__
+            return name in module_names
+
+        apply_activation_checkpointing(self, check_fn=check)
 
 
 class AuroraPretrained(Aurora):

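For readers who want to see the precision pattern from `forward` in isolation, here is a minimal, hedged sketch using toy `nn.Linear` stand-ins rather than the actual Aurora modules: the first stage holds its weights in pure BF16, the second runs under FP16 autocast, and the output is cast back to FP32. The layer sizes and the CUDA device are illustrative assumptions.

```python
import torch
import torch.nn as nn

# Toy stand-ins for the backbone and decoder (NOT the Aurora classes).
backbone = nn.Linear(64, 64).to("cuda", torch.bfloat16)  # weights held in pure BF16
decoder = nn.Linear(64, 64).to("cuda")                    # weights stay in FP32

x = torch.randn(8, 64, device="cuda")

# Backbone stage: cast the tokens to BF16 and run the module entirely in BF16.
x = backbone(x.to(torch.bfloat16))

# Decoder stage: run under FP16 autocast (better relative precision than BF16),
# then convert the output back to FP32.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    pred = decoder(x.to(torch.float16))
pred = pred.float()
```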
aurora/model/decoder.py

Lines changed: 7 additions & 3 deletions
@@ -20,6 +20,10 @@
 __all__ = ["Perceiver3DDecoder"]
 
 
+class LinearPatchReconstruction(nn.Linear):
+    """Linear layer for patch reconstruction."""
+
+
 class Perceiver3DDecoder(nn.Module):
     """Multi-scale multi-source multi-variable decoder based on the Perceiver architecture."""
 
@@ -110,17 +114,17 @@ def __init__(
         )
 
         self.surf_heads = nn.ParameterDict(
-            {name: nn.Linear(embed_dim, patch_size**2) for name in surf_vars}
+            {name: LinearPatchReconstruction(embed_dim, patch_size**2) for name in surf_vars}
         )
         if not self.level_condition:
             self.atmos_heads = nn.ParameterDict(
-                {name: nn.Linear(embed_dim, patch_size**2) for name in atmos_vars}
+                {name: LinearPatchReconstruction(embed_dim, patch_size**2) for name in atmos_vars}
            )
        else:
            self.atmos_heads = nn.ParameterDict(
                {
                    name: LevelConditioned(
-                        lambda: nn.Linear(embed_dim, patch_size**2),
+                        lambda: LinearPatchReconstruction(embed_dim, patch_size**2),
                        levels=self.level_condition,
                        levels_dim=-2,
                    )

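`LinearPatchReconstruction` adds no behaviour over `nn.Linear`; it appears to exist so that the reconstruction heads can be matched by class name in the checkpointing `check` function above without also matching every other `nn.Linear` in the model. A minimal sketch of that idea (the 512/16 sizes are illustrative assumptions):

```python
import torch.nn as nn

class LinearPatchReconstruction(nn.Linear):
    """Behaves exactly like nn.Linear; the subclass only serves as a marker."""

def check(module: nn.Module) -> bool:
    # Matches the patch-reconstruction heads by class name, but not plain nn.Linear layers.
    return module.__class__.__name__ == "LinearPatchReconstruction"

head = LinearPatchReconstruction(512, 16)  # e.g. embed_dim=512, patch_size**2 = 16
print(check(head), check(nn.Linear(512, 16)))  # True False
```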
docs/finetuning.md

Lines changed: 8 additions & 3 deletions
@@ -13,14 +13,19 @@ model.load_checkpoint()
 ## Computing Gradients
 
 To compute gradients, you will need an A100 with 80 GB of memory.
-In addition, you will need to use [PyTorch AMP](https://pytorch.org/docs/stable/amp.html)
-and gradient checkpointing.
+In addition, you will need to use reduced precision and gradient checkpointing.
 You can do this as follows:
 
 ```python
 from aurora import AuroraPretrained
 
-model = AuroraPretrained(autocast=True)  # Use AMP.
+model = AuroraPretrained(
+    # BF16 mode is an EXPERIMENTAL mode that saves memory by running the backbone in pure BF16
+    # and the decoder in FP16 AMP. This should enable gradient computation. USE AT YOUR OWN RISK.
+    # THIS WAS NOT USED IN THE DEVELOPMENT OF AURORA AND IS PURELY PROVIDED AS A STARTING POINT
+    # FOR FINE-TUNING.
+    bf16_mode=True,
+)
 model.load_checkpoint()
 
 batch = ...  # Load some data.

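To complete the picture, here is a hedged sketch of how a single fine-tuning step might continue from the snippet above. The ground-truth batch `target`, the surface variable `"2t"`, and the mean-absolute-error loss are illustrative assumptions and not part of this commit.

```python
import torch

model = model.cuda()
model.train()
model.configure_activation_checkpointing()  # checkpoint encoder, backbone, and decoder modules

pred = model.forward(batch)  # in BF16 mode, predictions come back converted to FP32

# Illustrative loss: mean absolute error on one surface variable against an assumed target batch.
loss = torch.mean(torch.abs(pred.surf_vars["2t"] - target.surf_vars["2t"]))
loss.backward()
```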