
Commit 3fe890d

jiangyangmu (The tunix Authors) authored and committed
test: add model alignment test for qwen3 models.
PiperOrigin-RevId: 824835255
1 parent 763fcd5 commit 3fe890d

File tree

2 files changed: +160 −12 lines changed

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
"""Check the model correctness for Tunix nnx implemented models.

The test compares the first N decoder layer outputs between the Tunix model
and the HF PyTorch model; typically we expect the logits difference to be
within 1e-3 in fp32.
"""

import os
import tempfile
from absl.testing import absltest
from absl.testing import parameterized
import jax
import jax.numpy as jnp
import numpy as np
import torch
import transformers
from tunix.models.gemma3 import model as gemma3_model
from tunix.models.gemma3 import params_safetensors as gemma3_params
from tunix.sft import utils
from tunix.tests import test_common as tc

# Large negative value used to mask out attention logits.
K_MASK = -2.3819763e38


def create_pytorch_causal_mask(seq_len):
  """Creates an additive causal attention mask for a given sequence length.

  Args:
    seq_len: The length of the sequence.

  Returns:
    A float tensor of shape (seq_len, seq_len) where:
    - mask[i, j] is 0 if token i can attend to token j (j <= i).
    - mask[i, j] is K_MASK if token i cannot attend to token j (j > i).
  """
  # Create a lower triangular matrix of ones, then convert it into an
  # additive mask: 0 where attention is allowed, K_MASK where it is not.
  mask = torch.ones(seq_len, seq_len, dtype=torch.float).tril(diagonal=0)
  mask = mask.masked_fill(mask == 0, K_MASK)
  mask = mask.masked_fill(mask == 1, 0)
  return mask
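
# Illustrative sanity check (not part of the original test): for seq_len=3
# the additive mask keeps the lower triangle at 0 and pushes the strict
# upper triangle to K_MASK, i.e. it equals
# torch.triu(torch.full((3, 3), K_MASK), diagonal=1):
#
#   [[0.0,    K_MASK, K_MASK],
#    [0.0,    0.0,    K_MASK],
#    [0.0,    0.0,    0.0   ]]
#
# assert torch.equal(
#     create_pytorch_causal_mask(3),
#     torch.triu(torch.full((3, 3), K_MASK), diagonal=1),
# )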


def get_hf_output(model, seq_len: int):
  # Feed deterministic token ids 1..seq_len so both models see the same input.
  x = (torch.arange(seq_len) + 1).reshape(1, -1)
  position_ids = torch.arange(seq_len).reshape(1, -1)
  attn_mask = create_pytorch_causal_mask(seq_len).unsqueeze(0).unsqueeze(0)
  return model(x, attn_mask, position_ids).logits.detach().numpy()


def get_jax_output(model, seq_len: int):
  x = (jnp.arange(seq_len) + 1).reshape(1, -1)
  positions = jnp.arange(seq_len).reshape(1, -1)
  attn_mask = utils.make_causal_attn_mask(jnp.ones((1, seq_len)))
  output, _ = model(x, positions, None, attn_mask)
  return output
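
# Note on mask conventions (observational, not part of the original test):
# the PyTorch side uses an additive float mask (0 where attention is allowed,
# K_MASK where it is not), while the JAX side gets its mask from
# utils.make_causal_attn_mask. Assuming that helper returns a boolean mask
# with True meaning "may attend", the two agree elementwise as:
#   add_mask = create_pytorch_causal_mask(4).numpy()
#   bool_mask = np.asarray(utils.make_causal_attn_mask(jnp.ones((1, 4))))
#   assert (bool_mask.squeeze() == (add_mask == 0)).all()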


class GemmaAlignTest(parameterized.TestCase):

  @parameterized.named_parameters(
      dict(
          testcase_name="gemma3_270m_it",
          model_name="google/gemma-3-270m-it",
          model_config=gemma3_model.ModelConfig.gemma3_270m,
          tolerance=1e-3,
      ),
  )
  def test_gemma_model_alignment(self, model_name, model_config, tolerance):
    model_path = os.path.join(tempfile.gettempdir(), "models", model_name)

    tc.download_from_huggingface(repo_id=model_name, model_path=model_path)

    hf_model = transformers.AutoModelForCausalLM.from_pretrained(
        model_path, dtype=torch.float32
    )
    print("HF model loaded.")

    # Load the same checkpoint into the Tunix JAX model on a trivial
    # single-device (1, 1) mesh.
    jax_model = gemma3_params.create_model_from_safe_tensors(
        model_path,
        model_config(),
        mesh=jax.make_mesh((1, 1), ("fsdp", "tp")),
        dtype=jnp.float32,
    )
    print("JAX model loaded.")

    # Make sure the model weights are the same (check only the embedding and
    # the first layer's query weight).
    hf_emb_weight = hf_model.get_decoder().embed_tokens.weight.detach().numpy()
    jax_emb_weight = jax_model.embedder.input_embedding.value
    np.testing.assert_equal(
        hf_emb_weight,
        jax_emb_weight,
        err_msg=(
            "Embedding weights are not equal; are you sure the loaded HF and"
            " JAX model weights are identical?"
        ),
    )
    hf_query_weight = (
        hf_model.get_decoder()
        .layers[0]
        .self_attn.q_proj.weight.detach()
        .numpy()
    )
    # HF stores q_proj.weight as (num_heads * head_dim, d_model); the Tunix
    # einsum weight is (num_heads, d_model, head_dim), so transpose and
    # flatten it into the HF layout before comparing.
    jax_query_weight = jax_model.layers[0].attn.q_einsum.w
    _, d, _ = jax_query_weight.shape
    jax_query_weight = jax_query_weight.transpose(0, 2, 1).reshape(-1, d)
    np.testing.assert_equal(
        hf_query_weight,
        jax_query_weight,
        err_msg=(
            "Query weights are not equal; are you sure the loaded HF and JAX"
            " model weights are identical?"
        ),
    )
    print("Model weights check passed :)")
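
    # The module docstring mentions per-layer comparisons; this Gemma test
    # only checks the final logits. A hypothetical per-layer probe
    # (illustrative only, not part of this commit) could grab an intermediate
    # HF hidden state via the standard transformers flag:
    #   hidden = hf_model(
    #       x, attn_mask, position_ids, output_hidden_states=True
    #   ).hidden_states[n]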

    seq_len = 128

    # Do a check on the entire model output.
    hf_output = get_hf_output(hf_model, seq_len)
    jax_output = get_jax_output(jax_model, seq_len)
    np.testing.assert_allclose(
        hf_output.squeeze(),
        jax_output.squeeze(),
        atol=tolerance,
        rtol=tolerance,
    )

    print("Logits are close! Model alignment check passed :)")

    # Clean up the downloaded checkpoint.
    tc.delete_directory(model_path)


if __name__ == "__main__":
  absltest.main()
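
With parameterized.named_parameters, the single test method above expands into one generated test per dict, suffixed with its testcase_name (e.g. test_gemma_model_alignment_gemma3_270m_it). A minimal self-contained sketch of the same mechanism:

from absl.testing import absltest
from absl.testing import parameterized


class DemoTest(parameterized.TestCase):

  @parameterized.named_parameters(
      dict(testcase_name="small", n=2),
      dict(testcase_name="large", n=64),
  )
  def test_square_is_nonnegative(self, n):
    self.assertGreaterEqual(n * n, 0)


if __name__ == "__main__":
  # Runs DemoTest.test_square_is_nonnegative_small and ..._large.
  absltest.main()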

tests/model_alignment/qwen_align_test.py

Lines changed: 25 additions & 12 deletions
@@ -16,6 +16,8 @@
 import transformers
 from tunix.models.qwen2 import model as qwen2_model
 from tunix.models.qwen2 import params as qwen2_params
+from tunix.models.qwen3 import model as qwen3_model
+from tunix.models.qwen3 import params as qwen3_params
 from tunix.sft import utils
 from tunix.tests import test_common as tc

@@ -97,17 +99,27 @@ class QwenAlignTest(parameterized.TestCase):
           testcase_name="deepseek_r1_distill_qwen_1_5b",
           model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
           model_config=qwen2_model.ModelConfig.deepseek_r1_distill_qwen_1_5b,
+          model_loader=qwen2_params,
           tolerance=2e-3,
       ),
       dict(
           testcase_name="qwen2_5_1_5b_instruct",
           model_name="Qwen/Qwen2.5-1.5B-Instruct",
           model_config=qwen2_model.ModelConfig.qwen2_5_1_5b,
+          model_loader=qwen2_params,
+          tolerance=1e-3,
+      ),
+      dict(
+          testcase_name="qwen3_0_6b",
+          model_name="Qwen/Qwen3-0.6B",
+          model_config=qwen3_model.ModelConfig.qwen3_0_6b,
+          model_loader=qwen3_params,
           tolerance=1e-3,
       ),
-      # Note: Qwen/Qwen2.5-7B-Instruct will OOM on v5e-8.
   )
-  def test_qwen_model_alignment(self, model_name, model_config, tolerance):
+  def test_qwen_model_alignment(
+      self, model_name, model_config, model_loader, tolerance
+  ):
     model_path = os.path.join(tempfile.gettempdir(), "models", model_name)

     tc.download_from_huggingface(repo_id=model_name, model_path=model_path)
@@ -117,7 +129,7 @@ def test_qwen_model_alignment(self, model_name, model_config, tolerance):
     )
     print("HF model loaded.")

-    jax_model = qwen2_params.create_model_from_safe_tensors(
+    jax_model = model_loader.create_model_from_safe_tensors(
         model_path,
         model_config(),
         mesh=jax.make_mesh((1, 1), ("fsdp", "tp")),
@@ -146,15 +158,16 @@ def test_qwen_model_alignment(self, model_name, model_config, tolerance):

     seq_len = 128

-    layer_to_run = model_config().num_layers
-    hf_logits = get_per_layer_hf_output(hf_model, seq_len, layer_to_run)
-    jax_logits = get_per_layer_jax_output(jax_model, seq_len, layer_to_run)
-    np.testing.assert_allclose(
-        hf_logits.squeeze(),
-        jax_logits.squeeze(),
-        atol=tolerance,
-        rtol=tolerance,
-    )
+    if model_loader == qwen2_params:
+      layer_to_run = model_config().num_layers
+      hf_logits = get_per_layer_hf_output(hf_model, seq_len, layer_to_run)
+      jax_logits = get_per_layer_jax_output(jax_model, seq_len, layer_to_run)
+      np.testing.assert_allclose(
+          hf_logits.squeeze(),
+          jax_logits.squeeze(),
+          atol=tolerance,
+          rtol=tolerance,
+      )

     # Do a check on entire model output
     hf_output = get_hf_output(hf_model, seq_len)