From ace574733213c14d25e2a3000f9a956e530dc1e5 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 22 Oct 2025 11:34:33 +0800 Subject: [PATCH 01/22] sharding stage3 bugfix --- .../sharding/group_sharded_stage3.py | 73 ++++++++++++++----- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 0bc8dd3fefce32..f8426a0de1f48e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -592,19 +592,34 @@ def _forward_pre_hook(layer, inputs): ) def _forward_post_hook(layer, inputs, outputs): - return ForwardPostHooks.apply( - outputs, - layer, - self._order_tracer, - self._trainable_params, - self._param2buffer, - self._param2buffer_size, - self._rank, - self._group, - self._sync_comm, - self._offload, - task_flow, - ) + if isinstance(outputs, tuple): + return ForwardPostHooks.apply( + *outputs, + layer=layer, + order_tracer=self._order_tracer, + trainable_params=self._trainable_params, + param2buffer=self._param2buffer, + param2buffer_size=self._param2buffer_size, + rank=self._rank, + group=self._group, + sync_comm=self._sync_comm, + offload=self._offload, + task_flow=task_flow, + ) + else: + return ForwardPostHooks.apply( + outputs, + layer=layer, + order_tracer=self._order_tracer, + trainable_params=self._trainable_params, + param2buffer=self._param2buffer, + param2buffer_size=self._param2buffer_size, + rank=self._rank, + group=self._group, + sync_comm=self._sync_comm, + offload=self._offload, + task_flow=task_flow, + ) # register previous forward hooks sub_layer.register_forward_pre_hook(_forward_pre_hook) @@ -903,7 +918,7 @@ class ForwardPostHooks(PyLayer): @staticmethod def forward( ctx, - inputs, + *inputs, layer, order_tracer, trainable_params, @@ -936,8 +951,26 @@ def forward( ctx.trainable_params = trainable_params ctx.param2buffer_size = param2buffer_size ctx.offload = offload - - return inputs + inputs_new = [] + grad_none = {} + tensor_count = 0 + for input in inputs: + if isinstance(input, paddle.Tensor): + input_new = paddle.assign(input) + inputs_new.append(input_new) + input_new.stop_gradient = input.stop_gradient + if input.stop_gradient: + grad_none[tensor_count] = True + else: + grad_none[tensor_count] = False + tensor_count += 1 + else: + inputs_new.append(input) + ctx.grad_none = grad_none + if len(inputs_new) == 1: + return inputs_new[0] + else: + return tuple(inputs_new) @staticmethod def backward(ctx, *args): @@ -992,8 +1025,12 @@ def backward(ctx, *args): sync_wait=sync_wait, offload=offload, ) - - return args + grad_none = ctx.grad_none + args = list(args) + for i in range(len(args)): + if grad_none[i]: + args[i] = None + return tuple(args) class TaskFlow: From 6b246461f2743765994ea7893ab4213981f21870 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 22 Oct 2025 11:34:33 +0800 Subject: [PATCH 02/22] sharding stage3 bugfix --- .../sharding/group_sharded_stage3.py | 43 ++---- .../dygraph_group_sharded_stage3_fix_test.py | 128 ++++++++++++++++++ 2 files changed, 143 insertions(+), 28 deletions(-) create mode 100644 test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py 
index f8426a0de1f48e..ae08dde14a651a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -592,34 +592,21 @@ def _forward_pre_hook(layer, inputs): ) def _forward_post_hook(layer, inputs, outputs): - if isinstance(outputs, tuple): - return ForwardPostHooks.apply( - *outputs, - layer=layer, - order_tracer=self._order_tracer, - trainable_params=self._trainable_params, - param2buffer=self._param2buffer, - param2buffer_size=self._param2buffer_size, - rank=self._rank, - group=self._group, - sync_comm=self._sync_comm, - offload=self._offload, - task_flow=task_flow, - ) - else: - return ForwardPostHooks.apply( - outputs, - layer=layer, - order_tracer=self._order_tracer, - trainable_params=self._trainable_params, - param2buffer=self._param2buffer, - param2buffer_size=self._param2buffer_size, - rank=self._rank, - group=self._group, - sync_comm=self._sync_comm, - offload=self._offload, - task_flow=task_flow, - ) + if isinstance(outputs, paddle.Tensor): + outputs = (outputs,) + return ForwardPostHooks.apply( + *outputs, + layer=layer, + order_tracer=self._order_tracer, + trainable_params=self._trainable_params, + param2buffer=self._param2buffer, + param2buffer_size=self._param2buffer_size, + rank=self._rank, + group=self._group, + sync_comm=self._sync_comm, + offload=self._offload, + task_flow=task_flow, + ) # register previous forward hooks sub_layer.register_forward_pre_hook(_forward_pre_hook) diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py new file mode 100644 index 00000000000000..43ff6a5db81e83 --- /dev/null +++ b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +from dist_amp_base import create_optimizer + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import ( + GroupShardedStage3, +) + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples=2000, shape=(4, 8, 16)): + self.num_samples = num_samples + self.shape = shape + + def __getitem__(self, idx): + img = np.random.rand(*self.shape).astype('float32') + label = np.ones(1).astype('int64') + return img, label + + def __len__(self): + return self.num_samples + + +def train_step(model, use_pure_bf16=False, use_main_grad=False): + optimizer = create_optimizer( + model=model, use_pure_bf16=use_pure_bf16, use_main_grad=use_main_grad + ) + group = paddle.distributed.new_group([0, 1]) + model = GroupShardedStage3(model, optimizer, group=group) + local_rank = paddle.distributed.get_rank() + epoch = 1 + batch_size = 500 + paddle.seed(2025) + np.random.seed(2025) + train_loader = paddle.io.DataLoader( + RandomDataset(), + batch_size=batch_size, + shuffle=False, + drop_last=True, + num_workers=0, + ) + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + print("<<<<<<<<<<<< forward >>>>>>>>>>>") + print( + f"-- [rank={local_rank}] epoch {eop}, batch {batch_id}, {data[0].shape=}" + ) + score, out = model(data[0]) + print(f"after forward, {score=}, {out.shape=}") + + loss = out.mean() + + print( + f"-- [rank={local_rank}] epoch {eop}, batch {batch_id}, loss: {loss.astype(paddle.float32).numpy()}" + ) + print("<<<<<<<<<<<< backward >>>>>>>>>>>") + loss.backward() + print("<<<<<<<<<<<< optimizer >>>>>>>>>>>") + optimizer.step() + + +class MulLinear(nn.Layer): + def __init__(self, input_dim, output_dim, scale=1.0): + super().__init__() + self.linear1 = nn.Linear(input_dim, output_dim) + self.linear2 = nn.Linear(input_dim, output_dim) + self.scale1 = self.create_parameter( + shape=[1], default_initializer=nn.initializer.Constant(scale) + ) + self.scale2 = self.create_parameter( + shape=[1], default_initializer=nn.initializer.Constant(1.0 - scale) + ) + + def forward(self, x): + out1 = self.linear1(x) + out2 = self.linear2(x) + output1 = self.scale1 * out1 + output2 = self.scale2 * out2 + score1 = output1.mean() + score2 = output2.mean() + return score1, score2, output1, output2 + + +class MyModel(nn.Layer): + def __init__(self, input_dim, hidden_dim, output_dim, scale): + super().__init__() + self.linear1 = nn.Linear(input_dim, hidden_dim) + self.mullinear = MulLinear(hidden_dim, hidden_dim, scale) + self.linear2 = nn.Linear(hidden_dim, output_dim) + + def forward(self, input): + hidden_states = self.linear1(input) + hidden_states = F.relu(hidden_states) + score1, score2, hidden_states1, hidden_states2 = self.mullinear( + hidden_states + ) + final_score = score1 + score2 + w1 = score1 / final_score + w2 = score2 / final_score + hidden_states = w1 * hidden_states1 + w2 * hidden_states2 + hidden_states = F.relu(hidden_states) + output = self.linear2(hidden_states) + return final_score, output + + +if __name__ == "__main__": + b, s, h = 4, 8, 16 + model = MyModel(input_dim=h, hidden_dim=32, output_dim=h, scale=0.4) + dist.init_parallel_env() + train_step(model) From e054b48fdf756e6cbe10a139b54d1bfe8c165b05 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 22 Oct 2025 11:34:33 +0800 Subject: [PATCH 03/22] sharding stage3 bugfix --- .../dygraph_group_sharded_stage3_fix_test.py | 17 
+++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py index 43ff6a5db81e83..5363f73eca38f6 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py +++ b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py @@ -12,6 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +<<<<<<< HEAD +======= +import unittest + +>>>>>>> b3bc81cd9e (sharding stage3 bugfix) import numpy as np from dist_amp_base import create_optimizer @@ -120,9 +125,13 @@ def forward(self, input): output = self.linear2(hidden_states) return final_score, output +class TestStage3Bugfix(unittest.TestCase): + def test_stage3(self): + b, s, h = 4, 8, 16 + model = MyModel(input_dim=h, hidden_dim=32, output_dim=h, scale=0.4) + dist.init_parallel_env() + train_step(model) + if __name__ == "__main__": - b, s, h = 4, 8, 16 - model = MyModel(input_dim=h, hidden_dim=32, output_dim=h, scale=0.4) - dist.init_parallel_env() - train_step(model) + unittest.main() From fbde9523b93483c9cdceef25d8ffed50597e5495 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 22 Oct 2025 11:34:33 +0800 Subject: [PATCH 04/22] sharding stage3 bugfix --- .../fleet/dygraph_group_sharded_stage3_fix_test.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py index 5363f73eca38f6..3aaa8167960642 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py +++ b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-<<<<<<< HEAD -======= import unittest - ->>>>>>> b3bc81cd9e (sharding stage3 bugfix) import numpy as np from dist_amp_base import create_optimizer @@ -132,6 +128,5 @@ def test_stage3(self): dist.init_parallel_env() train_step(model) - if __name__ == "__main__": unittest.main() From 333bae9797866a8c3008cbfa31cb244eda19672e Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 22 Oct 2025 11:34:33 +0800 Subject: [PATCH 05/22] sharding stage3 bugfix --- .../sharding/group_sharded_stage3.py | 22 ++++++------- test/collective/fleet/CMakeLists.txt | 14 ++++++++ .../dygraph_group_sharded_stage3_fix_test.py | 33 ++++++++++++++++--- .../fleet/test_sharding_stage3_bugfix.py | 28 ++++++++++++++++ 4 files changed, 81 insertions(+), 16 deletions(-) create mode 100644 test/collective/fleet/test_sharding_stage3_bugfix.py diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index ae08dde14a651a..3474a66e89dd9a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -938,26 +938,26 @@ def forward( ctx.trainable_params = trainable_params ctx.param2buffer_size = param2buffer_size ctx.offload = offload - inputs_new = [] + inputs_list = [] grad_none = {} tensor_count = 0 - for input in inputs: - if isinstance(input, paddle.Tensor): - input_new = paddle.assign(input) - inputs_new.append(input_new) - input_new.stop_gradient = input.stop_gradient - if input.stop_gradient: + for input_tensor in inputs: + if isinstance(input_tensor, paddle.Tensor): + input_new = paddle.assign(input_tensor) + inputs_list.append(input_new) + input_new.stop_gradient = input_tensor.stop_gradient + if input_tensor.stop_gradient: grad_none[tensor_count] = True else: grad_none[tensor_count] = False tensor_count += 1 else: - inputs_new.append(input) + inputs_list.append(input_tensor) ctx.grad_none = grad_none - if len(inputs_new) == 1: - return inputs_new[0] + if len(inputs_list) == 1: + return inputs_list[0] else: - return tuple(inputs_new) + return tuple(inputs_list) @staticmethod def backward(ctx, *args): diff --git a/test/collective/fleet/CMakeLists.txt b/test/collective/fleet/CMakeLists.txt index 62850027500f1b..4e7f26bbb3cb79 100644 --- a/test/collective/fleet/CMakeLists.txt +++ b/test/collective/fleet/CMakeLists.txt @@ -864,3 +864,17 @@ if((WITH_GPU) AND LOCAL_ALL_PLAT) ) set_tests_properties(test_pp_unified_dygraph_model PROPERTIES TIMEOUT "500") endif() +if((WITH_GPU) AND LOCAL_ALL_PLAT) + bash_test_modules( + test_sharding_stage3_bugfix + START_BASH + ../../legacy_test/dist_test.sh + TIMEOUT + "500" + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=21282;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python" + ) + set_tests_properties(test_sharding_stage3_bugfix PROPERTIES TIMEOUT "500") +endif() diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py index 3aaa8167960642..48d703a17b8510 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py +++ b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py @@ -20,6 +20,7 @@ import paddle.distributed as dist import paddle.nn.functional as F from paddle import nn +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import ( 
GroupShardedStage3, ) @@ -43,7 +44,8 @@ def train_step(model, use_pure_bf16=False, use_main_grad=False): optimizer = create_optimizer( model=model, use_pure_bf16=use_pure_bf16, use_main_grad=use_main_grad ) - group = paddle.distributed.new_group([0, 1]) + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_sharding_parallel_group() model = GroupShardedStage3(model, optimizer, group=group) local_rank = paddle.distributed.get_rank() epoch = 1 @@ -97,7 +99,9 @@ def forward(self, x): output2 = self.scale2 * out2 score1 = output1.mean() score2 = output2.mean() - return score1, score2, output1, output2 + combined = paddle.stack([output1, output2], axis=0) + combined.stop_gradient = True + return score1.item(), score2.item(), output1, output2, combined class MyModel(nn.Layer): @@ -110,9 +114,13 @@ def __init__(self, input_dim, hidden_dim, output_dim, scale): def forward(self, input): hidden_states = self.linear1(input) hidden_states = F.relu(hidden_states) - score1, score2, hidden_states1, hidden_states2 = self.mullinear( - hidden_states - ) + ( + score1, + score2, + hidden_states1, + hidden_states2, + combined_hidden_states, + ) = self.mullinear(hidden_states) final_score = score1 + score2 w1 = score1 / final_score w2 = score2 / final_score @@ -121,7 +129,22 @@ def forward(self, input): output = self.linear2(hidden_states) return final_score, output + class TestStage3Bugfix(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 1 + self.sharding_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + "sharding_degree": self.sharding_parallel_size, + } + fleet.init(is_collective=True, strategy=strategy) + def test_stage3(self): b, s, h = 4, 8, 16 model = MyModel(input_dim=h, hidden_dim=32, output_dim=h, scale=0.4) diff --git a/test/collective/fleet/test_sharding_stage3_bugfix.py b/test/collective/fleet/test_sharding_stage3_bugfix.py new file mode 100644 index 00000000000000..14c74638475765 --- /dev/null +++ b/test/collective/fleet/test_sharding_stage3_bugfix.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from legacy_test.test_parallel_dygraph_dataparallel import ( + TestMultipleAccelerators, +) + + +class TestShardingParallel(TestMultipleAccelerators): + def test_sharding_parallel(self): + self.run_mnist_2accelerators('dygraph_group_sharded_stage3_fix_test.py') + + +if __name__ == "__main__": + unittest.main() From 143d7853a16fc3194a1c9d48279da046f81d036b Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 22 Oct 2025 11:34:33 +0800 Subject: [PATCH 06/22] sharding stage3 bugfix --- test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py index 48d703a17b8510..9aef02f3916656 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py +++ b/test/collective/fleet/dygraph_group_sharded_stage3_fix_test.py @@ -13,6 +13,7 @@ # limitations under the License. import unittest + import numpy as np from dist_amp_base import create_optimizer @@ -151,5 +152,6 @@ def test_stage3(self): dist.init_parallel_env() train_step(model) + if __name__ == "__main__": unittest.main() From 7fd48d9060b292136bce9cdf79983530d5c5d52f Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Mon, 3 Nov 2025 19:56:22 +0800 Subject: [PATCH 07/22] support recompute's forward and backward in pipeline mode --- .../fleet/meta_parallel/pipeline_parallel.py | 163 ++++++++++++++++-- 1 file changed, 149 insertions(+), 14 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 027a734eedd141..47b1ab90f6da25 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -22,10 +22,14 @@ from enum import Enum from functools import partial from typing import Callable +import numpy as np +import random import paddle from paddle import framework - +from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( + get_rng_state_tracker, +) from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer from ..utils import timer_helper as timer from ..utils.hybrid_parallel_util import ( @@ -38,6 +42,10 @@ from ..utils.log_util import get_sync_logger, logger from .meta_parallel_base import MetaParallelBase from .parallel_layers.pp_layers import PipelineLayer +from ..recompute.recompute import ( + switch_rng_state_tracker, + detach_variable +) _use_four_directions = os.environ.get( 'PADDLE_USE_FOUR_DIRECTIONS_P2P', paddle.base.core.is_compiled_with_xpu() @@ -495,6 +503,16 @@ def __init__(self, layers, hcg, strategy): # only support user hooks during training self.user_hooks_enabled = True + #next layer's recompute's backward overlap with this layer's recompute's forward + self.recompute_overlap = True + #preserve = kwargs.pop('preserve_rng_state', True) + self.preserve_rng_state = True + #offload_indices = kwargs.pop('offload_indices', []) + self.offload_indices =[] + self.custom_get_state_func = lambda x=None: None + self.custom_set_state_func = lambda x=None: None + + def register_hook( self, location: PipelineParallelMicroStepLocations, hook: Callable ): @@ -749,6 +767,90 @@ def _flush_records(self): ) as f: f.writelines(record + '\n' for record in self._records) self._records = [] + + def save_state(self, state_buffers): + state = {} + if self.preserve_rng_state: + 
state["fw_rng_state"] = paddle.get_rng_state() + state["fwd_rng_state_tracker"] = ( + get_rng_state_tracker().get_states_tracker() + ) + state["fwd_numpy_state"] = np.random.get_state() + state["fwd_random_state"] = random.getstate() + state["fwd_custom_state"] = self.custom_get_state_func() + state["custom_get_state_func"] = self.custom_get_state_func + state["custom_set_state_func"] = self.custom_set_state_func + tracer = framework._dygraph_tracer() + state["is_fw_autocast"] = ( + False if tracer._amp_level == framework.core.AmpLevel.O0 else True + ) + if tracer._amp_level == framework.core.AmpLevel.O2: + state["amp_level"] = 'O2' + elif tracer._amp_level in (framework.core.AmpLevel.O1, framework.core.AmpLevel.O0): + state["amp_level"] = 'O1' + else: + raise ValueError(f"unsupported amp level: {tracer._amp_level}") + + if tracer._amp_dtype == 'float16': + state["amp_dtype"] = 'float16' + elif tracer._amp_dtype in ('bfloat16', 'float32'): + state["amp_dtype"] = 'bfloat16' + else: + raise ValueError(f"unsupported amp dtype: {tracer._amp_dtype}") + state["amp_white_list"], state["amp_black_list"] = tracer._get_amp_op_list() + state_buffers.append(state) + + def load_state_and_forward(self, state, input_tensor): + inputs = list(input_tensor) + tensor_indices = state["tensor_indices"] + tensors = self.container + for i, idx in enumerate(tensor_indices): + inputs[idx] = ( + tensors[i].to( + paddle.base.framework._current_expected_place() + ) + if i in state["offload_indices"] + else tensors[i] + ) + if i in state["offload_indices"]: + inputs[idx].stop_gradient = tensors[i].stop_gradient + tracer = framework._dygraph_tracer() + tracer._has_grad = True + + if state["preserve_rng_state"]: + with ( + switch_rng_state_tracker( + state["fw_rng_state"], + state["fwd_rng_state_tracker"], + state["fwd_numpy_state"], + state["fwd_random_state"], + state["fwd_custom_state"], + state["custom_get_state_func"], + state["custom_set_state_func"], + ), + paddle.amp.auto_cast( + enable=state["is_fw_autocast"], + custom_white_list=state["amp_white_list"], + custom_black_list=state["amp_black_list"], + level=state["amp_level"], + dtype=state["amp_dtype"], + ), + ): + detached_inputs = detach_variable(tuple(inputs)) + outputs = self._layers.forward(*detached_inputs) + else: + with paddle.amp.auto_cast( + enable=state["is_fw_autocast"], + custom_white_list=state["amp_white_list"], + custom_black_list=state["amp_black_list"], + level=state["amp_level"], + dtype=state["amp_dtype"], + ): + detached_inputs = detach_variable(tuple(inputs)) + outputs = self._layers.forward(*detached_inputs) + return outputs + + + def forward_backward_pipeline( self, @@ -796,6 +898,8 @@ def forward_backward_pipeline( input_buffers = [] output_buffers = [] + if self.recompute_overlap: + state_buffers = [] micro_dataset = self._wrap_data(data) @@ -813,6 +917,8 @@ def forward_backward_pipeline( input_tensor_dict, use_dict = tuple_to_dict_helper(input_tensor) self._record_stamp("F", step_id, '"B"', self._forward_color) + if self.recompute_overlap: + self.save_state(state_buffers) output_tensor, _, _ = self._forward_step( input_tensor=input_tensor_dict if use_dict else input_tensor, micro_dataset=micro_dataset, @@ -856,6 +962,8 @@ def forward_backward_pipeline( self._record_stamp( "F", startup_steps + i, '"B"', self._forward_color ) + if self.recompute_overlap: + self.save_state(state_buffers) output_tensor, _, _ = self._forward_step( input_tensor=input_tensor_dict if use_dict else input_tensor, micro_dataset=micro_dataset, @@ -891,9 +999,16 @@
def forward_backward_pipeline( ) self._record_stamp("B", i, '"B"', self._backward_color) - input_tensor_grad = self._backward_step( - input_tensor, output_tensor, output_tensor_grad, step_id=i - ) + if self.recompute_overlap: + state = state_buffers.pop(0) + output_tensor_recompute = self.load_state_and_forward(state, input_tensor) + input_tensor_grad = self._backward_step( + input_tensor, output_tensor_recompute, output_tensor_grad, step_id=i + ) + else: + input_tensor_grad = self._backward_step( + input_tensor, output_tensor, output_tensor_grad, step_id=i + ) self._record_stamp("B", i, '"E"', self._backward_color) if last_iter: @@ -933,12 +1048,22 @@ def forward_backward_pipeline( self._record_stamp( "B", steady_steps + i, '"B"', self._backward_color ) - input_tensor_grad = self._backward_step( - input_tensor, - output_tensor, - output_tensor_grad, - step_id=steady_steps + i, - ) + if self.recompute_overlap: + state = state_buffers.pop(0) + output_tensor_recompute = self.load_state_and_forward(state, input_tensor) + input_tensor_grad = self._backward_step( + input_tensor, + output_tensor_recompute, + output_tensor_grad, + step_id=steady_steps + i, + ) + else: + input_tensor_grad = self._backward_step( + input_tensor, + output_tensor, + output_tensor_grad, + step_id=steady_steps + i, + ) self._record_stamp( "B", steady_steps + i, '"E"', self._backward_color ) @@ -1254,11 +1379,21 @@ def _forward_step( schedule_chunk = None if overlap_schedule_mode: schedule_chunk = self._layers.get_schedule_chunk(chunk_id=chunk_id) - output_tensor = schedule_chunk.forward(input_tensor) + if self.recompute_overlap: + with paddle.no_grad(): + output_tensor = schedule_chunk.forward(input_tensor) + else: + output_tensor = schedule_chunk.forward(input_tensor) else: - output_tensor = self._layers.forward( - input_tensor, chunk_id=chunk_id - ) + if self.recompute_overlap: + with paddle.no_grad(): + output_tensor = self._layers.forward( + input_tensor, chunk_id=chunk_id + ) + else: + output_tensor = self._layers.forward( + input_tensor, chunk_id=chunk_id + ) self.callbacks.on_location( PipelineParallelMicroStepLocations.FORWARD_END, From 025efc33f3daad27e6b8eda75d032c91c1a7a020 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Tue, 4 Nov 2025 11:30:46 +0800 Subject: [PATCH 08/22] [API Compatibility] Add paddle.Tensor.clip_ --- python/paddle/tensor/__init__.py | 2 ++ test/legacy_test/test_clip_op.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b6d3d3bdc50847..026b051b337104 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -515,6 +515,7 @@ greater = gt sub = subtract sub_ = subtract_ +clamp_ = clip_ # this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ @@ -947,6 +948,7 @@ 'gt', 'greater', 'clamp', + 'clamp_', ] diff --git a/test/legacy_test/test_clip_op.py b/test/legacy_test/test_clip_op.py index de37d48303782c..480f08c59e3f41 100644 --- a/test/legacy_test/test_clip_op.py +++ b/test/legacy_test/test_clip_op.py @@ -1033,5 +1033,37 @@ def test_static_compatibility(self): np.testing.assert_array_equal(self.np_out, fetches[0]) +class TestClamp_AndClip_(unittest.TestCase): + def setUp(self) -> None: + paddle.disable_static() + self.shape = [3, 4, 5] + self.input_np = np.random.random(self.shape).astype('float32') + self.a = np.random.random(self.shape).astype('float32') + self.b = np.random.random(self.shape).astype('float32') + 
self.min, self.max = -0.5, 0.5 + + def test_clip_and_clamp(self): + clip_a = paddle.to_tensor(self.a, stop_gradient=False) + clip_b = paddle.to_tensor(self.b, stop_gradient=False) + + clamp_a = paddle.to_tensor(self.a, stop_gradient=False) + clamp_b = paddle.to_tensor(self.b, stop_gradient=False) + + clip_x = clip_a + clip_b + clip_x.clip_(min=self.min, max=self.max) + clip_x.retain_grads() + clip_x.mean().backward() + + clamp_x = clamp_a + clamp_b + clamp_x.clamp_(min=self.min, max=self.max) + clamp_x.retain_grads() + clamp_x.mean().backward() + + np.testing.assert_allclose(clip_x.numpy(), clamp_x.numpy(), rtol=1e-20) + np.testing.assert_allclose( + clip_x.grad.numpy(), clamp_x.grad.numpy(), rtol=1e-20 + ) + + if __name__ == '__main__': unittest.main() From 4e546424352b931aa24cb3fd43f63abc7100d528 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 11:02:00 +0800 Subject: [PATCH 09/22] Revert "support recompute's forward and backward in pipeline mode" This reverts commit 7fd48d9060b292136bce9cdf79983530d5c5d52f. --- .../fleet/meta_parallel/pipeline_parallel.py | 163 ++---------------- 1 file changed, 14 insertions(+), 149 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 47b1ab90f6da25..027a734eedd141 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -22,14 +22,10 @@ from enum import Enum from functools import partial from typing import Callable -import numpy as np -import random import paddle from paddle import framework -from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( - get_rng_state_tracker, -) + from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer from ..utils import timer_helper as timer from ..utils.hybrid_parallel_util import ( @@ -42,10 +38,6 @@ from ..utils.log_util import get_sync_logger, logger from .meta_parallel_base import MetaParallelBase from .parallel_layers.pp_layers import PipelineLayer -from ..recompute.recompute import ( - switch_rng_state_tracker, - detach_variable -) _use_four_directions = os.environ.get( 'PADDLE_USE_FOUR_DIRECTIONS_P2P', paddle.base.core.is_compiled_with_xpu() @@ -503,16 +495,6 @@ def __init__(self, layers, hcg, strategy): # only support user hooks during training self.user_hooks_enabled = True - #next layer's recompute's backward overlap with this layer's recompute's forward - self.recompute_overlap = True - #preserve = kwargs.pop('preserve_rng_state', True) - self.preserve_rng_state = True - #offload_indices = kwargs.pop('offload_indices', []) - self.offload_indices =[] - self.custom_get_state_func = lambda x=None: None - self.custom_set_state_func = lambda x=None: None - - def register_hook( self, location: PipelineParallelMicroStepLocations, hook: Callable ): @@ -767,90 +749,6 @@ def _flush_records(self): ) as f: f.writelines(record + '\n' for record in self._records) self._records = [] - - def save_state(self, state_buffers): - state = {} - if self.preserve_rng_state: - state["fw_rng_state"] = paddle.get_rng_state() - state["fwd_rng_state_tracker"] = ( - get_rng_state_tracker().get_states_tracker() - ) - state[s"fwd_numpy_state"] = np.random.get_state() - state["fwd_random_state"] = random.getstate() - state["fwd_custom_state"] = self.custom_get_state_func() - state["custom_get_state_func"] = self.custom_get_state_func - state["custom_set_state_func"] 
= self.custom_set_state_func - tracer = framework._dygraph_tracer() - state["is_fw_autocast"] = ( - False if tracer._amp_level == framework.core.AmpLevel.O0 else True - ) - if tracer._amp_level == framework.core.AmpLevel.O2: - state["amp_level"] = 'O2' - elif tracer._amp_level in (framework.core.AmpLevel.O1, framework.core.AmpLevel.O0): - state["amp_level"] = 'O1' - else: - raise ValueError(f"unsupported amp level: {tracer._amp_level}") - - if tracer._amp_dtype == 'float16': - state["amp_dtype"] = 'float16' - elif tracer._amp_dtype in ('bfloat16', 'float32'): - state["amp_dtype"] = 'bfloat16' - else: - raise ValueError(f"unsupported amp dtype: {tracer._amp_dtype}") - state["amp_white_list"], state["amp_black_list"] = tracer._get_amp_op_list() - state_buffers.append(state) - - def load_state_and_forward(self, state, input_tensor): - inputs = list(input_tensor) - tensor_indices = state["tensor_indices"] - tensors = self.container - for i, idx in enumerate(tensor_indices): - inputs[idx] = ( - tensors[i].to( - paddle.base.framework._current_expected_place() - ) - if i in state["offload_indices"] - else tensors[i] - ) - if i in state["offload_indices"]: - inputs[idx].stop_gradient = tensors[i].stop_gradient - tracer = framework._dygraph_tracer() - tracer._has_grad = True - - if state["preserve_rng_state"]: - with ( - switch_rng_state_tracker( - state["fw_rng_state"], - state["fwd_rng_state_tracker"], - state["fwd_numpy_state"], - state["fwd_random_state"], - state["fwd_custom_state"], - state["custom_get_state_func"], - state["custom_set_state_func"], - ), - paddle.amp.auto_cast( - enable=state["is_fw_autocast"], - custom_white_list=state["amp_white_list"], - custom_black_list=state["amp_black_list"], - level=state["amp_level"], - dtype=state["amp_dtype"], - ), - ): - detached_inputs = detach_variable(tuple(inputs)) - outputs = self._layers.forward(*detached_inputs) - else: - with paddle.amp.auto_cast( - enable=state["is_fw_autocast"], - custom_white_list=state["amp_white_list"], - custom_black_list=state["amp_black_list"], - level=state["amp_level"], - dtype=state["amp_dtype"], - ): - detached_inputs = detach_variable(tuple(inputs)) - outputs = self._layers.forward(*detached_inputs) - return outputs - - def forward_backward_pipeline( self, @@ -898,8 +796,6 @@ def forward_backward_pipeline( input_buffers = [] output_buffers = [] - if self.recompute_overlap: - state_buffers = [] micro_dataset = self._wrap_data(data) @@ -917,8 +813,6 @@ def forward_backward_pipeline( input_tensor_dict, use_dict = tuple_to_dict_helper(input_tensor) self._record_stamp("F", step_id, '"B"', self._forward_color) - if self.recompute_overlap: - self.save_state(state_buffers) output_tensor, _, _ = self._forward_step( input_tensor=input_tensor_dict if use_dict else input_tensor, micro_dataset=micro_dataset, @@ -962,8 +856,6 @@ def forward_backward_pipeline( self._record_stamp( "F", startup_steps + i, '"B"', self._forward_color ) - if self.recompute_overlap: - self.save_state(state_buffers) output_tensor, _, _ = self._forward_step( input_tensor=input_tensor_dict if use_dict else input_tensor, micro_dataset=micro_dataset, @@ -999,16 +891,9 @@ def forward_backward_pipeline( ) self._record_stamp("B", i, '"B"', self._backward_color) - if self.recompute_overlap: - state = state_buffers.pop(0) - output_tensor_recompute = self.load_state_and_forward(state, input_tensor) - input_tensor_grad = self._backward_step( - input_tensor, output_tensor_recompute, output_tensor_grad, step_id=i - ) - else: - input_tensor_grad = 
self._backward_step( - input_tensor, output_tensor, output_tensor_grad, step_id=i - ) + input_tensor_grad = self._backward_step( + input_tensor, output_tensor, output_tensor_grad, step_id=i + ) self._record_stamp("B", i, '"E"', self._backward_color) if last_iter: @@ -1048,22 +933,12 @@ def forward_backward_pipeline( self._record_stamp( "B", steady_steps + i, '"B"', self._backward_color ) - if self.recompute_overlap: - state = state_buffers.pop(0) - output_tensor_recompute = self.load_state_and_forward(state, input_tensor) - input_tensor_grad = self._backward_step( - input_tensor, - output_tensor_recompute, - output_tensor_grad, - step_id=steady_steps + i, - ) - else: - input_tensor_grad = self._backward_step( - input_tensor, - output_tensor, - output_tensor_grad, - step_id=steady_steps + i, - ) + input_tensor_grad = self._backward_step( + input_tensor, + output_tensor, + output_tensor_grad, + step_id=steady_steps + i, + ) self._record_stamp( "B", steady_steps + i, '"E"', self._backward_color ) @@ -1379,21 +1254,11 @@ def _forward_step( schedule_chunk = None if overlap_schedule_mode: schedule_chunk = self._layers.get_schedule_chunk(chunk_id=chunk_id) - if self.recompute_overlap: - with paddle.no_grad(): - output_tensor = schedule_chunk.forward(input_tensor) - else: - output_tensor = schedule_chunk.forward(input_tensor) + output_tensor = schedule_chunk.forward(input_tensor) else: - if self.recompute_overlap: - with paddle.no_grad(): - output_tensor = self._layers.forward( - input_tensor, chunk_id=chunk_id - ) - else: - output_tensor = self._layers.forward( - input_tensor, chunk_id=chunk_id - ) + output_tensor = self._layers.forward( + input_tensor, chunk_id=chunk_id + ) self.callbacks.on_location( PipelineParallelMicroStepLocations.FORWARD_END, From f45380c50e0fd4ff277f30d74da59fedcf50dbc5 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 11:02:18 +0800 Subject: [PATCH 10/22] Revert "[API Compatibility] Add paddle.Tensor.clip_" This reverts commit 025efc33f3daad27e6b8eda75d032c91c1a7a020. 
--- python/paddle/tensor/__init__.py | 2 -- test/legacy_test/test_clip_op.py | 32 -------------------------------- 2 files changed, 34 deletions(-) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 026b051b337104..b6d3d3bdc50847 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -515,7 +515,6 @@ greater = gt sub = subtract sub_ = subtract_ -clamp_ = clip_ # this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ @@ -948,7 +947,6 @@ 'gt', 'greater', 'clamp', - 'clamp_', ] diff --git a/test/legacy_test/test_clip_op.py b/test/legacy_test/test_clip_op.py index 480f08c59e3f41..de37d48303782c 100644 --- a/test/legacy_test/test_clip_op.py +++ b/test/legacy_test/test_clip_op.py @@ -1033,37 +1033,5 @@ def test_static_compatibility(self): np.testing.assert_array_equal(self.np_out, fetches[0]) -class TestClamp_AndClip_(unittest.TestCase): - def setUp(self) -> None: - paddle.disable_static() - self.shape = [3, 4, 5] - self.input_np = np.random.random(self.shape).astype('float32') - self.a = np.random.random(self.shape).astype('float32') - self.b = np.random.random(self.shape).astype('float32') - self.min, self.max = -0.5, 0.5 - - def test_clip_and_clamp(self): - clip_a = paddle.to_tensor(self.a, stop_gradient=False) - clip_b = paddle.to_tensor(self.b, stop_gradient=False) - - clamp_a = paddle.to_tensor(self.a, stop_gradient=False) - clamp_b = paddle.to_tensor(self.b, stop_gradient=False) - - clip_x = clip_a + clip_b - clip_x.clip_(min=self.min, max=self.max) - clip_x.retain_grads() - clip_x.mean().backward() - - clamp_x = clamp_a + clamp_b - clamp_x.clamp_(min=self.min, max=self.max) - clamp_x.retain_grads() - clamp_x.mean().backward() - - np.testing.assert_allclose(clip_x.numpy(), clamp_x.numpy(), rtol=1e-20) - np.testing.assert_allclose( - clip_x.grad.numpy(), clamp_x.grad.numpy(), rtol=1e-20 - ) - - if __name__ == '__main__': unittest.main() From dc28d5eac9139d111cde1bfdd5b7171f7b354ac1 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 11/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/__init__.py | 4 +- python/paddle/functional.py | 2 + python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/activation.py | 2 + python/paddle/nn/init.py | 37 ++++++++++++++++++ test/legacy_test/test_activation_op.py | 47 +++++++++++++++++++++++ test/legacy_test/test_autocast.py | 40 +++++++++++++++++++ test/legacy_test/test_nn_init_function.py | 37 ++++++++++++++++++ 8 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 test/legacy_test/test_autocast.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 29109ac151ccce..687e306d720cb1 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -226,6 +226,7 @@ def new_init(self, *args, **kwargs): get_autocast_gpu_dtype, is_autocast_enabled, ) +from .amp.auto_cast import autocast as _autocast from .autograd import ( enable_grad, grad, @@ -970,7 +971,7 @@ def __dir__(self): manual_seed = seed sub = subtract sub_ = subtract_ - +autocast = _autocast __all__ = [ 'block_diag', @@ -1481,6 +1482,7 @@ def __dir__(self): 'conv3d', 'manual_seed', 'softmax', + 'autocast', ] import os diff --git 
a/python/paddle/functional.py b/python/paddle/functional.py index 96e0c5eb6106bc..6642f3867899b8 100644 --- a/python/paddle/functional.py +++ b/python/paddle/functional.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from .compat import split +from .tensor.creation import meshgrid from .tensor.einsum import einsum from .tensor.linalg import norm from .tensor.manipulation import ( @@ -31,4 +32,5 @@ "norm", 'split', 'unique_consecutive', + "meshgrid", ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index db823aa97d7f1e..cd5c5e702c7245 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -172,6 +172,7 @@ pixel_unshuffle, ) +logsigmoid = log_sigmoid __all__ = [ 'celu', 'conv1d', @@ -192,6 +193,7 @@ 'leaky_relu', 'leaky_relu_', 'log_sigmoid', + 'logsigmoid', 'maxout', 'prelu', 'relu', diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index b7b63d5c7c1323..b5016f298ed890 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -754,6 +754,7 @@ def relu_(x: Tensor, name: str | None = None) -> Tensor: return _C_ops.relu_(x) +@param_one_alias(["x", "input"]) def log_sigmoid(x: Tensor, name: str | None = None) -> Tensor: r""" log_sigmoid activation. @@ -764,6 +765,7 @@ def log_sigmoid(x: Tensor, name: str | None = None) -> Tensor: Parameters: x (Tensor): The input Tensor with data type float32, float64, complex64, complex128. + Alias: ``input``. name (str|None, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: diff --git a/python/paddle/nn/init.py b/python/paddle/nn/init.py index ad6116ddcb64e4..5dc730c6e43e0d 100644 --- a/python/paddle/nn/init.py +++ b/python/paddle/nn/init.py @@ -14,6 +14,8 @@ from __future__ import annotations +import numpy as np + import paddle from ..base.framework import in_dygraph_mode, in_pir_mode @@ -27,6 +29,41 @@ from .initializer.xavier import XavierNormal, XavierUniform +def _calculate_fan_in_and_fan_out(var: paddle.Tensor) -> tuple[int, int]: + """Compute the fan_in and the fan_out for layers + + This method computes the fan_in and the fan_out + for neural network layers, if not specified. It is + not possible to perfectly estimate fan_in and fan_out. + This method will estimate it correctly for matrix multiply and + convolutions. + + Args: + var: variable for which fan_in and fan_out have to be computed. + + Returns: + tuple of two integers (fan_in, fan_out). + """ + shape = var.shape + if not shape or len(shape) == 0: + fan_in = fan_out = 1 + elif len(shape) == 1: + fan_in = fan_out = shape[0] + elif len(shape) == 2: + # This is the case for simple matrix multiply + fan_in = shape[0] + fan_out = shape[1] + else: + # Assume this to be a convolutional kernel + # In PaddlePaddle, the shape of the kernel is like: + # [num_filters, num_filter_channels, ...] 
where the remaining + # dimensions are the filter_size + receptive_field_size = np.prod(shape[2:]) + fan_in = int(shape[1] * receptive_field_size) + fan_out = int(shape[0] * receptive_field_size) + return (fan_in, fan_out) + + def kaiming_uniform_( tensor: paddle.Tensor, a: float = 0, diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index a72407c157555f..705a16896b992c 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -854,6 +854,53 @@ def test_errors(self): F.log_sigmoid(x_fp16) +class TestLogSigmoidOutAndParaDecorator(unittest.TestCase): + def setUp(self) -> None: + paddle.disable_static() + self.apis = [ + paddle.nn.functional.log_sigmoid, + paddle.nn.functional.logsigmoid, + ] + self.shape = [3, 4, 5] + self.input_np = np.random.random(self.shape).astype('float32') + + def do_test(self, api, test_type): + self.test_types = [ + "decorator1", + ] + x = paddle.to_tensor(self.input_np, stop_gradient=False) + out = paddle.zeros(self.shape, dtype='float32') + out.stop_gradient = False + if test_type == "raw": + out = paddle.nn.functional.log_sigmoid(x) + out.mean().backward() + return out, x.grad + elif test_type == "decorator1": + res = api(input=x) + loss = res.mean() + loss.backward() + x_grad = x.grad + return res, x_grad + else: + raise NotImplementedError( + f"Test type {test_type} is not implemented." + ) + + def test_api(self): + out_std, x_grad_std = self.do_test( + paddle.nn.functional.log_sigmoid, "raw" + ) + for api in self.apis: + for test_type in self.test_types: + out, x_grad = self.do_test(api, test_type) + np.testing.assert_allclose( + out.numpy(), out_std.numpy(), rtol=1e-20 + ) + np.testing.assert_allclose( + x_grad.numpy(), x_grad_std.numpy(), rtol=1e-20 + ) + + class TestTanh(TestActivation, TestParameter): def setUp(self): self.op_type = "tanh" diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py new file mode 100644 index 00000000000000..1e2b0593da01c5 --- /dev/null +++ b/test/legacy_test/test_autocast.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle + + +class TestAutoCast(unittest.TestCase): + def init_net(self): + self._conv = paddle.nn.Conv2D( + in_channels=1, out_channels=6, kernel_size=3, bias_attr=False + ) + self._linear = paddle.nn.Linear(in_features=4, out_features=4) + + def test_autocast(self): + self.init_net() + with paddle.autocast(): + out1 = self._conv(paddle.rand(shape=[1, 1, 6, 6], dtype='float32')) + out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') + out3 = self._linear(out2) + + self.assertEqual(out1.dtype, paddle.float16) + self.assertEqual(out2.dtype, paddle.float16) + self.assertEqual(out3.dtype, paddle.float32) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_nn_init_function.py b/test/legacy_test/test_nn_init_function.py index fb21baacb72e72..ad6ccf89c020b7 100644 --- a/test/legacy_test/test_nn_init_function.py +++ b/test/legacy_test/test_nn_init_function.py @@ -62,6 +62,22 @@ def _calculate_gain(nonlinearity, param): return recommended_gain[nonlinearity] +def _calculate_fan_in_and_fan_out(var: paddle.Tensor) -> tuple[int, int]: + shape = var.shape + if not shape or len(shape) == 0: + fan_in = fan_out = 1 + elif len(shape) == 1: + fan_in = fan_out = shape[0] + elif len(shape) == 2: + fan_in = shape[0] + fan_out = shape[1] + else: + receptive_field_size = np.prod(shape[2:]) + fan_in = shape[1] * receptive_field_size + fan_out = shape[0] * receptive_field_size + return (fan_in, fan_out) + + class Test_calculate_gain(unittest.TestCase): def test(self): for nonlinearity in [ @@ -87,6 +103,27 @@ def test(self): ) +class TestCAlFanINOUT(unittest.TestCase): + def test_cal_fan_in_and_out(self): + x = paddle.tensor.randn([10]) + self.assertEqual( + _calculate_fan_in_and_fan_out(x), + paddle.nn.init._calculate_fan_in_and_fan_out(x), + ) + + y = paddle.tensor.randn([10, 10]) + self.assertEqual( + _calculate_fan_in_and_fan_out(y), + paddle.nn.init._calculate_fan_in_and_fan_out(y), + ) + + z = paddle.randn([10, 10, 10]) + self.assertEqual( + _calculate_fan_in_and_fan_out(z), + paddle.nn.init._calculate_fan_in_and_fan_out(z), + ) + + class Test_kaiming_uniform_(unittest.TestCase): def check_kaiming_uniform( self, tensor, a=0, mode='fan_in', nonlinearity='leaky_relu' From a8f7186d137a330a79bb587b39a2a0d49466bbb4 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 12/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/tensor/__init__.py | 2 ++ test/legacy_test/test_clip_op.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b6d3d3bdc50847..026b051b337104 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -515,6 +515,7 @@ greater = gt sub = subtract sub_ = subtract_ +clamp_ = clip_ # this list used in math_op_patch.py for _binary_creator_ tensor_method_func = [ @@ -947,6 +948,7 @@ 'gt', 'greater', 'clamp', + 'clamp_', ] diff --git a/test/legacy_test/test_clip_op.py b/test/legacy_test/test_clip_op.py index de37d48303782c..480f08c59e3f41 100644 --- a/test/legacy_test/test_clip_op.py +++ b/test/legacy_test/test_clip_op.py @@ -1033,5 +1033,37 @@ def test_static_compatibility(self): 
np.testing.assert_array_equal(self.np_out, fetches[0]) +class TestClamp_AndClip_(unittest.TestCase): + def setUp(self) -> None: + paddle.disable_static() + self.shape = [3, 4, 5] + self.input_np = np.random.random(self.shape).astype('float32') + self.a = np.random.random(self.shape).astype('float32') + self.b = np.random.random(self.shape).astype('float32') + self.min, self.max = -0.5, 0.5 + + def test_clip_and_clamp(self): + clip_a = paddle.to_tensor(self.a, stop_gradient=False) + clip_b = paddle.to_tensor(self.b, stop_gradient=False) + + clamp_a = paddle.to_tensor(self.a, stop_gradient=False) + clamp_b = paddle.to_tensor(self.b, stop_gradient=False) + + clip_x = clip_a + clip_b + clip_x.clip_(min=self.min, max=self.max) + clip_x.retain_grads() + clip_x.mean().backward() + + clamp_x = clamp_a + clamp_b + clamp_x.clamp_(min=self.min, max=self.max) + clamp_x.retain_grads() + clamp_x.mean().backward() + + np.testing.assert_allclose(clip_x.numpy(), clamp_x.numpy(), rtol=1e-20) + np.testing.assert_allclose( + clip_x.grad.numpy(), clamp_x.grad.numpy(), rtol=1e-20 + ) + + if __name__ == '__main__': unittest.main() From 7fcdd485266d6084cc21161ea06107fc75d1458f Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 13/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 687e306d720cb1..2fb61ae2731d23 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -226,7 +226,7 @@ def new_init(self, *args, **kwargs): get_autocast_gpu_dtype, is_autocast_enabled, ) -from .amp.auto_cast import autocast as _autocast +from .amp.auto_cast import autocast from .autograd import ( enable_grad, grad, @@ -971,7 +971,6 @@ def __dir__(self): manual_seed = seed sub = subtract sub_ = subtract_ -autocast = _autocast __all__ = [ 'block_diag', From 6431d8fe83dbda5678e8df2fc1d5af575f40751c Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 14/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index 1e2b0593da01c5..4b4e3961a72806 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -26,7 +26,7 @@ def init_net(self): def test_autocast(self): self.init_net() - with paddle.autocast(): + with paddle.autocast("gpu"): out1 = self._conv(paddle.rand(shape=[1, 1, 6, 6], dtype='float32')) out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = self._linear(out2) From 9d93a481f5009b864755dce55913d6b1962830c5 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 15/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index 4b4e3961a72806..f18057913b945b 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -18,14 +18,11 @@ class TestAutoCast(unittest.TestCase): - def init_net(self): - self._conv = paddle.nn.Conv2D( - in_channels=1, out_channels=6, kernel_size=3, bias_attr=False - ) - self._linear = paddle.nn.Linear(in_features=4, out_features=4) + def setUp(self): + self._conv = paddle.nn.Conv2D(1, 1, 3, bias_attr=False) + self._linear = paddle.nn.Linear(4, 4) def test_autocast(self): - self.init_net() with paddle.autocast("gpu"): out1 = self._conv(paddle.rand(shape=[1, 1, 6, 6], dtype='float32')) out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') From fdb2a259d653f22130e70bce6fa6c61500480afe Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 16/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index f18057913b945b..d4378afcd9e869 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -28,7 +28,7 @@ def test_autocast(self): out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = self._linear(out2) - self.assertEqual(out1.dtype, paddle.float16) + self.assertEqual(out1.dtype, paddle.float32) self.assertEqual(out2.dtype, paddle.float16) self.assertEqual(out3.dtype, paddle.float32) From b2f67a74c81fdce1c946a4faac258834f9485e51 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 17/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index d4378afcd9e869..580d5dd15af02c 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -23,7 +23,12 @@ def setUp(self): self._linear = paddle.nn.Linear(4, 4) def test_autocast(self): - with paddle.autocast("gpu"): + with paddle.autocast( + device_type='cuda', + enabled=True, + dtype=paddle.float16, + cache_enabled=True, + ): out1 = self._conv(paddle.rand(shape=[1, 1, 6, 6], dtype='float32')) out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = self._linear(out2) From fdce6029fbffa6b6f4db70d403e5688d94c61d2c Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 18/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index 580d5dd15af02c..dac9077dc82208 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -33,7 +33,7 @@ def test_autocast(self): out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = self._linear(out2) - self.assertEqual(out1.dtype, paddle.float32) + self.assertEqual(out1.dtype, paddle.float16) self.assertEqual(out2.dtype, paddle.float16) self.assertEqual(out3.dtype, paddle.float32) From 767effe2987c0924f44e6fd68f6285dd43f3db01 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 19/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index dac9077dc82208..a6fec9bdc0f404 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -33,8 +33,8 @@ def test_autocast(self): out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = self._linear(out2) - self.assertEqual(out1.dtype, paddle.float16) - self.assertEqual(out2.dtype, paddle.float16) + self.assertEqual(out1.dtype, paddle.float32) + self.assertEqual(out2.dtype, paddle.float32) self.assertEqual(out3.dtype, paddle.float32) From 97e5dc1bab966c405f32a39e1324cc0bd922c404 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 20/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index a6fec9bdc0f404..26b403088b4d71 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -33,7 +33,6 @@ def test_autocast(self): out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = self._linear(out2) - self.assertEqual(out1.dtype, paddle.float32) self.assertEqual(out2.dtype, paddle.float32) self.assertEqual(out3.dtype, paddle.float32) From b228dce6573568e35c020c6f8d74a38909e06b27 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 21/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index 26b403088b4d71..26c88d156ae73a 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -33,6 +33,7 @@ def test_autocast(self): out2 = out1 + paddle.rand(shape=out1.shape, dtype='float16') out3 = 
self._linear(out2) + self.assertEqual(out1.dtype, paddle.float16) self.assertEqual(out2.dtype, paddle.float32) self.assertEqual(out3.dtype, paddle.float32) From 56deece17d42dbfd67cb42b6c6ff2df748e02ce9 Mon Sep 17 00:00:00 2001 From: AlAuAu <458134681@qq.com> Date: Wed, 5 Nov 2025 10:50:31 +0800 Subject: [PATCH 22/22] =?UTF-8?q?[API=20Compatibility]=20Add=20clip=5F?= =?UTF-8?q?=E3=80=81logsigmoid=E3=80=81=5Fcalculate=5Ffan=5Fin=5Fand=5Ffan?= =?UTF-8?q?=5Fout=E3=80=81meshgrid=E3=80=81autocast?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/legacy_test/test_autocast.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/test/legacy_test/test_autocast.py b/test/legacy_test/test_autocast.py index 26c88d156ae73a..e4d16b1b6211c8 100644 --- a/test/legacy_test/test_autocast.py +++ b/test/legacy_test/test_autocast.py @@ -15,14 +15,34 @@ import unittest import paddle - - -class TestAutoCast(unittest.TestCase): +from paddle.base import core + + +@unittest.skipIf( + not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(), + "Require compiled with CUDA or XPU.", +) +@unittest.skipIf( + core.is_compiled_with_cuda() + and paddle.device.cuda.get_device_capability()[0] < 7.0, + "run test when gpu's compute capability is at least 7.0.", +) +@unittest.skipIf( + core.is_compiled_with_xpu() + and core.get_xpu_device_version(0) < core.XPUVersion.XPU3, + "run test when xpu's compute capability >= xpu3.", +) +@unittest.skipIf( + core.is_compiled_with_xpu() + and core.get_xpu_device_version(0) == core.XPUVersion.XPU3, + "Bugs on XPU3, disable temporarily", +) +class TestCudaAutoCast(unittest.TestCase): def setUp(self): self._conv = paddle.nn.Conv2D(1, 1, 3, bias_attr=False) self._linear = paddle.nn.Linear(4, 4) - def test_autocast(self): + def _run_autocast_test(self, ctx): with paddle.autocast( device_type='cuda', enabled=True, @@ -34,7 +54,7 @@ def test_autocast(self): out3 = self._linear(out2) self.assertEqual(out1.dtype, paddle.float16) - self.assertEqual(out2.dtype, paddle.float32) + self.assertEqual(out2.dtype, paddle.float16) self.assertEqual(out3.dtype, paddle.float32)