23 changes: 12 additions & 11 deletions examples/pre-training/ernie/pretrain.py
@@ -296,16 +296,16 @@ def formatv(v):
         and not args.overwrite_output_dir
     ):
         last_checkpoint = get_last_checkpoint(args.output_dir)
-        if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0:
-            raise ValueError(
-                f"Output directory ({args.output_dir}) already exists and is not empty. "
-                "Use --overwrite_output_dir to overcome."
-            )
-        elif last_checkpoint is not None and args.resume_from_checkpoint is None:
-            logger.info(
-                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-            )
+        # if last_checkpoint is None and len(os.listdir(args.output_dir)) > 0:
+        #     raise ValueError(
+        #         f"Output directory ({args.output_dir}) already exists and is not empty. "
+        #         "Use --overwrite_output_dir to overcome."
+        #     )
+        # elif last_checkpoint is not None and args.resume_from_checkpoint is None:
+        #     logger.info(
+        #         f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+        #         "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+        #     )
 
     def compute_metrics(p):
         preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
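
With this block commented out, automatic resume detection is disabled: a non-empty output_dir no longer raises, and a detected checkpoint is no longer picked up unless --resume_from_checkpoint is passed explicitly. For reference, a minimal self-contained sketch of the behavior the disabled block provided, assuming the usual checkpoint-<step> directory layout that get_last_checkpoint scans for:

import os

def find_resume_checkpoint(output_dir: str, overwrite_output_dir: bool):
    """Return the newest checkpoint-<step> dir to resume from, or None to train fresh."""
    if overwrite_output_dir or not os.path.isdir(output_dir):
        return None
    # get_last_checkpoint-style scan: assumes checkpoint-<step> subdirectories
    ckpts = [d for d in os.listdir(output_dir) if d.startswith("checkpoint-")]
    if not ckpts:
        return None
    # pick the checkpoint with the largest step suffix, e.g. checkpoint-2000
    return os.path.join(output_dir, max(ckpts, key=lambda d: int(d.split("-")[-1])))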
@@ -439,6 +439,7 @@ def sname_to_tname(pp_model):
     cfg.token_balance_seqlen = args.max_seq_length * args.per_device_train_batch_size
     cfg.fp16_opt_level = args.fp16_opt_level
     cfg.moe_group = args.moe_group
+    cfg.moe_group_name = args.moe_group
     cfg.dtype = dtype
     cfg.use_fp8 = args.use_fp8
     cfg.enable_mtp_magic_send = args.enable_mtp_magic_send
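
The added line mirrors the group value into a dedicated name field. One plausible rationale (an assumption, not something this diff states): cfg.moe_group may later be replaced by an actual communication-group handle, and a plain-string copy keeps the name available for logging and lookups. A self-contained sketch of that pattern:

class Cfg:
    # stand-in for the Ernie config object, not the repo's class
    moe_group = "world"       # may later hold a communication-group object
    moe_group_name = "world"  # always stays a human-readable string

cfg = Cfg()
cfg.moe_group = "dp"
cfg.moe_group_name = cfg.moe_group  # mirror while it is still a string
cfg.moe_group = object()            # e.g. swapped for a real group handle
print(cfg.moe_group_name)           # "dp", still available for logging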
@@ -502,7 +503,7 @@ def sname_to_tname(pp_model):
logger.info(f"using model type:{type(model)}")
paddle.set_default_dtype("float32")

logger.info(f"using model={type(model)}, cfg={cfg}")
# logger.info(f"using model={type(model)}, cfg={cfg}")

train_dataset, eval_dataset, test_dataset, data_collator = (
create_pretrained_dataset(args)
[filename not shown in source]
@@ -1260,7 +1260,7 @@ def _maybe_log_save_evaluate(
             )
         logs["learning_rate"] = float(self._get_learning_rate())
         logs["global_step"] = int(self.state.global_step)
-
+        logs["loss_md5"] = paddle.to_tensor(logs["loss"])._md5sum()
         divisor = 2**30
 
         current_device = framework._current_expected_place_()
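
Logging an MD5 of the loss tensor gives a cheap bitwise-determinism check: two runs, or two ranks, can be compared for exact agreement from the logs alone. _md5sum() is a private Paddle tensor helper; the sketch below reproduces the idea with hashlib and is an assumed equivalent, not the trainer's actual code path:

import hashlib
import numpy as np
import paddle

def loss_md5(loss_value: float) -> str:
    # Hash the raw bytes of the loss tensor; bitwise-identical runs
    # produce identical digests, so any divergence shows up immediately.
    t = paddle.to_tensor(loss_value)
    return hashlib.md5(np.ascontiguousarray(t.numpy()).tobytes()).hexdigest()

print(loss_md5(2.3125))  # stable across bitwise-identical runs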
3 changes: 2 additions & 1 deletion examples/pre-training/models/ernie/configuration.py
@@ -149,6 +149,7 @@ def __init__(
         global_aux_loss=False,
         moe_dropout_prob=0.0,
         moe_group="world",
+        moe_group_name="world",
         num_experts_per_tok: int = 8,
         moe_intermediate_size: Union[int, list] = 0,
         moe_num_shared_experts: int = 0,
@@ -356,6 +357,7 @@ def update_nested_dict(default_dict, update_dict):
         self.moe_layer_interval = moe_layer_interval
         self.moe_dropout_prob = moe_dropout_prob
         self.moe_group = moe_group
+        self.moe_group_name = moe_group_name
         self.num_experts_per_tok = num_experts_per_tok
         self.moe_num_shared_experts = moe_num_shared_experts
         self.moe_num_dense_experts = moe_num_dense_experts
@@ -395,7 +397,6 @@ def update_nested_dict(default_dict, update_dict):

         self.use_linear_residual_norm_recompute = use_linear_residual_norm_recompute
         self.use_rms_qkv_recompute = use_rms_qkv_recompute
-
         assert aux_loss_type in ["", "default", "seq_aux_loss", "switch_aux_loss"]
         self.aux_loss_type = aux_loss_type

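At the config level the new keyword defaults to "world", matching moe_group, and is stored verbatim. A minimal stand-in covering just these two fields (the real __init__ takes many more parameters, and the class name here is illustrative):

class ErnieConfigSketch:
    def __init__(self, moe_group="world", moe_group_name="world"):
        # both default to "world", so existing configs stay valid
        self.moe_group = moe_group
        self.moe_group_name = moe_group_name

cfg = ErnieConfigSketch(moe_group="dp", moe_group_name="dp")
print(cfg.moe_group, cfg.moe_group_name)  # dp dp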