9 changes: 9 additions & 0 deletions trl/trainer/grpo_config.py
@@ -102,6 +102,8 @@ class GRPOConfig(TrainingArguments):
parameter is only effective when `use_vllm` is set to `False`.
cache_implementation (`str`, *optional*):
Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
skip_special_tokens (`bool`, *optional*, defaults to `True`):
Whether to skip special tokens when decoding completions. This affects both reward computation and logging.

> Parameters that control generation acceleration powered by vLLM

@@ -451,6 +453,13 @@ class GRPOConfig(TrainingArguments):
default=None,
metadata={"help": "Implementation of the cache method for faster generation when use_vllm is set to False."},
)
skip_special_tokens: bool = field(
default=True,
metadata={
"help": "Whether to skip special tokens when decoding completions. This affects both reward computation "
"and logging."
},
)

# Parameters that control generation acceleration powered by vLLM
use_vllm: bool = field(
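For reference, a minimal usage sketch (not part of the diff): the new flag is set on the config like any other generation option. The output directory is a placeholder.

```python
from trl import GRPOConfig

# Keep special tokens (e.g. EOS or chat-template markers) in the decoded
# completions passed to reward functions and to the completion logs.
# Defaults to True, which preserves the previous behaviour.
training_args = GRPOConfig(
    output_dir="grpo-output",   # placeholder
    skip_special_tokens=False,
)
```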
5 changes: 3 additions & 2 deletions trl/trainer/grpo_trainer.py
@@ -534,6 +534,7 @@ def cast_outputs_to_original_dtype(module, args, output):
self.log_completions = args.log_completions
self.log_unique_prompts = args.log_unique_prompts
self.num_completions_to_print = args.num_completions_to_print
self.skip_special_tokens = args.skip_special_tokens
# Keep logs sized to the generation batch to record only outputs from the latest model update.
self._logs = {
"images": deque(maxlen=args.generation_batch_size),
@@ -1569,8 +1570,8 @@ def _generate_and_score_completions(
ref_per_token_logps = None

# Decode
-        prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=True)
-        completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
+        prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=self.skip_special_tokens)
+        completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=self.skip_special_tokens)
if is_conversational(inputs[0]):
completions = []
for prompt, completion in zip(prompts, completions_text, strict=True):
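For context, a small sketch of what the forwarded flag changes at the tokenizer level; the checkpoint name is a placeholder and the exact EOS string depends on the tokenizer in use.

```python
import torch
from transformers import AutoTokenizer

# The trainer simply forwards the config flag to batch_decode; this shows the
# difference it makes on a sequence that ends with the EOS token.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder checkpoint

ids = tok("Hello", return_tensors="pt").input_ids
ids = torch.cat([ids, torch.tensor([[tok.eos_token_id]])], dim=1)  # append EOS

print(tok.batch_decode(ids, skip_special_tokens=True)[0])   # "Hello"
print(tok.batch_decode(ids, skip_special_tokens=False)[0])  # "Hello<|im_end|>" for this tokenizer
```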
9 changes: 9 additions & 0 deletions trl/trainer/rloo_config.py
@@ -98,6 +98,8 @@ class RLOOConfig(TrainingArguments):
parameter is only effective when `use_vllm` is set to `False`.
cache_implementation (`str`, *optional*):
Implementation of the cache method for faster generation when `use_vllm` is set to `False`.
skip_special_tokens (`bool`, *optional*, defaults to `True`):
Whether to skip special tokens when decoding completions. This affects both reward computation and logging.

> Parameters that control generation acceleration powered by vLLM

@@ -377,6 +379,13 @@ class RLOOConfig(TrainingArguments):
default=None,
metadata={"help": "Implementation of the cache method for faster generation when use_vllm is set to False."},
)
skip_special_tokens: bool = field(
default=True,
metadata={
"help": "Whether to skip special tokens when decoding completions. This affects both reward computation "
"and logging."
},
)

# Parameters that control generation acceleration powered by vLLM
use_vllm: bool = field(
5 changes: 3 additions & 2 deletions trl/trainer/rloo_trainer.py
@@ -451,6 +451,7 @@ def __init__(
self.log_completions = args.log_completions
self.log_unique_prompts = args.log_unique_prompts
self.num_completions_to_print = args.num_completions_to_print
self.skip_special_tokens = args.skip_special_tokens
# Keep logs sized to the generation batch to record only outputs from the latest model update.
self._logs = {
"images": deque(maxlen=args.generation_batch_size),
@@ -1336,8 +1337,8 @@ def _generate_and_score_completions(
ref_per_token_logps = None

# Decode
-        prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=True)
-        completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
+        prompts_text = self.processing_class.batch_decode(prompt_ids, skip_special_tokens=self.skip_special_tokens)
+        completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=self.skip_special_tokens)
if is_conversational(inputs[0]):
completions = []
for prompt, completion in zip(prompts, completions_text, strict=True):
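To illustrate why reward computation is affected, here is a hypothetical reward function (not part of this PR) that only works when `skip_special_tokens=False`: with the default `True`, the EOS marker is stripped before the completions reach the reward function, so this reward would always be 0. It assumes a standard-format dataset where completions are plain strings; the EOS string is an assumption.

```python
EOS_MARKER = "<|im_end|>"  # adjust to the tokenizer actually in use

def ends_with_eos_reward(completions, **kwargs):
    # Reward completions that explicitly terminate with the EOS marker; this
    # signal is only visible when special tokens are kept during decoding.
    return [1.0 if completion.endswith(EOS_MARKER) else 0.0 for completion in completions]
```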