huggingface
diff --git a/‎.github/workflows/docker-build.yml‎
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/docker-build.yml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 4 additions & 7 deletions b/‎README.md‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎docker/trl-dev/Dockerfile‎
Lines changed: 2 additions & 2 deletions b/‎docker/trl-dev/Dockerfile‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docker/trl/Dockerfile‎
Lines changed: 3 additions & 2 deletions b/‎docker/trl/Dockerfile‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎docs/source/_toctree.yml‎
Lines changed: 16 additions & 14 deletions b/‎docs/source/_toctree.yml‎
Lines changed: 16 additions & 14 deletions
@@ -13,7 +13,8 @@ concurrency:
 jobs:
   trl:
     name: "Build and push TRL Docker image"
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -52,7 +53,8 @@ jobs:
 
   trl-dev:
     name: "Build and push TRL Dev Docker image"
-    runs-on: ubuntu-latest
+    runs-on:
+      group: aws-general-8-plus
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
 
@@ -25,7 +25,7 @@ Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documenta
 
 ## Overview
 
-TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
+TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.
 
 ## Highlights
 
@@ -92,16 +92,13 @@ trainer.train()
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer
+from trl.rewards import accuracy_reward
 
-dataset = load_dataset("trl-lib/tldr", split="train")
-
-# Dummy reward function: count the number of unique characters in the completions
-def reward_num_unique_chars(completions, **kwargs):
-    return [len(set(c)) for c in completions]
+dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
 
 trainer = GRPOTrainer(
     model="Qwen/Qwen2-0.5B-Instruct",
-    reward_funcs=reward_num_unique_chars,
+    reward_funcs=accuracy_reward,
     train_dataset=dataset,
 )
 trainer.train()
 
@@ -1,5 +1,5 @@
-FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
+FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 RUN pip install --upgrade pip uv
 RUN uv pip install --system --no-cache "git+https://github.com/huggingface/trl.git#egg=trl[liger,peft,vlm]"
-RUN uv pip install --system hf_transfer kernels liger_kernel peft trackio
+RUN uv pip install --system kernels liger_kernel peft trackio
@@ -1,3 +1,4 @@
-FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
+FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 RUN pip install --upgrade pip uv
-RUN uv pip install --system trl[liger,peft,vlm] hf_transfer kernels trackio
+RUN uv pip install --system trl[liger,peft,vlm] kernels trackio
@@ -56,24 +56,12 @@
   title: Examples
 - sections:
   - sections: # Sorted alphabetically
-    - local: cpo_trainer
-      title: CPO
     - local: dpo_trainer
       title: DPO
-    - local: gkd_trainer
-      title: GKD
     - local: grpo_trainer
       title: GRPO
     - local: kto_trainer
       title: KTO
-    - local: nash_md_trainer
-      title: Nash-MD
-    - local: orpo_trainer
-      title: ORPO
-    - local: ppo_trainer
-      title: PPO
-    - local: prm_trainer
-      title: PRM
     - local: reward_trainer
       title: Reward
     - local: rloo_trainer
@@ -103,10 +91,12 @@
     title: BEMA for Reference Model
   - local: bco_trainer
     title: BCO
-  - local: online_dpo_trainer
-    title: Online DPO
+  - local: cpo_trainer
+    title: CPO
   - local: gfpo
     title: GFPO
+  - local: gkd_trainer
+    title: GKD
   - local: gold_trainer
     title: GOLD
   - local: grpo_with_replay_buffer
@@ -115,8 +105,20 @@
     title: GSPO-token
   - local: judges
     title: Judges
+  - local: minillm
+    title: MiniLLM
+  - local: nash_md_trainer
+    title: Nash-MD
+  - local: online_dpo_trainer
+    title: Online DPO
+  - local: orpo_trainer
+    title: ORPO
   - local: papo_trainer
     title: PAPO
+  - local: ppo_trainer
+    title: PPO
+  - local: prm_trainer
+    title: PRM
   - local: xpo_trainer
     title: XPO
   - local: openenv