1+ """
2+ Comprehensive benchmarking script for Fused LayerNorm CUDA operator.
3+
4+ This script benchmarks our implementation against PyTorch's native LayerNorm
5+ across various model configurations (BERT, GPT-2, GPT-3) and batch sizes.
6+ """

import torch
import torch.nn as nn
import time
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
from tqdm import tqdm
import argparse
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import our fused implementation
from fused_layernorm import FusedLayerNorm


class BenchmarkConfig:
    """Configuration for benchmark runs."""

    # Model configurations (hidden_size, model_name)
    MODEL_CONFIGS = [
        (768, "BERT-Base"),
        (1024, "BERT-Large"),
        (768, "GPT-2 Small"),
        (1024, "GPT-2 Medium"),
        (1280, "GPT-2 Large"),
        (1600, "GPT-2 XL"),
        (2048, "GPT-3 Small"),
        (2560, "GPT-3 Medium"),
        (4096, "GPT-3 Large"),
        (5120, "GPT-3 XL"),
        (8192, "GPT-3 XXL"),
        (12288, "GPT-3 175B"),
    ]

    # Batch sizes to test
    BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128]

    # Sequence lengths for transformer models
    SEQUENCE_LENGTHS = [128, 256, 512, 1024]

    # Number of warmup iterations
    WARMUP_ITERS = 50

    # Number of benchmark iterations
    BENCHMARK_ITERS = 200

    # Data types to test
    DTYPES = [torch.float32, torch.float16]


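# With the defaults above, the sweep covers 12 * 8 * 4 * 2 = 768 configurations;
# run_benchmarks() additionally skips any configuration whose input tensor alone
# would need more than ~8 GB.

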
class LayerNormBenchmark:
    """Benchmark harness for LayerNorm implementations."""

    def __init__(self, device='cuda', verbose=True):
        self.device = device
        self.verbose = verbose
        self.results = []

        # Ensure CUDA is available
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available")

        # Get GPU info
        self.gpu_name = torch.cuda.get_device_name(0)
        self.gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB

        if self.verbose:
            print(f"GPU: {self.gpu_name}")
            print(f"Memory: {self.gpu_memory:.1f} GB")
            print(f"PyTorch: {torch.__version__}")
            print(f"CUDA: {torch.version.cuda}")
            print("-" * 60)

    def benchmark_forward(self, layer: nn.Module, input_tensor: torch.Tensor,
                          num_iters: int) -> float:
        """Benchmark the forward pass; returns the average time per iteration in ms."""
        # Warmup
        for _ in range(BenchmarkConfig.WARMUP_ITERS):
            _ = layer(input_tensor)

        torch.cuda.synchronize()

        # Benchmark with CUDA events (elapsed_time reports milliseconds)
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        for _ in range(num_iters):
            _ = layer(input_tensor)
        end_event.record()

        torch.cuda.synchronize()
        elapsed_time = start_event.elapsed_time(end_event) / num_iters

        return elapsed_time

    def benchmark_backward(self, layer: nn.Module, input_tensor: torch.Tensor,
                           num_iters: int) -> float:
        """Benchmark the backward pass.

        The timed loop includes the forward pass and loss reduction needed to
        build the autograd graph, so it measures forward + backward per iteration.
        """
        input_tensor.requires_grad = True

        # Warmup
        for _ in range(BenchmarkConfig.WARMUP_ITERS):
            output = layer(input_tensor)
            loss = output.sum()
            loss.backward()

        torch.cuda.synchronize()

        # Benchmark
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        for _ in range(num_iters):
            output = layer(input_tensor)
            loss = output.sum()
            loss.backward()
        end_event.record()

        torch.cuda.synchronize()
        elapsed_time = start_event.elapsed_time(end_event) / num_iters

        return elapsed_time

    def measure_memory(self, layer: nn.Module, input_tensor: torch.Tensor) -> Dict[str, float]:
        """Measure memory usage.

        Backward-pass memory is only measured when the input tensor requires grad;
        otherwise only the forward allocation is reported.
        """
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        # Measure forward pass memory
        start_memory = torch.cuda.memory_allocated()
        output = layer(input_tensor)
        torch.cuda.synchronize()
        forward_memory = torch.cuda.memory_allocated() - start_memory

        # Measure backward pass memory
        if input_tensor.requires_grad:
            loss = output.sum()
            loss.backward()
            torch.cuda.synchronize()
            total_memory = torch.cuda.memory_allocated() - start_memory
            backward_memory = total_memory - forward_memory
        else:
            backward_memory = 0
            total_memory = forward_memory

        peak_memory = torch.cuda.max_memory_allocated() - start_memory

        return {
            'forward_memory_mb': forward_memory / 1e6,
            'backward_memory_mb': backward_memory / 1e6,
            'total_memory_mb': total_memory / 1e6,
            'peak_memory_mb': peak_memory / 1e6
        }

    def run_single_benchmark(self, batch_size: int, seq_len: int, hidden_size: int,
                             dtype: torch.dtype, model_name: str) -> Dict:
        """Run a single benchmark configuration."""
        # Create input tensor
        shape = (batch_size * seq_len, hidden_size)
        input_tensor = torch.randn(shape, device=self.device, dtype=dtype)

        # Create layers
        pytorch_layer = nn.LayerNorm(hidden_size, dtype=dtype).to(self.device)
        fused_layer = FusedLayerNorm(hidden_size, dtype=dtype).to(self.device)

        # Copy weights to ensure fair comparison
        with torch.no_grad():
            fused_layer.weight.data.copy_(pytorch_layer.weight.data)
            fused_layer.bias.data.copy_(pytorch_layer.bias.data)

        # Benchmark forward pass
        pytorch_forward_time = self.benchmark_forward(
            pytorch_layer, input_tensor, BenchmarkConfig.BENCHMARK_ITERS
        )
        fused_forward_time = self.benchmark_forward(
            fused_layer, input_tensor, BenchmarkConfig.BENCHMARK_ITERS
        )

        # Benchmark backward pass
        pytorch_backward_time = self.benchmark_backward(
            pytorch_layer, input_tensor.clone(), BenchmarkConfig.BENCHMARK_ITERS
        )
        fused_backward_time = self.benchmark_backward(
            fused_layer, input_tensor.clone(), BenchmarkConfig.BENCHMARK_ITERS
        )

        # Measure memory
        pytorch_memory = self.measure_memory(pytorch_layer, input_tensor.clone())
        fused_memory = self.measure_memory(fused_layer, input_tensor.clone())

        # Calculate speedups
        forward_speedup = pytorch_forward_time / fused_forward_time
        backward_speedup = pytorch_backward_time / fused_backward_time
        total_speedup = (pytorch_forward_time + pytorch_backward_time) / \
                        (fused_forward_time + fused_backward_time)

        # Calculate memory reduction
        memory_reduction = 1 - (fused_memory['peak_memory_mb'] / pytorch_memory['peak_memory_mb'])

        # Verify correctness
        with torch.no_grad():
            pytorch_output = pytorch_layer(input_tensor)
            fused_output = fused_layer(input_tensor)
            max_diff = torch.max(torch.abs(pytorch_output - fused_output)).item()

        result = {
            'model_name': model_name,
            'batch_size': batch_size,
            'seq_len': seq_len,
            'hidden_size': hidden_size,
            'total_seq_len': batch_size * seq_len,
            'dtype': str(dtype).split('.')[-1],
            'pytorch_forward_ms': pytorch_forward_time,
            'fused_forward_ms': fused_forward_time,
            'forward_speedup': forward_speedup,
            'pytorch_backward_ms': pytorch_backward_time,
            'fused_backward_ms': fused_backward_time,
            'backward_speedup': backward_speedup,
            'total_speedup': total_speedup,
            'pytorch_memory_mb': pytorch_memory['peak_memory_mb'],
            'fused_memory_mb': fused_memory['peak_memory_mb'],
            'memory_reduction': memory_reduction,
            'max_diff': max_diff,
            'timestamp': datetime.now().isoformat()
        }

        return result

    def run_benchmarks(self, output_dir: str = 'benchmarks/results'):
        """Run comprehensive benchmarks."""
        os.makedirs(output_dir, exist_ok=True)

        # Progress tracking
        total_configs = (len(BenchmarkConfig.MODEL_CONFIGS) *
                         len(BenchmarkConfig.BATCH_SIZES) *
                         len(BenchmarkConfig.SEQUENCE_LENGTHS) *
                         len(BenchmarkConfig.DTYPES))

        pbar = tqdm(total=total_configs, desc="Running benchmarks")

        for hidden_size, model_name in BenchmarkConfig.MODEL_CONFIGS:
            for batch_size in BenchmarkConfig.BATCH_SIZES:
                for seq_len in BenchmarkConfig.SEQUENCE_LENGTHS:
                    for dtype in BenchmarkConfig.DTYPES:
                        # Skip configurations that would exceed memory
                        total_elements = batch_size * seq_len * hidden_size
                        bytes_needed = total_elements * (2 if dtype == torch.float16 else 4)
                        if bytes_needed > 8e9:  # Skip if > 8 GB
                            pbar.update(1)
                            continue

                        try:
                            result = self.run_single_benchmark(
                                batch_size, seq_len, hidden_size, dtype, model_name
                            )
                            self.results.append(result)

                            # Print summary for significant results
                            if self.verbose and result['total_speedup'] > 1.3:
                                print(f"\n{model_name} (BS={batch_size}, Seq={seq_len}, {result['dtype']}): "
                                      f"{result['total_speedup']:.2f}x speedup, "
                                      f"{result['memory_reduction'] * 100:.1f}% memory reduction")

                        except RuntimeError as e:
                            if "out of memory" in str(e):
                                if self.verbose:
                                    print(f"\nSkipping {model_name} BS={batch_size} Seq={seq_len} - OOM")
                            else:
                                raise e

                        pbar.update(1)
                        torch.cuda.empty_cache()

        pbar.close()

        # Save results
        df = pd.DataFrame(self.results)

        # Save as CSV
        csv_path = os.path.join(output_dir, 'benchmark_results.csv')
        df.to_csv(csv_path, index=False)

        # Save as JSON
        json_path = os.path.join(output_dir, 'benchmark_results.json')
        with open(json_path, 'w') as f:
            json.dump(self.results, f, indent=2)

        # Save summary statistics
        summary = self.generate_summary(df)
        summary_path = os.path.join(output_dir, 'benchmark_summary.json')
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)

        print(f"\nResults saved to {output_dir}")

        return df

    def generate_summary(self, df: pd.DataFrame) -> Dict:
        """Generate summary statistics from benchmark results."""
        summary = {
            'gpu': self.gpu_name,
            'gpu_memory_gb': self.gpu_memory,
            'pytorch_version': torch.__version__,
            'cuda_version': torch.version.cuda,
            'num_benchmarks': len(df),
            'timestamp': datetime.now().isoformat(),
            'overall_metrics': {
                'mean_forward_speedup': df['forward_speedup'].mean(),
                'mean_backward_speedup': df['backward_speedup'].mean(),
                'mean_total_speedup': df['total_speedup'].mean(),
                'mean_memory_reduction': df['memory_reduction'].mean(),
                'max_speedup': df['total_speedup'].max(),
                'min_speedup': df['total_speedup'].min(),
            },
            'by_model': {},
            'by_dtype': {}
        }

        # Summary by model
        for model in df['model_name'].unique():
            model_df = df[df['model_name'] == model]
            summary['by_model'][model] = {
                'mean_speedup': model_df['total_speedup'].mean(),
                'mean_memory_reduction': model_df['memory_reduction'].mean(),
                'num_configs': len(model_df)
            }

        # Summary by dtype
        for dtype in df['dtype'].unique():
            dtype_df = df[df['dtype'] == dtype]
            summary['by_dtype'][dtype] = {
                'mean_speedup': dtype_df['total_speedup'].mean(),
                'mean_memory_reduction': dtype_df['memory_reduction'].mean(),
                'num_configs': len(dtype_df)
            }

        return summary

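
# Optional plotting helper (a minimal sketch, not wired into main()): it uses the
# matplotlib/seaborn imports above and the column names produced by
# run_single_benchmark; the figure styling and output file name are assumptions.
def plot_speedups(df: pd.DataFrame, output_dir: str = 'benchmarks/results') -> None:
    """Plot total speedup versus hidden size, grouped by dtype."""
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=df, x='hidden_size', y='total_speedup', hue='dtype',
                 marker='o', ax=ax)
    ax.axhline(1.0, color='gray', linestyle='--', linewidth=1)  # parity with nn.LayerNorm
    ax.set_xlabel('Hidden size')
    ax.set_ylabel('Total speedup (x)')
    ax.set_title('Fused LayerNorm speedup vs. hidden size')
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'speedup_vs_hidden_size.png'), dpi=150)
    plt.close(fig)
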
def main():
    parser = argparse.ArgumentParser(description='Benchmark Fused LayerNorm')
    parser.add_argument('--output-dir', type=str, default='benchmarks/results',
                        help='Output directory for results')
    parser.add_argument('--verbose', action='store_true',
                        help='Print verbose output')
    parser.add_argument('--quick', action='store_true',
                        help='Run quick benchmark with fewer configurations')

    args = parser.parse_args()

    # Adjust configurations for quick mode
    if args.quick:
        BenchmarkConfig.MODEL_CONFIGS = BenchmarkConfig.MODEL_CONFIGS[:4]
        BenchmarkConfig.BATCH_SIZES = [8, 32, 64]
        BenchmarkConfig.SEQUENCE_LENGTHS = [256, 512]
        BenchmarkConfig.BENCHMARK_ITERS = 50

    # Run benchmarks
    benchmark = LayerNormBenchmark(verbose=args.verbose)
    results_df = benchmark.run_benchmarks(args.output_dir)

    # Print summary
    print("\n" + "=" * 60)
    print("BENCHMARK SUMMARY")
    print("=" * 60)
    print(f"Average speedup: {results_df['total_speedup'].mean():.2f}x")
    print(f"Average memory reduction: {results_df['memory_reduction'].mean() * 100:.1f}%")
    print(f"Best speedup: {results_df['total_speedup'].max():.2f}x")
    print(f"Worst speedup: {results_df['total_speedup'].min():.2f}x")
    print("=" * 60)


if __name__ == '__main__':
    main()