
Commit 2924f53

Final changes
0 parents, commit 2924f53

30 files changed: +2903, -0 lines changed

.DS_Store

6 KB
Binary file not shown.

.github/workflows/ci.yml

Whitespace-only changes.

.gitignore

Whitespace-only changes.

README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Fused LayerNorm CUDA

High-performance CUDA implementation of LayerNorm for PyTorch.
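For orientation, a minimal usage sketch of the operator. This is an assumption based on how `FusedLayerNorm` is constructed in `benchmarks/benchmark_layernorm.py` below, not text from the committed README:

import torch
from fused_layernorm import FusedLayerNorm

# Sketch: assumes FusedLayerNorm mirrors nn.LayerNorm's interface,
# i.e. FusedLayerNorm(hidden_size, dtype=...) with learnable weight and bias,
# as used by the benchmark script in this commit.
layer = FusedLayerNorm(768, dtype=torch.float16).to('cuda')
x = torch.randn(32 * 128, 768, device='cuda', dtype=torch.float16)
y = layer(x)  # normalizes over the last (hidden) dimension, like nn.LayerNorm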

benchmarks/.DS_Store

6 KB
Binary file not shown.

benchmarks/benchmark_layernorm.py

Lines changed: 386 additions & 0 deletions
@@ -0,0 +1,386 @@
"""
Comprehensive benchmarking script for Fused LayerNorm CUDA operator.

This script benchmarks our implementation against PyTorch's native LayerNorm
across various model configurations (BERT, GPT-2, GPT-3) and batch sizes.
"""

import torch
import torch.nn as nn
import time
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple
from tqdm import tqdm
import argparse
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import our fused implementation
from fused_layernorm import FusedLayerNorm


class BenchmarkConfig:
    """Configuration for benchmark runs."""

    # Model configurations (hidden_size, model_name)
    MODEL_CONFIGS = [
        (768, "BERT-Base"),
        (1024, "BERT-Large"),
        (768, "GPT-2 Small"),
        (1024, "GPT-2 Medium"),
        (1280, "GPT-2 Large"),
        (1600, "GPT-2 XL"),
        (2048, "GPT-3 Small"),
        (2560, "GPT-3 Medium"),
        (4096, "GPT-3 Large"),
        (5120, "GPT-3 XL"),
        (8192, "GPT-3 XXL"),
        (12288, "GPT-3 175B"),
    ]

    # Batch sizes to test
    BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128]

    # Sequence lengths for transformer models
    SEQUENCE_LENGTHS = [128, 256, 512, 1024]

    # Number of warmup iterations
    WARMUP_ITERS = 50

    # Number of benchmark iterations
    BENCHMARK_ITERS = 200

    # Data types to test
    DTYPES = [torch.float32, torch.float16]


class LayerNormBenchmark:
    """Benchmark harness for LayerNorm implementations."""

    def __init__(self, device='cuda', verbose=True):
        self.device = device
        self.verbose = verbose
        self.results = []

        # Ensure CUDA is available
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available")

        # Get GPU info
        self.gpu_name = torch.cuda.get_device_name(0)
        self.gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB

        if self.verbose:
            print(f"GPU: {self.gpu_name}")
            print(f"Memory: {self.gpu_memory:.1f} GB")
            print(f"PyTorch: {torch.__version__}")
            print(f"CUDA: {torch.version.cuda}")
            print("-" * 60)

    def benchmark_forward(self, layer: nn.Module, input_tensor: torch.Tensor,
                          num_iters: int) -> float:
        """Benchmark forward pass."""
        # Warmup
        for _ in range(BenchmarkConfig.WARMUP_ITERS):
            _ = layer(input_tensor)

        torch.cuda.synchronize()

        # Benchmark
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        for _ in range(num_iters):
            _ = layer(input_tensor)
        end_event.record()

        torch.cuda.synchronize()
        elapsed_time = start_event.elapsed_time(end_event) / num_iters

        return elapsed_time

    def benchmark_backward(self, layer: nn.Module, input_tensor: torch.Tensor,
                           num_iters: int) -> float:
        """Benchmark backward pass."""
        input_tensor.requires_grad = True

        # Warmup
        for _ in range(BenchmarkConfig.WARMUP_ITERS):
            output = layer(input_tensor)
            loss = output.sum()
            loss.backward()

        torch.cuda.synchronize()

        # Benchmark
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        for _ in range(num_iters):
            output = layer(input_tensor)
            loss = output.sum()
            loss.backward()
        end_event.record()

        torch.cuda.synchronize()
        elapsed_time = start_event.elapsed_time(end_event) / num_iters

        return elapsed_time

    def measure_memory(self, layer: nn.Module, input_tensor: torch.Tensor) -> Dict[str, float]:
        """Measure memory usage."""
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        # Measure forward pass memory
        start_memory = torch.cuda.memory_allocated()
        output = layer(input_tensor)
        torch.cuda.synchronize()
        forward_memory = torch.cuda.memory_allocated() - start_memory

        # Measure backward pass memory
        if input_tensor.requires_grad:
            loss = output.sum()
            loss.backward()
            torch.cuda.synchronize()
            total_memory = torch.cuda.memory_allocated() - start_memory
            backward_memory = total_memory - forward_memory
        else:
            backward_memory = 0
            total_memory = forward_memory

        peak_memory = torch.cuda.max_memory_allocated() - start_memory

        return {
            'forward_memory_mb': forward_memory / 1e6,
            'backward_memory_mb': backward_memory / 1e6,
            'total_memory_mb': total_memory / 1e6,
            'peak_memory_mb': peak_memory / 1e6
        }

    def run_single_benchmark(self, batch_size: int, seq_len: int, hidden_size: int,
                             dtype: torch.dtype, model_name: str) -> Dict:
        """Run a single benchmark configuration."""
        # Create input tensor
        shape = (batch_size * seq_len, hidden_size)
        input_tensor = torch.randn(shape, device=self.device, dtype=dtype)

        # Create layers
        pytorch_layer = nn.LayerNorm(hidden_size, dtype=dtype).to(self.device)
        fused_layer = FusedLayerNorm(hidden_size, dtype=dtype).to(self.device)

        # Copy weights to ensure fair comparison
        with torch.no_grad():
            fused_layer.weight.data.copy_(pytorch_layer.weight.data)
            fused_layer.bias.data.copy_(pytorch_layer.bias.data)

        # Benchmark forward pass
        pytorch_forward_time = self.benchmark_forward(
            pytorch_layer, input_tensor, BenchmarkConfig.BENCHMARK_ITERS
        )
        fused_forward_time = self.benchmark_forward(
            fused_layer, input_tensor, BenchmarkConfig.BENCHMARK_ITERS
        )

        # Benchmark backward pass
        pytorch_backward_time = self.benchmark_backward(
            pytorch_layer, input_tensor.clone(), BenchmarkConfig.BENCHMARK_ITERS
        )
        fused_backward_time = self.benchmark_backward(
            fused_layer, input_tensor.clone(), BenchmarkConfig.BENCHMARK_ITERS
        )

        # Measure memory
        pytorch_memory = self.measure_memory(pytorch_layer, input_tensor.clone())
        fused_memory = self.measure_memory(fused_layer, input_tensor.clone())

        # Calculate speedups
        forward_speedup = pytorch_forward_time / fused_forward_time
        backward_speedup = pytorch_backward_time / fused_backward_time
        total_speedup = (pytorch_forward_time + pytorch_backward_time) / \
                        (fused_forward_time + fused_backward_time)

        # Calculate memory reduction
        memory_reduction = 1 - (fused_memory['peak_memory_mb'] / pytorch_memory['peak_memory_mb'])

        # Verify correctness
        with torch.no_grad():
            pytorch_output = pytorch_layer(input_tensor)
            fused_output = fused_layer(input_tensor)
            max_diff = torch.max(torch.abs(pytorch_output - fused_output)).item()

        result = {
            'model_name': model_name,
            'batch_size': batch_size,
            'seq_len': seq_len,
            'hidden_size': hidden_size,
            'total_seq_len': batch_size * seq_len,
            'dtype': str(dtype).split('.')[-1],
            'pytorch_forward_ms': pytorch_forward_time,
            'fused_forward_ms': fused_forward_time,
            'forward_speedup': forward_speedup,
            'pytorch_backward_ms': pytorch_backward_time,
            'fused_backward_ms': fused_backward_time,
            'backward_speedup': backward_speedup,
            'total_speedup': total_speedup,
            'pytorch_memory_mb': pytorch_memory['peak_memory_mb'],
            'fused_memory_mb': fused_memory['peak_memory_mb'],
            'memory_reduction': memory_reduction,
            'max_diff': max_diff,
            'timestamp': datetime.now().isoformat()
        }

        return result

    def run_benchmarks(self, output_dir: str = 'benchmarks/results'):
        """Run comprehensive benchmarks."""
        os.makedirs(output_dir, exist_ok=True)

        # Progress tracking
        total_configs = (len(BenchmarkConfig.MODEL_CONFIGS) *
                         len(BenchmarkConfig.BATCH_SIZES) *
                         len(BenchmarkConfig.SEQUENCE_LENGTHS) *
                         len(BenchmarkConfig.DTYPES))

        pbar = tqdm(total=total_configs, desc="Running benchmarks")

        for hidden_size, model_name in BenchmarkConfig.MODEL_CONFIGS:
            for batch_size in BenchmarkConfig.BATCH_SIZES:
                for seq_len in BenchmarkConfig.SEQUENCE_LENGTHS:
                    for dtype in BenchmarkConfig.DTYPES:
                        # Skip configurations that would exceed memory
                        total_elements = batch_size * seq_len * hidden_size
                        bytes_needed = total_elements * (2 if dtype == torch.float16 else 4)
                        if bytes_needed > 8e9:  # Skip if > 8GB
                            pbar.update(1)
                            continue

                        try:
                            result = self.run_single_benchmark(
                                batch_size, seq_len, hidden_size, dtype, model_name
                            )
                            self.results.append(result)

                            # Print summary for significant results
                            if self.verbose and result['total_speedup'] > 1.3:
                                print(f"\n{model_name} (BS={batch_size}, Seq={seq_len}, {result['dtype']}): "
                                      f"{result['total_speedup']:.2f}x speedup, "
                                      f"{result['memory_reduction']*100:.1f}% memory reduction")

                        except RuntimeError as e:
                            if "out of memory" in str(e):
                                if self.verbose:
                                    print(f"\nSkipping {model_name} BS={batch_size} Seq={seq_len} - OOM")
                            else:
                                raise e

                        pbar.update(1)
                        torch.cuda.empty_cache()

        pbar.close()

        # Save results
        df = pd.DataFrame(self.results)

        # Save as CSV
        csv_path = os.path.join(output_dir, 'benchmark_results.csv')
        df.to_csv(csv_path, index=False)

        # Save as JSON
        json_path = os.path.join(output_dir, 'benchmark_results.json')
        with open(json_path, 'w') as f:
            json.dump(self.results, f, indent=2)

        # Save summary statistics
        summary = self.generate_summary(df)
        summary_path = os.path.join(output_dir, 'benchmark_summary.json')
        with open(summary_path, 'w') as f:
            json.dump(summary, f, indent=2)

        print(f"\nResults saved to {output_dir}")

        return df

    def generate_summary(self, df: pd.DataFrame) -> Dict:
        """Generate summary statistics from benchmark results."""
        summary = {
            'gpu': self.gpu_name,
            'gpu_memory_gb': self.gpu_memory,
            'pytorch_version': torch.__version__,
            'cuda_version': torch.version.cuda,
            'num_benchmarks': len(df),
            'timestamp': datetime.now().isoformat(),
            'overall_metrics': {
                'mean_forward_speedup': df['forward_speedup'].mean(),
                'mean_backward_speedup': df['backward_speedup'].mean(),
                'mean_total_speedup': df['total_speedup'].mean(),
                'mean_memory_reduction': df['memory_reduction'].mean(),
                'max_speedup': df['total_speedup'].max(),
                'min_speedup': df['total_speedup'].min(),
            },
            'by_model': {},
            'by_dtype': {}
        }

        # Summary by model
        for model in df['model_name'].unique():
            model_df = df[df['model_name'] == model]
            summary['by_model'][model] = {
                'mean_speedup': model_df['total_speedup'].mean(),
                'mean_memory_reduction': model_df['memory_reduction'].mean(),
                'num_configs': len(model_df)
            }

        # Summary by dtype
        for dtype in df['dtype'].unique():
            dtype_df = df[df['dtype'] == dtype]
            summary['by_dtype'][dtype] = {
                'mean_speedup': dtype_df['total_speedup'].mean(),
                'mean_memory_reduction': dtype_df['memory_reduction'].mean(),
                'num_configs': len(dtype_df)
            }

        return summary


def main():
    parser = argparse.ArgumentParser(description='Benchmark Fused LayerNorm')
    parser.add_argument('--output-dir', type=str, default='benchmarks/results',
                        help='Output directory for results')
    parser.add_argument('--verbose', action='store_true',
                        help='Print verbose output')
    parser.add_argument('--quick', action='store_true',
                        help='Run quick benchmark with fewer configurations')

    args = parser.parse_args()

    # Adjust configurations for quick mode
    if args.quick:
        BenchmarkConfig.MODEL_CONFIGS = BenchmarkConfig.MODEL_CONFIGS[:4]
        BenchmarkConfig.BATCH_SIZES = [8, 32, 64]
        BenchmarkConfig.SEQUENCE_LENGTHS = [256, 512]
        BenchmarkConfig.BENCHMARK_ITERS = 50

    # Run benchmarks
    benchmark = LayerNormBenchmark(verbose=args.verbose)
    results_df = benchmark.run_benchmarks(args.output_dir)

    # Print summary
    print("\n" + "="*60)
    print("BENCHMARK SUMMARY")
    print("="*60)
    print(f"Average speedup: {results_df['total_speedup'].mean():.2f}x")
    print(f"Average memory reduction: {results_df['memory_reduction'].mean()*100:.1f}%")
    print(f"Best speedup: {results_df['total_speedup'].max():.2f}x")
    print(f"Worst speedup: {results_df['total_speedup'].min():.2f}x")
    print("="*60)


if __name__ == '__main__':
    main()
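The script's command-line flags are --output-dir, --verbose, and --quick (quick mode keeps the first four model configs and uses fewer batch sizes, sequence lengths, and iterations). The harness can also be driven directly from Python; a sketch, assuming benchmark_layernorm.py is on the import path:

import benchmark_layernorm as bl

# Constructing the harness checks CUDA availability and prints GPU info.
bench = bl.LayerNormBenchmark(verbose=True)

# Runs the sweep, writes benchmark_results.csv/.json and benchmark_summary.json
# under the output directory, and returns the results as a pandas DataFrame.
df = bench.run_benchmarks(output_dir='benchmarks/results')
print(df[['model_name', 'total_speedup', 'memory_reduction']].head())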
