Harness the full power of modern PyTorch with CUDA 12.8 acceleration.
PyTorch CUDA Template provides everything you need to jumpstart your GPU-accelerated machine learning projects. Built with modern Python packaging standards and optimized for PyTorch 2.7+ with CUDA 12.8 support, this template eliminates setup friction so you can focus on building amazing models.
- Cutting-Edge PyTorch - Latest PyTorch 2.7+ with optimized CUDA 12.8 support
- GPU-Ready Architecture - Pre-configured CUDA acceleration with intelligent CPU fallback
- Modern Development Stack - Integrated linting, formatting, testing, and type checking
- ML Ops Ready - MLflow experiment tracking and Polars for high-performance data processing
- Lightning-Fast Setup - Powered by `uv` for blazing-fast dependency resolution
- Production-Ready Structure - Follows modern Python packaging best practices
- Python ≥ 3.11
- CUDA 12.8 (for GPU acceleration)
- GPU: a CUDA-compatible NVIDIA GPU (optional; gracefully falls back to CPU)
- uv package manager (recommended for the fastest installs)
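Before installing, a quick sanity check can confirm these prerequisites. A minimal sketch (the `nvidia-smi` and `uv` lookups only verify the tools are on your PATH):

```python
import shutil
import sys

# Python >= 3.11 is required by this template
assert sys.version_info >= (3, 11), f"Python 3.11+ required, found {sys.version.split()[0]}"

# An NVIDIA driver is optional; the template falls back to CPU without one
print("NVIDIA driver (nvidia-smi) found:", shutil.which("nvidia-smi") is not None)

# uv is the recommended package manager
print("uv found:", shutil.which("uv") is not None)
```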
```bash
# Clone the template
git clone https://github.com/bjoernbethge/torch-cuda.git
cd torch-cuda

# Install everything with uv (recommended)
uv sync
```

Choose exactly what you need:
```bash
# Basic PyTorch setup
uv sync

# Development environment (testing, linting, formatting)
uv sync --extra dev

# ML Ops toolkit (MLflow, Polars, Plotly, profiling tools)
uv sync --extra extras

# Everything included (the full experience)
uv sync --extra all

# Add new packages
uv add torchvision
```

Verify that PyTorch sees your GPU:

```python
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    print(f"Current GPU: {torch.cuda.get_device_name()}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
```

Build a model and move it to the best available device:

```python
import torch
import torch.nn as nn
# Automatically detect the best device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Build a neural network
class SimpleNet(nn.Module):
    def __init__(self, input_size=784, hidden_size=256, num_classes=10):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, x):
        return self.network(x)

# Instantiate and move to GPU
model = SimpleNet().to(device)

# Model info
total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params:,}")

# Test forward pass
sample_input = torch.randn(32, 784).to(device)
output = model(sample_input)
print(f"Input shape: {sample_input.shape}")
print(f"Output shape: {output.shape}")
```

Wire up a full training loop with MLflow experiment tracking:

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import mlflow
import mlflow.pytorch
# Initialize the MLflow experiment
mlflow.set_experiment("pytorch-cuda-training")
mlflow.start_run()

# Set up the training environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleNet().to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

# Log hyperparameters
mlflow.log_params({
    "learning_rate": 0.001,
    "batch_size": 32,
    "epochs": 10,
    "device": str(device),
    "model_params": sum(p.numel() for p in model.parameters()),
})

# Create a sample dataset
X = torch.randn(1000, 784)
y = torch.randint(0, 10, (1000,))
dataset = TensorDataset(X, y)
dataloader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,    # Parallel data loading
    pin_memory=True,  # Faster host-to-GPU transfer
)

# Training loop with MLflow logging
model.train()
for epoch in range(10):
    epoch_loss = 0
    correct_predictions = 0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/10")
    for batch_x, batch_y in pbar:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        pred = outputs.argmax(dim=1)
        correct_predictions += (pred == batch_y).sum().item()
        pbar.set_postfix({"Loss": f"{loss.item():.4f}"})

    # Log metrics to MLflow
    avg_loss = epoch_loss / len(dataloader)
    accuracy = correct_predictions / len(dataset)
    mlflow.log_metrics({
        "loss": avg_loss,
        "accuracy": accuracy,
        "epoch": epoch + 1,
    })
    print(f"Epoch {epoch + 1}: Loss = {avg_loss:.4f}, Accuracy = {accuracy:.3f}")

# Save the model
mlflow.pytorch.log_model(model, "model")
mlflow.end_run()
```
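After the run completes, the logged model can be loaded back from the tracking store for inference. A minimal sketch (the run ID is a placeholder you'd copy from the MLflow UI):

```python
import mlflow.pytorch

# Load the model logged above; substitute the real run ID from the MLflow UI
loaded_model = mlflow.pytorch.load_model("runs:/<run_id>/model")
loaded_model.eval()
```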
Polars handles the data preparation side; here it feeds a PyTorch `Dataset` directly:

```python
import polars as pl
import torch
from torch.utils.data import Dataset, DataLoader
# Create and process data with Polars (much faster than pandas)
def create_sample_dataset():
    """Create a sample dataset using Polars for high-performance processing."""
    # Generate sample data
    df = pl.DataFrame({
        "feature_1": [i * 0.1 for i in range(10000)],
        "feature_2": [i * 0.2 + 1 for i in range(10000)],
        "feature_3": [i * 0.05 - 0.5 for i in range(10000)],
        "target": [i % 3 for i in range(10000)],
    })

    # High-performance data transformations
    processed_df = (
        df
        .with_columns([
            # Feature engineering
            (pl.col("feature_1") * pl.col("feature_2")).alias("interaction_1"),
            pl.col("feature_3").pow(2).alias("feature_3_squared"),
            # Normalization
            ((pl.col("feature_1") - pl.col("feature_1").mean()) / pl.col("feature_1").std()).alias("feature_1_norm"),
            ((pl.col("feature_2") - pl.col("feature_2").mean()) / pl.col("feature_2").std()).alias("feature_2_norm"),
        ])
        .filter(pl.col("feature_1") > 0.5)  # Fast filtering
    )
    print(f"Processed {len(processed_df)} samples")
    return processed_df

# Custom Dataset class for Polars integration
class PolarsDataset(Dataset):
    def __init__(self, df: pl.DataFrame, feature_cols: list, target_col: str):
        self.features = torch.tensor(df.select(feature_cols).to_numpy(), dtype=torch.float32)
        self.targets = torch.tensor(df.select(target_col).to_numpy().flatten(), dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

# Use the high-performance dataset
df = create_sample_dataset()
feature_cols = ["feature_1_norm", "feature_2_norm", "feature_3_squared", "interaction_1"]
dataset = PolarsDataset(df, feature_cols, "target")
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, pin_memory=True)
print(f"Created dataset with {len(dataset)} samples and {len(feature_cols)} features")
```
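For larger datasets, Polars' lazy API is worth knowing: transformations are collected into a query plan and optimized before any data is materialized. A minimal sketch reusing the `df` from above (illustrative only; the derived column name is arbitrary):

```python
# Lazy evaluation: nothing runs until .collect()
lazy_result = (
    df.lazy()
    .filter(pl.col("target") == 1)
    .with_columns((pl.col("interaction_1") * 2).alias("interaction_doubled"))
    .collect()
)
print(f"Lazy pipeline produced {len(lazy_result)} rows")
```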
Plotly provides interactive visualizations of the training run:

```python
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def visualize_training_metrics(losses, accuracies, gpu_utilization=None):
    """Create interactive training visualizations."""
    # Create subplots
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=("Training Loss", "Accuracy", "GPU Utilization", "Learning Curve"),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]],
    )
    epochs = list(range(1, len(losses) + 1))

    # Loss curve
    fig.add_trace(
        go.Scatter(x=epochs, y=losses, mode="lines+markers", name="Loss", line=dict(color="red")),
        row=1, col=1,
    )

    # Accuracy curve
    fig.add_trace(
        go.Scatter(x=epochs, y=accuracies, mode="lines+markers", name="Accuracy", line=dict(color="green")),
        row=1, col=2,
    )

    # GPU utilization (if provided)
    if gpu_utilization:
        fig.add_trace(
            go.Scatter(x=epochs, y=gpu_utilization, mode="lines+markers", name="GPU %", line=dict(color="blue")),
            row=2, col=1,
        )

    # Combined learning curve
    fig.add_trace(
        go.Scatter(x=epochs, y=losses, mode="lines", name="Loss (normalized)", line=dict(color="red", dash="dot")),
        row=2, col=2,
    )
    fig.add_trace(
        go.Scatter(x=epochs, y=accuracies, mode="lines", name="Accuracy", line=dict(color="green")),
        row=2, col=2,
    )

    # Update layout
    fig.update_layout(
        title="PyTorch CUDA Training Dashboard",
        showlegend=True,
        height=600,
    )
    return fig

# Example usage
sample_losses = [2.3, 1.8, 1.4, 1.1, 0.9, 0.7, 0.6, 0.5, 0.4, 0.35]
sample_accuracies = [0.1, 0.3, 0.5, 0.65, 0.75, 0.82, 0.87, 0.91, 0.94, 0.96]
sample_gpu_util = [85, 87, 90, 88, 92, 89, 91, 88, 90, 87]

fig = visualize_training_metrics(sample_losses, sample_accuracies, sample_gpu_util)
fig.show()  # Interactive visualization in the browser
```
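For sharing results outside a live session, the figure can also be saved as a self-contained HTML file (the filename is arbitrary):

```python
fig.write_html("training_dashboard.html")  # Standalone, interactive HTML report
```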
To find performance bottlenecks, profile training with PyTorch's built-in profiler and keep an eye on system resources:

```python
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import psutil
def profile_training_step(model, data_loader, device):
    """Profile training performance with detailed GPU metrics."""
    optimizer = torch.optim.Adam(model.parameters())  # Create the optimizer once, not per step

    # Start profiling
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    ) as prof:
        model.train()
        for i, (batch_x, batch_y) in enumerate(data_loader):
            if i >= 5:  # Profile the first 5 batches only
                break
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()

            with record_function("forward_pass"):
                outputs = model(batch_x)
                loss = torch.nn.functional.cross_entropy(outputs, batch_y)

            with record_function("backward_pass"):
                loss.backward()

            with record_function("optimizer_step"):
                optimizer.step()

    # Print profiling results
    print("GPU profiling results:")
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

    # Export for visualization
    prof.export_chrome_trace("trace.json")
    print("Trace exported to trace.json - open it in chrome://tracing")
def monitor_system_resources():
    """Monitor CPU, memory, and GPU usage."""
    # System resources
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()
    print(f"CPU usage: {cpu_percent}%")
    print(f"RAM usage: {memory.percent}% ({memory.used / 1e9:.1f} GB / {memory.total / 1e9:.1f} GB)")

    # GPU resources
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / 1e9
        gpu_cached = torch.cuda.memory_reserved() / 1e9
        gpu_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"GPU memory: {gpu_memory:.1f} GB allocated, {gpu_cached:.1f} GB cached, {gpu_total:.1f} GB total")
        print(f"GPU memory utilization: {(gpu_memory / gpu_total) * 100:.1f}%")

# Example usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleNet().to(device)

# Monitor during training
monitor_system_resources()
```

Install the development toolchain:

```bash
# Install all development tools
uv sync --extra dev

# Set up pre-commit hooks for code quality
pre-commit install

# Verify everything works
pytest --version && black --version && mypy --version
```

Format, lint, type-check, and test:

```bash
# Format your code
black src/ tests/
isort src/ tests/

# Lint and catch issues
ruff check src/ tests/

# Type checking for better code
mypy src/

# Run comprehensive tests
pytest

# Test coverage analysis
pytest --cov=src --cov-report=html
```

Monitor GPU memory during training, and fall back to gradient accumulation when a full batch doesn't fit:

```python
import torch

# Monitor GPU memory usage
def print_gpu_memory():
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        cached = torch.cuda.memory_reserved() / 1e9
        print(f"GPU memory - allocated: {allocated:.2f} GB, cached: {cached:.2f} GB")

# Memory cleanup strategies
def cleanup_gpu_memory():
    """Free cached GPU memory periodically."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

# Gradient accumulation for large effective batch sizes
accumulation_steps = 4
optimizer.zero_grad()
for i, (batch_x, batch_y) in enumerate(dataloader):
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)
    outputs = model(batch_x)
    loss = criterion(outputs, batch_y) / accumulation_steps  # Scale the loss so gradients average correctly
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```

Squeeze out additional throughput with DataLoader tuning, model compilation, and learning rate scheduling:

```python
import os

# DataLoader optimization
dataloader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    num_workers=min(8, os.cpu_count()),  # Reasonable worker count
    pin_memory=True,          # Faster host-to-GPU transfer
    persistent_workers=True,  # Keep workers alive between epochs
    prefetch_factor=2,        # Batches prefetched per worker
)

# Model compilation (PyTorch 2.0+)
model = torch.compile(
    model,
    mode="max-autotune",  # Maximum optimization
    dynamic=False,        # Static shapes allow better optimization
)

# Learning rate scheduling
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.01,
    steps_per_epoch=len(dataloader),
    epochs=num_epochs,
    pct_start=0.3,  # 30% warmup
    anneal_strategy="cos",
)
```
We welcome contributions from the community! Here's how to get involved:

- Fork the repository on GitHub
- Clone your fork: `git clone https://github.com/yourusername/torch-cuda.git`
- Install in development mode: `uv sync --extra dev`
- Create a feature branch: `git checkout -b feature/amazing-feature`
- Make your changes and add comprehensive tests
- Run the test suite: `pytest`
- Format your code: `black . && isort .`
- Commit your changes: `git commit -m 'Add amazing feature'`
- Push to your branch: `git push origin feature/amazing-feature`
- Submit a Pull Request
CUDA Out of Memory

```python
# Solutions:
# 1. Reduce the batch size
batch_size = 16  # Instead of 64

# 2. Use gradient accumulation
accumulation_steps = 4

# 3. Enable mixed precision (see the AMP sketch below)
from torch.amp import autocast

with autocast("cuda"):
    outputs = model(inputs)

# 4. Clear the cache periodically
torch.cuda.empty_cache()
```
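Mixed precision is most effective when paired with gradient scaling, which keeps small FP16 gradients from underflowing to zero. A minimal sketch of a complete AMP training step (reusing the model, optimizer, and criterion from earlier):

```python
import torch
from torch.amp import GradScaler, autocast

scaler = GradScaler("cuda")

for batch_x, batch_y in dataloader:
    batch_x, batch_y = batch_x.to(device), batch_y.to(device)
    optimizer.zero_grad()
    with autocast("cuda"):             # Forward pass runs in mixed precision
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
    scaler.scale(loss).backward()      # Scale the loss so FP16 gradients don't underflow
    scaler.step(optimizer)             # Unscales gradients; skips the step if inf/NaN appears
    scaler.update()                    # Adjusts the scale factor for the next iteration
```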
Slow Training Performance

```python
# Performance boosters:
# 1. Optimize the DataLoader
dataloader = DataLoader(
    dataset,
    num_workers=4,            # Parallel loading
    pin_memory=True,          # Faster host-to-GPU transfer
    persistent_workers=True,  # Keep workers alive between epochs
)

# 2. Enable optimizations
torch.backends.cudnn.benchmark = True
model = torch.compile(model)

# 3. Use appropriate batch sizes
# The sweet spot is usually 32-128, depending on model size
```
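When diagnosing slow training, remember that CUDA kernels launch asynchronously, so naive timing mostly measures launch overhead; synchronize before reading the clock. A minimal sketch (reusing `model` and `sample_input` from earlier):

```python
import time
import torch

torch.cuda.synchronize()               # Finish pending GPU work before starting the timer
start = time.perf_counter()
outputs = model(sample_input)          # Operation under test
torch.cuda.synchronize()               # Wait for the GPU to finish before stopping the timer
print(f"Forward pass: {time.perf_counter() - start:.4f}s")
```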
Installation Issues

```bash
# Refresh the installation
uv sync --extra all

# Clean the cache and reinstall
uv cache clean && uv sync

# Inspect the dependency tree
uv tree
```

- Issues: Check our GitHub Issues
- Documentation: PyTorch Official Docs
- Community: PyTorch Forums
- Contact: bjoern.bethge@gmail.com
This project is licensed under the MIT License - see the LICENSE file for details.
- PyTorch Team - For creating the most amazing deep learning framework
- NVIDIA - For the CUDA toolkit and the GPU computing revolution
- Astral Team - For the blazing-fast `uv` package manager
- Polars Team - For lightning-fast data processing
- Open Source Community - For continuous inspiration and collaboration