Source code for locator.gpu_optimizer

"""GPU optimization utilities for Locator.

This module provides utilities to maximize GPU efficiency and speed for
deep learning genomic predictions.
"""

import warnings
from typing import Optional, Tuple

import numpy as np
import tensorflow as tf


[docs] class GPUOptimizer: """Utilities for optimizing GPU performance in TensorFlow."""
[docs] @staticmethod def setup_mixed_precision(): """Enable mixed precision training for 2x speedup on modern GPUs. Returns ------- bool: True if mixed precision was enabled successfully """ try: # Check if GPU supports mixed precision (compute capability >= 7.0) gpus = tf.config.list_physical_devices("GPU") if not gpus: return False # Get compute capability gpu_details = tf.config.experimental.get_device_details(gpus[0]) compute_capability = gpu_details.get("compute_capability", (0, 0)) if compute_capability[0] >= 7: # Tensor Core support policy = tf.keras.mixed_precision.Policy("mixed_float16") tf.keras.mixed_precision.set_global_policy(policy) print( f"Mixed precision training enabled (compute capability {compute_capability})" ) return True else: print( f"GPU compute capability {compute_capability} doesn't support mixed precision efficiently" ) return False except Exception as e: warnings.warn(f"Failed to enable mixed precision: {e}") return False
[docs] @staticmethod def get_optimal_batch_size( # noqa: C901 model: tf.keras.Model, input_shape: Tuple[int, ...], target_memory_usage: float = 0.9, min_batch_size: int = 32, max_batch_size: int = 2048, dataset_size: Optional[int] = None, verbose: bool = True, ) -> int: """Dynamically determine optimal batch size for GPU memory. Args: model: Keras model to optimize for input_shape: Shape of single input sample (excluding batch dimension) target_memory_usage: Target GPU memory usage (0.0-1.0) min_batch_size: Minimum batch size to test max_batch_size: Maximum batch size to test dataset_size: Size of the dataset (if provided, limits max batch size) Returns ------- int: Optimal batch size for current GPU """ gpus = tf.config.list_physical_devices("GPU") if not gpus: return min_batch_size # Limit max batch size based on dataset size if dataset_size is not None: # Don't use batch size larger than 10% of dataset max_reasonable_batch = max(min_batch_size, dataset_size // 10) max_batch_size = min(max_batch_size, max_reasonable_batch) if max_batch_size < 2048 and verbose: print( f"Limiting max batch size to {max_batch_size} based on dataset size {dataset_size}" ) # Get available GPU memory # Note: The available_memory calculation is commented out but preserved # for future use. It would estimate GPU memory for batch size optimization. # After tf.config.set_visible_devices() or CUDA_VISIBLE_DEVICES is set, # the selected GPU is always accessible as 'GPU:0' from TensorFlow's perspective. # try: # gpu_memory = tf.config.experimental.get_memory_info("GPU:0") # available_memory = gpu_memory["current"] * target_memory_usage # except Exception: # # Fallback: use conservative estimate # # Most consumer GPUs have 8-24GB, datacenter GPUs 40-80GB # gpu_name = gpus[0].name.lower() # if "a100" in gpu_name or "a6000" in gpu_name: # available_memory = 40 * 1024 * 1024 * 1024 * target_memory_usage # 40GB # elif "v100" in gpu_name or "3090" in gpu_name or "4090" in gpu_name: # available_memory = 24 * 1024 * 1024 * 1024 * target_memory_usage # 24GB # else: # available_memory = 8 * 1024 * 1024 * 1024 * target_memory_usage # 8GB default # if verbose: # print(f"Using estimated GPU memory for {gpus[0].name}") # Binary search for optimal batch size left, right = min_batch_size, max_batch_size optimal_batch_size = min_batch_size while left <= right: test_batch_size = (left + right) // 2 try: # Create dummy data and test forward pass dummy_input = tf.random.normal((test_batch_size,) + input_shape) # Clear any previous allocations tf.keras.backend.clear_session() # Test forward and backward pass with tf.GradientTape() as tape: output = model(dummy_input, training=True) loss = tf.reduce_mean(output) # Test gradient computation _ = tape.gradient(loss, model.trainable_variables) # If successful, try larger batch optimal_batch_size = test_batch_size left = test_batch_size + 1 except tf.errors.ResourceExhaustedError: # If OOM, try smaller batch right = test_batch_size - 1 except Exception as e: # Other errors, try smaller batch warnings.warn(f"Error testing batch size {test_batch_size}: {e}") right = test_batch_size - 1 # Clear session after testing tf.keras.backend.clear_session() # Round to nearest power of 2 for efficiency optimal_batch_size = max(optimal_batch_size, 1) optimal_batch_size = 2 ** int(np.log2(optimal_batch_size)) # Final check against dataset size if dataset_size is not None and optimal_batch_size > dataset_size // 10: # For small datasets, use a more conservative batch size optimal_batch_size = min(optimal_batch_size, max(32, dataset_size // 16)) if verbose: print(f"Adjusted batch size for small dataset: {optimal_batch_size}") if verbose: print(f"Optimal batch size determined: {optimal_batch_size}") return optimal_batch_size
[docs] @staticmethod def optimize_gpu_memory(mode: str = "growth", memory_limit: Optional[int] = None): """Configure GPU memory allocation strategy. Args: mode: Memory allocation mode ('growth', 'preallocate', 'limit') memory_limit: Memory limit in MB (only used with mode='limit') """ gpus = tf.config.list_physical_devices("GPU") if not gpus: return for gpu in gpus: try: if mode == "growth": tf.config.experimental.set_memory_growth(gpu, True) elif mode == "preallocate": tf.config.experimental.set_memory_growth(gpu, False) elif mode == "limit" and memory_limit: tf.config.set_logical_device_configuration( gpu, [ tf.config.LogicalDeviceConfiguration( memory_limit=memory_limit ) ], ) except (RuntimeError, ValueError) as e: # Both can fire when GPU is already configured — e.g. a Ray # actor that installed a memory cap before Locator init runs. warnings.warn(f"GPU memory configuration failed: {e}")
[docs] @staticmethod def enable_xla_compilation(): """Enable XLA compilation for additional performance. Note: This is experimental and may not work with all operations. """ tf.config.optimizer.set_jit(True) print("XLA compilation enabled (experimental)")