Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| import plotly.graph_objects as go | |
| from sklearn.ensemble import RandomForestRegressor | |
class GEMMPredictor:
    """Predict GEMM runtime, power, energy and TFLOPS with a pre-trained model.

    The class wraps a serialized estimator (loaded with ``joblib``) and knows
    how to turn a small dictionary of user inputs into the single-row
    DataFrame handed to that estimator. It also carries the preprocessing
    helpers used on tabular benchmark data.
    """

    # Arithmetic intensity (FLOPs per byte) above which a problem is labelled
    # 'compute' bound rather than 'memory' bound.
    COMPUTE_BOUND_THRESHOLD = 59

    def __init__(self, model_path='model.joblib'):
        """Load the model stored at ``model_path`` and set up feature lists.

        NOTE(review): ``joblib.load`` unpickles arbitrary Python objects, so
        only model files from a trusted source should be loaded here.
        """
        self.stacked_model = joblib.load(model_path)
        self.initialize_features()

    def initialize_features(self):
        """Initialize the feature-name lists used by the model."""
        # Core matrix features
        self.core_features = [
            'm', 'n', 'k',
            'blocksize1', 'blocksize2', 'blocksize3',
        ]
        # Derived features
        self.derived_features = [
            'arithmetic_intensity',
            'bytes_accessed',
            'total_flops',
        ]
        # Categorical features
        self.categorical_features = ['Layout']
        # Target features; predict() maps model outputs to these in order.
        self.target_features = [
            'runtime',
            'power',
            'Energy',
            'TFlops',
        ]
        self.numerical_features = self.core_features + self.derived_features

    def calculate_gemm_characteristics(self, m, n, k, blocksize1, blocksize2, blocksize3):
        """Calculate GEMM-specific characteristics for one scalar problem size.

        Args:
            m, n, k: matrix dimensions.
            blocksize1, blocksize2, blocksize3: accepted for interface
                compatibility; they do not enter the calculation.

        Returns:
            dict with ``total_flops``, ``bytes_accessed``,
            ``arithmetic_intensity`` and ``bound_type`` ('compute'/'memory').
        """
        total_flops = 2 * m * n * k  # 2 operations per FMA
        bytes_accessed = (m * k + k * n + m * n) * 4  # Single precision
        # Guard the degenerate all-empty problem instead of dividing by zero.
        if bytes_accessed:
            arithmetic_intensity = total_flops / bytes_accessed
        else:
            arithmetic_intensity = 0.0
        if arithmetic_intensity > self.COMPUTE_BOUND_THRESHOLD:
            bound_type = 'compute'
        else:
            bound_type = 'memory'
        return {
            'total_flops': total_flops,
            'bytes_accessed': bytes_accessed,
            'arithmetic_intensity': arithmetic_intensity,
            'bound_type': bound_type,
        }

    def _add_gemm_characteristics(self, df):
        """Add the derived GEMM columns to ``df`` row-wise and return it.

        Column-wise counterpart of ``calculate_gemm_characteristics`` (which
        only accepts scalars); used by ``preprocess_data``.
        """
        for dim in ('m', 'n', 'k'):
            df[dim] = pd.to_numeric(df[dim], errors='coerce')
        df['total_flops'] = 2 * df['m'] * df['n'] * df['k']
        df['bytes_accessed'] = (
            df['m'] * df['k'] + df['k'] * df['n'] + df['m'] * df['n']
        ) * 4
        intensity = df['total_flops'] / df['bytes_accessed']
        # Division by a zero byte count yields inf; treat it as missing so the
        # later median fill in preprocess_data handles it.
        df['arithmetic_intensity'] = intensity.replace([np.inf, -np.inf], np.nan)
        return df

    def get_default_numeric_values(self):
        """Return default values for missing numeric features."""
        return {
            # Memory-related defaults
            'total_memory': 12288,  # 12GB for RTX 4070
            'free_memory': 10240,   # Assuming 80% free
            'used_memory': 2048,    # Assuming 20% used
            'mem_util': 20.0,       # 20% utilization
            'mem_util2': 20.0,      # Secondary memory utilization
            # GPU state defaults
            'temp': 65.0,           # Default temperature
            'gpu_util': 80.0,       # Default GPU utilization
            'gpu_util1': 80.0,      # Secondary GPU utilization
            'clock_sm': 2475,       # Default SM clock for RTX 4070
            'power_limit': 200.0,   # Default power limit
            # NOTE(review): 'clocks.meme' looks like a typo for 'clocks.mem',
            # but the key is kept because the trained model may expect it.
            'clocks.meme': 2000,    # Memory clock speed
            'alpha': 1.0,           # Default scaling factor
            'beta': 0.0,            # Default scaling factor
            'problem_size_m': 1024,
            'problem_size_n': 1024,
            'problem_size_k': 1024,
        }

    def get_default_categorical_values(self):
        """Return default values for missing categorical features."""
        return {
            'stage': 'main',
            'kernel_name': 'cutlass_simt_sgemm_128x128_8x2_nn_align1',
            'computation_pattern': 'GEMM',
            'combination_type': 'standard',
            'state': 'active',
            'uses_shared_memory': 'true',
            'gpu_name': 'RTX4070',
        }

    def prepare_input_data(self, input_dict):
        """Build the single-row DataFrame passed to the model.

        Defaults are laid down first (numeric, then categorical) and
        overridden by ``input_dict``; the derived GEMM columns are appended
        afterwards. ``input_dict`` must contain every core feature.

        Raises:
            KeyError: if a core feature is missing from ``input_dict``.
        """
        complete_input = {
            **self.get_default_numeric_values(),
            **self.get_default_categorical_values(),
            **input_dict,
        }
        df = pd.DataFrame([complete_input])

        # Use the plain Python values rather than the DataFrame cells so the
        # FLOP count is not limited to fixed-width NumPy integers.
        characteristics = self.calculate_gemm_characteristics(
            complete_input['m'], complete_input['n'], complete_input['k'],
            complete_input['blocksize1'], complete_input['blocksize2'],
            complete_input['blocksize3'],
        )
        # Column order is kept stable in case the model is order-sensitive.
        df['total_flops'] = characteristics['total_flops']
        df['bytes_accessed'] = characteristics['bytes_accessed']
        df['arithmetic_intensity'] = characteristics['arithmetic_intensity']

        for col in self.categorical_features:
            if col in df.columns:
                df[col] = df[col].astype(str)
        for col in self.numerical_features:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        return df

    @staticmethod
    def estimate_power(df):
        """Fill missing ``power`` values with a FLOP-scaled estimate.

        Adds an ``estimated_power`` column between ``BASE_POWER`` and
        ``MAX_POWER`` and uses it wherever ``power`` is NaN. ``df`` is
        modified in place and also returned. Requires ``total_flops`` and
        ``power`` columns.
        """
        BASE_POWER = 30
        MAX_POWER = 200
        MAX_TFLOPS = 40
        # Cap the fraction at 1 so the estimate never exceeds MAX_POWER;
        # otherwise filter_power_bounds would discard the estimated rows.
        load_fraction = (df['total_flops'] / (MAX_TFLOPS * 1e12)).clip(upper=1.0)
        df['estimated_power'] = BASE_POWER + (MAX_POWER - BASE_POWER) * load_fraction
        df['power'] = df['power'].fillna(df['estimated_power'])
        return df

    @staticmethod
    def filter_power_bounds(df):
        """Return the rows whose ``power`` is missing or within [25, 200]."""
        MIN_POWER = 25   # Minimum idle power
        MAX_POWER = 200  # Maximum TDP
        keep = df['power'].between(MIN_POWER, MAX_POWER) | df['power'].isna()
        # Copy so later column assignments do not write into a slice of df.
        return df[keep].copy()

    @staticmethod
    def impute_power(df):
        """Impute missing ``power`` values with a random-forest regressor.

        Adds a ``total_elements`` column, trains on the rows whose power is
        known and predicts the rest. When there is nothing to impute, or
        nothing to learn from, ``df`` is returned without fitting a model.
        """
        df['total_elements'] = df['m'] * df['n'] * df['k']
        missing = df['power'].isna()
        if not missing.any() or missing.all():
            return df

        features = ['total_elements', 'total_flops', 'arithmetic_intensity']
        known = df[~missing]
        model = RandomForestRegressor(n_estimators=100)
        model.fit(known[features], known['power'])
        df.loc[missing, 'power'] = model.predict(df.loc[missing, features])
        return df

    def preprocess_data(self, df):
        """Preprocess benchmark data, focusing on GEMM characteristics and power.

        Steps: normalise missing-value markers, derive the GEMM columns,
        estimate/impute/filter power, then clip each numerical feature to its
        1st-99th percentile range and fill remaining gaps with the median.

        Args:
            df: DataFrame with at least ``m``, ``n``, ``k``, ``Layout`` and
                ``power`` columns. It is not modified.

        Returns:
            A new, processed DataFrame.

        Raises:
            Exception: any error is printed and re-raised unchanged.
        """
        print("\nPreprocessing data...")
        try:
            df_processed = df.copy()
            df_processed = df_processed.replace('[N/A]', np.nan)
            df_processed = df_processed.replace('', np.nan)
            df_processed = self._add_gemm_characteristics(df_processed)
            df_processed['Layout'] = df_processed['Layout'].astype(str)
            # Power may arrive as text once '[N/A]' markers are involved.
            df_processed['power'] = pd.to_numeric(df_processed['power'], errors='coerce')
            df_processed = self.estimate_power(df_processed)
            df_processed = self.impute_power(df_processed)
            df_processed = self.filter_power_bounds(df_processed)

            present = [
                col for col in self.numerical_features
                if col in df_processed.columns
            ]
            for col in present:
                values = pd.to_numeric(df_processed[col], errors='coerce')
                lower = values.quantile(0.01)
                upper = values.quantile(0.99)
                values = values.clip(lower, upper)
                df_processed[col] = values.fillna(values.median())

            print("Data preprocessing completed successfully")
            print("Features summary:")
            print(df_processed[present].describe())
            return df_processed
        except Exception as e:
            print(f"Error in preprocess_data: {str(e)}")
            raise

    def predict(self, input_data):
        """Make predictions using the stacked model.

        Args:
            input_data: dict containing every core feature plus any optional
                overrides of the default feature values.

        Returns:
            dict mapping each target feature to the model output at the same
            position in the first prediction row, plus a ``characteristics``
            entry holding the analytic GEMM characteristics.
        """
        df = self.prepare_input_data(input_data)
        predictions = self.stacked_model.predict(df)
        # Map predictions to target features
        first_row = predictions[0]
        prediction_dict = {
            target: first_row[i]
            for i, target in enumerate(self.target_features)
        }
        prediction_dict['characteristics'] = self.calculate_gemm_characteristics(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'], input_data['blocksize3'],
        )
        return prediction_dict
def create_comparison_chart(current_metrics, optimal_metrics):
    """Build a grouped bar chart comparing two sets of predicted metrics.

    Both arguments are mappings that provide the keys ``runtime``, ``power``,
    ``Energy`` and ``TFlops``. Returns a plotly ``Figure`` with one bar group
    per metric: "Current" next to "Optimal".
    """
    metric_keys = ('runtime', 'power', 'Energy', 'TFlops')
    axis_labels = ['Runtime (ms)', 'Power (W)', 'Energy (J)', 'TFLOPS']

    current_values = [current_metrics[key] for key in metric_keys]
    optimal_values = [optimal_metrics[key] for key in metric_keys]

    bars = [
        go.Bar(name='Current', x=axis_labels, y=current_values, marker_color='#ff7c43'),
        go.Bar(name='Optimal', x=axis_labels, y=optimal_values, marker_color='#00ba38'),
    ]
    chart = go.Figure(data=bars)

    layout_options = {
        'barmode': 'group',
        'title': 'Performance Comparison',
        'xaxis_title': 'Metrics',
        'yaxis_title': 'Values',
        'height': 400,
    }
    chart.update_layout(**layout_options)
    return chart
def create_heatmap(m, n, k, block_m, block_n):
    """Build a heatmap with one cell per block of the blocked output matrix.

    The grid has ceil(m / block_m) rows and ceil(n / block_n) columns. The
    cell values are random numbers drawn from [0.5, 1.0), so the colours are
    decorative only. ``k`` is accepted but not used.
    """
    row_blocks = int(np.ceil(m / block_m))
    col_blocks = int(np.ceil(n / block_n))
    cell_values = np.random.uniform(0.5, 1.0, (row_blocks, col_blocks))

    heatmap = go.Heatmap(z=cell_values, colorscale='Viridis', showscale=False)
    chart = go.Figure(data=heatmap)

    layout_options = {
        'title': 'Matrix Blocking Visualization',
        'xaxis_title': 'N dimension (columns)',
        'yaxis_title': 'M dimension (rows)',
        'height': 300,
        'margin': dict(l=50, r=50, t=50, b=50),
    }
    chart.update_layout(**layout_options)
    return chart
def create_performance_metrics_chart(predictions, max_tflops=40):
    """Create a gauge chart for the predicted TFLOPS.

    Args:
        predictions: mapping that provides the predicted ``TFlops`` value.
        max_tflops: upper end of the gauge axis. Defaults to 40, the figure
            this app uses as the RTX 4070 theoretical maximum.

    Returns:
        A plotly ``Figure`` holding a single gauge indicator whose axis is
        split into three equal red / yellow / green bands.
    """
    tflops = predictions['TFlops']
    low_mark = max_tflops / 3
    high_mark = 2 * max_tflops / 3

    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=tflops,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "TFLOPS Performance"},
        gauge={
            'axis': {'range': [None, max_tflops]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, low_mark], 'color': "red"},
                {'range': [low_mark, high_mark], 'color': "yellow"},
                {'range': [high_mark, max_tflops], 'color': "green"},
            ],
            # Marker line drawn at the predicted value itself.
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': tflops,
            },
        },
    ))
    fig.update_layout(height=300)
    return fig
def create_efficiency_chart(arithmetic_intensity, mem_bandwidth_utilization, compute_utilization):
    """Create a spider chart showing various efficiency metrics.

    Args:
        arithmetic_intensity: FLOPs per byte; rescaled so that 200 maps to
            100 on the radial axis.
        mem_bandwidth_utilization: value plotted as-is on the 0-100 axis.
        compute_utilization: value plotted as-is on the 0-100 axis.

    Returns:
        A plotly ``Figure`` with one filled ``Scatterpolar`` trace.
    """
    # The radial axis is fixed to [0, 100]; large GEMMs easily exceed an
    # intensity of 200, so clamp the rescaled value to keep it on the chart.
    intensity_score = max(0.0, min(100.0, arithmetic_intensity / 200 * 100))

    fig = go.Figure()
    categories = ['Arithmetic Intensity', 'Memory BW Utilization', 'Compute Utilization']
    fig.add_trace(go.Scatterpolar(
        r=[intensity_score, mem_bandwidth_utilization, compute_utilization],
        theta=categories,
        fill='toself',
        name='Current Configuration',
    ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
            ),
        ),
        showlegend=False,
        height=300,
    )
    return fig
# Peak figures the app compares predictions against (RTX 4070).
_PEAK_TFLOPS = 40
_PEAK_BANDWIDTH_BYTES_PER_S = 504 * 1e9

_PAGE_CSS = """
<style>
.main {
padding: 2rem 1rem;
max-width: 100%;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
</style>
"""


def _bandwidth_utilization(predictions):
    """Return achieved memory bandwidth as a percentage of peak (max 100).

    Achieved bandwidth is bytes moved divided by the predicted runtime.
    NOTE(review): assumes the predicted runtime is in milliseconds, matching
    the "ms" label shown in the UI - confirm against the training data.
    """
    runtime_s = max(0.01, predictions['runtime']) / 1000.0
    achieved = predictions['characteristics']['bytes_accessed'] / runtime_s
    return min(100, achieved / _PEAK_BANDWIDTH_BYTES_PER_S * 100)


def _render_inputs():
    """Draw the three input columns and return the collected input dict."""
    col1, col2, col3 = st.columns([1, 1, 1])
    with col1:
        st.subheader("Matrix Dimensions")
        with st.expander("Set Matrix Dimensions", expanded=True):
            m = st.number_input("M", min_value=1, value=512)
            n = st.number_input("N", min_value=1, value=512)
            k = st.number_input("K", min_value=1, value=1024)
    with col2:
        st.subheader("Block Sizes")
        with st.expander("Set Block Dimensions", expanded=True):
            blocksize1 = st.number_input("Block Size 1", min_value=1, value=512)
            blocksize2 = st.number_input("Block Size 2", min_value=1, value=128)
            blocksize3 = st.number_input("Block Size 3", min_value=1, value=512)
    with col3:
        st.subheader("Configuration")
        with st.expander("Additional Settings", expanded=True):
            layout = st.selectbox("Matrix Layout", ['nn', 'nt', 'tn', 'tt'])
            kernel_name = st.selectbox(
                "CUTLASS Kernel",
                [
                    'cutlass_simt_sgemm_128x128_8x2_nn_align1',
                    'cutlass_simt_sgemm_128x128_8x2_nt_align1',
                    'cutlass_simt_sgemm_128x128_8x2_tn_align1',
                    'cutlass_simt_sgemm_128x128_8x2_tt_align1',
                ],
            )
            alpha = st.number_input("Alpha Scalar", value=1.00, step=0.25)
            beta = st.number_input("Beta Scalar", value=0.50, step=0.25)
    return {
        'm': m, 'n': n, 'k': k,
        'blocksize1': blocksize1,
        'blocksize2': blocksize2,
        'blocksize3': blocksize3,
        'Layout': layout,
        'kernel_name': kernel_name,
        'alpha': alpha,
        'beta': beta,
    }


def _render_metrics_tab(predictions, efficiency, bandwidth_util):
    """Draw the headline GEMM characteristics and predicted metrics."""
    characteristics = predictions['characteristics']
    st.subheader("GEMM Characteristics")
    metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
    with metric_col1:
        st.metric(
            "Arithmetic Intensity",
            f"{characteristics['arithmetic_intensity']:.2f}",
            f"{characteristics['bound_type'].upper()} bound",
        )
    with metric_col2:
        st.metric(
            "Total FLOPS",
            f"{characteristics['total_flops']/1e9:.2f}G",
            "Operations",
        )
    with metric_col3:
        st.metric(
            "Memory Accessed",
            f"{characteristics['bytes_accessed']/1e6:.2f}MB",
            "Total Data Movement",
        )
    with metric_col4:
        st.metric(
            "Memory Efficiency",
            f"{bandwidth_util:.1f}%",
            "vs Peak Bandwidth",
        )
    st.markdown("---")
    perf_col1, perf_col2, perf_col3, perf_col4 = st.columns(4)
    # Displayed values are floored so a slightly negative regression output
    # is never shown as a negative runtime, power or energy.
    with perf_col1:
        st.metric(
            "Runtime",
            f"{max(0.01, predictions['runtime']):.2f} ms",
            "Execution Time",
        )
    with perf_col2:
        st.metric(
            "Power",
            f"{max(1.0, predictions['power']):.2f} W",
            "Power Consumption",
        )
    with perf_col3:
        st.metric(
            "Energy",
            f"{max(0.01, predictions['Energy']):.2f} J",
            "Total Energy",
        )
    with perf_col4:
        st.metric(
            "TFLOPS",
            f"{predictions['TFlops']:.2f}",
            f"{efficiency:.1f}% of Peak",
        )


def _render_analysis_tab(predictions, input_data, efficiency):
    """Draw the matrix configuration summary and bottleneck verdicts."""
    m, n = input_data['m'], input_data['n']
    blocksize1 = input_data['blocksize1']
    blocksize2 = input_data['blocksize2']
    blocksize3 = input_data['blocksize3']
    # Ceiling division: a partially filled block still needs a grid cell, and
    # a block larger than the matrix yields one block rather than zero.
    grid_rows = -(-m // blocksize1)
    grid_cols = -(-n // blocksize2)

    st.subheader("Detailed Performance Analysis")
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("#### Matrix Configuration")
        st.markdown(
            f"- Total Matrix Elements: {m*n:,}\n"
            f"- Memory Footprint: {predictions['characteristics']['bytes_accessed']/1e6:.2f} MB\n"
            f"- Block Dimensions: {blocksize1}x{blocksize2}x{blocksize3}\n"
            f"- Grid Size: {grid_rows}x{grid_cols} blocks\n"
        )
    with col2:
        st.markdown("#### Performance Bottlenecks")
        if predictions['characteristics']['bound_type'] == 'compute':
            st.success("✅ Compute Bound - Optimal for GPU")
        else:
            st.warning("⚠️ Memory Bound - Consider Optimization")
        if efficiency < 30:
            st.error("🔴 Low Compute Efficiency - Check Configuration")
        elif efficiency < 60:
            st.warning("🟡 Moderate Efficiency - Room for Improvement")
        else:
            st.success("🟢 Good Efficiency")


def _render_visualization_tab(predictions, input_data, bandwidth_util):
    """Draw the gauge, the efficiency spider chart and the blocking heatmap."""
    st.subheader("Performance Visualizations")
    viz_col1, viz_col2 = st.columns(2)
    with viz_col1:
        st.plotly_chart(create_performance_metrics_chart(predictions), use_container_width=True)
    with viz_col2:
        compute_util = min(100, (predictions['TFlops'] / _PEAK_TFLOPS) * 100)
        st.plotly_chart(
            create_efficiency_chart(
                predictions['characteristics']['arithmetic_intensity'],
                bandwidth_util,
                compute_util,
            ),
            use_container_width=True,
        )
    st.plotly_chart(
        create_heatmap(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'],
        ),
        use_container_width=True,
    )


def _render_recommendations(predictions, input_data, efficiency):
    """List tuning hints derived from the inputs and the predictions."""
    st.markdown("### Recommendations")
    recommendations = []
    if input_data['blocksize1'] * input_data['blocksize2'] > 1024:
        recommendations.append("⚠️ Block size might be too large for optimal occupancy")
    if predictions['characteristics']['arithmetic_intensity'] < 30:
        recommendations.append("Consider increasing arithmetic intensity through blocking")
    if efficiency < 50:
        recommendations.append("Performance is below 50% of peak - try different block sizes")
    if recommendations:
        for rec in recommendations:
            st.markdown(f"- {rec}")
    else:
        st.success("Current configuration appears optimal!")


def main():
    """Run the Streamlit page: collect inputs, predict, and render results.

    Any exception raised while loading the model or rendering is reported in
    the page rather than propagated, since this is the top-level UI boundary.
    """
    st.set_page_config(page_title="GEMM Performance Predictor", layout="wide")
    st.markdown(_PAGE_CSS, unsafe_allow_html=True)
    st.title("GEMM Performance Predictor for RTX 4070")
    try:
        predictor = GEMMPredictor()
        input_data = _render_inputs()
        if st.button("Analyze Performance", use_container_width=True):
            with st.spinner("Analyzing performance..."):
                predictions = predictor.predict(input_data)
                efficiency = (predictions['TFlops'] / _PEAK_TFLOPS) * 100
                bandwidth_util = _bandwidth_utilization(predictions)

                tab1, tab2, tab3 = st.tabs(
                    ["Performance Metrics", "Detailed Analysis", "Visualizations"]
                )
                with tab1:
                    _render_metrics_tab(predictions, efficiency, bandwidth_util)
                with tab2:
                    _render_analysis_tab(predictions, input_data, efficiency)
                with tab3:
                    _render_visualization_tab(predictions, input_data, bandwidth_util)
                _render_recommendations(predictions, input_data, efficiency)
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        # GEMMPredictor() is constructed with its default path, 'model.joblib'.
        st.write("Please make sure the model file 'model.joblib' is in the correct directory.")
        st.write("If the error persists, check the input parameters and model compatibility.")
# Script entry point: start the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()