Spaces:

pavlyhalim
/

gemm_predictor

Sleeping

gemm_predictor / app.py

pavlyhalim

updated app.py

368a654 about 1 year ago

20.8 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import joblib
	import plotly.graph_objects as go
	from sklearn.ensemble import RandomForestRegressor

	class GEMMPredictor:
	    """Predict GEMM performance metrics with a pre-trained model loaded from disk.

	    The wrapped model is asked for one row of predictions and the values are
	    mapped, in order, onto ``target_features`` (runtime, power, Energy,
	    TFlops).  The class also carries the preprocessing helpers used to clean
	    a raw measurements DataFrame (power estimation, imputation, filtering).
	    """

	    def __init__(self, model_path='model.joblib'):
	        """Load the serialized model and initialize the feature lists.

	        Args:
	            model_path: Path of the joblib file that holds the trained model.
	        """
	        # NOTE: joblib.load unpickles arbitrary objects; only load model
	        # files that come from a trusted source.
	        self.stacked_model = joblib.load(model_path)
	        self.initialize_features()

	    def initialize_features(self):
	        """Initialize the feature-name lists used by the model."""
	        # Core matrix features
	        self.core_features = [
	            'm', 'n', 'k',
	            'blocksize1', 'blocksize2', 'blocksize3',
	        ]
	        # Derived features
	        self.derived_features = [
	            'arithmetic_intensity',
	            'bytes_accessed',
	            'total_flops',
	        ]
	        # Categorical features
	        self.categorical_features = ['Layout']
	        # Target features (order must match the model's output columns)
	        self.target_features = [
	            'runtime',
	            'power',
	            'Energy',
	            'TFlops',
	        ]
	        self.numerical_features = self.core_features + self.derived_features

	    def calculate_gemm_characteristics(self, m, n, k, blocksize1, blocksize2, blocksize3):
	        """Calculate GEMM-specific characteristics for one problem size.

	        Args:
	            m, n, k: Matrix dimensions of the GEMM.
	            blocksize1, blocksize2, blocksize3: Accepted for interface
	                compatibility; they do not influence the result.

	        Returns:
	            dict with ``total_flops``, ``bytes_accessed``,
	            ``arithmetic_intensity`` and ``bound_type`` ('compute' when the
	            arithmetic intensity exceeds 59, otherwise 'memory').
	        """
	        total_flops = 2 * m * n * k  # 2 operations per FMA
	        bytes_accessed = (m * k + k * n + m * n) * 4  # Single precision
	        # Guard the degenerate all-zero problem instead of dividing by zero.
	        arithmetic_intensity = total_flops / bytes_accessed if bytes_accessed else 0.0
	        bound_type = 'compute' if arithmetic_intensity > 59 else 'memory'

	        return {
	            'total_flops': total_flops,
	            'bytes_accessed': bytes_accessed,
	            'arithmetic_intensity': arithmetic_intensity,
	            'bound_type': bound_type,
	        }

	    def get_default_numeric_values(self):
	        """Return default values for missing numeric features."""
	        return {
	            # Memory-related defaults
	            'total_memory': 12288,  # 12GB for RTX 4070
	            'free_memory': 10240,  # Assuming 80% free
	            'used_memory': 2048,  # Assuming 20% used
	            'mem_util': 20.0,  # 20% utilization
	            'mem_util2': 20.0,  # Secondary memory utilization

	            # GPU state defaults
	            'temp': 65.0,  # Default temperature
	            'gpu_util': 80.0,  # Default GPU utilization
	            'gpu_util1': 80.0,  # Secondary GPU utilization
	            'clock_sm': 2475,  # Default SM clock for RTX 4070
	            'power_limit': 200.0,  # Default power limit
	            # NOTE(review): key spelling kept as-is; presumably it matches
	            # the column name the model was trained on -- confirm.
	            'clocks.meme': 2000,  # Memory clock speed

	            'alpha': 1.0,  # Default scaling factor
	            'beta': 0.0,  # Default scaling factor
	            'problem_size_m': 1024,
	            'problem_size_n': 1024,
	            'problem_size_k': 1024,
	        }

	    def get_default_categorical_values(self):
	        """Return default values for missing categorical features."""
	        return {
	            'stage': 'main',
	            'kernel_name': 'cutlass_simt_sgemm_128x128_8x2_nn_align1',
	            'computation_pattern': 'GEMM',
	            'combination_type': 'standard',
	            'state': 'active',
	            'uses_shared_memory': 'true',
	            'gpu_name': 'RTX4070',
	        }

	    def prepare_input_data(self, input_dict):
	        """Build the single-row DataFrame handed to the model.

	        Defaults are used for every feature missing from ``input_dict``;
	        values in ``input_dict`` override the defaults.  The derived GEMM
	        columns are then added and feature columns are type-normalized.

	        Args:
	            input_dict: Mapping that must contain 'm', 'n', 'k' and the
	                three 'blocksize*' keys.

	        Returns:
	            A one-row pandas DataFrame.

	        Raises:
	            KeyError: If a required dimension or block size is missing.
	        """
	        # Merge both default dicts (a set literal of dicts would be a
	        # TypeError), then let the caller's values take precedence.
	        complete_input = {
	            **self.get_default_numeric_values(),
	            **self.get_default_categorical_values(),
	        }
	        complete_input.update(input_dict)

	        df = pd.DataFrame([complete_input])

	        characteristics = self.calculate_gemm_characteristics(
	            complete_input['m'], complete_input['n'], complete_input['k'],
	            complete_input['blocksize1'], complete_input['blocksize2'],
	            complete_input['blocksize3'],
	        )
	        for name in ('total_flops', 'bytes_accessed', 'arithmetic_intensity'):
	            df[name] = characteristics[name]

	        for col in self.categorical_features:
	            if col in df.columns:
	                df[col] = df[col].astype(str)

	        for col in self.numerical_features:
	            if col in df.columns:
	                df[col] = pd.to_numeric(df[col], errors='coerce')

	        return df

	    @staticmethod
	    def _add_gemm_characteristics(df):
	        """Add total_flops, bytes_accessed and arithmetic_intensity columns to ``df``."""
	        m, n, k = df['m'], df['n'], df['k']
	        df['total_flops'] = 2 * m * n * k  # 2 operations per FMA
	        df['bytes_accessed'] = (m * k + k * n + m * n) * 4  # Single precision
	        # A zero byte count would divide by zero; those rows become NaN.
	        df['arithmetic_intensity'] = (
	            df['total_flops'] / df['bytes_accessed'].replace(0, np.nan)
	        )
	        return df

	    @staticmethod
	    def estimate_power(df):
	        """Fill missing 'power' values with a linear estimate based on total_flops.

	        Adds an 'estimated_power' column to ``df`` (modified in place) and
	        returns the same DataFrame.
	        """
	        BASE_POWER = 30
	        MAX_POWER = 200
	        MAX_TFLOPS = 40

	        df['estimated_power'] = BASE_POWER + (
	            (MAX_POWER - BASE_POWER) *
	            (df['total_flops'] / (MAX_TFLOPS * 1e12))
	        )

	        df['power'] = df['power'].fillna(df['estimated_power'])

	        return df

	    @staticmethod
	    def filter_power_bounds(df):
	        """Keep rows whose 'power' lies in [25, 200] or is missing.

	        Returns a new DataFrame; ``df`` itself is not modified.
	        """
	        MIN_POWER = 25  # Minimum idle power
	        MAX_POWER = 200  # Maximum TDP

	        keep = df['power'].between(MIN_POWER, MAX_POWER) | df['power'].isna()
	        # Copy so later column assignments do not target a view of ``df``.
	        return df[keep].copy()

	    @staticmethod
	    def impute_power(df):
	        """Impute missing 'power' values with a random forest fitted on known rows.

	        Adds a 'total_elements' column to ``df`` (modified in place).  When
	        no value is missing, or no value is known to learn from, ``df`` is
	        returned without fitting a model.
	        """
	        df['total_elements'] = df['m'] * df['n'] * df['k']

	        missing = df['power'].isna()
	        # Fitting or predicting on an empty frame raises, so bail out early.
	        if not missing.any() or missing.all():
	            return df

	        features = ['total_elements', 'total_flops', 'arithmetic_intensity']
	        known = df.loc[~missing]

	        model = RandomForestRegressor(n_estimators=100)
	        model.fit(known[features], known['power'])

	        df.loc[missing, 'power'] = model.predict(df.loc[missing, features])

	        return df

	    def preprocess_data(self, df):
	        """Preprocess raw measurements, focusing on GEMM characteristics and power.

	        Steps: replace '[N/A]' and empty strings with NaN, coerce the
	        dimension and power columns to numbers, add the derived GEMM
	        columns, estimate / impute / bound-filter power, then clip every
	        numerical feature to its 1st-99th percentile range and fill the
	        remaining gaps with the column median.

	        Args:
	            df: Raw DataFrame; it is copied, not modified.

	        Returns:
	            The processed DataFrame.

	        Raises:
	            Exception: Any error is printed and re-raised unchanged.
	        """
	        print("\nPreprocessing data...")

	        try:
	            df_processed = df.copy()
	            df_processed = df_processed.replace('[N/A]', np.nan)
	            df_processed = df_processed.replace('', np.nan)

	            # Placeholders leave these columns as object dtype; arithmetic
	            # and range checks below need real numbers.
	            for col in ('m', 'n', 'k', 'power'):
	                if col in df_processed.columns:
	                    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')

	            df_processed = self._add_gemm_characteristics(df_processed)

	            df_processed['Layout'] = df_processed['Layout'].astype(str)

	            df_processed = self.estimate_power(df_processed)
	            df_processed = self.impute_power(df_processed)
	            df_processed = self.filter_power_bounds(df_processed)

	            for col in self.numerical_features:
	                if col in df_processed.columns:
	                    df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
	                    lower = df_processed[col].quantile(0.01)
	                    upper = df_processed[col].quantile(0.99)
	                    df_processed[col] = df_processed[col].clip(lower, upper)
	                    df_processed[col] = df_processed[col].fillna(df_processed[col].median())

	            print("Data preprocessing completed successfully")
	            print("Features summary:")
	            # Only summarize columns that exist to avoid a KeyError.
	            present = [c for c in self.numerical_features if c in df_processed.columns]
	            print(df_processed[present].describe())

	            return df_processed

	        except Exception as e:
	            print(f"Error in preprocess_data: {str(e)}")
	            raise

	    def predict(self, input_data):
	        """Make predictions using the stacked model.

	        Args:
	            input_data: Mapping with the GEMM parameters (see
	                ``prepare_input_data``).

	        Returns:
	            dict with one entry per target feature plus a
	            'characteristics' entry holding the derived GEMM values.
	        """
	        df = self.prepare_input_data(input_data)
	        predictions = self.stacked_model.predict(df)

	        # Map predictions to target features; the first output row is
	        # indexed per target, so the model must return one column per target.
	        prediction_dict = {
	            target: predictions[0][i]
	            for i, target in enumerate(self.target_features)
	        }

	        prediction_dict['characteristics'] = self.calculate_gemm_characteristics(
	            input_data['m'], input_data['n'], input_data['k'],
	            input_data['blocksize1'], input_data['blocksize2'], input_data['blocksize3'],
	        )

	        return prediction_dict

	def create_comparison_chart(current_metrics, optimal_metrics):
	    """Build a grouped plotly bar chart comparing two sets of metrics.

	    Both arguments are mappings that provide the keys 'runtime', 'power',
	    'Energy' and 'TFlops'.  Returns the plotly Figure.
	    """
	    metric_keys = ('runtime', 'power', 'Energy', 'TFlops')
	    axis_labels = ['Runtime (ms)', 'Power (W)', 'Energy (J)', 'TFLOPS']

	    # Pull the values out up front so a missing key fails before any
	    # plotly object is created.
	    current_values = [current_metrics[key] for key in metric_keys]
	    optimal_values = [optimal_metrics[key] for key in metric_keys]

	    series = (
	        ('Current', current_values, '#ff7c43'),
	        ('Optimal', optimal_values, '#00ba38'),
	    )
	    bars = [
	        go.Bar(name=label, x=axis_labels, y=values, marker_color=colour)
	        for label, values, colour in series
	    ]

	    figure = go.Figure(data=bars)
	    figure.update_layout(
	        barmode='group',
	        title='Performance Comparison',
	        xaxis_title='Metrics',
	        yaxis_title='Values',
	        height=400,
	    )
	    return figure

	def create_heatmap(m, n, k, block_m, block_n):
	    """Draw a heatmap with one cell per block of the M x N output grid.

	    The grid has ceil(m / block_m) rows and ceil(n / block_n) columns.  Cell
	    values are random numbers drawn uniformly from [0.5, 1.0); they are not
	    derived from the inputs.  ``k`` is accepted but not used.
	    """
	    shape = (
	        int(np.ceil(m / block_m)),
	        int(np.ceil(n / block_n)),
	    )
	    cells = np.random.uniform(0.5, 1.0, shape)

	    heatmap = go.Heatmap(z=cells, colorscale='Viridis', showscale=False)
	    figure = go.Figure(data=heatmap)

	    layout_options = {
	        'title': 'Matrix Blocking Visualization',
	        'xaxis_title': 'N dimension (columns)',
	        'yaxis_title': 'M dimension (rows)',
	        'height': 300,
	        'margin': dict(l=50, r=50, t=50, b=50),
	    }
	    figure.update_layout(**layout_options)

	    return figure

	def create_performance_metrics_chart(predictions, max_tflops=40):
	    """Create a gauge chart showing the predicted TFLOPS.

	    Args:
	        predictions: Mapping that provides the predicted 'TFlops' value.
	        max_tflops: Upper end of the gauge axis.  Defaults to 40, the value
	            the original code labelled as the RTX 4070 theoretical max.

	    Returns:
	        A plotly Figure with a single gauge indicator whose axis is split
	        into red / yellow / green thirds.
	    """
	    tflops = predictions['TFlops']
	    one_third = max_tflops / 3
	    two_thirds = 2 * max_tflops / 3

	    fig = go.Figure(go.Indicator(
	        mode="gauge+number",
	        value=tflops,
	        domain={'x': [0, 1], 'y': [0, 1]},
	        title={'text': "TFLOPS Performance"},
	        gauge={
	            'axis': {'range': [None, max_tflops]},
	            'bar': {'color': "darkblue"},
	            'steps': [
	                {'range': [0, one_third], 'color': "red"},
	                {'range': [one_third, two_thirds], 'color': "yellow"},
	                {'range': [two_thirds, max_tflops], 'color': "green"},
	            ],
	            # The threshold marker sits on the predicted value itself.
	            'threshold': {
	                'line': {'color': "red", 'width': 4},
	                'thickness': 0.75,
	                'value': tflops,
	            },
	        },
	    ))

	    fig.update_layout(height=300)
	    return fig

	def create_efficiency_chart(arithmetic_intensity, mem_bandwidth_utilization, compute_utilization):
	    """Build a radar (polar) chart with three efficiency axes.

	    The arithmetic intensity is rescaled by ``/ 200 * 100`` before plotting;
	    the two utilization values are plotted as given.  The radial axis is
	    fixed to the range 0-100.  Returns the plotly Figure.
	    """
	    axis_names = ['Arithmetic Intensity', 'Memory BW Utilization', 'Compute Utilization']
	    scaled_intensity = arithmetic_intensity / 200 * 100
	    radii = [scaled_intensity, mem_bandwidth_utilization, compute_utilization]

	    trace = go.Scatterpolar(
	        r=radii,
	        theta=axis_names,
	        fill='toself',
	        name='Current Configuration',
	    )

	    figure = go.Figure()
	    figure.add_trace(trace)

	    radial_axis = dict(visible=True, range=[0, 100])
	    figure.update_layout(
	        polar=dict(radialaxis=radial_axis),
	        showlegend=False,
	        height=300,
	    )

	    return figure

	def main():
	    """Render the Streamlit page: collect GEMM parameters, predict, and display results.

	    The page has three input columns (matrix dimensions, block sizes,
	    configuration).  Pressing "Analyze Performance" runs the predictor and
	    shows the results in three tabs.  Any exception raised while building
	    the page or predicting is reported on the page instead of propagating.
	    """
	    st.set_page_config(page_title="GEMM Performance Predictor", layout="wide")
	    st.markdown("""
	    <style>
	    .main {
	        padding: 2rem 1rem;
	        max-width: 100%;
	    }
	    .metric-card {
	        background-color: #f0f2f6;
	        padding: 1rem;
	        border-radius: 0.5rem;
	        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	    }
	    </style>
	    """, unsafe_allow_html=True)

	    st.title("GEMM Performance Predictor for RTX 4070")

	    try:
	        predictor = GEMMPredictor()
	        col1, col2, col3 = st.columns([1, 1, 1])

	        with col1:
	            st.subheader("Matrix Dimensions")
	            with st.expander("Set Matrix Dimensions", expanded=True):
	                m = st.number_input("M", min_value=1, value=512)
	                n = st.number_input("N", min_value=1, value=512)
	                k = st.number_input("K", min_value=1, value=1024)

	        with col2:
	            st.subheader("Block Sizes")
	            with st.expander("Set Block Dimensions", expanded=True):
	                blocksize1 = st.number_input("Block Size 1", min_value=1, value=512)
	                blocksize2 = st.number_input("Block Size 2", min_value=1, value=128)
	                blocksize3 = st.number_input("Block Size 3", min_value=1, value=512)

	        with col3:
	            st.subheader("Configuration")
	            with st.expander("Additional Settings", expanded=True):
	                layout = st.selectbox("Matrix Layout", ['nn', 'nt', 'tn', 'tt'])
	                kernel_name = st.selectbox(
	                    "CUTLASS Kernel",
	                    [
	                        'cutlass_simt_sgemm_128x128_8x2_nn_align1',
	                        'cutlass_simt_sgemm_128x128_8x2_nt_align1',
	                        'cutlass_simt_sgemm_128x128_8x2_tn_align1',
	                        'cutlass_simt_sgemm_128x128_8x2_tt_align1'
	                    ]
	                )
	                alpha = st.number_input("Alpha Scalar", value=1.00, step=0.25)
	                beta = st.number_input("Beta Scalar", value=0.50, step=0.25)

	        if st.button("Analyze Performance", use_container_width=True):
	            with st.spinner("Analyzing performance..."):
	                input_data = {
	                    'm': m, 'n': n, 'k': k,
	                    'blocksize1': blocksize1,
	                    'blocksize2': blocksize2,
	                    'blocksize3': blocksize3,
	                    'Layout': layout,
	                    'kernel_name': kernel_name,
	                    'alpha': alpha,
	                    'beta': beta
	                }
	                predictions = predictor.predict(input_data)
	                characteristics = predictions['characteristics']

	                # Values shared by several tabs are computed once here.
	                efficiency = (predictions['TFlops'] / 40) * 100
	                # NOTE(review): this divides a byte count by 504e9 without
	                # involving the runtime, so it is not a bandwidth figure in
	                # the usual sense -- confirm the intended formula.
	                mem_bw_util = min(100, characteristics['bytes_accessed'] / (504 * 1e9) * 100)

	                # Ceiling division: a partially filled block still counts,
	                # which matches the grid drawn by create_heatmap.
	                grid_rows = -(-m // blocksize1)
	                grid_cols = -(-n // blocksize2)

	                tab1, tab2, tab3 = st.tabs(["Performance Metrics", "Detailed Analysis", "Visualizations"])

	                with tab1:
	                    st.subheader("GEMM Characteristics")
	                    metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)

	                    with metric_col1:
	                        st.metric(
	                            "Arithmetic Intensity",
	                            f"{characteristics['arithmetic_intensity']:.2f}",
	                            f"{characteristics['bound_type'].upper()} bound"
	                        )

	                    with metric_col2:
	                        st.metric(
	                            "Total FLOPS",
	                            f"{characteristics['total_flops']/1e9:.2f}G",
	                            "Operations"
	                        )

	                    with metric_col3:
	                        st.metric(
	                            "Memory Accessed",
	                            f"{characteristics['bytes_accessed']/1e6:.2f}MB",
	                            "Total Data Movement"
	                        )

	                    with metric_col4:
	                        st.metric(
	                            "Memory Efficiency",
	                            f"{mem_bw_util:.1f}%",
	                            "vs Peak Bandwidth"
	                        )

	                    st.markdown("---")

	                    perf_col1, perf_col2, perf_col3, perf_col4 = st.columns(4)

	                    # Displayed values are clamped to small positive floors
	                    # so a non-positive prediction is never shown.
	                    with perf_col1:
	                        st.metric(
	                            "Runtime",
	                            f"{max(0.01, predictions['runtime']):.2f} ms",
	                            "Execution Time"
	                        )

	                    with perf_col2:
	                        st.metric(
	                            "Power",
	                            f"{max(1.0, predictions['power']):.2f} W",
	                            "Power Consumption"
	                        )

	                    with perf_col3:
	                        st.metric(
	                            "Energy",
	                            f"{max(0.01, predictions['Energy']):.2f} J",
	                            "Total Energy"
	                        )

	                    with perf_col4:
	                        st.metric(
	                            "TFLOPS",
	                            f"{predictions['TFlops']:.2f}",
	                            f"{efficiency:.1f}% of Peak"
	                        )

	                with tab2:
	                    st.subheader("Detailed Performance Analysis")

	                    col1, col2 = st.columns(2)

	                    with col1:
	                        st.markdown("#### Matrix Configuration")
	                        st.markdown(f"""
	                        - Total Matrix Elements: {m*n:,}
	                        - Memory Footprint: {characteristics['bytes_accessed']/1e6:.2f} MB
	                        - Block Dimensions: {blocksize1}x{blocksize2}x{blocksize3}
	                        - Grid Size: {grid_rows}x{grid_cols} blocks
	                        """)

	                    with col2:
	                        st.markdown("#### Performance Bottlenecks")
	                        ai = characteristics['arithmetic_intensity']
	                        if ai > 59:
	                            st.success("✅ Compute Bound - Optimal for GPU")
	                        else:
	                            st.warning("⚠️ Memory Bound - Consider Optimization")

	                        if efficiency < 30:
	                            st.error("🔴 Low Compute Efficiency - Check Configuration")
	                        elif efficiency < 60:
	                            st.warning("🟡 Moderate Efficiency - Room for Improvement")
	                        else:
	                            st.success("🟢 Good Efficiency")

	                with tab3:
	                    st.subheader("Performance Visualizations")

	                    viz_col1, viz_col2 = st.columns(2)

	                    with viz_col1:
	                        st.plotly_chart(create_performance_metrics_chart(predictions), use_container_width=True)

	                    with viz_col2:
	                        compute_util = min(100, efficiency)
	                        st.plotly_chart(
	                            create_efficiency_chart(
	                                characteristics['arithmetic_intensity'],
	                                mem_bw_util,
	                                compute_util
	                            ),
	                            use_container_width=True
	                        )

	                    st.plotly_chart(create_heatmap(m, n, k, blocksize1, blocksize2), use_container_width=True)

	                    # NOTE(review): recommendations are rendered inside the
	                    # Visualizations tab -- confirm this is the intended place.
	                    st.markdown("### Recommendations")

	                    recommendations = []
	                    if blocksize1 * blocksize2 > 1024:
	                        recommendations.append("⚠️ Block size might be too large for optimal occupancy")
	                    if characteristics['arithmetic_intensity'] < 30:
	                        recommendations.append("Consider increasing arithmetic intensity through blocking")
	                    if efficiency < 50:
	                        recommendations.append("Performance is below 50% of peak - try different block sizes")

	                    if recommendations:
	                        for rec in recommendations:
	                            st.markdown(f"- {rec}")
	                    else:
	                        st.success("Current configuration appears optimal!")

	    except Exception as e:
	        # Top-level UI boundary: show the failure on the page rather than
	        # letting the script crash.
	        st.error(f"An error occurred: {str(e)}")
	        # The file name matches the default path GEMMPredictor() loads.
	        st.write("Please make sure the model file 'model.joblib' is in the correct directory.")
	        st.write("If the error persists, check the input parameters and model compatibility.")

	# Start the Streamlit app only when this file is run as a script, not
	# when it is imported as a module.
	if __name__ == "__main__":
	    main()