import gradio as gr
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import json
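# Gradio app for the Scientific ML Benchmark Suite: domain-specific benchmark scores,
# SCIEVAL/OSIR scientific-reasoning metrics, and a combined leaderboard.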
# Domain-specific model evaluations
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}
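# Note: MODEL_EVALS scores are static values on a 0.0-1.0 scale (higher is better),
# as described in the About tab.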
# SCIEVAL/OSIR metrics data
SCIEVAL_METRICS = {
    "Nexa Mistral Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.2,
            "Internal Consistency": 8.5,
            "Hypothesis Framing": 6.8,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 7.3,
            "Symbolism & Math Logic": 6.1,
            "Scientific Utility": 7.6,
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.1,
            "Internal Consistency": 8.9,
            "Hypothesis Framing": 7.4,
            "Thematic Grounding": 8.2,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 7.8,
            "Scientific Utility": 8.3,
        },
    },
    "Nexa-Llama-Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.7,
            "Internal Consistency": 7.8,
            "Hypothesis Framing": 7.5,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 5.9,
            "Scientific Utility": 7.0,
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.0,
            "Internal Consistency": 8.0,
            "Hypothesis Framing": 7.8,
            "Thematic Grounding": 8.1,
            "Citation & Structure": 6.2,
            "Symbolism & Math Logic": 6.5,
            "Scientific Utility": 7.4,
        },
    },
}
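# Note: SCIEVAL_METRICS scores are on a 1-10 scale across the seven OSIR dimensions
# listed in the SCIEVAL Metrics tab; each model has a general OSIR profile and a
# physics-specific OSIR-Field profile.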
def plot_domain_benchmark(domain):
    """Create horizontal bar chart for domain-specific benchmarks"""
    models = list(MODEL_EVALS[domain].keys())
    scores = list(MODEL_EVALS[domain].values())

    # Color coding for Nexa models vs others
    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=models,
        x=scores,
        orientation='h',
        marker_color=colors,
        text=[f'{score:.3f}' for score in scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"Model Benchmark Scores – {domain}",
        yaxis_title="Model",
        xaxis_title="Score",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        showlegend=False
    )
    return fig
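# Example (standalone use outside the app, e.g. for a quick check in a notebook):
#   plot_domain_benchmark("Proteins").show()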
def plot_scieval_comparison(model_name):
    """Create horizontal comparison chart for SCIEVAL metrics"""
    if model_name not in SCIEVAL_METRICS:
        return go.Figure()

    metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
    osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
    field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='OSIR (General)',
        y=metrics,
        x=osir_scores,
        orientation='h',
        marker_color='#FFD700',
        text=[f'{score:.1f}' for score in osir_scores],
        textposition='auto'
    ))
    fig.add_trace(go.Bar(
        name='OSIR-Field (Physics)',
        y=metrics,
        x=field_scores,
        orientation='h',
        marker_color='#FF6B35',
        text=[f'{score:.1f}' for score in field_scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"SCIEVAL Metrics Comparison – {model_name}",
        yaxis_title="Metric",
        xaxis_title="Score (1-10)",
        xaxis_range=[0, 10],
        template="plotly_white",
        height=500,
        barmode='group'
    )
    return fig
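# The two OSIR profiles are drawn as grouped horizontal bars so each metric can be read
# side by side; a go.Scatterpolar radar chart would be a possible alternative presentation.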
def create_leaderboard():
    """Create leaderboard table"""
    leaderboard_data = []

    # Best model per domain benchmark
    for domain, models in MODEL_EVALS.items():
        best_model = max(models.items(), key=lambda x: x[1])
        leaderboard_data.append({
            "Domain": domain,
            "Best Model": best_model[0],
            "Score": f"{best_model[1]:.3f}",
            "Metric Type": "Domain Benchmark"
        })

    # Average SCIEVAL scores for each evaluated model (general and physics profiles)
    for model, evaluations in SCIEVAL_METRICS.items():
        avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
        avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
        leaderboard_data.append({
            "Domain": "OSIR General",
            "Best Model": model,
            "Score": f"{avg_osir:.2f}",
            "Metric Type": "SCIEVAL"
        })
        leaderboard_data.append({
            "Domain": "OSIR Physics",
            "Best Model": model,
            "Score": f"{avg_field:.2f}",
            "Metric Type": "SCIEVAL"
        })

    df = pd.DataFrame(leaderboard_data)
    return df
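# Example (standalone use):
#   print(create_leaderboard().to_string(index=False))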
def get_model_details(domain):
    """Get JSON details for domain models"""
    return json.dumps(MODEL_EVALS[domain], indent=2)

def display_domain_eval(domain):
    """Display domain evaluation results"""
    plot = plot_domain_benchmark(domain)
    details = get_model_details(domain)
    return plot, details

def display_scieval(model_name):
    """Display SCIEVAL results"""
    plot = plot_scieval_comparison(model_name)
    if model_name in SCIEVAL_METRICS:
        details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
    else:
        details = "Model not found in SCIEVAL database"
    return plot, details
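# Each display_* helper returns a (figure, JSON string) pair that maps onto the
# gr.Plot / gr.Code output pairs wired up in the interface below.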
# Create Gradio interface
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔬 Scientific ML Benchmark Suite
    ### Comprehensive evaluation framework for scientific machine learning models

    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
    a comprehensive assessment of ML models across scientific disciplines.
    """)
    with gr.Tabs():
        # Domain Benchmarks Tab
        with gr.TabItem("🧪 Domain Benchmarks"):
            gr.Markdown("""
            ### Domain-Specific Model Evaluations
            Compare models across scientific domains including Proteins, Astronomy, Materials Science,
            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
            """)

            with gr.Row():
                domain_dropdown = gr.Dropdown(
                    choices=list(MODEL_EVALS.keys()),
                    label="Select Scientific Domain",
                    value="Proteins"
                )
                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")

            with gr.Row():
                domain_plot = gr.Plot(label="Domain Benchmark Results")
                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")

            domain_btn.click(
                display_domain_eval,
                inputs=domain_dropdown,
                outputs=[domain_plot, domain_metrics]
            )
        # SCIEVAL Tab
        with gr.TabItem("📊 SCIEVAL Metrics"):
            gr.Markdown("""
            ### SCIEVAL: Scientific Reasoning Evaluation
            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.

            **Metrics evaluated:**
            - **Entropy/Novelty**: Originality and information richness
            - **Internal Consistency**: Logical structure and argument continuity
            - **Hypothesis Framing**: Research aim clarity
            - **Thematic Grounding**: Domain focus and relevance
            - **Citation & Structure**: Scientific formatting
            - **Symbolism & Math Logic**: Mathematical rigor
            - **Scientific Utility**: Real-world research value
            """)

            with gr.Row():
                scieval_dropdown = gr.Dropdown(
                    choices=list(SCIEVAL_METRICS.keys()),
                    label="Select Model for SCIEVAL",
                    value="Nexa Mistral Sci-7B"
                )
                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")

            with gr.Row():
                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")

            scieval_btn.click(
                display_scieval,
                inputs=scieval_dropdown,
                outputs=[scieval_plot, scieval_metrics]
            )
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            ### Scientific ML Model Leaderboard
            Current best-performing models across all evaluated domains and metrics.
            """)

            leaderboard_df = create_leaderboard()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                label="Current Leaders by Domain",
                interactive=False
            )
        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ### About the Scientific ML Benchmark Suite
            This evaluation framework combines two complementary assessment methodologies:

            #### 🎯 Domain Benchmarks
            - **Proteins**: Secondary/tertiary structure prediction accuracy
            - **Astronomy**: Object classification and detection
            - **Materials**: Property prediction and discovery
            - **QST**: Quantum state tomography reconstruction
            - **HEP**: High energy physics event classification
            - **CFD**: Computational fluid dynamics modeling

            #### 🔬 SCIEVAL Framework
            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
            - **Domain Adaptation**: Field-specific evaluation extensions
            - **Research Utility**: Assessment of real-world scientific value

            **OSIR-Field Extensions:**
            - `osir-field-physics`: Physics-specific reasoning evaluation
            - `osir-field-bio`: Biological sciences assessment
            - `osir-field-chem`: Chemistry domain evaluation
            - `osir-field-cs`: Computer science applications

            #### 📊 Scoring System
            - **Domain Benchmarks**: 0.0–1.0 scale (higher is better)
            - **SCIEVAL Metrics**: 1–10 scale across seven dimensions

            #### 🤝 Contributing
            This is an open framework welcoming contributions:
            - New domain-specific test sets
            - Additional evaluation metrics
            - Model submissions for benchmarking

            #### 📄 Citation
            ```
            @misc{scieval2024,
              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
              author={NEXA Research},
              year={2025},
              url={https://huggingface.co/spaces/osir/scieval}
            }
            ```

            ---
            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
            """)
    # Initialize with default values (registered inside the Blocks context)
    demo.load(
        lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
        outputs=[domain_plot, domain_metrics]
    )
    demo.load(
        lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
                 json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
        outputs=[scieval_plot, scieval_metrics]
    )
if __name__ == "__main__":
    demo.launch()