import gradio as gr
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import json
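# Gradio app for the Scientific ML Benchmark Suite: domain-specific benchmark scores,
# SCIEVAL/OSIR scientific-reasoning metrics, and a combined leaderboard.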
# Domain-specific model evaluations
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {
        "Nexa Astro": 0.97,
        "Baseline CNN": 0.89,
    },
    "Materials": {
        "Nexa Materials": 0.9999,
        "Random Forest Baseline": 0.92,
    },
    "QST": {
        "Nexa PIN Model": 0.80,
        "Quantum TomoNet": 0.85,
    },
    "HEP": {
        "Nexa HEP Model": 0.91,
        "CMSNet": 0.94,
    },
    "CFD": {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
}
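# Note: MODEL_EVALS scores are static values on a 0.0-1.0 scale (higher is better),
# as described in the About tab.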
# SCIEVAL/OSIR metrics data
SCIEVAL_METRICS = {
    "Nexa Mistral Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.2,
            "Internal Consistency": 8.5,
            "Hypothesis Framing": 6.8,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 7.3,
            "Symbolism & Math Logic": 6.1,
            "Scientific Utility": 7.6,
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.1,
            "Internal Consistency": 8.9,
            "Hypothesis Framing": 7.4,
            "Thematic Grounding": 8.2,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 7.8,
            "Scientific Utility": 8.3,
        },
    },
    "Nexa-Llama-Sci-7B": {
        "OSIR (General)": {
            "Entropy / Novelty": 6.7,
            "Internal Consistency": 7.8,
            "Hypothesis Framing": 7.5,
            "Thematic Grounding": 7.9,
            "Citation & Structure": 6.5,
            "Symbolism & Math Logic": 5.9,
            "Scientific Utility": 7.0,
        },
        "OSIR-Field (Physics)": {
            "Entropy / Novelty": 7.0,
            "Internal Consistency": 8.0,
            "Hypothesis Framing": 7.8,
            "Thematic Grounding": 8.1,
            "Citation & Structure": 6.2,
            "Symbolism & Math Logic": 6.5,
            "Scientific Utility": 7.4,
        },
    },
}
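# Note: SCIEVAL_METRICS scores are on a 1-10 scale across the seven OSIR dimensions
# listed in the SCIEVAL Metrics tab; each model has a general OSIR profile and a
# physics-specific OSIR-Field profile.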
def plot_domain_benchmark(domain):
    """Create horizontal bar chart for domain-specific benchmarks"""
    models = list(MODEL_EVALS[domain].keys())
    scores = list(MODEL_EVALS[domain].values())

    # Color coding for Nexa models vs others
    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=models,
        x=scores,
        orientation='h',
        marker_color=colors,
        text=[f'{score:.3f}' for score in scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"Model Benchmark Scores – {domain}",
        yaxis_title="Model",
        xaxis_title="Score",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        showlegend=False
    )
    return fig
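# Example (standalone use outside the app, e.g. for a quick check in a notebook):
#   plot_domain_benchmark("Proteins").show()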
def plot_scieval_comparison(model_name):
    """Create horizontal comparison chart for SCIEVAL metrics"""
    if model_name not in SCIEVAL_METRICS:
        return go.Figure()

    metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
    osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
    field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name='OSIR (General)',
        y=metrics,
        x=osir_scores,
        orientation='h',
        marker_color='#FFD700',
        text=[f'{score:.1f}' for score in osir_scores],
        textposition='auto'
    ))
    fig.add_trace(go.Bar(
        name='OSIR-Field (Physics)',
        y=metrics,
        x=field_scores,
        orientation='h',
        marker_color='#FF6B35',
        text=[f'{score:.1f}' for score in field_scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"SCIEVAL Metrics Comparison – {model_name}",
        yaxis_title="Metric",
        xaxis_title="Score (1-10)",
        xaxis_range=[0, 10],
        template="plotly_white",
        height=500,
        barmode='group'
    )
    return fig
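# The two OSIR profiles are drawn as grouped horizontal bars so each metric can be read
# side by side; a go.Scatterpolar radar chart would be a possible alternative presentation.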
def create_leaderboard():
    """Create leaderboard table"""
    leaderboard_data = []

    # Best model per domain benchmark
    for domain, models in MODEL_EVALS.items():
        best_model = max(models.items(), key=lambda x: x[1])
        leaderboard_data.append({
            "Domain": domain,
            "Best Model": best_model[0],
            "Score": f"{best_model[1]:.3f}",
            "Metric Type": "Domain Benchmark"
        })

    # Average SCIEVAL scores for each evaluated model (general and physics profiles)
    for model, evaluations in SCIEVAL_METRICS.items():
        avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
        avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
        leaderboard_data.append({
            "Domain": "OSIR General",
            "Best Model": model,
            "Score": f"{avg_osir:.2f}",
            "Metric Type": "SCIEVAL"
        })
        leaderboard_data.append({
            "Domain": "OSIR Physics",
            "Best Model": model,
            "Score": f"{avg_field:.2f}",
            "Metric Type": "SCIEVAL"
        })

    df = pd.DataFrame(leaderboard_data)
    return df
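# Example (standalone use):
#   print(create_leaderboard().to_string(index=False))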
def get_model_details(domain):
    """Get JSON details for domain models"""
    return json.dumps(MODEL_EVALS[domain], indent=2)

def display_domain_eval(domain):
    """Display domain evaluation results"""
    plot = plot_domain_benchmark(domain)
    details = get_model_details(domain)
    return plot, details

def display_scieval(model_name):
    """Display SCIEVAL results"""
    plot = plot_scieval_comparison(model_name)
    if model_name in SCIEVAL_METRICS:
        details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
    else:
        details = "Model not found in SCIEVAL database"
    return plot, details
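# Each display_* helper returns a (figure, JSON string) pair that maps onto the
# gr.Plot / gr.Code output pairs wired up in the interface below.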
# Create Gradio interface
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔬 Scientific ML Benchmark Suite
    ### Comprehensive evaluation framework for scientific machine learning models

    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
    a comprehensive assessment of ML models across scientific disciplines.
    """)
    with gr.Tabs():
        # Domain Benchmarks Tab
        with gr.TabItem("🧪 Domain Benchmarks"):
            gr.Markdown("""
            ### Domain-Specific Model Evaluations
            Compare models across scientific domains including Proteins, Astronomy, Materials Science,
            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
            """)

            with gr.Row():
                domain_dropdown = gr.Dropdown(
                    choices=list(MODEL_EVALS.keys()),
                    label="Select Scientific Domain",
                    value="Proteins"
                )
                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")

            with gr.Row():
                domain_plot = gr.Plot(label="Domain Benchmark Results")
                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")

            domain_btn.click(
                display_domain_eval,
                inputs=domain_dropdown,
                outputs=[domain_plot, domain_metrics]
            )
        # SCIEVAL Tab
        with gr.TabItem("📊 SCIEVAL Metrics"):
            gr.Markdown("""
            ### SCIEVAL: Scientific Reasoning Evaluation
            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.

            **Metrics evaluated:**
            - **Entropy/Novelty**: Originality and information richness
            - **Internal Consistency**: Logical structure and argument continuity
            - **Hypothesis Framing**: Research aim clarity
            - **Thematic Grounding**: Domain focus and relevance
            - **Citation & Structure**: Scientific formatting
            - **Symbolism & Math Logic**: Mathematical rigor
            - **Scientific Utility**: Real-world research value
            """)

            with gr.Row():
                scieval_dropdown = gr.Dropdown(
                    choices=list(SCIEVAL_METRICS.keys()),
                    label="Select Model for SCIEVAL",
                    value="Nexa Mistral Sci-7B"
                )
                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")

            with gr.Row():
                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")

            scieval_btn.click(
                display_scieval,
                inputs=scieval_dropdown,
                outputs=[scieval_plot, scieval_metrics]
            )
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            ### Scientific ML Model Leaderboard
            Current best-performing models across all evaluated domains and metrics.
            """)

            leaderboard_df = create_leaderboard()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                label="Current Leaders by Domain",
                interactive=False
            )
        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ### About the Scientific ML Benchmark Suite
            This evaluation framework combines two complementary assessment methodologies:

            #### 🎯 Domain Benchmarks
            - **Proteins**: Secondary/tertiary structure prediction accuracy
            - **Astronomy**: Object classification and detection
            - **Materials**: Property prediction and discovery
            - **QST**: Quantum state tomography reconstruction
            - **HEP**: High energy physics event classification
            - **CFD**: Computational fluid dynamics modeling

            #### 🔬 SCIEVAL Framework
            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
            - **Domain Adaptation**: Field-specific evaluation extensions
            - **Research Utility**: Assessment of real-world scientific value

            **OSIR-Field Extensions:**
            - `osir-field-physics`: Physics-specific reasoning evaluation
            - `osir-field-bio`: Biological sciences assessment
            - `osir-field-chem`: Chemistry domain evaluation
            - `osir-field-cs`: Computer science applications

            #### 📊 Scoring System
            - **Domain Benchmarks**: 0.0–1.0 scale (higher is better)
            - **SCIEVAL Metrics**: 1–10 scale across seven dimensions

            #### 🤝 Contributing
            This is an open framework welcoming contributions:
            - New domain-specific test sets
            - Additional evaluation metrics
            - Model submissions for benchmarking

            #### 📄 Citation
            ```
            @misc{scieval2024,
              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
              author={NEXA Research},
              year={2025},
              url={https://huggingface.co/spaces/osir/scieval}
            }
            ```

            ---
            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
            """)
    # Initialize with default values (registered inside the Blocks context)
    demo.load(
        lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
        outputs=[domain_plot, domain_metrics]
    )
    demo.load(
        lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
                 json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
        outputs=[scieval_plot, scieval_metrics]
    )
if __name__ == "__main__":
    demo.launch()