Spaces:

Allanatrix
/

NexaEvals

Sleeping

App Files Files Community

Allanatrix committed on Jul 4

Commit

ff9ab22

verified ·

1 Parent(s): 81d7a7c

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -289

app.py CHANGED Viewed

@@ -1,10 +1,8 @@
 import gradio as gr
 import plotly.graph_objs as go
-import plotly.express as px
-import pandas as pd
 import json
-# Domain-specific model evaluations
 MODEL_EVALS = {
     "Proteins": {
         "Nexa Bio1 (Secondary)": 0.71,
@@ -33,310 +31,141 @@ MODEL_EVALS = {
         "Nexa CFD Model": 0.92,
         "FlowNet": 0.89,
     },
-}
-# SCIEVAL/OSIR metrics data
-SCIEVAL_METRICS = {
-    "Nexa Mistral Sci-7B": {
-        "OSIR (General)": {
-            "Entropy / Novelty": 6.2,
-            "Internal Consistency": 8.5,
-            "Hypothesis Framing": 6.8,
-            "Thematic Grounding": 7.9,
-            "Citation & Structure": 7.3,
-            "Symbolism & Math Logic": 6.1,
-            "Scientific Utility": 7.6
         },
-        "OSIR-Field (Physics)": {
-            "Entropy / Novelty": 7.1,
-            "Internal Consistency": 8.9,
-            "Hypothesis Framing": 7.4,
-            "Thematic Grounding": 8.2,
-            "Citation & Structure": 6.5,
-            "Symbolism & Math Logic": 7.8,
-            "Scientific Utility": 8.3
         }
     }
 }
-def plot_domain_benchmark(domain):
-    """Create horizontal bar chart for domain-specific benchmarks"""
-    models = list(MODEL_EVALS[domain].keys())
-    scores = list(MODEL_EVALS[domain].values())
-    # Color coding for Nexa models vs others
-    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]
-    fig = go.Figure()
-    fig.add_trace(go.Bar(
-        y=models,
-        x=scores,
-        orientation='h',
-        marker_color=colors,
-        text=[f'{score:.3f}' for score in scores],
-        textposition='auto'
-    ))
-    fig.update_layout(
-        title=f"Model Benchmark Scores — {domain}",
-        yaxis_title="Model",
-        xaxis_title="Score",
-        xaxis_range=[0, 1.0],
-        template="plotly_white",
-        height=500,
-        showlegend=False
-    )
-    return fig
-def plot_scieval_comparison(model_name):
-    """Create horizontal comparison chart for SCIEVAL metrics"""
-    if model_name not in SCIEVAL_METRICS:
-        return go.Figure()
-    metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
-    osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
-    field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())
     fig = go.Figure()
-    fig.add_trace(go.Bar(
-        name='OSIR (General)',
-        y=metrics,
-        x=osir_scores,
-        orientation='h',
-        marker_color='#FFD700',
-        text=[f'{score:.1f}' for score in osir_scores],
-        textposition='auto'
-    ))
-    fig.add_trace(go.Bar(
-        name='OSIR-Field (Physics)',
-        y=metrics,
-        x=field_scores,
-        orientation='h',
-        marker_color='#FF6B35',
-        text=[f'{score:.1f}' for score in field_scores],
-        textposition='auto'
-    ))
-    fig.update_layout(
-        title=f"SCIEVAL Metrics Comparison — {model_name}",
-        yaxis_title="Metric",
-        xaxis_title="Score (1-10)",
-        xaxis_range=[0, 10],
-        template="plotly_white",
-        height=500,
-        barmode='group'
-    )
-    return fig
-def create_leaderboard():
-    """Create leaderboard table"""
-    leaderboard_data = []
-    # Add domain benchmark leaders
-    for domain, models in MODEL_EVALS.items():
-        best_model = max(models.items(), key=lambda x: x[1])
-        leaderboard_data.append({
-            "Domain": domain,
-            "Best Model": best_model[0],
-            "Score": f"{best_model[1]:.3f}",
-            "Metric Type": "Domain Benchmark"
-        })
-    # Add SCIEVAL leaders
-    for model, evaluations in SCIEVAL_METRICS.items():
-        avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
-        avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
-        leaderboard_data.append({
-            "Domain": "OSIR General",
-            "Best Model": model,
-            "Score": f"{avg_osir:.2f}",
-            "Metric Type": "SCIEVAL"
-        })
-        leaderboard_data.append({
-            "Domain": "OSIR Physics",
-            "Best Model": model,
-            "Score": f"{avg_field:.2f}",
-            "Metric Type": "SCIEVAL"
-        })
-    df = pd.DataFrame(leaderboard_data)
-    return df
 def get_model_details(domain):
-    """Get JSON details for domain models"""
-    return json.dumps(MODEL_EVALS[domain], indent=2)
-def display_domain_eval(domain):
-    """Display domain evaluation results"""
-    plot = plot_domain_benchmark(domain)
     details = get_model_details(domain)
     return plot, details
-def display_scieval(model_name):
-    """Display SCIEVAL results"""
-    plot = plot_scieval_comparison(model_name)
-    if model_name in SCIEVAL_METRICS:
-        details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
-    else:
-        details = "Model not found in SCIEVAL database"
-    return plot, details
-# Create Gradio interface
-with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🔬 Scientific ML Benchmark Suite
-    ### Comprehensive evaluation framework for scientific machine learning models
-    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
-    comprehensive assessment of ML models across scientific disciplines.
     """)
-    with gr.Tabs():
-        # Domain Benchmarks Tab
-        with gr.TabItem("🧪 Domain Benchmarks"):
-            gr.Markdown("""
-            ### Domain-Specific Model Evaluations
-            Compare models across scientific domains including Proteins, Astronomy, Materials Science,
-            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
-            """)
-            with gr.Row():
-                domain_dropdown = gr.Dropdown(
-                    choices=list(MODEL_EVALS.keys()),
-                    label="Select Scientific Domain",
-                    value="Proteins"
-                )
-                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
-            with gr.Row():
-                domain_plot = gr.Plot(label="Domain Benchmark Results")
-                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
-            domain_btn.click(
-                display_domain_eval,
-                inputs=domain_dropdown,
-                outputs=[domain_plot, domain_metrics]
-            )
-        # SCIEVAL Tab
-        with gr.TabItem("📊 SCIEVAL Metrics"):
-            gr.Markdown("""
-            ### SCIEVAL: Scientific Reasoning Evaluation
-            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
-            **Metrics evaluated:**
-            - **Entropy/Novelty**: Originality and information richness
-            - **Internal Consistency**: Logical structure and argument continuity
-            - **Hypothesis Framing**: Research aim clarity
-            - **Thematic Grounding**: Domain focus and relevance
-            - **Citation & Structure**: Scientific formatting
-            - **Symbolism & Math Logic**: Mathematical rigor
-            - **Scientific Utility**: Real-world research value
-            """)
-            with gr.Row():
-                scieval_dropdown = gr.Dropdown(
-                    choices=list(SCIEVAL_METRICS.keys()),
-                    label="Select Model for SCIEVAL",
-                    value="Nexa Mistral Sci-7B"
-                )
-                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
-            with gr.Row():
-                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
-                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
-            scieval_btn.click(
-                display_scieval,
-                inputs=scieval_dropdown,
-                outputs=[scieval_plot, scieval_metrics]
-            )
-        # Leaderboard Tab
-        with gr.TabItem("🏆 Leaderboard"):
-            gr.Markdown("""
-            ### Scientific ML Model Leaderboard
-            Current best-performing models across all evaluated domains and metrics.
-            """)
-            leaderboard_df = create_leaderboard()
-            leaderboard_table = gr.Dataframe(
-                value=leaderboard_df,
-                label="Current Leaders by Domain",
-                interactive=False
-            )
-        # About Tab
-        with gr.TabItem("ℹ️ About"):
-            gr.Markdown("""
-            ### About the Scientific ML Benchmark Suite
-            This comprehensive evaluation framework combines two powerful assessment methodologies:
-            #### Full reference gist for explaining the framework: https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3
-            #### 🎯 Domain Benchmarks
-            - **Proteins**: Secondary/tertiary structure prediction accuracy
-            - **Astronomy**: Object classification and detection
-            - **Materials**: Property prediction and discovery
-            - **QST**: Quantum state tomography reconstruction
-            - **HEP**: High energy physics event classification
-            - **CFD**: Computational fluid dynamics modeling
-            #### 🔬 SCIEVAL Framework
-            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
-            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
-            - **Domain Adaptation**: Field-specific evaluation extensions
-            - **Research Utility**: Assessment of real-world scientific value
-            **OSIR-Field Extensions:**
-            - `osir-field-physics`: Physics-specific reasoning evaluation
-            - `osir-field-bio`: Biological sciences assessment
-            - `osir-field-chem`: Chemistry domain evaluation
-            - `osir-field-cs`: Computer science applications
-            #### 📈 Scoring System
-            - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
-            - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
-            #### 🤝 Contributing
-            This is an open framework welcoming contributions:
-            - New domain-specific test sets
-            - Additional evaluation metrics
-            - Model submissions for benchmarking
-            #### 📄 Citation
-            ```
-            @misc{scieval2024,
-              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
-              author={NEXA Research},
-              year={2025},
-              url={https://huggingface.co/spaces/osir/scieval}
-            }
-            ```
-            ---
-            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
-            """)
-    # Initialize with default values
-    demo.load(
-        lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
-        outputs=[domain_plot, domain_metrics]
-    )
-    demo.load(
-        lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
-                json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
-        outputs=[scieval_plot, scieval_metrics]
-    )
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import plotly.graph_objs as go
 import json
+# Expanded MODEL_EVALS including LLM benchmarks with nested JSON scores
 MODEL_EVALS = {
     "Proteins": {
         "Nexa Bio1 (Secondary)": 0.71,
         "Nexa CFD Model": 0.92,
         "FlowNet": 0.89,
     },
+    # Added LLM domain with nested OSIR benchmark scores
+    "LLM": {
+        "Nexa Mistral": {
+            "OSIR (General)": {
+                "Entropy / Novelty": 6.7,
+                "Internal Consistency": 7.8,
+                "Hypothesis Framing": 7.5,
+                "Thematic Grounding": 7.9,
+                "Citation & Structure": 6.5,
+                "Symbolism & Math Logic": 5.9,
+                "Scientific Utility": 7.0
+            },
+            "OSIR-Field (Physics)": {
+                "Entropy / Novelty": 7.0,
+                "Internal Consistency": 8.0,
+                "Hypothesis Framing": 7.8,
+                "Thematic Grounding": 8.1,
+                "Citation & Structure": 6.2,
+                "Symbolism & Math Logic": 6.5,
+                "Scientific Utility": 7.4
+            }
         },
+        "nexa-Llama-sci7b": {
+            "OSIR (General)": {
+                "Entropy / Novelty": 6.2,
+                "Internal Consistency": 8.5,
+                "Hypothesis Framing": 6.8,
+                "Thematic Grounding": 7.9,
+                "Citation & Structure": 7.3,
+                "Symbolism & Math Logic": 6.1,
+                "Scientific Utility": 7.6
+            },
+            "OSIR-Field (Physics)": {
+                "Entropy / Novelty": 7.1,
+                "Internal Consistency": 8.9,
+                "Hypothesis Framing": 7.4,
+                "Thematic Grounding": 8.2,
+                "Citation & Structure": 6.5,
+                "Symbolism & Math Logic": 7.8,
+                "Scientific Utility": 8.3
+            }
         }
     }
 }
+def plot_domain(domain):
+    data = MODEL_EVALS[domain]
     fig = go.Figure()
+    if domain != "LLM":
+        # Simple bar plot for normal domains
+        models = list(data.keys())
+        scores = list(data.values())
+        fig.add_trace(go.Bar(x=models, y=scores, marker_color='indigo'))
+        fig.update_layout(
+            title=f"Model Benchmark Scores — {domain}",
+            xaxis_title="Model",
+            yaxis_title="Score",
+            yaxis_range=[0, 1.0],
+            template="plotly_white",
+            height=500
+        )
+    else:
+        # For LLM domain, plot grouped bars for each model and metric category
+        categories = ["Entropy / Novelty", "Internal Consistency", "Hypothesis Framing",
+                      "Thematic Grounding", "Citation & Structure", "Symbolism & Math Logic", "Scientific Utility"]
+        benchmarks = ["OSIR (General)", "OSIR-Field (Physics)"]
+        x_labels = []
+        bar_data = {model: [] for model in data.keys()}
+        # Construct x-axis labels combining benchmark and category
+        for bench in benchmarks:
+            for cat in categories:
+                x_labels.append(f"{bench}\n{cat}")
+        # Collect scores for each model in order of x_labels
+        for model, bench_data in data.items():
+            scores = []
+            for bench in benchmarks:
+                for cat in categories:
+                    scores.append(bench_data[bench][cat])
+            bar_data[model] = scores
+        # Add bars for each model
+        colors = ['indigo', 'darkorange']
+        for i, (model, scores) in enumerate(bar_data.items()):
+            fig.add_trace(go.Bar(
+                x=x_labels,
+                y=scores,
+                name=model,
+                marker_color=colors[i % len(colors)]
+            ))
+        fig.update_layout(
+            barmode='group',
+            title="LLM Model Benchmark Scores (OSIR Metrics)",
+            xaxis_title="Metric Category",
+            yaxis_title="Score",
+            yaxis_range=[0, 10],
+            template="plotly_white",
+            height=600
+        )
+    return fig
 def get_model_details(domain):
+    # For LLM domain, pretty-print nested JSON; otherwise, simple JSON
+    if domain != "LLM":
+        return json.dumps(MODEL_EVALS[domain], indent=2)
+    else:
+        return json.dumps(MODEL_EVALS[domain], indent=2)
+def display_eval(domain):
+    plot = plot_domain(domain)
     details = get_model_details(domain)
     return plot, details
+domain_list = list(MODEL_EVALS.keys())
+with gr.Blocks(title="Nexa Evals — Scientific ML Benchmark Suite") as demo:
     gr.Markdown("""
+    # 🔬 Nexa Evals
+    A benchmarking suite comparing Nexa models against SOTA across scientific domains.
     """)
+    with gr.Row():
+        domain = gr.Dropdown(domain_list, label="Select Domain")
+        show_btn = gr.Button("Run Evaluation")
+    with gr.Row():
+        plot_output = gr.Plot(label="Benchmark Plot")
+        metrics_output = gr.Code(label="Raw Scores (JSON)", language="json")
+    show_btn.click(display_eval, inputs=domain, outputs=[plot_output, metrics_output])
+demo.launch()