Allanatrix committed
Commit ff9ab22 · verified · 1 Parent(s): 81d7a7c

Update app.py

Files changed (1)
  1. app.py +118 -289
app.py CHANGED
@@ -1,10 +1,8 @@
import gradio as gr
import plotly.graph_objs as go
- import plotly.express as px
- import pandas as pd
import json

- # Domain-specific model evaluations
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
@@ -33,310 +31,141 @@ MODEL_EVALS = {
        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
- }
-
- # SCIEVAL/OSIR metrics data
- SCIEVAL_METRICS = {
-     "Nexa Mistral Sci-7B": {
-         "OSIR (General)": {
-             "Entropy / Novelty": 6.2,
-             "Internal Consistency": 8.5,
-             "Hypothesis Framing": 6.8,
-             "Thematic Grounding": 7.9,
-             "Citation & Structure": 7.3,
-             "Symbolism & Math Logic": 6.1,
-             "Scientific Utility": 7.6
        },
-         "OSIR-Field (Physics)": {
-             "Entropy / Novelty": 7.1,
-             "Internal Consistency": 8.9,
-             "Hypothesis Framing": 7.4,
-             "Thematic Grounding": 8.2,
-             "Citation & Structure": 6.5,
-             "Symbolism & Math Logic": 7.8,
-             "Scientific Utility": 8.3
        }
    }
}

- def plot_domain_benchmark(domain):
-     """Create horizontal bar chart for domain-specific benchmarks"""
-     models = list(MODEL_EVALS[domain].keys())
-     scores = list(MODEL_EVALS[domain].values())
-
-     # Color coding for Nexa models vs others
-     colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]
-
-     fig = go.Figure()
-     fig.add_trace(go.Bar(
-         y=models,
-         x=scores,
-         orientation='h',
-         marker_color=colors,
-         text=[f'{score:.3f}' for score in scores],
-         textposition='auto'
-     ))
-
-     fig.update_layout(
-         title=f"Model Benchmark Scores — {domain}",
-         yaxis_title="Model",
-         xaxis_title="Score",
-         xaxis_range=[0, 1.0],
-         template="plotly_white",
-         height=500,
-         showlegend=False
-     )
-     return fig

- def plot_scieval_comparison(model_name):
-     """Create horizontal comparison chart for SCIEVAL metrics"""
-     if model_name not in SCIEVAL_METRICS:
-         return go.Figure()
-
-     metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
-     osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
-     field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())
-
    fig = go.Figure()
-
-     fig.add_trace(go.Bar(
-         name='OSIR (General)',
-         y=metrics,
-         x=osir_scores,
-         orientation='h',
-         marker_color='#FFD700',
-         text=[f'{score:.1f}' for score in osir_scores],
-         textposition='auto'
-     ))
-
-     fig.add_trace(go.Bar(
-         name='OSIR-Field (Physics)',
-         y=metrics,
-         x=field_scores,
-         orientation='h',
-         marker_color='#FF6B35',
-         text=[f'{score:.1f}' for score in field_scores],
-         textposition='auto'
-     ))
-
-     fig.update_layout(
-         title=f"SCIEVAL Metrics Comparison — {model_name}",
-         yaxis_title="Metric",
-         xaxis_title="Score (1-10)",
-         xaxis_range=[0, 10],
-         template="plotly_white",
-         height=500,
-         barmode='group'
-     )
-     return fig

- def create_leaderboard():
-     """Create leaderboard table"""
-     leaderboard_data = []
-
-     # Add domain benchmark leaders
-     for domain, models in MODEL_EVALS.items():
-         best_model = max(models.items(), key=lambda x: x[1])
-         leaderboard_data.append({
-             "Domain": domain,
-             "Best Model": best_model[0],
-             "Score": f"{best_model[1]:.3f}",
-             "Metric Type": "Domain Benchmark"
-         })
-
-     # Add SCIEVAL leaders
-     for model, evaluations in SCIEVAL_METRICS.items():
-         avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
-         avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
-
-         leaderboard_data.append({
-             "Domain": "OSIR General",
-             "Best Model": model,
-             "Score": f"{avg_osir:.2f}",
-             "Metric Type": "SCIEVAL"
-         })

-         leaderboard_data.append({
-             "Domain": "OSIR Physics",
-             "Best Model": model,
-             "Score": f"{avg_field:.2f}",
-             "Metric Type": "SCIEVAL"
-         })
-
-     df = pd.DataFrame(leaderboard_data)
-     return df

def get_model_details(domain):
-     """Get JSON details for domain models"""
-     return json.dumps(MODEL_EVALS[domain], indent=2)

- def display_domain_eval(domain):
-     """Display domain evaluation results"""
-     plot = plot_domain_benchmark(domain)
    details = get_model_details(domain)
    return plot, details

- def display_scieval(model_name):
-     """Display SCIEVAL results"""
-     plot = plot_scieval_comparison(model_name)
-     if model_name in SCIEVAL_METRICS:
-         details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
-     else:
-         details = "Model not found in SCIEVAL database"
-     return plot, details

- # Create Gradio interface
- with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
-     # 🔬 Scientific ML Benchmark Suite
-     ### Comprehensive evaluation framework for scientific machine learning models
-
-     This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
-     comprehensive assessment of ML models across scientific disciplines.
    """)
-
-     with gr.Tabs():
-         # Domain Benchmarks Tab
-         with gr.TabItem("🧪 Domain Benchmarks"):
-             gr.Markdown("""
-             ### Domain-Specific Model Evaluations
-             Compare models across scientific domains including Proteins, Astronomy, Materials Science,
-             Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
-             """)
-
-             with gr.Row():
-                 domain_dropdown = gr.Dropdown(
-                     choices=list(MODEL_EVALS.keys()),
-                     label="Select Scientific Domain",
-                     value="Proteins"
-                 )
-                 domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
-
-             with gr.Row():
-                 domain_plot = gr.Plot(label="Domain Benchmark Results")
-                 domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
-
-             domain_btn.click(
-                 display_domain_eval,
-                 inputs=domain_dropdown,
-                 outputs=[domain_plot, domain_metrics]
-             )
-
-         # SCIEVAL Tab
-         with gr.TabItem("📊 SCIEVAL Metrics"):
-             gr.Markdown("""
-             ### SCIEVAL: Scientific Reasoning Evaluation
-             Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
-
-             **Metrics evaluated:**
-             **Entropy/Novelty**: Originality and information richness
-             **Internal Consistency**: Logical structure and argument continuity
-             **Hypothesis Framing**: Research aim clarity
-             **Thematic Grounding**: Domain focus and relevance
-             **Citation & Structure**: Scientific formatting
-             **Symbolism & Math Logic**: Mathematical rigor
-             **Scientific Utility**: Real-world research value
-             """)
-
-             with gr.Row():
-                 scieval_dropdown = gr.Dropdown(
-                     choices=list(SCIEVAL_METRICS.keys()),
-                     label="Select Model for SCIEVAL",
-                     value="Nexa Mistral Sci-7B"
-                 )
-                 scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
-
-             with gr.Row():
-                 scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
-                 scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
-
-             scieval_btn.click(
-                 display_scieval,
-                 inputs=scieval_dropdown,
-                 outputs=[scieval_plot, scieval_metrics]
-             )
-
-         # Leaderboard Tab
-         with gr.TabItem("🏆 Leaderboard"):
-             gr.Markdown("""
-             ### Scientific ML Model Leaderboard
-             Current best-performing models across all evaluated domains and metrics.
-             """)
-
-             leaderboard_df = create_leaderboard()
-             leaderboard_table = gr.Dataframe(
-                 value=leaderboard_df,
-                 label="Current Leaders by Domain",
-                 interactive=False
-             )
-
-         # About Tab
-         with gr.TabItem("ℹ️ About"):
-             gr.Markdown("""
-             ### About the Scientific ML Benchmark Suite
-
-             This comprehensive evaluation framework combines two powerful assessment methodologies:

-             #### Full reference gist for explaining the framework: https://gist.github.com/DarkStarStrix/4a2f4f91b8148e35574fc696ab9715e3

-
-             #### 🎯 Domain Benchmarks
-             **Proteins**: Secondary/tertiary structure prediction accuracy
-             **Astronomy**: Object classification and detection
-             **Materials**: Property prediction and discovery
-             **QST**: Quantum state tomography reconstruction
-             **HEP**: High energy physics event classification
-             **CFD**: Computational fluid dynamics modeling
-
-             #### 🔬 SCIEVAL Framework
-             SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
-
-             **Standardized Evaluation**: Reproducible metrics for scientific LLMs
-             **Domain Adaptation**: Field-specific evaluation extensions
-             **Research Utility**: Assessment of real-world scientific value
-
-             **OSIR-Field Extensions:**
-             `osir-field-physics`: Physics-specific reasoning evaluation
-             `osir-field-bio`: Biological sciences assessment
-             `osir-field-chem`: Chemistry domain evaluation
-             `osir-field-cs`: Computer science applications
-
-             #### 📈 Scoring System
-             **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
-             **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
-
-             #### 🤝 Contributing
-             This is an open framework welcoming contributions:
-             New domain-specific test sets
-             Additional evaluation metrics
-             Model submissions for benchmarking
-
-             #### 📄 Citation
-             ```
-             @misc{scieval2024,
-             title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
-             author={NEXA Research},
-             year={2025},
-             url={https://huggingface.co/spaces/osir/scieval}
-             }
-             ```
-
-             ---
-
-             **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
-             """)
-
-     # Initialize with default values
-     demo.load(
-         lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
-         outputs=[domain_plot, domain_metrics]
-     )
-
-     demo.load(
-         lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
-         json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
-         outputs=[scieval_plot, scieval_metrics]
-     )

- if __name__ == "__main__":
-     demo.launch()

import gradio as gr
import plotly.graph_objs as go
import json

+ # Expanded MODEL_EVALS including LLM benchmarks with nested JSON scores
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,

        "Nexa CFD Model": 0.92,
        "FlowNet": 0.89,
    },
+     # Added LLM domain with nested OSIR benchmark scores
+     "LLM": {
+         "Nexa Mistral": {
+             "OSIR (General)": {
+                 "Entropy / Novelty": 6.7,
+                 "Internal Consistency": 7.8,
+                 "Hypothesis Framing": 7.5,
+                 "Thematic Grounding": 7.9,
+                 "Citation & Structure": 6.5,
+                 "Symbolism & Math Logic": 5.9,
+                 "Scientific Utility": 7.0
+             },
+             "OSIR-Field (Physics)": {
+                 "Entropy / Novelty": 7.0,
+                 "Internal Consistency": 8.0,
+                 "Hypothesis Framing": 7.8,
+                 "Thematic Grounding": 8.1,
+                 "Citation & Structure": 6.2,
+                 "Symbolism & Math Logic": 6.5,
+                 "Scientific Utility": 7.4
+             }
        },
+         "nexa-Llama-sci7b": {
+             "OSIR (General)": {
+                 "Entropy / Novelty": 6.2,
+                 "Internal Consistency": 8.5,
+                 "Hypothesis Framing": 6.8,
+                 "Thematic Grounding": 7.9,
+                 "Citation & Structure": 7.3,
+                 "Symbolism & Math Logic": 6.1,
+                 "Scientific Utility": 7.6
+             },
+             "OSIR-Field (Physics)": {
+                 "Entropy / Novelty": 7.1,
+                 "Internal Consistency": 8.9,
+                 "Hypothesis Framing": 7.4,
+                 "Thematic Grounding": 8.2,
+                 "Citation & Structure": 6.5,
+                 "Symbolism & Math Logic": 7.8,
+                 "Scientific Utility": 8.3
+             }
        }
    }
}

+ def plot_domain(domain):
+     data = MODEL_EVALS[domain]

    fig = go.Figure()

+     if domain != "LLM":
+         # Simple bar plot for normal domains
+         models = list(data.keys())
+         scores = list(data.values())
+         fig.add_trace(go.Bar(x=models, y=scores, marker_color='indigo'))
+         fig.update_layout(
+             title=f"Model Benchmark Scores {domain}",
+             xaxis_title="Model",
+             yaxis_title="Score",
+             yaxis_range=[0, 1.0],
+             template="plotly_white",
+             height=500
+         )
+     else:
+         # For LLM domain, plot grouped bars for each model and metric category
+         categories = ["Entropy / Novelty", "Internal Consistency", "Hypothesis Framing",
+                       "Thematic Grounding", "Citation & Structure", "Symbolism & Math Logic", "Scientific Utility"]
+         benchmarks = ["OSIR (General)", "OSIR-Field (Physics)"]
+
+         x_labels = []
+         bar_data = {model: [] for model in data.keys()}

+         # Construct x-axis labels combining benchmark and category
+         for bench in benchmarks:
+             for cat in categories:
+                 x_labels.append(f"{bench}\n{cat}")
+
+         # Collect scores for each model in order of x_labels
+         for model, bench_data in data.items():
+             scores = []
+             for bench in benchmarks:
+                 for cat in categories:
+                     scores.append(bench_data[bench][cat])
+             bar_data[model] = scores
+
+         # Add bars for each model
+         colors = ['indigo', 'darkorange']
+         for i, (model, scores) in enumerate(bar_data.items()):
+             fig.add_trace(go.Bar(
+                 x=x_labels,
+                 y=scores,
+                 name=model,
+                 marker_color=colors[i % len(colors)]
+             ))
+
+         fig.update_layout(
+             barmode='group',
+             title="LLM Model Benchmark Scores (OSIR Metrics)",
+             xaxis_title="Metric Category",
+             yaxis_title="Score",
+             yaxis_range=[0, 10],
+             template="plotly_white",
+             height=600
+         )
+
+     return fig

def get_model_details(domain):
+     # For LLM domain, pretty-print nested JSON; otherwise, simple JSON
+     if domain != "LLM":
+         return json.dumps(MODEL_EVALS[domain], indent=2)
+     else:
+         return json.dumps(MODEL_EVALS[domain], indent=2)

+ def display_eval(domain):
+     plot = plot_domain(domain)
    details = get_model_details(domain)
    return plot, details

+ domain_list = list(MODEL_EVALS.keys())

+ with gr.Blocks(title="Nexa Evals — Scientific ML Benchmark Suite") as demo:
    gr.Markdown("""
+     # 🔬 Nexa Evals
+     A benchmarking suite comparing Nexa models against SOTA across scientific domains.
    """)

+     with gr.Row():
+         domain = gr.Dropdown(domain_list, label="Select Domain")
+         show_btn = gr.Button("Run Evaluation")

+     with gr.Row():
+         plot_output = gr.Plot(label="Benchmark Plot")
+         metrics_output = gr.Code(label="Raw Scores (JSON)", language="json")
+
+     show_btn.click(display_eval, inputs=domain, outputs=[plot_output, metrics_output])

+ demo.launch()
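
For reviewers who want to see what the new LLM branch of `plot_domain()` produces without launching the Space, here is a minimal standalone sketch of the same grouped-bar construction: one x position per benchmark/category pair, one trace per model. The trimmed score dict, the shortened benchmark and category lists, and the `llm_osir_sketch.html` output name are illustrative stand-ins, not values from this commit.

```python
# Standalone sketch of the grouped-bar path that plot_domain() takes for the
# "LLM" domain. The scores below are placeholders, not the committed values.
import plotly.graph_objs as go

llm_scores = {
    "Model A": {"OSIR (General)": {"Entropy / Novelty": 6.5, "Scientific Utility": 7.0}},
    "Model B": {"OSIR (General)": {"Entropy / Novelty": 6.0, "Scientific Utility": 7.5}},
}
benchmarks = ["OSIR (General)"]
categories = ["Entropy / Novelty", "Scientific Utility"]

# One x-axis label per (benchmark, category) pair, matching the app's layout.
x_labels = [f"{bench}\n{cat}" for bench in benchmarks for cat in categories]

fig = go.Figure()
for model, bench_data in llm_scores.items():
    scores = [bench_data[bench][cat] for bench in benchmarks for cat in categories]
    fig.add_trace(go.Bar(x=x_labels, y=scores, name=model))

fig.update_layout(barmode="group", yaxis_range=[0, 10], template="plotly_white")
fig.write_html("llm_osir_sketch.html")  # hypothetical output file for offline inspection
```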