Commit 1b0c644
Parent: 3ac4c37
Fixed issue with charts loading slowly.
app.py CHANGED
@@ -154,6 +154,7 @@ df_full_rs = df_full_rs.apply(add_link, axis=1)
 
 df_full_rs = df_full_rs.drop(columns=['Ko-Bench', 'link', 'organization'])
 
+
 # dataframe
 df_rs['MT-Bench'] = '' # add MT-Bench column
 df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)

@@ -285,10 +286,10 @@ def search_openai_plot(dropdown_model): # define openai plot function
     openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
 
     category_labels = []
-    category_labels.append(openai_top_model + "
-    category_labels.append(openai_top_model + "
-    category_labels.append(dropdown_model + "
-    category_labels.append(dropdown_model + "
+    category_labels.append(openai_top_model + " (Turn 1)")
+    category_labels.append(openai_top_model + " (Turn 2)")
+    category_labels.append(dropdown_model + " (Turn 1)")
+    category_labels.append(dropdown_model + " (Turn 2)")
 
     fig = radar_chart(CATEGORIES, top1_openai_turn1, top1_openai_turn2, openai_turn1, openai_turn2, category_labels,"openai")
     return fig

@@ -310,10 +311,10 @@ def search_keval_plot(dropdown_model): # define keval plot function
     keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()
 
     category_labels = []
-    category_labels.append(keval_top_model + "
-    category_labels.append(keval_top_model + "
-    category_labels.append(dropdown_model + "
-    category_labels.append(dropdown_model + "
+    category_labels.append(keval_top_model + " (Turn 1)")
+    category_labels.append(keval_top_model + " (Turn 2)")
+    category_labels.append(dropdown_model + " (Turn 1)")
+    category_labels.append(dropdown_model + " (Turn 2)")
 
     fig = radar_chart(CATEGORIES, top1_keval_turn1, top1_keval_turn2, keval_turn1, keval_turn2, category_labels, "keval")
     return fig

@@ -327,13 +328,13 @@ def plot_average():
 
     # gpt-4o
     fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/openai'], mode='lines+markers',
-                             name=f'gpt-4o(Average)',
+                             name=f'gpt-4o (Average)',
                              line=dict(color=colors[0][0], dash='dash'),
                              marker=dict(symbol='x', size=10)))
 
     # keval
     fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/keval'], mode='lines+markers',
-                             name=f'keval(Average)',
+                             name=f'keval (Average)',
                              line=dict(color=colors[0][1]),
                              marker=dict(symbol='circle', size=10)))
 

@@ -352,44 +353,48 @@ def plot_average():
 
 #gradio
 with gr.Blocks(css='assets/leaderboard.css') as demo:
-    gr.
-    [… removed lines lost in page extraction …]
-    with gr.TabItem("Openai Judgment"):
-        gr.Dataframe(value=df_openai,
-                     datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
-    with gr.TabItem("Keval Judgment"):
-        gr.Dataframe(value=df_keval,
-                     datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
-    with gr.TabItem("Model Detail View"):
-        with gr.Blocks():
-            with gr.Row():
-                dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
+    with gr.Blocks():
+        gr.Markdown("")
+        gr.Markdown("# 🏆 Ko-Bench Leaderboard")
+        gr.Markdown("")
+        gr.Markdown(
+            "#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
+        gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
+        gr.Markdown(
+            "- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
+        gr.Markdown(
+            "- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
+        gr.Markdown("")
+        gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
+        gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
+        gr.Markdown("")
+
+    with gr.Blocks():
+        with gr.Row():
+            with gr.TabItem("Ko-Bench"):
                 with gr.Row():
-    [… removed lines lost in page extraction …]
+                    gr.Dataframe(value=df_full_rs,
+                                 datatype=['html' if col == 'model' else 'markdown' for col in df_full_rs.columns])
                 with gr.Row():
-    [… removed lines lost in page extraction …]
+                    avg = plot_average()
+                    gr.Plot(avg)
+            with gr.TabItem("Openai Judgment"):
+                gr.Dataframe(value=df_openai,
+                             datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
+            with gr.TabItem("Keval Judgment"):
+                gr.Dataframe(value=df_keval,
+                             datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
+            with gr.TabItem("Model Detail View"):
+                with gr.Blocks():
+                    with gr.Row():
+                        dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
+                    with gr.Row():
+                        dataframe = gr.Dataframe(label="Model Detail View")
+                        dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
+                    with gr.Row():
+                        plot_openai = gr.Plot(label="Openai Plot")
+                        dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
+                        plot_keval = gr.Plot(label="Keval Plot")
+                        dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
+
+demo.launch(share=True, server_name="0.0.0.0", server_port=7860, debug=True)
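
The change that matches the commit message is in the new "Ko-Bench" tab: plot_average() is called once while the Blocks layout is built and the resulting figure is handed straight to gr.Plot, so the average chart is rendered from a precomputed figure instead of being rebuilt on every page load. A minimal sketch of that pattern, with a placeholder plot_average() body standing in for the app's real implementation:

import gradio as gr
import plotly.graph_objects as go

def plot_average():
    # Placeholder data; the real app plots Ko-Bench average scores per model.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=['model-a', 'model-b'], y=[7.1, 8.3],
                             mode='lines+markers', name='gpt-4o (Average)'))
    return fig

with gr.Blocks() as demo:
    with gr.Row():
        # Compute the figure once at build time, as the commit does,
        # rather than re-running plot_average() through a callback.
        avg = plot_average()
        gr.Plot(avg)

demo.launch()

The per-model radar charts, by contrast, stay behind dropdown.change callbacks, since they depend on the model selected at runtime.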