Commit 174062d · add claud-3-5 results
Parent: 731e515

Files changed:
- app.py (+157, -48)
- ko_bench.csv (+92, -88)
app.py
CHANGED
@@ -3,27 +3,43 @@ import pandas as pd
 import numpy as np
 import random
 import plotly.graph_objects as go
+from bs4 import BeautifulSoup
+import plotly.express as px

 file_result_score = 'ko_bench.csv'

 file_full_lb = 'mt_bench_240805.csv'


+def add_hf_link(row):
+    organization, model = row['model'].split('__')
+    if organization.lower() not in ['google', 'openai', 'anthropic']:
+        row['link'] = f"https://huggingface.co/{organization}/{model}"
+    if organization.lower() == 'google' and 'gemini' in model:
+        row['link'] = "https://ai.google.dev/gemini-api"
+    return row
+
 # read csv
 df_result_score = pd.read_csv(file_result_score)
 df_full_lb = pd.read_csv(file_full_lb)

-…
 # dataframe
 df = pd.DataFrame(df_result_score)
+df['model'] = df['model'].str.split('__').str[1]
+
 df_rs = pd.DataFrame(df_result_score)
+df_rs['link'] = ''
+df_rs = df_rs.apply(add_hf_link, axis=1)
+df_rs['organization'] = df_rs['model'].str.split('__').str[0]
+df_rs['model'] = df_rs['model'].str.split('__').str[1]
 df_full_lboard = pd.DataFrame(df_full_lb)

 df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True)  # change MT-bench's GPT-4-1106-preview to gpt-4-0125-preview
-models = df_full_lboard['Model'].unique()  # models list used when adding columns
 df_rs.replace("", np.nan, inplace=True)  # merge turn 1/2 scores per model

 def custom_mean(series):
+    if series.name == 'link' or series.name == 'organization':
+        return series.values[0]
     numeric_series = pd.to_numeric(series, errors='coerce')  # convert the series to numeric
     return numeric_series.mean() if not numeric_series.isna().all() else np.nan  # compute the mean if at least one non-NaN value exists

@@ -34,7 +50,8 @@ def get_mt_bench(model):  # function to match models case-insensitively
         return matching_rows['MT-bench (score)'].values[0]
     return ''

-def get_organization(model):  # function to match models case-insensitively
+def get_organization(row):  # function to match models case-insensitively
+    model = row['model']
     if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
         return 'Mistral'
     elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
@@ -44,13 +61,32 @@ def get_organization(model):  # function to match models case-insensitively
     matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
     if not matching_rows.empty:
         return matching_rows['Organization'].values[0]
-…
+
+    if row['organization'] != '' and pd.notna(row['organization']):
+        organization = row['organization'].lower()
+        if organization == 'qwen':
+            return 'Alibaba'
+        elif organization == 'google':
+            return 'Google'
+        elif organization == 'lgai-exaone':
+            return 'LGAI'
+
+    return row['organization']

 def get_license(model):  # function to match models case-insensitively
-    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
+    if pd.Series(model).str.contains('mistral-large|WizardLM-2-8x22B|ko-gemma-2', case=False, regex=True).any():
         return 'Apache-2.0'
     elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
         return 'llama3'
+    elif pd.Series(model).str.contains('Ko-Llama-3-8B-Instruct', case=False, regex=True).any():
+        return 'Llama Community'
+    elif pd.Series(model).str.contains('claude|gemini|EXAONE-3.0-7.8B-Instruct', case=False, regex=True).any():
+        return 'Proprietary'
+    elif pd.Series(model).str.contains('qwen', case=False, regex=True).any():
+        if pd.Series(model).str.contains('max', case=False, regex=True).any():
+            return 'Proprietary'
+        else:
+            return 'Qianwen LICENSE'

     model_lower = model.lower()
     matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
@@ -58,10 +94,26 @@ def get_license(model):  # function to match models case-insensitively
         return matching_rows['License'].values[0]
     return ''

+def get_link(row):  # function to match models case-insensitively
+    if row['link'] != '' and pd.notna(row['link']):
+        return row
+
+    model_lower = row['model'].lower()
+    matching_rows = df_full_lboard[df_full_lboard['key'].str.lower() == model_lower]
+    if not matching_rows.empty:
+        row['link'] = matching_rows['Link'].values[0]
+    return row
+
+def add_link(row):
+    if pd.isna(row['link']):
+        row['link'] = ''
+    if row['link'] != '':
+        row['model'] = f"<a href={row['link']}>{row['model']}</a>"
+    return row

 # dataframe_full
 df_full_rs = df_rs.copy()
-df_full_rs.rename(columns={'score': '…
+df_full_rs.rename(columns={'score': 'Ko-Bench'}, inplace=True)
 df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])

 df_full_rs = df_full_rs.drop(columns=['turn'])  # merge turn 1/2 scores per model
@@ -69,16 +121,16 @@ df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean…
 df_full_rs = df_full_rs.round(2)
 df_full_rs.replace("", np.nan, inplace=True)

-df_full_rs['…
-df_full_rs['…
+df_full_rs['Ko-Bench/openai'] = ''  # add Ko-Bench/openai and Ko-Bench/keval columns
+df_full_rs['Ko-Bench/keval'] = ''
 for idx, j_model in df_full_rs['judge_model'].items():
     if j_model == 'keval':
-        df_full_rs.at[idx, '…
+        df_full_rs.at[idx, 'Ko-Bench/keval'] = df_full_rs.at[idx, 'Ko-Bench']
     else :
-        df_full_rs.at[idx, '…
+        df_full_rs.at[idx, 'Ko-Bench/openai'] = df_full_rs.at[idx, 'Ko-Bench']
 df_full_rs = df_full_rs.drop(columns=['judge_model'])

-df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index()  #…
+df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index()  # merge the Ko-Bench/openai and Ko-Bench/keval rows
 df_full_rs = df_full_rs.round(2)
 df_full_rs.replace("", np.nan, inplace=True)

@@ -87,17 +139,20 @@ df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
 df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)

 df_full_rs['Organization'] = ''  # add Organization column
-df_full_rs['Organization'] = df_full_rs…
+df_full_rs['Organization'] = df_full_rs.apply(get_organization, axis=1)

 df_full_rs['License'] = ''  # add License column
 df_full_rs['License'] = df_full_rs['model'].apply(get_license)

-df_full_rs = df_full_rs.sort_values(by='…
+df_full_rs = df_full_rs.sort_values(by='Ko-Bench', ascending=False)
 df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
-df_full_rs = df_full_rs.drop(columns=['KO-Bench'])

 plot_models = df_full_rs['model'].unique()  # models list for the model detail view

+df_full_rs = df_full_rs.apply(get_link, axis=1)
+df_full_rs = df_full_rs.apply(add_link, axis=1)
+
+df_full_rs = df_full_rs.drop(columns=['Ko-Bench', 'link', 'organization'])

 # dataframe
 df_rs['MT-Bench'] = ''  # add MT-Bench column
@@ -115,6 +170,10 @@ df_openai = df_openai.drop(columns=['judge_model', 'turn'])  # merge turn 1/2 scores per model
 df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
 df_openai = df_openai.round(2)

+df_openai = df_openai.apply(get_link, axis=1)
+df_openai = df_openai.apply(add_link, axis=1)
+df_openai = df_openai.drop(columns=['link', 'organization'])
+
 df_openai = df_openai.sort_values(by='score', ascending=False)
 df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))

@@ -127,6 +186,10 @@ df_keval = df_keval.drop(columns=['judge_model', 'turn'])  # merge turn 1/2 scores per model
 df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
 df_keval = df_keval.round(2)

+df_keval = df_keval.apply(get_link, axis=1)
+df_keval = df_keval.apply(add_link, axis=1)
+df_keval = df_keval.drop(columns=['link', 'organization'])
+
 df_keval = df_keval.sort_values(by='score', ascending=False)
 df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))

@@ -206,10 +269,13 @@ def radar_chart(categories, Top1_turn1, Top1_turn2, Selected_model_turn1, Select…
     return fig

 def search_openai_plot(dropdown_model):  # define the openai plot function
-…
+    openai_top_model = df_openai.iat[0, df_openai.columns.get_loc('model')]
+    openai_top_model = BeautifulSoup(openai_top_model, 'html.parser').get_text()
+
+    condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == openai_top_model)
     top1_openai_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()

-    condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] ==…
+    condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == openai_top_model)
     top1_openai_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()

     condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
@@ -219,8 +285,8 @@ def search_openai_plot(dropdown_model):  # define the openai plot function
     openai_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()

     category_labels = []
-    category_labels.append(…
-    category_labels.append(…
+    category_labels.append(openai_top_model + " /Turn 1")
+    category_labels.append(openai_top_model + " /Turn 2")
     category_labels.append(dropdown_model + " /Turn 1")
     category_labels.append(dropdown_model + " /Turn 2")

@@ -228,10 +294,13 @@ def search_openai_plot(dropdown_model):  # define the openai plot function
     return fig

 def search_keval_plot(dropdown_model):  # define the keval plot function
-…
+    keval_top_model = df_keval.iat[0, df_keval.columns.get_loc('model')]
+    keval_top_model = BeautifulSoup(keval_top_model, 'html.parser').get_text()
+
+    condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == keval_top_model)
     top1_keval_turn1 = df.loc[condition1, 'Coding':'Writing'].values.tolist()

-    condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] ==…
+    condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == keval_top_model)
     top1_keval_turn2 = df.loc[condition2, 'Coding':'Writing'].values.tolist()

     condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
@@ -241,8 +310,8 @@ def search_keval_plot(dropdown_model):  # define the keval plot function
     keval_turn2 = df.loc[condition4, 'Coding':'Writing'].values.tolist()

     category_labels = []
-    category_labels.append(…
-    category_labels.append(…
+    category_labels.append(keval_top_model + " /Turn 1")
+    category_labels.append(keval_top_model + " /Turn 2")
     category_labels.append(dropdown_model + " /Turn 1")
     category_labels.append(dropdown_model + " /Turn 2")

@@ -250,37 +319,77 @@ def search_keval_plot(dropdown_model):  # define the keval plot function
     return fig


+# average
+def plot_average():
+    fig = go.Figure()
+    colors = [px.colors.qualitative.Set2, px.colors.qualitative.Pastel2]
+    turn_df = df_full_rs
+
+    # gpt-4o
+    fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/openai'], mode='lines+markers',
+                             name=f'gpt-4o(Average)',
+                             line=dict(color=colors[0][0], dash='dash'),
+                             marker=dict(symbol='x', size=10)))
+
+    # keval
+    fig.add_trace(go.Scatter(x=turn_df['model'], y=turn_df['Ko-Bench/keval'], mode='lines+markers',
+                             name=f'keval(Average)',
+                             line=dict(color=colors[0][1]),
+                             marker=dict(symbol='circle', size=10)))
+
+    fig.update_layout(
+        title=f'Comparison of OpenAI ko_bench and keval ko_bench (Average)',
+        xaxis_title='Model',
+        yaxis_title='Score',
+        legend_title='Metric',
+        hovermode='x unified',
+        template='plotly_white'
+    )
+    fig.update_yaxes(range=[0, 10])
+    fig.update_layout(legend_traceorder="reversed")
+    return fig
+
+
 #gradio
-with gr.Blocks() as demo:
+with gr.Blocks(css='assets/leaderboard.css') as demo:
     gr.Markdown("")
-    gr.Markdown("# …
+    gr.Markdown("# 🏆 Ko-Bench Leaderboard")
     gr.Markdown("")
-    gr.Markdown("")
-    gr.Markdown("#### The Ko-bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
+    gr.Markdown("#### The Ko-Bench is a leaderboard for evaluating the multi-level conversation ability and instruction-following ability of Korean Large Language Models (LLMs).")
     gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
-    gr.Markdown("-…
-    gr.Markdown("-…
-    gr.Markdown("")
+    gr.Markdown("- Ko-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
+    gr.Markdown("- Ko-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model as an evaluation model.")
     gr.Markdown("")
+    gr.Markdown("github : https://github.com/davidkim205/Ko-Bench")
+    gr.Markdown("keval : https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
     gr.Markdown("")

-    with gr.…
-    gr.…
-…
-…
-    with gr.TabItem("Keval Judgment"):
-        gr.Dataframe(value=df_keval)
-    with gr.TabItem("Model Detail View"):
-        with gr.Blocks():
+    with gr.Row():
+        with gr.TabItem("Ko-Bench"):
+            gr.Dataframe(value=df_full_rs,
+                         datatype=['html' if col == 'model' else 'markdown' for col in df_full_rs.columns])
     with gr.Row():
-… (11 removed lines; their contents were not captured in this view)
+        with gr.TabItem("Average"):
+            gr.Plot(plot_average)
+        with gr.TabItem("Openai Judgment"):
+            gr.Dataframe(value=df_openai,
+                         datatype=['html' if col == 'model' else 'markdown' for col in df_openai.columns])
+        with gr.TabItem("Keval Judgment"):
+            gr.Dataframe(value=df_keval,
+                         datatype=['html' if col == 'model' else 'markdown' for col in df_keval.columns])
+        with gr.TabItem("Model Detail View"):
+            with gr.Blocks():
+                with gr.Row():
+                    dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
+                with gr.Row():
+                    dataframe = gr.Dataframe(label="Model Detail View")
+                    dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
+                with gr.Row():
+                    plot_openai = gr.Plot(label="Openai Plot")
+                    dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
+                    plot_keval = gr.Plot(label="Keval Plot")
+                    dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)
+
+
+
+demo.launch(share=True, server_name="0.0.0.0", debug=True)
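For context on the aggregation used throughout app.py: custom_mean is the per-column aggregator passed to groupby().agg(), and after this commit it passes the non-numeric 'link' and 'organization' columns through unchanged while still averaging the numeric score columns. A minimal, self-contained sketch of that behavior (toy rows and a hypothetical URL, not data taken from ko_bench.csv):

import numpy as np
import pandas as pd

def custom_mean(series):
    # Carry the string-valued bookkeeping columns through unchanged.
    if series.name in ('link', 'organization'):
        return series.values[0]
    numeric_series = pd.to_numeric(series, errors='coerce')  # non-numeric entries become NaN
    return numeric_series.mean() if not numeric_series.isna().all() else np.nan

# Toy frame: one model with a turn-1 and a turn-2 score, plus the link/organization columns.
toy = pd.DataFrame({
    'model': ['GPT-4o-2024-05-13', 'GPT-4o-2024-05-13'],
    'score': [9.4, 8.3],
    'link': ['https://example.com/gpt-4o'] * 2,  # hypothetical URL, for illustration only
    'organization': ['openai'] * 2,
})

agg = toy.groupby('model').agg(
    {col: custom_mean for col in toy.columns if col != 'model'}
).reset_index()
print(agg)  # expected: score averaged to 8.85, link and organization carried through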
ko_bench.csv
CHANGED
@@ -1,89 +1,93 @@
 judge_model,turn,model,score,Coding,Extraction,Humanities,Math,Reasoning,Roleplay,STEM,Writing
-… (88 previous result rows removed; their contents are truncated in this capture)
+gpt-4o,1,openai__GPT-4o-2024-05-13,9.4,8.7,9.6,9.6,9.9,9.0,9.2,9.7,9.3
+gpt-4o,1,Anthropic__claude-3-5-sonnet-20240620,9.0,6.7,9.5,9.2,9.6,9.3,8.7,9.8,9.0
+gpt-4o,1,openai__gpt-4-0125-preview,8.9,7.7,9.8,9.1,9.7,7.8,9.2,8.7,9.4
+gpt-4o,1,openai__GPT-4o-mini-2024-07-18,8.8,7.3,9.2,9.4,10.0,6.9,8.7,9.6,9.1
+gpt-4o,1,Anthropic__claude-3-opus-20240229,8.6,8.1,9.7,9.3,8.7,5.8,8.2,9.4,9.5
+gpt-4o,1,mistralai__Mistral-Large-Instruct-2407,8.5,6.8,8.9,8.7,9.6,6.6,8.5,9.2,9.5
+gpt-4o,1,Qwen__Qwen2-72B-Instruct,8.3,5.1,9.7,8.9,7.5,7.9,8.8,9.2,9.3
+gpt-4o,1,google__gemma-2-27b-it,8.3,6.8,9.4,9.5,7.9,5.4,9.0,9.0,9.2
+gpt-4o,1,google__gemini-1.5-pro,8.2,5.5,9.7,8.7,7.5,6.5,9.1,9.4,9.2
+gpt-4o,1,davidkim205__ko-gemma-2-9b-it,7.8,6.6,9.0,8.4,6.7,6.2,8.1,8.9,8.7
+gpt-4o,1,google__gemma-2-9b-it,7.7,6.2,9.3,8.8,5.4,5.4,8.8,8.8,8.7
+gpt-4o,1,alpindale__WizardLM-2-8x22B,7.4,6.8,6.8,7.8,8.7,4.8,7.2,8.4,8.7
+gpt-4o,1,openai__gpt-3.5-turbo-0125,6.7,5.2,9.0,7.7,6.4,3.3,7.2,6.5,8.6
+gpt-4o,1,meta-llama__Meta-Llama-3.1-70B-Instruct,6.6,6.4,8.7,8.0,4.5,4.0,7.9,7.4,5.9
+gpt-4o,1,Qwen__Qwen2-7B-Instruct,6.5,3.9,9.0,8.0,5.6,3.6,7.0,6.6,8.2
+gpt-4o,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,4.9,7.4,7.1,7.3,5.1,6.4,4.1,7.6
+gpt-4o,1,Qwen__Qwen1.5-32B-Chat,6.1,4.0,8.6,8.5,4.7,2.6,6.3,7.5,6.7
+gpt-4o,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.8,3.5,5.0,8.5,5.4,3.2,5.4,7.5,7.6
+gpt-4o,1,davidkim205__Ko-Llama-3-8B-Instruct,5.7,4.6,7.0,7.7,2.8,2.5,6.2,6.9,7.6
+gpt-4o,1,meta-llama__Meta-Llama-3.1-8B-Instruct,5.4,4.6,7.4,6.3,5.2,3.3,5.2,5.4,6.0
+gpt-4o,1,Qwen__Qwen1.5-14B-Chat,5.4,3.3,7.2,6.8,4.2,2.0,5.7,6.7,7.2
+gpt-4o,1,WizardLMTeam__WizardLM-13B-V1.2,4.8,3.4,8.2,6.1,2.2,3.4,5.0,4.3,6.1
+gpt-4o,1,mistralai__Mistral-7B-Instruct-v0.2,2.6,3.0,3.7,2.0,1.7,1.3,4.5,1.4,3.1
+gpt-4o,2,openai__GPT-4o-2024-05-13,8.3,7.9,8.9,9.2,8.1,7.0,8.9,8.7,7.5
+gpt-4o,2,openai__gpt-4-0125-preview,8.0,7.2,8.5,8.9,6.8,7.3,8.7,8.1,8.6
+gpt-4o,2,Anthropic__claude-3-5-sonnet-20240620,7.9,6.9,9.1,9.0,6.4,6.9,8.1,8.2,8.4
+gpt-4o,2,openai__GPT-4o-mini-2024-07-18,7.6,6.2,7.6,9.1,7.8,4.6,8.2,9.0,8.3
+gpt-4o,2,mistralai__Mistral-Large-Instruct-2407,7.2,6.5,8.8,7.5,7.9,4.7,7.3,7.2,7.6
+gpt-4o,2,google__gemma-2-27b-it,7.0,6.4,7.6,9.0,5.4,5.1,7.9,7.4,7.4
+gpt-4o,2,google__gemini-1.5-pro,7.0,6.3,7.7,8.3,6.1,5.0,8.5,7.8,6.5
+gpt-4o,2,Anthropic__claude-3-opus-20240229,6.9,6.0,9.0,7.3,6.2,5.8,7.3,6.5,7.5
+gpt-4o,2,Qwen__Qwen2-72B-Instruct,6.9,5.5,8.4,8.7,5.3,4.4,7.9,7.4,7.6
+gpt-4o,2,davidkim205__ko-gemma-2-9b-it,6.4,5.7,6.9,8.5,5.6,4.3,7.3,6.6,6.5
+gpt-4o,2,alpindale__WizardLM-2-8x22B,6.4,6.0,8.2,7.2,6.1,4.1,7.0,6.8,5.5
+gpt-4o,2,google__gemma-2-9b-it,6.2,4.8,7.6,8.3,4.9,3.9,7.0,7.4,6.1
+gpt-4o,2,Qwen__Qwen1.5-32B-Chat,5.8,4.3,8.2,7.6,3.8,3.0,6.8,5.9,6.9
+gpt-4o,2,meta-llama__Meta-Llama-3.1-70B-Instruct,5.7,5.5,8.0,7.4,3.6,2.9,6.6,5.7,5.7
+gpt-4o,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,5.6,5.8,6.2,5.5,7.0,4.0,5.7,4.3,6.5
+gpt-4o,2,openai__gpt-3.5-turbo-0125,5.4,5.8,5.7,7.2,4.4,3.0,6.6,4.4,6.4
+gpt-4o,2,Qwen__Qwen2-7B-Instruct,5.3,5.0,7.0,6.6,5.1,2.7,5.6,4.8,5.9
+gpt-4o,2,Qwen__Qwen1.5-14B-Chat,4.9,3.5,5.1,7.4,4.1,2.7,5.9,5.0,5.9
+gpt-4o,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,4.5,3.3,3.8,7.6,4.9,2.1,5.6,5.7,3.3
+gpt-4o,2,mistralai__Mistral-7B-Instruct-v0.2,4.5,3.9,4.4,6.8,2.2,2.4,6.2,5.6,4.6
+gpt-4o,2,davidkim205__Ko-Llama-3-8B-Instruct,4.0,3.7,4.3,6.4,2.8,2.3,4.9,4.0,4.1
+gpt-4o,2,meta-llama__Meta-Llama-3.1-8B-Instruct,3.9,4.1,5.0,4.8,3.8,2.1,4.0,3.5,3.6
+gpt-4o,2,WizardLMTeam__WizardLM-13B-V1.2,3.0,2.6,3.5,3.6,1.8,2.3,3.7,3.3,2.8
+keval,1,openai__GPT-4o-2024-05-13,9.1,7.8,9.5,9.6,9.9,8.8,8.7,9.3,9.2
+keval,1,Anthropic__claude-3-5-sonnet-20240620,9.0,7.2,9.8,9.2,9.3,9.2,8.9,9.4,9.0
+keval,1,openai__gpt-4-0125-preview,8.8,7.7,9.6,9.2,9.8,7.5,8.2,9.5,9.2
+keval,1,openai__GPT-4o-mini-2024-07-18,8.7,7.8,8.2,9.3,10.0,6.9,8.8,9.7,9.2
+keval,1,Anthropic__claude-3-opus-20240229,8.4,8.1,9.8,8.7,8.3,5.8,7.9,9.2,9.0
+keval,1,mistralai__Mistral-Large-Instruct-2407,8.2,6.3,7.9,8.9,9.6,6.4,8.2,9.5,9.2
+keval,1,google__gemini-1.5-pro,8.2,5.7,9.8,8.8,7.4,6.2,9.1,9.7,9.0
+keval,1,google__gemma-2-27b-it,8.1,5.9,9.3,9.4,7.4,5.7,8.9,9.0,9.0
+keval,1,Qwen__Qwen2-72B-Instruct,8.0,5.0,9.2,8.8,8.6,6.9,7.7,9.1,9.0
+keval,1,davidkim205__ko-gemma-2-9b-it,7.8,5.9,9.4,8.5,6.0,6.3,8.2,9.0,8.9
+keval,1,google__gemma-2-9b-it,7.6,6.7,8.8,8.5,5.2,5.5,9.0,8.6,8.5
+keval,1,meta-llama__Meta-Llama-3.1-70B-Instruct,7.3,6.8,9.0,8.3,5.9,5.1,8.4,8.0,7.1
+keval,1,Qwen__Qwen1.5-14B-Chat,7.2,4.7,9.7,8.8,4.5,4.8,8.1,8.9,8.4
+keval,1,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,7.2,5.7,8.8,8.1,8.2,6.0,7.7,5.6,7.3
+keval,1,alpindale__WizardLM-2-8x22B,7.1,6.1,5.6,7.9,8.8,5.9,6.5,8.7,7.1
+keval,1,Qwen__Qwen1.5-32B-Chat,7.0,3.9,9.9,8.9,5.8,3.6,7.1,8.6,7.9
+keval,1,openai__gpt-3.5-turbo-0125,6.9,5.6,8.9,7.7,6.4,3.2,7.4,7.5,8.6
+keval,1,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,6.8,3.4,8.6,8.5,5.5,4.1,6.9,8.8,8.4
+keval,1,Qwen__Qwen2-7B-Instruct,6.4,3.6,9.0,7.7,5.5,3.5,7.1,6.7,8.4
+keval,1,meta-llama__Meta-Llama-3.1-8B-Instruct,6.3,4.3,8.9,7.7,5.3,3.3,7.3,6.0,7.5
+keval,1,davidkim205__Ko-Llama-3-8B-Instruct,6.0,5.0,7.4,7.6,2.9,2.9,7.0,8.0,7.6
+keval,1,WizardLMTeam__WizardLM-13B-V1.2,6.0,3.7,9.3,7.7,2.4,3.8,7.0,6.6,7.7
+keval,1,mistralai__Mistral-7B-Instruct-v0.2,3.0,3.0,6.7,3.0,2.0,2.0,3.3,1.9,2.4
+keval,2,openai__GPT-4o-2024-05-13,8.1,7.7,8.9,9.2,7.8,6.9,8.4,8.7,7.4
+keval,2,openai__gpt-4-0125-preview,7.7,6.3,8.4,8.8,6.9,6.3,8.6,8.6,8.0
+keval,2,openai__GPT-4o-mini-2024-07-18,7.4,6.8,7.6,8.7,7.7,4.3,7.8,8.4,7.8
+keval,2,Anthropic__claude-3-5-sonnet-20240620,7.3,6.6,7.6,9.0,6.6,5.7,7.6,8.1,7.1
+keval,2,mistralai__Mistral-Large-Instruct-2407,7.0,5.4,7.3,8.5,7.3,5.2,7.9,7.8,6.9
+keval,2,Qwen__Qwen2-72B-Instruct,7.0,6.2,7.5,8.7,5.5,5.3,7.5,6.9,8.1
+keval,2,google__gemma-2-27b-it,6.9,6.6,7.0,8.9,5.5,5.0,7.6,6.9,7.3
+keval,2,Anthropic__claude-3-opus-20240229,6.8,6.2,8.4,7.8,5.4,5.1,7.0,7.3,7.5
+keval,2,alpindale__WizardLM-2-8x22B,6.6,5.6,7.6,7.9,6.3,4.9,6.9,7.4,6.3
+keval,2,google__gemini-1.5-pro,6.5,5.2,6.9,8.4,6.0,4.8,8.1,7.3,5.4
+keval,2,davidkim205__ko-gemma-2-9b-it,6.4,5.1,6.6,8.9,6.0,4.0,7.2,6.8,6.7
+keval,2,google__gemma-2-9b-it,6.3,5.2,7.7,8.7,4.6,4.0,7.8,6.8,5.4
+keval,2,LGAI-EXAONE__EXAONE-3.0-7.8B-Instruct,6.2,5.9,7.0,6.4,6.7,4.3,7.6,4.2,7.8
+keval,2,Qwen__Qwen1.5-32B-Chat,6.2,5.2,7.7,8.0,4.1,4.0,7.7,6.7,6.5
+keval,2,Qwen__Qwen1.5-14B-Chat,6.0,4.7,6.9,7.9,4.8,3.8,7.2,6.3,6.7
+keval,2,meta-llama__Meta-Llama-3.1-70B-Instruct,6.0,6.0,7.3,7.6,5.6,2.9,7.0,6.2,5.6
+keval,2,Qwen__Qwen2-7B-Instruct,5.6,4.9,7.0,6.5,5.1,3.1,6.3,5.0,6.5
+keval,2,KISTI-KONI__KONI-Llama3-8B-Instruct-20240729,5.5,4.6,4.9,6.7,5.9,3.2,6.9,6.8,5.2
+keval,2,openai__gpt-3.5-turbo-0125,5.3,6.2,5.5,7.0,4.5,3.3,6.2,4.5,5.4
+keval,2,meta-llama__Meta-Llama-3.1-8B-Instruct,4.8,5.0,6.0,5.5,4.4,2.6,5.9,5.0,4.4
+keval,2,davidkim205__Ko-Llama-3-8B-Instruct,4.2,3.6,4.6,6.3,2.8,2.2,6.1,3.7,4.3
+keval,2,WizardLMTeam__WizardLM-13B-V1.2,4.1,3.7,5.4,5.8,2.8,3.0,5.6,3.3,3.4
+keval,2,mistralai__Mistral-7B-Instruct-v0.2,4.1,3.5,6.1,6.3,2.6,2.2,3.5,3.2,5.5
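For reference, each row of the new ko_bench.csv is one (judge_model, turn, model) result: an overall score plus the eight category scores, with the model field now encoded as organization__model. A minimal sketch of consuming the file the way the leaderboard does, assuming the CSV above is saved locally as ko_bench.csv:

import pandas as pd

df = pd.read_csv('ko_bench.csv')

# Split the 'organization__model' identifier introduced in this commit.
df[['organization', 'model_name']] = df['model'].str.split('__', n=1, expand=True)

# Average turn 1 and turn 2 per judge and model, mirroring the app's aggregation.
per_model = (
    df.groupby(['judge_model', 'model_name'])['score']
      .mean()
      .round(2)
      .reset_index()
      .sort_values(['judge_model', 'score'], ascending=[True, False])
)
print(per_model.head())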