Calculate results directly from the source
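
Rather than reading precomputed scores, the updated functions.py scrapes each model's raw result JSON from the open-llm-leaderboard/results dataset and recomputes the leaderboard numbers itself, rescaling the raw accuracies against each task's random-guess baseline (IFEval is simply converted to a percentage). A minimal standalone sketch of that normalization, reusing the same normalize_within_range helper; the example values are illustrative, not taken from any real model:

import numpy as np

def normalize_within_range(value, lower_bound=0, higher_bound=1):
    # Rescale a raw score to 0-100, treating the random-guess baseline as 0.
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100

# A 2-choice BBH subtask is answered correctly 50% of the time by guessing,
# so a raw acc_norm of 0.75 lands at 50.0 on the leaderboard scale.
print(normalize_within_range(0.75, lower_bound=1 / 2, higher_bound=1.0))  # 50.0

# GPQA is 4-choice, hence the 0.25 lower bound used in calculate_results below.
print(normalize_within_range(0.25, lower_bound=0.25, higher_bound=1.0))  # 0.0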
functions.py  CHANGED  +121 -27

@@ -1,5 +1,7 @@
 import gradio as gr
-import
+import numpy as np
+import urllib3
+from bs4 import BeautifulSoup
 from datasets import load_dataset
 from huggingface_hub import (
     CommitOperationAdd,
@@ -11,16 +13,114 @@ from huggingface_hub import (
 from huggingface_hub.repocard_data import eval_results_to_model_index
 from pytablewriter import MarkdownTableWriter

-COMMIT_DESCRIPTION = """This is an automated PR created with https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard
+COMMIT_DESCRIPTION = """This is an automated PR created with [this space](https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard)!

 The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""


-def
-
-
+def normalize_within_range(value, lower_bound=0, higher_bound=1):
+    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
+
+
+def calculate_results(repo: str, pool: urllib3.PoolManager):
+    try:
+        base_url = f"https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/{repo}"
+        html = pool.request("GET", base_url).data
+        soup = BeautifulSoup(html, "html.parser")
+        dl_link = soup.find_all(title="Download file")[-1]["href"]
+        data = pool.request("GET", f"https://huggingface.co{dl_link}").json()
+
+        del base_url
+        del html
+        del soup
+        del dl_link
+
+        model_name = data["model_name"]
+        precision = data["config"]["model_dtype"]
+        revision = data["config"]["model_revision"]
+
+        # Normalize BBH subtasks scores
+        bbh_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_bbh"]:
+            num_choices = len(data["configs"][subtask_key]["doc_to_choice"])
+            if subtask_key in data["results"]:
+                bbh_raw_score = data["results"][subtask_key]["acc_norm,none"]
+                lower_bound = 1 / num_choices
+                normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
+                bbh_scores.append(normalized_score)
+
+        # Average BBH score
+        bbh_score = sum(bbh_scores) / len(bbh_scores)
+
+        # Calculate the MATH score
+        math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
+        math_score = normalize_within_range(math_raw_score, 0, 1.0)
+
+        # Normalize GPQA scores
+        gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
+        gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+
+        # Normalize MMLU PRO scores
+        mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+        mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+
+        # Compute IFEval
+        ifeval_inst_score = (
+            data["results"]["leaderboard_ifeval"]["inst_level_strict_acc,none"] * 100
+        )
+        ifeval_prompt_score = (
+            data["results"]["leaderboard_ifeval"]["prompt_level_strict_acc,none"] * 100
+        )
+
+        # Average IFEval scores
+        ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+
+        # Normalize MUSR scores
+        musr_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_musr"]:
+            subtask_config = data["configs"][subtask_key]
+            dataset = load_dataset(subtask_config["dataset_path"], split=subtask_config["test_split"])
+            num_choices = max(len(eval(question["choices"])) for question in dataset)
+            musr_raw_score = data["results"][subtask_key]["acc_norm,none"]
+            lower_bound = 1 / num_choices
+            normalized_score = normalize_within_range(musr_raw_score, lower_bound, 1.0)
+
+            musr_scores.append(normalized_score)
+            del dataset
+
+        musr_score = sum(musr_scores) / len(musr_scores)
+
+        # Calculate overall score
+        overall_score = (
+            bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+        ) / 6
+
+        # Round all scores to 2 decimal places
+        bbh_score = float(round(bbh_score, 2))
+        math_score = float(round(math_score, 2))
+        gpqa_score = float(round(gpqa_score, 2))
+        mmlu_pro_score = float(round(mmlu_pro_score, 2))
+        musr_score = float(round(musr_score, 2))
+        ifeval_score = float(round(ifeval_score, 2))
+        overall_score = float(round(overall_score, 2))
+        results = {
+            "Model": model_name,
+            "Precision": precision,
+            "Revision": revision,
+            "Average": overall_score,
+            "IFEval": ifeval_score,
+            "BBH": bbh_score,
+            "MATH Lvl 5": math_score,
+            "GPQA": gpqa_score,
+            "MUSR": musr_score,
+            "MMLU-PRO": mmlu_pro_score,
+        }
+        # pprint(results, sort_dicts=False)
+        return results
+    except Exception: # likely will be from no results being available
+        return None


 def get_details_url(repo):
@@ -42,10 +142,9 @@ def get_task_summary(results):
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value":
+            "metric_value": results["IFEval"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
@@ -53,10 +152,9 @@ def get_task_summary(results):
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["BBH"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
@@ -64,10 +162,9 @@ def get_task_summary(results):
            "dataset_type": "lighteval/MATH-Hard",
            "dataset_name": "MATH Lvl 5 (4-Shot)",
            "metric_type": "exact_match",
-            "metric_value":
+            "metric_value": results["MATH Lvl 5"],
            "dataset_config": None,
            "dataset_split": "test",
-            #"dataset_revision": None,
            "dataset_args": {"num_few_shot": 4},
            "metric_name": "exact match",
         },
@@ -75,10 +172,9 @@ def get_task_summary(results):
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["GPQA"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
@@ -86,7 +182,7 @@ def get_task_summary(results):
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["MUSR"],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
@@ -96,7 +192,7 @@ def get_task_summary(results):
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value":
+            "metric_value": results["MMLU-PRO"],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
@@ -105,12 +201,11 @@ def get_task_summary(results):
     }


-def get_eval_results(
-    results = search(df, repo)
+def get_eval_results(repo: str, results: dict):
     task_summary = get_task_summary(results)
     table = MarkdownTableWriter()
-    table.headers = ["Metric", "%
-    table.value_matrix = [["
+    table.headers = ["Metric", "Value (%)"]
+    table.value_matrix = [["**Average**", f"**{results["Average"]}**"]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]

@@ -123,15 +218,14 @@ Summarized results can be found [here]({get_contents_url(repo)})!
     return text


-def get_edited_yaml_readme(
+def get_edited_yaml_readme(repo: str, results: dict, token: str | None):
     card = ModelCard.load(repo, token=token)
-    results = search(df, repo)

     common = {
         "task_type": "text-generation",
         "task_name": "Text Generation",
         "source_name": "Open LLM Leaderboard",
-        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard
+        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search={repo.replace("/", "%2F")}",
     }

     tasks_results = get_task_summary(results)
@@ -167,8 +261,8 @@ def commit(
     else:
         token = oauth_token

-
-
+    with urllib3.PoolManager() as pool:
+        results = calculate_results(repo, pool)

     if repo.startswith("https://huggingface.co/"):
         try:
@@ -181,11 +275,11 @@ def commit(
     try:
         try: # check if there is a readme already
             readme_text = get_edited_yaml_readme(
-
-            ) + get_eval_results(
+                repo, results, token=token
+            ) + get_eval_results(repo, results)
         except Exception as e:
             if "Repo card metadata block was not found." in str(e): # There is no readme
-                readme_text = get_edited_yaml_readme(
+                readme_text = get_edited_yaml_readme(repo, results, token=token)
             else:
                 print(f"Something went wrong: {e}")
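
For reference, a minimal usage sketch of the new entry point, assuming functions.py is importable from the Space; the repo id below is purely hypothetical:

import urllib3

from functions import calculate_results, get_eval_results

repo = "some-org/some-model"  # hypothetical repo id, replace with a model on the leaderboard
with urllib3.PoolManager() as pool:
    results = calculate_results(repo, pool)  # returns None when no leaderboard results are found

if results is not None:
    print(results["Average"])               # normalized average over the six benchmarks
    print(get_eval_results(repo, results))  # markdown summary destined for the model card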