Commit
·
874c0c9
1
Parent(s):
18596de
up
Browse files
app.py
CHANGED
|
@@ -211,21 +211,24 @@ def regex_table(dataframe, regex, filter_button):
|
|
| 211 |
|
| 212 |
# if Score exists, round to 2 decimals
|
| 213 |
if "Score" in data.columns:
|
| 214 |
-
data["Score"] = data["Score"].
|
| 215 |
if "Average" in data.columns:
|
| 216 |
-
data["Average"] = data["Average"].
|
| 217 |
# round all others to 1 decimal
|
| 218 |
for col in data.columns:
|
| 219 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
| 220 |
-
data[col] = data[col].
|
| 221 |
return data
|
| 222 |
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
with gr.Blocks(css=custom_css) as app:
|
| 225 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
| 226 |
with gr.Row():
|
| 227 |
with gr.Column(scale=6):
|
| 228 |
-
gr.Markdown(TOP_TEXT)
|
| 229 |
with gr.Column(scale=4):
|
| 230 |
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
| 231 |
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
|
|
|
| 211 |
|
| 212 |
# if Score exists, round to 2 decimals
|
| 213 |
if "Score" in data.columns:
|
| 214 |
+
data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
|
| 215 |
if "Average" in data.columns:
|
| 216 |
+
data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
|
| 217 |
# round all others to 1 decimal
|
| 218 |
for col in data.columns:
|
| 219 |
if col not in ["", "Model", "Model Type", "Score", "Average"]:
|
| 220 |
+
data[col] = np.round(np.array(data[col].values).astype(float), 1)
|
| 221 |
return data
|
| 222 |
|
| 223 |
+
# import ipdb; ipdb.set_trace()
|
| 224 |
+
|
| 225 |
+
total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]).values)
|
| 226 |
|
| 227 |
with gr.Blocks(css=custom_css) as app:
|
| 228 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
| 229 |
with gr.Row():
|
| 230 |
with gr.Column(scale=6):
|
| 231 |
+
gr.Markdown(TOP_TEXT.format(str(total_models)))
|
| 232 |
with gr.Column(scale=4):
|
| 233 |
# search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
|
| 234 |
# filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
|
src/md.py
CHANGED
|
@@ -97,5 +97,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/allenai/rewa
|
|
| 97 |
TOP_TEXT = """
|
| 98 |
# RewardBench: Evaluating Reward Models
|
| 99 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
| 100 |
-
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787)
|
| 101 |
"""
|
|
|
|
| 97 |
TOP_TEXT = """
|
| 98 |
# RewardBench: Evaluating Reward Models
|
| 99 |
### Evaluating the capabilities, safety, and pitfalls of reward models
|
| 100 |
+
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {}
|
| 101 |
"""
|