Spaces:
Amber Tanaka
committed on
Refactor pages and improve tooltips! (#13)
- c_and_e.py +3 -74
- category_page_builder.py +80 -0
- content.py +7 -3
- data_analysis.py +2 -73
- e2e.py +2 -73
- literature_understanding.py +3 -76
- ui_components.py +17 -9
c_and_e.py
CHANGED
@@ -1,81 +1,10 @@
 import gradio as gr
-import pandas as pd
-
-# Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data,create_sub_navigation_bar
 from content import CODE_EXECUTION_DESCRIPTION
+from category_page_builder import build_category_page
+
 # Define the category for this page
 CATEGORY_NAME = "Code Execution"
 
 with gr.Blocks() as demo:
     gr.Markdown(f"## Astabench {CATEGORY_NAME} Leaderboard")
-
-    test_df, test_tag_map = get_full_leaderboard_data("test")
-    gr.Markdown(CODE_EXECUTION_DESCRIPTION, elem_id="category-intro")
-    with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
-    with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
-        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
-
-
-    # --- This page now has two main sections: Validation and Test ---
-    with gr.Tabs():
-        with gr.Tab("Results: Test Set") as test_tab:
-            # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
-            if not test_df.empty:
-                create_leaderboard_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="test"
-                )
-                create_benchmark_details_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for test split.")
-        with gr.Tab("Results: Validation Set") as validation_tab:
-            # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
-            if not validation_df.empty:
-                # 2. Render the main category display using the loaded data.
-                create_leaderboard_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="validation"
-                )
-
-                # 3. Render the detailed breakdown for each benchmark in the category.
-                create_benchmark_details_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for validation split.")
-
-    show_validation_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'block';
-        document.getElementById('test_nav_container').style.display = 'none';
-        setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
-    }
-    """
-
-    # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
-    show_test_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'none';
-        document.getElementById('test_nav_container').style.display = 'block';
-    }
-    """
-
-    # Assign the pure JS functions to the select events. No Python `fn` is needed.
-    validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
-    test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
+    build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
category_page_builder.py
ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+import pandas as pd
+
+# Import our UI factories and the data loader
+from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
+
+def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION):
+    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
+    test_df, test_tag_map = get_full_leaderboard_data("test")
+    gr.Markdown(PAGE_DESCRIPTION, elem_id="category-intro")
+    with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
+        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
+
+    with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
+        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
+
+    # --- This page now has two main sections: Validation and Test ---
+    with gr.Tabs():
+        with gr.Tab("Results: Test Set") as test_tab:
+            # Repeat the process for the "test" split
+            test_df, test_tag_map = get_full_leaderboard_data("test")
+
+            if not test_df.empty:
+                create_leaderboard_display(
+                    full_df=test_df,
+                    tag_map=test_tag_map,
+                    category_name=CATEGORY_NAME,
+                    split_name="test"
+                )
+                create_benchmark_details_display(
+                    full_df=test_df,
+                    tag_map=test_tag_map,
+                    category_name=CATEGORY_NAME
+                )
+            else:
+                gr.Markdown("No data available for test split.")
+        with gr.Tab("Results: Validation Set") as validation_tab:
+            # 1. Load all necessary data for the "validation" split ONCE.
+            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
+
+            if not validation_df.empty:
+                # 2. Render the main category display using the loaded data.
+                create_leaderboard_display(
+                    full_df=validation_df,
+                    tag_map=validation_tag_map,
+                    category_name=CATEGORY_NAME,
+                    split_name="validation"
+                )
+
+                # 3. Render the detailed breakdown for each benchmark in the category.
+                create_benchmark_details_display(
+                    full_df=validation_df,
+                    tag_map=validation_tag_map,
+                    category_name=CATEGORY_NAME
+                )
+            else:
+                gr.Markdown("No data available for validation split.")
+
+
+    show_validation_js = """
+    () => {
+        document.getElementById('validation_nav_container').style.display = 'block';
+        document.getElementById('test_nav_container').style.display = 'none';
+        setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
+    }
+    """
+
+    # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
+    show_test_js = """
+    () => {
+        document.getElementById('validation_nav_container').style.display = 'none';
+        document.getElementById('test_nav_container').style.display = 'block';
+    }
+    """
+
+    # Assign the pure JS functions to the select events. No Python `fn` is needed.
+    validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
+    test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
+
+    return validation_nav_container, test_nav_container
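With the shared builder in place, each category page becomes a thin wrapper: import a description constant, set CATEGORY_NAME, and call build_category_page inside the page's gr.Blocks context so the components it creates render into that page. The builder also returns the two nav containers, though the refactored pages ignore the return value. A minimal sketch of a page built this way, reusing the existing Code Execution constants (the module itself is hypothetical and illustrative, not part of this commit):

import gradio as gr

from content import CODE_EXECUTION_DESCRIPTION
from category_page_builder import build_category_page

# Hypothetical example page -- mirrors the refactored c_and_e.py
CATEGORY_NAME = "Code Execution"

with gr.Blocks() as demo:
    gr.Markdown(f"## Astabench {CATEGORY_NAME} Leaderboard")
    # Renders the category intro, nav bars, Test/Validation tabs, and tab-switching JS.
    build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)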
content.py
CHANGED
@@ -305,17 +305,21 @@ html:not(.dark) #legend-markdown .light-mode-icon,
     content: attr(data-tooltip);
     position: absolute;
     bottom: 125%;
-    background-color: #
+    background-color: #105257;
     color: #fff;
-    padding:
+    padding: 0px 10px 10px;
     border-radius: 4px;
     font-size: 12px;
     opacity: 0;
     transition: opacity 0.2s;
     white-space: pre-line;
-    width:
+    width: max-content;
+    max-width: 350px; /* Limit width for better readability */
     text-align: left;
     pointer-events: none;
+    left: 50%;
+    transform: translateX(-50%);
+    z-index: 1000; /* Ensure it appears above other elements */
 }
 
 .tooltip-icon:hover::after {
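These rules only fire on elements that carry the matching markup: a span with class tooltip-icon and a data-tooltip attribute, which is exactly the pattern ui_components.py adds below. A minimal sketch of the pairing (the gr.HTML wrapper is illustrative, not part of this commit, and assumes the CSS above is loaded into the app, e.g. via gr.Blocks(css=...)):

import gradio as gr

with gr.Blocks() as demo:  # in the real app the CSS string from content.py would be passed as css=...
    # Hovering the icon shows a dark (#105257) bubble whose text comes from data-tooltip,
    # centered above the icon (left: 50% + translateX(-50%)) and capped at 350px wide.
    gr.HTML(
        '<b>Overall Cost</b>'
        '<span class="tooltip-icon" data-tooltip="Cost per task in USD">ⓘ</span>'
    )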
data_analysis.py
CHANGED
@@ -1,80 +1,9 @@
 import gradio as gr
-import pandas as pd
-
-# Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
 from content import DATA_ANALYSIS_DESCRIPTION
+from category_page_builder import build_category_page
 # Define the category for this page
 CATEGORY_NAME = "Data Analysis"
 
 with gr.Blocks() as demo:
     gr.Markdown(f"## Astabench{CATEGORY_NAME} Leaderboard")
-
-    test_df, test_tag_map = get_full_leaderboard_data("test")
-    gr.Markdown(DATA_ANALYSIS_DESCRIPTION, elem_id="category-intro")
-    with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
-    with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
-        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
-    # --- This page now has two main sections: Validation and Test ---
-    with gr.Tabs():
-        with gr.Tab("Results: Test") as test_tab:
-            # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
-            if not test_df.empty:
-                create_leaderboard_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="test"
-                )
-                create_benchmark_details_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for test split.")
-        with gr.Tab("Results: Validation") as validation_tab:
-            # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
-            if not validation_df.empty:
-                # 2. Render the main category display using the loaded data.
-                create_leaderboard_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="validation"
-                )
-
-                # 3. Render the detailed breakdown for each benchmark in the category.
-                create_benchmark_details_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for validation split.")
-
-
-    show_validation_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'block';
-        document.getElementById('test_nav_container').style.display = 'none';
-        setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
-    }
-    """
-
-    # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
-    show_test_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'none';
-        document.getElementById('test_nav_container').style.display = 'block';
-    }
-    """
-
-    # Assign the pure JS functions to the select events. No Python `fn` is needed.
-    validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
-    test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
+    build_category_page(CATEGORY_NAME, DATA_ANALYSIS_DESCRIPTION)
e2e.py
CHANGED
@@ -1,80 +1,9 @@
 import gradio as gr
-import pandas as pd
-
-# Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
 from content import DISCOVERY_DESCRIPTION
+from category_page_builder import build_category_page
 # Define the category for this page
 CATEGORY_NAME = "Discovery"
 
 with gr.Blocks() as demo:
     gr.Markdown(f"## Astabench{CATEGORY_NAME} Leaderboard")
-
-    test_df, test_tag_map = get_full_leaderboard_data("test")
-    gr.Markdown(DISCOVERY_DESCRIPTION, elem_id="category-intro")
-    with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
-    with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
-        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
-    # --- This page now has two main sections: Validation and Test ---
-    with gr.Tabs():
-        with gr.Tab("Results: Test") as test_tab:
-            # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
-            if not test_df.empty:
-                create_leaderboard_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="test"
-                )
-                create_benchmark_details_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for test split.")
-        with gr.Tab("Results: Validation") as validation_tab:
-            # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
-            if not validation_df.empty:
-                # 2. Render the main category display using the loaded data.
-                create_leaderboard_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="validation"
-                )
-
-                # 3. Render the detailed breakdown for each benchmark in the category.
-                create_benchmark_details_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for validation split.")
-
-
-    show_validation_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'block';
-        document.getElementById('test_nav_container').style.display = 'none';
-        setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
-    }
-    """
-
-    # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
-    show_test_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'none';
-        document.getElementById('test_nav_container').style.display = 'block';
-    }
-    """
-
-    # Assign the pure JS functions to the select events. No Python `fn` is needed.
-    validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
-    test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
+    build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
literature_understanding.py
CHANGED
@@ -1,83 +1,10 @@
 import gradio as gr
-import pandas as pd
-
-# Import our UI factories and the data loader
-from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
 from content import LIT_DESCRIPTION
+from category_page_builder import build_category_page
+
 # Define the category for this page
 CATEGORY_NAME = "Literature Understanding"
 
 with gr.Blocks() as demo:
     gr.Markdown(f"## Astabench{CATEGORY_NAME} Leaderboard")
-
-    validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-    test_df, test_tag_map = get_full_leaderboard_data("test")
-    gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
-    with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
-        create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
-
-    with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
-        create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
-
-    # --- This page now has two main sections: Validation and Test ---
-    with gr.Tabs():
-        with gr.Tab("Results: Test Set") as test_tab:
-            # Repeat the process for the "test" split
-            test_df, test_tag_map = get_full_leaderboard_data("test")
-
-            if not test_df.empty:
-                create_leaderboard_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="test"
-                )
-                create_benchmark_details_display(
-                    full_df=test_df,
-                    tag_map=test_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for test split.")
-        with gr.Tab("Results: Validation Set") as validation_tab:
-            # 1. Load all necessary data for the "validation" split ONCE.
-            validation_df, validation_tag_map = get_full_leaderboard_data("validation")
-
-            if not validation_df.empty:
-                # 2. Render the main category display using the loaded data.
-                create_leaderboard_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME,
-                    split_name="validation"
-                )
-
-                # 3. Render the detailed breakdown for each benchmark in the category.
-                create_benchmark_details_display(
-                    full_df=validation_df,
-                    tag_map=validation_tag_map,
-                    category_name=CATEGORY_NAME
-                )
-            else:
-                gr.Markdown("No data available for validation split.")
-
-
-    show_validation_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'block';
-        document.getElementById('test_nav_container').style.display = 'none';
-        setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
-    }
-    """
-
-    # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
-    show_test_js = """
-    () => {
-        document.getElementById('validation_nav_container').style.display = 'none';
-        document.getElementById('test_nav_container').style.display = 'block';
-    }
-    """
-
-    # Assign the pure JS functions to the select events. No Python `fn` is needed.
-    validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
-    test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
+    build_category_page(CATEGORY_NAME, LIT_DESCRIPTION)
ui_components.py
CHANGED
@@ -159,26 +159,34 @@ tooling_html = " ".join(tooling_html_items)
 # Your final legend_markdown string (the structure of this does not change)
 legend_markdown = f"""
 <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 24px; font-size: 14px; padding-bottom: 8px;">
-
+
     <div> <!-- Container for the Pareto section -->
-        <b>Pareto</b
+        <b>Pareto</b><span class="tooltip-icon" data-tooltip="
+        •Pareto: Indicates if agent is on the Pareto frontier
+        ">ⓘ</span>
         <div style="padding-top: 4px;"><span>📈 On frontier</span></div>
     </div>
 
     <div> <!-- Container for the Openness section -->
-        <b>Agent Openness</b
+        <b>Agent Openness</b><span class="tooltip-icon" data-tooltip="
+        •Closed: No API or code available
+        •API Available: API available, but no code
+        •Open Source: Code available, but no weights
+        •Open Source + Open Weights: Code and weights available
+        ">ⓘ</span>
         <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{openness_html}</div>
     </div>
 
     <div> <!-- Container for the Tooling section -->
-        <b>Agent Tooling</b
+        <b>Agent Tooling</b><span class="tooltip-icon" data-tooltip="
+        • Standard: Standard Approach used by the agent
+        • Custom with Standard Search: Standard search used by the agent
+        • Fully Custom: Fully custom tools used by the agent
+        ">ⓘ</span>
         <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 16px; margin-top: 4px;">{tooling_html}</div>
     </div>
-
-
-    • Openness: Level of accessibility to model and implementation
-    • Agent Tooling: Approach used by the agent
-    • Agent: Name of the AI agent
+
+    <div><b>Column Descriptions</b><span class="tooltip-icon" data-tooltip="
     • Overall Score: Performance across all benchmarks
     • Overall Cost: Cost per task in USD
     • Literature Understanding Score: Performance on scientific literature tasks