Amber Tanaka committed on
Commit
dcfd58f
·
unverified ·
1 Parent(s): ac9171f

Switch default tab from validation to test (#12)

Browse files
Files changed (7) hide show
  1. c_and_e.py +22 -23
  2. data_analysis.py +21 -21
  3. e2e.py +21 -20
  4. leaderboard_transformer.py +2 -2
  5. literature_understanding.py +22 -22
  6. main_page.py +14 -16
  7. ui_components.py +10 -2
c_and_e.py CHANGED
@@ -12,15 +12,33 @@ with gr.Blocks() as demo:
12
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
  test_df, test_tag_map = get_full_leaderboard_data("test")
14
  gr.Markdown(CODE_EXECUTION_DESCRIPTION, elem_id="category-intro")
15
- with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
16
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
- with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
18
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
19
 
20
 
21
  # --- This page now has two main sections: Validation and Test ---
22
  with gr.Tabs():
23
- with gr.Tab("Results: Validation") as validation_tab:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # 1. Load all necessary data for the "validation" split ONCE.
25
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
26
 
@@ -42,29 +60,11 @@ with gr.Blocks() as demo:
42
  else:
43
  gr.Markdown("No data available for validation split.")
44
 
45
- with gr.Tab("Results: Test") as test_tab:
46
- # Repeat the process for the "test" split
47
- test_df, test_tag_map = get_full_leaderboard_data("test")
48
-
49
- if not test_df.empty:
50
- create_leaderboard_display(
51
- full_df=test_df,
52
- tag_map=test_tag_map,
53
- category_name=CATEGORY_NAME,
54
- split_name="test"
55
- )
56
- create_benchmark_details_display(
57
- full_df=test_df,
58
- tag_map=test_tag_map,
59
- category_name=CATEGORY_NAME
60
- )
61
- else:
62
- gr.Markdown("No data available for test split.")
63
-
64
  show_validation_js = """
65
  () => {
66
  document.getElementById('validation_nav_container').style.display = 'block';
67
  document.getElementById('test_nav_container').style.display = 'none';
 
68
  }
69
  """
70
 
@@ -73,7 +73,6 @@ with gr.Blocks() as demo:
73
  () => {
74
  document.getElementById('validation_nav_container').style.display = 'none';
75
  document.getElementById('test_nav_container').style.display = 'block';
76
- setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
77
  }
78
  """
79
 
 
12
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
  test_df, test_tag_map = get_full_leaderboard_data("test")
14
  gr.Markdown(CODE_EXECUTION_DESCRIPTION, elem_id="category-intro")
15
+ with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
16
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
+ with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
18
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
19
 
20
 
21
  # --- This page now has two main sections: Validation and Test ---
22
  with gr.Tabs():
23
+ with gr.Tab("Results: Test Set") as test_tab:
24
+ # Load and render the "test" split; this tab is listed first, so it is the one shown by default
25
+ test_df, test_tag_map = get_full_leaderboard_data("test")
26
+
27
+ if not test_df.empty:
28
+ create_leaderboard_display(
29
+ full_df=test_df,
30
+ tag_map=test_tag_map,
31
+ category_name=CATEGORY_NAME,
32
+ split_name="test"
33
+ )
34
+ create_benchmark_details_display(
35
+ full_df=test_df,
36
+ tag_map=test_tag_map,
37
+ category_name=CATEGORY_NAME
38
+ )
39
+ else:
40
+ gr.Markdown("No data available for test split.")
41
+ with gr.Tab("Results: Validation Set") as validation_tab:
42
  # 1. Load all necessary data for the "validation" split ONCE.
43
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
44
 
 
60
  else:
61
  gr.Markdown("No data available for validation split.")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  show_validation_js = """
64
  () => {
65
  document.getElementById('validation_nav_container').style.display = 'block';
66
  document.getElementById('test_nav_container').style.display = 'none';
67
+ setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
68
  }
69
  """
70
 
 
73
  () => {
74
  document.getElementById('validation_nav_container').style.display = 'none';
75
  document.getElementById('test_nav_container').style.display = 'block';
 
76
  }
77
  """
78
 
data_analysis.py CHANGED
@@ -12,12 +12,30 @@ with gr.Blocks() as demo:
12
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
  test_df, test_tag_map = get_full_leaderboard_data("test")
14
  gr.Markdown(DATA_ANALYSIS_DESCRIPTION, elem_id="category-intro")
15
- with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
16
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
- with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
18
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
19
  # --- This page now has two main sections: Validation and Test ---
20
  with gr.Tabs():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  with gr.Tab("Results: Validation") as validation_tab:
22
  # 1. Load all necessary data for the "validation" split ONCE.
23
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
@@ -40,29 +58,12 @@ with gr.Blocks() as demo:
40
  else:
41
  gr.Markdown("No data available for validation split.")
42
 
43
- with gr.Tab("Results: Test") as test_tab:
44
- # Repeat the process for the "test" split
45
- test_df, test_tag_map = get_full_leaderboard_data("test")
46
-
47
- if not test_df.empty:
48
- create_leaderboard_display(
49
- full_df=test_df,
50
- tag_map=test_tag_map,
51
- category_name=CATEGORY_NAME,
52
- split_name="test"
53
- )
54
- create_benchmark_details_display(
55
- full_df=test_df,
56
- tag_map=test_tag_map,
57
- category_name=CATEGORY_NAME
58
- )
59
- else:
60
- gr.Markdown("No data available for test split.")
61
 
62
  show_validation_js = """
63
  () => {
64
  document.getElementById('validation_nav_container').style.display = 'block';
65
  document.getElementById('test_nav_container').style.display = 'none';
 
66
  }
67
  """
68
 
@@ -71,7 +72,6 @@ with gr.Blocks() as demo:
71
  () => {
72
  document.getElementById('validation_nav_container').style.display = 'none';
73
  document.getElementById('test_nav_container').style.display = 'block';
74
- setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
75
  }
76
  """
77
 
 
12
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
  test_df, test_tag_map = get_full_leaderboard_data("test")
14
  gr.Markdown(DATA_ANALYSIS_DESCRIPTION, elem_id="category-intro")
15
+ with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
16
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
+ with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
18
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
19
  # --- This page now has two main sections: Validation and Test ---
20
  with gr.Tabs():
21
+ with gr.Tab("Results: Test") as test_tab:
22
+ # Load and render the "test" split; this tab is listed first, so it is the one shown by default
23
+ test_df, test_tag_map = get_full_leaderboard_data("test")
24
+
25
+ if not test_df.empty:
26
+ create_leaderboard_display(
27
+ full_df=test_df,
28
+ tag_map=test_tag_map,
29
+ category_name=CATEGORY_NAME,
30
+ split_name="test"
31
+ )
32
+ create_benchmark_details_display(
33
+ full_df=test_df,
34
+ tag_map=test_tag_map,
35
+ category_name=CATEGORY_NAME
36
+ )
37
+ else:
38
+ gr.Markdown("No data available for test split.")
39
  with gr.Tab("Results: Validation") as validation_tab:
40
  # 1. Load all necessary data for the "validation" split ONCE.
41
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
 
58
  else:
59
  gr.Markdown("No data available for validation split.")
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  show_validation_js = """
63
  () => {
64
  document.getElementById('validation_nav_container').style.display = 'block';
65
  document.getElementById('test_nav_container').style.display = 'none';
66
+ setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
67
  }
68
  """
69
 
 
72
  () => {
73
  document.getElementById('validation_nav_container').style.display = 'none';
74
  document.getElementById('test_nav_container').style.display = 'block';
 
75
  }
76
  """
77
 
e2e.py CHANGED
@@ -12,12 +12,30 @@ with gr.Blocks() as demo:
12
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
  test_df, test_tag_map = get_full_leaderboard_data("test")
14
  gr.Markdown(DISCOVERY_DESCRIPTION, elem_id="category-intro")
15
- with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
16
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
- with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
18
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
19
  # --- This page now has two main sections: Validation and Test ---
20
  with gr.Tabs():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  with gr.Tab("Results: Validation") as validation_tab:
22
  # 1. Load all necessary data for the "validation" split ONCE.
23
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
@@ -40,28 +58,12 @@ with gr.Blocks() as demo:
40
  else:
41
  gr.Markdown("No data available for validation split.")
42
 
43
- with gr.Tab("Results: Test") as test_tab:
44
- # Repeat the process for the "test" split
45
- test_df, test_tag_map = get_full_leaderboard_data("test")
46
 
47
- if not test_df.empty:
48
- create_leaderboard_display(
49
- full_df=test_df,
50
- tag_map=test_tag_map,
51
- category_name=CATEGORY_NAME,
52
- split_name="test"
53
- )
54
- create_benchmark_details_display(
55
- full_df=test_df,
56
- tag_map=test_tag_map,
57
- category_name=CATEGORY_NAME
58
- )
59
- else:
60
- gr.Markdown("No data available for test split.")
61
  show_validation_js = """
62
  () => {
63
  document.getElementById('validation_nav_container').style.display = 'block';
64
  document.getElementById('test_nav_container').style.display = 'none';
 
65
  }
66
  """
67
 
@@ -70,7 +72,6 @@ with gr.Blocks() as demo:
70
  () => {
71
  document.getElementById('validation_nav_container').style.display = 'none';
72
  document.getElementById('test_nav_container').style.display = 'block';
73
- setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
74
  }
75
  """
76
 
 
12
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
13
  test_df, test_tag_map = get_full_leaderboard_data("test")
14
  gr.Markdown(DISCOVERY_DESCRIPTION, elem_id="category-intro")
15
+ with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
16
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
17
+ with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
18
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
19
  # --- This page now has two main sections: Validation and Test ---
20
  with gr.Tabs():
21
+ with gr.Tab("Results: Test") as test_tab:
22
+ # Load and render the "test" split; this tab is listed first, so it is the one shown by default
23
+ test_df, test_tag_map = get_full_leaderboard_data("test")
24
+
25
+ if not test_df.empty:
26
+ create_leaderboard_display(
27
+ full_df=test_df,
28
+ tag_map=test_tag_map,
29
+ category_name=CATEGORY_NAME,
30
+ split_name="test"
31
+ )
32
+ create_benchmark_details_display(
33
+ full_df=test_df,
34
+ tag_map=test_tag_map,
35
+ category_name=CATEGORY_NAME
36
+ )
37
+ else:
38
+ gr.Markdown("No data available for test split.")
39
  with gr.Tab("Results: Validation") as validation_tab:
40
  # 1. Load all necessary data for the "validation" split ONCE.
41
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
 
58
  else:
59
  gr.Markdown("No data available for validation split.")
60
 
 
 
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  show_validation_js = """
63
  () => {
64
  document.getElementById('validation_nav_container').style.display = 'block';
65
  document.getElementById('test_nav_container').style.display = 'none';
66
+ setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
67
  }
68
  """
69
 
 
72
  () => {
73
  document.getElementById('validation_nav_container').style.display = 'none';
74
  document.getElementById('test_nav_container').style.display = 'block';
 
75
  }
76
  """
77
 
leaderboard_transformer.py CHANGED
@@ -414,7 +414,7 @@ def _plot_scatter_plotly(
414
  text=group['hover_text'],
415
  hoverinfo='text',
416
  marker=dict(
417
- color=color_map.get(category, 'grey'),
418
  symbol=group['shape_symbol'],
419
  size=10,
420
  opacity=0.8,
@@ -445,7 +445,7 @@ def _plot_scatter_plotly(
445
  name=shape_name,
446
  legendgroup="tooling_group",
447
  legendgrouptitle_text="Agent Tooling" if i == 0 else None,
448
- marker=dict(color='grey', symbol=shape_symbol, size=12)
449
  ))
450
 
451
  # --- Section 8: Configure Layout (Restored from your original code) ---
 
414
  text=group['hover_text'],
415
  hoverinfo='text',
416
  marker=dict(
417
+ color=color_map.get(category, 'black'),
418
  symbol=group['shape_symbol'],
419
  size=10,
420
  opacity=0.8,
 
445
  name=shape_name,
446
  legendgroup="tooling_group",
447
  legendgrouptitle_text="Agent Tooling" if i == 0 else None,
448
+ marker=dict(color='black', symbol=shape_symbol, size=12)
449
  ))
450
 
451
  # --- Section 8: Configure Layout (Restored from your original code) ---
literature_understanding.py CHANGED
@@ -13,15 +13,33 @@ with gr.Blocks() as demo:
13
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
14
  test_df, test_tag_map = get_full_leaderboard_data("test")
15
  gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
16
- with gr.Column(elem_id="validation_nav_container", visible=True) as validation_nav_container:
17
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
18
 
19
- with gr.Column(elem_id="test_nav_container", visible=False) as test_nav_container:
20
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
21
 
22
  # --- This page now has two main sections: Validation and Test ---
23
  with gr.Tabs():
24
- with gr.Tab("Results: Validation") as validation_tab:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # 1. Load all necessary data for the "validation" split ONCE.
26
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
27
 
@@ -43,29 +61,12 @@ with gr.Blocks() as demo:
43
  else:
44
  gr.Markdown("No data available for validation split.")
45
 
46
- with gr.Tab("Results: Test") as test_tab:
47
- # Repeat the process for the "test" split
48
- test_df, test_tag_map = get_full_leaderboard_data("test")
49
-
50
- if not test_df.empty:
51
- create_leaderboard_display(
52
- full_df=test_df,
53
- tag_map=test_tag_map,
54
- category_name=CATEGORY_NAME,
55
- split_name="test"
56
- )
57
- create_benchmark_details_display(
58
- full_df=test_df,
59
- tag_map=test_tag_map,
60
- category_name=CATEGORY_NAME
61
- )
62
- else:
63
- gr.Markdown("No data available for test split.")
64
 
65
  show_validation_js = """
66
  () => {
67
  document.getElementById('validation_nav_container').style.display = 'block';
68
  document.getElementById('test_nav_container').style.display = 'none';
 
69
  }
70
  """
71
 
@@ -74,7 +75,6 @@ with gr.Blocks() as demo:
74
  () => {
75
  document.getElementById('validation_nav_container').style.display = 'none';
76
  document.getElementById('test_nav_container').style.display = 'block';
77
- setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
78
  }
79
  """
80
 
 
13
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
14
  test_df, test_tag_map = get_full_leaderboard_data("test")
15
  gr.Markdown(LIT_DESCRIPTION, elem_id="category-intro")
16
+ with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container:
17
  create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME)
18
 
19
+ with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container:
20
  create_sub_navigation_bar(test_tag_map, CATEGORY_NAME)
21
 
22
  # --- This page now has two main sections: Validation and Test ---
23
  with gr.Tabs():
24
+ with gr.Tab("Results: Test Set") as test_tab:
25
+ # Load and render the "test" split; this tab is listed first, so it is the one shown by default
26
+ test_df, test_tag_map = get_full_leaderboard_data("test")
27
+
28
+ if not test_df.empty:
29
+ create_leaderboard_display(
30
+ full_df=test_df,
31
+ tag_map=test_tag_map,
32
+ category_name=CATEGORY_NAME,
33
+ split_name="test"
34
+ )
35
+ create_benchmark_details_display(
36
+ full_df=test_df,
37
+ tag_map=test_tag_map,
38
+ category_name=CATEGORY_NAME
39
+ )
40
+ else:
41
+ gr.Markdown("No data available for test split.")
42
+ with gr.Tab("Results: Validation Set") as validation_tab:
43
  # 1. Load all necessary data for the "validation" split ONCE.
44
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
45
 
 
61
  else:
62
  gr.Markdown("No data available for validation split.")
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  show_validation_js = """
66
  () => {
67
  document.getElementById('validation_nav_container').style.display = 'block';
68
  document.getElementById('test_nav_container').style.display = 'none';
69
+ setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);
70
  }
71
  """
72
 
 
75
  () => {
76
  document.getElementById('validation_nav_container').style.display = 'none';
77
  document.getElementById('test_nav_container').style.display = 'block';
 
78
  }
79
  """
80
 
main_page.py CHANGED
@@ -23,10 +23,20 @@ with gr.Blocks(fill_width=True) as demo:
23
  gr.Markdown(f"## Astabench {CATEGORY_NAME} Leaderboard")
24
 
25
  with gr.Tabs() as tabs:
26
- with gr.Tab("Results: Validation") as validation_tab:
 
 
 
 
 
 
 
 
 
 
 
27
  # 1. Load all necessary data for the "validation" split ONCE.
28
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
29
-
30
  # Check if data was loaded successfully before trying to display it
31
  if not validation_df.empty:
32
  # 2. Render the display by calling the factory with the loaded data.
@@ -39,28 +49,16 @@ with gr.Blocks(fill_width=True) as demo:
39
  else:
40
  gr.Markdown("No data available for validation split.")
41
 
42
- with gr.Tab("Results: Test") as test_tab:
43
- test_df, test_tag_map = get_full_leaderboard_data("test")
44
- if not test_df.empty:
45
- create_leaderboard_display(
46
- full_df=test_df,
47
- tag_map=test_tag_map,
48
- category_name=CATEGORY_NAME, # Use our constant
49
- split_name="test"
50
- )
51
- else:
52
- gr.Markdown("No data available for test split.")
53
-
54
  with gr.Accordion("📙 Citation", open=False):
55
  gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
56
 
57
 
58
  # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
59
- show_test_js = """
60
  () => {setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);}
61
  """
62
  # Assign the pure JS functions to the select events. No Python `fn` is needed.
63
- test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js)
64
 
65
  if __name__ == "__main__":
66
  demo.launch()
 
23
  gr.Markdown(f"## Astabench {CATEGORY_NAME} Leaderboard")
24
 
25
  with gr.Tabs() as tabs:
26
+ with gr.Tab("Results: Test Set") as test_tab:
27
+ test_df, test_tag_map = get_full_leaderboard_data("test")
28
+ if not test_df.empty:
29
+ create_leaderboard_display(
30
+ full_df=test_df,
31
+ tag_map=test_tag_map,
32
+ category_name=CATEGORY_NAME, # Use our constant
33
+ split_name="test"
34
+ )
35
+ else:
36
+ gr.Markdown("No data available for test split.")
37
+ with gr.Tab("Results: Validation Set") as validation_tab:
38
  # 1. Load all necessary data for the "validation" split ONCE.
39
  validation_df, validation_tag_map = get_full_leaderboard_data("validation")
 
40
  # Check if data was loaded successfully before trying to display it
41
  if not validation_df.empty:
42
  # 2. Render the display by calling the factory with the loaded data.
 
49
  else:
50
  gr.Markdown("No data available for validation split.")
51
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  with gr.Accordion("📙 Citation", open=False):
53
  gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False)
54
 
55
 
56
  # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots.
57
+ show_validation_js = """
58
  () => {setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);}
59
  """
60
  # Assign the pure JS functions to the select events. No Python `fn` is needed.
61
+ validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js)
62
 
63
  if __name__ == "__main__":
64
  demo.launch()
ui_components.py CHANGED
@@ -336,7 +336,7 @@ def create_leaderboard_display(
336
  datatype=df_datatypes,
337
  interactive=False,
338
  wrap=True,
339
- column_widths=[30, 30, 30, 250],
340
  elem_classes=["wrap-header-df"]
341
  )
342
 
@@ -527,7 +527,14 @@ def create_benchmark_details_display(
527
  df_datatypes.append("html")
528
  else:
529
  df_datatypes.append("str")
530
-
 
 
 
 
 
 
 
531
  # Create the scatter plot using the full data for context, but plotting benchmark metrics
532
  # This shows all agents on the same axis for better comparison.
533
  benchmark_plot = _plot_scatter_plotly(
@@ -547,6 +554,7 @@ def create_benchmark_details_display(
547
  datatype=df_datatypes,
548
  interactive=False,
549
  wrap=True,
 
550
  elem_classes=["wrap-header-df"]
551
  )
552
 
 
336
  datatype=df_datatypes,
337
  interactive=False,
338
  wrap=True,
339
+ column_widths=[30, 30, 30, 200],
340
  elem_classes=["wrap-header-df"]
341
  )
342
 
 
527
  df_datatypes.append("html")
528
  else:
529
  df_datatypes.append("str")
530
+ # Blank out the header labels for Pareto, Openness, and Agent Tooling (the columns themselves are kept)
531
+ header_rename_map = {
532
+ "Pareto": "",
533
+ "Openness": "",
534
+ "Agent Tooling": ""
535
+ }
536
+ # Apply the rename so those columns are displayed with empty header text.
537
+ benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)
538
  # Create the scatter plot using the full data for context, but plotting benchmark metrics
539
  # This shows all agents on the same axis for better comparison.
540
  benchmark_plot = _plot_scatter_plotly(
 
554
  datatype=df_datatypes,
555
  interactive=False,
556
  wrap=True,
557
+ column_widths=[40, 40, 40, 350],
558
  elem_classes=["wrap-header-df"]
559
  )
560