openhands committed
Commit 376500e
Parent: 855423e

Rename columns: Agent→OpenHands Version, Models Used→Language Model, remove Submitter


- Updated simple_data_loader.py to use new column names
- Updated leaderboard_transformer.py with fixed_mappings and scatter plot references
- Updated ui_components.py tooltips and column handling
- Changed metadata.json: agent_name→agent_version across all mock data
- Added 'Bug Fixing' tag to swe-bench benchmarks in scores.json
- Removed 'Submitter' column from base_cols and column references

Co-authored-by: openhands <[email protected]>
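
Before the diffs, here is a minimal sketch (not part of this commit) of the display-column rename described above; the old→new pairs come from the commit message, while the helper function itself is hypothetical.

# Hypothetical helper illustrating the rename; only the mapping itself
# is taken from this commit ("Submitter" gets no new name and is dropped).
OLD_TO_NEW = {
    "Agent": "OpenHands Version",
    "Models Used": "Language Model",
    "Submitter": None,  # column removed outright
}

def rename_columns(columns):
    """Map old display names to the new ones, dropping removed columns."""
    renamed = []
    for col in columns:
        new = OLD_TO_NEW.get(col, col)
        if new is not None:
            renamed.append(new)
    return renamed

print(rename_columns(["Agent", "Submitter", "Models Used", "Source"]))
# -> ['OpenHands Version', 'Language Model', 'Source']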

leaderboard_transformer.py CHANGED
@@ -103,7 +103,8 @@ def _pretty_column_name(raw_col: str) -> str:
     # Case 1: Handle fixed, special-case mappings first.
     fixed_mappings = {
         'id': 'id',
-        'Agent': 'Agent Version',
+        'Openhands version': 'OpenHands Version',
+        'Language model': 'Language Model',
         'Agent description': 'Agent Description',
         'Submission date': 'Date',
         'Overall': 'Overall Score',
@@ -255,7 +256,7 @@ class DataTransformer:
         df_view = df_sorted.copy()
 
         # --- 3. Add Columns for Agent Openness ---
-        base_cols = ["id","Agent","Submitter","Models Used","Source"]
+        base_cols = ["id","OpenHands Version","Language Model","Source"]
         new_cols = ["Openness"]
         ending_cols = ["Date", "Logs"]
 
@@ -304,7 +305,7 @@ class DataTransformer:
             data=df_view,
             x=primary_cost_col,
             y=primary_score_col,
-            agent_col="Agent",
+            agent_col="OpenHands Version",
             name=primary_metric
         )
         # Use a consistent key for easy retrieval later
@@ -347,7 +348,7 @@ def _plot_scatter_plotly(
 
     x_col_to_use = x
     y_col_to_use = y
-    llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used"
+    llm_base = data["Language Model"] if "Language Model" in data.columns else "Language Model"
 
     # --- Section 2: Data Preparation---
     required_cols = [y_col_to_use, agent_col, "Openness"]
@@ -432,7 +433,7 @@ def _plot_scatter_plotly(
 def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
     """
     Builds the complete HTML string for the plot's hover tooltip.
-    Formats the 'Models Used' column as a bulleted list if multiple.
+    Formats the 'Language Model' column as a bulleted list if multiple.
     """
     h_pad = " "
     parts = ["<br>"]
@@ -447,18 +448,18 @@ def _plot_scatter_plotly(
 
     # Add extra vertical space (line spacing) before the next section
     parts.append("<br>")
-    # Clean and format Models Used column
-    llm_base_value = row['Models Used']
+    # Clean and format Language Model column
+    llm_base_value = row['Language Model']
     llm_base_value = clean_llm_base_list(llm_base_value)
     if isinstance(llm_base_value, list) and llm_base_value:
-        parts.append(f"{h_pad}Models Used:{h_pad}<br>")
+        parts.append(f"{h_pad}Language Model:{h_pad}<br>")
         # Create a list of padded bullet points
        list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
         # Join them with line breaks
         parts.append('<br>'.join(list_items))
     else:
         # Handle the non-list case with padding
-        parts.append(f"{h_pad}Models Used: <b>{llm_base_value}</b>{h_pad}")
+        parts.append(f"{h_pad}Language Model: <b>{llm_base_value}</b>{h_pad}")
     # Add a final line break for bottom "padding"
     parts.append("<br>")
     # Join all the parts together into the final HTML string
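
For context on the first hunk, a minimal sketch of how the renamed fixed_mappings resolve a raw column name. Only the mapping entries are taken from the diff; the title-case fallback is an assumption standing in for the function's remaining cases.

# Sketch only: mapping entries mirror the hunk above; the fallback is assumed.
fixed_mappings = {
    'id': 'id',
    'Openhands version': 'OpenHands Version',
    'Language model': 'Language Model',
    'Agent description': 'Agent Description',
    'Submission date': 'Date',
    'Overall': 'Overall Score',
}

def pretty_column_name(raw_col: str) -> str:
    # Case 1: fixed, special-case mappings first.
    if raw_col in fixed_mappings:
        return fixed_mappings[raw_col]
    # Assumed fallback for this sketch: title-case anything else.
    return raw_col.title()

assert pretty_column_name('Openhands version') == 'OpenHands Version'
assert pretty_column_name('Language model') == 'Language Model'
assert pretty_column_name('source') == 'Source'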
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/metadata.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "agent_name": "OpenHands CodeAct v2.1",
+  "agent_version": "OpenHands CodeAct v2.1",
   "agent_version": "OpenHands CodeAct v2.1",
   "model": "claude-3-5-sonnet-20241022",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_claude_3_5_sonnet_20241022/scores.json CHANGED
@@ -6,7 +6,8 @@
     "total_cost": 34.15,
     "total_runtime": 541.5,
     "tags": [
-      "swe-bench"
+      "swe-bench",
+      "Bug Fixing"
     ]
   },
   {
@@ -16,7 +17,8 @@
     "total_cost": 31.05,
     "total_runtime": 510.5,
     "tags": [
-      "swe-bench-multimodal"
+      "swe-bench-multimodal",
+      "Bug Fixing"
     ]
   },
   {
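
To show what the new tag enables, a small hedged sketch (toy data shaped like the hunks above, not repo code) that selects the benchmark entries carrying the 'Bug Fixing' tag.

# Hypothetical helper: keep only entries tagged "Bug Fixing".
def bug_fixing_entries(score_entries):
    return [e for e in score_entries if "Bug Fixing" in e.get("tags", [])]

# Toy entries mirroring the two swe-bench rows above.
entries = [
    {"total_cost": 34.15, "total_runtime": 541.5, "tags": ["swe-bench", "Bug Fixing"]},
    {"total_cost": 31.05, "total_runtime": 510.5, "tags": ["swe-bench-multimodal", "Bug Fixing"]},
]
print(len(bug_fixing_entries(entries)))  # -> 2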
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/metadata.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "agent_name": "SWE-Agent",
+  "agent_version": "SWE-Agent",
   "agent_version": "SWE-Agent",
   "model": "claude-3-opus-20240229",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_claude_3_opus_20240229/scores.json CHANGED
@@ -6,7 +6,8 @@
     "total_cost": 24.9,
     "total_runtime": 449.0,
     "tags": [
-      "swe-bench"
+      "swe-bench",
+      "Bug Fixing"
     ]
   },
   {
@@ -16,7 +17,8 @@
     "total_cost": 22.85,
     "total_runtime": 428.5,
     "tags": [
-      "swe-bench-multimodal"
+      "swe-bench-multimodal",
+      "Bug Fixing"
     ]
   },
   {
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/metadata.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "agent_name": "AutoCodeRover",
+  "agent_version": "AutoCodeRover",
   "agent_version": "AutoCodeRover",
   "model": "gpt-4-turbo-2024-04-09",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4_turbo_2024_04_09/scores.json CHANGED
@@ -6,7 +6,8 @@
     "total_cost": 29.35,
     "total_runtime": 493.5,
     "tags": [
-      "swe-bench"
+      "swe-bench",
+      "Bug Fixing"
     ]
   },
   {
@@ -16,7 +17,8 @@
     "total_cost": 27.1,
     "total_runtime": 471.0,
     "tags": [
-      "swe-bench-multimodal"
+      "swe-bench-multimodal",
+      "Bug Fixing"
     ]
   },
   {
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/metadata.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "agent_name": "OpenHands CodeAct v2.0",
+  "agent_version": "OpenHands CodeAct v2.0",
   "agent_version": "OpenHands CodeAct v2.0",
   "model": "gpt-4o-2024-11-20",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4o_2024_11_20/scores.json CHANGED
@@ -6,7 +6,8 @@
     "total_cost": 32.55,
     "total_runtime": 525.5,
     "tags": [
-      "swe-bench"
+      "swe-bench",
+      "Bug Fixing"
     ]
   },
   {
@@ -16,7 +17,8 @@
     "total_cost": 29.75,
     "total_runtime": 497.5,
     "tags": [
-      "swe-bench-multimodal"
+      "swe-bench-multimodal",
+      "Bug Fixing"
     ]
   },
   {
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/metadata.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "agent_name": "Agentless",
+  "agent_version": "Agentless",
   "agent_version": "Agentless",
   "model": "gpt-4o-mini-2024-07-18",
   "openness": "closed_api_available",
mock_results/1.0.0-dev1/results/20251124_gpt_4o_mini_2024_07_18/scores.json CHANGED
@@ -6,7 +6,8 @@
     "total_cost": 26.25,
     "total_runtime": 462.5,
     "tags": [
-      "swe-bench"
+      "swe-bench",
+      "Bug Fixing"
     ]
   },
   {
@@ -16,7 +17,8 @@
     "total_cost": 24.45,
     "total_runtime": 444.5,
     "tags": [
-      "swe-bench-multimodal"
+      "swe-bench-multimodal",
+      "Bug Fixing"
     ]
   },
   {
simple_data_loader.py CHANGED
@@ -85,7 +85,7 @@ class SimpleLeaderboardViewer:
             # Create one record per benchmark (mimicking old JSONL format)
             for score_entry in scores:
                 record = {
-                    'agent_name': metadata.get('agent_name', 'Unknown'),
+                    'agent_version': metadata.get('agent_version', 'Unknown'),
                     'llm_base': metadata.get('model', 'unknown'),
                     'openness': metadata.get('openness', 'unknown'),
                     'submission_time': metadata.get('submission_time', ''),
@@ -119,8 +119,8 @@ class SimpleLeaderboardViewer:
         # Group by agent to aggregate results across datasets
         transformed_records = []
 
-        for agent_name in df['agent_name'].unique():
-            agent_records = df[df['agent_name'] == agent_name]
+        for agent_version in df['agent_version'].unique():
+            agent_records = df[df['agent_version'] == agent_version]
 
             # Build a single record for this agent
             first_record = agent_records.iloc[0]
@@ -132,12 +132,12 @@ class SimpleLeaderboardViewer:
 
             record = {
                 # Core agent info - use final display names
-                'agent': agent_name,  # Will become "Agent Version" after prettifying
-                'models used': first_record['llm_base'],  # Will become "Model"
+                'openhands version': agent_version,  # Will become "OpenHands Version"
+                'language model': first_record['llm_base'],  # Will become "Language Model"
                 'openness': normalized_openness,  # Will become "Openness" (simplified to "open" or "closed")
                 'date': first_record['submission_time'],  # Will become "Date"
                 # Additional columns expected by the transformer
-                'id': first_record.get('id', agent_name),  # Will become "Id"
+                'id': first_record.get('id', agent_version),  # Will become "Id"
                 'source': first_record.get('source', ''),  # Will become "Source"
                 'logs': first_record.get('logs', ''),  # Will become "Logs"
             }
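
A self-contained sketch of the aggregation pattern above, using toy data rather than the repo's loader: one leaderboard record per unique agent_version, with the renamed metadata key and the new lowercase column names that later prettify to "OpenHands Version" and "Language Model".

import pandas as pd

# Toy per-benchmark records shaped like the loader's intermediate frame;
# the values are illustrative, not real results.
df = pd.DataFrame([
    {"agent_version": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "score": 0.5},
    {"agent_version": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "score": 0.6},
    {"agent_version": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "score": 0.4},
])

records = []
for agent_version in df["agent_version"].unique():
    agent_records = df[df["agent_version"] == agent_version]
    first = agent_records.iloc[0]
    records.append({
        "openhands version": agent_version,   # becomes "OpenHands Version"
        "language model": first["llm_base"],  # becomes "Language Model"
    })

print(records)  # one record per agent_version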
ui_components.py CHANGED
@@ -147,9 +147,8 @@ def build_descriptions_tooltip_content(table) -> str:
     """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table."""
     if table == "Overall":
         return """
-            <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
-            <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-            <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
             <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
             <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
@@ -165,9 +164,8 @@ def build_descriptions_tooltip_content(table) -> str:
         """
     elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
         return f"""
-            <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
-            <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-            <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
             <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
             <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
@@ -178,9 +176,8 @@
     else:
         # Fallback for any other table type, e.g., individual benchmarks
         return f"""
-            <div class="tooltip-description-item"><b>Agent:</b> Name of the evaluated agent.</div>
-            <div class="tooltip-description-item"><b>Submitter:</b> Organization or individual who submitted the agent for evaluation.</div>
-            <div class="tooltip-description-item"><b>Models Used:</b> Model(s) used by the agent. Hover over ⓘ to view all.</div>
+            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
+            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
             <div class="tooltip-description-item"><b>Benchmark Attempted:</b> Indicates whether the agent attempted this benchmark.</div>
             <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
             <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
@@ -392,13 +389,13 @@ def create_leaderboard_display(
         if "Score" in col:
             df_view = format_score_column(df_view, col)
     scatter_plot = plots_dict.get('scatter_plot', go.Figure())
-    #Make pretty and format the Models Used column
-    df_view['Models Used'] = df_view['Models Used'].apply(clean_llm_base_list)
-    df_view['Models Used'] = df_view['Models Used'].apply(format_llm_base_with_html)
-    # append the repro url to the end of the agent name
+    #Make pretty and format the Language Model column
+    df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
+    df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
+    # append the repro url to the end of the OpenHands Version
     if 'Source' in df_view.columns:
-        df_view['Agent'] = df_view.apply(
-            lambda row: f"{row['Agent']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['Agent'],
+        df_view['OpenHands Version'] = df_view.apply(
+            lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
             axis=1
         )
 
@@ -416,7 +413,7 @@
     for col in df_headers:
         if col == "Logs" or "Cost" in col or "Score" in col:
             df_datatypes.append("markdown")
-        elif col in ["Agent","Icon","Models Used", "Pareto"]:
+        elif col in ["OpenHands Version","Icon","Language Model", "Pareto"]:
             df_datatypes.append("html")
         else:
             df_datatypes.append("str")
@@ -508,7 +505,7 @@ def create_benchmark_details_display(
     benchmark_cost_col = f"{benchmark_name} Cost"
 
     # Define the columns needed for the detailed table
-    table_cols = ['Agent','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used']
+    table_cols = ['OpenHands Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']
 
     # Filter to only columns that actually exist in the full dataframe
     existing_table_cols = [col for col in table_cols if col in full_df.columns]
@@ -540,13 +537,13 @@
 
     benchmark_table_df['Icon'] = benchmark_table_df.apply(get_openness_icon_html, axis=1)
 
-    #Make pretty and format the Models Used column
-    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list)
-    benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(format_llm_base_with_html)
-    # append the repro url to the end of the agent name
+    #Make pretty and format the Language Model column
+    benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
+    benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)
+    # append the repro url to the end of the OpenHands Version
     if 'Source' in benchmark_table_df.columns:
-        benchmark_table_df['Agent'] = benchmark_table_df.apply(
-            lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'],
+        benchmark_table_df['OpenHands Version'] = benchmark_table_df.apply(
+            lambda row: f"{row['OpenHands Version']} {row['Source']}" if row['Source'] else row['OpenHands Version'],
             axis=1
         )
 
@@ -573,9 +570,8 @@
     desired_cols_in_order = [
         'Pareto',
         'Icon',
-        'Agent',
-        'Submitter',
-        'Models Used',
+        'OpenHands Version',
+        'Language Model',
         'Attempted Benchmark',
         benchmark_score_col,
         benchmark_cost_col,
@@ -597,7 +593,7 @@
     for col in df_headers:
        if "Logs" in col or "Cost" in col or "Score" in col:
            df_datatypes.append("markdown")
-        elif col in ["Agent", "Icon", "Models Used", "Pareto"]:
+        elif col in ["OpenHands Version", "Icon", "Language Model", "Pareto"]:
            df_datatypes.append("html")
        else:
            df_datatypes.append("str")
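
Finally, a small sketch (toy header list, not repo code) of the datatype assignment the renamed headers now flow through; the loop mirrors the logic in the last hunk above.

# Toy header list; the loop mirrors the datatype logic shown above.
df_headers = ["Pareto", "Icon", "OpenHands Version", "Language Model",
              "Overall Score", "Overall Cost", "Date", "Logs"]

df_datatypes = []
for col in df_headers:
    if col == "Logs" or "Cost" in col or "Score" in col:
        df_datatypes.append("markdown")   # score/cost/log cells render as markdown
    elif col in ["OpenHands Version", "Icon", "Language Model", "Pareto"]:
        df_datatypes.append("html")       # renamed columns keep their HTML rendering
    else:
        df_datatypes.append("str")

print(list(zip(df_headers, df_datatypes)))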