Spaces:

OpenHands
/

openhands-index

Running

File size: 32,925 Bytes

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import os
import base64

from huggingface_hub import HfApi

import aliases
from simple_data_loader import SimpleLeaderboardViewer
from leaderboard_transformer import (
    DataTransformer,
    transform_raw_dataframe,
    create_pretty_tag_map,
    INFORMAL_TO_FORMAL_NAME_MAP,
    _plot_scatter_plotly,
    format_cost_column,
    format_score_column,
    get_pareto_df,
    clean_llm_base_list,
)
from config import (
    CONFIG_NAME,
    EXTRACTED_DATA_DIR,
    IS_INTERNAL,
    RESULTS_DATASET,
)
from content import (
    create_gradio_anchor_id,
    format_error,
    get_benchmark_description,
    hf_uri_to_web_url,
    hyperlink,
    SCATTER_DISCLAIMER,
)

# Hugging Face Hub client, created once at import time.
api = HfApi()
# Ensure the extracted-data directory exists (no error if it is already there).
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True)
# Simplified icon map (no tooling distinction, only openness).
# Maps an openness label to the SVG asset drawn for it. The per-row openness
# icon builders in this file look their asset path up here.
OPENNESS_ICON_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN: "assets/ellipse-pink.svg",
    aliases.CANONICAL_OPENNESS_CLOSED: "assets/ellipse-yellow.svg",
}

# Add aliases: each alias listed in aliases.OPENNESS_ALIASES reuses the icon of
# its canonical label. A canonical key that is absent from the map above raises
# KeyError at import time.
for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items():
    for openness_alias in openness_aliases:
        OPENNESS_ICON_MAP[openness_alias] = OPENNESS_ICON_MAP[canonical_openness]


# Legend metadata for the two canonical openness labels: the SVG asset path and
# the human-readable description shown next to it in the legends/tooltips.
OPENNESS_SVG_MAP = {
    aliases.CANONICAL_OPENNESS_OPEN: {
        "path": "assets/ellipse-pink.svg",
        "description": "Open source model"
    },
    aliases.CANONICAL_OPENNESS_CLOSED: {
        "path": "assets/ellipse-yellow.svg",
        "description": "Closed source model"
    },
}

def get_svg_as_data_uri(path: str) -> str:
    """Return the SVG file at *path* as a ``data:image/svg+xml;base64,...`` URI.

    A missing file is not fatal: a warning is printed and an empty string is
    returned so callers can decide whether to render anything.
    """
    try:
        with open(path, "rb") as handle:
            raw_bytes = handle.read()
    except FileNotFoundError:
        print(f"Warning: SVG file not found at {path}")
        return ""

    payload = base64.b64encode(raw_bytes).decode("utf-8")
    return "data:image/svg+xml;base64," + payload

def create_svg_html(value, svg_map):
    """
    Build a bare ``<img>`` tag (icon only, no caption) for *value*.

    *svg_map* maps a value either to an asset path string or to a dict holding
    the path under ``"path"``. An empty string is returned when *value* is
    missing/NaN, is not a key of *svg_map*, or its SVG could not be loaded.
    The output is plain enough to be used inside a gr.DataFrame cell.
    """
    if pd.isna(value) or value not in svg_map:
        return ""

    entry = svg_map[value]
    # Entries may be a plain path (old format) or a {"path": ...} dict (new format).
    icon_path = entry["path"] if isinstance(entry, dict) else entry

    data_uri = get_svg_as_data_uri(icon_path)
    if not data_uri:
        return ""
    return f'<img src="{data_uri}" style="width: 16px; height: 16px; vertical-align: middle;" alt="{value}" title="{value}">'

def build_openness_tooltip_content() -> str:
    """
    Build the HTML for the "Model Openness" tooltip: the ⓘ trigger plus a card
    listing every entry of OPENNESS_SVG_MAP with its icon and description.
    """
    def render_item(label, meta):
        # Resolve the icon first, then the description, one legend row per entry.
        icon_src = get_svg_as_data_uri(meta["path"])
        blurb = meta["description"]
        return f"""
            <div class="tooltip-legend-item">
                <img src="{icon_src}" alt="{label}">
                <div>
                    <strong>{label}</strong>
                    <span>{blurb}</span>
                </div>
            </div>
        """

    joined_items = "".join(
        render_item(label, meta) for label, meta in OPENNESS_SVG_MAP.items()
    )

    return f"""<span class="tooltip-icon-legend">
        ⓘ
        <span class="tooltip-card">
            <h3>Model Openness</h3>
            <p class="tooltip-description">Indicates whether the language model is open source or closed source.</p>
            <div class="tooltip-items-container">{joined_items}</div>
        </span>
    </span>"""


def build_pareto_tooltip_content() -> str:
    """Build the inner HTML of the "On Pareto Frontier" tooltip card, trophy icon included."""
    icon_src = get_svg_as_data_uri("assets/trophy.svg")
    icon_tag = (
        '<img src="' + icon_src + '" '
        'style="width: 25px; height: 25px; vertical-align: middle;">'
    )
    return f"""
        <h3>On Pareto Frontier</h3>
        <p class="tooltip-description">The Pareto frontier represents the best balance between score and cost.</p>
        <p class="tooltip-description">Agents on the frontier either:</p>
        <ul class="tooltip-sub-list">
            <li>Offer the lowest cost for a given performance, or</li>
            <li>Deliver the best performance at a given cost.</li>
        </ul>
        <div class="tooltip-description" style="margin-top: 12px; display: flex; align-items: center;">
            <span>These agents are marked with this icon:</span>
            <span>{icon_tag}</span>
        </div>
    """




def build_descriptions_tooltip_content(table) -> str:
    """
    Build the inner HTML of the "Column Descriptions" tooltip card.

    Args:
        table: Name of the table the legend belongs to. ``"Overall"`` and the
            five category names get dedicated copy; any other value is treated
            as the name of an individual benchmark.

    Returns:
        A string of ``tooltip-description-item`` divs, one per table column.
    """
    category_tables = (
        "Bug Fixing",
        "Frontend Development",
        "App Creation",
        "Test Generation",
        "Information Gathering",
    )

    if table == "Overall":
        return """
            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the five category-level average scores. Each category contributes equally.</div>
            <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
            <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
            <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
            <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
            <div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per problem (USD) across Frontend Development benchmarks.</div>
            <div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
            <div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per problem (USD) across App Creation benchmarks.</div>
            <div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
            <div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per problem (USD) across Test Generation benchmarks.</div>
            <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
            <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per problem (USD) across Information Gathering benchmarks.</div>
            <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """

    if table in category_tables:
        return f"""
            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>{table} Score:</b> Macro-average score across {table} benchmarks.</div>
            <div class="tooltip-description-item"><b>{table} Cost:</b> Macro-average cost per problem (USD) across {table} benchmarks.</div>
            <div class="tooltip-description-item"><b>Benchmark Score:</b> Average (mean) score on the benchmark.</div>
            <div class="tooltip-description-item"><b>Benchmark Cost:</b> Average (mean) cost per problem (USD) on the benchmark.</div>
            <div class="tooltip-description-item"><b>Benchmarks Attempted:</b> Number of benchmarks attempted in this category (e.g., 3/5).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """

    # Fallback for any other table type, e.g., individual benchmarks.
    # The label below must match the 'Attempted Benchmark' column header that the
    # per-benchmark table in this file actually renders.
    return f"""
            <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
            <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
            <div class="tooltip-description-item"><b>Attempted Benchmark:</b> Indicates whether the agent attempted this benchmark.</div>
            <div class="tooltip-description-item"><b>{table} Score:</b> Score achieved by the agent on this benchmark.</div>
            <div class="tooltip-description-item"><b>{table} Cost:</b> Cost incurred by the agent to solve this benchmark (in USD).</div>
            <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
        """

# Table legend: one entry per openness category, icon followed by its name.
openness_html_items = []
for name, info in OPENNESS_SVG_MAP.items():
    uri = get_svg_as_data_uri(info["path"])
    # Every entry is its own flexbox container so icon and label stay aligned.
    item_parts = [
        '<div style="display: flex; align-items: center; white-space: nowrap;">',
        f'<img src="{uri}" alt="{name}" title="{name}" style="width:16px; height:16px; margin-right: 4px; flex-shrink: 0;">',
        f'<span>{name}</span>',
        '</div>',
    ]
    openness_html_items.append("".join(item_parts))
openness_html = " ".join(openness_html_items)

# The tooltip cards do not depend on the table, so they are built once at import time.
pareto_tooltip_content = build_pareto_tooltip_content()
openness_tooltip_content = build_openness_tooltip_content()

def create_legend_markdown(which_table: str) -> str:
    """
    Assemble the legend shown under a leaderboard table.

    The legend has three sections (Pareto, Model Openness, Column Descriptions),
    each with a hover tooltip. Only the column descriptions depend on
    *which_table*; the other two reuse the module-level prebuilt HTML.
    """
    column_help_html = build_descriptions_tooltip_content(which_table)
    trophy_src = get_svg_as_data_uri("assets/trophy.svg")
    return f"""
    <div style="display: flex; flex-wrap: wrap; align-items: flex-start; gap: 20px; font-size: 14px; padding-bottom: 8px;">
            
        <div> <!-- Container for the Pareto section -->
            <b>Pareto</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">{pareto_tooltip_content}</span>
            </span>
            <div class="table-legend-item">
                <img src="{trophy_src}" alt="On Frontier" style="width:20px; height:20px; margin-right: 4px; flex-shrink: 0;">
                <span>On frontier</span>
            </div>
        </div>
    
        <div> <!-- Container for the Openness section -->
            <b>Model Openness</b>
            {openness_tooltip_content}
            <div class="table-legend-item">{openness_html}</div>
        </div>
        
        <div><!-- Container for the Column Descriptions section -->
            <b>Column Descriptions</b>
            <span class="tooltip-icon-legend">
                ⓘ
                <span class="tooltip-card">
                    <h3>Column Descriptions</h3>
                    <div class="tooltip-items-container">{column_help_html}</div>
                </span>
            </span>
        </div>
    </div>
    """

# Plot legend: one entry (icon, name, description) per openness category.
openness_legend_items = []
for name, info in OPENNESS_SVG_MAP.items():
    uri = get_svg_as_data_uri(info["path"])
    if not uri:
        # The SVG could not be loaded; leave the entry out instead of showing a broken image.
        continue
    description = info["description"]
    openness_legend_items.append("".join([
        '<div class="plot-legend-item">',
        f'<img class="plot-legend-item-svg" src="{uri}" alt="{name}" title="{name}">',
        '<div class="plot-legend-item-text">',
        '<div>',
        f'<span>{name}</span>',
        '</div>',
        f'<span class="description">{description}</span>',
        '</div>',
        '</div>',
    ]))

# Resolve the static assets up front (logo first, then the Pareto marker).
plot_legend_logo_uri = get_svg_as_data_uri("assets/logo.svg")
plot_legend_pareto_uri = get_svg_as_data_uri("assets/pareto.svg")
plot_legend_openness_html = ''.join(openness_legend_items)

plot_legend_html = f"""
    <div class="plot-legend-container">
        <div id="plot-legend-logo">
            <img src="{plot_legend_logo_uri}">
        </div>
        <div style="margin-bottom: 16px;">
            <span class="plot-legend-category-heading">Pareto</span>
            <div style="margin-top: 8px;">
                <div class="plot-legend-item">
                    <img id="plot-legend-item-pareto-svg" class="plot-legend-item-svg" src="{plot_legend_pareto_uri}">
                    <span>On frontier</span>
                </div>
            </div>
        </div>
        <div>
            <span class="plot-legend-category-heading">Model Openness</span>
            <div style="margin-top: 8px;">
                {plot_legend_openness_html}
            </div>
        </div>
    </div>
"""

# --- Global State for Viewers (simple caching) ---
# Both dicts are keyed by split name and are filled by get_leaderboard_viewer_instance().
CACHED_VIEWERS = {}  # split -> viewer object (a DummyViewer when loading failed)
CACHED_TAG_MAPS = {}  # split -> tag map cached together with that viewer


class DummyViewer:
    """Stand-in viewer that is cached when loading a split fails.

    It offers the same ``_load()`` entry point this module calls on the real
    viewer, so callers do not need a separate error branch.
    """

    def __init__(self, error_df):
        # Frame handed back by _load(); the caller fills it with the error message.
        self._error_df = error_df

    def _load(self):
        """Return ``(error_df, {})``: the stored frame plus an empty tag map."""
        no_tags = {}
        return (self._error_df, no_tags)

def get_leaderboard_viewer_instance(split: str):
    """
    Return ``(viewer, tag_map)`` for *split*, loading the data at most once.

    The first call for a split builds a SimpleLeaderboardViewer and its pretty
    tag map. If anything goes wrong, the error is printed and a DummyViewer
    carrying the message (with a ``{"Overall": []}`` tag map) is used instead.
    Either outcome is cached, so a failed split is not retried in this process.
    """
    if split in CACHED_VIEWERS:
        # Already resolved (successfully or not) earlier in this process.
        return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []})

    try:
        # Prefer the extracted data directory; fall back to the bundled mock results.
        if os.path.exists(EXTRACTED_DATA_DIR):
            data_dir = EXTRACTED_DATA_DIR
        else:
            data_dir = "mock_results"

        print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
        viewer = SimpleLeaderboardViewer(
            data_dir=data_dir,
            config=CONFIG_NAME,
            split=split
        )
        tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP)
    except Exception as e:
        # Any failure is turned into a one-row "Message" frame behind a DummyViewer.
        error_message = f"Error loading data for split '{split}': {e}"
        print(format_error(error_message))
        viewer = DummyViewer(pd.DataFrame({"Message": [error_message]}))
        tag_map = {"Overall": []}

    # Remember the outcome, good or bad, for subsequent calls.
    CACHED_VIEWERS[split] = viewer
    CACHED_TAG_MAPS[split] = tag_map
    return viewer, tag_map


def create_leaderboard_display(
        full_df: pd.DataFrame,
        tag_map: dict,
        category_name: str,
        split_name: str
):
    """
    Render the scatter plot, plot legend and main table for one category.

    The function does not load data itself: it receives the full "pretty"
    dataframe and tag map, derives the view for *category_name* (e.g. "Overall"
    or "Bug Fixing") and creates the Gradio components in the current context.

    Args:
        full_df: Complete dataframe for the split.
        tag_map: Tag map handed to DataTransformer.
        category_name: Category whose view is rendered; also selects the legend copy.
        split_name: Not used by this function; kept so existing callers keep working.

    Returns:
        Tuple ``(plot_component, dataframe_component)``.
    """
    # 1. Instantiate the transformer and get the specific view for this category.
    transformer = DataTransformer(full_df, tag_map)
    df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True)
    pareto_df = get_pareto_df(df_view)

    # 2. Pareto marker: a trophy for every row whose id is on the frontier.
    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
    if not pareto_df.empty and 'id' in pareto_df.columns:
        # A set gives constant-time membership tests for the per-row check below.
        pareto_agent_ids = set(pareto_df['id'].tolist())
    else:
        pareto_agent_ids = set()
    if 'id' in df_view.columns:
        df_view['Pareto'] = [
            trophy_icon_html if agent_id in pareto_agent_ids else ''
            for agent_id in df_view['id']
        ]
    else:
        # Without ids nothing can be matched against the frontier; leave the column blank.
        df_view['Pareto'] = ''

    # 3. Openness icon per row. Each SVG is read and base64-encoded once per
    #    distinct asset path instead of once per row.
    icon_uri_cache = {}

    def get_openness_icon_html(openness_val):
        icon_path = OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg")
        if icon_path not in icon_uri_cache:
            icon_uri_cache[icon_path] = get_svg_as_data_uri(icon_path)
        uri = icon_uri_cache[icon_path]
        return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'

    if 'Openness' in df_view.columns:
        openness_values = df_view['Openness'].tolist()
    else:
        # Same default the row-wise lookup used when the column is absent.
        openness_values = [''] * len(df_view)
    df_view['Icon'] = [get_openness_icon_html(val) for val in openness_values]

    # 4. Format cost columns
    for col in df_view.columns:
        if "Cost" in col:
            df_view = format_cost_column(df_view, col)

    # Format score columns
    for col in df_view.columns:
        if "Score" in col:
            df_view = format_score_column(df_view, col)
    scatter_plot = plots_dict.get('scatter_plot', go.Figure())

    # 5. Make pretty and format the Language Model column
    df_view['Language Model'] = df_view['Language Model'].apply(clean_llm_base_list)
    df_view['Language Model'] = df_view['Language Model'].apply(format_llm_base_with_html)
    # Append the repro url to the end of the OpenHands Version
    if 'Source' in df_view.columns:
        df_view['OpenHands Version'] = df_view.apply(
            lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
            axis=1
        )

    # 6. Move Pareto and Icon to the front (Pareto first, Icon second).
    all_cols = df_view.columns.tolist()
    all_cols.insert(0, all_cols.pop(all_cols.index('Icon')))
    all_cols.insert(0, all_cols.pop(all_cols.index('Pareto')))
    df_view = df_view[all_cols]
    # Drop internally used columns that are not needed in the display
    columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source']
    df_view = df_view.drop(columns=columns_to_drop, errors='ignore')

    # The two icon columns are shown without a header.
    header_rename_map = {
        "Pareto": "",
        "Icon": "",
    }
    # Rename columns first before getting headers
    df_view = df_view.rename(columns=header_rename_map)

    # 7. Headers and per-column datatypes, taken from the renamed dataframe
    df_headers = df_view.columns.tolist()
    df_datatypes = []
    for col in df_headers:
        if col == "Logs" or "Cost" in col or "Score" in col:
            df_datatypes.append("markdown")
        elif col in ["OpenHands Version","Language Model", ""]:  # "" for renamed Pareto/Icon columns
            df_datatypes.append("html")
        else:
            df_datatypes.append("str")

    # 8. Column widths: fixed leading widths, one width per score/cost column
    #    found after them, then fixed trailing widths.
    fixed_start_widths = [40, 40, 200, 100, 200]
    remaining_headers = df_headers[len(fixed_start_widths):]
    num_score_cost_cols = sum(
        1 for col in remaining_headers if "Score" in col or "Cost" in col
    )
    dynamic_widths = [90] * num_score_cost_cols
    fixed_end_widths = [90, 100, 50]
    final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths

    with gr.Row():
        with gr.Column(scale=3):
            plot_component = gr.Plot(
                value=scatter_plot,
                show_label=False,
            )
            gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
        with gr.Column(scale=1):
            gr.HTML(value=plot_legend_html)

    # Put table and key into an accordion
    with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
        dataframe_component = gr.DataFrame(
            headers=df_headers,
            value=df_view,
            datatype=df_datatypes,
            interactive=False,
            wrap=True,
            column_widths=final_column_widths,
            elem_classes=["wrap-header-df"],
            show_search="search",
            elem_id="main-leaderboard"
        )
        legend_markdown = create_legend_markdown(category_name)
        gr.HTML(value=legend_markdown, elem_id="legend-markdown")

    # Return the components so they can be referenced elsewhere.
    return plot_component, dataframe_component

# # --- Detailed Benchmark Display ---
def create_benchmark_details_display(
        full_df: pd.DataFrame,
        tag_map: dict,
        category_name: str,
        validation: bool = False,
):
    """
    Generates a detailed breakdown for each benchmark within a given category.
    For each benchmark, it creates a title, a scatter plot and a filtered table
    in the current Gradio context. Nothing is returned.

    Args:
        full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split.
        tag_map (dict): The "pretty" tag map to find the list of benchmarks.
        category_name (str): The main category to display details for (e.g., "Bug Fixing").
        validation (bool): Forwarded to create_gradio_anchor_id() and
            get_benchmark_description() when building each benchmark header.
    """
    # 1. Get the list of benchmarks for the selected category
    benchmark_names = tag_map.get(category_name, [])

    if not benchmark_names:
        gr.Markdown(f"No detailed benchmarks found for the category: {category_name}")
        return

    gr.HTML(f'<h2 class="benchmark-main-subtitle">{category_name} Detailed Benchmark Results</h2>')
    gr.Markdown("---")

    # Loop-invariant assets: the trophy marker and a per-path cache of openness
    # icons, so each SVG is read and encoded once rather than once per table row.
    trophy_uri = get_svg_as_data_uri("assets/trophy.svg")
    trophy_icon_html = f'<img src="{trophy_uri}" alt="On Pareto Frontier" title="On Pareto Frontier" style="width:25px; height:25px;">'
    icon_uri_cache = {}

    def get_openness_icon_html(openness_val):
        icon_path = OPENNESS_ICON_MAP.get(openness_val, "assets/ellipse-pink.svg")
        if icon_path not in icon_uri_cache:
            icon_uri_cache[icon_path] = get_svg_as_data_uri(icon_path)
        uri = icon_uri_cache[icon_path]
        return f'<img src="{uri}" alt="{openness_val}" title="{openness_val}" style="width:24px; height:24px;">'

    # 2. Loop through each benchmark and create its UI components
    for benchmark_name in benchmark_names:
        anchor_id = create_gradio_anchor_id(benchmark_name, validation)
        gr.HTML(
            f"""
                <h3 class="benchmark-title" id="{anchor_id}">{benchmark_name} Leaderboard <a href="#{anchor_id}" class="header-link-icon">🔗</a></h3>
            <div class="benchmark-description">{get_benchmark_description(benchmark_name, validation)}</div>
            <button onclick="scroll_to_element('page-content-wrapper')" class="primary-link-button">Return to the aggregate {category_name} leaderboard</button>
            """
        )

        # 3. Prepare the data for this specific benchmark's table and plot
        benchmark_score_col = f"{benchmark_name} Score"
        benchmark_cost_col = f"{benchmark_name} Cost"

        # Define the columns needed for the detailed table
        table_cols = ['OpenHands Version','Source','Openness', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Language Model']

        # Filter to only columns that actually exist in the full dataframe
        existing_table_cols = [col for col in table_cols if col in full_df.columns]

        if benchmark_score_col not in existing_table_cols:
            gr.Markdown(f"Score data for {benchmark_name} not available.")
            continue # Skip to the next benchmark if score is missing

        # Create a specific DataFrame for the table view
        benchmark_table_df = full_df[existing_table_cols].copy()
        pareto_df = get_pareto_df(benchmark_table_df)

        # Pareto marker: a trophy for every row whose id is on the frontier.
        if not pareto_df.empty and 'id' in pareto_df.columns:
            pareto_agent_ids = set(pareto_df['id'].tolist())
        else:
            pareto_agent_ids = set()
        if 'id' in benchmark_table_df.columns:
            benchmark_table_df['Pareto'] = [
                trophy_icon_html if agent_id in pareto_agent_ids else ''
                for agent_id in benchmark_table_df['id']
            ]
        else:
            # Without ids nothing can be matched against the frontier.
            benchmark_table_df['Pareto'] = ''

        # Openness icon per row ('' is the lookup key when the column is absent).
        if 'Openness' in benchmark_table_df.columns:
            openness_values = benchmark_table_df['Openness'].tolist()
        else:
            openness_values = [''] * len(benchmark_table_df)
        benchmark_table_df['Icon'] = [get_openness_icon_html(val) for val in openness_values]

        # Make pretty and format the Language Model column (only when it was
        # selected above; a missing column is added as empty further down).
        if 'Language Model' in benchmark_table_df.columns:
            benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(clean_llm_base_list)
            benchmark_table_df['Language Model'] = benchmark_table_df['Language Model'].apply(format_llm_base_with_html)

        # Append the repro url to the end of the OpenHands Version. The notna
        # guard keeps a missing Source from being rendered as the text "nan".
        if 'Source' in benchmark_table_df.columns and 'OpenHands Version' in benchmark_table_df.columns:
            benchmark_table_df['OpenHands Version'] = benchmark_table_df.apply(
                lambda row: f"{row['OpenHands Version']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['OpenHands Version'],
                axis=1
            )

        # Calculate and add the "Attempted Benchmark" column
        def check_benchmark_status(row):
            has_score = pd.notna(row.get(benchmark_score_col))
            has_cost = pd.notna(row.get(benchmark_cost_col))
            if has_score and has_cost:
                return "✅"
            if has_score or has_cost:
                return "⚠️"
            return "🚫 "

        benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1)

        # Sort by score, best first. The score column is guaranteed to exist here
        # because benchmarks without it were skipped above.
        benchmark_table_df = benchmark_table_df.sort_values(
            by=benchmark_score_col, ascending=False, na_position='last'
        )

        # Format the cost and score columns
        benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col)
        benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col)
        desired_cols_in_order = [
            'Pareto',
            'Icon',
            'Language Model',
            'OpenHands Version',
            'Attempted Benchmark',
            benchmark_score_col,
            benchmark_cost_col,
            'Date',
            'Logs'
        ]
        for col in desired_cols_in_order:
            if col not in benchmark_table_df.columns:
                benchmark_table_df[col] = pd.NA # Add as an empty column
        benchmark_table_df = benchmark_table_df[desired_cols_in_order]

        # NOTE: the score/cost headers keep their "<benchmark> Score" /
        # "<benchmark> Cost" names. An earlier `rename({...}, inplace=True)` call
        # at this point omitted `columns=`, so pandas applied it to the row index
        # and the headers were never changed. It is removed so the table stays
        # exactly as rendered today and as the legend tooltip describes it
        # ("{table} Score" / "{table} Cost"). To shorten the headers, pass the
        # mapping via `columns=` and update the tooltip copy together.

        # Remove Pareto and Icon column headers (rename to empty string)
        header_rename_map = {
            "Pareto": "",
            "Icon": "",
        }
        benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map)

        # Now get headers from the renamed dataframe
        df_headers = benchmark_table_df.columns.tolist()
        df_datatypes = []
        for col in df_headers:
            if "Logs" in col or "Cost" in col or "Score" in col:
                df_datatypes.append("markdown")
            elif col in ["OpenHands Version", "Language Model", ""]:  # "" for renamed Pareto/Icon columns
                df_datatypes.append("html")
            else:
                df_datatypes.append("str")
        benchmark_plot = _plot_scatter_plotly(
            data=full_df,
            x=benchmark_cost_col,
            y=benchmark_score_col,
            agent_col="Agent",
            name=benchmark_name
        )
        with gr.Row():
            with gr.Column(scale=3):
                gr.Plot(value=benchmark_plot, show_label=False)
                gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer")
            with gr.Column(scale=1):
                gr.HTML(value=plot_legend_html)

        # Put table and key into an accordion
        with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"):
            gr.DataFrame(
                headers=df_headers,
                value=benchmark_table_df,
                datatype=df_datatypes,
                interactive=False,
                wrap=True,
                # NOTE(review): ten widths are listed but the table has nine
                # columns; left unchanged, confirm which entry is surplus.
                column_widths=[40, 40, 200, 150, 175, 85, 100, 100, 80, 40],
                show_search="search",
                elem_classes=["wrap-header-df"]
            )
            legend_markdown = create_legend_markdown(benchmark_name)
            gr.HTML(value=legend_markdown, elem_id="legend-markdown")

def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]:
    """
    Return the display-ready dataframe and tag map for *split*.

    The (cached) viewer for the split is loaded, its raw frame is transformed,
    and the "Logs" and "Source" columns, when present, are turned into 🔗
    hyperlinks. An empty frame and an empty dict are returned when the viewer
    has an unexpected type or yields no rows.
    """
    viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split)

    # Fallback for unexpected types
    if not isinstance(viewer_or_data, (SimpleLeaderboardViewer, DummyViewer)):
        return pd.DataFrame(), {}

    raw_df, _ = viewer_or_data._load()
    if raw_df.empty:
        return pd.DataFrame(), {}

    pretty_df = transform_raw_dataframe(raw_df)
    # NOTE(review): on the success path the tag map returned above has already
    # been through create_pretty_tag_map(); this is a second pass, so confirm the
    # helper is idempotent.
    pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP)

    def log_cell(raw_uri):
        # Blank stays blank; otherwise convert the HF URI and link to it if that worked.
        if pd.isna(raw_uri) or raw_uri == "":
            return ""
        web_url = hf_uri_to_web_url(str(raw_uri))
        if not web_url:
            return ""
        return hyperlink(web_url, "🔗")

    def source_cell(raw_url):
        # Blank stays blank; the value is used as the link target without conversion.
        if pd.isna(raw_url) or raw_url == "":
            return ""
        return hyperlink(str(raw_url), "🔗")

    if "Logs" in pretty_df.columns:
        pretty_df["Logs"] = pretty_df["Logs"].apply(log_cell)

    if "Source" in pretty_df.columns:
        pretty_df["Source"] = pretty_df["Source"].apply(source_cell)

    return pretty_df, pretty_tag_map
def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML:
    """
    Build the benchmark sub-navigation bar as one self-contained gr.HTML component.

    Every benchmark listed under *category_name* in *tag_map* becomes a plain
    HTML button whose onclick calls the page's scroll_to_element() with that
    benchmark's anchor id. An empty gr.HTML is returned when the category has
    no benchmarks.
    """
    benchmark_names = tag_map.get(category_name, [])
    if not benchmark_names:
        # Nothing to link to; hand back an empty component.
        return gr.HTML()

    # One button per benchmark. The onclick attribute is double-quoted and the
    # anchor id inside it is single-quoted.
    html_buttons = [
        f"""
            <button
                class="primary-link-button"
                onclick="scroll_to_element('{create_gradio_anchor_id(name, validation)}')"
            >
                {name}
            </button>
        """
        for name in benchmark_names
    ]
    buttons_html = ' | '.join(html_buttons)

    # The wrapping div is styled as the flexbox row that holds the label and buttons.
    return gr.HTML(f"""
        <div class="sub-nav-bar-container">
            <span class="sub-nav-label">Benchmarks in this category:</span>
            {buttons_html}
        </div>
    """)

def format_llm_base_with_html(value):
    """
    Format a 'Language Model' cell value for display.

    - A list with more than one element becomes an HTML ``<span>`` showing the
      first model plus a "(+ N) ⓘ" counter, with the full list (one model per
      line) stored in its ``data-tooltip`` attribute.
    - A single-element list is reduced to that element.
    - Anything else (non-list values, empty lists) is returned unchanged.

    Model names placed into the span are HTML-escaped so that characters such
    as quotes, ``<`` or ``&`` cannot break out of the attribute or the markup.
    """
    # Local import so this function does not depend on the module's import block.
    from html import escape

    if not isinstance(value, list) or not value:
        # Not a list, or an empty one: return the value as-is.
        return value
    if len(value) == 1:
        # If only one item, just return that item.
        return value[0]

    # Newlines separate the models inside the tooltip; escaping leaves them intact.
    tooltip_text = escape("\n".join(map(str, value)), quote=True)
    first_model = escape(str(value[0]), quote=True)
    return f'<span class="tooltip-icon cell-tooltip-icon" style="cursor: help;" data-tooltip="{tooltip_text}">{first_model} (+ {len(value) - 1}) ⓘ</span>'