Part 1 of the migration to the new data frame
Browse files
- app.py +12 -25
- graphs/leaderboard.py +34 -51
- graphs/model_market_share.py +27 -32
app.py
CHANGED
|
@@ -83,7 +83,7 @@ model_market_share_area = create_stacked_area_chart(
|
|
| 83 |
)
|
| 84 |
|
| 85 |
world_map = create_world_map(
|
| 86 |
-
|
| 87 |
)
|
| 88 |
|
| 89 |
slider = create_range_slider(
|
|
@@ -208,19 +208,23 @@ app.layout = dmc.MantineProvider(
|
|
| 208 |
]),
|
| 209 |
dcc.Tab(label='Leaderboard', children=[
|
| 210 |
create_leaderboard(
|
| 211 |
-
filtered_df
|
| 212 |
)
|
| 213 |
]),
|
| 214 |
dcc.Tab(label='Model Tree Map', children=[
|
| 215 |
dcc.Graph(figure=tree_map)
|
| 216 |
]),
|
| 217 |
-
dcc.Tab(label='Model Characteristics',
|
| 218 |
-
dcc.Graph(id='language-concentration-chart'),
|
| 219 |
html.Div([
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
]),
|
| 225 |
])
|
| 226 |
],
|
|
@@ -305,23 +309,6 @@ def update_stacked_area(value):
|
|
| 305 |
return updated_fig
|
| 306 |
return model_market_share_area
|
| 307 |
|
| 308 |
-
|
| 309 |
-
# Model Characteristics Tab
|
| 310 |
-
# On dropdown change, update graph
|
| 311 |
-
@app.callback(
|
| 312 |
-
Output('language-concentration-chart', 'figure'),
|
| 313 |
-
[Input('dropdown', 'value')]
|
| 314 |
-
)
|
| 315 |
-
def update_graph(selected_metric):
|
| 316 |
-
if selected_metric == 'Language Concentration':
|
| 317 |
-
return language_concentration_area
|
| 318 |
-
elif selected_metric == 'License':
|
| 319 |
-
return license_concentration_area
|
| 320 |
-
elif selected_metric == 'Method':
|
| 321 |
-
return download_method_cumsum_line
|
| 322 |
-
elif selected_metric == 'Architecture':
|
| 323 |
-
return download_arch_cumsum_line
|
| 324 |
-
|
| 325 |
# Run the app
|
| 326 |
if __name__ == '__main__':
|
| 327 |
app.run(debug=True)
|
|
|
|
| 83 |
)
|
| 84 |
|
| 85 |
world_map = create_world_map(
|
| 86 |
+
filtered_df
|
| 87 |
)
|
| 88 |
|
| 89 |
slider = create_range_slider(
|
|
|
|
| 208 |
]),
|
| 209 |
dcc.Tab(label='Leaderboard', children=[
|
| 210 |
create_leaderboard(
|
| 211 |
+
filtered_df
|
| 212 |
)
|
| 213 |
]),
|
| 214 |
dcc.Tab(label='Model Tree Map', children=[
|
| 215 |
dcc.Graph(figure=tree_map)
|
| 216 |
]),
|
| 217 |
+
dcc.Tab(label='Model Characteristics',children=[
|
|
|
|
| 218 |
html.Div([
|
| 219 |
+
html.H3("Language Concentration", style={'textAlign': 'center', 'marginBottom': 10}),
|
| 220 |
+
dcc.Graph(figure=language_concentration_area),
|
| 221 |
+
html.H3("License Distribution", style={'textAlign': 'center', 'marginBottom': 10}),
|
| 222 |
+
dcc.Graph(figure=license_concentration_area),
|
| 223 |
+
html.H3("Method Trends", style={'textAlign': 'center', 'marginBottom': 10}),
|
| 224 |
+
dcc.Graph(figure=download_method_cumsum_line),
|
| 225 |
+
html.H3("Architecture Trends", style={'textAlign': 'center', 'marginBottom': 10}),
|
| 226 |
+
dcc.Graph(figure=download_arch_cumsum_line),
|
| 227 |
+
], style={'marginBottom': 12}),
|
| 228 |
]),
|
| 229 |
])
|
| 230 |
],
|
|
|
|
| 309 |
return updated_fig
|
| 310 |
return model_market_share_area
|
| 311 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
# Run the app
|
| 313 |
if __name__ == '__main__':
|
| 314 |
app.run(debug=True)
|
graphs/leaderboard.py
CHANGED
|
@@ -2,7 +2,7 @@ import pandas as pd
|
|
| 2 |
from dash import html, dcc
|
| 3 |
import base64
|
| 4 |
|
| 5 |
-
def create_leaderboard(filtered_df,
|
| 6 |
country_icon_map = {
|
| 7 |
"USA": "🇺🇸",
|
| 8 |
"China": "🇨🇳",
|
|
@@ -19,7 +19,8 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
|
|
| 19 |
"Unknown": "❓",
|
| 20 |
"Finland": "🇫🇮",
|
| 21 |
"Lebanon": "🇱🇧",
|
| 22 |
-
"
|
|
|
|
| 23 |
}
|
| 24 |
|
| 25 |
company_icon_map = {
|
|
@@ -30,51 +31,27 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
|
|
| 30 |
"openai": "../assets/icons/openai.png",
|
| 31 |
}
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
filtered_df["org_or_user"] = filtered_df["org_or_user"].where(filtered_df["org_or_user"] == "org", "user")
|
| 39 |
-
|
| 40 |
-
# Merge country info for developers/models
|
| 41 |
-
developer_df = developer_df.merge(
|
| 42 |
-
filtered_df[["country", "author", "org_or_user", "model", "downloads", "estimated_parameters"]].drop_duplicates(subset=["author"]),
|
| 43 |
-
left_on="metric", right_on="author", how="left"
|
| 44 |
-
).drop(columns=["metric"])
|
| 45 |
-
|
| 46 |
-
model_df = model_df.merge(
|
| 47 |
-
filtered_df[["country", "author", "downloads", "org_or_user", "model", "merged_modality", "estimated_parameters"]].drop_duplicates(subset=["model"]),
|
| 48 |
-
left_on="metric", right_on="model", how="left"
|
| 49 |
-
).drop(columns=["metric"])
|
| 50 |
-
|
| 51 |
-
# Rename metric columns
|
| 52 |
-
# country_df = country_df.rename(columns={"metric": "country"})
|
| 53 |
-
country_df = country_df.merge(
|
| 54 |
-
filtered_df[["country", "downloads", "estimated_parameters"]].drop_duplicates(subset=["country"]),
|
| 55 |
-
left_on="metric", right_on="country", how="left"
|
| 56 |
-
).drop(columns=["metric"])
|
| 57 |
|
| 58 |
# Filter by time
|
| 59 |
-
start_time
|
| 60 |
-
|
| 61 |
-
country_df = country_df[(country_df["time"] >= start_time) & (country_df["time"] <= end_time)]
|
| 62 |
-
developer_df = developer_df[(developer_df["time"] >= start_time) & (developer_df["time"] <= end_time)]
|
| 63 |
-
model_df = model_df[(model_df["time"] >= start_time) & (model_df["time"] <= end_time)]
|
| 64 |
|
| 65 |
-
if
|
| 66 |
return html.Div("No data in selected range")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
# Function to get top N leaderboard
|
| 69 |
-
def get_top_n_leaderboard(
|
| 70 |
-
top = (
|
| 71 |
-
df.groupby(group_col)["value"]
|
| 72 |
-
.sum()
|
| 73 |
-
.sort_values(ascending=False)
|
| 74 |
-
.head(top_n)
|
| 75 |
-
.reset_index()
|
| 76 |
-
.rename(columns={group_col: "Name", "value": "Total Value"})
|
| 77 |
-
)
|
| 78 |
total_value = top["Total Value"].sum()
|
| 79 |
top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
|
| 80 |
|
|
@@ -83,13 +60,15 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
|
|
| 83 |
download_top["Total Value"] = download_top["Total Value"].astype(int)
|
| 84 |
download_top["% of total"] = download_top["% of total"].round(2)
|
| 85 |
|
|
|
|
|
|
|
| 86 |
# All relevant metadata columns
|
| 87 |
-
meta_cols =
|
| 88 |
# Collect all metadata per top n for each category (country, author, model)
|
| 89 |
meta_map = {}
|
| 90 |
download_map = {}
|
| 91 |
for name in top["Name"]:
|
| 92 |
-
name_data =
|
| 93 |
meta_map[name] = {}
|
| 94 |
download_map[name] = {}
|
| 95 |
for col in meta_cols:
|
|
@@ -103,23 +82,27 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
|
|
| 103 |
meta = meta_map.get(nm, {})
|
| 104 |
chips = []
|
| 105 |
# Countries
|
| 106 |
-
for c in meta.get("
|
| 107 |
if c == "United States of America":
|
| 108 |
c = "USA"
|
|
|
|
|
|
|
| 109 |
chips.append((country_icon_map.get(c, ""), c))
|
| 110 |
# Author
|
| 111 |
for a in meta.get("author", []):
|
| 112 |
icon = company_icon_map.get(a, "")
|
| 113 |
if icon == "":
|
| 114 |
-
if meta.get("
|
| 115 |
icon = "🏢"
|
| 116 |
else:
|
| 117 |
icon = "👤"
|
| 118 |
chips.append((icon, a))
|
| 119 |
# Downloads
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
| 123 |
# Modality
|
| 124 |
for m in meta.get("merged_modality", []):
|
| 125 |
chips.append(("", m))
|
|
@@ -163,9 +146,9 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
|
|
| 163 |
return top[["Name", "Metadata", "% of total"]], download_top
|
| 164 |
|
| 165 |
# Build leaderboards
|
| 166 |
-
top_countries, download_top_countries = get_top_n_leaderboard(
|
| 167 |
-
top_developers, download_top_developers = get_top_n_leaderboard(
|
| 168 |
-
top_models, download_top_models = get_top_n_leaderboard(
|
| 169 |
|
| 170 |
# Chip renderer
|
| 171 |
def chip(text, bg_color="#F0F0F0"):
|
|
|
|
| 2 |
from dash import html, dcc
|
| 3 |
import base64
|
| 4 |
|
| 5 |
+
def create_leaderboard(filtered_df, start_time=None, top_n=10):
|
| 6 |
country_icon_map = {
|
| 7 |
"USA": "🇺🇸",
|
| 8 |
"China": "🇨🇳",
|
|
|
|
| 19 |
"Unknown": "❓",
|
| 20 |
"Finland": "🇫🇮",
|
| 21 |
"Lebanon": "🇱🇧",
|
| 22 |
+
"User": "👤",
|
| 23 |
+
"International/Online": "🌐",
|
| 24 |
}
|
| 25 |
|
| 26 |
company_icon_map = {
|
|
|
|
| 31 |
"openai": "../assets/icons/openai.png",
|
| 32 |
}
|
| 33 |
|
| 34 |
+
meta_cols_map = {
|
| 35 |
+
"org_country_single": ["org_country_single"],
|
| 36 |
+
"author": ["org_country_single", "author", "merged_country_groups_single"],
|
| 37 |
+
"model": ["org_country_single", "author", "merged_country_groups_single", "merged_modality", "downloads"]
|
| 38 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Filter by time
|
| 41 |
+
if start_time is not None:
|
| 42 |
+
filtered_df = filtered_df[(filtered_df["created"] >= start_time) & (filtered_df["time"] >= start_time)]
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
if filtered_df.empty:
|
| 45 |
return html.Div("No data in selected range")
|
| 46 |
+
|
| 47 |
+
# Merge HF and USA
|
| 48 |
+
filtered_df["org_country_single"] = filtered_df["org_country_single"].replace({"HF": "United States of America"})
|
| 49 |
+
# Merge International and Online
|
| 50 |
+
filtered_df["org_country_single"] = filtered_df["org_country_single"].replace({"International": "International/Online", "Online": "International/Online"})
|
| 51 |
|
| 52 |
# Function to get top N leaderboard
|
| 53 |
+
def get_top_n_leaderboard(group_col, top_n=10):
|
| 54 |
+
top = filtered_df.groupby(group_col)["downloads"].sum().nlargest(top_n).reset_index().rename(columns={group_col: "Name", "downloads": "Total Value"})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
total_value = top["Total Value"].sum()
|
| 56 |
top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
|
| 57 |
|
|
|
|
| 60 |
download_top["Total Value"] = download_top["Total Value"].astype(int)
|
| 61 |
download_top["% of total"] = download_top["% of total"].round(2)
|
| 62 |
|
| 63 |
+
top["Name"].replace("User", "user", inplace=True)
|
| 64 |
+
|
| 65 |
# All relevant metadata columns
|
| 66 |
+
meta_cols = meta_cols_map.get(group_col, [])
|
| 67 |
# Collect all metadata per top n for each category (country, author, model)
|
| 68 |
meta_map = {}
|
| 69 |
download_map = {}
|
| 70 |
for name in top["Name"]:
|
| 71 |
+
name_data = filtered_df[filtered_df[group_col] == name]
|
| 72 |
meta_map[name] = {}
|
| 73 |
download_map[name] = {}
|
| 74 |
for col in meta_cols:
|
|
|
|
| 82 |
meta = meta_map.get(nm, {})
|
| 83 |
chips = []
|
| 84 |
# Countries
|
| 85 |
+
for c in meta.get("org_country_single", []):
|
| 86 |
if c == "United States of America":
|
| 87 |
c = "USA"
|
| 88 |
+
if c == "user":
|
| 89 |
+
c = "User"
|
| 90 |
chips.append((country_icon_map.get(c, ""), c))
|
| 91 |
# Author
|
| 92 |
for a in meta.get("author", []):
|
| 93 |
icon = company_icon_map.get(a, "")
|
| 94 |
if icon == "":
|
| 95 |
+
if meta.get("merged_country_groups_single", ["User"])[0] != "User":
|
| 96 |
icon = "🏢"
|
| 97 |
else:
|
| 98 |
icon = "👤"
|
| 99 |
chips.append((icon, a))
|
| 100 |
# Downloads
|
| 101 |
+
# Sum downloads if multiple entries
|
| 102 |
+
total_downloads = sum(d for d in meta.get("downloads", []) if pd.notna(d)) # Check if d is not NaN
|
| 103 |
+
if total_downloads:
|
| 104 |
+
chips.append(("⬇️", f"{int(total_downloads):,}"))
|
| 105 |
+
|
| 106 |
# Modality
|
| 107 |
for m in meta.get("merged_modality", []):
|
| 108 |
chips.append(("", m))
|
|
|
|
| 146 |
return top[["Name", "Metadata", "% of total"]], download_top
|
| 147 |
|
| 148 |
# Build leaderboards
|
| 149 |
+
top_countries, download_top_countries = get_top_n_leaderboard("org_country_single", top_n)
|
| 150 |
+
top_developers, download_top_developers = get_top_n_leaderboard("author", top_n)
|
| 151 |
+
top_models, download_top_models = get_top_n_leaderboard("model", top_n)
|
| 152 |
|
| 153 |
# Chip renderer
|
| 154 |
def chip(text, bg_color="#F0F0F0"):
|
graphs/model_market_share.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import plotly.graph_objects as go
|
| 2 |
from plotly.subplots import make_subplots
|
| 3 |
|
|
@@ -162,10 +164,17 @@ def create_stacked_area_chart(
|
|
| 162 |
|
| 163 |
|
| 164 |
def create_world_map(
|
| 165 |
-
df,
|
| 166 |
):
|
| 167 |
-
#
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# Country code mapping
|
| 171 |
country_code_map = {
|
|
@@ -214,8 +223,10 @@ def create_world_map(
|
|
| 214 |
"Turkey": "TUR",
|
| 215 |
}
|
| 216 |
|
| 217 |
-
df["country_code"] = df[
|
| 218 |
-
|
|
|
|
|
|
|
| 219 |
|
| 220 |
fig = make_subplots(
|
| 221 |
rows=1,
|
|
@@ -223,42 +234,26 @@ def create_world_map(
|
|
| 223 |
specs=[[{"type": "geo"}]],
|
| 224 |
)
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
# Average values across time range
|
| 232 |
-
agg_data = (
|
| 233 |
-
range_data.groupby([metric_col, "country_code"])[value_col]
|
| 234 |
-
.mean()
|
| 235 |
-
.reset_index()
|
| 236 |
-
)
|
| 237 |
-
agg_data["percentage"] = agg_data[value_col] * 100
|
| 238 |
-
return agg_data.sort_values("percentage", ascending=False)
|
| 239 |
-
|
| 240 |
-
# Initial data if start or end time are not set (full range)
|
| 241 |
-
if start_time is None:
|
| 242 |
-
start_time = times[0]
|
| 243 |
-
if end_time is None:
|
| 244 |
-
end_time = times[-1]
|
| 245 |
-
initial_data = aggregate_time_range(start_time, end_time)
|
| 246 |
-
# top_countries = initial_data.head(top_n_labels)
|
| 247 |
|
| 248 |
# Create hover text
|
| 249 |
hover_text = []
|
| 250 |
-
for _, row in
|
| 251 |
hover_text.append(
|
| 252 |
-
f"<b>{row[
|
| 253 |
-
f"Avg Downloads: {row['
|
| 254 |
-
f"Avg Value: {row[
|
| 255 |
)
|
| 256 |
|
| 257 |
# Add choropleth to plot
|
| 258 |
fig.add_trace(
|
| 259 |
go.Choropleth(
|
| 260 |
-
locations=
|
| 261 |
-
z=
|
| 262 |
text=hover_text,
|
| 263 |
hovertemplate="%{text}<extra></extra>",
|
| 264 |
colorscale=[
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
import plotly.graph_objects as go
|
| 4 |
from plotly.subplots import make_subplots
|
| 5 |
|
|
|
|
| 164 |
|
| 165 |
|
| 166 |
def create_world_map(
|
| 167 |
+
df, top_n_labels=20
|
| 168 |
):
|
| 169 |
+
# Create a filtered_df with only countries
|
| 170 |
+
df = df[df['org_country_single'] != 'HF']
|
| 171 |
+
df = df[df['org_country_single'] != 'Online']
|
| 172 |
+
df = df[df['org_country_single'] != 'International']
|
| 173 |
+
df = df[df['org_country_single'] != 'user']
|
| 174 |
+
|
| 175 |
+
# Keep only models created after 2024-01-01 and downloads recorded after 2024-01-01 (filter currently disabled)
|
| 176 |
+
# df = df[df['created'] > '2024-01-01']
|
| 177 |
+
# df = df[df['time'] > '2024-01-01']
|
| 178 |
|
| 179 |
# Country code mapping
|
| 180 |
country_code_map = {
|
|
|
|
| 223 |
"Turkey": "TUR",
|
| 224 |
}
|
| 225 |
|
| 226 |
+
df["country_code"] = df["org_country_single"].map(country_code_map)
|
| 227 |
+
df = df.dropna(subset=["country_code"])
|
| 228 |
+
|
| 229 |
+
# Fix country plot
|
| 230 |
|
| 231 |
fig = make_subplots(
|
| 232 |
rows=1,
|
|
|
|
| 234 |
specs=[[{"type": "geo"}]],
|
| 235 |
)
|
| 236 |
|
| 237 |
+
downloads_by_country = df.groupby('org_country_single')['downloads'].sum().reset_index()
|
| 238 |
+
|
| 239 |
+
# Compute total downloads and each country's percentage share of that total
|
| 240 |
+
total_downloads = float(downloads_by_country['downloads'].sum())
|
| 241 |
+
downloads_by_country['pct'] = (downloads_by_country['downloads'] / total_downloads * 100.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
# Create hover text
|
| 244 |
hover_text = []
|
| 245 |
+
for _, row in downloads_by_country.iterrows():
|
| 246 |
hover_text.append(
|
| 247 |
+
f"<b>{row['org_country_single']}</b><br>"
|
| 248 |
+
f"Avg Downloads: {row['pct']:.1f}% of total<br>"
|
| 249 |
+
f"Avg Value: {row['downloads']:.6f}"
|
| 250 |
)
|
| 251 |
|
| 252 |
# Add choropleth to plot
|
| 253 |
fig.add_trace(
|
| 254 |
go.Choropleth(
|
| 255 |
+
locations=downloads_by_country["country_code"],
|
| 256 |
+
z=np.log10(downloads_by_country["pct"]),
|
| 257 |
text=hover_text,
|
| 258 |
hovertemplate="%{text}<extra></extra>",
|
| 259 |
colorscale=[
|