emsesc committed on
Commit
855952e
·
1 Parent(s): 6054b77

Part 1 of the migration to the new data frame

Browse files
Files changed (3) hide show
  1. app.py +12 -25
  2. graphs/leaderboard.py +34 -51
  3. graphs/model_market_share.py +27 -32
app.py CHANGED
@@ -83,7 +83,7 @@ model_market_share_area = create_stacked_area_chart(
83
  )
84
 
85
  world_map = create_world_map(
86
- country_concentration_df, "time", "metric", "value"
87
  )
88
 
89
  slider = create_range_slider(
@@ -208,19 +208,23 @@ app.layout = dmc.MantineProvider(
208
  ]),
209
  dcc.Tab(label='Leaderboard', children=[
210
  create_leaderboard(
211
- filtered_df, country_concentration_df, author_concentration_df, model_concentration_df
212
  )
213
  ]),
214
  dcc.Tab(label='Model Tree Map', children=[
215
  dcc.Graph(figure=tree_map)
216
  ]),
217
- dcc.Tab(label='Model Characteristics', children=[
218
- dcc.Graph(id='language-concentration-chart'),
219
  html.Div([
220
- dcc.Dropdown(['Language Concentration', 'Architecture', 'License', 'Method'], 'Language Concentration', id='dropdown'),
221
- ], style={'marginTop': 6}),
222
- ]),
223
- dcc.Tab(label='Model Relationships', children=[
 
 
 
 
 
224
  ]),
225
  ])
226
  ],
@@ -305,23 +309,6 @@ def update_stacked_area(value):
305
  return updated_fig
306
  return model_market_share_area
307
 
308
-
309
- # Model Characteristics Tab
310
- # On dropdown change, update graph
311
- @app.callback(
312
- Output('language-concentration-chart', 'figure'),
313
- [Input('dropdown', 'value')]
314
- )
315
- def update_graph(selected_metric):
316
- if selected_metric == 'Language Concentration':
317
- return language_concentration_area
318
- elif selected_metric == 'License':
319
- return license_concentration_area
320
- elif selected_metric == 'Method':
321
- return download_method_cumsum_line
322
- elif selected_metric == 'Architecture':
323
- return download_arch_cumsum_line
324
-
325
  # Run the app
326
  if __name__ == '__main__':
327
  app.run(debug=True)
 
83
  )
84
 
85
  world_map = create_world_map(
86
+ filtered_df
87
  )
88
 
89
  slider = create_range_slider(
 
208
  ]),
209
  dcc.Tab(label='Leaderboard', children=[
210
  create_leaderboard(
211
+ filtered_df
212
  )
213
  ]),
214
  dcc.Tab(label='Model Tree Map', children=[
215
  dcc.Graph(figure=tree_map)
216
  ]),
217
+ dcc.Tab(label='Model Characteristics',children=[
 
218
  html.Div([
219
+ html.H3("Language Concentration", style={'textAlign': 'center', 'marginBottom': 10}),
220
+ dcc.Graph(figure=language_concentration_area),
221
+ html.H3("License Distribution", style={'textAlign': 'center', 'marginBottom': 10}),
222
+ dcc.Graph(figure=license_concentration_area),
223
+ html.H3("Method Trends", style={'textAlign': 'center', 'marginBottom': 10}),
224
+ dcc.Graph(figure=download_method_cumsum_line),
225
+ html.H3("Architecture Trends", style={'textAlign': 'center', 'marginBottom': 10}),
226
+ dcc.Graph(figure=download_arch_cumsum_line),
227
+ ], style={'marginBottom': 12}),
228
  ]),
229
  ])
230
  ],
 
309
  return updated_fig
310
  return model_market_share_area
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  # Run the app
313
  if __name__ == '__main__':
314
  app.run(debug=True)
graphs/leaderboard.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
2
  from dash import html, dcc
3
  import base64
4
 
5
- def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_time=None, end_time=None, top_n=10):
6
  country_icon_map = {
7
  "USA": "🇺🇸",
8
  "China": "🇨🇳",
@@ -19,7 +19,8 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
19
  "Unknown": "❓",
20
  "Finland": "🇫🇮",
21
  "Lebanon": "🇱🇧",
22
- "HF": "../assets/icons/hugging-face.png",
 
23
  }
24
 
25
  company_icon_map = {
@@ -30,51 +31,27 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
30
  "openai": "../assets/icons/openai.png",
31
  }
32
 
33
- # Ensure datetime
34
- for df in [country_df, developer_df, model_df]:
35
- df["time"] = pd.to_datetime(df["time"])
36
-
37
- # change any value that does not equal "org" to "user"
38
- filtered_df["org_or_user"] = filtered_df["org_or_user"].where(filtered_df["org_or_user"] == "org", "user")
39
-
40
- # Merge country info for developers/models
41
- developer_df = developer_df.merge(
42
- filtered_df[["country", "author", "org_or_user", "model", "downloads", "estimated_parameters"]].drop_duplicates(subset=["author"]),
43
- left_on="metric", right_on="author", how="left"
44
- ).drop(columns=["metric"])
45
-
46
- model_df = model_df.merge(
47
- filtered_df[["country", "author", "downloads", "org_or_user", "model", "merged_modality", "estimated_parameters"]].drop_duplicates(subset=["model"]),
48
- left_on="metric", right_on="model", how="left"
49
- ).drop(columns=["metric"])
50
-
51
- # Rename metric columns
52
- # country_df = country_df.rename(columns={"metric": "country"})
53
- country_df = country_df.merge(
54
- filtered_df[["country", "downloads", "estimated_parameters"]].drop_duplicates(subset=["country"]),
55
- left_on="metric", right_on="country", how="left"
56
- ).drop(columns=["metric"])
57
 
58
  # Filter by time
59
- start_time = start_time or country_df["time"].min()
60
- end_time = end_time or country_df["time"].max()
61
- country_df = country_df[(country_df["time"] >= start_time) & (country_df["time"] <= end_time)]
62
- developer_df = developer_df[(developer_df["time"] >= start_time) & (developer_df["time"] <= end_time)]
63
- model_df = model_df[(model_df["time"] >= start_time) & (model_df["time"] <= end_time)]
64
 
65
- if country_df.empty and developer_df.empty and model_df.empty:
66
  return html.Div("No data in selected range")
 
 
 
 
 
67
 
68
  # Function to get top N leaderboard
69
- def get_top_n_leaderboard(df, group_col, top_n=10):
70
- top = (
71
- df.groupby(group_col)["value"]
72
- .sum()
73
- .sort_values(ascending=False)
74
- .head(top_n)
75
- .reset_index()
76
- .rename(columns={group_col: "Name", "value": "Total Value"})
77
- )
78
  total_value = top["Total Value"].sum()
79
  top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
80
 
@@ -83,13 +60,15 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
83
  download_top["Total Value"] = download_top["Total Value"].astype(int)
84
  download_top["% of total"] = download_top["% of total"].round(2)
85
 
 
 
86
  # All relevant metadata columns
87
- meta_cols = ["country", "author", "downloads", "org_or_user", "merged_modality", "estimated_parameters"]
88
  # Collect all metadata per top n for each category (country, author, model)
89
  meta_map = {}
90
  download_map = {}
91
  for name in top["Name"]:
92
- name_data = df[df[group_col] == name]
93
  meta_map[name] = {}
94
  download_map[name] = {}
95
  for col in meta_cols:
@@ -103,23 +82,27 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
103
  meta = meta_map.get(nm, {})
104
  chips = []
105
  # Countries
106
- for c in meta.get("country", []):
107
  if c == "United States of America":
108
  c = "USA"
 
 
109
  chips.append((country_icon_map.get(c, ""), c))
110
  # Author
111
  for a in meta.get("author", []):
112
  icon = company_icon_map.get(a, "")
113
  if icon == "":
114
- if meta.get("org_or_user", ["user"])[0] == "org":
115
  icon = "🏢"
116
  else:
117
  icon = "👤"
118
  chips.append((icon, a))
119
  # Downloads
120
- for d in meta.get("downloads", []):
121
- if pd.notna(d): # Check if d is not NaN
122
- chips.append(("⬇️", f"{int(d):,}"))
 
 
123
  # Modality
124
  for m in meta.get("merged_modality", []):
125
  chips.append(("", m))
@@ -163,9 +146,9 @@ def create_leaderboard(filtered_df, country_df, developer_df, model_df, start_ti
163
  return top[["Name", "Metadata", "% of total"]], download_top
164
 
165
  # Build leaderboards
166
- top_countries, download_top_countries = get_top_n_leaderboard(country_df, "country", top_n)
167
- top_developers, download_top_developers = get_top_n_leaderboard(developer_df, "author", top_n)
168
- top_models, download_top_models = get_top_n_leaderboard(model_df, "model", top_n)
169
 
170
  # Chip renderer
171
  def chip(text, bg_color="#F0F0F0"):
 
2
  from dash import html, dcc
3
  import base64
4
 
5
+ def create_leaderboard(filtered_df, start_time=None, top_n=10):
6
  country_icon_map = {
7
  "USA": "🇺🇸",
8
  "China": "🇨🇳",
 
19
  "Unknown": "❓",
20
  "Finland": "🇫🇮",
21
  "Lebanon": "🇱🇧",
22
+ "User": "👤",
23
+ "International/Online": "🌐",
24
  }
25
 
26
  company_icon_map = {
 
31
  "openai": "../assets/icons/openai.png",
32
  }
33
 
34
+ meta_cols_map = {
35
+ "org_country_single": ["org_country_single"],
36
+ "author": ["org_country_single", "author", "merged_country_groups_single"],
37
+ "model": ["org_country_single", "author", "merged_country_groups_single", "merged_modality", "downloads"]
38
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # Filter by time
41
+ if start_time is not None:
42
+ filtered_df = filtered_df[(filtered_df["created"] >= start_time) & (filtered_df["time"] >= start_time)]
 
 
 
43
 
44
+ if filtered_df.empty:
45
  return html.Div("No data in selected range")
46
+
47
+ # Merge HF and USA
48
+ filtered_df["org_country_single"] = filtered_df["org_country_single"].replace({"HF": "United States of America"})
49
+ # Merge International and Online
50
+ filtered_df["org_country_single"] = filtered_df["org_country_single"].replace({"International": "International/Online", "Online": "International/Online"})
51
 
52
  # Function to get top N leaderboard
53
+ def get_top_n_leaderboard(group_col, top_n=10):
54
+ top = filtered_df.groupby(group_col)["downloads"].sum().nlargest(top_n).reset_index().rename(columns={group_col: "Name", "downloads": "Total Value"})
 
 
 
 
 
 
 
55
  total_value = top["Total Value"].sum()
56
  top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
57
 
 
60
  download_top["Total Value"] = download_top["Total Value"].astype(int)
61
  download_top["% of total"] = download_top["% of total"].round(2)
62
 
63
+ top["Name"].replace("User", "user", inplace=True)
64
+
65
  # All relevant metadata columns
66
+ meta_cols = meta_cols_map.get(group_col, [])
67
  # Collect all metadata per top n for each category (country, author, model)
68
  meta_map = {}
69
  download_map = {}
70
  for name in top["Name"]:
71
+ name_data = filtered_df[filtered_df[group_col] == name]
72
  meta_map[name] = {}
73
  download_map[name] = {}
74
  for col in meta_cols:
 
82
  meta = meta_map.get(nm, {})
83
  chips = []
84
  # Countries
85
+ for c in meta.get("org_country_single", []):
86
  if c == "United States of America":
87
  c = "USA"
88
+ if c == "user":
89
+ c = "User"
90
  chips.append((country_icon_map.get(c, ""), c))
91
  # Author
92
  for a in meta.get("author", []):
93
  icon = company_icon_map.get(a, "")
94
  if icon == "":
95
+ if meta.get("merged_country_groups_single", ["User"])[0] != "User":
96
  icon = "🏢"
97
  else:
98
  icon = "👤"
99
  chips.append((icon, a))
100
  # Downloads
101
+ # Sum downloads if multiple entries
102
+ total_downloads = sum(d for d in meta.get("downloads", []) if pd.notna(d)) # Check if d is not NaN
103
+ if total_downloads:
104
+ chips.append(("⬇️", f"{int(total_downloads):,}"))
105
+
106
  # Modality
107
  for m in meta.get("merged_modality", []):
108
  chips.append(("", m))
 
146
  return top[["Name", "Metadata", "% of total"]], download_top
147
 
148
  # Build leaderboards
149
+ top_countries, download_top_countries = get_top_n_leaderboard("org_country_single", top_n)
150
+ top_developers, download_top_developers = get_top_n_leaderboard("author", top_n)
151
+ top_models, download_top_models = get_top_n_leaderboard("model", top_n)
152
 
153
  # Chip renderer
154
  def chip(text, bg_color="#F0F0F0"):
graphs/model_market_share.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import plotly.graph_objects as go
2
  from plotly.subplots import make_subplots
3
 
@@ -162,10 +164,17 @@ def create_stacked_area_chart(
162
 
163
 
164
  def create_world_map(
165
- df, time_col="time", metric_col="metric", value_col="value", top_n_labels=10, start_time=None, end_time=None
166
  ):
167
- # Get all unique times and sort them
168
- times = sorted(df[time_col].unique())
 
 
 
 
 
 
 
169
 
170
  # Country code mapping
171
  country_code_map = {
@@ -214,8 +223,10 @@ def create_world_map(
214
  "Turkey": "TUR",
215
  }
216
 
217
- df["country_code"] = df[metric_col].map(country_code_map)
218
- mapped_data = df.dropna(subset=["country_code"])
 
 
219
 
220
  fig = make_subplots(
221
  rows=1,
@@ -223,42 +234,26 @@ def create_world_map(
223
  specs=[[{"type": "geo"}]],
224
  )
225
 
226
- # Function to aggregate data for time range
227
- def aggregate_time_range(start_time, end_time):
228
- range_data = mapped_data[
229
- (mapped_data[time_col] >= start_time) & (mapped_data[time_col] <= end_time)
230
- ]
231
- # Average values across time range
232
- agg_data = (
233
- range_data.groupby([metric_col, "country_code"])[value_col]
234
- .mean()
235
- .reset_index()
236
- )
237
- agg_data["percentage"] = agg_data[value_col] * 100
238
- return agg_data.sort_values("percentage", ascending=False)
239
-
240
- # Initial data if start or end time are not set (full range)
241
- if start_time is None:
242
- start_time = times[0]
243
- if end_time is None:
244
- end_time = times[-1]
245
- initial_data = aggregate_time_range(start_time, end_time)
246
- # top_countries = initial_data.head(top_n_labels)
247
 
248
  # Create hover text
249
  hover_text = []
250
- for _, row in initial_data.iterrows():
251
  hover_text.append(
252
- f"<b>{row[metric_col]}</b><br>"
253
- f"Avg Downloads: {row['percentage']:.1f}% of total<br>"
254
- f"Avg Value: {row[value_col]:.6f}"
255
  )
256
 
257
  # Add choropleth to plot
258
  fig.add_trace(
259
  go.Choropleth(
260
- locations=initial_data["country_code"],
261
- z=initial_data["percentage"],
262
  text=hover_text,
263
  hovertemplate="%{text}<extra></extra>",
264
  colorscale=[
 
1
+ import numpy as np
2
+ import pandas as pd
3
  import plotly.graph_objects as go
4
  from plotly.subplots import make_subplots
5
 
 
164
 
165
 
166
  def create_world_map(
167
+ df, top_n_labels=20
168
  ):
169
+ # Create a filtered_df with only countries
170
+ df = df[df['org_country_single'] != 'HF']
171
+ df = df[df['org_country_single'] != 'Online']
172
+ df = df[df['org_country_single'] != 'International']
173
+ df = df[df['org_country_single'] != 'user']
174
+
175
+ # Filter out models created after 2024-01-01 and downloads after 2024-01-01
176
+ # df = df[df['created'] > '2024-01-01']
177
+ # df = df[df['time'] > '2024-01-01']
178
 
179
  # Country code mapping
180
  country_code_map = {
 
223
  "Turkey": "TUR",
224
  }
225
 
226
+ df["country_code"] = df["org_country_single"].map(country_code_map)
227
+ df = df.dropna(subset=["country_code"])
228
+
229
+ # Fix country plot
230
 
231
  fig = make_subplots(
232
  rows=1,
 
234
  specs=[[{"type": "geo"}]],
235
  )
236
 
237
+ downloads_by_country = df.groupby('org_country_single')['downloads'].sum().reset_index()
238
+
239
+ # Prepare top countries for annotation
240
+ total_downloads = float(downloads_by_country['downloads'].sum())
241
+ downloads_by_country['pct'] = (downloads_by_country['downloads'] / total_downloads * 100.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  # Create hover text
244
  hover_text = []
245
+ for _, row in downloads_by_country.iterrows():
246
  hover_text.append(
247
+ f"<b>{row['org_country_single']}</b><br>"
248
+ f"Avg Downloads: {row['pct']:.1f}% of total<br>"
249
+ f"Avg Value: {row['downloads']:.6f}"
250
  )
251
 
252
  # Add choropleth to plot
253
  fig.add_trace(
254
  go.Choropleth(
255
+ locations=downloads_by_country["country_code"],
256
+ z=np.log10(downloads_by_country["pct"]),
257
  text=hover_text,
258
  hovertemplate="%{text}<extra></extra>",
259
  colorscale=[