emsesc committed on
Commit
f136ea6
·
1 Parent(s): 4ad4863

initial shift to duckdb

Browse files
Files changed (3) hide show
  1. app.py +104 -26
  2. graphs/leaderboard.py +182 -60
  3. requirements.txt +3 -1
app.py CHANGED
@@ -1,6 +1,8 @@
1
  from dash import Dash, html, dcc, Input, Output, State
2
  import pandas as pd
3
  import dash_mantine_components as dmc
 
 
4
  from graphs.leaderboard import (
5
  create_leaderboard,
6
  get_top_n_leaderboard,
@@ -11,12 +13,44 @@ from graphs.leaderboard import (
11
  app = Dash()
12
  server = app.server
13
 
14
- # Load pre-processed data frames
15
- filtered_df = pd.read_pickle("data_frames/filtered_df.pkl")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Create a dcc slider for time range selection by year (readable marks)
18
- start_dt = filtered_df["time"].min()
19
- end_dt = filtered_df["time"].max()
20
  start_ts = int(start_dt.timestamp())
21
  end_ts = int(end_dt.timestamp())
22
 
@@ -48,10 +82,6 @@ time_slider = dmc.RangeSlider(
48
  marks=marks,
49
  style={"width": "70%", "margin": "0 auto"},
50
  labelAlwaysOn=False,
51
- # thumbChildren=[
52
- # dmc.Text(id="time-slider-thumb-from-label", size="xs", children="Hello"),
53
- # dmc.Text(id="time-slider-thumb-to-label", size="xs"),
54
- # ]
55
  )
56
 
57
  # App layout
@@ -167,7 +197,7 @@ app.layout = dmc.MantineProvider(
167
  # Intro / description below header (kept but styled to match layout)
168
  # Title
169
  html.Div(
170
- children="Model Leaderboard", # Change this to your desired title
171
  style={
172
  "fontSize": 40,
173
  "fontWeight": "700",
@@ -180,7 +210,7 @@ app.layout = dmc.MantineProvider(
180
  html.Div(
181
  children=[
182
  html.Button(
183
- "Read the paper", # Change this to your desired button text
184
  id="my-button",
185
  style={
186
  "padding": "10px 20px",
@@ -269,7 +299,6 @@ app.layout = dmc.MantineProvider(
269
  "gap": "24px",
270
  "padding": "32px",
271
  "alignItems": "flex-start",
272
- # 'margin': '24px auto 64px', # centered horizontally
273
  "marginLeft": "100px",
274
  "marginRight": "100px",
275
  "backgroundColor": "#FFFBF9",
@@ -281,7 +310,7 @@ app.layout = dmc.MantineProvider(
281
  dcc.Tabs(
282
  id="leaderboard-tabs",
283
  value="Countries",
284
- children=[ # wrap Tabs here
285
  dcc.Tab(
286
  label="Countries",
287
  value="Countries",
@@ -297,10 +326,10 @@ app.layout = dmc.MantineProvider(
297
  "border": "none",
298
  "padding": "10px 18px",
299
  "fontWeight": "700",
300
- "borderBottom": "3px solid #082030", # underline only
301
  },
302
  children=[
303
- create_leaderboard(filtered_df, "countries")
304
  ],
305
  ),
306
  dcc.Tab(
@@ -321,7 +350,7 @@ app.layout = dmc.MantineProvider(
321
  "borderBottom": "3px solid #082030",
322
  },
323
  children=[
324
- create_leaderboard(filtered_df, "developers")
325
  ],
326
  ),
327
  dcc.Tab(
@@ -342,7 +371,7 @@ app.layout = dmc.MantineProvider(
342
  "borderBottom": "3px solid #082030",
343
  },
344
  children=[
345
- create_leaderboard(filtered_df, "models")
346
  ],
347
  ),
348
  ],
@@ -355,7 +384,6 @@ app.layout = dmc.MantineProvider(
355
  "marginBottom": "64px",
356
  "marginLeft": "50px",
357
  "marginRight": "50px",
358
- # 'maxWidth': '1250px',
359
  },
360
  ),
361
  ],
@@ -370,12 +398,62 @@ app.layout = dmc.MantineProvider(
370
 
371
  # Callbacks for interactivity
372
  # -- helper utilities to consolidate duplicated callback logic --
373
- def _apply_time_slider(slider_value):
 
 
 
 
 
 
374
  if slider_value and len(slider_value) == 2:
375
  start = pd.to_datetime(slider_value[0], unit="s")
376
  end = pd.to_datetime(slider_value[1], unit="s")
377
- return filtered_df[(filtered_df["time"] >= start) & (filtered_df["time"] <= end)]
378
- return filtered_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
  def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col, filename, default_label="▼ Show Top 50", chip_color="#F0F9FF"):
381
  # Normalize label on first load
@@ -393,14 +471,14 @@ def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col
393
  else:
394
  top_n, new_label = 10, "▼ Show Top 50"
395
 
396
- # Apply time filter and build table
397
- df_time = _apply_time_slider(slider_value)
398
- df, download_df = get_top_n_leaderboard(df_time, group_col, top_n)
 
 
399
  return render_table_content(df, download_df, chip_color=chip_color, filename=filename), new_label
400
  # -- end helpers --
401
 
402
- # ...existing code...
403
-
404
  # Callbacks for interactivity (modularized)
405
  @app.callback(
406
  Output("top_countries-table", "children"),
@@ -468,4 +546,4 @@ def update_range_labels(values):
468
 
469
  # Run the app
470
  if __name__ == "__main__":
471
- app.run(debug=True)
 
1
  from dash import Dash, html, dcc, Input, Output, State
2
  import pandas as pd
3
  import dash_mantine_components as dmc
4
+ import duckdb
5
+ import time
6
  from graphs.leaderboard import (
7
  create_leaderboard,
8
  get_top_n_leaderboard,
 
13
  app = Dash()
14
  server = app.server
15
 
16
# Global in-memory DuckDB connection used by the layout and the callbacks.
con = duckdb.connect(database=':memory:', read_only=False)

# The parquet file is read remotely from the Hugging Face Hub via DuckDB.
# The URL is derived from the dataset id so the two cannot drift apart.
HF_DATASET_ID = "emsesc/open_model_evolution_data"
hf_parquet_url = (
    f"https://huggingface.co/datasets/{HF_DATASET_ID}/resolve/main/filtered_df.parquet"
)

print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
try:
    overall_start_time = time.time()

    # httpfs must be installed and loaded before read_parquet can fetch an
    # https:// URL.
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")

    # Expose the remote file as a view named filtered_df; later queries select
    # from this view instead of loading the whole file into pandas.
    con.execute(f"""
        CREATE OR REPLACE VIEW filtered_df AS
        SELECT * FROM read_parquet('{hf_parquet_url}')
    """)

    # Log the available columns as a quick sanity check of the schema.
    columns = con.execute("DESCRIBE filtered_df").fetchdf()
    print("Columns:", columns['column_name'].tolist())

    # Overall time span of the data; it drives the bounds of the time slider.
    time_range = con.execute("SELECT MIN(time) as min_time, MAX(time) as max_time FROM filtered_df").fetchdf()
    start_dt = pd.to_datetime(time_range['min_time'].iloc[0])
    end_dt = pd.to_datetime(time_range['max_time'].iloc[0])

    # MIN/MAX are NULL when there are no usable time values; fail here with a
    # clear message rather than later when .timestamp() is called on NaT.
    if pd.isna(start_dt) or pd.isna(end_dt):
        raise ValueError(
            "Dataset has no usable 'time' values; cannot build the time slider."
        )

    msg = f"Successfully connected to dataset in {time.time() - overall_start_time:.2f}s."
    print(msg)
except Exception as e:
    # The app cannot work without the dataset: report the failure and abort
    # start-up by re-raising.
    err_msg = f"Failed to load dataset. Error: {e}"
    print(err_msg)
    raise
52
 
53
  # Create a dcc slider for time range selection by year (readable marks)
 
 
54
  start_ts = int(start_dt.timestamp())
55
  end_ts = int(end_dt.timestamp())
56
 
 
82
  marks=marks,
83
  style={"width": "70%", "margin": "0 auto"},
84
  labelAlwaysOn=False,
 
 
 
 
85
  )
86
 
87
  # App layout
 
197
  # Intro / description below header (kept but styled to match layout)
198
  # Title
199
  html.Div(
200
+ children="Model Leaderboard",
201
  style={
202
  "fontSize": 40,
203
  "fontWeight": "700",
 
210
  html.Div(
211
  children=[
212
  html.Button(
213
+ "Read the paper",
214
  id="my-button",
215
  style={
216
  "padding": "10px 20px",
 
299
  "gap": "24px",
300
  "padding": "32px",
301
  "alignItems": "flex-start",
 
302
  "marginLeft": "100px",
303
  "marginRight": "100px",
304
  "backgroundColor": "#FFFBF9",
 
310
  dcc.Tabs(
311
  id="leaderboard-tabs",
312
  value="Countries",
313
+ children=[
314
  dcc.Tab(
315
  label="Countries",
316
  value="Countries",
 
326
  "border": "none",
327
  "padding": "10px 18px",
328
  "fontWeight": "700",
329
+ "borderBottom": "3px solid #082030",
330
  },
331
  children=[
332
+ create_leaderboard(con, "countries")
333
  ],
334
  ),
335
  dcc.Tab(
 
350
  "borderBottom": "3px solid #082030",
351
  },
352
  children=[
353
+ create_leaderboard(con, "developers")
354
  ],
355
  ),
356
  dcc.Tab(
 
371
  "borderBottom": "3px solid #082030",
372
  },
373
  children=[
374
+ create_leaderboard(con, "models")
375
  ],
376
  ),
377
  ],
 
384
  "marginBottom": "64px",
385
  "marginLeft": "50px",
386
  "marginRight": "50px",
 
387
  },
388
  ),
389
  ],
 
398
 
399
  # Callbacks for interactivity
400
  # -- helper utilities to consolidate duplicated callback logic --
401
def _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n):
    """Return the rows of the top ``top_n`` groups, ranked inside DuckDB.

    The values of ``group_col`` are ranked by their summed ``downloads`` in
    DuckDB, and only the rows of the ``filtered_df`` view that belong to the
    winning groups are fetched, which keeps the transfer to pandas small.
    The query runs on the module-level ``con`` connection.

    Args:
        slider_value: Two-item sequence of epoch seconds ``[start, end]``.
            Any other value (``None``, empty, wrong length) disables the
            time filter.
        group_col: Name of the column to group and rank by.
        top_n: Maximum number of groups to keep.

    Returns:
        pandas.DataFrame holding the metadata columns for the selected groups.
    """
    # Build time filter clause. The bounds are rendered from the converted
    # pandas values, not from the raw slider payload.
    time_filter = ""
    if slider_value and len(slider_value) == 2:
        start = pd.to_datetime(slider_value[0], unit="s")
        end = pd.to_datetime(slider_value[1], unit="s")
        time_filter = f"WHERE time >= '{start}' AND time <= '{end}'"

    # Country normalisation: merge HF into the USA and merge the
    # International / Online buckets.
    country_case = """CASE
                    WHEN org_country_single = 'HF' THEN 'United States of America'
                    WHEN org_country_single = 'International' THEN 'International/Online'
                    WHEN org_country_single = 'Online' THEN 'International/Online'
                    ELSE org_country_single
                END as org_country_single"""

    # Every column is projected exactly once. Projecting group_col in addition
    # to this list duplicated 'author' / 'model', and for 'org_country_single'
    # it put the raw column next to the normalised one under the same name,
    # so grouping could bypass the normalisation.
    projections = {
        "org_country_single": country_case,
        "author": "author",
        "merged_country_groups_single": "merged_country_groups_single",
        "merged_modality": "merged_modality",
        "downloads": "downloads",
        "estimated_parameters": "estimated_parameters",
        "model": "model",
    }

    # Quote the identifier so unusual column names cannot break the statement.
    group_ident = '"' + str(group_col).replace('"', '""') + '"'
    if group_col not in projections:
        # A grouping column outside the fixed metadata list is still needed.
        projections = {group_col: group_ident, **projections}

    select_list = ",\n                ".join(projections.values())

    # First rank the groups, then pull only the rows of the winning groups.
    query = f"""
        WITH base_data AS (
            SELECT
                {select_list}
            FROM filtered_df
            {time_filter}
        ),
        aggregated AS (
            SELECT
                {group_ident} as name,
                SUM(downloads) as total_downloads
            FROM base_data
            GROUP BY {group_ident}
            ORDER BY total_downloads DESC
            LIMIT {int(top_n)}
        )
        SELECT
            b.*
        FROM base_data b
        INNER JOIN aggregated a ON b.{group_ident} = a.name
        ORDER BY a.total_downloads DESC
    """

    return con.execute(query).fetchdf()
457
 
458
  def _leaderboard_callback_logic(n_clicks, slider_value, current_label, group_col, filename, default_label="▼ Show Top 50", chip_color="#F0F9FF"):
459
  # Normalize label on first load
 
471
  else:
472
  top_n, new_label = 10, "▼ Show Top 50"
473
 
474
+ # Get filtered and aggregated data directly from DuckDB
475
+ df_filtered = _get_filtered_top_n_from_duckdb(slider_value, group_col, top_n)
476
+
477
+ # Process the already-filtered data
478
+ df, download_df = get_top_n_leaderboard(df_filtered, group_col, top_n)
479
  return render_table_content(df, download_df, chip_color=chip_color, filename=filename), new_label
480
  # -- end helpers --
481
 
 
 
482
  # Callbacks for interactivity (modularized)
483
  @app.callback(
484
  Output("top_countries-table", "children"),
 
546
 
547
  # Run the app
548
  if __name__ == "__main__":
549
+ app.run(debug=True)
graphs/leaderboard.py CHANGED
@@ -47,6 +47,33 @@ country_icon_map = {
47
  "Switzerland": "🇨🇭",
48
  "User": "👤",
49
  "International/Online": "🌐",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
 
52
  company_icon_map = {
@@ -370,8 +397,20 @@ def render_table(
370
  )
371
 
372
 
373
- # Function to get top N leaderboard
374
  def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
 
 
 
 
 
 
 
 
 
 
 
 
375
  top = (
376
  filtered_df.groupby(group_col)["downloads"]
377
  .sum()
@@ -379,6 +418,7 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
379
  .reset_index()
380
  .rename(columns={group_col: "Name", "downloads": "Total Value"})
381
  )
 
382
  total_value = top["Total Value"].sum()
383
  top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
384
 
@@ -387,17 +427,21 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
387
  download_top["Total Value"] = download_top["Total Value"].astype(int)
388
  download_top["% of total"] = download_top["% of total"].round(2)
389
 
390
- top["Name"].replace("User", "user")
 
391
 
392
  # All relevant metadata columns
393
  meta_cols = meta_cols_map.get(group_col, [])
 
394
  # Collect all metadata per top n for each category (country, author, model)
395
  meta_map = {}
396
  download_map = {}
 
397
  for name in top["Name"]:
398
  name_data = filtered_df[filtered_df[group_col] == name]
399
  meta_map[name] = {}
400
  download_map[name] = {}
 
401
  for col in meta_cols:
402
  if col in name_data.columns:
403
  unique_vals = name_data[col].unique()
@@ -408,13 +452,15 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
408
  def build_metadata(nm):
409
  meta = meta_map.get(nm, {})
410
  chips = []
 
411
  # Countries
412
  for c in meta.get("org_country_single", []):
413
  if c == "United States of America":
414
  c = "USA"
415
  if c == "user":
416
  c = "User"
417
- chips.append((country_icon_map.get(c, ""), c))
 
418
  # Author
419
  for a in meta.get("author", []):
420
  icon = company_icon_map.get(a, "")
@@ -424,21 +470,22 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
424
  else:
425
  icon = "👤"
426
  chips.append((icon, a))
 
427
  # Downloads
428
- # Sum downloads if multiple entries
429
  total_downloads = sum(
430
  d for d in meta.get("downloads", []) if pd.notna(d)
431
- ) # Check if d is not NaN
432
  if total_downloads:
433
  chips.append(("⬇️", f"{int(total_downloads):,}"))
434
 
435
  # Modality
436
  for m in meta.get("merged_modality", []):
437
- chips.append(("", m))
 
438
 
439
  # Estimated Parameters
440
  for p in meta.get("estimated_parameters", []):
441
- if pd.notna(p): # Check if p is not NaN
442
  if p >= 1e9:
443
  p_str = f"{p / 1e9:.1f}B"
444
  elif p >= 1e6:
@@ -446,28 +493,32 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
446
  elif p >= 1e3:
447
  p_str = f"{p / 1e3:.1f}K"
448
  else:
449
- p_str = str(p)
450
  chips.append(("⚙️", p_str))
 
451
  return chips
452
 
453
- # Function to create downloadable dataframe
454
  def build_download_metadata(nm):
455
  meta = download_map.get(nm, {})
456
  download_info = {}
 
457
  for col in meta_cols:
458
- # don't add empty columns
459
  if col not in meta or not meta[col]:
460
  continue
 
461
  vals = meta.get(col, [])
462
  if vals:
463
- # Join list into a single string for CSV
464
- download_info[col] = ", ".join(str(v) for v in vals)
465
  else:
466
  download_info[col] = ""
 
467
  return download_info
468
 
469
  # Apply metadata builder to top dataframe
470
  top["Metadata"] = top["Name"].astype(object).apply(build_metadata)
 
 
471
  download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
472
  download_info_df = pd.DataFrame(download_info_list)
473
  download_top = pd.concat([download_top, download_info_df], axis=1)
@@ -475,52 +526,123 @@ def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
475
  return top[["Name", "Metadata", "% of total"]], download_top
476
 
477
 
478
- def create_leaderboard(filtered_df, board_type, top_n=10):
479
- if filtered_df.empty:
480
- return html.Div("No data in selected range")
481
-
482
- # Merge HF and USA
483
- filtered_df["org_country_single"] = filtered_df["org_country_single"].replace(
484
- {"HF": "United States of America"}
485
- )
486
- # Merge International and Online
487
- filtered_df["org_country_single"] = filtered_df["org_country_single"].replace(
488
- {"International": "International/Online", "Online": "International/Online"}
489
- )
490
-
491
- # Build leaderboards
492
- top_countries, download_top_countries = get_top_n_leaderboard(
493
- filtered_df, "org_country_single", top_n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  )
495
- top_developers, download_top_developers = get_top_n_leaderboard(
496
- filtered_df, "author", top_n
497
- )
498
- top_models, download_top_models = get_top_n_leaderboard(filtered_df, "model", top_n)
499
-
500
- if board_type == "countries":
501
- return render_table(
502
- top_countries,
503
- download_top_countries,
504
- "Top Countries",
505
- chip_color="#F0F9FF",
506
- bar_color="#082030",
507
- filename="top_countries",
508
- )
509
- elif board_type == "developers":
510
- return render_table(
511
- top_developers,
512
- download_top_developers,
513
- "Top Developers",
514
- chip_color="#F0F9FF",
515
- bar_color="#082030",
516
- filename="top_developers",
517
- )
518
- else:
519
- return render_table(
520
- top_models,
521
- download_top_models,
522
- "Top Models",
523
- chip_color="#F0F9FF",
524
- bar_color="#082030",
525
- filename="top_models",
526
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  "Switzerland": "🇨🇭",
48
  "User": "👤",
49
  "International/Online": "🌐",
50
+ "Spain": "🇪🇸",
51
+ "Sweden": "🇸🇪",
52
+ "Norway": "🇳🇴",
53
+ "Denmark": "🇩🇰",
54
+ "Austria": "🇦🇹",
55
+ "Belgium": "🇧🇪",
56
+ "Poland": "🇵🇱",
57
+ "Turkey": "🇹🇷",
58
+ "Mexico": "🇲🇽",
59
+ "Argentina": "🇦🇷",
60
+ "Thailand": "🇹🇭",
61
+ "Indonesia": "🇮🇩",
62
+ "Malaysia": "🇲🇾",
63
+ "Philippines": "🇵🇭",
64
+ "Egypt": "🇪🇬",
65
+ "South Africa": "🇿🇦",
66
+ "New Zealand": "🇳🇿",
67
+ "Ireland": "🇮🇪",
68
+ "Portugal": "🇵🇹",
69
+ "Greece": "🇬🇷",
70
+ "Czech Republic": "🇨🇿",
71
+ "Romania": "🇷🇴",
72
+ "Ukraine": "🇺🇦",
73
+ "United Arab Emirates": "🇦🇪",
74
+ "Saudi Arabia": "🇸🇦",
75
+ "Pakistan": "🇵🇰",
76
+ "Bangladesh": "🇧🇩",
77
  }
78
 
79
  company_icon_map = {
 
397
  )
398
 
399
 
400
+ # Function to get top N leaderboard (now accepts pandas DataFrame from DuckDB query)
401
  def get_top_n_leaderboard(filtered_df, group_col, top_n=10):
402
+ """
403
+ Get top N entries for a leaderboard
404
+
405
+ Args:
406
+ filtered_df: Pandas DataFrame (already filtered by time from DuckDB query)
407
+ group_col: Column to group by
408
+ top_n: Number of top entries to return
409
+
410
+ Returns:
411
+ tuple: (display_df, download_df)
412
+ """
413
+ # Group by and get top N
414
  top = (
415
  filtered_df.groupby(group_col)["downloads"]
416
  .sum()
 
418
  .reset_index()
419
  .rename(columns={group_col: "Name", "downloads": "Total Value"})
420
  )
421
+
422
  total_value = top["Total Value"].sum()
423
  top["% of total"] = top["Total Value"] / total_value * 100 if total_value else 0
424
 
 
427
  download_top["Total Value"] = download_top["Total Value"].astype(int)
428
  download_top["% of total"] = download_top["% of total"].round(2)
429
 
430
+ # Replace "User" in names
431
+ top["Name"] = top["Name"].replace("User", "user")
432
 
433
  # All relevant metadata columns
434
  meta_cols = meta_cols_map.get(group_col, [])
435
+
436
  # Collect all metadata per top n for each category (country, author, model)
437
  meta_map = {}
438
  download_map = {}
439
+
440
  for name in top["Name"]:
441
  name_data = filtered_df[filtered_df[group_col] == name]
442
  meta_map[name] = {}
443
  download_map[name] = {}
444
+
445
  for col in meta_cols:
446
  if col in name_data.columns:
447
  unique_vals = name_data[col].unique()
 
452
  def build_metadata(nm):
453
  meta = meta_map.get(nm, {})
454
  chips = []
455
+
456
  # Countries
457
  for c in meta.get("org_country_single", []):
458
  if c == "United States of America":
459
  c = "USA"
460
  if c == "user":
461
  c = "User"
462
+ chips.append((country_icon_map.get(c, "🌍"), c))
463
+
464
  # Author
465
  for a in meta.get("author", []):
466
  icon = company_icon_map.get(a, "")
 
470
  else:
471
  icon = "👤"
472
  chips.append((icon, a))
473
+
474
  # Downloads
 
475
  total_downloads = sum(
476
  d for d in meta.get("downloads", []) if pd.notna(d)
477
+ )
478
  if total_downloads:
479
  chips.append(("⬇️", f"{int(total_downloads):,}"))
480
 
481
  # Modality
482
  for m in meta.get("merged_modality", []):
483
+ if pd.notna(m):
484
+ chips.append(("", m))
485
 
486
  # Estimated Parameters
487
  for p in meta.get("estimated_parameters", []):
488
+ if pd.notna(p):
489
  if p >= 1e9:
490
  p_str = f"{p / 1e9:.1f}B"
491
  elif p >= 1e6:
 
493
  elif p >= 1e3:
494
  p_str = f"{p / 1e3:.1f}K"
495
  else:
496
+ p_str = str(int(p))
497
  chips.append(("⚙️", p_str))
498
+
499
  return chips
500
 
501
+ # Function to create downloadable dataframe metadata
502
  def build_download_metadata(nm):
503
  meta = download_map.get(nm, {})
504
  download_info = {}
505
+
506
  for col in meta_cols:
 
507
  if col not in meta or not meta[col]:
508
  continue
509
+
510
  vals = meta.get(col, [])
511
  if vals:
512
+ download_info[col] = ", ".join(str(v) for v in vals if pd.notna(v))
 
513
  else:
514
  download_info[col] = ""
515
+
516
  return download_info
517
 
518
  # Apply metadata builder to top dataframe
519
  top["Metadata"] = top["Name"].astype(object).apply(build_metadata)
520
+
521
+ # Build download dataframe with metadata
522
  download_info_list = [build_download_metadata(nm) for nm in download_top["Name"]]
523
  download_info_df = pd.DataFrame(download_info_list)
524
  download_top = pd.concat([download_top, download_info_df], axis=1)
 
526
  return top[["Name", "Metadata", "% of total"]], download_top
527
 
528
 
529
def get_top_n_from_duckdb(con, group_col, top_n=10, time_filter=None):
    """Query DuckDB for the rows that belong to the top ``top_n`` groups.

    The values of ``group_col`` are ranked by their summed ``downloads``
    inside DuckDB and only the rows of the winning groups are returned, so
    little data is transferred to pandas.

    Args:
        con: DuckDB connection object exposing the ``filtered_df`` view.
        group_col: Column to group and rank by.
        top_n: Number of top entries to keep.
        time_filter: Optional ``(start_timestamp, end_timestamp)`` pair in
            epoch seconds; a falsy value disables the time filter.

    Returns:
        pandas.DataFrame with only the rows needed for the top N groups, or
        an empty DataFrame when the query fails (the error is printed).
    """
    # Build time filter clause. The bounds are rendered from the converted
    # pandas values, not from the raw input.
    time_clause = ""
    if time_filter:
        start = pd.to_datetime(time_filter[0], unit="s")
        end = pd.to_datetime(time_filter[1], unit="s")
        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

    # Country normalisation: merge HF into the USA and merge the
    # International / Online buckets.
    country_case = """CASE
                    WHEN org_country_single = 'HF' THEN 'United States of America'
                    WHEN org_country_single = 'International' THEN 'International/Online'
                    WHEN org_country_single = 'Online' THEN 'International/Online'
                    ELSE org_country_single
                END as org_country_single"""

    # Every column is projected exactly once. Projecting group_col in addition
    # to this list duplicated 'author' / 'model', and for 'org_country_single'
    # it put the raw column next to the normalised one under the same name,
    # so grouping could bypass the normalisation.
    projections = {
        "org_country_single": country_case,
        "author": "author",
        "merged_country_groups_single": "merged_country_groups_single",
        "merged_modality": "merged_modality",
        "downloads": "downloads",
        "estimated_parameters": "estimated_parameters",
        "model": "model",
    }

    try:
        # Quote the identifier so unusual column names cannot break the
        # statement.
        group_ident = '"' + str(group_col).replace('"', '""') + '"'
        if group_col not in projections:
            # A grouping column outside the fixed metadata list is still
            # needed.
            projections = {group_col: group_ident, **projections}

        select_list = ",\n                ".join(projections.values())

        # Optimized query: first find the top N, then get only those rows.
        query = f"""
        WITH base_data AS (
            SELECT
                {select_list}
            FROM filtered_df
            {time_clause}
        ),
        top_items AS (
            SELECT
                {group_ident} as name,
                SUM(downloads) as total_downloads
            FROM base_data
            GROUP BY {group_ident}
            ORDER BY total_downloads DESC
            LIMIT {int(top_n)}
        )
        SELECT
            b.*
        FROM base_data b
        INNER JOIN top_items t ON b.{group_ident} = t.name
        ORDER BY t.total_downloads DESC
        """

        return con.execute(query).fetchdf()
    except Exception as e:
        # Deliberate best-effort: callers render "No data available" for an
        # empty frame instead of crashing the page.
        print(f"Error querying DuckDB: {e}")
        return pd.DataFrame()
595
+
596
+
597
def create_leaderboard(con, board_type, top_n=10):
    """Build the leaderboard table component for one board type.

    Args:
        con: DuckDB connection object.
        board_type: One of 'countries', 'developers' or 'models'.
        top_n: Number of top entries to display.

    Returns:
        Dash HTML component: the rendered leaderboard table, or a plain
        message Div when the board type is unknown or no rows came back.
    """
    # board type -> (grouping column, table title, download file name)
    board_specs = {
        "countries": ("org_country_single", "Top Countries", "top_countries"),
        "developers": ("author", "Top Developers", "top_developers"),
        "models": ("model", "Top Models", "top_models"),
    }

    spec = board_specs.get(board_type)
    if spec is None:
        return html.Div(f"Unknown board type: {board_type}")
    group_col, title, filename = spec

    # DuckDB returns only the rows that belong to the top N groups.
    rows = get_top_n_from_duckdb(con, group_col, top_n)
    if rows.empty:
        return html.Div("No data available")

    # Turn the pre-filtered rows into the display and download frames.
    display_df, download_df = get_top_n_leaderboard(rows, group_col, top_n)

    return render_table(
        display_df,
        download_df,
        title,
        chip_color="#F0F9FF",
        bar_color="#082030",
        filename=filename,
    )
requirements.txt CHANGED
@@ -3,4 +3,6 @@ dash
3
  plotly
4
  gunicorn
5
  dash-mantine-components
6
- dash-bootstrap-components
 
 
 
3
  plotly
4
  gunicorn
5
  dash-mantine-components
6
+ dash-bootstrap-components
7
+ pyarrow
8
+ duckdb