import gradio as gr
import csv
import os
import numpy as np
def load_gpu_data():
"""Load GPU data from gpus.csv file."""
gpu_data = {}
csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')
try:
with open(csv_path, 'r') as file:
reader = csv.DictReader(file)
for row in reader:
                # Column names are taken verbatim from gpus.csv (including the
                # file's own spelling of 'sparce_tflops', i.e. sparse TFLOPs).
                gpu_name = row['gpu_model'].replace('_', ' ')
                tflops = float(row['sparce_tflops'])
                gpu_data[gpu_name] = tflops
    except Exception as e:
        print(f"Error loading GPU data: {e}")
        # Fall back to an empty mapping; "Custom" is always prepended to the
        # dropdown choices below, so adding it here would duplicate the entry.
        gpu_data = {}
return gpu_data
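
# A sketch of the gpus.csv layout that load_gpu_data() assumes (the rows here
# are illustrative, not read from the real file, which ships alongside this
# script):
#   gpu_model,sparce_tflops
#   H100,1979
#   A100_80GB,624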
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
"""
Calculate the time to train a model.
Formula:
- Total FLOPs = 6 * num_params * num_tokens
- Effective FLOPs per second = tflops_per_gpu * num_gpus * 10^12 * (MFU/100)
- Training time = Total FLOPs / Effective FLOPs per second
Args:
model_size_billions: Model size in billions of parameters
        tflops_per_gpu: BF16 TFLOPs per GPU (from the gpus.csv lookup or the custom-TFLOPs input)
num_gpus: Number of GPUs used
tokens_millions: Number of tokens in millions
mfu_percentage: Model FLOPs Utilization percentage
Returns:
Training time in hours
"""
# Convert inputs to base units
num_params = model_size_billions * 1e9
num_tokens = tokens_millions * 1e6
# Calculate total FLOPs needed
total_flops = 6 * num_params * num_tokens
# Calculate effective FLOPs per second
# tflops_per_gpu is in 10^12 FLOPs per second
flops_per_second = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)
# Calculate training time in seconds
training_time_seconds = total_flops / flops_per_second
# Convert to hours
training_time_hours = training_time_seconds / 3600
return training_time_hours
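
# Worked example of the formula above (illustrative numbers: a 7B model,
# 100B tokens, 8 GPUs at an assumed 989 TFLOPs each, 50% MFU):
#   total FLOPs = 6 * 7e9 * 100e9          = 4.2e21
#   sustained   = 989e12 * 8 * 0.5         = 3.956e15 FLOPs/s
#   time        = 4.2e21 / 3.956e15 / 3600 ≈ 295 hours (~12.3 days)
# i.e. calculate_training_time(7, 989, 8, 100_000, 50) ≈ 294.9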
def format_output(hours):
"""Format the output in a readable way."""
if hours < 24:
return f"{hours:.2f} hours"
else:
days = hours / 24
if days < 30:
return f"{days:.2f} days ({hours:.1f} hours)"
else:
months = days / 30
return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"
def slider_to_model_size(value):
"""Convert logarithmic slider value to actual model size in billions."""
# Map 0-100 to 0.1B-1000B logarithmically
min_log = np.log10(0.1) # -1
max_log = np.log10(1000) # 3
log_value = min_log + (max_log - min_log) * value / 100
return 10 ** log_value
def model_size_to_slider(size_billions):
"""Convert model size in billions to slider value."""
min_log = np.log10(0.1)
max_log = np.log10(1000)
log_value = np.log10(size_billions)
return 100 * (log_value - min_log) / (max_log - min_log)
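
# The two helpers above map a 0-100 slider position to 0.1B-1000B on a log
# scale. They are written for a log-scale model-size slider but are not wired
# into the UI below, which uses a plain gr.Number instead. Round-trip sketch:
#   slider_to_model_size(0)   -> 0.1  (100M)
#   slider_to_model_size(50)  -> 10.0 (10B)
#   slider_to_model_size(100) -> 1000 (1T)
#   model_size_to_slider(10)  -> 50.0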
def format_model_size(size_billions):
"""Format model size for display."""
if size_billions < 1:
return f"{size_billions * 1000:.0f}M"
elif size_billions < 1000:
return f"{size_billions:.1f}B"
else:
return f"{size_billions / 1000:.1f}T"
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
"""Update the calculation and return formatted results."""
# Convert model size to billions
if model_size_unit == "B":
model_size_billions = model_size_value
else: # T
model_size_billions = model_size_value * 1000
# Convert tokens to millions
if tokens_unit == "M":
tokens_millions = tokens_value
elif tokens_unit == "B":
tokens_millions = tokens_value * 1000
else: # T
tokens_millions = tokens_value * 1000000
# Determine TFLOPs value
if use_gpu_model and gpu_model != "Custom":
gpu_data = load_gpu_data()
tflops_per_gpu = gpu_data.get(gpu_model, custom_tflops)
gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
else:
tflops_per_gpu = custom_tflops
gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"
hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)
# Create detailed breakdown
total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)
breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU
### Training Time:
**{format_output(hours)}**
"""
return breakdown
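
# Example call (assuming an "H100" row exists in gpus.csv):
#   update_calculation(7, "B", True, "H100", 300, 8, 100, "B", 50)
# converts 7B parameters and 100B tokens to base units, looks up the H100's
# TFLOPs, and returns the markdown breakdown rendered in the results column.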
# Load GPU data
gpu_data = load_gpu_data()
gpu_choices = ["Custom"] + list(gpu_data.keys())
# Create the Gradio interface
with gr.Blocks(title="Model Training Time Calculator") as demo:
gr.Markdown("# Model Training Time Calculator")
gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")
with gr.Row():
with gr.Column():
with gr.Row():
model_size_value = gr.Number(
minimum=0.5,
maximum=1000,
value=7,
step=0.1,
label="Model Size",
info="Enter model size (0.5-1000)"
)
model_size_unit = gr.Radio(
choices=["B", "T"],
value="B",
label="Unit",
info="Model size unit"
)
# GPU Selection
use_gpu_model = gr.Checkbox(
value=True,
label="Use GPU Model from List",
info="Check to select a GPU model, uncheck to input custom TFLOPs"
)
gpu_model = gr.Dropdown(
choices=gpu_choices,
value="H100" if "H100" in gpu_choices else gpu_choices[0],
label="GPU Model",
info="Select a GPU model from the list",
visible=True
)
custom_tflops = gr.Slider(
minimum=10,
maximum=2000,
value=300,
step=10,
label="Custom BF16 TFLOPs per GPU",
info="Effective (non-sparsity) TFLOPs per GPU",
visible=False
)
num_gpus = gr.Slider(
minimum=1,
maximum=1024,
value=8,
step=1,
label="Number of GPUs",
info="Total number of GPUs for training"
)
with gr.Row():
tokens_value = gr.Slider(
minimum=1,
maximum=1000,
value=100,
step=1,
label="Training Tokens",
info="Number of training tokens"
)
tokens_unit = gr.Radio(
choices=["M", "B", "T"],
value="B",
label="Unit",
info="Token count unit"
)
mfu = gr.Slider(
minimum=10,
maximum=100,
value=50,
step=5,
label="Model FLOPs Utilization (MFU) %",
info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
)
with gr.Column():
output = gr.Markdown(label="Results")
    # Toggle between GPU model and custom TFLOPs. The current dropdown
    # selection is passed in as an input because reading gpu_model.value
    # inside a callback only returns the component's initial value.
    def toggle_gpu_input(use_gpu, selected_gpu):
        return (
            gr.update(visible=use_gpu),
            gr.update(visible=(not use_gpu) or selected_gpu == "Custom"),
        )
    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )
# Show custom TFLOPs when "Custom" is selected
def check_custom_selected(gpu_model_value):
return gr.update(visible=gpu_model_value == "Custom")
gpu_model.change(
fn=check_custom_selected,
inputs=[gpu_model],
outputs=[custom_tflops]
)
# Set up live updating
all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]
for input_component in all_inputs:
input_component.change(
fn=update_calculation,
inputs=all_inputs,
outputs=output
)
# Initial calculation
demo.load(
fn=update_calculation,
inputs=all_inputs,
outputs=output
)
if __name__ == "__main__":
demo.launch()