Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import csv | |
| import os | |
| import numpy as np | |
def load_gpu_data():
    """Load GPU specs from the gpus.csv file next to this script.

    Returns:
        dict mapping a human-readable GPU name (underscores replaced with
        spaces) to its TFLOPs rating as a float. Returns an empty dict when
        the CSV is missing or malformed; the UI then falls back to the
        custom-TFLOPs input. (The old fallback of {"Custom": 0} produced a
        duplicate "Custom" entry in the dropdown, which always prepends
        "Custom" itself.)
    """
    gpu_data = {}
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')
    try:
        # newline='' is the csv-module recommended way to open CSV files;
        # an explicit encoding avoids locale-dependent decoding.
        with open(csv_path, 'r', newline='', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                gpu_name = row['gpu_model'].replace('_', ' ')
                # Column name is (mis)spelled 'sparce_tflops' in the data
                # file — keep it matching the CSV header.
                tflops = float(row['sparce_tflops'])
                gpu_data[gpu_name] = tflops
    except Exception as e:
        # Best-effort: log and return empty so the app still works with
        # the custom-TFLOPs slider.
        print(f"Error loading GPU data: {e}")
        gpu_data = {}
    return gpu_data
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """
    Estimate wall-clock training time for a dense model.

    Uses the standard compute approximation:
        total FLOPs       = 6 * params * tokens
        sustained FLOPs/s = tflops_per_gpu * num_gpus * 1e12 * (MFU / 100)
        time              = total FLOPs / sustained FLOPs/s

    Args:
        model_size_billions: parameter count, in billions
        tflops_per_gpu: per-GPU BF16 TFLOPs (effective, non-sparsity)
        num_gpus: number of GPUs in the job
        tokens_millions: training token count, in millions
        mfu_percentage: Model FLOPs Utilization, in percent

    Returns:
        Estimated training time in hours.
    """
    # Scale the human-friendly units back to raw counts.
    params = model_size_billions * 1e9
    tokens = tokens_millions * 1e6

    # Compute-optimal dense-training estimate: 6 FLOPs per param per token.
    total_flops = 6 * params * tokens

    # Aggregate sustained throughput across the cluster, discounted by MFU.
    sustained_flops_per_sec = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)

    seconds = total_flops / sustained_flops_per_sec
    return seconds / 3600
def format_output(hours):
    """Render a duration given in hours as a human-readable string."""
    # Guard clauses from shortest to longest duration.
    if hours < 24:
        return f"{hours:.2f} hours"
    days = hours / 24
    if days < 30:
        return f"{days:.2f} days ({hours:.1f} hours)"
    # A "month" here is a flat 30 days — fine for rough estimates.
    months = days / 30
    return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"
def slider_to_model_size(value):
    """Map a 0-100 slider position to a model size in billions.

    The mapping is logarithmic: 0 -> 0.1B, 100 -> 1000B.
    """
    lo = np.log10(0.1)   # -1
    hi = np.log10(1000)  # 3
    exponent = lo + (hi - lo) * value / 100
    return 10 ** exponent
def model_size_to_slider(size_billions):
    """Inverse of slider_to_model_size: billions of params -> 0-100 position."""
    lo = np.log10(0.1)
    hi = np.log10(1000)
    # Linear position of log10(size) within the [lo, hi] range, scaled to 0-100.
    return 100 * (np.log10(size_billions) - lo) / (hi - lo)
def format_model_size(size_billions):
    """Pretty-print a parameter count given in billions (e.g. 0.5 -> '500M')."""
    # Branches ordered largest-first; thresholds match the original exactly.
    if size_billions >= 1000:
        return f"{size_billions / 1000:.1f}T"
    if size_billions >= 1:
        return f"{size_billions:.1f}B"
    return f"{size_billions * 1000:.0f}M"
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Recompute the training-time estimate and return a Markdown breakdown.

    Args:
        model_size_value: numeric model size, interpreted per model_size_unit
        model_size_unit: "B" (billions) or "T" (trillions)
        use_gpu_model: True to look up TFLOPs from the GPU list
        gpu_model: selected GPU name, or "Custom"
        custom_tflops: per-GPU TFLOPs used when no list GPU is selected
        num_gpus: number of GPUs
        tokens_value: token count, interpreted per tokens_unit
        tokens_unit: "M", "B", or "T"
        mfu_percentage: Model FLOPs Utilization in percent

    Returns:
        A Markdown string with the calculation breakdown and estimated time.
    """
    # Normalize model size to billions of parameters.
    if model_size_unit == "B":
        model_size_billions = model_size_value
    else:  # T
        model_size_billions = model_size_value * 1000
    # Normalize token count to millions.
    if tokens_unit == "M":
        tokens_millions = tokens_value
    elif tokens_unit == "B":
        tokens_millions = tokens_value * 1000
    else:  # T
        tokens_millions = tokens_value * 1000000
    # Resolve the per-GPU TFLOPs figure: list lookup or the custom slider.
    if use_gpu_model and gpu_model != "Custom":
        # NOTE(review): re-reads gpus.csv on every UI change; cheap for a
        # small file, but the module-level gpu_data could be reused instead.
        gpu_data = load_gpu_data()
        # Falls back to the custom value if the selected name is missing.
        tflops_per_gpu = gpu_data.get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"
    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)
    # Recompute the intermediates purely for display in the breakdown.
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)
    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU
### Training Time:
**{format_output(hours)}**
"""
    return breakdown
# Load GPU data once at import time to populate the dropdown choices.
gpu_data = load_gpu_data()
# "Custom" is always the first choice; it routes to the custom-TFLOPs slider.
gpu_choices = ["Custom"] + list(gpu_data.keys())
# Create the Gradio interface.
with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )
            # GPU selection: either pick from the CSV-backed list or enter
            # custom TFLOPs; only one of dropdown/slider is visible at a time.
            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )
            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )
            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )
            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )
            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )
            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )
        with gr.Column():
            output = gr.Markdown(label="Results")

    # Toggle between GPU-model dropdown and custom-TFLOPs slider.
    # Fix: the original read gpu_model.value inside the callback, but a
    # component's .value attribute is only its *initial* value — it does not
    # track user edits. The dropdown's live selection is now passed in as a
    # second input instead.
    def toggle_gpu_input(use_gpu, selected_gpu):
        """Show the dropdown when the checkbox is on; show the custom
        slider when the checkbox is off or 'Custom' is selected."""
        return (
            gr.update(visible=use_gpu),
            gr.update(visible=(not use_gpu) or selected_gpu == "Custom"),
        )
    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    # Show the custom-TFLOPs slider when "Custom" is selected in the dropdown.
    def check_custom_selected(gpu_model_value):
        return gr.update(visible=gpu_model_value == "Custom")
    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Live updating: any input change recomputes the Markdown breakdown.
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]
    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Initial calculation when the page first loads.
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )
if __name__ == "__main__":
    # Launch the Gradio app when run directly as a script.
    demo.launch()