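"""Gradio Space demo: streams token-by-token generation from
DeepSeek-R1-Distill-Qwen-32B while a pretrained GRU regressor, attached via a
forward hook on the model's final normalization layer, estimates how far along
the model's reasoning ("thinking") phase is and drives a live progress bar.
Once the </think> tag is emitted, the progress bar is frozen at 100% and the
remaining tokens are streamed into the final-answer box."""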
import torch
import threading
import time
import queue
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
import gradio as gr
import logging
from rnn_model import RNNSeqRegressorHub, D_FEATURES
import spaces
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Global variables for model and tokenizer only
model = None
tokenizer = None
# Global variables for RNN model components
rnn_model = None
rnn_gru = None
rnn_head = None
rnn_device = None
# Global state variables for </think> detection (baseline only)
baseline_think_tag_detected = False
baseline_pre_think_content = ""
baseline_post_think_content = ""
baseline_progress_frozen = False
# Global variables for monotonic progress tracking (prevent progress bars from going down)
baseline_max_progress = 0.0
# Model loading status
model_loaded_successfully = False
model_loading_error = None
def load_model(model_name, torch_dtype="float16"):
"""Load model and tokenizer"""
global model, tokenizer
logging.info(f"Loading tokenizer from: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
logging.info("Set pad_token to eos_token.")
logging.info(f"Loading model from: {model_name}")
model_dtype = getattr(torch, torch_dtype)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
torch_dtype=model_dtype,
low_cpu_mem_usage=True
)
logging.info(f"Model loaded successfully")
return "Model loaded successfully"
def load_rnn_model(rnn_path="rnn_seq_regressor.pt"):
"""Load RNN model for progress prediction"""
global model, rnn_model, rnn_gru, rnn_head, rnn_device
if model is None:
return "Please load the main model first"
try:
# Load the pretrained GRU progress regressor from the Hugging Face Hub
# (the rnn_path argument is kept for signature compatibility but is unused here)
rnn_repo_id = "royeis/DeepSeek-R1-Distill-Qwen-32B-GRUThinkingProgressRegressor"
rnn_model = RNNSeqRegressorHub.from_pretrained(rnn_repo_id).to(model.device)
# Extract components for optimized access
rnn_gru = rnn_model.rnn
rnn_head = rnn_model.head
# Store RNN device for later use
rnn_device = next(rnn_model.parameters()).device
logging.info(f"RNN model loaded successfully from {rnn_path}")
logging.info(f"RNN model device: {rnn_device}")
return f"RNN model loaded successfully from {rnn_path}"
except Exception as e:
logging.error(f"Error loading RNN model: {str(e)}")
return f"Error loading RNN model: {str(e)}"
def reset_progress_tracking():
"""Reset progress tracking variables for monotonic progress enforcement"""
global baseline_max_progress
baseline_max_progress = 0.0
logging.info("Progress tracking variables reset for new generation")
@spaces.GPU(duration=120)
def load_model_and_rnn():
"""Load both model and RNN model with hardcoded defaults"""
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
rnn_path = "rnn_seq_regressor.pt"
# Load model first
model_status = load_model(model_name)
if "successfully" not in model_status:
return f"Model loading failed: {model_status}"
# Then load RNN model
rnn_status = load_rnn_model(rnn_path)
if "successfully" not in rnn_status:
return f"RNN model loading failed: {rnn_status}"
return "Model and RNN model loaded successfully"
def generate_baseline_only(
prompt,
max_new_tokens=1024,
baseline_progress_callback=None,
baseline_tokens_callback=None,
stop_event=None
):
"""Generates tokens with baseline only (no intervention)."""
global model, tokenizer
if model is None or tokenizer is None:
return "Please load a model and tokenizer first", 0
# Always add thinking tag if not already present
if "<think>" not in prompt:
prompt = prompt + "\nPlease reason step by step, and put your final answer within \\boxed{{}}. \n<think>\n"
# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask if hasattr(inputs, 'attention_mask') else torch.ones_like(input_ids)
# Progress tracking
baseline_progress_values = []
baseline_smoothed_progress_values = [0]
beta = 0.05
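# Raw per-token predictions are smoothed with an exponential moving average:
# smoothed[t] = beta * raw[t] + (1 - beta) * smoothed[t - 1]
# A small beta (0.05) damps token-to-token noise in the progress signal.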
baseline_generated_tokens = []
baseline_token_ids = []
# Track current progress for real-time monitoring
baseline_last_reported_progress = 0
# EOS tracking
baseline_finished = False
hook_handles = []
# RNN state for baseline progress tracking
baseline_rnn_state = {}
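# The GRU hidden state lives in this dict so the forward hook, which fires once
# per generated token, can carry the recurrent state across invocations.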
def baseline_forward_hook(module, input_args, output):
"""Hook that runs on every forward pass through the targeted layer for baseline only"""
nonlocal baseline_last_reported_progress
original_output = output
hidden_states_to_check = None
if isinstance(output, tuple):
hidden_states_to_check = output[0]
elif isinstance(output, torch.Tensor):
hidden_states_to_check = output
else:
logging.warning(f"Unexpected output type from hooked layer: {type(output)}")
return output
if hidden_states_to_check is None or not isinstance(hidden_states_to_check, torch.Tensor):
logging.warning("Hooked layer output does not contain a tensor of hidden states as expected.")
return output
# Calculate predicted progress for baseline using RNN
baseline_p_value = 0.0
if rnn_gru is not None and rnn_head is not None and rnn_device is not None:
try:
with torch.no_grad():
# Convert to float and move to RNN device for processing
hidden_f = hidden_states_to_check[:, -1, :].clone().float().to(rnn_device)
# Initialize RNN state if not exists
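# The state is created lazily on the first generated token: zero-valued, with
# the same shape, dtype, and device as the extracted hidden features.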
if len(baseline_rnn_state) == 0:
baseline_rnn_state['state'] = hidden_f * 0.0
# RNN forward pass
out_t, baseline_rnn_state['state'] = rnn_gru(hidden_f, baseline_rnn_state['state'])
baseline_p_value = rnn_head(out_t).squeeze(-1)
baseline_p_value = max(0.0, min(1.0, baseline_p_value.item()))
except Exception as e:
logging.warning(f"RNN computation failed: {str(e)}")
baseline_p_value = 0.0
# Store the progress values
baseline_progress_values.append(baseline_p_value)
# Store smoothed progress for real-time updates
baseline_smoothed_p_value = beta * baseline_p_value + (1 - beta) * baseline_smoothed_progress_values[-1]
# Enforce monotonic progress (prevent going down)
global baseline_max_progress
baseline_smoothed_p_value = max(baseline_smoothed_p_value, baseline_max_progress)
baseline_max_progress = baseline_smoothed_p_value
baseline_smoothed_progress_values.append(baseline_smoothed_p_value)
# Update progress callbacks
global baseline_progress_frozen
if baseline_progress_callback and not baseline_progress_frozen and abs(baseline_smoothed_p_value - baseline_last_reported_progress) > 0.001:
baseline_progress_callback(int(baseline_smoothed_p_value * 100))
baseline_last_reported_progress = baseline_smoothed_p_value
elif baseline_progress_callback and baseline_progress_frozen:
baseline_progress_callback(100.0)
return original_output
# Register hook for progress tracking
if rnn_gru is not None and rnn_head is not None:
try:
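# For Qwen2-based checkpoints such as DeepSeek-R1-Distill-Qwen-32B,
# model.model.norm is the final normalization layer before the LM head, so the
# hook observes the last hidden state of every forward pass.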
target_module = model.model.norm
hook_handles.append(target_module.register_forward_hook(baseline_forward_hook))
logging.info(f"Baseline progress hook registered successfully")
except AttributeError:
logging.warning("Could not find model.model.norm. Trying to find another appropriate layer...")
try:
for name, module in model.named_modules():
if 'norm' in name and 'model' in name and isinstance(module, torch.nn.Module):
hook_handles.append(module.register_forward_hook(baseline_forward_hook))
logging.info(f"Baseline progress hook registered on {name}")
break
else:
logging.warning("Could not find appropriate normalization layer for progress hook.")
except Exception as e:
logging.error(f"Error setting up progress hook: {str(e)}")
try:
# Initialize KV cache and input preparation
past_key_values = DynamicCache()
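# The prompt is encoded once here to fill the cache; each later step feeds only
# the newly generated token and reuses the cached key/value states.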
# Initial forward pass
with torch.no_grad():
outputs = model(
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=True,
return_dict=True
)
# Update cache with prompt's key-values
past_key_values = outputs.past_key_values
# Get logits and sample first token
logits = outputs.logits[:, -1, :] # Last token's logits
next_token_id = torch.argmax(logits, dim=-1).unsqueeze(-1) # Greedy sampling
# Process token
baseline_token_id = next_token_id[0].item()
baseline_token_ids.append(baseline_token_id)
baseline_new_token_text = tokenizer.decode([baseline_token_id])
baseline_generated_tokens.append(baseline_new_token_text)
# Update callback with the first token
if baseline_tokens_callback:
baseline_current_text = tokenizer.decode(baseline_token_ids, skip_special_tokens=True)
baseline_tokens_callback(baseline_current_text, len(baseline_token_ids))
# Check for EOS
if baseline_token_id == tokenizer.eos_token_id:
baseline_finished = True
baseline_result = tokenizer.decode(baseline_token_ids, skip_special_tokens=True)
baseline_total_tokens = len(baseline_token_ids)
if baseline_progress_callback:
baseline_progress_callback(100.0)
logging.info(f"Baseline generation completed early with EOS. Total tokens: {baseline_total_tokens}")
return baseline_result, baseline_total_tokens
# Continue generation loop with cached states
for step in range(1, max_new_tokens):
# Check if we should stop
if stop_event and stop_event.is_set():
logging.info(f"Generation stopped by user at step {step}")
break
# If finished, break
if baseline_finished:
logging.info(f"Baseline generation completed at step {step}")
break
# Prepare input token for next step
current_input_id = next_token_id # [1, 1]
# Extend attention mask for the new token
if attention_mask is not None:
new_attention = torch.ones((attention_mask.shape[0], 1), device=attention_mask.device, dtype=attention_mask.dtype)
attention_mask = torch.cat([attention_mask, new_attention], dim=1)
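# The mask grows by one column per step so all cached positions remain visible
# to the newly generated token.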
# Forward pass with cached key-values
with torch.no_grad():
outputs = model(
input_ids=current_input_id,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=True,
return_dict=True
)
# Update cache
past_key_values = outputs.past_key_values
# Get next token
logits = outputs.logits[:, -1, :]
next_token_id = torch.argmax(logits, dim=-1).unsqueeze(-1)
# Process token
baseline_token_id = next_token_id[0].item()
baseline_token_ids.append(baseline_token_id)
baseline_new_token_text = tokenizer.decode([baseline_token_id])
baseline_generated_tokens.append(baseline_new_token_text)
# Update callback
if baseline_tokens_callback:
baseline_current_text = tokenizer.decode(baseline_token_ids, skip_special_tokens=True)
baseline_tokens_callback(baseline_current_text, len(baseline_token_ids))
# Check if EOS token was generated
if baseline_token_id == tokenizer.eos_token_id:
baseline_finished = True
logging.info(f"Baseline generation completed with EOS at step {step}. Total tokens: {len(baseline_token_ids)}")
# Final decoding of all tokens
baseline_result = tokenizer.decode(baseline_token_ids, skip_special_tokens=True)
baseline_total_tokens = len(baseline_token_ids)
# Ensure final progress is set to 100%
if baseline_progress_callback:
baseline_progress_callback(100.0)
logging.info(f"Baseline generation completed. Total tokens: {baseline_total_tokens}")
finally:
for handle in hook_handles:
handle.remove()
if hook_handles:
logging.info("All progress hooks removed")
return baseline_result, baseline_total_tokens
# Automatically load model and RNN on startup
def initialize_app():
"""Initialize the app by loading model and RNN on startup"""
global model_loaded_successfully, model_loading_error
logging.info("Starting automatic model loading...")
try:
result = load_model_and_rnn()
if "successfully" in result:
model_loaded_successfully = True
logging.info("Model and RNN model loaded successfully on startup")
else:
model_loaded_successfully = False
model_loading_error = result
logging.error(f"Failed to load model on startup: {result}")
except Exception as e:
model_loaded_successfully = False
model_loading_error = str(e)
logging.error(f"Exception during model loading: {str(e)}")
# Load model automatically when script starts
initialize_app()
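# Loading at import time lets the Gradio UI below reflect the model status
# (status banner, button interactivity) when the interface is built.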
# Global function for resetting UI state
def reset_ui():
"""Reset the UI elements for a new generation"""
global baseline_think_tag_detected, baseline_progress_frozen
global baseline_pre_think_content, baseline_post_think_content
# Reset progress tracking for monotonic behavior
reset_progress_tracking()
baseline_think_tag_detected = False
baseline_progress_frozen = False
baseline_pre_think_content = ""
baseline_post_think_content = ""
return {
"status": "**Starting generation...**",
"progress": 0,
"thinking": "",
"answer": "",
"tokens": "",
"generate_btn_text": "Generating...",
"generate_btn_interactive": False,
"stop_btn_interactive": True
}
@spaces.GPU(duration=240)
def generate_with_updates(prompt, baseline_progress_queue, baseline_tokens_queue, stop_generation):
"""Wrapper around generation function that handles real-time updates"""
# Check if model is loaded
if not model_loaded_successfully:
yield {
"status": f"**Cannot generate: {model_loading_error}**"
}
return
# Use default values
max_tokens = 2048
# Reset UI first
yield reset_ui()
# Start generation in a separate thread to allow for UI updates
baseline_result = ""
baseline_token_count = 0
generation_error = None
generation_thread = None
def baseline_progress_updater(prog_value):
"""Update the baseline progress via the queue"""
baseline_progress_queue.put(prog_value)
def baseline_tokens_updater(text, token_count):
"""Update the baseline generated text via the queue"""
global baseline_think_tag_detected, baseline_progress_frozen, baseline_pre_think_content, baseline_post_think_content
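# Messages on baseline_tokens_queue are tuples tagged by their first element:
#   ("THINK_TAG_DETECTED", pre_think_text, post_think_text, token_count)
#   ("POST_THINK_UPDATE", post_think_text)
#   ("NORMAL_UPDATE", text[, token_count])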
# Check if </think> tag appears in the text
if not baseline_think_tag_detected and "</think>" in text:
baseline_think_tag_detected = True
baseline_progress_frozen = True
# Split content at </think>
parts = text.split("</think>", 1)
baseline_pre_think_content = parts[0] + "</think>"
baseline_post_think_content = parts[1] if len(parts) > 1 else ""
# Signal content split with token count
baseline_tokens_queue.put(("THINK_TAG_DETECTED", baseline_pre_think_content, baseline_post_think_content, token_count))
elif baseline_think_tag_detected:
# Update post-think content
if "</think>" in text:
parts = text.split("</think>", 1)
baseline_post_think_content = parts[1] if len(parts) > 1 else ""
baseline_tokens_queue.put(("POST_THINK_UPDATE", baseline_post_think_content))
else:
baseline_tokens_queue.put(("NORMAL_UPDATE", text))
else:
# Normal pre-think streaming with token count
baseline_tokens_queue.put(("NORMAL_UPDATE", text, token_count))
def run_generation():
nonlocal baseline_result, baseline_token_count, generation_error
try:
# Baseline-only generation
baseline_result, baseline_token_count = generate_baseline_only(
prompt=prompt,
max_new_tokens=max_tokens,
baseline_progress_callback=baseline_progress_updater,
baseline_tokens_callback=baseline_tokens_updater,
stop_event=stop_generation
)
except Exception as e:
generation_error = str(e)
# Start the generation thread
generation_thread = threading.Thread(target=run_generation)
generation_thread.start()
# Monitor queues for updates while generation is running
baseline_current_text = ""
baseline_thinking_tokens = 0
baseline_last_progress = 0
while generation_thread.is_alive() or not baseline_tokens_queue.empty() or not baseline_progress_queue.empty():
updates = {}
# Check baseline tokens queue
try:
while not baseline_tokens_queue.empty():
token_update = baseline_tokens_queue.get_nowait()
if isinstance(token_update, tuple):
update_type = token_update[0]
if update_type == "THINK_TAG_DETECTED":
# </think> tag detected - split content
pre_content = token_update[1]
post_content = token_update[2]
thinking_token_count = token_update[3]
updates["thinking"] = pre_content
updates["answer"] = post_content
updates["progress"] = 100.0 # Freeze at 100%
# Use actual token count (before </think>)
baseline_thinking_tokens = thinking_token_count
updates["tokens"] = f"{baseline_thinking_tokens}"
elif update_type == "POST_THINK_UPDATE":
# Update only the final answer
post_content = token_update[1]
updates["answer"] = post_content
# Don't update token count - frozen at thinking tokens
elif update_type == "NORMAL_UPDATE":
# Normal text update
baseline_current_text = token_update[1]
if not baseline_think_tag_detected:
updates["thinking"] = baseline_current_text
# Update thinking token count with actual token count if available
if len(token_update) > 2:
baseline_thinking_tokens = token_update[2]
else:
# Fallback to word count for backward compatibility
baseline_thinking_tokens = len(baseline_current_text.split())
updates["tokens"] = f"{baseline_thinking_tokens}"
else:
# This shouldn't happen, but handle it gracefully
updates["answer"] = baseline_current_text
else:
# Backward compatibility - treat as normal text
baseline_current_text = token_update
updates["thinking"] = baseline_current_text
if not baseline_think_tag_detected:
baseline_thinking_tokens = len(baseline_current_text.split())
updates["tokens"] = f"{baseline_thinking_tokens}"
except queue.Empty:
pass
# Check baseline progress queue
try:
while not baseline_progress_queue.empty():
baseline_last_progress = baseline_progress_queue.get_nowait()
updates["progress"] = baseline_last_progress
except queue.Empty:
pass
# If there are any updates, yield them
if updates:
yield updates
# Sleep briefly to prevent excessive CPU usage
time.sleep(0.05)
# Final update
final_updates = {
"status": "**Generation complete!**" if not generation_error else f"**Error: {generation_error}**",
"progress": 100,
"generate_btn_text": "Generate",
"generate_btn_interactive": True,
"stop_btn_interactive": True
}
if not generation_error:
# Handle baseline final display
if baseline_think_tag_detected:
# Split result for final display
if "</think>" in baseline_result:
parts = baseline_result.split("</think>", 1)
final_updates["thinking"] = parts[0] + "</think>"
final_updates["answer"] = parts[1] if len(parts) > 1 else ""
# Use actual token count from generation
if baseline_thinking_tokens > 0:
final_updates["tokens"] = f"{baseline_thinking_tokens}"
else:
# Fallback: use actual token count for thinking part
thinking_text = parts[0] + "</think>"
thinking_token_count = len(tokenizer.encode(thinking_text, add_special_tokens=False))
final_updates["tokens"] = f"{thinking_token_count}"
else:
final_updates["thinking"] = baseline_result
# Use actual token count
if baseline_thinking_tokens > 0:
final_updates["tokens"] = f"{baseline_thinking_tokens}"
else:
total_token_count = len(tokenizer.encode(baseline_result, add_special_tokens=False))
final_updates["tokens"] = f"{total_token_count}"
else:
final_updates["thinking"] = baseline_result
# Use actual token count
if baseline_thinking_tokens > 0:
final_updates["tokens"] = f"{baseline_thinking_tokens}"
else:
total_token_count = len(tokenizer.encode(baseline_result, add_special_tokens=False))
final_updates["tokens"] = f"{total_token_count}"
yield final_updates
# Create the Gradio interface
def create_interface():
# Create custom theme with light green progress bars
custom_theme = gr.themes.Base().set(
slider_color="#90EE90", # Light green
slider_color_dark="#32CD32", # Lime green for dark mode
# Additional slider styling
color_accent="#90EE90",
color_accent_soft="#E8F5E8"
)
# Custom CSS to fix text box heights, enable scrolling, and enlarge fonts
custom_css = """
/* Base font size increases for all text areas */
.gr-textbox textarea {
font-size: 18px !important;
line-height: 1.5em !important;
overflow-y: auto !important;
resize: none !important;
}
/* Prompt textbox - larger font */
.gr-textbox:has(textarea[placeholder*="math"]) textarea,
.gr-textbox:has(textarea[placeholder*="Enter"]) textarea {
font-size: 20px !important;
line-height: 1.6em !important;
}
/* Thinking Process and Final Answer boxes - enhanced readability */
.fixed-height-textbox textarea {
font-size: 16px !important;
line-height: 1.5em !important;
overflow-y: auto !important;
resize: none !important;
}
/* Thinking Process boxes - adjusted height for larger fonts */
.fixed-height-textbox:has(textarea[data-testid*="textbox"]) textarea {
height: 280px !important;
max-height: 280px !important;
min-height: 280px !important;
}
/* Progress bar labels and sliders */
.gr-slider .gr-label {
font-size: 16px !important;
font-weight: 500 !important;
}
/* All buttons - larger and more readable */
.gr-button {
font-size: 16px !important;
font-weight: 500 !important;
padding: 8px 16px !important;
}
/* Token count displays - readable but not overwhelming */
.gr-textbox[data-testid*="textbox"][readonly] textarea,
.gr-textbox.gr-readonly textarea {
font-size: 14px !important;
line-height: 1.4em !important;
text-align: center !important;
font-weight: 500 !important;
}
/* All labels - consistent sizing */
.gr-label {
font-size: 15px !important;
font-weight: 500 !important;
}
/* Status messages and markdown - larger and clearer */
.gr-markdown {
font-size: 16px !important;
line-height: 1.4em !important;
}
/* Checkbox labels */
.gr-checkbox .gr-label {
font-size: 15px !important;
}
/* Headers and titles */
.gr-markdown h1 {
font-size: 28px !important;
}
.gr-markdown h2 {
font-size: 22px !important;
}
/* Progress percentage display */
.gr-slider .gr-number {
font-size: 14px !important;
font-weight: 500 !important;
}
/* Custom light green gradient for progress bars */
.gr-slider input[type="range"]::-webkit-slider-runnable-track {
background: linear-gradient(90deg, #90EE90 0%, #32CD32 100%) !important;
border: none !important;
height: 8px !important;
border-radius: 4px !important;
}
.gr-slider input[type="range"]::-moz-range-track {
background: linear-gradient(90deg, #90EE90 0%, #32CD32 100%) !important;
border: none !important;
height: 8px !important;
border-radius: 4px !important;
}
/* Webkit slider thumb styling */
.gr-slider input[type="range"]::-webkit-slider-thumb {
background: #228B22 !important;
border: 2px solid #ffffff !important;
border-radius: 50% !important;
cursor: pointer !important;
}
/* Firefox slider thumb styling */
.gr-slider input[type="range"]::-moz-range-thumb {
background: #228B22 !important;
border: 2px solid #ffffff !important;
border-radius: 50% !important;
cursor: pointer !important;
}
/* Progress bar container background */
.gr-slider {
--slider-color: linear-gradient(90deg, #90EE90 0%, #32CD32 100%) !important;
}
/* Additional Gradio slider styling for better compatibility */
.gr-slider .gr-range {
background: linear-gradient(90deg, #90EE90 0%, #32CD32 100%) !important;
}
/* Style the slider track background (unfilled portion) */
.gr-slider input[type="range"] {
background: transparent !important;
}
.gr-slider input[type="range"]::-webkit-slider-runnable-track {
background: linear-gradient(90deg, #E8E8E8 0%, #E8E8E8 var(--slider-percent, 0%), #90EE90 var(--slider-percent, 0%), #32CD32 100%) !important;
}
"""
with gr.Blocks(title="Baseline Generation Demo", css=custom_css, theme=custom_theme) as demo:
# Display model loading status in the header
if model_loaded_successfully:
status_message = "✅ **DeepSeek-R1-Distill-Qwen-32B model loaded and ready for generation**"
else:
status_message = f"❌ **Model loading failed: {model_loading_error}**"
gr.Markdown(f"""
# 🚀 Reasoning Loading Bar Space ⏳
{status_message}
This Space demonstrates real-time progress tracking of a reasoning model.
## How it works:
1. Enter a prompt below - it works best with math problems that require reasoning
2. Click "Generate" to start generation with progress visualization
""")
# Generation section
with gr.Row():
with gr.Column(scale=8):
prompt = gr.Textbox(
label="Prompt",
placeholder="Enter your math problem or reasoning task here...",
lines=3
)
with gr.Column(scale=1):
generate_btn = gr.Button(
"Generate",
variant="primary" if model_loaded_successfully else "secondary",
interactive=model_loaded_successfully
)
stop_btn = gr.Button("Stop", variant="stop", interactive=model_loaded_successfully)
# Generation progress and results
with gr.Row():
generation_status = gr.Markdown("**Ready to generate**" if model_loaded_successfully else "**Model not loaded - cannot generate**")
# Single progress section for baseline only
with gr.Row():
with gr.Column():
with gr.Row():
baseline_progress_bar = gr.Slider(
minimum=0,
maximum=100,
value=0,
label="Generation Progress (%)",
interactive=False,
scale=5
)
baseline_tokens_count = gr.Textbox(label="Thinking tokens", value="", interactive=False, scale=1)
baseline_thinking_output = gr.Textbox(label="🧠 Thinking Process", lines=10, value="", elem_classes=["fixed-height-textbox"])
baseline_answer_output = gr.Textbox(label="✅ Final Answer", lines=4, value="", elem_classes=["fixed-height-textbox"])
# Create queues to store progress and token updates
baseline_progress_queue = queue.Queue()
baseline_tokens_queue = queue.Queue()
stop_generation = threading.Event()
def stop_generation_fn():
"""Stop the generation process"""
stop_generation.set()
return "Generation stopped"
def generate_wrapper(prompt):
"""Wrapper to adapt the global generate_with_updates function for Gradio"""
# Process updates from the global function and map to UI components
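# Gradio generator callbacks may yield a dict keyed by output components;
# only the components present in the dict are updated on that step.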
for update_dict in generate_with_updates(prompt, baseline_progress_queue, baseline_tokens_queue, stop_generation):
gradio_updates = {}
# Map the string keys to actual Gradio components
if "status" in update_dict:
gradio_updates[generation_status] = update_dict["status"]
if "progress" in update_dict:
gradio_updates[baseline_progress_bar] = update_dict["progress"]
if "thinking" in update_dict:
gradio_updates[baseline_thinking_output] = update_dict["thinking"]
if "answer" in update_dict:
gradio_updates[baseline_answer_output] = update_dict["answer"]
if "tokens" in update_dict:
gradio_updates[baseline_tokens_count] = update_dict["tokens"]
if "generate_btn_text" in update_dict:
gradio_updates[generate_btn] = gr.Button(
update_dict["generate_btn_text"],
variant="secondary" if "Generating" in update_dict["generate_btn_text"] else "primary",
interactive=update_dict.get("generate_btn_interactive", True)
)
if "stop_btn_interactive" in update_dict:
gradio_updates[stop_btn] = gr.Button(
"Stop",
variant="stop",
interactive=update_dict["stop_btn_interactive"]
)
yield gradio_updates
# Connect the buttons to the handlers
if model_loaded_successfully:
generate_btn.click(
generate_wrapper,
inputs=[prompt],
outputs=[
generation_status,
baseline_progress_bar,
baseline_thinking_output,
baseline_answer_output,
baseline_tokens_count,
generate_btn,
stop_btn
]
)
stop_btn.click(
stop_generation_fn,
outputs=[generation_status]
)
return demo
# Launch the app if running directly
if __name__ == "__main__":
demo = create_interface()
demo.launch(share=True)