qiuxi337 committed on
Commit 2009aa5 · verified · 1 Parent(s): 387cd41

Update app.py

Files changed (1)
  1. app.py +191 -386
app.py CHANGED
@@ -25,33 +25,34 @@ try:
25
  HAS_SPACES = True
26
  except ImportError:
27
  HAS_SPACES = False
28
- # Define placeholder decorator
29
  class spaces:
30
  @staticmethod
31
- def GPU():
32
- def decorator(func):
33
  return func
34
- return decorator
35
 
36
  # Check if GPU is available
37
  HAS_GPU = torch.cuda.is_available()
38
 
39
  # Try to install flash-attn (only in GPU environment)
40
- # if HAS_GPU:
41
- # try:
42
- # import subprocess
43
- # subprocess.run('pip install flash-attn==2.7.4.post1 --no-build-isolation',
44
- # env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
45
- # shell=True,
46
- # capture_output=True,
47
- # timeout=30)
48
- # HAS_FLASH_ATTN = True
49
- # except:
50
- # HAS_FLASH_ATTN = False
51
- # else:
52
- # HAS_FLASH_ATTN = False
53
- HAS_FLASH_ATTN = False
 
54
55
  # Default model checkpoint path
56
  DEFAULT_CKPT_PATH = 'qiuxi337/IntrinSight-4B'
57
 
@@ -59,7 +60,6 @@ DEFAULT_CKPT_PATH = 'qiuxi337/IntrinSight-4B'
59
  DEFAULT_SYSTEM_PROMPT = (
60
  "A conversation between user and assistant. The user asks a question, and the assistant solves it. The assistant "
61
  "first thinks about the reasoning process in the mind and then provides the user with the answer. "
62
- "The reasoning process is to solve the problem step by step, so you will think about it sincerely. "
63
  "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
64
  "<think> reasoning process here </think><answer> answer here </answer>."
65
  )
@@ -193,9 +193,9 @@ textarea:focus {
193
  def _get_args():
194
  """Parse command line arguments"""
195
  parser = ArgumentParser()
196
- parser.add_argument('-c', '--checkpoint-path',
197
  type=str,
198
- default=DEFAULT_CKPT_PATH,
199
  help='Checkpoint name or path, default to %(default)r')
200
  parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
201
  parser.add_argument('--share',
@@ -217,26 +217,21 @@ def encode_image_pil(image_path):
217
  """Encode image to base64 using PIL"""
218
  try:
219
  if isinstance(image_path, str):
220
- # It's a file path
221
  img = Image.open(image_path)
222
  elif isinstance(image_path, np.ndarray):
223
- # It's a numpy array
224
  img = Image.fromarray(image_path)
225
  elif isinstance(image_path, Image.Image):
226
- # It's already a PIL Image
227
  img = image_path
228
  else:
229
  print(f"Unsupported image type: {type(image_path)}")
230
  return None
231
-
232
- # Convert to RGB if necessary
233
  if img.mode not in ('RGB', 'RGBA'):
234
  img = img.convert('RGB')
235
-
236
- # Resize if too large
237
  max_size = (1024, 1024)
238
  img.thumbnail(max_size, Image.Resampling.LANCZOS)
239
-
240
  buffered = io.BytesIO()
241
  img.save(buffered, format="PNG")
242
  return base64.b64encode(buffered.getvalue()).decode('utf-8')
@@ -248,54 +243,39 @@ def encode_image_pil(image_path):
248
  def _load_model_processor(args):
249
  """Intelligently load model, automatically choose CPU or GPU based on environment"""
250
  global HAS_GPU, HAS_FLASH_ATTN
251
-
252
- # Determine device to use
253
  use_gpu = HAS_GPU and not args.cpu_only
254
  device = 'cuda' if use_gpu else 'cpu'
255
-
256
  print(f"{'='*50}")
257
  print(f"🚀 Loading model: {args.checkpoint_path}")
258
  print(f"📱 Device: {'GPU (CUDA)' if use_gpu else 'CPU'}")
259
  print(f"⚡ Flash Attention: {'Enabled' if (use_gpu and HAS_FLASH_ATTN) else 'Disabled'}")
260
  print(f"{'='*50}")
261
-
262
- # Choose appropriate configuration based on device
263
  model_kwargs = {
264
  'pretrained_model_name_or_path': args.checkpoint_path,
265
  'torch_dtype': torch.bfloat16 if use_gpu else torch.float32,
266
  }
267
-
268
- # Use flash attention only on GPU when available
269
  if use_gpu and HAS_FLASH_ATTN:
270
  model_kwargs['attn_implementation'] = 'flash_attention_2'
271
-
272
- # Set device_map
273
  if use_gpu:
274
  model_kwargs['device_map'] = 'auto'
275
  else:
276
  model_kwargs['device_map'] = None
277
  model_kwargs['low_cpu_mem_usage'] = True
278
-
279
  try:
280
- # First try to use specific model class
281
- try:
282
- from transformers import Gemma3ForConditionalGeneration
283
- model = Gemma3ForConditionalGeneration.from_pretrained(**model_kwargs)
284
- except:
285
- # If failed, use generic AutoModel
286
- model = AutoModelForImageTextToText.from_pretrained(**model_kwargs)
287
-
288
  model.eval()
289
-
290
- # If CPU mode, manually move to CPU
291
  if not use_gpu:
292
  model = model.to(device)
293
-
294
  except Exception as e:
295
  print(f"⚠️ Failed to load model with optimal settings: {e}")
296
  print("🔄 Falling back to CPU mode...")
297
-
298
- # Fallback to CPU mode
299
  model_kwargs = {
300
  'pretrained_model_name_or_path': args.checkpoint_path,
301
  'torch_dtype': torch.float32,
@@ -307,9 +287,9 @@ def _load_model_processor(args):
307
  model.eval()
308
  use_gpu = False
309
  device = 'cpu'
310
-
311
  processor = AutoProcessor.from_pretrained(args.checkpoint_path)
312
-
313
  print(f"✅ Model loaded successfully on {device}")
314
  return model, processor, device
315
 
@@ -361,8 +341,7 @@ def _parse_text(text):
361
 
362
  def _remove_image_special(text):
363
  """Remove special image tags from text"""
364
- if text is None:
365
- return ""
366
  text = text.replace('<ref>', '').replace('</ref>', '')
367
  return re.sub(r'<box>.*?(</box>|$)', '', text)
368
 
@@ -377,391 +356,229 @@ def _gc():
377
 
378
  def _transform_messages(original_messages, system_prompt):
379
  """Transform messages with custom system prompt"""
380
- transformed_messages = []
381
- system_message = {"role": "system", "content": [{"type": "text", "text":system_prompt}]}
382
- transformed_messages.append(system_message)
383
-
384
  for message in original_messages:
385
  new_content = []
386
  for item in message['content']:
387
  if 'image' in item:
388
- new_item = {'type': 'image', 'image': item['image']}
389
  elif 'text' in item:
390
- new_item = {'type': 'text', 'text': item['text']}
391
- else:
392
- continue
393
- new_content.append(new_item)
394
-
395
- new_message = {'role': message['role'], 'content': new_content}
396
- transformed_messages.append(new_message)
397
-
398
  return transformed_messages
399
 
400
 
401
  def normalize_task_history_item(item):
402
- """Normalize items in task_history into a dict format"""
403
  if isinstance(item, dict):
404
- # Already a dict; check for the required keys
405
- return {
406
- 'text': item.get('text', ''),
407
- 'images': item.get('images', []),
408
- 'response': item.get('response', None)
409
- }
410
  elif isinstance(item, (list, tuple)) and len(item) >= 2:
411
- # Old format: (query, response)
412
  query, response = item[0], item[1]
413
  if isinstance(query, (list, tuple)):
414
- # query is a list of images
415
- return {
416
- 'text': '',
417
- 'images': list(query),
418
- 'response': response
419
- }
420
  else:
421
- # query is text
422
- return {
423
- 'text': str(query) if query else '',
424
- 'images': [],
425
- 'response': response
426
- }
427
  else:
428
- # Any other format; try to handle it
429
- return {
430
- 'text': str(item) if item else '',
431
- 'images': [],
432
- 'response': None
433
- }
434
 
435
 
436
  def _launch_demo(args, model, processor, device):
437
  """Launch the Gradio demo interface"""
438
-
439
- @spaces.GPU
440
  def call_local_model(model, processor, messages, system_prompt, temperature, top_p, max_tokens):
441
  """Call the local model with streaming response"""
442
  messages = _transform_messages(messages, system_prompt)
 
443
  inputs = processor.apply_chat_template(
444
  messages,
445
  add_generation_prompt=True,
446
  tokenize=True,
447
  return_dict=True,
448
  return_tensors="pt"
449
- ).to(model.device, dtype=torch.bfloat16)
450
-
451
  tokenizer = processor.tokenizer
452
  streamer = TextIteratorStreamer(tokenizer, timeout=2000.0, skip_prompt=True, skip_special_tokens=True)
453
-
454
  gen_kwargs = {
455
- 'max_new_tokens': max_tokens,
456
- "do_sample": True,
457
- "temperature": temperature,
458
- "top_p": top_p,
459
- "top_k": 20,
460
- 'streamer': streamer,
461
- **inputs
462
  }
463
-
464
  with torch.inference_mode():
465
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
466
  thread.start()
467
-
468
  generated_text = ''
469
  for new_text in streamer:
470
  generated_text += new_text
471
- if "<think>" in generated_text:
472
- generated_text = generated_text.replace("<think>", "**Reasoning Process**:\n")
473
- if "</think>" in generated_text:
474
- generated_text = generated_text.replace("</think>", "\n")
475
- if "<answer>" in generated_text:
476
- generated_text = generated_text.replace("<answer>", "**Final Answer**:\n")
477
- if "</answer>" in generated_text:
478
- generated_text = generated_text.replace("</answer>", "")
479
- yield generated_text
480
-
481
- def create_predict_fn():
482
- """Create prediction function with optional GPU acceleration"""
483
- def predict_impl(_chatbot, task_history, system_prompt, temperature, top_p, max_tokens):
484
- """Implementation of prediction logic"""
485
- if not _chatbot or not task_history:
486
- yield _chatbot
487
- return
488
-
489
- chat_query = _chatbot[-1][0]
490
-
491
- # Normalize the last item in task_history
492
- last_item = normalize_task_history_item(task_history[-1])
493
-
494
- if not chat_query and not last_item['text'] and not last_item['images']:
495
- _chatbot.pop()
496
- task_history.pop()
497
- yield _chatbot
498
- return
499
-
500
- print(f'User query: {last_item}')
501
-
502
- # Normalize the whole history
503
- history_cp = [normalize_task_history_item(item) for item in copy.deepcopy(task_history)]
504
- full_response = ''
505
- messages = []
506
-
507
- # Build messages: make sure each user/assistant pair alternates correctly
508
- for i, item in enumerate(history_cp):
509
- if item['response'] is None: # Message currently being processed
510
- content = []
511
-
512
- # Add images
513
- if item['images']:
514
- for img_path in item['images']:
515
- if img_path:
516
- encoded_img = encode_image_pil(img_path)
517
- if encoded_img:
518
- content.append({'image': encoded_img})
519
-
520
- # Add text
521
- if item['text']:
522
- content.append({'text': str(item['text'])})
523
-
524
- if content:
525
- messages.append({'role': 'user', 'content': content})
526
- else: # Historical message
527
- content = []
528
-
529
- # Add images
530
- if item['images']:
531
- for img_path in item['images']:
532
- if img_path:
533
- encoded_img = encode_image_pil(img_path)
534
- if encoded_img:
535
- content.append({'image': encoded_img})
536
-
537
- # Add text
538
- if item['text']:
539
- content.append({'text': str(item['text'])})
540
-
541
- if content:
542
- messages.append({'role': 'user', 'content': content})
543
- messages.append({'role': 'assistant', 'content': [{'text': str(item['response'])}]})
544
-
545
- try:
546
- for response in call_local_model(model, processor, messages, system_prompt, temperature, top_p, max_tokens):
547
- _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
548
- yield _chatbot
549
- full_response = response
550
-
551
- # Update the response in task_history
552
- if isinstance(task_history[-1], dict):
553
- task_history[-1]['response'] = full_response
554
- else:
555
- # If it is the old format, convert it to the new format
556
- normalized_item = normalize_task_history_item(task_history[-1])
557
- normalized_item['response'] = full_response
558
- task_history[-1] = normalized_item
559
-
560
- print(f'Assistant: {full_response}')
561
- except Exception as e:
562
- print(f"Error during generation: {e}")
563
- import traceback
564
- traceback.print_exc()
565
- _chatbot[-1] = (_parse_text(chat_query), f"Error: {str(e)}")
566
-
567
- # Record the error message in task_history
568
- if isinstance(task_history[-1], dict):
569
- task_history[-1]['response'] = f"Error: {str(e)}"
570
- else:
571
- normalized_item = normalize_task_history_item(task_history[-1])
572
- normalized_item['response'] = f"Error: {str(e)}"
573
- task_history[-1] = normalized_item
574
-
575
  yield _chatbot
576
 
577
- # Use GPU decorator if spaces is available and using GPU
578
- if HAS_SPACES and device == 'cuda':
579
- predict = spaces.GPU()(predict_impl)
580
- else:
581
- predict = predict_impl
582
-
583
- return predict
584
-
585
- def create_regenerate_fn():
586
- """Create regenerate function"""
587
- def regenerate(_chatbot, task_history, system_prompt, temperature, top_p, max_tokens):
588
- if not task_history or not _chatbot:
589
- yield _chatbot
590
- return
591
-
592
- # Normalize the last item
593
- last_item = normalize_task_history_item(task_history[-1])
594
-
595
- if last_item['response'] is None:
596
- yield _chatbot
597
- return
598
-
599
- # Reset the last response
600
- last_item['response'] = None
601
- task_history[-1] = last_item
602
-
603
- chatbot_item = _chatbot.pop(-1) if _chatbot else None
604
-
605
- if chatbot_item:
606
- if chatbot_item[0] is None and len(_chatbot) > 0:
607
- _chatbot[-1] = (_chatbot[-1][0], None)
608
- else:
609
- _chatbot.append((chatbot_item[0], None))
610
-
611
- # Use the predict function directly
612
- for updated_chatbot in predict(_chatbot, task_history, system_prompt, temperature, top_p, max_tokens):
613
- yield updated_chatbot
614
-
615
- return regenerate
616
-
617
- predict = create_predict_fn()
618
- regenerate = create_regenerate_fn()
619
 
620
  def add_text_and_files(history, task_history, text, files):
621
- """Merge text and files into a single message"""
622
  history = history if history is not None else []
623
  task_history = task_history if task_history is not None else []
624
-
625
- # Check for valid input
626
  has_text = text and text.strip()
627
  has_files = files and len(files) > 0
628
-
629
  if not has_text and not has_files:
630
  return history, task_history, text, files
631
-
632
- # Prepare the message content
633
- display_parts = []
634
- file_paths = []
635
-
636
- # Process files
637
  if has_files:
638
  for file in files:
639
- if file is not None:
640
- file_path = file.name if hasattr(file, 'name') else str(file)
641
- file_paths.append(file_path)
642
-
643
  if file_paths:
644
  display_parts.append(f"[Uploaded {len(file_paths)} images]")
645
-
646
- # Process text
647
  if has_text:
648
  display_parts.append(text)
649
-
650
- # Create the display message
651
  display_message = " ".join(display_parts)
652
-
653
- # Append to history
654
  history.append([_parse_text(display_message), None])
655
- task_history.append({
656
- 'text': text if has_text else '',
657
- 'images': file_paths,
658
- 'response': None
659
- })
660
-
661
- return history, task_history, '', None # Clear inputs
662
 
663
- def reset_user_input():
664
- """Reset user input field"""
665
- return gr.update(value='')
666
 
667
  def reset_state():
668
- """Clear conversation history"""
669
  _gc()
670
- return [], [], None # Return empty chatbot, empty task_history, and None for file input
671
 
672
- # Create Gradio interface
673
  with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft()) as demo:
674
- gr.HTML("""
675
  <div class="container">
676
  <h1 class="main-title">IntrinSight Assistant</h1>
677
  <p class="sub-title">
678
  Powered by IntrinSight-4B Model
679
- <span class="{}">{}</span>
680
  </p>
681
  </div>
682
- """.format(
683
- "status-indicator gpu-status" if device == 'cuda' else "status-indicator cpu-status",
684
- "🚀 GPU Mode" if device == 'cuda' else "💻 CPU Mode"
685
- ))
686
 
687
- # Initialize states
688
  task_history = gr.State([])
689
 
690
  with gr.Row():
691
  with gr.Column(scale=4):
692
  chatbot = gr.Chatbot(
693
- label='IntrinSight-4B Chat Interface',
694
- elem_classes='control-height',
695
- height=600,
696
  avatar_images=(None, "https://em-content.zobj.net/thumbs/240/twitter/348/robot_1f916.png")
697
  )
698
-
699
  with gr.Row():
700
- query = gr.Textbox(
701
- lines=3,
702
- label='💬 Message Input',
703
- placeholder="Enter your question here...",
704
- elem_classes="custom-input"
705
- )
706
-
707
  with gr.Row():
708
- # Multi-file upload with drag and drop support
709
  addfile_btn = gr.File(
710
- label="📸 Upload Images (Drag & Drop Supported, Multiple Selection)",
711
- file_count="multiple",
712
- file_types=["image"],
713
- elem_classes="file-upload-area"
714
  )
715
-
716
  with gr.Row():
717
  submit_btn = gr.Button('🚀 Send', variant="primary", elem_classes="custom-button")
718
  regen_btn = gr.Button('🔄 Regenerate', variant="secondary", elem_classes="custom-button")
719
  empty_bin = gr.Button('🗑️ Clear History', variant="stop", elem_classes="custom-button")
720
 
721
  with gr.Column(scale=2):
722
- # System prompt section
723
  with gr.Group(elem_classes="parameter-section"):
724
  gr.Markdown("### ⚙️ System Configuration")
725
- system_prompt = gr.Textbox(
726
- label="System Prompt",
727
- value=DEFAULT_SYSTEM_PROMPT,
728
- lines=5,
729
- placeholder="Enter system prompt here..."
730
- )
731
-
732
- # Generation parameters section
733
  with gr.Group(elem_classes="parameter-section"):
734
  gr.Markdown("### 🎛️ Generation Parameters")
735
-
736
- temperature = gr.Slider(
737
- minimum=0.1,
738
- maximum=2.0,
739
- value=0.7,
740
- step=0.1,
741
- label="Temperature (Creativity)",
742
- info="Higher values make output more random"
743
- )
744
-
745
- top_p = gr.Slider(
746
- minimum=0.1,
747
- maximum=1.0,
748
- value=1.0,
749
- step=0.05,
750
- label="Top-p (Nucleus Sampling)",
751
- info="Cumulative probability for token selection"
752
- )
753
-
754
- max_tokens = gr.Slider(
755
- minimum=256,
756
- maximum=16384,
757
- value=8192,
758
- step=256,
759
- label="Max Tokens",
760
- info="Maximum number of tokens to generate"
761
- )
762
-
763
- # Instructions section
764
- gr.Markdown("""
765
  ### 📋 Instructions
766
 
767
  **Basic Usage:**
@@ -771,65 +588,53 @@ def _launch_demo(args, model, processor, device):
771
  - **Parameters**: Adjust generation settings as needed
772
 
773
  **Performance Info:**
774
- - Current Mode: **{}**
775
- - Flash Attention: **{}**
776
  - Recommended Image Size: < 1024×1024
777
 
778
  ### ⚠️ Disclaimer
779
 
780
  This demo is subject to the Gemma license agreement.
781
  Please do not generate or disseminate harmful content.
782
- """.format(
783
- "GPU Acceleration" if device == 'cuda' else "CPU Mode",
784
- "Enabled" if (device == 'cuda' and HAS_FLASH_ATTN) else "Disabled"
785
- ))
786
 
787
- # Event bindings
788
  submit_btn.click(
789
- add_text_and_files,
790
  [chatbot, task_history, query, addfile_btn],
791
  [chatbot, task_history, query, addfile_btn]
792
  ).then(
793
- predict,
794
- [chatbot, task_history, system_prompt, temperature, top_p, max_tokens],
795
- [chatbot],
796
- show_progress=True
797
- )
798
-
799
- empty_bin.click(
800
- reset_state,
801
- outputs=[chatbot, task_history, addfile_btn],
802
- show_progress=True
803
  )
804
-
805
  regen_btn.click(
806
- regenerate,
807
- [chatbot, task_history, system_prompt, temperature, top_p, max_tokens],
808
- [chatbot],
809
- show_progress=True
810
  )
811
-
812
- # Enter key to send message
813
  query.submit(
814
- add_text_and_files,
815
  [chatbot, task_history, query, addfile_btn],
816
  [chatbot, task_history, query, addfile_btn]
817
  ).then(
818
- predict,
819
- [chatbot, task_history, system_prompt, temperature, top_p, max_tokens],
820
- [chatbot],
821
- show_progress=True
822
  )
823
 
824
  demo.queue(max_size=10).launch(
825
- share=args.share,
826
- inbrowser=args.inbrowser,
827
- server_port=args.server_port,
828
- server_name=args.server_name,
829
  show_error=True
830
  )
831
 
832
-
833
  def main():
834
  """Main entry point"""
835
  args = _get_args()
 
25
  HAS_SPACES = True
26
  except ImportError:
27
  HAS_SPACES = False
 
28
  class spaces:
29
  @staticmethod
30
+ def GPU(func=None, **kwargs):
31
+ if func:
32
  return func
33
+ return lambda f: f
34
 
35
  # Check if GPU is available
36
  HAS_GPU = torch.cuda.is_available()
37
 
38
  # Try to install flash-attn (only in GPU environment)
39
+ if HAS_GPU:
40
+ try:
41
+ import subprocess
42
+ subprocess.run('pip install flash-attn==2.7.4.post1 --no-build-isolation',
43
+ env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
44
+ shell=True,
45
+ capture_output=True,
46
+ timeout=30)
47
+ import flash_attn
48
+ HAS_FLASH_ATTN = True
49
+ except Exception as e:
50
+ print(f"Flash Attention installation failed: {e}")
51
+ HAS_FLASH_ATTN = False
52
+ else:
53
+ HAS_FLASH_ATTN = False
54
 
55
+ HAS_FLASH_ATTN = False
56
  # Default model checkpoint path
57
  DEFAULT_CKPT_PATH = 'qiuxi337/IntrinSight-4B'
58
60
  DEFAULT_SYSTEM_PROMPT = (
61
  "A conversation between user and assistant. The user asks a question, and the assistant solves it. The assistant "
62
  "first thinks about the reasoning process in the mind and then provides the user with the answer. "
 
63
  "The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
64
  "<think> reasoning process here </think><answer> answer here </answer>."
65
  )
 
193
  def _get_args():
194
  """Parse command line arguments"""
195
  parser = ArgumentParser()
196
+ parser.add_argument('-c', '--checkpoint-path',
197
  type=str,
198
+ default=DEFAULT_CKPT_PATH,
199
  help='Checkpoint name or path, default to %(default)r')
200
  parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
201
  parser.add_argument('--share',
 
217
  """Encode image to base64 using PIL"""
218
  try:
219
  if isinstance(image_path, str):
 
220
  img = Image.open(image_path)
221
  elif isinstance(image_path, np.ndarray):
 
222
  img = Image.fromarray(image_path)
223
  elif isinstance(image_path, Image.Image):
 
224
  img = image_path
225
  else:
226
  print(f"Unsupported image type: {type(image_path)}")
227
  return None
228
+
 
229
  if img.mode not in ('RGB', 'RGBA'):
230
  img = img.convert('RGB')
231
+
 
232
  max_size = (1024, 1024)
233
  img.thumbnail(max_size, Image.Resampling.LANCZOS)
234
+
235
  buffered = io.BytesIO()
236
  img.save(buffered, format="PNG")
237
  return base64.b64encode(buffered.getvalue()).decode('utf-8')
 
243
  def _load_model_processor(args):
244
  """Intelligently load model, automatically choose CPU or GPU based on environment"""
245
  global HAS_GPU, HAS_FLASH_ATTN
246
+
 
247
  use_gpu = HAS_GPU and not args.cpu_only
248
  device = 'cuda' if use_gpu else 'cpu'
249
+
250
  print(f"{'='*50}")
251
  print(f"🚀 Loading model: {args.checkpoint_path}")
252
  print(f"📱 Device: {'GPU (CUDA)' if use_gpu else 'CPU'}")
253
  print(f"⚡ Flash Attention: {'Enabled' if (use_gpu and HAS_FLASH_ATTN) else 'Disabled'}")
254
  print(f"{'='*50}")
255
+
 
256
  model_kwargs = {
257
  'pretrained_model_name_or_path': args.checkpoint_path,
258
  'torch_dtype': torch.bfloat16 if use_gpu else torch.float32,
259
  }
260
+
 
261
  if use_gpu and HAS_FLASH_ATTN:
262
  model_kwargs['attn_implementation'] = 'flash_attention_2'
263
+
 
264
  if use_gpu:
265
  model_kwargs['device_map'] = 'auto'
266
  else:
267
  model_kwargs['device_map'] = None
268
  model_kwargs['low_cpu_mem_usage'] = True
269
+
270
  try:
271
+ model = AutoModelForImageTextToText.from_pretrained(**model_kwargs)
272
  model.eval()
273
+ # Note: even with device_map='auto', we might need to move a CPU-only model explicitly
 
274
  if not use_gpu:
275
  model = model.to(device)
 
276
  except Exception as e:
277
  print(f"⚠️ Failed to load model with optimal settings: {e}")
278
  print("🔄 Falling back to CPU mode...")
279
  model_kwargs = {
280
  'pretrained_model_name_or_path': args.checkpoint_path,
281
  'torch_dtype': torch.float32,
 
287
  model.eval()
288
  use_gpu = False
289
  device = 'cpu'
290
+
291
  processor = AutoProcessor.from_pretrained(args.checkpoint_path)
292
+
293
  print(f"✅ Model loaded successfully on {device}")
294
  return model, processor, device
295
 
 
341
 
342
  def _remove_image_special(text):
343
  """Remove special image tags from text"""
344
+ if text is None: return ""
 
345
  text = text.replace('<ref>', '').replace('</ref>', '')
346
  return re.sub(r'<box>.*?(</box>|$)', '', text)
347
 
 
356
 
357
  def _transform_messages(original_messages, system_prompt):
358
  """Transform messages with custom system prompt"""
359
+ transformed_messages = [{"role": "system", "content": [{"type": "text", "text":system_prompt}]}]
360
  for message in original_messages:
361
  new_content = []
362
  for item in message['content']:
363
  if 'image' in item:
364
+ new_content.append({'type': 'image', 'image': item['image']})
365
  elif 'text' in item:
366
+ new_content.append({'type': 'text', 'text': item['text']})
367
+ if new_content:
368
+ transformed_messages.append({'role': message['role'], 'content': new_content})
369
  return transformed_messages
370
 
371
 
372
  def normalize_task_history_item(item):
373
+ """Normalize items in task_history to a dictionary format"""
374
  if isinstance(item, dict):
375
+ return {'text': item.get('text', ''), 'images': item.get('images', []), 'response': item.get('response', None)}
376
  elif isinstance(item, (list, tuple)) and len(item) >= 2:
 
377
  query, response = item[0], item[1]
378
  if isinstance(query, (list, tuple)):
379
+ return {'text': '', 'images': list(query), 'response': response}
380
  else:
381
+ return {'text': str(query) if query else '', 'images': [], 'response': response}
382
  else:
383
+ return {'text': str(item) if item else '', 'images': [], 'response': None}
384
 
385
 
386
  def _launch_demo(args, model, processor, device):
387
  """Launch the Gradio demo interface"""
388
+
 
389
  def call_local_model(model, processor, messages, system_prompt, temperature, top_p, max_tokens):
390
  """Call the local model with streaming response"""
391
  messages = _transform_messages(messages, system_prompt)
392
+
393
  inputs = processor.apply_chat_template(
394
  messages,
395
  add_generation_prompt=True,
396
  tokenize=True,
397
  return_dict=True,
398
  return_tensors="pt"
399
+ )
400
+
401
+ # ====================================================================
402
+ # THE FINAL, ROBUST FIX for all environments (CUDA, ZeroGPU, CPU)
403
+ # We must move the input tensors to the correct device.
404
+ # However, to be compatible with ZeroGPU's `torch.compile`, we must use
405
+ # a string ('cuda' or 'cpu') instead of a `torch.device` object.
406
+ # The `device` variable (a string) is passed in from the parent scope.
407
+ # This prevents both the "device mismatch" error and the "ConstantVariable" error.
408
+ # ====================================================================
409
+ inputs = inputs.to(device)
410
+ # ====================================================================
411
+
412
  tokenizer = processor.tokenizer
413
  streamer = TextIteratorStreamer(tokenizer, timeout=2000.0, skip_prompt=True, skip_special_tokens=True)
414
+
415
  gen_kwargs = {
416
+ 'max_new_tokens': max_tokens, "do_sample": True, "temperature": temperature,
417
+ "top_p": top_p, "top_k": 20, 'streamer': streamer, **inputs
418
  }
419
+
420
  with torch.inference_mode():
421
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
422
  thread.start()
 
423
  generated_text = ''
424
  for new_text in streamer:
425
  generated_text += new_text
426
+ display_text = generated_text
427
+ if "<think>" in display_text: display_text = display_text.replace("<think>", "**Reasoning Process**:\n")
428
+ if "</think>" in display_text: display_text = display_text.replace("</think>", "\n")
429
+ if "<answer>" in display_text: display_text = display_text.replace("<answer>", "**Final Answer**:\n")
430
+ if "</answer>" in display_text: display_text = display_text.replace("</answer>", "")
431
+ yield display_text, generated_text
432
+
433
+ @spaces.GPU
434
+ def predict(_chatbot, task_history, system_prompt, temperature, top_p, max_tokens):
435
+ if not _chatbot or not task_history:
436
+ yield _chatbot
437
+ return
438
+
439
+ chat_query = _chatbot[-1][0]
440
+ last_item = normalize_task_history_item(task_history[-1])
441
+
442
+ if not chat_query and not last_item['text'] and not last_item['images']:
443
+ _chatbot.pop()
444
+ task_history.pop()
445
+ yield _chatbot
446
+ return
447
+
448
+ print(f'User query: {last_item}')
449
+ history_cp = [normalize_task_history_item(item) for item in copy.deepcopy(task_history)]
450
+ full_response_raw = ''
451
+ messages = []
452
+
453
+ for i, item in enumerate(history_cp):
454
+ content = []
455
+ if item['images']:
456
+ for img_path in item['images']:
457
+ if img_path:
458
+ encoded_img = encode_image_pil(img_path)
459
+ if encoded_img: content.append({'image': encoded_img})
460
+ if item['text']: content.append({'text': str(item['text'])})
461
+
462
+ if item['response'] is None:
463
+ if content: messages.append({'role': 'user', 'content': content})
464
+ else:
465
+ if content: messages.append({'role': 'user', 'content': content})
466
+ messages.append({'role': 'assistant', 'content': [{'text': str(item['response'])}]})
467
+
468
+ try:
469
+ for response_display, response_raw in call_local_model(model, processor, messages, system_prompt, temperature, top_p, max_tokens):
470
+ _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response_display)))
471
  yield _chatbot
472
+ full_response_raw = response_raw
473
+
474
+ task_history[-1]['response'] = full_response_raw
475
+ print(f'Assistant: {full_response_raw}')
476
+ except Exception as e:
477
+ print(f"Error during generation: {e}")
478
+ import traceback
479
+ traceback.print_exc()
480
+ error_msg = f"Error: {str(e)}"
481
+ _chatbot[-1] = (_parse_text(chat_query), error_msg)
482
+ task_history[-1]['response'] = error_msg
483
+ yield _chatbot
484
+
485
+ @spaces.GPU
486
+ def regenerate(_chatbot, task_history, system_prompt, temperature, top_p, max_tokens):
487
+ if not task_history or not _chatbot:
488
+ yield _chatbot
489
+ return
490
+
491
+ last_item = normalize_task_history_item(task_history[-1])
492
+ if last_item['response'] is None:
493
+ yield _chatbot
494
+ return
495
+
496
+ last_item['response'] = None
497
+ task_history[-1] = last_item
498
+ _chatbot.pop(-1)
499
 
500
+ display_message_parts = []
501
+ if last_item['images']: display_message_parts.append(f"[Uploaded {len(last_item['images'])} images]")
502
+ if last_item['text']: display_message_parts.append(last_item['text'])
503
+ display_message = " ".join(display_message_parts)
504
+ _chatbot.append([_parse_text(display_message), None])
505
+
506
+ for updated_chatbot in predict(_chatbot, task_history, system_prompt, temperature, top_p, max_tokens):
507
+ yield updated_chatbot
508
 
509
  def add_text_and_files(history, task_history, text, files):
 
510
  history = history if history is not None else []
511
  task_history = task_history if task_history is not None else []
512
+
 
513
  has_text = text and text.strip()
514
  has_files = files and len(files) > 0
515
+
516
  if not has_text and not has_files:
517
  return history, task_history, text, files
518
+
519
+ display_parts, file_paths = [], []
520
  if has_files:
521
  for file in files:
522
+ if file and hasattr(file, 'name'):
523
+ file_paths.append(file.name)
524
  if file_paths:
525
  display_parts.append(f"[Uploaded {len(file_paths)} images]")
526
  if has_text:
527
  display_parts.append(text)
528
+
 
529
  display_message = " ".join(display_parts)
530
  history.append([_parse_text(display_message), None])
531
+ task_history.append({'text': text if has_text else '', 'images': file_paths, 'response': None})
532
 
533
+ return history, task_history, '', None
534
 
535
  def reset_state():
 
536
  _gc()
537
+ return [], [], None
538
539
  with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft()) as demo:
540
+ gr.HTML(f"""
541
  <div class="container">
542
  <h1 class="main-title">IntrinSight Assistant</h1>
543
  <p class="sub-title">
544
  Powered by IntrinSight-4B Model
545
+ <span class="status-indicator {'gpu-status' if device == 'cuda' else 'cpu-status'}">
546
+ {'🚀 GPU Mode' if device == 'cuda' else '💻 CPU Mode'}
547
+ </span>
548
  </p>
549
  </div>
550
+ """)
551
 
 
552
  task_history = gr.State([])
553
 
554
  with gr.Row():
555
  with gr.Column(scale=4):
556
  chatbot = gr.Chatbot(
557
+ label='IntrinSight-4B Chat Interface', elem_classes='control-height', height=600,
558
  avatar_images=(None, "https://em-content.zobj.net/thumbs/240/twitter/348/robot_1f916.png")
559
  )
 
560
  with gr.Row():
561
+ query = gr.Textbox(lines=3, label='💬 Message Input', placeholder="Enter your question here...", elem_classes="custom-input")
562
  with gr.Row():
 
563
  addfile_btn = gr.File(
564
+ label="📸 Upload Images (Drag & Drop Supported, Multiple Selection)", file_count="multiple",
565
+ file_types=["image"], elem_classes="file-upload-area"
566
  )
 
567
  with gr.Row():
568
  submit_btn = gr.Button('🚀 Send', variant="primary", elem_classes="custom-button")
569
  regen_btn = gr.Button('🔄 Regenerate', variant="secondary", elem_classes="custom-button")
570
  empty_bin = gr.Button('🗑️ Clear History', variant="stop", elem_classes="custom-button")
571
 
572
  with gr.Column(scale=2):
 
573
  with gr.Group(elem_classes="parameter-section"):
574
  gr.Markdown("### ⚙️ System Configuration")
575
+ system_prompt = gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=5, placeholder="Enter system prompt here...")
576
  with gr.Group(elem_classes="parameter-section"):
577
  gr.Markdown("### 🎛️ Generation Parameters")
578
+ temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature (Creativity)", info="Higher values make output more random")
579
+ top_p = gr.Slider(minimum=0.1, maximum=1.0, value=1.0, step=0.05, label="Top-p (Nucleus Sampling)", info="Cumulative probability for token selection")
580
+ max_tokens = gr.Slider(minimum=256, maximum=16384, value=8192, step=256, label="Max Tokens", info="Maximum number of tokens to generate")
581
+ gr.Markdown(f"""
582
  ### 📋 Instructions
583
 
584
  **Basic Usage:**
 
588
  - **Parameters**: Adjust generation settings as needed
589
 
590
  **Performance Info:**
591
+ - Current Mode: **{'GPU Acceleration' if device == 'cuda' else 'CPU Mode'}**
592
+ - Flash Attention: **{'Enabled' if (device == 'cuda' and HAS_FLASH_ATTN) else 'Disabled'}**
593
  - Recommended Image Size: < 1024×1024
594
 
595
  ### ⚠️ Disclaimer
596
 
597
  This demo is subject to the Gemma license agreement.
598
  Please do not generate or disseminate harmful content.
599
+ """)
600
 
 
601
  submit_btn.click(
602
+ add_text_and_files,
603
  [chatbot, task_history, query, addfile_btn],
604
  [chatbot, task_history, query, addfile_btn]
605
  ).then(
606
+ predict,
607
+ [chatbot, task_history, system_prompt, temperature, top_p, max_tokens],
608
+ [chatbot],
609
+ show_progress="full"
610
  )
611
+
612
+ empty_bin.click(reset_state, outputs=[chatbot, task_history, addfile_btn], show_progress=True)
613
+
614
  regen_btn.click(
615
+ regenerate,
616
+ [chatbot, task_history, system_prompt, temperature, top_p, max_tokens],
617
+ [chatbot],
618
+ show_progress="full"
619
  )
620
+
 
621
  query.submit(
622
+ add_text_and_files,
623
  [chatbot, task_history, query, addfile_btn],
624
  [chatbot, task_history, query, addfile_btn]
625
  ).then(
626
+ predict,
627
+ [chatbot, task_history, system_prompt, temperature, top_p, max_tokens],
628
+ [chatbot],
629
+ show_progress="full"
630
  )
631
 
632
  demo.queue(max_size=10).launch(
633
+ share=args.share, inbrowser=args.inbrowser,
634
+ server_port=args.server_port, server_name=args.server_name,
635
  show_error=True
636
  )
637
 
 
638
  def main():
639
  """Main entry point"""
640
  args = _get_args()