"""Video-to-Code agent: extracts frames from an uploaded video, summarizes them
with the ERNIE 4.5 VL model, and synthesizes a standalone HTML page, served
through a Gradio UI."""

import base64
import os
import re
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
QIANFAN_URL = os.getenv("QIANFAN_URL", "")
QIANFAN_TOKEN = os.getenv("QIANFAN_TOKEN", "")
MODEL_NAME = "ernie-4.5-turbo-vl"
MAX_CONCURRENT_REQUESTS = 4
MAX_VIDEO_DURATION_SEC = 1800  # 30 minutes


def extract_frames(video_path, interval_sec=1):
    """Extract frames from the video.

    Samples roughly one frame every ``interval_sec`` seconds, resizes each to a
    height of 512 px (width scaled proportionally), JPEG-encodes at quality 60,
    and returns the frames as base64 strings grouped into chunks of 30.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval_sec: Sampling interval in seconds.

    Returns:
        list[list[str]]: Chunks of base64-encoded JPEG frames; empty list if
        the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0:
        # Some containers report no FPS metadata; assume a common default.
        fps = 30
    # BUGFIX: int(fps * interval_sec) could be 0 (e.g. fps < 1), which would
    # raise ZeroDivisionError in the modulo below. Clamp the step to >= 1.
    frame_step = max(1, int(fps * interval_sec))
    chunk_frames, chunks, frame_count = [], [], 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_step == 0:
                height, width = frame.shape[:2]
                scale = 512 / height
                resized_frame = cv2.resize(frame, (int(width * scale), 512))
                _, buffer = cv2.imencode(
                    '.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), 60]
                )
                chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
                if len(chunk_frames) == 30:
                    chunks.append(chunk_frames)
                    chunk_frames = []
            frame_count += 1
        if chunk_frames:
            chunks.append(chunk_frames)
    finally:
        # BUGFIX: release the capture even if decoding raises mid-loop.
        cap.release()
    return chunks


def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Send chunk to LLM.

    Sends one chunk of base64 frames to the VL model and returns its textual
    summary, retrying transient failures.

    Args:
        client: An OpenAI-compatible client.
        chunk_index: Position of this chunk (used to reorder results later).
        frames_b64: List of base64-encoded JPEG frames.
        max_retries: Number of attempts before giving up.

    Returns:
        tuple[int, str]: ``(chunk_index, summary)``; summary is "" on failure
        so the caller can skip this segment without crashing the whole run.
    """
    prompt = (
        "This is a segment from a video. "
        "If code is visible, please extract the HTML, CSS, and JavaScript. "
        "If it is a general video (e.g., sports, nature), describe the scene, colors, and mood to inspire a website design. "
        "Provide a detailed summary suitable for a frontend engineer to build a website from."
    )
    content = [{"type": "text", "text": prompt}]
    for f in frames_b64:
        content.append(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}}
        )
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1,
                max_tokens=1024,
            )
            return chunk_index, response.choices[0].message.content
        except Exception as e:
            # BUGFIX: surface the failure instead of swallowing it silently,
            # and don't sleep after the final attempt.
            print(f"Chunk {chunk_index} attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    # All retries exhausted: empty summary signals "skip this segment".
    return chunk_index, ""


def aggregate_and_generate_webpage(client, summaries):
    """Generate final HTML.

    Joins the per-segment summaries (in segment order) into one prompt and
    asks the model for a complete HTML document, then strips any markdown
    fencing / surrounding chatter from the response.

    Args:
        client: An OpenAI-compatible client.
        summaries: Mapping of chunk index -> summary text.

    Returns:
        str: The cleaned HTML source.
    """
    full_summary = "\n".join(
        [f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s]
    )
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
If the video was a coding tutorial, reconstruct the code.
If the video was a general scene, create a modern, responsive website inspired by the video's content (e.g., a skiing resort page for a skiing video).

**Summaries:**
{full_summary}

**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2,
        top_p=0.8,
    )
    content = response.choices[0].message.content
    # Regex Cleaning: drop markdown fences, then clamp to the HTML document.
    # NOTE(review): the original patterns were empty (likely markup stripped in
    # transit), making this step a no-op; reconstructed intent — confirm.
    content = content.replace("```html", "").replace("```", "").strip()
    match = re.search(r'(<!DOCTYPE\s+html.*</html>)', content, re.DOTALL | re.IGNORECASE)
    if match:
        content = match.group(1)
    else:
        # Fallback: trim any preamble before the opening <html tag.
        start_match = re.search(r'<html', content, re.IGNORECASE)
        if start_match:
            content = content[start_match.start():]
    return content


def main_process(video_file, progress=gr.Progress()):
    """Full pipeline: validate input, extract frames, summarize chunks in
    parallel, synthesize HTML, and yield UI updates at each stage.

    Yields:
        tuple: (status text, iframe HTML or None, file path or None, HTML
        source or None) matching the Gradio outputs.
    """
    # 1. Initialize state
    yield "⏳ Initializing...", None, None, None
    api_key = QIANFAN_TOKEN
    if not api_key:
        raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file:
        raise gr.Error("Please upload a video.")

    # Check duration before doing any expensive work.
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        raise gr.Error(f"Video too long ({duration/60:.1f}m). Limit is 30m.")

    client = OpenAI(api_key=api_key, base_url=QIANFAN_URL)

    # 2. Frame extraction stage
    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks:
        raise gr.Error("Frame extraction failed.")

    # 3. Analysis stage: fan the chunks out to the model concurrently.
    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {
            executor.submit(process_chunk_with_retry, client, i, chunk): i
            for i, chunk in enumerate(chunks)
        }
        completed = 0
        total = len(chunks)
        for future in as_completed(future_to_chunk):
            idx, summary = future.result()
            if summary:
                chunk_summaries[idx] = summary
            completed += 1
            # Live status update in the UI
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None

    # 4. Code generation stage
    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)

    output_path = f"generated_website_{uuid.uuid4().hex}.html"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_code)

    # Create Iframe: embed the page as a base64 data URI so it renders
    # sandboxed inside the Gradio HTML component.
    # NOTE(review): the original iframe markup was an empty f-string (markup
    # stripped in transit); reconstructed — confirm desired styling.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = (
        f'<iframe src="{data_uri}" width="100%" height="720" '
        f'style="border: 1px solid #ddd; border-radius: 8px;"></iframe>'
    )
    progress(1.0, desc="Done")

    # 5. Done — return all results
    yield "✅ Generation Complete!", iframe_html, output_path, html_code


# --- UI ---
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
    # Fix 2: open=True so the accordion is expanded by default.
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
        This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:
        * **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
        * **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
        * **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
        * **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
        """)
    gr.Markdown("Upload a video (e.g., a coding tutorial or a scene like skiing). The AI will watch it and generate a website based on the content.")

    with gr.Row():
        with gr.Column(scale=1):
            # Fix 1: no height parameter — let the video component auto-size.
            video_input = gr.Video(label="Upload Video", format="mp4")
            gr.Examples(
                examples=[["sample_demo.mp4"], ["skiing.mp4"]],
                inputs=[video_input],
                label="▶️ Or try this example video:",
                cache_examples=False,
            )
            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
            # Fix 3: inline status textbox so progress is visible without scrolling.
            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")
                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")
                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")

    # Wire the event: outputs now include status_output.
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output],
    )

if __name__ == "__main__":
    demo.launch()