import os
import cv2
import time
import uuid
import base64
import re
import gradio as gr
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# --- Configuration ---
QIANFAN_URL = os.getenv("QIANFAN_URL", "")
QIANFAN_TOKEN = os.getenv("QIANFAN_TOKEN", "")
MODEL_NAME = "ernie-4.5-turbo-vl"
MAX_CONCURRENT_REQUESTS = 4
MAX_VIDEO_DURATION_SEC = 1800
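
# MAX_CONCURRENT_REQUESTS caps the ThreadPoolExecutor used in main_process, and
# MAX_VIDEO_DURATION_SEC (1800 s = 30 minutes) is enforced before any frames are extracted.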


def extract_frames(video_path, interval_sec=1):
    """Extract frames from the video."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0: fps = 30
    chunk_frames, chunks, frame_count = [], [], 0
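    # Sample one frame every interval_sec seconds, downscale it to 512 px height, JPEG-encode it
    # at quality 60, and group the base64 frames into chunks of 30 (~30 s of video per chunk at the default interval).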
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret: break
        if frame_count % int(fps * interval_sec) == 0:
            height, width = frame.shape[:2]
            scale = 512 / height
            resized_frame = cv2.resize(frame, (int(width * scale), 512))
            _, buffer = cv2.imencode('.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), 60])
            chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
            if len(chunk_frames) == 30:
                chunks.append(chunk_frames)
                chunk_frames = []
        frame_count += 1
    if chunk_frames: chunks.append(chunk_frames)
    cap.release()
    return chunks


def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Send chunk to LLM."""
    prompt = (
        "This is a segment from a video. "
        "If code is visible, please extract the HTML, CSS, and JavaScript. "
        "If it is a general video (e.g., sports, nature), describe the scene, colors, and mood to inspire a website design. "
        "Provide a detailed summary suitable for a frontend engineer to build a website from."
    )
    content = [{"type": "text", "text": prompt}]
    for f in frames_b64:
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}})
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1, max_tokens=1024
            )
            return chunk_index, response.choices[0].message.content
        except Exception as e:
            time.sleep(2)
    return chunk_index, ""


def aggregate_and_generate_webpage(client, summaries):
    """Generate final HTML."""
    full_summary = "\n".join([f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s])
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
If the video was a coding tutorial, reconstruct the code.
If the video was a general scene, create a modern, responsive website inspired by the video's content (e.g., a skiing resort page for a skiing video).
**Summaries:**
{full_summary}
**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2, top_p=0.8
    )
    content = response.choices[0].message.content
    # Regex cleaning: strip markdown fences and keep only the <!DOCTYPE html> ... </html> span
    # in case the model ignored the strict output instructions.
    content = content.replace("```html", "").replace("```", "").strip()
    match = re.search(r'(<!DOCTYPE html>.*</html>)', content, re.DOTALL | re.IGNORECASE)
    if match:
        content = match.group(1)
    else:
        start_match = re.search(r'<!DOCTYPE html>', content, re.IGNORECASE)
        if start_match:
            content = content[start_match.start():]
    return content
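

# Full pipeline: extract frame chunks -> summarize each chunk in parallel -> aggregate the
# summaries into one HTML page, streaming status updates to the UI along the way.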
def main_process(video_file, progress=gr.Progress()):
    # 1. Initialize status
    yield "⏳ Initializing...", None, None, None
    api_key = QIANFAN_TOKEN
    if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file: raise gr.Error("Please upload a video.")

    # Check the video duration before doing any work
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        raise gr.Error(f"Video too long ({duration/60:.1f}m). Limit is 30m.")
    client = OpenAI(api_key=api_key, base_url=QIANFAN_URL)

    # 2. Frame extraction stage
    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks: raise gr.Error("Frame extraction failed.")

    # 3. Analysis stage
    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
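    # Fan out one request per chunk; as_completed lets the status box update as each segment
    # finishes, and summaries are keyed by chunk index so their order can be restored later.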
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
        completed = 0
        total = len(chunks)
        for future in as_completed(future_to_chunk):
            idx, summary = future.result()
            if summary: chunk_summaries[idx] = summary
            completed += 1
            # Update the status text in real time
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None
    # 4. Code generation stage
    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)
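    # Write to a uniquely named file so concurrent sessions do not overwrite each other's output.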
output_path = f"generated_website_{uuid.uuid4().hex}.html"
with open(output_path, "w", encoding="utf-8") as f:
f.write(html_code)
    # Create an inline preview: embed the generated page as a base64 data URI inside an iframe
    # so it can be rendered directly in the gr.HTML component.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
    progress(1.0, desc="Done")

    # 5. Done, return all results
    yield "✅ Generation Complete!", iframe_html, output_path, html_code
# --- UI ---
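# Gradio Blocks layout: video upload, examples, and a live status box on the left; tabbed
# preview / source / download panels on the right.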
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
    # Fix 2: open=True so the capabilities accordion is expanded by default
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
        This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:
        * **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
        * **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
        * **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
        * **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
        """)
gr.Markdown("Upload a video (e.g., a coding tutorial or a scene like skiing). The AI will watch it and generate a website based on the content.")
with gr.Row():
with gr.Column(scale=1):
# 修复1:去掉了 height 参数,让它自适应高度,不会再“扁扁的”
video_input = gr.Video(label="Upload Video", format="mp4")
gr.Examples(
examples=[["sample_demo.mp4"], ["skiing.mp4"]],
inputs=[video_input],
label="▶️ Or try this example video:",
cache_examples=False
)
submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
# 修复3:新增一个状态文本框,直接显示在这里,不用滚轮找进度条了
status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")
                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")
                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")
    # Wire up the click event: status_output has been added to outputs
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output]
    )
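    # main_process is a generator, so every yield streams an intermediate (status, preview, file,
    # code) tuple to the four outputs above before the final result arrives.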


if __name__ == "__main__":
    demo.launch()