Ernie4.5-VL-Video2Coder

Sleeping

App Files Files Community

zhijun.li committed 22 days ago

Commit

5f6df4d

1 Parent(s): 4578e32

update app.py

Browse files

Files changed (1) hide show

app.py +53 -36

app.py CHANGED Viewed

@@ -2,11 +2,10 @@ import os
 import cv2
 import time
 import base64
 import gradio as gr
 from openai import OpenAI
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import re
 # --- Configuration ---
 BASE_URL = "https://aistudio.baidu.com/llm/lmapi/v3"
@@ -63,10 +62,8 @@ def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
     return chunk_index, ""
 def aggregate_and_generate_webpage(client, summaries):
-    """Aggregate summaries and generate final HTML."""
     full_summary = "\n".join([f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s])
-    # Prompt 稍微加强一点语气
     final_prompt = f"""
     You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
@@ -79,18 +76,16 @@ def aggregate_and_generate_webpage(client, summaries):
     3. End directly with `</html>`.
     4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
     """
     response = client.chat.completions.create(
         model=MODEL_NAME,
         messages=[{"role": "user", "content": final_prompt}],
         temperature=0.2, top_p=0.8
     )
     content = response.choices[0].message.content
-    content = content.replace("```html", "").replace("```", "").strip()
     match = re.search(r'(<!DOCTYPE html>.*</html>)', content, re.DOTALL | re.IGNORECASE)
     if match:
         content = match.group(1)
     else:
@@ -100,8 +95,10 @@ def aggregate_and_generate_webpage(client, summaries):
     return content
 def main_process(video_file, progress=gr.Progress()):
     api_key = os.environ.get("ERNIE_API_KEY")
     if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
     if not video_file: raise gr.Error("Please upload a video.")
@@ -117,63 +114,83 @@ def main_process(video_file, progress=gr.Progress()):
     client = OpenAI(api_key=api_key, base_url=BASE_URL)
-    progress(0.1, desc="Extracting frames...")
     chunks = extract_frames(video_file)
     if not chunks: raise gr.Error("Frame extraction failed.")
-    progress(0.3, desc="Analyzing content...")
     chunk_summaries = {}
     with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
         future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
-        for i, future in enumerate(as_completed(future_to_chunk)):
             idx, summary = future.result()
             if summary: chunk_summaries[idx] = summary
-            progress(0.3 + 0.5 * ((i+1)/len(chunks)), desc=f"Analyzed {i+1}/{len(chunks)}")
-    progress(0.8, desc="Synthesizing code...")
     html_code = aggregate_and_generate_webpage(client, chunk_summaries)
-    # Save file
     output_path = "generated_website.html"
     with open(output_path, "w", encoding="utf-8") as f:
         f.write(html_code)
-    # --- 关键修改：制作 Data URI Iframe ---
-    # 将 HTML 编码为 Base64，放入 iframe 的 src 中
-    # 这样实现了完美的沙箱隔离，样式不会冲突，JS 也能正常运行
     b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
     data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
-    # 创建一个 HTML 字符串，里面包含一个 iframe
-    iframe_html = f"""
-    <iframe
-        src="{data_uri}"
-        width="100%"
-        height="600px"
-        style="border: 1px solid #ccc; border-radius: 8px; background-color: white;">
-    </iframe>
-    """
-    progress(1.0, desc="Done!")
-    # 返回 iframe 字符串给 HTML 组件，返回路径给下载组件，返回源码给 Code 组件
     return iframe_html, output_path, html_code
 # --- UI ---
 with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🎬 Ernie 4.5-VL: Video to Code Agent")
-    gr.Markdown("Upload a frontend video tutorial. The AI will generate and **render** the code instantly.")
     with gr.Row():
         with gr.Column(scale=1):
-            video_input = gr.Video(label="Upload Video", format="mp4", height=300)
             submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
         with gr.Column(scale=2):
-            # 直接展示预览，不再隐藏在 Tab 里，或者设为默认 Tab
             with gr.Tabs():
                 with gr.TabItem("🌐 Live Preview (Result)"):
-                    # 这个组件现在接收的是 iframe 字符串
                     html_preview = gr.HTML(label="Rendered Page")
                 with gr.TabItem("📝 Source Code"):

 import cv2
 import time
 import base64
+import re
 import gradio as gr
 from openai import OpenAI
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # --- Configuration ---
 BASE_URL = "https://aistudio.baidu.com/llm/lmapi/v3"
     return chunk_index, ""
 def aggregate_and_generate_webpage(client, summaries):
+    """Generate final HTML."""
     full_summary = "\n".join([f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s])
     final_prompt = f"""
     You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
     3. End directly with `</html>`.
     4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
     """
     response = client.chat.completions.create(
         model=MODEL_NAME,
         messages=[{"role": "user", "content": final_prompt}],
         temperature=0.2, top_p=0.8
     )
     content = response.choices[0].message.content
+    # Regex Cleaning
+    content = content.replace("```html", "").replace("```", "").strip()
     match = re.search(r'(<!DOCTYPE html>.*</html>)', content, re.DOTALL | re.IGNORECASE)
     if match:
         content = match.group(1)
     else:
     return content
 def main_process(video_file, progress=gr.Progress()):
+    # Clean progress bar logic: explicitly call progress()
+    progress(0, desc="Starting...")
     api_key = os.environ.get("ERNIE_API_KEY")
     if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
     if not video_file: raise gr.Error("Please upload a video.")
     client = OpenAI(api_key=api_key, base_url=BASE_URL)
+    progress(0.1, desc="Step 1/3: Extracting frames...")
     chunks = extract_frames(video_file)
     if not chunks: raise gr.Error("Frame extraction failed.")
+    progress(0.3, desc="Step 2/3: ERNIE Analyzing content...")
     chunk_summaries = {}
+    # Using ThreadPool without tqdm to avoid UI glitches
     with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
         future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
+        total_chunks = len(chunks)
+        completed = 0
+        for future in as_completed(future_to_chunk):
             idx, summary = future.result()
             if summary: chunk_summaries[idx] = summary
+            completed += 1
+            # Smooth progress update from 0.3 to 0.8
+            current_progress = 0.3 + (0.5 * (completed / total_chunks))
+            progress(current_progress, desc=f"Step 2/3: Analyzing segment {completed}/{total_chunks}")
+    progress(0.85, desc="Step 3/3: Synthesizing final code...")
     html_code = aggregate_and_generate_webpage(client, chunk_summaries)
     output_path = "generated_website.html"
     with open(output_path, "w", encoding="utf-8") as f:
         f.write(html_code)
+    # Create Iframe
     b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
     data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
+    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
+    progress(1.0, desc="Completed!")
     return iframe_html, output_path, html_code
 # --- UI ---
 with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
+    # --- Header & Description (Goal 3) ---
+    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
+    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=False):
+        gr.Markdown("""
+        This application is powered by **Baidu ERNIE 4.5 **, a state-of-the-art foundation model with specific enhancements for video understanding:
+        *   **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
+        *   **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information, allowing precise understanding of event sequences in videos.
+        *   **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios, ensuring fine-grained details (like small code font on screen) are captured accurately.
+        *   **🚀 Long Context Window**: Supports up to 128k context length, enabling the analysis of longer tutorials and complex logic flows.
+        """)
+    gr.Markdown("Upload a frontend coding tutorial video (or try the example below). The AI will watch it, understand the code, and render the result instantly.")
     with gr.Row():
         with gr.Column(scale=1):
+            # --- Input Section ---
+            video_input = gr.Video(label="Upload Video", format="mp4", height=320)
+            # --- Goal 1: Examples Component ---
+            # Clicking an example video here automatically fills it into the video_input component above
+            gr.Examples(
+                examples=[["sample_demo.mp4"]], # ⚠️ 确保你上传了名为 sample_demo.mp4 的文件
+                inputs=[video_input],
+                label="▶️ Or try this example video:",
+                cache_examples=False # 关闭缓存以节省空间
+            )
             submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
         with gr.Column(scale=2):
+            # --- Output Section ---
             with gr.Tabs():
                 with gr.TabItem("🌐 Live Preview (Result)"):
                     html_preview = gr.HTML(label="Rendered Page")
                 with gr.TabItem("📝 Source Code"):