"""Video-to-Code agent: extracts frames from an uploaded video, summarizes them
with the ERNIE 4.5 VL model, and synthesizes a standalone HTML page, served
through a Gradio UI."""

import base64
import os
import re
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed

import cv2
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

# --- Configuration ---
QIANFAN_URL = os.getenv("QIANFAN_URL", "")
QIANFAN_TOKEN = os.getenv("QIANFAN_TOKEN", "")
MODEL_NAME = "ernie-4.5-turbo-vl"
MAX_CONCURRENT_REQUESTS = 4
MAX_VIDEO_DURATION_SEC = 1800  # 30 minutes


def extract_frames(video_path, interval_sec=1):
    """Extract frames from the video.

    Samples roughly one frame every ``interval_sec`` seconds, resizes each to a
    height of 512 px (width scaled proportionally), JPEG-encodes at quality 60,
    and returns the frames as base64 strings grouped into chunks of 30.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval_sec: Sampling interval in seconds.

    Returns:
        list[list[str]]: Chunks of base64-encoded JPEG frames; empty list if
        the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0:
        # Some containers report no FPS metadata; assume a common default.
        fps = 30
    # BUGFIX: int(fps * interval_sec) could be 0 (e.g. fps < 1), which would
    # raise ZeroDivisionError in the modulo below. Clamp the step to >= 1.
    frame_step = max(1, int(fps * interval_sec))
    chunk_frames, chunks, frame_count = [], [], 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_step == 0:
                height, width = frame.shape[:2]
                scale = 512 / height
                resized_frame = cv2.resize(frame, (int(width * scale), 512))
                _, buffer = cv2.imencode(
                    '.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), 60]
                )
                chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
                if len(chunk_frames) == 30:
                    chunks.append(chunk_frames)
                    chunk_frames = []
            frame_count += 1
        if chunk_frames:
            chunks.append(chunk_frames)
    finally:
        # BUGFIX: release the capture even if decoding raises mid-loop.
        cap.release()
    return chunks


def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Send chunk to LLM.

    Sends one chunk of base64 frames to the VL model and returns its textual
    summary, retrying transient failures.

    Args:
        client: An OpenAI-compatible client.
        chunk_index: Position of this chunk (used to reorder results later).
        frames_b64: List of base64-encoded JPEG frames.
        max_retries: Number of attempts before giving up.

    Returns:
        tuple[int, str]: ``(chunk_index, summary)``; summary is "" on failure
        so the caller can skip this segment without crashing the whole run.
    """
    prompt = (
        "This is a segment from a video. "
        "If code is visible, please extract the HTML, CSS, and JavaScript. "
        "If it is a general video (e.g., sports, nature), describe the scene, colors, and mood to inspire a website design. "
        "Provide a detailed summary suitable for a frontend engineer to build a website from."
    )
    content = [{"type": "text", "text": prompt}]
    for f in frames_b64:
        content.append(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}}
        )
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1,
                max_tokens=1024,
            )
            return chunk_index, response.choices[0].message.content
        except Exception as e:
            # BUGFIX: surface the failure instead of swallowing it silently,
            # and don't sleep after the final attempt.
            print(f"Chunk {chunk_index} attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    # All retries exhausted: empty summary signals "skip this segment".
    return chunk_index, ""


def aggregate_and_generate_webpage(client, summaries):
    """Generate final HTML.

    Joins the per-segment summaries (in segment order) into one prompt and
    asks the model for a complete HTML document, then strips any markdown
    fencing / surrounding chatter from the response.

    Args:
        client: An OpenAI-compatible client.
        summaries: Mapping of chunk index -> summary text.

    Returns:
        str: The cleaned HTML source.
    """
    full_summary = "\n".join(
        [f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s]
    )
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
If the video was a coding tutorial, reconstruct the code.
If the video was a general scene, create a modern, responsive website inspired by the video's content (e.g., a skiing resort page for a skiing video).

**Summaries:**
{full_summary}

**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2,
        top_p=0.8,
    )
    content = response.choices[0].message.content
    # Regex Cleaning: drop markdown fences, then clamp to the HTML document.
    # NOTE(review): the original patterns were empty (likely markup stripped in
    # transit), making this step a no-op; reconstructed intent — confirm.
    content = content.replace("```html", "").replace("```", "").strip()
    match = re.search(r'(<!DOCTYPE\s+html.*</html>)', content, re.DOTALL | re.IGNORECASE)
    if match:
        content = match.group(1)
    else:
        # Fallback: trim any preamble before the opening <html tag.
        start_match = re.search(r'<html', content, re.IGNORECASE)
        if start_match:
            content = content[start_match.start():]
    return content


def main_process(video_file, progress=gr.Progress()):
    """Full pipeline: validate input, extract frames, summarize chunks in
    parallel, synthesize HTML, and yield UI updates at each stage.

    Yields:
        tuple: (status text, iframe HTML or None, file path or None, HTML
        source or None) matching the Gradio outputs.
    """
    # 1. Initialize state
    yield "⏳ Initializing...", None, None, None
    api_key = QIANFAN_TOKEN
    if not api_key:
        raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file:
        raise gr.Error("Please upload a video.")

    # Check duration before doing any expensive work.
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        raise gr.Error(f"Video too long ({duration/60:.1f}m). Limit is 30m.")

    client = OpenAI(api_key=api_key, base_url=QIANFAN_URL)

    # 2. Frame extraction stage
    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks:
        raise gr.Error("Frame extraction failed.")

    # 3. Analysis stage: fan the chunks out to the model concurrently.
    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {
            executor.submit(process_chunk_with_retry, client, i, chunk): i
            for i, chunk in enumerate(chunks)
        }
        completed = 0
        total = len(chunks)
        for future in as_completed(future_to_chunk):
            idx, summary = future.result()
            if summary:
                chunk_summaries[idx] = summary
            completed += 1
            # Live status update in the UI
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None

    # 4. Code generation stage
    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)

    output_path = f"generated_website_{uuid.uuid4().hex}.html"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_code)

    # Create Iframe: embed the page as a base64 data URI so it renders
    # sandboxed inside the Gradio HTML component.
    # NOTE(review): the original iframe markup was an empty f-string (markup
    # stripped in transit); reconstructed — confirm desired styling.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = (
        f'<iframe src="{data_uri}" width="100%" height="720" '
        f'style="border: 1px solid #ddd; border-radius: 8px;"></iframe>'
    )
    progress(1.0, desc="Done")

    # 5. Done — return all results
    yield "✅ Generation Complete!", iframe_html, output_path, html_code


# --- UI ---
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
    # Fix 2: open=True so the accordion is expanded by default.
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
        This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:
        * **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
        * **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
        * **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
        * **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
        """)
    gr.Markdown("Upload a video (e.g., a coding tutorial or a scene like skiing). The AI will watch it and generate a website based on the content.")

    with gr.Row():
        with gr.Column(scale=1):
            # Fix 1: no height parameter — let the video component auto-size.
            video_input = gr.Video(label="Upload Video", format="mp4")
            gr.Examples(
                examples=[["sample_demo.mp4"], ["skiing.mp4"]],
                inputs=[video_input],
                label="▶️ Or try this example video:",
                cache_examples=False,
            )
            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
            # Fix 3: inline status textbox so progress is visible without scrolling.
            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")
                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")
                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")

    # Wire the event: outputs now include status_output.
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output],
    )

if __name__ == "__main__":
    demo.launch()