import os
import cv2
import time
import uuid
import base64
import re
import gradio as gr
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# --- Configuration ---
QIANFAN_URL = os.getenv("QIANFAN_URL", "")
QIANFAN_TOKEN = os.getenv("QIANFAN_TOKEN", "")
MODEL_NAME = "ernie-4.5-turbo-vl"
MAX_CONCURRENT_REQUESTS = 4
MAX_VIDEO_DURATION_SEC = 1800
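
# MAX_CONCURRENT_REQUESTS caps the ThreadPoolExecutor used in main_process, and
# MAX_VIDEO_DURATION_SEC (1800 s = 30 minutes) is enforced before any frames are extracted.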


def extract_frames(video_path, interval_sec=1):
    """Extract frames from the video."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps == 0: fps = 30
    chunk_frames, chunks, frame_count = [], [], 0
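    # Sample one frame every interval_sec seconds, downscale it to 512 px height, JPEG-encode it
    # at quality 60, and group the base64 frames into chunks of 30 (~30 s of video per chunk at the default interval).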
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret: break
        if frame_count % int(fps * interval_sec) == 0:
            height, width = frame.shape[:2]
            scale = 512 / height
            resized_frame = cv2.resize(frame, (int(width * scale), 512))
            _, buffer = cv2.imencode('.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), 60])
            chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
            if len(chunk_frames) == 30:
                chunks.append(chunk_frames)
                chunk_frames = []
        frame_count += 1
    if chunk_frames: chunks.append(chunk_frames)
    cap.release()
    return chunks


def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Send chunk to LLM."""
    prompt = (
        "This is a segment from a video. "
        "If code is visible, please extract the HTML, CSS, and JavaScript. "
        "If it is a general video (e.g., sports, nature), describe the scene, colors, and mood to inspire a website design. "
        "Provide a detailed summary suitable for a frontend engineer to build a website from."
    )
    content = [{"type": "text", "text": prompt}]
    for f in frames_b64:
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}})
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1, max_tokens=1024
            )
            return chunk_index, response.choices[0].message.content
        except Exception as e:
            time.sleep(2)
    return chunk_index, ""


def aggregate_and_generate_webpage(client, summaries):
    """Generate final HTML."""
    full_summary = "\n".join([f"Segment {i+1} Summary: {s}" for i, s in sorted(summaries.items()) if s])
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
If the video was a coding tutorial, reconstruct the code.
If the video was a general scene, create a modern, responsive website inspired by the video's content (e.g., a skiing resort page for a skiing video).
**Summaries:**
{full_summary}
**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2, top_p=0.8
    )
    content = response.choices[0].message.content
    # Regex cleaning: strip markdown fences and keep only the <!DOCTYPE html> ... </html> span
    # in case the model ignored the strict output instructions.
    content = content.replace("```html", "").replace("```", "").strip()
    match = re.search(r'(<!DOCTYPE html>.*</html>)', content, re.DOTALL | re.IGNORECASE)
    if match:
        content = match.group(1)
    else:
        start_match = re.search(r'<!DOCTYPE html>', content, re.IGNORECASE)
        if start_match:
            content = content[start_match.start():]
    return content
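

# Full pipeline: extract frame chunks -> summarize each chunk in parallel -> aggregate the
# summaries into one HTML page, streaming status updates to the UI along the way.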
def main_process(video_file, progress=gr.Progress()):
    # 1. Initialize status
    yield "⏳ Initializing...", None, None, None
    api_key = QIANFAN_TOKEN
    if not api_key: raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file: raise gr.Error("Please upload a video.")

    # Check the video duration before doing any work
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        raise gr.Error(f"Video too long ({duration/60:.1f}m). Limit is 30m.")
    client = OpenAI(api_key=api_key, base_url=QIANFAN_URL)

    # 2. Frame extraction stage
    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks: raise gr.Error("Frame extraction failed.")

    # 3. Analysis stage
    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
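    # Fan out one request per chunk; as_completed lets the status box update as each segment
    # finishes, and summaries are keyed by chunk index so their order can be restored later.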
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {executor.submit(process_chunk_with_retry, client, i, chunk): i for i, chunk in enumerate(chunks)}
        completed = 0
        total = len(chunks)
        for future in as_completed(future_to_chunk):
            idx, summary = future.result()
            if summary: chunk_summaries[idx] = summary
            completed += 1
            # Update the status text in real time
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None
    # 4. Code generation stage
    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)
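    # Write to a uniquely named file so concurrent sessions do not overwrite each other's output.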
output_path = f"generated_website_{uuid.uuid4().hex}.html"
with open(output_path, "w", encoding="utf-8") as f:
f.write(html_code)
    # Create an inline preview: embed the generated page as a base64 data URI inside an iframe
    # so it can be rendered directly in the gr.HTML component.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
    progress(1.0, desc="Done")

    # 5. Done, return all results
    yield "✅ Generation Complete!", iframe_html, output_path, html_code
# --- UI ---
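# Gradio Blocks layout: video upload, examples, and a live status box on the left; tabbed
# preview / source / download panels on the right.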
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
    # Fix 2: open=True so the capabilities accordion is expanded by default
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
        This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:
        * **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
        * **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
        * **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
        * **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
        """)
gr.Markdown("Upload a video (e.g., a coding tutorial or a scene like skiing). The AI will watch it and generate a website based on the content.")
with gr.Row():
with gr.Column(scale=1):
# 修复1:去掉了 height 参数,让它自适应高度,不会再“扁扁的”
video_input = gr.Video(label="Upload Video", format="mp4")
gr.Examples(
examples=[["sample_demo.mp4"], ["skiing.mp4"]],
inputs=[video_input],
label="▶️ Or try this example video:",
cache_examples=False
)
submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
# 修复3:新增一个状态文本框,直接显示在这里,不用滚轮找进度条了
status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")
                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")
                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")
    # Wire up the click event: status_output has been added to outputs
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output]
    )
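    # main_process is a generator, so every yield streams an intermediate (status, preview, file,
    # code) tuple to the four outputs above before the final result arrives.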


if __name__ == "__main__":
    demo.launch()