jzhang533's picture
update settings
f162685
raw
history blame
9.16 kB
import os
import cv2
import time
import uuid
import base64
import re
import gradio as gr
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# --- Configuration ---
# Qianfan (Baidu Cloud) OpenAI-compatible endpoint and credential; both come
# from the environment so no secret is committed with the code.
QIANFAN_URL = os.getenv("QIANFAN_URL", "")
QIANFAN_TOKEN = os.getenv("QIANFAN_TOKEN", "")
MODEL_NAME = "ernie-4.5-turbo-vl"  # multimodal ERNIE model used for every request
MAX_CONCURRENT_REQUESTS = 4  # thread-pool size for per-chunk analysis
MAX_VIDEO_DURATION_SEC = 1800  # uploads longer than this (30 min) are rejected
def extract_frames(video_path, interval_sec=1, target_height=512,
                   frames_per_chunk=30, jpeg_quality=60):
    """Sample frames from a video and group them into base64-JPEG chunks.

    One frame is taken roughly every ``interval_sec`` seconds, resized to
    ``target_height`` pixels tall (width scaled proportionally), JPEG-encoded
    at ``jpeg_quality``, and base64-encoded.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval_sec: Sampling interval in seconds.
        target_height: Height in pixels each sampled frame is resized to.
        frames_per_chunk: Number of frames per returned chunk.
        jpeg_quality: JPEG quality (0-100) used when encoding frames.

    Returns:
        A list of chunks; each chunk is a list of base64-encoded JPEG strings.
        Empty list if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0:
            fps = 30  # some containers report 0 fps; assume a common default
        # Guard against a zero step (fps * interval_sec < 1), which would
        # otherwise raise ZeroDivisionError in the modulo below.
        step = max(1, int(fps * interval_sec))
        chunk_frames, chunks, frame_count = [], [], 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % step == 0:
                height, width = frame.shape[:2]
                scale = target_height / height
                resized_frame = cv2.resize(frame, (int(width * scale), target_height))
                ok, buffer = cv2.imencode(
                    '.jpg', resized_frame, [int(cv2.IMWRITE_JPEG_QUALITY), jpeg_quality])
                if ok:  # skip frames OpenCV fails to encode instead of sending garbage
                    chunk_frames.append(base64.b64encode(buffer).decode('utf-8'))
                if len(chunk_frames) == frames_per_chunk:
                    chunks.append(chunk_frames)
                    chunk_frames = []
            frame_count += 1
        if chunk_frames:  # flush the trailing partial chunk
            chunks.append(chunk_frames)
        return chunks
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()
def process_chunk_with_retry(client, chunk_index, frames_b64, max_retries=3):
    """Summarize one chunk of frames via the vision model, retrying on failure.

    Args:
        client: OpenAI-compatible client with ``chat.completions.create``.
        chunk_index: Position of this chunk; echoed back so callers can
            reassemble out-of-order results from the thread pool.
        frames_b64: Base64-encoded JPEG frames for this segment.
        max_retries: Number of attempts before giving up.

    Returns:
        ``(chunk_index, summary)``; ``summary`` is ``""`` if every attempt fails.
    """
    prompt = (
        "This is a segment from a frontend web development video tutorial (screenshots taken every second). "
        "Please focus intently on the code shown on the screen and the resulting web page style. "
        "Describe in detail the HTML structure, CSS styling rules, or JavaScript logic presented in this segment. "
        "Ignore unrelated video elements."
    )
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{f}"}}
        for f in frames_b64
    )
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": content}],
                temperature=0.1, max_tokens=1024
            )
            return chunk_index, response.choices[0].message.content
        except Exception:
            # Back off before retrying — but not after the final attempt,
            # which previously wasted 2s before returning the failure result.
            if attempt < max_retries - 1:
                time.sleep(2)
    return chunk_index, ""
def aggregate_and_generate_webpage(client, summaries):
    """Merge per-segment summaries and ask the model for one complete HTML file.

    Args:
        client: OpenAI-compatible client with ``chat.completions.create``.
        summaries: Mapping of chunk index -> summary text (empty ones skipped).

    Returns:
        The model's response reduced, as far as possible, to a bare HTML
        document (markdown fences and surrounding prose stripped).
    """
    ordered = sorted(summaries.items())
    full_summary = "\n".join(
        f"Segment {i+1} Summary: {s}" for i, s in ordered if s
    )
    final_prompt = f"""
You are an expert Frontend Engineer. Based on the video segment summaries, write a complete HTML file.
**Summaries:**
{full_summary}
**Strict Output Instructions:**
1. Return ONLY the raw HTML code.
2. Start directly with `<!DOCTYPE html>`.
3. End directly with `</html>`.
4. NO introduction text, NO markdown backticks (```), NO explanations after the code.
"""
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": final_prompt}],
        temperature=0.2, top_p=0.8
    )
    raw = response.choices[0].message.content
    # Strip markdown fences, then isolate the HTML document itself.
    raw = raw.replace("```html", "").replace("```", "").strip()
    doc = re.search(r'(<!DOCTYPE html>.*</html>)', raw, re.DOTALL | re.IGNORECASE)
    if doc:
        return doc.group(1)
    # No closing </html> found: keep everything from the doctype onward.
    start = re.search(r'<!DOCTYPE html>', raw, re.IGNORECASE)
    if start:
        return raw[start.start():]
    return raw
def main_process(video_file, progress=gr.Progress()):
    """End-to-end pipeline: validate the upload, extract frames, summarize each
    chunk with the vision model, then synthesize and preview a single HTML file.

    Generator yielding ``(status_text, iframe_html, file_path, html_code)``
    tuples so the Gradio UI streams live progress into its four outputs.

    Raises:
        gr.Error: On missing API key, missing/too-long video, failed frame
            extraction, or when every segment analysis fails.
    """
    # 1. Initial state
    yield "⏳ Initializing...", None, None, None
    api_key = QIANFAN_TOKEN
    if not api_key:
        raise gr.Error("Server Config Error: API KEY missing.")
    if not video_file:
        raise gr.Error("Please upload a video.")
    # Reject over-long videos before doing any expensive work.
    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    duration = count / fps if fps > 0 else 0
    cap.release()
    if duration > MAX_VIDEO_DURATION_SEC:
        # Derive the limit from the constant so the message can't drift
        # out of sync with MAX_VIDEO_DURATION_SEC (was hard-coded "30m").
        raise gr.Error(
            f"Video too long ({duration/60:.1f}m). "
            f"Limit is {MAX_VIDEO_DURATION_SEC/60:.0f}m."
        )
    client = OpenAI(api_key=api_key, base_url=QIANFAN_URL)
    # 2. Frame extraction
    yield "🎞️ Step 1/3: Extracting video frames...", None, None, None
    progress(0.1, desc="Extracting frames...")
    chunks = extract_frames(video_file)
    if not chunks:
        raise gr.Error("Frame extraction failed.")
    # 3. Analysis phase (bounded concurrency across chunks)
    yield f"🧠 Step 2/3: ERNIE is analyzing {len(chunks)} segments...", None, None, None
    progress(0.3, desc="Analyzing content...")
    chunk_summaries = {}
    total = len(chunks)
    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_REQUESTS) as executor:
        future_to_chunk = {
            executor.submit(process_chunk_with_retry, client, i, chunk): i
            for i, chunk in enumerate(chunks)
        }
        completed = 0
        for future in as_completed(future_to_chunk):
            idx, summary = future.result()
            if summary:
                chunk_summaries[idx] = summary
            completed += 1
            # Advance both the status text and the progress bar per chunk
            # (the bar previously sat frozen at 0.3 for the whole phase).
            progress(0.3 + 0.5 * (completed / total), desc="Analyzing content...")
            yield f"🧠 Step 2/3: Analyzed segment {completed}/{total}...", None, None, None
    if not chunk_summaries:
        # Every chunk failed: synthesizing from an empty summary would only
        # produce a hallucinated page, so fail loudly instead.
        raise gr.Error("Analysis failed for every segment; cannot generate code.")
    # 4. Code synthesis phase
    yield "✍️ Step 3/3: Synthesizing final HTML code...", None, None, None
    progress(0.85, desc="Synthesizing code...")
    html_code = aggregate_and_generate_webpage(client, chunk_summaries)
    output_path = f"generated_website_{uuid.uuid4().hex}.html"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_code)
    # Embed via a base64 data URI so the preview renders inside the iframe
    # without serving the generated file.
    b64_html = base64.b64encode(html_code.encode('utf-8')).decode('utf-8')
    data_uri = f"data:text/html;charset=utf-8;base64,{b64_html}"
    iframe_html = f"""<iframe src="{data_uri}" width="100%" height="600px" style="border: 1px solid #ccc; border-radius: 8px; background-color: white;"></iframe>"""
    progress(1.0, desc="Done")
    # 5. Done — return all results
    yield "✅ Generation Complete!", iframe_html, output_path, html_code
# --- UI ---
# Declarative Gradio layout: video input + status on the left, tabbed
# preview/source/download of the generated HTML on the right.
with gr.Blocks(title="Ernie 4.5 Video2Code", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ ERNIE 4.5-VL: Video to Code Agent")
    # Fix 2: open=True so the capabilities accordion is expanded by default.
    with gr.Accordion("📚 Technical Capabilities of ERNIE 4.5-VL", open=True):
        gr.Markdown("""
This application is powered by **Baidu ERNIE 4.5**, a state-of-the-art foundation model with specific enhancements for video understanding:
* **👁️ Multimodal Heterogeneous MoE**: Uses dedicated vision experts to process images and video frames without interfering with text generation capabilities.
* **⏳ 3D-RoPE Temporal Modeling**: Incorporates 3D Rotary Position Embeddings to independently encode temporal, width, and height information.
* **📐 Adaptive Resolution**: Dynamically adjusts to different video aspect ratios to capture fine-grained code details.
* **🚀 Long Context Window**: Supports up to 128k context length for analyzing long tutorials.
""")
    gr.Markdown("Upload a frontend coding tutorial video. The AI will watch it, understand the code, and render the result instantly.")
    with gr.Row():
        with gr.Column(scale=1):
            # Fix 1: no fixed height parameter, so the player sizes itself
            # instead of rendering squashed.
            video_input = gr.Video(label="Upload Video", format="mp4")
            gr.Examples(
                examples=[["sample_demo.mp4"]],
                inputs=[video_input],
                label="▶️ Or try this example video:",
                cache_examples=False
            )
            submit_btn = gr.Button("🚀 Generate & Render", variant="primary", size="lg")
            # Fix 3: inline status textbox next to the button, so progress is
            # visible without scrolling to find the progress bar.
            status_output = gr.Textbox(label="Agent Status", value="Ready to start...", interactive=False)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🌐 Live Preview (Result)"):
                    html_preview = gr.HTML(label="Rendered Page")
                with gr.TabItem("📝 Source Code"):
                    code_output = gr.Code(language="html", label="HTML Source")
                with gr.TabItem("⬇️ Download"):
                    file_download = gr.File(label="Download .html File")
    # Wire the button: main_process is a generator, so each yield streams into
    # status_output plus the three result widgets.
    submit_btn.click(
        fn=main_process,
        inputs=[video_input],
        outputs=[status_output, html_preview, file_download, code_output]
    )
if __name__ == "__main__":
    demo.launch()